├── ASAP
├── __init__.py
├── .DS_Store
├── __pycache__
│ ├── __init__.cpython-36.pyc
│ ├── __init__.cpython-37.pyc
│ ├── FeatureExtraction.cpython-36.pyc
│ ├── FeatureExtraction.cpython-37.pyc
│ ├── DesignRecommendation.cpython-36.pyc
│ ├── SequenceAndFeatureAnalysis.cpython-36.pyc
│ └── SequenceAndFeatureAnalysis.cpython-37.pyc
├── DesignRecommendation.py
├── S_SequenceInRegion.py
├── FeatureExtraction.py
└── SequenceAndFeatureAnalysis.py
├── .DS_Store
├── data
├── .DS_Store
├── pigs_canonical.txt
└── blosum62.csv
├── results
├── .DS_Store
├── MMP-IGHV
│ ├── .DS_Store
│ ├── IGHV_Only pI Features_ROC.png
│ ├── IGHV_Except pI Features_ROC.png
│ ├── IGHV_All Features Included_ROC.png
│ ├── IGHV_Except Germline Features_ROC.png
│ ├── IGHV_Only Germline Features_ROC.png
│ ├── IGHV_Except CDR Canonical Structure Features_ROC.png
│ ├── IGHV_Only CDR Canonical Structure Features_ROC.png
│ ├── IGHV_Only Frequent Positional Motif Features_ROC.png
│ ├── IGHV_Except Frequent Positional Motif Features_ROC.png
│ ├── IGHV_RankFisherAndFS.csv
│ └── IGHV_Jaccard Feature Coefficient.csv
└── MMP-PDB
│ ├── .DS_Store
│ ├── MMP-cluster_DTreeAllFeature.png
│ ├── MMP-cluster_Extracted Features.png
│ ├── MMP-cluster_Except pI Features_ROC.png
│ ├── MMP-cluster_Heavy Chain Sequences.png
│ ├── MMP-cluster_Light Chain Sequences.png
│ ├── MMP-cluster_Only pI Features_ROC.png
│ ├── MMP-cluster_All Features Included_ROC.png
│ ├── MMP-cluster_Only Germline Features_ROC.png
│ ├── MMP-cluster_Except Germline Features_ROC.png
│ ├── MMP-cluster_Only CDR Canonical Structure Features_ROC.png
│ ├── MMP-cluster_Except CDR Canonical Structure Features_ROC.png
│ ├── MMP-cluster_Only Frequent Positional Motif Features_ROC.png
│ ├── MMP-cluster_All Features Included(Exclude Correlated)_ROC.png
│ ├── MMP-cluster_Except Frequent Positional Motif Features_ROC.png
│ ├── MMP-cluster_Only Germline Features(Exclude Correlated)_ROC.png
│ ├── MMP-cluster_Except Germline Features(Exclude Correlated)_ROC.png
│ ├── MMP-cluster_Only CDR Canonical Structure Features(Exclude Correlated)_ROC.png
│ └── MMP-cluster_RankFisherAndFS.csv
├── testCase
├── .DS_Store
├── IGHV
│ ├── .DS_Store
│ ├── reference-IGHV
│ │ └── .DS_Store
│ └── targeting-MMP-IGHV
│ │ └── .DS_Store
└── MMP-cluster
│ ├── .DS_Store
│ ├── reference-PDB
│ └── .DS_Store
│ └── targeting-MMP
│ └── .DS_Store
├── __pycache__
└── ASAP.cpython-36.pyc
├── requirements.txt
├── supporting information
├── .DS_Store
├── Figure S1.png
├── Figure S2.png
├── Figure S3.png
├── Figure S4.png
├── Table S1.xlsx
├── Table S2.xlsx
├── Table S3.xlsx
└── Table S4.xlsx
├── .idea
├── vcs.xml
├── misc.xml
├── inspectionProfiles
│ └── profiles_settings.xml
├── modules.xml
├── ASAP-1.0.iml
└── workspace.xml
├── LICENSE
├── environment.yml
├── README.md
├── ASAP.ipynb
└── .ipynb_checkpoints
└── ASAP-checkpoint.ipynb
/ASAP/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/.DS_Store
--------------------------------------------------------------------------------
/ASAP/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/ASAP/.DS_Store
--------------------------------------------------------------------------------
/data/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/data/.DS_Store
--------------------------------------------------------------------------------
/results/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/.DS_Store
--------------------------------------------------------------------------------
/testCase/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/testCase/.DS_Store
--------------------------------------------------------------------------------
/testCase/IGHV/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/testCase/IGHV/.DS_Store
--------------------------------------------------------------------------------
/results/MMP-IGHV/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-IGHV/.DS_Store
--------------------------------------------------------------------------------
/results/MMP-PDB/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/.DS_Store
--------------------------------------------------------------------------------
/__pycache__/ASAP.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/__pycache__/ASAP.cpython-36.pyc
--------------------------------------------------------------------------------
/testCase/MMP-cluster/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/testCase/MMP-cluster/.DS_Store
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pydotplus==2.0.2
2 | scipy==0.19.1
3 | matplotlib==2.1.0
4 | numpy==1.14.1
5 | scikit-learn==0.19.2
6 |
--------------------------------------------------------------------------------
/supporting information/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/supporting information/.DS_Store
--------------------------------------------------------------------------------
/supporting information/Figure S1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/supporting information/Figure S1.png
--------------------------------------------------------------------------------
/supporting information/Figure S2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/supporting information/Figure S2.png
--------------------------------------------------------------------------------
/supporting information/Figure S3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/supporting information/Figure S3.png
--------------------------------------------------------------------------------
/supporting information/Figure S4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/supporting information/Figure S4.png
--------------------------------------------------------------------------------
/supporting information/Table S1.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/supporting information/Table S1.xlsx
--------------------------------------------------------------------------------
/supporting information/Table S2.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/supporting information/Table S2.xlsx
--------------------------------------------------------------------------------
/supporting information/Table S3.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/supporting information/Table S3.xlsx
--------------------------------------------------------------------------------
/supporting information/Table S4.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/supporting information/Table S4.xlsx
--------------------------------------------------------------------------------
/testCase/IGHV/reference-IGHV/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/testCase/IGHV/reference-IGHV/.DS_Store
--------------------------------------------------------------------------------
/ASAP/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/ASAP/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/ASAP/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/ASAP/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/testCase/IGHV/targeting-MMP-IGHV/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/testCase/IGHV/targeting-MMP-IGHV/.DS_Store
--------------------------------------------------------------------------------
/testCase/MMP-cluster/reference-PDB/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/testCase/MMP-cluster/reference-PDB/.DS_Store
--------------------------------------------------------------------------------
/testCase/MMP-cluster/targeting-MMP/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/testCase/MMP-cluster/targeting-MMP/.DS_Store
--------------------------------------------------------------------------------
/results/MMP-IGHV/IGHV_Only pI Features_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-IGHV/IGHV_Only pI Features_ROC.png
--------------------------------------------------------------------------------
/ASAP/__pycache__/FeatureExtraction.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/ASAP/__pycache__/FeatureExtraction.cpython-36.pyc
--------------------------------------------------------------------------------
/ASAP/__pycache__/FeatureExtraction.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/ASAP/__pycache__/FeatureExtraction.cpython-37.pyc
--------------------------------------------------------------------------------
/results/MMP-IGHV/IGHV_Except pI Features_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-IGHV/IGHV_Except pI Features_ROC.png
--------------------------------------------------------------------------------
/results/MMP-PDB/MMP-cluster_DTreeAllFeature.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_DTreeAllFeature.png
--------------------------------------------------------------------------------
/results/MMP-IGHV/IGHV_All Features Included_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-IGHV/IGHV_All Features Included_ROC.png
--------------------------------------------------------------------------------
/results/MMP-PDB/MMP-cluster_Extracted Features.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_Extracted Features.png
--------------------------------------------------------------------------------
/ASAP/__pycache__/DesignRecommendation.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/ASAP/__pycache__/DesignRecommendation.cpython-36.pyc
--------------------------------------------------------------------------------
/results/MMP-IGHV/IGHV_Except Germline Features_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-IGHV/IGHV_Except Germline Features_ROC.png
--------------------------------------------------------------------------------
/results/MMP-IGHV/IGHV_Only Germline Features_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-IGHV/IGHV_Only Germline Features_ROC.png
--------------------------------------------------------------------------------
/results/MMP-PDB/MMP-cluster_Except pI Features_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_Except pI Features_ROC.png
--------------------------------------------------------------------------------
/results/MMP-PDB/MMP-cluster_Heavy Chain Sequences.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_Heavy Chain Sequences.png
--------------------------------------------------------------------------------
/results/MMP-PDB/MMP-cluster_Light Chain Sequences.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_Light Chain Sequences.png
--------------------------------------------------------------------------------
/results/MMP-PDB/MMP-cluster_Only pI Features_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_Only pI Features_ROC.png
--------------------------------------------------------------------------------
/ASAP/__pycache__/SequenceAndFeatureAnalysis.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/ASAP/__pycache__/SequenceAndFeatureAnalysis.cpython-36.pyc
--------------------------------------------------------------------------------
/ASAP/__pycache__/SequenceAndFeatureAnalysis.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/ASAP/__pycache__/SequenceAndFeatureAnalysis.cpython-37.pyc
--------------------------------------------------------------------------------
/results/MMP-PDB/MMP-cluster_All Features Included_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_All Features Included_ROC.png
--------------------------------------------------------------------------------
/results/MMP-PDB/MMP-cluster_Only Germline Features_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_Only Germline Features_ROC.png
--------------------------------------------------------------------------------
/results/MMP-PDB/MMP-cluster_Except Germline Features_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_Except Germline Features_ROC.png
--------------------------------------------------------------------------------
/results/MMP-IGHV/IGHV_Except CDR Canonical Structure Features_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-IGHV/IGHV_Except CDR Canonical Structure Features_ROC.png
--------------------------------------------------------------------------------
/results/MMP-IGHV/IGHV_Only CDR Canonical Structure Features_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-IGHV/IGHV_Only CDR Canonical Structure Features_ROC.png
--------------------------------------------------------------------------------
/results/MMP-IGHV/IGHV_Only Frequent Positional Motif Features_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-IGHV/IGHV_Only Frequent Positional Motif Features_ROC.png
--------------------------------------------------------------------------------
/results/MMP-IGHV/IGHV_Except Frequent Positional Motif Features_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-IGHV/IGHV_Except Frequent Positional Motif Features_ROC.png
--------------------------------------------------------------------------------
/results/MMP-PDB/MMP-cluster_Only CDR Canonical Structure Features_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_Only CDR Canonical Structure Features_ROC.png
--------------------------------------------------------------------------------
/results/MMP-PDB/MMP-cluster_Except CDR Canonical Structure Features_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_Except CDR Canonical Structure Features_ROC.png
--------------------------------------------------------------------------------
/results/MMP-PDB/MMP-cluster_Only Frequent Positional Motif Features_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_Only Frequent Positional Motif Features_ROC.png
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/results/MMP-PDB/MMP-cluster_All Features Included(Exclude Correlated)_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_All Features Included(Exclude Correlated)_ROC.png
--------------------------------------------------------------------------------
/results/MMP-PDB/MMP-cluster_Except Frequent Positional Motif Features_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_Except Frequent Positional Motif Features_ROC.png
--------------------------------------------------------------------------------
/results/MMP-PDB/MMP-cluster_Only Germline Features(Exclude Correlated)_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_Only Germline Features(Exclude Correlated)_ROC.png
--------------------------------------------------------------------------------
/results/MMP-PDB/MMP-cluster_Except Germline Features(Exclude Correlated)_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_Except Germline Features(Exclude Correlated)_ROC.png
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/results/MMP-PDB/MMP-cluster_Only CDR Canonical Structure Features(Exclude Correlated)_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_Only CDR Canonical Structure Features(Exclude Correlated)_ROC.png
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/ASAP-1.0.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright 2018 Xinmeng Li
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4 |
5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6 |
7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/data/pigs_canonical.txt:
--------------------------------------------------------------------------------
1 | L1 1 6 29 VIL
2 | L1 2 7 29 VIL
3 | L1 3 13 29 VIL
4 | L1 4 12 29 VIL
5 | L1 5 11 29 VIL
6 | L1 6 8 29 VIL
7 | L2 1 3
8 | L3 1 6 95 P 90 HNQ
9 | L3 2 6 94 P 90 Q
10 | L3 3 5 96 P 90 Q
11 | L3 4 4 90 Q
12 | L3 5 7 95A P 90 Q
13 | L3 6 5 90 Q 94 L
14 | H1 1 7
15 | H1 2 8
16 | H1 3 9
17 | H1 4 6
18 | H2 1 3 71 AVL
19 | H2 2 3 71 RK
20 | H2 3 4 71 AVL
21 | H2 4 4 71 RK
22 | H2 5 6 71 AVL
23 | H2 6 6 71 RK
24 | H3 2 11 94 ACDEFGHILMNPQSTVYW
25 | H3 3 11 94 RK
26 | H3 1 10
27 | H3 2 12 94 ACDEFGHILMNPQSTVYW
28 | H3 3 12 94 RK
29 | H3 2 13 94 ACDEFGHILMNPQSTVYW
30 | H3 3 13 94 RK
31 | H3 2 14 94 ACDEFGHILMNPQSTVYW
32 | H3 3 14 94 RK
33 | H3 2 15 94 ACDEFGHILMNPQSTVYW
34 | H3 3 15 94 RK
35 | H3 2 16 94 ACDEFGHILMNPQSTVYW
36 | H3 3 16 94 RK
37 | H3 2 17 94 ACDEFGHILMNPQSTVYW
38 | H3 3 17 94 RK
39 | H3 2 18 94 ACDEFGHILMNPQSTVYW
40 | H3 3 18 94 RK
41 | H3 2 19 94 ACDEFGHILMNPQSTVYW
42 | H3 3 19 94 RK
43 | H3 2 20 94 ACDEFGHILMNPQSTVYW
44 | H3 3 20 94 RK
45 | H3 2 21 94 ACDEFGHILMNPQSTVYW
46 | H3 3 21 94 RK
47 | H3 2 22 94 ACDEFGHILMNPQSTVYW
48 | H3 3 22 94 RK
49 | H3 2 23 94 ACDEFGHILMNPQSTVYW
50 | H3 3 23 94 RK
51 | H3 2 24 94 ACDEFGHILMNPQSTVYW
52 | H3 3 24 94 RK
53 | H3 2 25 94 ACDEFGHILMNPQSTVYW
54 | H3 3 25 94 RK
55 | H3 2 26 94 ACDEFGHILMNPQSTVYW
56 | H3 3 26 94 RK
57 | H3 2 27 94 ACDEFGHILMNPQSTVYW
58 | H3 3 27 94 RK
--------------------------------------------------------------------------------
/data/blosum62.csv:
--------------------------------------------------------------------------------
1 | A,R,N,D,C,Q,E,G,H,I,L,K,M,F,P,S,T,W,Y,V,B,Z,X,_
2 | 4,-1,-2,-2,0,-1,-1,0,-2,-1,-1,-1,-1,-2,-1,1,0,-3,-2,0,-2,-1,0,-4
3 | -1,5,0,-2,-3,1,0,-2,0,-3,-2,2,-1,-3,-2,-1,-1,-3,-2,-3,-1,0,-1,-4
4 | -2,0,6,1,-3,0,0,0,1,-3,-3,0,-2,-3,-2,1,0,-4,-2,-3,3,0,-1,-4
5 | -2,-2,1,6,-3,0,2,-1,-1,-3,-4,-1,-3,-3,-1,0,-1,-4,-3,-3,4,1,-1,-4
6 | 0,-3,-3,-3,9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-3,-3,-2,-4
7 | -1,1,0,0,-3,5,2,-2,0,-3,-2,1,0,-3,-1,0,-1,-2,-1,-2,0,3,-1,-4
8 | -1,0,0,2,-4,2,5,-2,0,-3,-3,1,-2,-3,-1,0,-1,-3,-2,-2,1,4,-1,-4
9 | 0,-2,0,-1,-3,-2,-2,6,-2,-4,-4,-2,-3,-3,-2,0,-2,-2,-3,-3,-1,-2,-1,-4
10 | -2,0,1,-1,-3,0,0,-2,8,-3,-3,-1,-2,-1,-2,-1,-2,-2,2,-3,0,0,-1,-4
11 | -1,-3,-3,-3,-1,-3,-3,-4,-3,4,2,-3,1,0,-3,-2,-1,-3,-1,3,-3,-3,-1,-4
12 | -1,-2,-3,-4,-1,-2,-3,-4,-3,2,4,-2,2,0,-3,-2,-1,-2,-1,1,-4,-3,-1,-4
13 | -1,2,0,-1,-3,1,1,-2,-1,-3,-2,5,-1,-3,-1,0,-1,-3,-2,-2,0,1,-1,-4
14 | -1,-1,-2,-3,-1,0,-2,-3,-2,1,2,-1,5,0,-2,-1,-1,-1,-1,1,-3,-1,-1,-4
15 | -2,-3,-3,-3,-2,-3,-3,-3,-1,0,0,-3,0,6,-4,-2,-2,1,3,-1,-3,-3,-1,-4
16 | -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4,7,-1,-1,-4,-3,-2,-2,-1,-2,-4
17 | 1,-1,1,0,-1,0,0,0,-1,-2,-2,0,-1,-2,-1,4,1,-3,-2,-2,0,0,0,-4
18 | 0,-1,0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1,1,5,-2,-2,0,-1,-1,0,-4
19 | -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1,1,-4,-3,-2,11,2,-3,-4,-3,-2,-4
20 | -2,-2,-2,-3,-2,-1,-2,-3,2,-1,-1,-2,-1,3,-3,-2,-2,2,7,-1,-3,-2,-1,-4
21 | 0,-3,-3,-3,-1,-2,-2,-3,-3,3,1,-2,1,-1,-2,-2,0,-3,-1,4,-3,-2,-1,-4
22 | -2,-1,3,4,-3,0,1,-1,0,-3,-4,0,-3,-3,-2,0,-1,-4,-3,-3,4,1,-1,-4
23 | -1,0,0,1,-3,3,4,-2,0,-3,-3,1,-1,-3,-1,0,-1,-3,-2,-2,1,4,-1,-4
24 | 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2,0,0,-2,-1,-1,-1,-1,-1,-4
25 | -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,1
--------------------------------------------------------------------------------
/results/MMP-IGHV/IGHV_RankFisherAndFS.csv:
--------------------------------------------------------------------------------
1 | Feature, Feature Value,Fisher Test p-value, Feature Selection (thereshold = 0.0167),Rank of Statistic Significancy, Rank of Feature Selection
2 | Canonical H1,0,1,0.000480165,,
3 | Canonical H1,1,0.774906367,0.000600854,,
4 | Canonical H1,2,1,0.000175353,,
5 | Canonical H1,3,1,0,,
6 | Canonical H2,0,0.535049709,0.012889063,,
7 | Canonical H2,6,0.644296474,0.010232892,,
8 | Canonical H3,0,0.083794647,0.01479616,,
9 | Canonical H3,1,0.234700323,0.013119923,,
10 | Canonical H3,2,0.002614369,0.033944391,7.62,8
11 | Canonical H3,3,0.999938155,0.053446578,,2
12 | Germ HJ,IGHJ1*01,0.688488693,0.004603937,,
13 | Germ HJ,IGHJ2*01,0.354049029,0.004012455,,
14 | Germ HJ,IGHJ3*01,0.576535581,0.003012808,,
15 | Germ HJ,IGHJ3*02,1.32E-05,0.047185683,1.29,3
16 | Germ HJ,IGHJ4*02,0.553358944,0.034656349,,7
17 | Germ HJ,IGHJ5*01,0.96537164,0.015574391,,
18 | Germ HJ,IGHJ5*02,0.197274561,0.018123477,,19
19 | Germ HJ,IGHJ6*01,0.999947777,0.066944116,,1
20 | Germ HJ,IGHJ6*04,0.000416969,0.032812578,6.36,9
21 | Motif,10_YY,0.07051444,0.016529828,,
22 | Motif,10_YYG,0.053349784,0.014285162,,
23 | Motif,10_YYY,0.343316239,0.011916447,,
24 | Motif,2_GG,1,0.008545674,,
25 | Motif,2_GS,0.762847397,0.007176706,,
26 | Motif,2_YG,0.005067609,0.036587558,10.51,6
27 | Motif,2_YY,0.102581789,0.018412503,,18
28 | Motif,3_SG,0.624699976,0.017795207,,20
29 | Motif,3_SS,0.617371164,0.008419981,,
30 | Motif,3_YY,0.001265975,0.032171889,6.41,11
31 | Motif,3_YYD,0.002689351,0.00459369,7.61,
32 | Motif,4_SG,0.955215392,0.009229585,,
33 | Motif,4_SS,0.305268831,0.012016952,,
34 | Motif,4_YD,0.000317996,0.026747743,3.51,14
35 | Motif,4_YDS,0.002069412,0.006805006,7.59,
36 | Motif,5_DS,0.004330063,0.007100598,10.42,
37 | Motif,5_SG,0.626303426,0.010727004,,
38 | Motif,5_YY,0.026298446,0.037488145,15.28,5
39 | Motif,6_SG,0.509513574,0.022391224,,16
40 | Motif,6_SS,0.023126499,0.004795634,15.85,
41 | Motif,6_SSG,0.002423729,0.008186862,8.61,
42 | Motif,6_YY,0.937239799,0.011581004,,
43 | Motif,7_SG,0.028197383,0.007329619,16.75,
44 | Motif,7_SGY,0.003847006,0.003018869,10.29,
45 | Motif,7_YY,0.846405186,0.014090079,,
46 | Motif,7_YYY,0.591550617,0.004151414,,
47 | Motif,8_GY,0.012402538,0.010409138,12.7,
48 | Motif,8_YY,0.364641908,0.02334784,,15
49 | Motif,8_YYY,0.27775982,0.005765425,,
50 | Motif,9_FD,0.555232757,0.019582355,,17
51 | Motif,9_YY,0.017733467,0.01198022,13.5,
52 | Motif,9_YYY,0.0259799,0.009162702,14.58,
53 | PI,0.0-3.5,0.018313867,0.032143423,13.67,12
54 | PI,3.5-3.9375,0.08010366,0.039139254,,4
55 | PI,3.9375-4.375,0.485241177,0.032641723,,10
56 | PI,4.375-4.8125,0.901919558,0.009318928,,
57 | PI,4.8125-5.25,0.936906848,0.015848292,,
58 | PI,5.25-5.6875,0.824022061,0.010279235,,
59 | PI,5.6875-6.125,0.904667929,0.027072376,,13
60 | PI,6.125-7.0,0.945280631,0.009723836,,
61 | PI,7.0-14.0,0.966549563,0.014879694,,
--------------------------------------------------------------------------------
/ASAP/DesignRecommendation.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pydotplus
3 | from sklearn import tree
4 |
5 | SET_NAME = 'MMP-cluster'  # data-set label; MultiDecisionTree uses it as the prefix of the PNG file name it writes under ./results/
6 | IF_ONLY_HEAVY = False  # NOTE(review): not read in this module; presumably limits the analysis to heavy chains -- confirm against callers
7 | CNT_DB = 2  # NOTE(review): not read in this module; what is being counted is not visible here -- confirm
8 | CNT_TARGET = 1  # NOTE(review): not read in this module; what is being counted is not visible here -- confirm
9 | REFERENCE_PATH_TESTCASE = './testCase/MMP-cluster/reference-PDB/'  # relative path; not read in this module, presumably the reference-set input directory -- confirm
10 | TARGETING_PATH_TESTCASE = './testCase/MMP-cluster/targeting-MMP/'  # relative path; not read in this module, presumably the targeting-set input directory -- confirm
11 | TARGET_DESIRE_SIZE = 166 #44 #MMP-cluster  (NOTE(review): not read in this module; 44 looks like an alternative value kept for reference -- confirm)
12 |
13 |
14 |
15 | #################################################################################################################
16 | # function SanityFeature:
17 | # Omit features that are not usable as recommendations (motif features and type 0 canonical structures) from the feature values given to the decision tree
18 | #
19 | # Input: AgreeFeature, AllFeatureNames
20 | # Output: SanityAgreeFeature, a list of indices into AllFeatureNames for the features that remain as decision tree inputs
21 | #################################################################################################################
22 |
def SanityFeature(AgreeFeature, AllFeatureNames):
    """Return the indices from AgreeFeature whose features may go into the decision tree.

    A feature name is split on '_'.  The index is dropped when the first
    part is 'Motif', or when the first part is 'Canonical' and the third
    part is '0'.  Every other index is kept, in its original order.

    AgreeFeature    -- iterable of indices into AllFeatureNames
    AllFeatureNames -- sequence of '_'-separated feature name strings

    Raises IndexError if a 'Canonical' name has fewer than three parts.
    """
    def _keep(feature_name):
        parts = feature_name.split('_')
        family = parts[0]
        if family == 'Motif':
            return False
        if family == 'Canonical':
            # Third field is the canonical-structure type; type '0' is excluded.
            return parts[2] != '0'
        return True

    return [idx for idx in AgreeFeature if _keep(AllFeatureNames[idx])]
29 |
30 | #################################################################################################################
31 | # function MultiDecisionTree:
32 | # Decision tree drawn with the combined data across multiple iterations
33 | #
34 | # Input: iterate, X_IDS, Y_IDS, AllFeatureNames, type
35 | #################################################################################################################
def MultiDecisionTree(iterate, X_IDS, Y_IDS, AllFeatureNames, type):
    """Fit one decision tree on the data pooled over all iterations and save it as a PNG.

    iterate         -- number of iterations; the first `iterate` entries of X_IDS are used
    X_IDS           -- list of per-iteration 2-D feature matrices (rows = samples,
                       columns aligned with AllFeatureNames).  Each used entry is
                       replaced in place by its numpy array, as in the original code.
    Y_IDS           -- list of per-iteration label sequences, concatenated along axis 0
    AllFeatureNames -- feature names, one per column of the matrices
    type            -- string inserted into the output file name

    The image is written to "./results/<SET_NAME>_DTree<type>.png".  Nothing is returned.
    """
    Y = np.concatenate(Y_IDS, axis=0)

    # Consider every feature column, then drop the ones SanityFeature rejects.
    SanityAgreeFeature = SanityFeature(list(range(len(AllFeatureNames))), AllFeatureNames)
    SanityAgreeFeatureName = [AllFeatureNames[idx] for idx in SanityAgreeFeature]

    Sig_X_DS = []
    for i in range(iterate):
        # Conversion is stored back into X_IDS so callers keep seeing arrays afterwards.
        X_IDS[i] = np.array(X_IDS[i])
        Sig_X_DS.append(X_IDS[i][:, SanityAgreeFeature])
    X = np.concatenate(Sig_X_DS, axis=0)

    # 2.5% of the samples per leaf.  With fewer than 40 samples the truncated
    # value is 0, which scikit-learn rejects for an integer min_samples_leaf,
    # so clamp to the smallest legal value.
    minLeafSize = max(1, int(0.025 * len(Y)))
    clf = tree.DecisionTreeClassifier(min_samples_leaf=minLeafSize)
    # Flip the X for the decision tree to meet the true/false branches.
    clf = clf.fit(1.0 - X, Y)

    dot_data = tree.export_graphviz(
        clf,
        out_file=None,
        filled=True,
        feature_names=SanityAgreeFeatureName,
        class_names=['Reference', 'Targeting'],
        rounded=True,
    )
    pydotplus.graph_from_dot_data(dot_data).write_png("./results/"+ SET_NAME + "_DTree"+ type +".png")
58 |
59 |
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: asap
2 | channels:
3 | - conda-forge/label/cf201901
4 | - anaconda
5 | - conda-forge
6 | - defaults
7 | dependencies:
8 | - ca-certificates=2019.11.27=0
9 | - certifi=2019.11.28=py36_0
10 | - openssl=1.1.1=h1de35cc_0
11 | - pandas=0.25.3=py36h0a44026_0
12 | - pytz=2019.3=py_0
13 | - appnope=0.1.0=py36_1000
14 | - attrs=19.3.0=py_0
15 | - backcall=0.1.0=py_0
16 | - bleach=3.1.0=py_0
17 | - cycler=0.10.0=py_2
18 | - decorator=4.4.1=py_0
19 | - defusedxml=0.6.0=py_0
20 | - entrypoints=0.3=py36_1000
21 | - freetype=2.10.0=h24853df_1
22 | - graphviz=2.42.3=h98dfb87_0
23 | - icu=58.2=h0a44026_1000
24 | - importlib_metadata=1.3.0=py36_0
25 | - ipykernel=5.1.3=py36h5ca1d4c_0
26 | - ipython=7.11.0=py36h5ca1d4c_0
27 | - ipython_genutils=0.2.0=py_1
28 | - ipywidgets=7.5.1=py_0
29 | - jedi=0.15.2=py36_0
30 | - jinja2=2.10.3=py_0
31 | - joblib=0.14.1=py_0
32 | - jsonschema=3.2.0=py36_0
33 | - jupyter=1.0.0=py_2
34 | - jupyter_client=5.3.3=py36_1
35 | - jupyter_console=5.1.0=py36_0
36 | - jupyter_core=4.6.1=py36_0
37 | - kiwisolver=1.1.0=py36ha1b3eb9_0
38 | - libblas=3.8.0=14_openblas
39 | - libcblas=3.8.0=14_openblas
40 | - libcxx=9.0.0=h89e68fa_1
41 | - libffi=3.2.1=h6de7cb9_1006
42 | - libgfortran=4.0.0=2
43 | - liblapack=3.8.0=14_openblas
44 | - libopenblas=0.3.7=h3d69b6c_4
45 | - libpng=1.6.37=h2573ce8_0
46 | - libsodium=1.0.17=h01d97ff_0
47 | - libtiff=4.1.0=ha78913b_1
48 | - llvm-openmp=8.0.1=h770b8ee_0
49 | - lz4-c=1.8.3=h6de7cb9_1001
50 | - markupsafe=1.1.1=py36h0b31af3_0
51 | - matplotlib=3.1.2=py36_1
52 | - matplotlib-base=3.1.2=py36h11da6c2_1
53 | - mistune=0.8.4=py36h0b31af3_1000
54 | - more-itertools=8.0.2=py_0
55 | - nbconvert=5.6.1=py36_0
56 | - nbformat=4.4.0=py_1
57 | - ncurses=6.1=h0a44026_1002
58 | - notebook=6.0.1=py36_0
59 | - numpy=1.17.3=py36hde6bac1_0
60 | - pandoc=2.9.1=0
61 | - pandocfilters=1.4.2=py_1
62 | - parso=0.5.2=py_0
63 | - pexpect=4.7.0=py36_0
64 | - pickleshare=0.7.5=py36_1000
65 | - pip=19.3.1=py36_0
66 | - prometheus_client=0.7.1=py_0
67 | - prompt_toolkit=3.0.2=py_0
68 | - ptyprocess=0.6.0=py_1001
69 | - pydot=1.4.1=py36_1001
70 | - pydotplus=2.0.2=pyhd1c1de3_3
71 | - pygments=2.5.2=py_0
72 | - pyparsing=2.4.6=py_0
73 | - pyqt=5.6.0=py36hc26a216_1008
74 | - pyrsistent=0.15.6=py36h0b31af3_0
75 | - python=3.6.7=h8dc6b48_1004
76 | - python-dateutil=2.8.1=py_0
77 | - pyzmq=18.1.1=py36h4bf09a9_0
78 | - qt=5.6.2=h822fa55_1013
79 | - qtconsole=4.6.0=py_0
80 | - scikit-learn=0.21.3=py36hd4ffd6c_0
81 | - scipy=1.4.1=py36h82752d6_0
82 | - send2trash=1.5.0=py_0
83 | - setuptools=42.0.2=py36_0
84 | - sip=4.18.1=py36h0a44026_1000
85 | - six=1.13.0=py36_0
86 | - terminado=0.8.3=py36_0
87 | - testpath=0.4.4=py_0
88 | - tk=8.6.10=hbbe82c9_0
89 | - tornado=6.0.3=py36h0b31af3_0
90 | - traitlets=4.3.3=py36_0
91 | - wcwidth=0.1.7=py_1
92 | - webencodings=0.5.1=py_1
93 | - wheel=0.33.6=py36_0
94 | - widgetsnbextension=3.5.1=py36_0
95 | - xz=5.2.4=h1de35cc_1001
96 | - zeromq=4.3.2=h6de7cb9_2
97 | - zipp=0.6.0=py_0
98 | - zlib=1.2.11=h0b31af3_1006
99 | - zstd=1.4.4=he7fca8b_1
100 | - biopython=1.72=py36h470a237_0
101 | - jpeg=9c=h470a237_1
102 | - readline=7.0=haf1bffa_1
103 | - sqlite=3.26.0=hb1c47c0_0
104 |
105 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ASAP-SML: An Antibody Sequence Analysis Pipeline Using Statistical Testing and Machine Learning
2 |
3 | Antibody Sequence Analysis Pipeline Using Statistical Testing and Machine Learning (ASAP-SML) is a pipeline to identify distinguishing features in a targeting antibody set when compared to a reference non-targeting set. The pipeline first extracts germline, CDR canonical structure, isoelectric point and frequent positional motifs features from sequences and creates an antibody feature fingerprint. Machine-learning and statistical significance testing are applied to antibody sequences and feature fingerprints to identify distinguishing feature values and combinations thereof. When applied to an MMP-targeting set, ASAP identifies salient features and recommends features to use when designing novel MMP-targeting antibody sequences.
4 |
5 | ## How to install
6 | ### Requirements:
7 | An [Anaconda python environment](https://www.anaconda.com/download) is recommended.
8 | Check the environment.yml file, but primarily:
9 | - python >= 3.5
10 | - pandas
11 | - graphviz
12 | - jupyter
13 | - numpy
14 | - scikit-learn
15 | - scipy
16 | - biopython
17 |
18 | Jupyter notebook is required to run the ipynb examples.
19 |
20 | ### via Anaconda
21 | We recommend installing using Anaconda as follows:
22 | ```
23 | conda env create --name asap --file environment.yml
24 | source activate asap
25 | ```
26 |
27 | ## Example: Matrix Metalloproteinases (MMP) targeting and reference antibody sequence set
28 |
29 | This repository contains an example of how to run the ASAP pipeline on the MMP-targeting and reference antibody sequence set.
30 |
31 | To run the script, open the terminal and go to the project directory, then run:
32 |
33 | `
34 | jupyter notebook
35 | `
36 |
37 | Take a look at the file "ASAP.ipynb". Parameters are set based on the user's choice. Once you have set the parameters, run the notebook document step-by-step (one cell at a time) by
38 |
39 | - Pressing shift + enter
40 |
41 | Or, run the whole notebook in a single step by
42 |
43 | - Clicking on the menu Cell -> Run All.
44 |
45 | ## Components
46 | ASAP.ipynb : main script for running ASAP pipeline
47 |
48 | - **./ASAP/FeatureExtraction.py** - functions for feature extraction on Chothia numbered antibody sequences.
49 | - **./ASAP/SequenceAndFeatureAnalysis.py** - functions for sequence and feature analysis on antibody sequences.
50 | - **./ASAP/DesignRecommendation.py** - functions to generate design recommendation trees for specific targeting antibody sequences.
51 |
52 | ## Data
53 |
54 | - Data to run ASAP: [BLOSUM-62 substitution matrix](https://en.wikipedia.org/wiki/BLOSUM#cite_ref-henikoff_1-0) and [Canonical Structure Definition](http://circe.med.uniroma1.it/pigs/canonical.php)
55 |
56 | - Data to run ASAP on MMP-targeting example: MMP-targeting and reference set.
57 |
58 | MMP-targeting set is composed of publicly available antibody sequence data. Reference set is from the Protein Data Bank (PDB) and it consists of human and murine antibody sequences that do not bind or inhibit MMPs. Please see our paper for details.
59 |
60 | ## Authors:
61 | This software is written by Xinmeng Li, James Van Deventer, Soha Hassoun (Soha.Hassoun@tufts.edu).
62 |
63 | Publication: ["ASAP-SML: An Antibody Sequence Analysis Pipeline Using Statistical Testing and Machine Learning"](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1007779)
64 |
65 | **Please cite our work:**
66 |
67 | Li, Xinmeng, James A. Van Deventer, and Soha Hassoun. "ASAP-SML: An antibody sequence analysis pipeline using statistical testing and machine learning." PLoS computational biology 16.4 (2020): e1007779.
68 |
69 | ## License
70 |
71 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details
72 |
73 |
--------------------------------------------------------------------------------
/ASAP.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "#################################################################################################################\n",
10 | "# #\n",
11 | "# Section 1 Data Preparation #\n",
12 | "# #\n",
13 | "#################################################################################################################\n",
14 | "\n",
15 | "\n",
16 | "######################################### User define variables ###############################################\n",
17 | "\n",
18 | "# User Choice C_SAMPLE_Test\n",
19 | "# Run test case for MMP? (y/n)\n",
20 | "# If True: Default testCase-MMP files \n",
21 | "# If False: User upload Chothia-numbered sequences files to \"targeting\" and \"reference\" folders under \"./user/data/\" respectively.\n",
22 | "# Default: True\n",
23 | "\n",
24 | "C_SAMPLE_Test = True\n",
25 | "\n",
26 | "# User Choice C_PIGS\n",
27 | "# Use PIGS template for CDR canonical structure? (y/n)\n",
28 | "# If True: Default PIGS CDR Canonical structure template under Chothia numbering\n",
29 | "# If False: User upload formatted CDR Canonical structure template under \"./user/data/\"\n",
30 | "# Default: True\n",
31 | "\n",
32 | "C_PIGS = True\n",
33 | "\n",
34 | "# User Choice C_DesireSize\n",
35 | "# Use default desire size for targeting dataset? (y/n)\n",
36 | "# If True: Default desire size, 166 for the MMP test case, medium for user upload files\n",
37 | "# If False: User define desire size for targeting dataset\n",
38 | "# Default: True\n",
39 | "\n",
40 | "C_DesireSize = True\n",
41 | "\n",
42 | "# User Choice C_k\n",
43 | "# Use default number of iterations? (y/n)\n",
44 | "# If True: Default number of iterations, k = 100\n",
45 | "# If False: User define number of iterations\n",
46 | "# Default: True\n",
47 | "C_k = True\n",
48 | "\n",
49 | "\n",
50 | "####################################### Define global variables ###############################################\n",
51 | "\n",
52 | "\n",
53 | "# SET_NAME = 'IGHV'\n",
54 | "# IF_ONLY_HEAVY = True\n",
55 | "# CNT_DB = 1\n",
56 | "# CNT_TARGET = 1\n",
57 | "# REFERENCE_PATH_TESTCASE = './testCase/IGHV/reference-IGHV/'\n",
58 | "# TARGETING_PATH_TESTCASE = './testCase/IGHV/targeting-MMP-IGHV/'\n",
59 | "# TARGET_DESIRE_SIZE = 134 #44 #IGHV\n",
60 | "\n",
61 | "SET_NAME = 'MMP-cluster'\n",
62 | "IF_ONLY_HEAVY = False\n",
63 | "CNT_DB = 2\n",
64 | "CNT_TARGET = 1\n",
65 | "REFERENCE_PATH_TESTCASE = './testCase/MMP-cluster/reference-PDB/'\n",
66 | "TARGETING_PATH_TESTCASE = './testCase/MMP-cluster/targeting-MMP/'\n",
67 | "TARGET_DESIRE_SIZE = 166\n",
68 | "\n",
69 | "PIGS_PATH = './data/pigs_canonical.txt'\n",
70 | "TEMPLATE_PATH = './user/data/'\n",
71 | "\n",
72 | "\n",
73 | "\n",
74 | "ITERATION = 100\n",
75 | "\n",
76 | "######################################## Determine variable values ############################################\n",
77 | "\n",
78 | "if C_SAMPLE_Test == True:\n",
79 | " targeting_direct = TARGETING_PATH_TESTCASE\n",
80 | " reference_direct = REFERENCE_PATH_TESTCASE\n",
81 | "else:\n",
82 | " print(\"Each pair of light and heavy chain sequence should be in the order of LIGHT/HEAVY/LIGHT/HEAVY\")\n",
83 | " targeting_direct = TARGETING_PATH\n",
84 | " reference_direct = REFERENCE_PATH\n",
85 | " \n",
86 | "if C_PIGS == True:\n",
87 | " canonical_direct = PIGS_PATH\n",
88 | "else:\n",
89 | " print(\"Upload CDR canonical structure templates. \")\n",
90 | " print(\"In the template, the first column must be the L1, L2, L3, H1, H2, or H3, \")\n",
91 | " print(\"the second column is the length of the region defined in the first column, \")\n",
92 | " print(\"starting from the third column, it is the position and candidate amino acid on each position, such as 1 ABC 2 CDETFG.\") \n",
93 | " template_name = input(\"What is the name of the template?\")\n",
94 | " canonical_direct = TEMPLATE_PATH + template_name\n",
95 | " \n",
96 | "if C_SAMPLE_Test == True and C_DesireSize == True:\n",
97 | " size = TARGET_DESIRE_SIZE\n",
98 | "elif C_SAMPLE_Test == False and C_DesireSize == True:\n",
99 | " size = 'medium'\n",
100 | "else:\n",
101 | " size = int(input('What is the desire size for the targeting set?'))\n",
102 | " \n",
103 | "if C_k == True:\n",
104 | " iterate = ITERATION\n",
105 | "else:\n",
106 | " iterate = int(input(\"What is the number of iterations?\"))\n"
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": 2,
112 | "metadata": {},
113 | "outputs": [
114 | {
115 | "name": "stdout",
116 | "output_type": "stream",
117 | "text": [
118 | "Data:\n",
119 | "r1 : 276\n",
120 | "r2 : 219\n",
121 | "t1 : 166\n",
122 | "Sum: 661\n",
123 | "\n",
124 | "Number of feature values:\n",
125 | "Germline: 334\n",
126 | "CDR canonical structures: 20\n",
127 | "Isoelectric points (pI): 8\n",
128 | "Frequent positional motif: 42\n",
129 | "Total: 404\n"
130 | ]
131 | }
132 | ],
133 | "source": [
134 | "#################################################################################################################\n",
135 | "# #\n",
136 | "# Section 2 Feature Extraction #\n",
137 | "# #\n",
138 | "#################################################################################################################\n",
139 | "\n",
140 | "\n",
141 | "############################################ Import libaries ##################################################\n",
142 | "\n",
143 | "import ASAP.FeatureExtraction as extract\n",
144 | "\n",
145 | "\n",
146 | "############################################ Function calls ##################################################\n",
147 | "\n",
148 | "Amino, Num, Germ, DatasetName, DatasetSize = extract.ReadAminoNumGerm(targeting_direct, reference_direct)\n",
149 | "\n",
150 | "OneHotGerm, GermFeatureNames = extract.GetOneHotGerm(Germ, DatasetSize, DatasetName)\n",
151 | "\n",
152 | "OneHotCanon, CanonFeatureNames = extract.GetOneHotCanon(canonical_direct, Amino, Num, DatasetSize, DatasetName)\n",
153 | "\n",
154 | "CDRH3 = extract.GetCDRH3(Amino, Num)\n",
155 | "\n",
156 | "OneHotPI, PIFeatureNames = extract.GetOneHotPI(CDRH3, DatasetSize, DatasetName)\n",
157 | "\n",
158 | "MultiHotMotif, MotifFeatureNames = extract.MultiHotMotif(CDRH3, DatasetSize, DatasetName)\n",
159 | "\n",
160 | "AllFeatureVectors, AllFeatureNames, ExcludeIGHVVectors, ExcludeFeatureNames = extract.GetFeatureVectors(OneHotGerm, GermFeatureNames, OneHotCanon, CanonFeatureNames, OneHotPI, PIFeatureNames, MultiHotMotif, MotifFeatureNames)\n",
161 | "\n",
162 | "\n",
163 | "############################################ Report section results #############################################\n",
164 | "\n",
165 | "print(\"Data:\")\n",
166 | "for i in range(len(DatasetSize)):\n",
167 | " print(DatasetName[i], \":\",DatasetSize[i],)\n",
168 | "print(\"Sum:\", sum(DatasetSize))\n",
169 | "\n",
170 | "print(\"\\nNumber of feature values:\")\n",
171 | "print(\"Germline:\", len(GermFeatureNames),)\n",
172 | "print(\"CDR canonical structures:\", len(CanonFeatureNames),)\n",
173 | "print(\"Isoelectric points (pI):\", len(PIFeatureNames),)\n",
174 | "print(\"Frequent positional motif:\",len(MotifFeatureNames),)\n",
175 | "print(\"Total:\", len(AllFeatureNames))"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": 3,
181 | "metadata": {},
182 | "outputs": [
183 | {
184 | "name": "stderr",
185 | "output_type": "stream",
186 | "text": [
187 | "/Users/xinmeng/anaconda3/envs/homework/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n",
188 | " return f(*args, **kwds)\n"
189 | ]
190 | },
191 | {
192 | "name": "stdout",
193 | "output_type": "stream",
194 | "text": [
195 | "RanksumsResult(statistic=-147.48812830271845, pvalue=0.0) RanksumsResult(statistic=-191.24982487069587, pvalue=0.0)\n",
196 | "Statistical tests (Reference against Targeting) succeed.\n",
197 | "(661, 404) 495 166\n",
198 | "Average AUC with all features: \n",
199 | "SVM\t\t 0.9900013753999437\n",
200 | "Random forest\t 0.9858123420498757\n",
201 | "AdaBoost\t 0.985921855997419\n"
202 | ]
203 | }
204 | ],
205 | "source": [
206 | "#################################################################################################################\n",
207 | "# #\n",
208 | "# Section 3 Sequence and Feature Analysis #\n",
209 | "# #\n",
210 | "#################################################################################################################\n",
211 | "\n",
212 | "############################################ Import libaries ##################################################\n",
213 | "\n",
214 | "import ASAP.SequenceAndFeatureAnalysis as analysis\n",
215 | "\n",
216 | "############################################ Function calls ##################################################\n",
217 | "\n",
218 | "X_IDS, Y_IDS, SeqName_IDS = analysis.IterationDuplicateSelectFeature(size, iterate, DatasetName, \n",
219 | " DatasetSize, ExcludeIGHVVectors)\n",
220 | "\n",
221 | "###################### Section 3.1 Sequence and feature similarity analysis (Heat map) ##########################\n",
222 | "\n",
223 | "H_Idist, L_Idist = analysis.HeatmapHL(size, iterate, SeqName_IDS, Amino, Num)\n",
224 | "analysis.Draw_heatmap(size, H_Idist[1], 'Heavy Chain Sequences', DatasetSize)\n",
225 | "if not IF_ONLY_HEAVY:\n",
226 | " analysis.Draw_heatmap(size, L_Idist[1], 'Light Chain Sequences', DatasetSize)\n",
227 | "F_Idist = analysis.HeatmapFeature(size, iterate, X_IDS, ExcludeFeatureNames, MotifFeatureNames)\n",
228 | "analysis.Draw_heatmap(size, F_Idist[0], 'Extracted Features', DatasetSize)\n",
229 | "\n",
230 | "############################### Section 3.2 Similarity analysis (Statistical test) #############################\n",
231 | "\n",
232 | "analysis.MultiRankTest(size, iterate, F_Idist, H_Idist, L_Idist)\n",
233 | "\n",
234 | "####################################### Section 3.3 Salient feature-value analysis ############################ #\n",
235 | "\n",
236 | "analysis.MultiFisherFS(iterate, X_IDS, Y_IDS, DatasetName, DatasetSize, ExcludeIGHVVectors, \n",
237 | " ExcludeFeatureNames)\n",
238 | "\n",
239 | "####################################### Section 3.4 Classification on segments ################################ \n",
240 | "\n",
241 | "analysis.MultiAuc(iterate, X_IDS, Y_IDS)\n",
242 | "analysis.ROCDrawing(X_IDS[0], Y_IDS[0], GermFeatureNames, CanonFeatureNames, PIFeatureNames, MotifFeatureNames, AllFeatureNames)\n",
243 | "\n",
244 | "analysis.JaccardCoefficientAnalysis(AllFeatureVectors, AllFeatureNames, DatasetSize)"
245 | ]
246 | },
247 | {
248 | "cell_type": "code",
249 | "execution_count": 4,
250 | "metadata": {},
251 | "outputs": [],
252 | "source": [
253 | "#################################################################################################################\n",
254 | "# #\n",
255 | "# Section 4 Design Recommendation #\n",
256 | "# #\n",
257 | "#################################################################################################################\n",
258 | "\n",
259 | "############################################ Import libaries ##################################################\n",
260 | "\n",
261 | "import ASAP.DesignRecommendation as design\n",
262 | "\n",
263 | "############################################ Function calls ##################################################\n",
264 | "\n",
265 | "design.MultiDecisionTree(iterate, X_IDS, Y_IDS, ExcludeFeatureNames, 'AllFeature')"
266 | ]
267 | },
268 | {
269 | "cell_type": "code",
270 | "execution_count": null,
271 | "metadata": {},
272 | "outputs": [],
273 | "source": []
274 | }
275 | ],
276 | "metadata": {
277 | "kernelspec": {
278 | "display_name": "Python 3",
279 | "language": "python",
280 | "name": "python3"
281 | },
282 | "language_info": {
283 | "codemirror_mode": {
284 | "name": "ipython",
285 | "version": 3
286 | },
287 | "file_extension": ".py",
288 | "mimetype": "text/x-python",
289 | "name": "python",
290 | "nbconvert_exporter": "python",
291 | "pygments_lexer": "ipython3",
292 | "version": "3.6.8"
293 | }
294 | },
295 | "nbformat": 4,
296 | "nbformat_minor": 1
297 | }
298 |
--------------------------------------------------------------------------------
/.ipynb_checkpoints/ASAP-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "#################################################################################################################\n",
10 | "# #\n",
11 | "# Section 1 Data Preperation #\n",
12 | "# #\n",
13 | "#################################################################################################################\n",
14 | "\n",
15 | "\n",
16 | "######################################### User define variables ###############################################\n",
17 | "\n",
18 | "# User Choice C_MMPTest\n",
19 | "# Run test case for MMP? (y/n)\n",
20 | "# If True: Default testCase-MMP files \n",
21 | "# If False: User upload Chothia-numbered sequences files to \"targeting\" and \"reference\" folders under \"./user/data/\" respectively.\n",
22 | "# Default: True\n",
23 | "\n",
24 | "C_SAMPLE_Test = True\n",
25 | "\n",
26 | "# User Choice C_PIGS\n",
27 | "# Use PIGS template for CDR canonical structure? (y/n)\n",
28 | "# If True: Default PIGS CDR Canonical structure template under Chothia numbering\n",
29 | "# If False: User upload fomatted CDR Canonical structure template under \"./user/data/\"\n",
30 | "# Default: True\n",
31 | "\n",
32 | "C_PIGS = True\n",
33 | "\n",
34 | "# User Choice C_DesireSize\n",
35 | "# Use default desire size for targeting dataset? (y/n)\n",
36 | "# If True: Default desire size, 44 for the MMP test case, medium for user upload files\n",
37 | "# If False: User define desire size for targeting dataset\n",
38 | "# Default: True\n",
39 | "\n",
40 | "C_DesireSize = True\n",
41 | "\n",
42 | "# User Choice C_k\n",
43 | "# Use default number of iterations? (y/n)\n",
44 | "# If True: Default number of iterations, k = 100\n",
45 | "# If False: User define number of iterations\n",
46 | "# Default: True\n",
47 | "C_k = True\n",
48 | "\n",
49 | "\n",
50 | "####################################### Define global variables ###############################################\n",
51 | "\n",
52 | "\n",
53 | "# SET_NAME = 'IGHV'\n",
54 | "# IF_ONLY_HEAVY = True\n",
55 | "# CNT_DB = 1\n",
56 | "# CNT_TARGET = 1\n",
57 | "# REFERENCE_PATH_TESTCASE = './testCase/IGHV/reference-IGHV/'\n",
58 | "# TARGETING_PATH_TESTCASE = './testCase/IGHV/targeting-MMP-IGHV/'\n",
59 | "# TARGET_DESIRE_SIZE = 134 #44 #IGHV\n",
60 | "\n",
61 | "SET_NAME = 'MMP-cluster'\n",
62 | "IF_ONLY_HEAVY = False\n",
63 | "CNT_DB = 2\n",
64 | "CNT_TARGET = 1\n",
65 | "REFERENCE_PATH_TESTCASE = './testCase/MMP-cluster/reference-PDB/'\n",
66 | "TARGETING_PATH_TESTCASE = './testCase/MMP-cluster/targeting-MMP/'\n",
67 | "TARGET_DESIRE_SIZE = 166\n",
68 | "\n",
69 | "PIGS_PATH = './data/pigs_canonical.txt'\n",
70 | "TEMPLATE_PATH = './user/data/'\n",
71 | "\n",
72 | "\n",
73 | "\n",
74 | "ITERATION = 100\n",
75 | "\n",
76 | "######################################## Determine variable values ############################################\n",
77 | "\n",
78 | "if C_SAMPLE_Test == True:\n",
79 | " targeting_direct = TARGETING_PATH_TESTCASE\n",
80 | " reference_direct = REFERENCE_PATH_TESTCASE\n",
81 | "else:\n",
82 | " print(\"Each pair of light and heavy chain sequence should be in the order of LIGHT/HEAVY/LIGHT/HEAVY\")\n",
83 | " targeting_direct = TARGETING_PATH\n",
84 | " reference_direct = REFERENCE_PATH\n",
85 | " \n",
86 | "if C_PIGS == True:\n",
87 | " canonical_direct = PIGS_PATH\n",
88 | "else:\n",
89 | " print(\"Upload CDR canonical structure templates. \")\n",
90 | " print(\"In the template, the first column must be the L1, L2, L3, H1, H2, or H3, \")\n",
91 | " print(\"the second column is the length of the region defined in the first column, \")\n",
92 | " print(\"starting from the third column, it is the position and candidate amino acid on each position, such as 1 ABC 2 CDETFG.\") \n",
93 | " template_name = input(\"What is the name of the template?\")\n",
94 | " canonical_direct = TEMPLATE_PATH + template_name\n",
95 | " \n",
96 | "if C_SAMPLE_Test == True and C_DesireSize == True:\n",
97 | " size = TARGET_DESIRE_SIZE\n",
98 | "elif C_SAMPLE_Test == False and C_DesireSize == True:\n",
99 | " size = 'medium'\n",
100 | "else:\n",
101 | " size = int(input('What is the desire size for the targeting set?'))\n",
102 | " \n",
103 | "if C_k == True:\n",
104 | " iterate = ITERATION\n",
105 | "else:\n",
106 | " iterate = int(input(\"What is the number of iterations?\"))\n"
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": 2,
112 | "metadata": {},
113 | "outputs": [
114 | {
115 | "name": "stdout",
116 | "output_type": "stream",
117 | "text": [
118 | "Data:\n",
119 | "r1 : 276\n",
120 | "r2 : 219\n",
121 | "t1 : 166\n",
122 | "Sum: 661\n",
123 | "\n",
124 | "Number of feature values:\n",
125 | "Germline: 334\n",
126 | "CDR canonical structures: 20\n",
127 | "Isoelectric points (pI): 8\n",
128 | "Frequent positional motif: 42\n",
129 | "Total: 404\n"
130 | ]
131 | }
132 | ],
133 | "source": [
134 | "#################################################################################################################\n",
135 | "# #\n",
136 | "# Section 2 Feature Extraction #\n",
137 | "# #\n",
138 | "#################################################################################################################\n",
139 | "\n",
140 | "\n",
141 | "############################################ Import libaries ##################################################\n",
142 | "\n",
143 | "import ASAP.FeatureExtraction as extract\n",
144 | "\n",
145 | "\n",
146 | "############################################ Function calls ##################################################\n",
147 | "\n",
148 | "Amino, Num, Germ, DatasetName, DatasetSize = extract.ReadAminoNumGerm(targeting_direct, reference_direct)\n",
149 | "\n",
150 | "OneHotGerm, GermFeatureNames = extract.GetOneHotGerm(Germ, DatasetSize, DatasetName)\n",
151 | "\n",
152 | "OneHotCanon, CanonFeatureNames = extract.GetOneHotCanon(canonical_direct, Amino, Num, DatasetSize, DatasetName)\n",
153 | "\n",
154 | "CDRH3 = extract.GetCDRH3(Amino, Num)\n",
155 | "\n",
156 | "OneHotPI, PIFeatureNames = extract.GetOneHotPI(CDRH3, DatasetSize, DatasetName)\n",
157 | "\n",
158 | "MultiHotMotif, MotifFeatureNames = extract.MultiHotMotif(CDRH3, DatasetSize, DatasetName)\n",
159 | "\n",
160 | "AllFeatureVectors, AllFeatureNames, ExcludeIGHVVectors, ExcludeFeatureNames = extract.GetFeatureVectors(OneHotGerm, GermFeatureNames, OneHotCanon, CanonFeatureNames, OneHotPI, PIFeatureNames, MultiHotMotif, MotifFeatureNames)\n",
161 | "\n",
162 | "\n",
163 | "############################################ Report section results #############################################\n",
164 | "\n",
165 | "print(\"Data:\")\n",
166 | "for i in range(len(DatasetSize)):\n",
167 | " print(DatasetName[i], \":\",DatasetSize[i],)\n",
168 | "print(\"Sum:\", sum(DatasetSize))\n",
169 | "\n",
170 | "print(\"\\nNumber of feature values:\")\n",
171 | "print(\"Germline:\", len(GermFeatureNames),)\n",
172 | "print(\"CDR canonical structures:\", len(CanonFeatureNames),)\n",
173 | "print(\"Isoelectric points (pI):\", len(PIFeatureNames),)\n",
174 | "print(\"Frequent positional motif:\",len(MotifFeatureNames),)\n",
175 | "print(\"Total:\", len(AllFeatureNames))"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": 3,
181 | "metadata": {},
182 | "outputs": [
183 | {
184 | "name": "stderr",
185 | "output_type": "stream",
186 | "text": [
187 | "/Users/xinmeng/anaconda3/envs/homework/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n",
188 | " return f(*args, **kwds)\n"
189 | ]
190 | },
191 | {
192 | "name": "stdout",
193 | "output_type": "stream",
194 | "text": [
195 | "RanksumsResult(statistic=-147.48812830271845, pvalue=0.0) RanksumsResult(statistic=-191.24982487069587, pvalue=0.0)\n",
196 | "Statistical tests (Reference against Targeting) succeed.\n",
197 | "(661, 404) 495 166\n",
198 | "Average AUC with all features: \n",
199 | "SVM\t\t 0.9900013753999437\n",
200 | "Random forest\t 0.9858123420498757\n",
201 | "AdaBoost\t 0.985921855997419\n"
202 | ]
203 | }
204 | ],
205 | "source": [
206 | "#################################################################################################################\n",
207 | "# #\n",
208 | "# Section 3 Sequence and Feature Analysis #\n",
209 | "# #\n",
210 | "#################################################################################################################\n",
211 | "\n",
212 | "############################################ Import libaries ##################################################\n",
213 | "\n",
214 | "import ASAP.SequenceAndFeatureAnalysis as analysis\n",
215 | "\n",
216 | "############################################ Function calls ##################################################\n",
217 | "\n",
218 | "X_IDS, Y_IDS, SeqName_IDS = analysis.IterationDuplicateSelectFeature(size, iterate, DatasetName, \n",
219 | " DatasetSize, ExcludeIGHVVectors)\n",
220 | "\n",
221 | "###################### Section 3.1 Sequence and feature similarity analysis (Heat map) ##########################\n",
222 | "\n",
223 | "H_Idist, L_Idist = analysis.HeatmapHL(size, iterate, SeqName_IDS, Amino, Num)\n",
224 | "analysis.Draw_heatmap(size, H_Idist[1], 'Heavy Chain Sequences', DatasetSize)\n",
225 | "if not IF_ONLY_HEAVY:\n",
226 | " analysis.Draw_heatmap(size, L_Idist[1], 'Light Chain Sequences', DatasetSize)\n",
227 | "F_Idist = analysis.HeatmapFeature(size, iterate, X_IDS, ExcludeFeatureNames, MotifFeatureNames)\n",
228 | "analysis.Draw_heatmap(size, F_Idist[0], 'Extracted Features', DatasetSize)\n",
229 | "\n",
230 | "############################### Section 3.2 Similarity analysis (Statistical test) #############################\n",
231 | "\n",
232 | "analysis.MultiRankTest(size, iterate, F_Idist, H_Idist, L_Idist)\n",
233 | "\n",
234 | "####################################### Section 3.3 Salient feature-value analysis ############################ #\n",
235 | "\n",
236 | "analysis.MultiFisherFS(iterate, X_IDS, Y_IDS, DatasetName, DatasetSize, ExcludeIGHVVectors, \n",
237 | " ExcludeFeatureNames)\n",
238 | "\n",
239 | "####################################### Section 3.4 Classification on segments ################################ \n",
240 | "\n",
241 | "analysis.MultiAuc(iterate, X_IDS, Y_IDS)\n",
242 | "analysis.ROCDrawing(X_IDS[0], Y_IDS[0], GermFeatureNames, CanonFeatureNames, PIFeatureNames, MotifFeatureNames, AllFeatureNames)\n",
243 | "\n",
244 | "analysis.JaccardCoefficientAnalysis(AllFeatureVectors, AllFeatureNames, DatasetSize)"
245 | ]
246 | },
247 | {
248 | "cell_type": "code",
249 | "execution_count": 4,
250 | "metadata": {},
251 | "outputs": [],
252 | "source": [
253 | "#################################################################################################################\n",
254 | "# #\n",
255 | "# Section 4 Design Recommendation #\n",
256 | "# #\n",
257 | "#################################################################################################################\n",
258 | "\n",
259 | "############################################ Import libraries #################################################\n",
260 | "\n",
261 | "import ASAP.DesignRecommendation as design\n",
262 | "\n",
263 | "############################################ Function calls ##################################################\n",
264 | "\n",
265 | "design.MultiDecisionTree(iterate, X_IDS, Y_IDS, ExcludeFeatureNames, 'AllFeature')"
266 | ]
267 | },
268 | {
269 | "cell_type": "code",
270 | "execution_count": null,
271 | "metadata": {},
272 | "outputs": [],
273 | "source": []
274 | }
275 | ],
276 | "metadata": {
277 | "kernelspec": {
278 | "display_name": "Python 3",
279 | "language": "python",
280 | "name": "python3"
281 | },
282 | "language_info": {
283 | "codemirror_mode": {
284 | "name": "ipython",
285 | "version": 3
286 | },
287 | "file_extension": ".py",
288 | "mimetype": "text/x-python",
289 | "name": "python",
290 | "nbconvert_exporter": "python",
291 | "pygments_lexer": "ipython3",
292 | "version": "3.6.8"
293 | }
294 | },
295 | "nbformat": 4,
296 | "nbformat_minor": 1
297 | }
298 |
--------------------------------------------------------------------------------
/ASAP/S_SequenceInRegion.py:
--------------------------------------------------------------------------------
1 | import Bio.SeqUtils.ProtParam
2 | import os
3 | import ASAP.FeatureExtraction as extract
4 | import pandas as pd
5 | import matplotlib.pyplot as plt
6 | import numpy as np
7 |
# Chothia numbering definition for CDR regions: inclusive [first, last]
# position of CDR 1-3 on the light ('L') and heavy ('H') chain.
CHOTHIA_CDR = {'L': {'1': [24, 34], '2': [50, 56], '3': [89, 97]}, 'H':{'1': [26, 32], '2': [52, 56], '3': [95, 102]}}
canonical_direct = '../data/pigs_canonical.txt'

# Run configuration. SET_NAME prefixes the files written to ../results/;
# IF_ONLY_HEAVY switches every writer below to heavy-chain-only output.
SET_NAME = 'IGHV'
IF_ONLY_HEAVY = True
CNT_DB = 1
CNT_TARGET = 1
REFERENCE_PATH_TESTCASE = '../testCase/IGHV/reference-IGHV/'
TARGETING_PATH_TESTCASE = '../testCase/IGHV/targeting-MMP-IGHV/'
TARGET_DESIRE_SIZE = 134 #44 #IGHV

targeting_direct = TARGETING_PATH_TESTCASE
reference_direct = REFERENCE_PATH_TESTCASE

# NOTE: this reads the input data at import time, not only when run as a script.
Amino, Num, Germ, DatasetName, DatasetSize = extract.ReadAminoNumGerm(targeting_direct, reference_direct)

# One id per sequence, '<dataset name>_<index within dataset>'. These ids are
# the keys used to index Amino[...] and Num[...] in the functions below.
seq_id = []
for i, name in enumerate(DatasetName):
    # DatasetSize entries are coerced with int() before being used as a count.
    for j in range(int(DatasetSize[i])):
        seq_id.append(name + '_' + str(j))
34 |
def sequence_raw():
    """Write the raw sequence of every entry in ``seq_id`` to a CSV file.

    The file is ``../results/<SET_NAME>_Sequence.csv``. Each row holds the
    sequence name followed by the joined light-chain string (only when
    ``IF_ONLY_HEAVY`` is false) and the joined heavy-chain string.
    """
    def chains_for(seq_name):
        # Heavy chain is always present; light chain is only looked up when
        # the run is not restricted to heavy chains.
        heavy = ''.join(Amino['H'][seq_name])
        if IF_ONLY_HEAVY:
            return [heavy]
        light = ''.join(Amino['L'][seq_name])
        return light, heavy

    header = 'sequence name, '
    if not IF_ONLY_HEAVY:
        header += 'light chain, '
    header += 'heavy chain\n'

    out_path = '../results/' + SET_NAME + '_Sequence.csv'
    with open(out_path, 'w') as out:
        out.write(header)
        for seq_name in seq_id:
            out.write(seq_name + ',' + ','.join(chains_for(seq_name)) + '\n')
53 |
def sequence_num():
    """Write every sequence together with its position numbering.

    Output goes to ``./Sequence_numbered.csv``. For each id in ``seq_id`` the
    file gets a "num" row (comma-joined ``Num`` entries) and a "seq" row
    (comma-joined ``Amino`` entries) per chain, light chain first.

    The light-chain rows are skipped when ``IF_ONLY_HEAVY`` is set, matching
    ``sequence_raw`` and ``sequence_region``, which never touch ``['L']`` in
    heavy-only mode.
    """
    with open('./Sequence_numbered.csv', 'w') as out:
        for sname in seq_id:
            if not IF_ONLY_HEAVY:
                out.write(sname + ' light num,' + ','.join(Num['L'][sname]) + '\n')
                out.write(sname + ' light seq,' + ','.join(Amino['L'][sname]) + '\n')
            out.write(sname + ' heavy num,' + ','.join(Num['H'][sname]) + '\n')
            out.write(sname + ' heavy seq,' + ','.join(Amino['H'][sname]) + '\n')
70 |
def _split_chain_regions(numbers, residues, cdr_bounds):
    """Split one chain into (FW1, CDR1, FW2, CDR2, FW3, CDR3, FW4) strings.

    numbers    -- position labels, e.g. '52' or '52A' (a trailing upper-case
                  letter is dropped before the integer comparison).
    residues   -- entries parallel to ``numbers``; concatenated per region.
    cdr_bounds -- mapping '1'/'2'/'3' -> [first, last] inclusive CDR limits.
    """
    # Inclusive upper limit of the first six regions, in chain order; anything
    # beyond the last limit belongs to FW4.
    upper = []
    for cdr in ('1', '2', '3'):
        first, last = cdr_bounds[cdr][0], cdr_bounds[cdr][1]
        upper.append(first - 1)   # framework before this CDR: position < first
        upper.append(last)        # the CDR itself: position <= last

    regions = [''] * 7
    for idx, number in enumerate(numbers):
        if 'A' <= number[-1] <= 'Z':
            position = int(number[:-1])
        else:
            position = int(number)
        region = next((k for k, limit in enumerate(upper) if position <= limit), len(upper))
        regions[region] += residues[idx]
    return tuple(regions)


def sequence_region():
    """Write every sequence split into framework / CDR regions.

    Output goes to ``../results/<SET_NAME>_Sequence_region.csv``. Each row
    holds the sequence id followed by FW1, CDR1, FW2, CDR2, FW3, CDR3, FW4 of
    the light chain (only when ``IF_ONLY_HEAVY`` is false) and then the same
    seven regions of the heavy chain, using the limits in ``CHOTHIA_CDR``.
    """
    def getSequenceHL_region(sname):
        heavy = _split_chain_regions(Num['H'][sname], Amino['H'][sname], CHOTHIA_CDR['H'])
        if IF_ONLY_HEAVY:
            return heavy
        light = _split_chain_regions(Num['L'][sname], Amino['L'][sname], CHOTHIA_CDR['L'])
        return light + heavy

    with open('../results/'+SET_NAME +'_Sequence_region.csv','w') as fi:
        if IF_ONLY_HEAVY:
            fi.write(
                'sequence id, heavy chain FW1, heavy chain CDR1, heavy chain FW2, heavy chain CDR2, heavy chain FW3, heavy chain CDR3, heavy chain FW4\n')
        else:
            fi.write('sequence id, light chain FW1, light chain CDR1, light chain FW2, light chain CDR2, light chain FW3, light chain CDR3, light chain FW4, '+
                     'heavy chain FW1, heavy chain CDR1, heavy chain FW2, heavy chain CDR2, heavy chain FW3, heavy chain CDR3, heavy chain FW4\n')
        for sname in seq_id:
            fi.write(sname + ',' + ','.join(getSequenceHL_region(sname)) + '\n')
135 |
136 |
def feature_distribution():
    """Write, per feature column, how often each feature value occurs.

    Rows are built the same way ``feature()`` builds them: the sequence id
    followed by the name of every feature whose entry in
    ``AllFeatureVectors`` equals 1. For each column 1..11 the values are
    counted and written, most frequent first, to
    ``./Features_distribution_<type>.csv`` as ``value,count`` lines.

    ``<type>`` is the first two '_'-separated parts of the most frequent
    value, except for column 11 where only the first part is used.
    NOTE(review): column 11 is presumably the motif column of the both-chain
    layout; confirm the intended column count for heavy-only runs.

    Previously the rows were left empty, so indexing any column raised
    IndexError. Columns that no row reaches are now skipped.
    """
    from collections import Counter

    write_out = []
    for row, sid in enumerate(seq_id):
        active = [AllFeatureNames[idx]
                  for idx, flag in enumerate(AllFeatureVectors[row]) if flag == 1]
        write_out.append([sid] + active)

    for col in range(1, 12):
        # Rows can differ in length, so only rows that have this column count.
        feat = [item[col] for item in write_out if len(item) > col]
        if not feat:
            continue

        sorted_count = Counter(feat).most_common()
        parts = sorted_count[0][0].split('_')
        if col == 11:
            feat_type = parts[0]
        else:
            feat_type = parts[0] + parts[1]
        # Distinct name for the handle: the original reused the loop variable.
        with open('./Features_distribution_'+feat_type+'.csv','w') as out:
            for value, count in sorted_count:
                out.write(value+','+str(count)+'\n')
154 |
def feature():
    """Write the active feature names of every sequence to a CSV file.

    The file is ``../results/<SET_NAME>_Features.csv``. Each data row is the
    sequence id followed by the name of every feature whose entry in that
    sequence's ``AllFeatureVectors`` row equals 1. The light-chain header
    columns are only emitted when ``IF_ONLY_HEAVY`` is false.
    """
    rows = []
    for row_idx, sid in enumerate(seq_id):
        active = [AllFeatureNames[col]
                  for col, flag in enumerate(AllFeatureVectors[row_idx]) if flag == 1]
        rows.append([sid] + active)

    # Assemble the header in the same left-to-right order as before.
    header_parts = ['sequence id, ']
    if not IF_ONLY_HEAVY:
        header_parts.append('light chain V region, light chain J region, ')
    header_parts.append('heavy chain V region, heavy chain J region, ')
    if not IF_ONLY_HEAVY:
        header_parts.append('Canonical L1, Canonical L2, Canonical L3, ')
    header_parts.append('Canonical H1, Canonical H2, Canonical H3, ')
    header_parts.append('PI, frequent positional motif\n')

    with open('../results/'+SET_NAME +'_Features.csv', 'w') as out:
        out.write(''.join(header_parts))
        for row in rows:
            out.write(','.join(row) + '\n')
175 |
176 |
def correlation_feature():
    """Write the pairwise correlation of all feature columns to a CSV file.

    ``AllFeatureVectors`` is loaded into a DataFrame with ``AllFeatureNames``
    as column labels, ``DataFrame.corr()`` is taken, and every unordered pair
    of distinct features is written as ``name1,name2,coefficient`` to
    ``../results/Pearson_feature_correlation.csv``.
    """
    frame = pd.DataFrame(AllFeatureVectors, columns=AllFeatureNames)
    coeff = np.array(frame.corr())
    n_features = len(AllFeatureNames)

    with open('../results/Pearson_feature_correlation.csv', 'w') as out:
        out.write('Feature value 1, Feature value 2, Pearson coefficient\n')
        for first in range(n_features):
            for second in range(first + 1, n_features):
                fields = (AllFeatureNames[first], AllFeatureNames[second],
                          str(coeff[first][second]))
                out.write(','.join(fields) + '\n')
192 |
193 |
194 |
195 | # data.to_csv(r'../results/Feature_test.csv', header=True)
196 |
197 | # fig = plt.figure(figsize=(100, 70))
198 | # ax = fig.add_subplot(111)
199 | # cax = ax.matshow(corr, cmap='seismic', vmin=-1, vmax =1)
200 | # fig.colorbar(cax)
201 | # ticks = np.arange(0, len(data.columns),1)
202 | # ax.set_xticks(ticks)
203 | # plt.xticks(rotation=90)
204 | # ax.set_yticks(ticks)
205 | # ax.set_xticklabels(data.columns)
206 | # ax.set_yticklabels(data.columns)
207 | # plt.savefig('../results/feature_correlation.png')
208 | # corr = pd.DataFrame(corr, index=AllFeatureNames, columns=AllFeatureNames)
209 | ###### display pairwise correlation value
210 | # au_corr = corr.where(np.triu(np.ones(corr.shape), k=1).astype(np.bool))
211 | # au_corr = au_corr.stack().sort_values(ascending=False)
212 | # au_corr = corr.unstack()
213 | # au_corr.columns = [' 1', 'Feature 2', 'Pearson Correlation Value']
214 | # au_corr = pd.DataFrame(au_corr.values, columns = ['Feature 1, Feature 2, Pearson Correlation Value'])
215 | # au_corr.to_csv(r'../results/Pearson_feature_correlation.csv', header=True)
216 | # print(len(au_corr))
217 |
218 | # print(AllFeatureVectors[:, AllFeatureNames.index('Germ_LJ_IGKJ3*01')])
219 | # print(AllFeatureVectors[:, AllFeatureNames.index('Canonical_L2_0')])
220 |
221 | # def JaccardCoefficientAnalysis():
222 | # df = pd.DataFrame(AllFeatureVectors, columns=AllFeatureNames)
223 | #
224 | # interest_feature=['Germ_HV_IGHV3-23*01', 'Canonical_H2_6', 'Germ_HJ_IGHJ4*02', 'Germ_HJ_IGHJ6*01', 'Germ_LV_IGKV1D-39*01',
225 | # 'Canonical_H2_5', 'Germ_HJ_IGHJ4*01']
226 | # jac_sim = np.eye(len(AllFeatureNames))
227 | # for i in range(len(AllFeatureNames)):
228 | # for j in range(i+1, len(AllFeatureNames)):
229 | # if AllFeatureNames[i].startswith('Motif') or AllFeatureNames[j].startswith('Motif'):
230 | # continue
231 | # a = AllFeatureVectors[:, i]
232 | # b = AllFeatureVectors[:, j]
233 | # aandb =0
234 | # aorb = 0
235 | # for k in range(len(a)):
236 | # if a[k]==b[k] and a[k]==1:
237 | # aandb +=1
238 | # if a[k]==1 or b[k]==1:
239 | # aorb +=1
240 | # if aorb==0:
241 | # jac_tmp=0
242 | # else:
243 | # jac_tmp = float(aandb)/aorb
244 | # if AllFeatureNames[i] in interest_feature and AllFeatureNames[j] in interest_feature:
245 | # print(AllFeatureNames[i], AllFeatureNames[j], jac_tmp)
246 | #
247 | # jac_sim[i][j]=jac_tmp
248 | # jac_sim[j][i]=jac_tmp
249 | #
250 | #
251 | # with open('../results/Jaccard_feature_coefficient.csv', 'w') as fi:
252 | # fi.write('Feature value 1, Feature value 2, Jaccard coefficient\n')
253 | # for i in range(len(AllFeatureNames)):
254 | # for j in range(i+1, len(AllFeatureNames)):
255 | # if AllFeatureNames[i].startswith('Motif') or AllFeatureNames[j].startswith('Motif'):
256 | # continue
257 | # fi.write(AllFeatureNames[i]+ ','+AllFeatureNames[j]+','+ str(jac_sim[i][j])+'\n')
258 | #
259 | #
260 | # fig = plt.figure(figsize=(100, 70))
261 | # ax = fig.add_subplot(111)
262 | # cax = ax.matshow(jac_sim, cmap='Blues', vmin=0, vmax =1)
263 | # fig.colorbar(cax)
264 | # ticks = np.arange(0, len(df.columns),1)
265 | # ax.set_xticks(ticks)
266 | # plt.xticks(rotation=90)
267 | # ax.set_yticks(ticks)
268 | # ax.set_xticklabels(df.columns)
269 | # ax.set_yticklabels(df.columns)
270 | # plt.savefig('../results/feature_coefficient.png')
271 | #
272 | # # print(AllFeatureVectors[:,AllFeatureNames.index('Germ_LJ_IGKJ3*01')])
273 | # # print(AllFeatureVectors[:,AllFeatureNames.index('Canonical_L2_0*01')])
274 | # # where(np.triu(np.ones(jac_sim.shape), k=1).astype(np.bool))
275 | # # au_jac = jac_sim.where(np.triu(np.ones(jac_sim.shape), k=0).astype(np.bool))
276 | # # au_jac = au_jac.stack().sort_values(ascending=False)
277 | # # au_jac = jac_sim.unstack()
278 | # # print(len(au_jac))
279 | # # au_jac.to_csv(r'../results/Jaccard_feature_coefficient.csv', header=True)
280 |
def _jaccard_matrix(vectors):
    """Return the pairwise Jaccard coefficient between the columns of *vectors*.

    A column counts as "present" in a row where its entry equals 1. Entry
    [i][j] is |rows where both i and j are present| / |rows where either is
    present|, or 0.0 when neither column is present in any row.
    """
    present = (np.asarray(vectors) == 1).astype(np.int64)
    both = present.T @ present                      # co-occurrence counts
    per_feature = present.sum(axis=0)
    either = per_feature[:, None] + per_feature[None, :] - both
    sim = np.zeros(both.shape, dtype=float)
    # Leave 0.0 where the union is empty instead of dividing by zero.
    np.divide(both, either, out=sim, where=either > 0)
    return sim


def JaccardCoefficientAnalysis():
    """Write pairwise Jaccard coefficients of the non-motif features.

    The rows of ``AllFeatureVectors`` are split at ``DatasetSize[0]``; the
    coefficient is computed separately on the first and on the remaining
    rows. Every unordered pair of features whose names do not start with
    'Motif' is written to ``../results/<SET_NAME>_Jaccard Feature Coefficient.csv``.

    ``DatasetSize[0]`` is coerced with ``int()`` before being used as a slice
    bound, as the module-level ``seq_id`` loop does.

    NOTE(review): the header labels the first block of rows "reference set",
    while the committed results/MMP-IGHV CSV labels its first coefficient
    column as the targeting set -- confirm which dataset comes first.
    """
    first_size = int(DatasetSize[0])
    vectors = np.asarray(AllFeatureVectors)

    jac_sim_PDB = _jaccard_matrix(vectors[:first_size])
    jac_sim_MMP = _jaccard_matrix(vectors[first_size:])

    # Motif features are excluded from the report.
    keep = [idx for idx, feat_name in enumerate(AllFeatureNames)
            if not feat_name.startswith('Motif')]

    with open('../results/'+SET_NAME+'_Jaccard Feature Coefficient.csv', 'w') as fi:
        fi.write('Feature value 1, Feature value 2, Jaccard coefficient for reference set, Jaccard coefficient for MMP-targeting set\n')
        for pos, i in enumerate(keep):
            for j in keep[pos + 1:]:
                fi.write(AllFeatureNames[i]+ ','+AllFeatureNames[j]+','+ str(jac_sim_PDB[i][j])+','+ str(jac_sim_MMP[i][j])+'\n')
# Script entry point: write the sequence tables, build the feature matrix,
# then write the feature and Jaccard reports.
if __name__=='__main__':
    sequence_raw()
    sequence_region()
    # Build each feature group and its column names from the data loaded at
    # module level (Germ, Amino, Num, DatasetSize, DatasetName).
    OneHotGerm, GermFeatureNames = extract.GetOneHotGerm(Germ, DatasetSize, DatasetName)
    OneHotCanon, CanonFeatureNames = extract.GetOneHotCanon(canonical_direct, Amino, Num, DatasetSize, DatasetName)
    CDRH3 = extract.GetCDRH3(Amino, Num)
    OneHotPI, PIFeatureNames = extract.GetOneHotPI(CDRH3, DatasetSize, DatasetName)
    MultiHotMotif, MotifFeatureNames = extract.MultiHotMotif(CDRH3, DatasetSize, DatasetName)
    # AllFeatureVectors / AllFeatureNames are assigned at module scope here;
    # feature() and JaccardCoefficientAnalysis() read them as globals, so they
    # must be called after this line.
    AllFeatureVectors, AllFeatureNames, _, _ = extract.GetFeatureVectors(OneHotGerm, GermFeatureNames, OneHotCanon, CanonFeatureNames, OneHotPI, PIFeatureNames, MultiHotMotif, MotifFeatureNames)

    feature()
    # correlation_feature()
    JaccardCoefficientAnalysis()
357 |
358 |
359 |
360 |
361 |
362 |
363 |
364 |
--------------------------------------------------------------------------------
/results/MMP-PDB/MMP-cluster_RankFisherAndFS.csv:
--------------------------------------------------------------------------------
1 | Feature type, Feature value,"Fisher Exact Test
2 | p-value"," Feature selection
3 | (threshold = 0.0025)","Average rank of
4 | Fisher Exact Test","Rank of
5 | feature selection"
6 | Canonical H1,0,1.00E+00,0.00070915,,
7 | Canonical H1,1,1.48E-05,0.003064429,11.86,38
8 | Canonical H1,2,1.00E+00,0.000198741,,
9 | Canonical H1,3,1.00E+00,0.001203345,,
10 | Canonical H1,4,1.00E+00,0,,
11 | Canonical H2,0,1.00E+00,0.029372109,,5
12 | Canonical H2,3,1.00E+00,0,,
13 | Canonical H2,5,1.00E+00,0.038609987,,4
14 | Canonical H2,6,1.71E-39,0.090003385,2,2
15 | Canonical H3,0,9.99E-01,0.004348375,,27
16 | Canonical H3,1,9.97E-01,0.003626023,,35
17 | Canonical H3,2,2.79E-03,0.005589529,31.75,20
18 | Canonical H3,3,9.33E-03,0.001775536,38.91,
19 | Canonical L1,0,9.99E-01,0.006550694,,18
20 | Canonical L1,2,1.00E+00,0,,
21 | Canonical L1,3,1.00E+00,0,,
22 | Canonical L1,4,1.45E-03,0.003968341,29.65,33
23 | Canonical L1,5,1.12E-01,0.002855355,,44
24 | Canonical L2,0,1.00E+00,0,,
25 | Canonical L3,0,1.00E+00,0,,
26 | Germ HJ,IGHJ1*01,9.89E-01,0.002300919,,
27 | Germ HJ,IGHJ1*02,1.00E+00,0.001660937,,
28 | Germ HJ,IGHJ1*03,1.00E+00,0,,
29 | Germ HJ,IGHJ2*01,1.00E+00,0.005897943,,19
30 | Germ HJ,IGHJ2*03,1.00E+00,0,,
31 | Germ HJ,IGHJ3*01,1.00E+00,0.015279146,,8
32 | Germ HJ,IGHJ3*02,2.20E-05,0.011174225,15.66,11
33 | Germ HJ,IGHJ4*01,1.00E+00,0.029354208,,6
34 | Germ HJ,IGHJ4*02,2.79E-23,0.051628515,3,3
35 | Germ HJ,IGHJ5*01,9.97E-01,0.007708994,,15
36 | Germ HJ,IGHJ5*02,2.20E-02,0.004523093,45.14,24
37 | Germ HJ,IGHJ6*01,1.58E-16,0.028791732,4,7
38 | Germ HJ,IGHJ6*02,1.00E+00,0.014648606,,9
39 | Germ HJ,IGHJ6*04,7.96E-02,0.002245223,,
40 | Germ HV,IGHV1-12*01,1.00E+00,0,,
41 | Germ HV,IGHV1-14*01,1.00E+00,0,,
42 | Germ HV,IGHV1-15*01,1.00E+00,0,,
43 | Germ HV,IGHV1-18*01,1.00E+00,0,,
44 | Germ HV,IGHV1-18*04,1.00E+00,0.000532851,,
45 | Germ HV,IGHV1-19*01,1.00E+00,0,,
46 | Germ HV,IGHV1-2*02,6.28E-01,0.003518566,,36
47 | Germ HV,IGHV1-2*04,1.00E+00,0,,
48 | Germ HV,IGHV1-22*01,1.00E+00,0,,
49 | Germ HV,IGHV1-24*01,1.00E+00,0,,
50 | Germ HV,IGHV1-26*01,1.00E+00,0,,
51 | Germ HV,IGHV1-3*01,1.00E+00,0.000294827,,
52 | Germ HV,IGHV1-34*01,1.00E+00,0,,
53 | Germ HV,IGHV1-34*02,1.00E+00,0,,
54 | Germ HV,IGHV1-37*01,1.00E+00,0,,
55 | Germ HV,IGHV1-39*01,1.00E+00,0,,
56 | Germ HV,IGHV1-4*01,1.00E+00,0,,
57 | Germ HV,IGHV1-42*01,1.00E+00,0,,
58 | Germ HV,IGHV1-46*01,1.00E+00,0.000184878,,
59 | Germ HV,IGHV1-47*01,1.00E+00,0,,
60 | Germ HV,IGHV1-5*01,1.00E+00,0.000178421,,
61 | Germ HV,IGHV1-50*01,1.00E+00,0,,
62 | Germ HV,IGHV1-53*01,1.00E+00,0,,
63 | Germ HV,IGHV1-54*01,1.00E+00,0,,
64 | Germ HV,IGHV1-55*01,1.00E+00,0,,
65 | Germ HV,IGHV1-59*01,1.00E+00,0,,
66 | Germ HV,IGHV1-6*02,1.00E+00,0,,
67 | Germ HV,IGHV1-61*01,1.00E+00,0.000155636,,
68 | Germ HV,IGHV1-63*02,1.00E+00,0,,
69 | Germ HV,IGHV1-66*01,1.00E+00,0,,
70 | Germ HV,IGHV1-67*01,1.00E+00,0,,
71 | Germ HV,IGHV1-69-2*01,1.00E+00,0,,
72 | Germ HV,IGHV1-69*01,1.00E+00,0,,
73 | Germ HV,IGHV1-69*02,1.00E+00,0,,
74 | Germ HV,IGHV1-69*08,1.00E+00,0.000118438,,
75 | Germ HV,IGHV1-69*10,1.00E+00,0,,
76 | Germ HV,IGHV1-69*14,1.00E+00,8.53E-06,,
77 | Germ HV,IGHV1-69D*01,1.00E+00,5.27E-05,,
78 | Germ HV,IGHV1-7*01,1.00E+00,0,,
79 | Germ HV,IGHV1-72*01,1.00E+00,0,,
80 | Germ HV,IGHV1-8*01,1.00E+00,0.000303056,,
81 | Germ HV,IGHV1-81*01,1.00E+00,0,,
82 | Germ HV,IGHV1-82*01,1.00E+00,2.15E-05,,
83 | Germ HV,IGHV1-84*01,1.00E+00,0,,
84 | Germ HV,IGHV1-85*01,1.00E+00,0,,
85 | Germ HV,IGHV1-9*01,1.00E+00,0.001006833,,
86 | Germ HV,IGHV10-1*02,1.00E+00,0,,
87 | Germ HV,IGHV10-3*01,1.00E+00,0,,
88 | Germ HV,IGHV10S3*01,1.00E+00,0,,
89 | Germ HV,IGHV13-2*02,1.00E+00,0,,
90 | Germ HV,IGHV14-1*01,1.00E+00,0,,
91 | Germ HV,IGHV14-1*02,1.00E+00,0,,
92 | Germ HV,IGHV14-3*02,1.00E+00,2.62E-05,,
93 | Germ HV,IGHV14-4*02,1.00E+00,0.000340449,,
94 | Germ HV,IGHV1S14*01,1.00E+00,0,,
95 | Germ HV,IGHV1S29*02,1.00E+00,0,,
96 | Germ HV,IGHV1S45*01,1.00E+00,0,,
97 | Germ HV,IGHV1S53*02,1.00E+00,0.000151308,,
98 | Germ HV,IGHV1S61*01,1.00E+00,0,,
99 | Germ HV,IGHV1S69*01,1.00E+00,0,,
100 | Germ HV,IGHV1S7*01,1.00E+00,6.14E-05,,
101 | Germ HV,IGHV2-2*03,1.00E+00,0,,
102 | Germ HV,IGHV2-5*02,1.00E+00,0,,
103 | Germ HV,IGHV2-5*09,1.00E+00,0,,
104 | Germ HV,IGHV2-6-4*01,1.00E+00,0,,
105 | Germ HV,IGHV2-6-5*01,1.00E+00,0,,
106 | Germ HV,IGHV2-6-7*02,1.00E+00,0,,
107 | Germ HV,IGHV2-70*13,1.00E+00,0,,
108 | Germ HV,IGHV2-9*02,5.98E-01,0.002199624,,
109 | Germ HV,IGHV2S33*01,1.00E+00,0,,
110 | Germ HV,IGHV2S5*01,1.00E+00,0,,
111 | Germ HV,IGHV3-1*01,1.00E+00,0,,
112 | Germ HV,IGHV3-11*01,1.00E+00,0,,
113 | Germ HV,IGHV3-11*05,1.00E+00,0,,
114 | Germ HV,IGHV3-13*04,1.00E+00,0,,
115 | Germ HV,IGHV3-15*04,1.00E+00,0,,
116 | Germ HV,IGHV3-15*06,1.00E+00,0,,
117 | Germ HV,IGHV3-15*07,1.00E+00,0.000143431,,
118 | Germ HV,IGHV3-2*02,1.00E+00,0,,
119 | Germ HV,IGHV3-20*01,1.00E+00,0,,
120 | Germ HV,IGHV3-21*03,1.00E+00,0.00016636,,
121 | Germ HV,IGHV3-23*01,1.75E-72,0.419321293,1,1
122 | Germ HV,IGHV3-23*02,1.00E+00,0,,
123 | Germ HV,IGHV3-23*03,5.00E-01,0.003982053,,32
124 | Germ HV,IGHV3-23*04,1.00E+00,0.000206359,,
125 | Germ HV,IGHV3-30*02,1.00E+00,0,,
126 | Germ HV,IGHV3-30*03,1.00E+00,0.000711737,,
127 | Germ HV,IGHV3-30*10,1.00E+00,0,,
128 | Germ HV,IGHV3-30*11,1.00E+00,0.00283518,,45
129 | Germ HV,IGHV3-30*18,1.00E+00,1.79E-05,,
130 | Germ HV,IGHV3-33*01,6.06E-02,0.002916101,,43
131 | Germ HV,IGHV3-33*03,1.00E+00,0,,
132 | Germ HV,IGHV3-48*01,1.00E+00,0.000344457,,
133 | Germ HV,IGHV3-53*01,5.00E-01,0.002345587,,
134 | Germ HV,IGHV3-6*02,1.00E+00,0,,
135 | Germ HV,IGHV3-64*01,1.00E+00,0,,
136 | Germ HV,IGHV3-66*02,1.00E+00,0,,
137 | Germ HV,IGHV3-66*03,1.00E+00,0,,
138 | Germ HV,IGHV3-7*02,1.00E+00,0.00028511,,
139 | Germ HV,IGHV3-72*01,1.00E+00,0,,
140 | Germ HV,IGHV3-73*01,1.00E+00,0,,
141 | Germ HV,IGHV3-74*01,1.00E+00,0,,
142 | Germ HV,IGHV3-74*03,1.00E+00,1.34E-05,,
143 | Germ HV,IGHV3-8*02,1.00E+00,0,,
144 | Germ HV,IGHV3-9*01,1.00E+00,0.00076671,,
145 | Germ HV,IGHV3S1*01,1.00E+00,0,,
146 | Germ HV,IGHV4-1*02,1.00E+00,0,,
147 | Germ HV,IGHV4-2*02,1.00E+00,0,,
148 | Germ HV,IGHV4-30-4*07,1.00E+00,0,,
149 | Germ HV,IGHV4-31*02,1.00E+00,0,,
150 | Germ HV,IGHV4-31*05,1.00E+00,0,,
151 | Germ HV,IGHV4-34*01,1.00E+00,0.000527108,,
152 | Germ HV,IGHV4-38-2*01,1.00E+00,0,,
153 | Germ HV,IGHV4-38-2*02,1.00E+00,0,,
154 | Germ HV,IGHV4-39*07,1.00E+00,3.88E-05,,
155 | Germ HV,IGHV4-4*02,1.00E+00,0,,
156 | Germ HV,IGHV4-4*07,1.00E+00,0,,
157 | Germ HV,IGHV4-4*08,1.00E+00,0,,
158 | Germ HV,IGHV4-59*02,1.00E+00,0,,
159 | Germ HV,IGHV4-59*03,1.00E+00,0,,
160 | Germ HV,IGHV4-59*04,1.00E+00,0,,
161 | Germ HV,IGHV4-59*05,1.00E+00,0,,
162 | Germ HV,IGHV4-59*07,1.00E+00,0,,
163 | Germ HV,IGHV4-59*08,1.00E+00,7.30E-05,,
164 | Germ HV,IGHV5-10-1*04,1.00E+00,0.000203388,,
165 | Germ HV,IGHV5-12-2*01,1.00E+00,0,,
166 | Germ HV,IGHV5-12*01,1.00E+00,0,,
167 | Germ HV,IGHV5-15*02,1.00E+00,0,,
168 | Germ HV,IGHV5-17*02,1.00E+00,0.000348178,,
169 | Germ HV,IGHV5-4*02,1.00E+00,0,,
170 | Germ HV,IGHV5-51*01,1.00E+00,0.000351596,,
171 | Germ HV,IGHV5-6-1*01,1.00E+00,0,,
172 | Germ HV,IGHV5-6-2*01,1.00E+00,0.000314301,,
173 | Germ HV,IGHV5-6-3*01,1.00E+00,0,,
174 | Germ HV,IGHV5-9-3*01,1.00E+00,3.18E-06,,
175 | Germ HV,IGHV5-9*01,5.00E-01,0.004882342,,22
176 | Germ HV,IGHV5-9*02,1.00E+00,0.000445259,,
177 | Germ HV,IGHV5-9*03,1.00E+00,0,,
178 | Germ HV,IGHV5S4*01,1.00E+00,0,,
179 | Germ HV,IGHV5S9*01,1.00E+00,0,,
180 | Germ HV,IGHV6-3*02,1.00E+00,0,,
181 | Germ HV,IGHV6-6*01,1.00E+00,0.000228354,,
182 | Germ HV,IGHV6-6*02,1.00E+00,0,,
183 | Germ HV,IGHV6-7*02,1.00E+00,0,,
184 | Germ HV,IGHV7-3*02,1.00E+00,0,,
185 | Germ HV,IGHV7-3*04,1.00E+00,0,,
186 | Germ HV,IGHV7-4-1*02,1.00E+00,0,,
187 | Germ HV,IGHV8-12*01,1.00E+00,0,,
188 | Germ HV,IGHV8-5*01,1.00E+00,0,,
189 | Germ HV,IGHV8-8*01,1.00E+00,0.000214988,,
190 | Germ HV,IGHV9-1*02,1.00E+00,0,,
191 | Germ HV,IGHV9-2-1*01,1.00E+00,0.000557316,,
192 | Germ HV,IGHV9-3-1*01,1.00E+00,0,,
193 | Germ HV,IGHV9-3*01,1.00E+00,0,,
194 | Germ HV,IGHV9-4*02,1.00E+00,0,,
195 | Germ LJ,IGKJ1-1*03,1.00E+00,0,,
196 | Germ LJ,IGKJ1-2*02,1.00E+00,0.001764855,,
197 | Germ LJ,IGKJ1-2*03,1.00E+00,0,,
198 | Germ LJ,IGKJ1*01,9.88E-01,0.00544662,,21
199 | Germ LJ,IGKJ1*02,1.00E+00,6.77E-05,,
200 | Germ LJ,IGKJ2-1*01,1.00E+00,0,,
201 | Germ LJ,IGKJ2-3*01,1.00E+00,0,,
202 | Germ LJ,IGKJ2*01,8.87E-01,0.002790228,,46
203 | Germ LJ,IGKJ2*02,1.00E+00,0,,
204 | Germ LJ,IGKJ2*03,1.00E+00,0,,
205 | Germ LJ,IGKJ3*01,1.29E-05,0.014631149,13.14,10
206 | Germ LJ,IGKJ4*01,2.45E-03,0.000513796,31.39,
207 | Germ LJ,IGKJ4*02,1.00E+00,0,,
208 | Germ LJ,IGKJ5*01,9.68E-01,0.001509162,,
209 | Germ LJ,IGLJ1*01,4.22E-01,0.003266883,,37
210 | Germ LJ,IGLJ2*01,1.00E+00,0.000485164,,
211 | Germ LJ,IGLJ3*01,3.28E-01,0.000824574,,
212 | Germ LJ,IGLJ3*02,7.95E-01,0.000667714,,
213 | Germ LJ,IGLJ6*01,2.85E-01,0.000181587,,
214 | Germ LJ,IGLJ7*01,6.60E-01,0,,
215 | Germ LV,IGKV1-110*01,1.00E+00,0,,
216 | Germ LV,IGKV1-110*02,1.00E+00,0,,
217 | Germ LV,IGKV1-117*01,9.41E-01,0,,
218 | Germ LV,IGKV1-117*02,1.00E+00,0,,
219 | Germ LV,IGKV1-12*01,3.58E-03,0.002993798,39.91,40
220 | Germ LV,IGKV1-133*01,1.00E+00,0,,
221 | Germ LV,IGKV1-16*01,1.24E-01,0.008987967,,14
222 | Germ LV,IGKV1-17*01,5.63E-01,0.001137982,,
223 | Germ LV,IGKV1-17*03,5.00E-01,0,,
224 | Germ LV,IGKV1-27*01,5.80E-02,0.001384217,,
225 | Germ LV,IGKV1-39*01,1.00E+00,0.001736249,,
226 | Germ LV,IGKV1-5*01,1.00E+00,0,,
227 | Germ LV,IGKV1-5*03,7.09E-01,0.000669564,,
228 | Germ LV,IGKV1-6*01,2.49E-01,1.51E-05,,
229 | Germ LV,IGKV1-88*01,1.00E+00,0,,
230 | Germ LV,IGKV1-9*01,3.38E-01,0.001313416,,
231 | Germ LV,IGKV1-NL1*01,1.00E+00,0,,
232 | Germ LV,IGKV10-94*02,1.00E+00,4.83E-05,,
233 | Germ LV,IGKV10-94*05,1.00E+00,0,,
234 | Germ LV,IGKV10-96*02,1.00E+00,0.001710628,,
235 | Germ LV,IGKV10-96*04,1.00E+00,1.14E-05,,
236 | Germ LV,IGKV12-41*01,1.00E+00,4.10E-06,,
237 | Germ LV,IGKV12-44*01,1.00E+00,0,,
238 | Germ LV,IGKV12-46*01,1.00E+00,0,,
239 | Germ LV,IGKV12S24*01,1.00E+00,0,,
240 | Germ LV,IGKV13-84*01,1.00E+00,0,,
241 | Germ LV,IGKV14-100*01,1.00E+00,0,,
242 | Germ LV,IGKV14-111*01,1.00E+00,0,,
243 | Germ LV,IGKV14-126*01,1.00E+00,0,,
244 | Germ LV,IGKV16-104*01,1.00E+00,0,,
245 | Germ LV,IGKV17-121*01,1.00E+00,0,,
246 | Germ LV,IGKV17-127*01,1.00E+00,0.000376466,,
247 | Germ LV,IGKV19-93*02,1.00E+00,0,,
248 | Germ LV,IGKV1D-13*01,7.38E-01,4.45E-07,,
249 | Germ LV,IGKV1D-33*01,8.24E-01,0.000615482,,
250 | Germ LV,IGKV1D-39*01,4.88E-10,0.009239291,5.03,13
251 | Germ LV,IGKV1S10*01,1.00E+00,0,,
252 | Germ LV,IGKV1S11*01,5.75E-01,0,,
253 | Germ LV,IGKV1S12*01,1.00E+00,0,,
254 | Germ LV,IGKV1S14*01,1.00E+00,0,,
255 | Germ LV,IGKV1S15*01,1.00E+00,0,,
256 | Germ LV,IGKV1S17*01,1.00E+00,0,,
257 | Germ LV,IGKV1S2*01,8.07E-01,0,,
258 | Germ LV,IGKV1S2*02,1.00E+00,0,,
259 | Germ LV,IGKV1S22*01,1.00E+00,0,,
260 | Germ LV,IGKV1S24*01,1.00E+00,0,,
261 | Germ LV,IGKV1S3*01,1.00E+00,0,,
262 | Germ LV,IGKV1S3*02,1.00E+00,0,,
263 | Germ LV,IGKV1S5*01,1.00E+00,0,,
264 | Germ LV,IGKV2-109*01,1.00E+00,0,,
265 | Germ LV,IGKV2-109*03,1.00E+00,0,,
266 | Germ LV,IGKV2-112*01,1.00E+00,0,,
267 | Germ LV,IGKV2-137*01,1.00E+00,0,,
268 | Germ LV,IGKV2-28*01,1.36E-01,0.004282447,,28
269 | Germ LV,IGKV2-29*02,1.00E+00,0,,
270 | Germ LV,IGKV2-30*01,6.91E-01,1.29E-05,,
271 | Germ LV,IGKV22S7*01,1.00E+00,0,,
272 | Germ LV,IGKV2D-29*02,1.00E+00,0,,
273 | Germ LV,IGKV2S3*01,1.00E+00,0,,
274 | Germ LV,IGKV3-1*01,1.00E+00,5.72E-05,,
275 | Germ LV,IGKV3-10*01,1.00E+00,0,,
276 | Germ LV,IGKV3-11*01,5.67E-04,0.000541447,25.77,
277 | Germ LV,IGKV3-11*02,5.00E-01,0,,
278 | Germ LV,IGKV3-12*01,1.00E+00,0,,
279 | Germ LV,IGKV3-2*01,1.00E+00,0,,
280 | Germ LV,IGKV3-20*01,4.99E-05,0.007138248,15.98,16
281 | Germ LV,IGKV3-3*01,1.00E+00,0,,
282 | Germ LV,IGKV3-4*01,1.00E+00,0,,
283 | Germ LV,IGKV3-5*01,1.00E+00,0.00027996,,
284 | Germ LV,IGKV3-7*01,1.00E+00,0,,
285 | Germ LV,IGKV3D-11*01,1.00E+00,0,,
286 | Germ LV,IGKV3D-15*01,3.74E-01,0.002168255,,
287 | Germ LV,IGKV3D-20*01,2.49E-01,3.88E-06,,
288 | Germ LV,IGKV3S3*01,5.00E-01,0,,
289 | Germ LV,IGKV3S9*01,1.00E+00,0,,
290 | Germ LV,IGKV4-1*01,7.34E-01,0.00259438,,49
291 | Germ LV,IGKV4-53*01,1.00E+00,0,,
292 | Germ LV,IGKV4-55*01,1.00E+00,0,,
293 | Germ LV,IGKV4-57-1*01,1.00E+00,0,,
294 | Germ LV,IGKV4-57*01,1.00E+00,0,,
295 | Germ LV,IGKV4-59*01,1.00E+00,0,,
296 | Germ LV,IGKV4-61*01,1.00E+00,0,,
297 | Germ LV,IGKV4-63*01,1.00E+00,0,,
298 | Germ LV,IGKV4-68*01,1.00E+00,0,,
299 | Germ LV,IGKV4-70*01,1.00E+00,0,,
300 | Germ LV,IGKV4-72*01,1.00E+00,0,,
301 | Germ LV,IGKV4-74*01,1.00E+00,0,,
302 | Germ LV,IGKV4-79*01,1.00E+00,0,,
303 | Germ LV,IGKV4-80*01,1.00E+00,0,,
304 | Germ LV,IGKV4-81*01,1.00E+00,0,,
305 | Germ LV,IGKV4-86*01,1.00E+00,0,,
306 | Germ LV,IGKV4-91*01,1.00E+00,0,,
307 | Germ LV,IGKV5-39*01,1.00E+00,0.000331762,,
308 | Germ LV,IGKV5-43*01,1.00E+00,0,,
309 | Germ LV,IGKV5-48*01,1.00E+00,0.000960814,,
310 | Germ LV,IGKV6-14*01,1.00E+00,2.54E-05,,
311 | Germ LV,IGKV6-15*01,1.00E+00,4.80E-05,,
312 | Germ LV,IGKV6-17*01,8.76E-01,0.002060703,,
313 | Germ LV,IGKV6-20*01,1.00E+00,0.000203542,,
314 | Germ LV,IGKV6-21*01,1.00E+00,0.000638102,,
315 | Germ LV,IGKV6-21*02,1.00E+00,0,,
316 | Germ LV,IGKV6-23*01,1.00E+00,0,,
317 | Germ LV,IGKV6-25*01,1.00E+00,0,,
318 | Germ LV,IGKV6-32*01,1.00E+00,0,,
319 | Germ LV,IGKV6-32*02,1.00E+00,0,,
320 | Germ LV,IGKV8-19*01,1.00E+00,0,,
321 | Germ LV,IGKV8-21*01,1.00E+00,0,,
322 | Germ LV,IGKV8-24*01,1.00E+00,9.87E-06,,
323 | Germ LV,IGKV8-27*01,1.00E+00,0,,
324 | Germ LV,IGKV8-28*01,1.00E+00,0,,
325 | Germ LV,IGKV8-30*01,1.00E+00,0,,
326 | Germ LV,IGKV9-120*01,1.00E+00,0,,
327 | Germ LV,IGKV9-124*01,1.00E+00,0,,
328 | Germ LV,IGLV1-10*01,1.00E+00,0,,
329 | Germ LV,IGLV1-36*01,1.00E+00,0,,
330 | Germ LV,IGLV1-40*01,5.65E-01,0,,
331 | Germ LV,IGLV1-40*03,1.00E+00,0,,
332 | Germ LV,IGLV1-44*01,2.20E-01,0.000696396,,
333 | Germ LV,IGLV1-47*01,1.64E-01,0.001464818,,
334 | Germ LV,IGLV1-47*02,2.70E-01,0.001099102,,
335 | Germ LV,IGLV1-51*01,1.00E+00,0.00029127,,
336 | Germ LV,IGLV1-51*02,1.00E+00,0.000544347,,
337 | Germ LV,IGLV1*01,1.00E+00,0.000329822,,
338 | Germ LV,IGLV1*02,1.00E+00,0,,
339 | Germ LV,IGLV2-11*01,2.49E-01,0.001072122,,
340 | Germ LV,IGLV2-14*01,2.57E-01,0.002751961,,47
341 | Germ LV,IGLV2-14*02,1.00E+00,0,,
342 | Germ LV,IGLV2-23*01,5.78E-01,4.83E-06,,
343 | Germ LV,IGLV2-23*02,1.00E+00,0,,
344 | Germ LV,IGLV2-8*01,1.77E-01,0.000704107,,
345 | Germ LV,IGLV2S1*01,3.14E-01,0.000799662,,
346 | Germ LV,IGLV2S9*01,1.00E+00,0,,
347 | Germ LV,IGLV3-1*01,7.82E-01,0.00093926,,
348 | Germ LV,IGLV3-10*01,1.00E+00,0,,
349 | Germ LV,IGLV3-19*01,5.78E-01,4.91E-07,,
350 | Germ LV,IGLV3-21*01,1.00E+00,0,,
351 | Germ LV,IGLV3-21*02,2.23E-01,0.0010809,,
352 | Germ LV,IGLV3-21*03,1.00E+00,0,,
353 | Germ LV,IGLV3-25*03,1.00E+00,0,,
354 | Germ LV,IGLV3-9*02,1.00E+00,0,,
355 | Germ LV,IGLV3*01,1.00E+00,2.63E-06,,
356 | Germ LV,IGLV5S10*01,1.00E+00,0,,
357 | Germ LV,IGLV6-57*01,5.00E-01,0,,
358 | Germ LV,IGLV6-57*02,1.00E+00,0,,
359 | Germ LV,IGLV7-43*01,1.00E+00,0,,
360 | Motif,10_FD,6.83E-03,0.001842351,35.99,
361 | Motif,10_MD,2.44E-02,0.002656346,46.63,48
362 | Motif,10_NG,7.46E-04,0,26.18,
363 | Motif,10_YY,3.01E-03,0.001841431,30.18,
364 | Motif,2_AY,2.45E-03,0.000718347,31.58,
365 | Motif,2_AYG,5.19E-07,0.004093965,9.95,30
366 | Motif,2_GG,3.29E-01,0.00048807,,
367 | Motif,2_YG,9.50E-06,0.004157068,11.55,29
368 | Motif,2_YY,3.01E-03,2.95E-05,36.18,
369 | Motif,3_YG,9.50E-06,0.000856833,10.55,
370 | Motif,3_YY,3.01E-03,0.00297028,33.18,41
371 | Motif,4_GD,1.82E-03,0.000754301,28.08,
372 | Motif,4_GDY,1.03E-05,3.70E-06,12.05,
373 | Motif,4_SV,9.73E-01,0,,
374 | Motif,4_SVT,1.00E+00,1.10E-05,,
375 | Motif,4_YD,5.54E-02,0.004696185,,23
376 | Motif,4_YY,3.01E-03,0.000362391,31.18,
377 | Motif,5_DY,3.58E-04,0.000492204,22.06,
378 | Motif,5_DYV,1.67E-05,0,14.44,
379 | Motif,5_FD,6.83E-03,8.46E-06,37.99,
380 | Motif,5_YY,3.01E-03,0.001704189,29.18,
381 | Motif,6_YA,1.00E+00,0.004067616,,31
382 | Motif,6_YD,5.54E-02,0.000727645,,
383 | Motif,6_YF,9.15E-01,0.000529252,,
384 | Motif,6_YV,7.20E-03,0.004395786,37.55,25
385 | Motif,6_YVG,2.20E-05,0.007033143,15.02,17
386 | Motif,6_YY,3.01E-03,0.002312894,34.18,
387 | Motif,7_FD,6.83E-03,0.001472312,38.99,
388 | Motif,7_MD,2.44E-02,0.000534293,47.63,
389 | Motif,7_VG,1.10E-04,0.004383491,18.22,26
390 | Motif,7_VGW,5.19E-07,0,8.75,
391 | Motif,8_GW,5.89E-04,0.000457018,22.57,
392 | Motif,8_GWN,1.11E-06,0.003937073,11.95,34
393 | Motif,8_MD,2.44E-02,0.002503496,45.63,50
394 | Motif,8_SA,1.00E+00,0.000719992,,
395 | Motif,8_YF,9.15E-01,6.92E-06,,
396 | Motif,8_YY,3.01E-03,0.00200915,32.18,
397 | Motif,9_AM,1.00E+00,0,,
398 | Motif,9_AMD,1.00E+00,0,,
399 | Motif,9_FD,6.83E-03,0.000525799,36.99,
400 | Motif,9_WN,3.34E-05,1.01E-05,16.75,
401 | Motif,9_YY,3.01E-03,0.010646544,35.18,12
402 | PI,0.0-3.5,8.32E-03,0.000693414,39.16,
403 | PI,3.5-3.9375,3.53E-03,0.001465018,34.51,
404 | PI,3.9375-4.375,6.74E-01,0.002927121,,42
405 | PI,4.375-5.25,9.54E-01,0.000670093,,
406 | PI,5.25-5.6875,9.80E-01,7.98E-05,,
407 | PI,5.6875-6.125,9.15E-01,0.003030013,,39
408 | PI,6.125-7.0,9.89E-01,0.001250475,,
409 | PI,7.0-14.0,9.92E-01,0.001905193,,
--------------------------------------------------------------------------------
/results/MMP-IGHV/IGHV_Jaccard Feature Coefficient.csv:
--------------------------------------------------------------------------------
1 | Feature value 1, Feature value 2, Jaccard coefficient for MMP-IGHV-targeting set, Jaccard coefficient for IGHV-reference set
2 | Canonical_H1_0,Canonical_H2_0,0,0
3 | Canonical_H1_0,Canonical_H2_6,0,0.003383277
4 | Canonical_H1_0,Canonical_H3_0,0,0.006116208
5 | Canonical_H1_0,Canonical_H3_1,0,0.004878049
6 | Canonical_H1_0,Canonical_H3_2,0,0.004938272
7 | Canonical_H1_0,Canonical_H3_3,0,0.002617801
8 | Canonical_H1_0,PI_0.0-3.5,0,0.002770083
9 | Canonical_H1_0,PI_3.5-3.9375,0,0.004587156
10 | Canonical_H1_0,PI_3.9375-4.375,0,0.001090513
11 | Canonical_H1_0,PI_4.375-4.8125,0,0
12 | Canonical_H1_0,PI_4.8125-5.25,0,0.007092199
13 | Canonical_H1_0,PI_5.25-5.6875,0,0.005813953
14 | Canonical_H1_0,PI_5.6875-6.125,0,0.003496503
15 | Canonical_H1_0,PI_6.125-7.0,0,0.004255319
16 | Canonical_H1_0,PI_7.0-14.0,0,0
17 | Canonical_H1_1,Canonical_H1_0,0,0
18 | Canonical_H1_1,Canonical_H1_2,0,0
19 | Canonical_H1_1,Canonical_H2_0,0.052238806,0.044938615
20 | Canonical_H1_1,Canonical_H2_6,0.947761194,0.951523546
21 | Canonical_H1_1,Canonical_H3_0,0.141791045,0.072238944
22 | Canonical_H1_1,Canonical_H3_1,0.074626866,0.044243688
23 | Canonical_H1_1,Canonical_H3_2,0.246268657,0.09029868
24 | Canonical_H1_1,Canonical_H3_3,0.537313433,0.791262136
25 | Canonical_H1_1,PI_0.0-3.5,0.194029851,0.080379893
26 | Canonical_H1_1,PI_3.5-3.9375,0.402985075,0.299097849
27 | Canonical_H1_1,PI_3.9375-4.375,0.21641791,0.208893006
28 | Canonical_H1_1,PI_4.375-4.8125,0.02238806,0.052826691
29 | Canonical_H1_1,PI_4.8125-5.25,0.02238806,0.062065771
30 | Canonical_H1_1,PI_5.25-5.6875,0.02238806,0.03659949
31 | Canonical_H1_1,PI_5.6875-6.125,0.074626866,0.129226494
32 | Canonical_H1_1,PI_6.125-7.0,0.014925373,0.050949514
33 | Canonical_H1_1,PI_7.0-14.0,0.029850746,0.079240037
34 | Canonical_H1_2,Canonical_H1_0,0,0
35 | Canonical_H1_2,Canonical_H2_0,0,0
36 | Canonical_H1_2,Canonical_H2_6,0,0.000241663
37 | Canonical_H1_2,Canonical_H3_0,0,0
38 | Canonical_H1_2,Canonical_H3_1,0,0
39 | Canonical_H1_2,Canonical_H3_2,0,0.002544529
40 | Canonical_H1_2,Canonical_H3_3,0,0
41 | Canonical_H1_2,PI_0.0-3.5,0,0
42 | Canonical_H1_2,PI_3.5-3.9375,0,0.000769231
43 | Canonical_H1_2,PI_3.9375-4.375,0,0
44 | Canonical_H1_2,PI_4.375-4.8125,0,0
45 | Canonical_H1_2,PI_4.8125-5.25,0,0
46 | Canonical_H1_2,PI_5.25-5.6875,0,0
47 | Canonical_H1_2,PI_5.6875-6.125,0,0
48 | Canonical_H1_2,PI_6.125-7.0,0,0
49 | Canonical_H1_2,PI_7.0-14.0,0,0
50 | Canonical_H1_3,Canonical_H1_0,0,0
51 | Canonical_H1_3,Canonical_H1_1,0,0
52 | Canonical_H1_3,Canonical_H1_2,0,0
53 | Canonical_H1_3,Canonical_H2_0,0,0.005102041
54 | Canonical_H1_3,Canonical_H2_6,0,0.000241604
55 | Canonical_H1_3,Canonical_H3_0,0,0.003164557
56 | Canonical_H1_3,Canonical_H3_1,0,0
57 | Canonical_H1_3,Canonical_H3_2,0,0
58 | Canonical_H1_3,Canonical_H3_3,0,0.000291206
59 | Canonical_H1_3,PI_0.0-3.5,0,0
60 | Canonical_H1_3,PI_3.5-3.9375,0,0
61 | Canonical_H1_3,PI_3.9375-4.375,0,0.001104972
62 | Canonical_H1_3,PI_4.375-4.8125,0,0
63 | Canonical_H1_3,PI_4.8125-5.25,0,0
64 | Canonical_H1_3,PI_5.25-5.6875,0,0
65 | Canonical_H1_3,PI_5.6875-6.125,0,0
66 | Canonical_H1_3,PI_6.125-7.0,0,0.004484305
67 | Canonical_H1_3,PI_7.0-14.0,0,0
68 | Canonical_H2_0,Canonical_H3_0,0,0.040816327
69 | Canonical_H2_0,Canonical_H3_1,0,0.043126685
70 | Canonical_H2_0,Canonical_H3_2,0.052631579,0.022608696
71 | Canonical_H2_0,Canonical_H3_3,0.067567568,0.041929925
72 | Canonical_H2_0,PI_0.0-3.5,0.064516129,0.018761726
73 | Canonical_H2_0,PI_3.5-3.9375,0.051724138,0.049122807
74 | Canonical_H2_0,PI_3.9375-4.375,0.028571429,0.036792453
75 | Canonical_H2_0,PI_4.375-4.8125,0,0.02173913
76 | Canonical_H2_0,PI_4.8125-5.25,0,0.035634744
77 | Canonical_H2_0,PI_5.25-5.6875,0,0.014326648
78 | Canonical_H2_0,PI_5.6875-6.125,0,0.030013643
79 | Canonical_H2_0,PI_6.125-7.0,0.125,0.037313433
80 | Canonical_H2_0,PI_7.0-14.0,0,0.017045455
81 | Canonical_H2_6,Canonical_H2_0,0,0
82 | Canonical_H2_6,Canonical_H3_0,0.149606299,0.070947571
83 | Canonical_H2_6,Canonical_H3_1,0.078740157,0.042368801
84 | Canonical_H2_6,Canonical_H3_2,0.240310078,0.091544206
85 | Canonical_H2_6,Canonical_H3_3,0.507575758,0.767273576
86 | Canonical_H2_6,PI_0.0-3.5,0.186046512,0.081485053
87 | Canonical_H2_6,PI_3.5-3.9375,0.392307692,0.29230038
88 | Canonical_H2_6,PI_3.9375-4.375,0.21875,0.207086426
89 | Canonical_H2_6,PI_4.375-4.8125,0.023622047,0.05280926
90 | Canonical_H2_6,PI_4.8125-5.25,0.023622047,0.061145883
91 | Canonical_H2_6,PI_5.25-5.6875,0.023622047,0.037171132
92 | Canonical_H2_6,PI_5.6875-6.125,0.078740157,0.129326923
93 | Canonical_H2_6,PI_6.125-7.0,0.0078125,0.049843487
94 | Canonical_H2_6,PI_7.0-14.0,0.031496063,0.080299011
95 | Canonical_H3_0,Canonical_H3_1,0,0
96 | Canonical_H3_0,Canonical_H3_3,0,0
97 | Canonical_H3_0,PI_0.0-3.5,0.022727273,0.016871166
98 | Canonical_H3_0,PI_3.5-3.9375,0.140625,0.079545455
99 | Canonical_H3_0,PI_3.9375-4.375,0.043478261,0.043664384
100 | Canonical_H3_0,PI_4.375-4.8125,0,0.011173184
101 | Canonical_H3_0,PI_4.8125-5.25,0.1,0.026315789
102 | Canonical_H3_0,PI_5.25-5.6875,0.1,0.025974026
103 | Canonical_H3_0,PI_5.6875-6.125,0.035714286,0.069682152
104 | Canonical_H3_0,PI_6.125-7.0,0,0.036679537
105 | Canonical_H3_0,PI_7.0-14.0,0.095238095,0.039556962
106 | Canonical_H3_1,Canonical_H3_3,0,0
107 | Canonical_H3_1,PI_0.0-3.5,0.090909091,0.018867925
108 | Canonical_H3_1,PI_3.5-3.9375,0.032258065,0.037552156
109 | Canonical_H3_1,PI_3.9375-4.375,0.026315789,0.046800382
110 | Canonical_H3_1,PI_4.375-4.8125,0,0.004784689
111 | Canonical_H3_1,PI_4.8125-5.25,0,0.01986755
112 | Canonical_H3_1,PI_5.25-5.6875,0,0.03539823
113 | Canonical_H3_1,PI_5.6875-6.125,0.25,0.034387895
114 | Canonical_H3_1,PI_6.125-7.0,0,0.027295285
115 | Canonical_H3_1,PI_7.0-14.0,0,0.038910506
116 | Canonical_H3_2,Canonical_H3_0,0,0
117 | Canonical_H3_2,Canonical_H3_1,0,0
118 | Canonical_H3_2,Canonical_H3_3,0,0
119 | Canonical_H3_2,PI_0.0-3.5,0.092592593,0.042194093
120 | Canonical_H3_2,PI_3.5-3.9375,0.225352113,0.068813131
121 | Canonical_H3_2,PI_3.9375-4.375,0.127272727,0.054471545
122 | Canonical_H3_2,PI_4.375-4.8125,0.028571429,0.019704433
123 | Canonical_H3_2,PI_4.8125-5.25,0,0.055732484
124 | Canonical_H3_2,PI_5.25-5.6875,0,0.041509434
125 | Canonical_H3_2,PI_5.6875-6.125,0.048780488,0.049559471
126 | Canonical_H3_2,PI_6.125-7.0,0.029411765,0.040609137
127 | Canonical_H3_2,PI_7.0-14.0,0.027777778,0.071428571
128 | Canonical_H3_3,PI_0.0-3.5,0.209876543,0.085246843
129 | Canonical_H3_3,PI_3.5-3.9375,0.272727273,0.274024226
130 | Canonical_H3_3,PI_3.9375-4.375,0.231707317,0.204722222
131 | Canonical_H3_3,PI_4.375-4.8125,0.02739726,0.060237475
132 | Canonical_H3_3,PI_4.8125-5.25,0.013513514,0.060423826
133 | Canonical_H3_3,PI_5.25-5.6875,0.013513514,0.032480598
134 | Canonical_H3_3,PI_5.6875-6.125,0.037974684,0.121629213
135 | Canonical_H3_3,PI_6.125-7.0,0.01369863,0.04817895
136 | Canonical_H3_3,PI_7.0-14.0,0.013333333,0.070314715
137 | Germ_HJ_IGHJ1*01,Canonical_H1_0,0,0
138 | Germ_HJ_IGHJ1*01,Canonical_H1_1,0.007462687,0.007645968
139 | Germ_HJ_IGHJ1*01,Canonical_H1_2,0,0
140 | Germ_HJ_IGHJ1*01,Canonical_H1_3,0,0
141 | Germ_HJ_IGHJ1*01,Canonical_H2_0,0,0.008849558
142 | Germ_HJ_IGHJ1*01,Canonical_H2_6,0.007874016,0.007487923
143 | Germ_HJ_IGHJ1*01,Canonical_H3_0,0,0.01754386
144 | Germ_HJ_IGHJ1*01,Canonical_H3_1,0,0.022727273
145 | Germ_HJ_IGHJ1*01,Canonical_H3_2,0.03030303,0.007092199
146 | Germ_HJ_IGHJ1*01,Canonical_H3_3,0,0.005512039
147 | Germ_HJ_IGHJ1*01,Germ_HJ_IGHJ2*01,0,0
148 | Germ_HJ_IGHJ1*01,Germ_HJ_IGHJ4*02,0,0
149 | Germ_HJ_IGHJ1*01,Germ_HJ_IGHJ5*02,0,0
150 | Germ_HJ_IGHJ1*01,Germ_HJ_IGHJ6*01,0,0
151 | Germ_HJ_IGHJ1*01,PI_0.0-3.5,0,0
152 | Germ_HJ_IGHJ1*01,PI_3.5-3.9375,0,0.00150263
153 | Germ_HJ_IGHJ1*01,PI_3.9375-4.375,0,0.002139037
154 | Germ_HJ_IGHJ1*01,PI_4.375-4.8125,0,0.007722008
155 | Germ_HJ_IGHJ1*01,PI_4.8125-5.25,0,0.027118644
156 | Germ_HJ_IGHJ1*01,PI_5.25-5.6875,0,0.010526316
157 | Germ_HJ_IGHJ1*01,PI_5.6875-6.125,0,0.003384095
158 | Germ_HJ_IGHJ1*01,PI_6.125-7.0,0,0.036585366
159 | Germ_HJ_IGHJ1*01,PI_7.0-14.0,0.25,0.016260163
160 | Germ_HJ_IGHJ2*01,Canonical_H1_0,0,0
161 | Germ_HJ_IGHJ2*01,Canonical_H1_1,0.014925373,0.003707136
162 | Germ_HJ_IGHJ2*01,Canonical_H1_2,0,0
163 | Germ_HJ_IGHJ2*01,Canonical_H1_3,0,0
164 | Germ_HJ_IGHJ2*01,Canonical_H2_0,0,0.004761905
165 | Germ_HJ_IGHJ2*01,Canonical_H2_6,0.015748031,0.003624064
166 | Germ_HJ_IGHJ2*01,Canonical_H3_0,0,0.012232416
167 | Germ_HJ_IGHJ2*01,Canonical_H3_1,0,0.009708738
168 | Germ_HJ_IGHJ2*01,Canonical_H3_2,0.060606061,0.009876543
169 | Germ_HJ_IGHJ2*01,Canonical_H3_3,0,0.001742666
170 | Germ_HJ_IGHJ2*01,Germ_HJ_IGHJ4*02,0,0
171 | Germ_HJ_IGHJ2*01,Germ_HJ_IGHJ5*02,0,0
172 | Germ_HJ_IGHJ2*01,Germ_HJ_IGHJ6*01,0,0
173 | Germ_HJ_IGHJ2*01,PI_0.0-3.5,0,0
174 | Germ_HJ_IGHJ2*01,PI_3.5-3.9375,0,0.00152207
175 | Germ_HJ_IGHJ2*01,PI_3.9375-4.375,0.068965517,0.003271538
176 | Germ_HJ_IGHJ2*01,PI_4.375-4.8125,0,0
177 | Germ_HJ_IGHJ2*01,PI_4.8125-5.25,0,0.003508772
178 | Germ_HJ_IGHJ2*01,PI_5.25-5.6875,0,0.005747126
179 | Germ_HJ_IGHJ2*01,PI_5.6875-6.125,0,0.003484321
180 | Germ_HJ_IGHJ2*01,PI_6.125-7.0,0,0.021459227
181 | Germ_HJ_IGHJ2*01,PI_7.0-14.0,0,0.005617978
182 | Germ_HJ_IGHJ3*01,Canonical_H1_0,0,0
183 | Germ_HJ_IGHJ3*01,Canonical_H1_1,0.007462687,0.002548656
184 | Germ_HJ_IGHJ3*01,Canonical_H1_2,0,0
185 | Germ_HJ_IGHJ3*01,Canonical_H1_3,0,0
186 | Germ_HJ_IGHJ3*01,Canonical_H2_0,0,0.009803922
187 | Germ_HJ_IGHJ3*01,Canonical_H2_6,0.007874016,0.002173913
188 | Germ_HJ_IGHJ3*01,Canonical_H3_0,0,0
189 | Germ_HJ_IGHJ3*01,Canonical_H3_1,0,0
190 | Germ_HJ_IGHJ3*01,Canonical_H3_2,0,0
191 | Germ_HJ_IGHJ3*01,Canonical_H3_3,0.013888889,0.003204195
192 | Germ_HJ_IGHJ3*01,Germ_HJ_IGHJ1*01,0,0
193 | Germ_HJ_IGHJ3*01,Germ_HJ_IGHJ2*01,0,0
194 | Germ_HJ_IGHJ3*01,Germ_HJ_IGHJ4*02,0,0
195 | Germ_HJ_IGHJ3*01,Germ_HJ_IGHJ5*01,0,0
196 | Germ_HJ_IGHJ3*01,Germ_HJ_IGHJ5*02,0,0
197 | Germ_HJ_IGHJ3*01,Germ_HJ_IGHJ6*01,0,0
198 | Germ_HJ_IGHJ3*01,PI_0.0-3.5,0.038461538,0.008426966
199 | Germ_HJ_IGHJ3*01,PI_3.5-3.9375,0,0.003060444
200 | Germ_HJ_IGHJ3*01,PI_3.9375-4.375,0,0.002190581
201 | Germ_HJ_IGHJ3*01,PI_4.375-4.8125,0,0.004201681
202 | Germ_HJ_IGHJ3*01,PI_4.8125-5.25,0,0
203 | Germ_HJ_IGHJ3*01,PI_5.25-5.6875,0,0
204 | Germ_HJ_IGHJ3*01,PI_5.6875-6.125,0,0
205 | Germ_HJ_IGHJ3*01,PI_6.125-7.0,0,0
206 | Germ_HJ_IGHJ3*01,PI_7.0-14.0,0,0.002840909
207 | Germ_HJ_IGHJ3*02,Canonical_H1_0,0,0
208 | Germ_HJ_IGHJ3*02,Canonical_H1_1,0.164179104,0.007877665
209 | Germ_HJ_IGHJ3*02,Canonical_H1_2,0,0
210 | Germ_HJ_IGHJ3*02,Canonical_H1_3,0,0
211 | Germ_HJ_IGHJ3*02,Canonical_H2_0,0.035714286,0.004385965
212 | Germ_HJ_IGHJ3*02,Canonical_H2_6,0.1640625,0.00797294
213 | Germ_HJ_IGHJ3*02,Canonical_H3_0,0.138888889,0.002873563
214 | Germ_HJ_IGHJ3*02,Canonical_H3_1,0.066666667,0.013452915
215 | Germ_HJ_IGHJ3*02,Canonical_H3_2,0.078431373,0.007075472
216 | Germ_HJ_IGHJ3*02,Canonical_H3_3,0.13253012,0.007848837
217 | Germ_HJ_IGHJ3*02,Germ_HJ_IGHJ1*01,0,0
218 | Germ_HJ_IGHJ3*02,Germ_HJ_IGHJ2*01,0,0
219 | Germ_HJ_IGHJ3*02,Germ_HJ_IGHJ3*01,0,0
220 | Germ_HJ_IGHJ3*02,Germ_HJ_IGHJ4*02,0,0
221 | Germ_HJ_IGHJ3*02,Germ_HJ_IGHJ5*01,0,0
222 | Germ_HJ_IGHJ3*02,Germ_HJ_IGHJ5*02,0,0
223 | Germ_HJ_IGHJ3*02,Germ_HJ_IGHJ6*01,0,0
224 | Germ_HJ_IGHJ3*02,Germ_HJ_IGHJ6*04,0,0
225 | Germ_HJ_IGHJ3*02,PI_0.0-3.5,0.090909091,0.013262599
226 | Germ_HJ_IGHJ3*02,PI_3.5-3.9375,0.151515152,0.014448669
227 | Germ_HJ_IGHJ3*02,PI_3.9375-4.375,0.159090909,0.003208556
228 | Germ_HJ_IGHJ3*02,PI_4.375-4.8125,0,0.011583012
229 | Germ_HJ_IGHJ3*02,PI_4.8125-5.25,0,0
230 | Germ_HJ_IGHJ3*02,PI_5.25-5.6875,0,0.005208333
231 | Germ_HJ_IGHJ3*02,PI_5.6875-6.125,0.032258065,0.005076142
232 | Germ_HJ_IGHJ3*02,PI_6.125-7.0,0,0
233 | Germ_HJ_IGHJ3*02,PI_7.0-14.0,0,0
234 | Germ_HJ_IGHJ4*02,Canonical_H1_0,0,0.001960784
235 | Germ_HJ_IGHJ4*02,Canonical_H1_1,0.358208955,0.350694444
236 | Germ_HJ_IGHJ4*02,Canonical_H1_2,0,0
237 | Germ_HJ_IGHJ4*02,Canonical_H1_3,0,0.000657895
238 | Germ_HJ_IGHJ4*02,Canonical_H2_0,0.057692308,0.042579075
239 | Germ_HJ_IGHJ4*02,Canonical_H2_6,0.346153846,0.344344106
240 | Germ_HJ_IGHJ4*02,Canonical_H3_0,0.155172414,0.11965812
241 | Germ_HJ_IGHJ4*02,Canonical_H3_1,0.094339623,0.079495268
242 | Germ_HJ_IGHJ4*02,Canonical_H3_2,0.08,0.06935123
243 | Germ_HJ_IGHJ4*02,Canonical_H3_3,0.304347826,0.276617685
244 | Germ_HJ_IGHJ4*02,Germ_HJ_IGHJ5*02,0,0
245 | Germ_HJ_IGHJ4*02,Germ_HJ_IGHJ6*01,0,0
246 | Germ_HJ_IGHJ4*02,PI_0.0-3.5,0.104477612,0.058390023
247 | Germ_HJ_IGHJ4*02,PI_3.5-3.9375,0.2,0.185449958
248 | Germ_HJ_IGHJ4*02,PI_3.9375-4.375,0.203125,0.168837434
249 | Germ_HJ_IGHJ4*02,PI_4.375-4.8125,0.02,0.042985075
250 | Germ_HJ_IGHJ4*02,PI_4.8125-5.25,0.0625,0.055457227
251 | Germ_HJ_IGHJ4*02,PI_5.25-5.6875,0.02,0.03198032
252 | Germ_HJ_IGHJ4*02,PI_5.6875-6.125,0.054545455,0.118343195
253 | Germ_HJ_IGHJ4*02,PI_6.125-7.0,0,0.046274038
254 | Germ_HJ_IGHJ4*02,PI_7.0-14.0,0.06122449,0.062821245
255 | Germ_HJ_IGHJ5*01,Canonical_H1_0,0,0
256 | Germ_HJ_IGHJ5*01,Canonical_H1_1,0.007462687,0.041241891
257 | Germ_HJ_IGHJ5*01,Canonical_H1_2,0,0
258 | Germ_HJ_IGHJ5*01,Canonical_H1_3,0,0
259 | Germ_HJ_IGHJ5*01,Canonical_H2_0,0,0.041899441
260 | Germ_HJ_IGHJ5*01,Canonical_H2_6,0.007874016,0.039248736
261 | Germ_HJ_IGHJ5*01,Canonical_H3_0,0,0.040084388
262 | Germ_HJ_IGHJ5*01,Canonical_H3_1,0,0.022099448
263 | Germ_HJ_IGHJ5*01,Canonical_H3_2,0.03030303,0.010619469
264 | Germ_HJ_IGHJ5*01,Canonical_H3_3,0,0.041834968
265 | Germ_HJ_IGHJ5*01,Germ_HJ_IGHJ1*01,0,0
266 | Germ_HJ_IGHJ5*01,Germ_HJ_IGHJ2*01,0,0
267 | Germ_HJ_IGHJ5*01,Germ_HJ_IGHJ4*02,0,0
268 | Germ_HJ_IGHJ5*01,Germ_HJ_IGHJ5*02,0,0
269 | Germ_HJ_IGHJ5*01,Germ_HJ_IGHJ6*01,0,0
270 | Germ_HJ_IGHJ5*01,PI_0.0-3.5,0,0.017408124
271 | Germ_HJ_IGHJ5*01,PI_3.5-3.9375,0,0.041578576
272 | Germ_HJ_IGHJ5*01,PI_3.9375-4.375,0,0.045410628
273 | Germ_HJ_IGHJ5*01,PI_4.375-4.8125,0,0.01754386
274 | Germ_HJ_IGHJ5*01,PI_4.8125-5.25,0,0.037037037
275 | Germ_HJ_IGHJ5*01,PI_5.25-5.6875,0,0.018126888
276 | Germ_HJ_IGHJ5*01,PI_5.6875-6.125,0.1,0.026425591
277 | Germ_HJ_IGHJ5*01,PI_6.125-7.0,0,0.015228426
278 | Germ_HJ_IGHJ5*01,PI_7.0-14.0,0,0.017612524
279 | Germ_HJ_IGHJ5*02,Canonical_H1_0,0,0
280 | Germ_HJ_IGHJ5*02,Canonical_H1_1,0.067164179,0.038452629
281 | Germ_HJ_IGHJ5*02,Canonical_H1_2,0,0
282 | Germ_HJ_IGHJ5*02,Canonical_H1_3,0,0.005952381
283 | Germ_HJ_IGHJ5*02,Canonical_H2_0,0.066666667,0.025495751
284 | Germ_HJ_IGHJ5*02,Canonical_H2_6,0.0625,0.038099831
285 | Germ_HJ_IGHJ5*02,Canonical_H3_0,0.037037037,0.02771855
286 | Germ_HJ_IGHJ5*02,Canonical_H3_1,0.055555556,0.022792023
287 | Germ_HJ_IGHJ5*02,Canonical_H3_2,0.024390244,0.02003643
288 | Germ_HJ_IGHJ5*02,Canonical_H3_3,0.08,0.038961039
289 | Germ_HJ_IGHJ5*02,Germ_HJ_IGHJ6*01,0,0
290 | Germ_HJ_IGHJ5*02,PI_0.0-3.5,0,0.038306452
291 | Germ_HJ_IGHJ5*02,PI_3.5-3.9375,0.016129032,0.034555712
292 | Germ_HJ_IGHJ5*02,PI_3.9375-4.375,0,0.024880383
293 | Germ_HJ_IGHJ5*02,PI_4.375-4.8125,0.090909091,0.010230179
294 | Germ_HJ_IGHJ5*02,PI_4.8125-5.25,0,0.025821596
295 | Germ_HJ_IGHJ5*02,PI_5.25-5.6875,0.2,0.01242236
296 | Germ_HJ_IGHJ5*02,PI_5.6875-6.125,0.1875,0.043041607
297 | Germ_HJ_IGHJ5*02,PI_6.125-7.0,0.222222222,0.023684211
298 | Germ_HJ_IGHJ5*02,PI_7.0-14.0,0,0.030364372
299 | Germ_HJ_IGHJ6*01,Canonical_H1_0,0,0.004627682
300 | Germ_HJ_IGHJ6*01,Canonical_H1_1,0.291044776,0.545748614
301 | Germ_HJ_IGHJ6*01,Canonical_H1_2,0,0.00042123
302 | Germ_HJ_IGHJ6*01,Canonical_H1_3,0,0
303 | Germ_HJ_IGHJ6*01,Canonical_H2_0,0.045454545,0.038399353
304 | Germ_HJ_IGHJ6*01,Canonical_H2_6,0.286821705,0.53838885
305 | Germ_HJ_IGHJ6*01,Canonical_H3_0,0.054545455,0.029085343
306 | Germ_HJ_IGHJ6*01,Canonical_H3_1,0.042553191,0.015835313
307 | Germ_HJ_IGHJ6*01,Canonical_H3_2,0.263157895,0.095841584
308 | Germ_HJ_IGHJ6*01,Canonical_H3_3,0.206521739,0.531785808
309 | Germ_HJ_IGHJ6*01,PI_0.0-3.5,0.181818182,0.083167529
310 | Germ_HJ_IGHJ6*01,PI_3.5-3.9375,0.273972603,0.245423729
311 | Germ_HJ_IGHJ6*01,PI_3.9375-4.375,0.096774194,0.167794799
312 | Germ_HJ_IGHJ6*01,PI_4.375-4.8125,0.024390244,0.056435242
313 | Germ_HJ_IGHJ6*01,PI_4.8125-5.25,0,0.055489022
314 | Germ_HJ_IGHJ6*01,PI_5.25-5.6875,0,0.038114754
315 | Germ_HJ_IGHJ6*01,PI_5.6875-6.125,0.042553191,0.107169811
316 | Germ_HJ_IGHJ6*01,PI_6.125-7.0,0,0.046774194
317 | Germ_HJ_IGHJ6*01,PI_7.0-14.0,0,0.079062376
318 | Germ_HJ_IGHJ6*04,Canonical_H1_0,0,0
319 | Germ_HJ_IGHJ6*04,Canonical_H1_1,0.082089552,0.000231696
320 | Germ_HJ_IGHJ6*04,Canonical_H1_2,0,0
321 | Germ_HJ_IGHJ6*04,Canonical_H1_3,0,0
322 | Germ_HJ_IGHJ6*04,Canonical_H2_0,0,0
323 | Germ_HJ_IGHJ6*04,Canonical_H2_6,0.086614173,0.000241663
324 | Germ_HJ_IGHJ6*04,Canonical_H3_0,0.034482759,0
325 | Germ_HJ_IGHJ6*04,Canonical_H3_1,0,0
326 | Germ_HJ_IGHJ6*04,Canonical_H3_2,0.073170732,0
327 | Germ_HJ_IGHJ6*04,Canonical_H3_3,0.092105263,0.00029129
328 | Germ_HJ_IGHJ6*04,Germ_HJ_IGHJ1*01,0,0
329 | Germ_HJ_IGHJ6*04,Germ_HJ_IGHJ2*01,0,0
330 | Germ_HJ_IGHJ6*04,Germ_HJ_IGHJ3*01,0,0
331 | Germ_HJ_IGHJ6*04,Germ_HJ_IGHJ4*02,0,0
332 | Germ_HJ_IGHJ6*04,Germ_HJ_IGHJ5*01,0,0
333 | Germ_HJ_IGHJ6*04,Germ_HJ_IGHJ5*02,0,0
334 | Germ_HJ_IGHJ6*04,Germ_HJ_IGHJ6*01,0,0
335 | Germ_HJ_IGHJ6*04,PI_0.0-3.5,0.121212121,0
336 | Germ_HJ_IGHJ6*04,PI_3.5-3.9375,0.101694915,0
337 | Germ_HJ_IGHJ6*04,PI_3.9375-4.375,0.025641026,0
338 | Germ_HJ_IGHJ6*04,PI_4.375-4.8125,0,0
339 | Germ_HJ_IGHJ6*04,PI_4.8125-5.25,0,0.003703704
340 | Germ_HJ_IGHJ6*04,PI_5.25-5.6875,0,0
341 | Germ_HJ_IGHJ6*04,PI_5.6875-6.125,0,0
342 | Germ_HJ_IGHJ6*04,PI_6.125-7.0,0,0
343 | Germ_HJ_IGHJ6*04,PI_7.0-14.0,0,0
344 | Germ_HV_IGHV3-23*01,Canonical_H1_0,0,0.003231018
345 | Germ_HV_IGHV3-23*01,Canonical_H1_1,1,0.996076621
346 | Germ_HV_IGHV3-23*01,Canonical_H1_2,0,0.000230787
347 | Germ_HV_IGHV3-23*01,Canonical_H1_3,0,0.000461574
348 | Germ_HV_IGHV3-23*01,Canonical_H2_0,0.052238806,0.045003462
349 | Germ_HV_IGHV3-23*01,Canonical_H2_6,0.947761194,0.954996538
350 | Germ_HV_IGHV3-23*01,Canonical_H3_0,0.141791045,0.0726979
351 | Germ_HV_IGHV3-23*01,Canonical_H3_1,0.074626866,0.044311101
352 | Germ_HV_IGHV3-23*01,Canonical_H3_2,0.246268657,0.090699285
353 | Germ_HV_IGHV3-23*01,Canonical_H3_3,0.537313433,0.792291715
354 | Germ_HV_IGHV3-23*01,Germ_HJ_IGHJ1*01,0.007462687,0.00761597
355 | Germ_HV_IGHV3-23*01,Germ_HJ_IGHJ2*01,0.014925373,0.003692592
356 | Germ_HV_IGHV3-23*01,Germ_HJ_IGHJ3*01,0.007462687,0.002538657
357 | Germ_HV_IGHV3-23*01,Germ_HJ_IGHJ3*02,0.164179104,0.007846757
358 | Germ_HV_IGHV3-23*01,Germ_HJ_IGHJ4*02,0.358208955,0.350565428
359 | Germ_HV_IGHV3-23*01,Germ_HJ_IGHJ5*01,0.007462687,0.041080083
360 | Germ_HV_IGHV3-23*01,Germ_HJ_IGHJ5*02,0.067164179,0.038541426
361 | Germ_HV_IGHV3-23*01,Germ_HJ_IGHJ6*01,0.291044776,0.547888299
362 | Germ_HV_IGHV3-23*01,Germ_HJ_IGHJ6*04,0.082089552,0.000230787
363 | Germ_HV_IGHV3-23*01,PI_0.0-3.5,0.194029851,0.08031387
364 | Germ_HV_IGHV3-23*01,PI_3.5-3.9375,0.402985075,0.300023079
365 | Germ_HV_IGHV3-23*01,PI_3.9375-4.375,0.21641791,0.208631433
366 | Germ_HV_IGHV3-23*01,PI_4.375-4.8125,0.02238806,0.052619432
367 | Germ_HV_IGHV3-23*01,PI_4.8125-5.25,0.02238806,0.062312486
368 | Germ_HV_IGHV3-23*01,PI_5.25-5.6875,0.02238806,0.03669513
369 | Germ_HV_IGHV3-23*01,PI_5.6875-6.125,0.074626866,0.129240711
370 | Germ_HV_IGHV3-23*01,PI_6.125-7.0,0.014925373,0.05123471
371 | Germ_HV_IGHV3-23*01,PI_7.0-14.0,0.029850746,0.078929148
372 | PI_0.0-3.5,PI_3.5-3.9375,0,0
373 | PI_0.0-3.5,PI_3.9375-4.375,0,0
374 | PI_0.0-3.5,PI_4.375-4.8125,0,0
375 | PI_0.0-3.5,PI_4.8125-5.25,0,0
376 | PI_0.0-3.5,PI_5.25-5.6875,0,0
377 | PI_0.0-3.5,PI_5.6875-6.125,0,0
378 | PI_0.0-3.5,PI_6.125-7.0,0,0
379 | PI_0.0-3.5,PI_7.0-14.0,0,0
380 | PI_3.5-3.9375,PI_3.9375-4.375,0,0
381 | PI_3.5-3.9375,PI_4.375-4.8125,0,0
382 | PI_3.5-3.9375,PI_4.8125-5.25,0,0
383 | PI_3.5-3.9375,PI_5.25-5.6875,0,0
384 | PI_3.5-3.9375,PI_5.6875-6.125,0,0
385 | PI_3.5-3.9375,PI_6.125-7.0,0,0
386 | PI_3.5-3.9375,PI_7.0-14.0,0,0
387 | PI_3.9375-4.375,PI_4.375-4.8125,0,0
388 | PI_3.9375-4.375,PI_4.8125-5.25,0,0
389 | PI_3.9375-4.375,PI_5.25-5.6875,0,0
390 | PI_3.9375-4.375,PI_5.6875-6.125,0,0
391 | PI_3.9375-4.375,PI_6.125-7.0,0,0
392 | PI_3.9375-4.375,PI_7.0-14.0,0,0
393 | PI_4.375-4.8125,PI_4.8125-5.25,0,0
394 | PI_4.375-4.8125,PI_5.25-5.6875,0,0
395 | PI_4.375-4.8125,PI_5.6875-6.125,0,0
396 | PI_4.375-4.8125,PI_6.125-7.0,0,0
397 | PI_4.375-4.8125,PI_7.0-14.0,0,0
398 | PI_4.8125-5.25,PI_5.25-5.6875,0,0
399 | PI_4.8125-5.25,PI_5.6875-6.125,0,0
400 | PI_4.8125-5.25,PI_6.125-7.0,0,0
401 | PI_4.8125-5.25,PI_7.0-14.0,0,0
402 | PI_5.25-5.6875,PI_5.6875-6.125,0,0
403 | PI_5.25-5.6875,PI_6.125-7.0,0,0
404 | PI_5.25-5.6875,PI_7.0-14.0,0,0
405 | PI_5.6875-6.125,PI_6.125-7.0,0,0
406 | PI_5.6875-6.125,PI_7.0-14.0,0,0
407 | PI_6.125-7.0,PI_7.0-14.0,0,0
--------------------------------------------------------------------------------
/ASAP/FeatureExtraction.py:
--------------------------------------------------------------------------------
1 | import Bio.SeqUtils.ProtParam
2 | import os
3 | import numpy as np
4 |
SET_NAME = 'MMP-cluster'
# When true, the feature extractors in this module skip the light chain ('L').
IF_ONLY_HEAVY = False
CNT_DB = 2
CNT_TARGET = 1
REFERENCE_PATH_TESTCASE = './testCase/MMP-cluster/reference-PDB/'
TARGETING_PATH_TESTCASE = './testCase/MMP-cluster/targeting-MMP/'
TARGET_DESIRE_SIZE = 166  # 44  # MMP-cluster


# Chothia numbering definition for CDR regions:
# chain -> CDR index -> [first position, last position], both ends inclusive.
CHOTHIA_CDR = {
    'L': {
        '1': [24, 34],
        '2': [50, 56],
        '3': [89, 97],
    },
    'H': {
        '1': [26, 32],
        '2': [52, 56],
        '3': [95, 102],
    },
}
18 |
19 | #################################################################################################################
20 | # function ReadAminoNumGerm:
21 | # Read in the Chothia-numbered reference and targeting files. Store the amino acids, the numbering and the putative germline.
22 | #
23 | # Input: targeting_direct, reference_direct
24 | # Output:1. dictionary of Amino, {'L': {}, 'H': {}}
25 | # 2. dictionary of Num , {'L': {}, 'H': {}}
26 | # 3. dictionary of Germ , {'L': {'V': {}, 'J':{}}, 'H': {'V': {}, 'J':{}}}
27 | # 4. list of DatasetName, [dh, dm, p1,....]
28 | # 5. list of DatasetSize, [ , , ,...]
29 | #################################################################################################################
30 |
def ReadAminoNumGerm(targeting_direct, reference_direct):
    """Parse the Chothia-numbered reference and targeting files.

    Every ``*.txt`` file of ``reference_direct`` and then of
    ``targeting_direct`` (each directory listing sorted by file name) is
    treated as one data set, named after the part of its file name that
    precedes the first ``_``.

    Lines are interpreted by their leading characters:

    * ``L`` / ``H``: one residue of that chain.  The last whitespace-separated
      token is the amino acid.  The position label is the second-to-last
      token; when the line has other than three tokens the second token is
      prepended to it (presumably position number + insertion letter).
    * ``#|``: every fourth such line within a file is taken as the germline
      line; its ``|``-separated fields 2 and 4 become the V and J germline.
    * ``/``: end of the current sequence.

    Sequences are keyed ``<dataset>_<index>``.  When ``IF_ONLY_HEAVY`` is
    false, two consecutive sequences share one index (assumed to be the two
    chains of one antibody -- TODO confirm against the input files).

    Args:
        targeting_direct: Directory containing the targeting files.
        reference_direct: Directory containing the reference files.

    Returns:
        Tuple ``(Amino, Num, Germ, DatasetName, DatasetSize)`` where
        ``Amino[chain][seq_name]`` is the list of residues,
        ``Num[chain][seq_name]`` the list of position labels,
        ``Germ[chain]['V' | 'J'][seq_name]`` the germline string (``' '`` if
        no germline line was seen for that sequence), ``DatasetName`` the
        list of data set names and ``DatasetSize`` the matching list of
        sequence counts (halved when ``IF_ONLY_HEAVY`` is false).

    Raises:
        ValueError: If a sequence terminator is met before any ``L``/``H``
            residue line has been read.
    """
    Amino = {'L': {}, 'H': {}}
    Num = {'L': {}, 'H': {}}
    Germ = {'L': {'V': {}, 'J': {}}, 'H': {'V': {}, 'J': {}}}
    DatasetName = []
    DatasetSize = []

    targeting_filenames = sorted(os.listdir(targeting_direct))
    reference_filenames = sorted(os.listdir(reference_direct))

    # Reference files are processed first, then the targeting files.
    sources = [(reference_direct, fname) for fname in reference_filenames]
    sources += [(targeting_direct, fname) for fname in targeting_filenames]

    # Chain letter of the most recent residue line.
    chain = None
    for direct, name in sources:
        if not name.endswith('.txt'):
            continue
        # os.path.join works whether or not the directory ends with a separator.
        with open(os.path.join(direct, name), 'r') as fi:
            data = fi.readlines()

        dataset = name.split('_')[0]
        DatasetName.append(dataset)
        cnt_pattern = 0
        cnt_seq = 0
        tmp_num = []
        tmp_seq = []
        tmp_germ_V = ' '
        tmp_germ_J = ' '

        for line in data:
            if line.startswith(('L', 'H')):
                # Residue line of a light or heavy chain.
                chain = line[0]
                tokens = line.split()
                tmp_seq.append(tokens[-1])
                if len(tokens) == 3:
                    tmp_num.append(tokens[-2])
                else:
                    tmp_num.append(tokens[1] + tokens[-2])
            elif line.startswith('#|'):
                # Every fourth '#|' line carries the germline assignment.
                cnt_pattern += 1
                if cnt_pattern % 4 == 0:
                    fields = line.split('|')
                    tmp_germ_V = fields[2]
                    tmp_germ_J = fields[4]
            elif line.startswith('/'):
                # Sequence terminator: store what was collected and reset.
                if chain is None:
                    raise ValueError(
                        'sequence terminator before any L/H residue line in '
                        + name
                    )
                if IF_ONLY_HEAVY:
                    seq_name = dataset + '_' + str(cnt_seq)
                else:
                    seq_name = dataset + '_' + str(cnt_seq // 2)
                cnt_seq += 1
                Amino[chain][seq_name] = tmp_seq
                Num[chain][seq_name] = tmp_num
                Germ[chain]['V'][seq_name] = tmp_germ_V
                Germ[chain]['J'][seq_name] = tmp_germ_J
                tmp_num = []
                tmp_seq = []
                tmp_germ_V = ' '
                tmp_germ_J = ' '

        if IF_ONLY_HEAVY:
            DatasetSize.append(cnt_seq)
        else:
            DatasetSize.append(cnt_seq // 2)
    return Amino, Num, Germ, DatasetName, DatasetSize
101 |
102 |
103 | #################################################################################################################
104 | # function GetOneHotGerm:
105 | # Transform the stored putative germline into one-hot encoded features.
106 | #
107 | # Input: Germ, DatasetSize, DatasetName
108 | # Output: 1. array of OneHotGerm, [[seq1 onehot], [seq2 onehot], [seq3 onehot], ...]
109 | # 2. list of GermFeatureNames according to one hot, [LV_IGLV1*1, LV_IGLV1*2,....
110 | # LJ_XXXX,
111 | # HV_XXXX,
112 | # HJ_XXXX ...]
113 | #################################################################################################################
114 |
def GetOneHotGerm(Germ, DatasetSize, DatasetName):
    """One-hot encode the stored putative germlines.

    One feature column is created for every distinct germline value found
    under each chain / segment of ``Germ`` (values sorted within a segment).
    The light chain is skipped when ``IF_ONLY_HEAVY`` is true.

    Args:
        Germ: ``Germ[chain][segment][seq_name] -> germline`` mapping.
        DatasetSize: Number of sequences per data set, parallel to
            ``DatasetName``.
        DatasetName: Data set names; sequences are looked up as
            ``<name>_<index>`` for ``index`` in ``range(size)``.

    Returns:
        Tuple ``(OneHotGerm, GermFeatureNames)``: a list with one 0/1 row per
        sequence (data sets in the given order) and the list of column names
        formatted as ``Germ_<chain><segment>_<germline>``.
    """
    # Keep (chain, segment, germline) next to each feature name so the names
    # never need to be parsed back; parsing would break on germline values
    # that themselves contain '_'.
    columns = []
    for H_L in Germ:
        if IF_ONLY_HEAVY and H_L == 'L':
            continue
        for V_J in Germ[H_L]:
            for can in sorted(set(Germ[H_L][V_J].values())):
                columns.append((H_L, V_J, can))

    GermFeatureNames = [
        'Germ_' + H_L + V_J + '_' + can for H_L, V_J, can in columns
    ]

    OneHotGerm = []
    for i, name in enumerate(DatasetName):
        for j in range(int(DatasetSize[i])):
            seq_name = name + '_' + str(j)
            OneHotGerm.append([
                1 if Germ[H_L][V_J][seq_name] == can else 0
                for H_L, V_J, can in columns
            ])

    return OneHotGerm, GermFeatureNames
146 |
147 |
148 | #################################################################################################################
149 | # function ReadCanonTemp:
150 | # Read in the template file (default PIGS) and store it.
151 | #
152 | # Output: 1. dictionary of CanonTemp, {'L': {'1': {'1':[]}, '2': {'1':[]}, '3': {'1':[]}}, 'H': {'1': {'1':[]}, '2': {'1':[]}, '3': {'1':[]}}}
153 | #################################################################################################################
def ReadCanonTemp(canonical_direct):
    """Read the canonical-structure template file.

    For every non-blank line, the first character selects the chain
    (``L``/``H``), the second character the CDR (``1``/``2``/``3``), the
    second whitespace-separated token is the canonical type and all remaining
    tokens form one template variant of that type.

    Args:
        canonical_direct: Path of the template file.

    Returns:
        ``CanonTemp[chain][cdr][type] -> list of template variants``, each
        variant being a list of string tokens.  Type ``'1'`` is always
        present for every chain / CDR, possibly with an empty list.
    """
    CanonTemp = {
        chain: {cdr: {'1': []} for cdr in ('1', '2', '3')}
        for chain in ('L', 'H')
    }
    with open(canonical_direct, 'r') as fi:
        for line in fi:
            fields = line.split()
            # Blank lines (e.g. a trailing newline) carry no template.
            if not fields:
                continue
            chain, cdr = line[0], line[1]
            CanonTemp[chain][cdr].setdefault(fields[1], []).append(fields[2:])
    return CanonTemp
163 |
164 | #################################################################################################################
165 | # function GetCanon:
166 | # Assign each sequence the predicted type of canonical structure according to the template.
167 | #
168 | # Input: canonical_direct, Amino, Num
169 | # Output: 1. dictionary of Canon, {'L': {'1': {}, '2': {}, '3': {}}, 'H': {'1': {}, '2': {}, '3': {}}}
170 | # optional: PIGS / Chothia
171 | #################################################################################################################
172 |
def _count_cdr_positions(labels, first, last):
    """Count position labels whose numeric part lies in ``[first, last]``."""
    count = 0
    for label in labels:
        # A trailing capital letter is not part of the number; strip it.
        if 'A' <= label[-1] <= 'Z':
            number = int(label[:-1])
        else:
            number = int(label)
        if first <= number <= last:
            count += 1
    return count


def _canon_template_matches(template, length, labels, residues):
    """Tell whether one template variant fits a CDR.

    ``template`` is ``[length, pos_1, allowed_1, pos_2, allowed_2, ...]``.
    It fits when the length token equals ``str(length)`` and, for every
    ``(pos, allowed)`` pair, ``pos`` occurs in ``labels`` and the residue at
    that position is contained in ``allowed``.
    """
    if template[0] != str(length):
        return False
    for idx in range(1, len(template), 2):
        pos = template[idx]
        allowed = template[idx + 1]
        if pos not in labels:
            return False
        if residues[labels.index(pos)] not in allowed:
            return False
    return True


def GetCanon(canonical_direct, Amino, Num):
    """Assign a canonical-structure type to every CDR of every sequence.

    The CDR length is the number of numbered positions falling inside the
    ``CHOTHIA_CDR`` range of that chain / CDR.  A type is assigned when at
    least one of its template variants matches both the length and *all* of
    its positional residue restrictions.  If several types match, the one
    listed last in the template wins; if none matches the type is ``'0'``.
    The light chain is skipped when ``IF_ONLY_HEAVY`` is true.

    Args:
        canonical_direct: Path of the template file (see ``ReadCanonTemp``).
        Amino: ``Amino[chain][seq_name] -> list of residues``.
        Num: ``Num[chain][seq_name] -> list of position labels``; the
            sequence names are taken from ``Num['H']``.

    Returns:
        ``Canon[chain][cdr][seq_name] -> canonical type`` (string).
    """
    CanonTemp = ReadCanonTemp(canonical_direct)
    Canon = {'L': {'1': {}, '2': {}, '3': {}}, 'H': {'1': {}, '2': {}, '3': {}}}

    for seq_name in Num['H']:
        for L_H in Canon:
            if IF_ONLY_HEAVY and L_H == 'L':
                continue
            labels = Num[L_H][seq_name]
            residues = Amino[L_H][seq_name]

            for cdr in Canon[L_H]:
                first, last = CHOTHIA_CDR[L_H][cdr]
                length = _count_cdr_positions(labels, first, last)

                # No early exit: a later matching type overrides an earlier one.
                for canon_type, variants in CanonTemp[L_H][cdr].items():
                    if any(
                        _canon_template_matches(variant, length, labels, residues)
                        for variant in variants
                    ):
                        Canon[L_H][cdr][seq_name] = canon_type

                # No template matched: fall back to type '0'.
                if seq_name not in Canon[L_H][cdr]:
                    Canon[L_H][cdr][seq_name] = '0'
    return Canon
225 |
226 | #################################################################################################################
227 | # function GetOneHotCanon:
228 | # Similar to GetOneHotGerm, transform the stored canonical structure into one-hot encoded features.
229 | #
230 | # Input: canonical_direct, Amino, Num, DatasetSize, DatasetName
231 | # Output: 1. array of OneHotCanon, [[seq1 onehot], [seq2 onehot], [seq3 onehot], ...]
232 | # 2. list of CanonFeatureNames according to one hot, [Canon_L1_1, Canon_L1_2,....
233 | # Canon_L2_1,
234 | # Canon_L3_1,
235 | # Canon_H1_1,
236 | # Canon_H2_1,
237 | # Canon_H3_1,...]
238 | #################################################################################################################
239 |
def GetOneHotCanon(canonical_direct, Amino, Num, DatasetSize, DatasetName):
    """One-hot encode the canonical-structure type of every CDR.

    The types come from ``GetCanon``.  One column is produced per distinct
    type found for each chain / CDR (types sorted within a CDR); the light
    chain is left out when ``IF_ONLY_HEAVY`` is true.

    Args:
        canonical_direct: Path of the template file, passed to ``GetCanon``.
        Amino: Residues per chain and sequence, passed to ``GetCanon``.
        Num: Position labels per chain and sequence, passed to ``GetCanon``.
        DatasetSize: Number of sequences per data set, parallel to
            ``DatasetName``.
        DatasetName: Data set names; sequences are looked up as
            ``<name>_<index>``.

    Returns:
        Tuple ``(OneHotCanon, CanonFeatureNames)``: one 0/1 row per sequence
        and the column names, formatted ``Canonical_<chain><cdr>_<type>``.
    """
    Canon = GetCanon(canonical_direct, Amino, Num)

    # Column names, in chain -> CDR -> sorted type order.
    CanonFeatureNames = []
    for chain, per_cdr in Canon.items():
        if IF_ONLY_HEAVY and chain == 'L':
            continue
        for cdr, assigned in per_cdr.items():
            for canon_type in sorted(set(assigned.values())):
                CanonFeatureNames.append(
                    'Canonical_' + chain + cdr + '_' + canon_type
                )

    # Decode every column name once: (chain, cdr, type).
    decoded = []
    for feature in CanonFeatureNames:
        pieces = feature.split('_')
        decoded.append((pieces[1][0], pieces[1][1], pieces[2]))

    OneHotCanon = []
    for idx, name in enumerate(DatasetName):
        rows = []
        for seq_idx in range(int(DatasetSize[idx])):
            seq_name = name + '_' + str(seq_idx)
            rows.append([
                1 if Canon[chain][cdr][seq_name] == canon_type else 0
                for chain, cdr, canon_type in decoded
            ])
        OneHotCanon.extend(rows)

    return OneHotCanon, CanonFeatureNames
273 |
274 | #################################################################################################################
275 | # function GetCDRH3:
276 | # Take the CDR-H3 of each sequence.
277 | #
278 | # Input: Amino, Num
279 | # Output: 1. dictionary of CDRH3, {}
280 | #################################################################################################################
281 |
def GetCDRH3(Amino, Num):
    """Extract the CDR-H3 residues of every heavy-chain sequence.

    Input:  Amino['H'][seq_name] holds the residues and Num['H'][seq_name]
            the matching position labels (a number, optionally followed by
            one upper-case insertion letter).
    Output: 1. dictionary CDRH3, {seq_name: concatenated CDR-H3 residues}
    """
    first = CHOTHIA_CDR['H']['3'][0]
    last = CHOTHIA_CDR['H']['3'][1]

    CDRH3 = {}
    for seq_name in Amino['H']:
        residues = Amino['H'][seq_name]
        picked = []
        for idx, label in enumerate(Num['H'][seq_name]):
            # Drop a trailing insertion letter before reading the number.
            digits = label[:-1] if 'A' <= label[-1] <= 'Z' else label
            if first <= int(digits) <= last:
                picked.append(residues[idx])
        CDRH3[seq_name] = ''.join(picked)
    return CDRH3
295 |
296 | #################################################################################################################
297 | # function GetCDRH3PI:
298 | # Calculate the pI value for each sequence
299 | #
300 | # Input: CDRH3
301 | # Output: 1. dictionary of PI, {}
302 | #################################################################################################################
303 |
def GetCDRH3PI(CDRH3):
    """Compute the isoelectric point (pI) of every CDR-H3 sequence.

    Input:  CDRH3, {seq_name: CDR-H3 residue string}
    Output: 1. dictionary PI_CDRH3, {seq_name: pI}; -1 marks a sequence whose
               pI could not be computed.
    """
    PI_CDRH3 = {}
    for seq_name in CDRH3:
        prot = Bio.SeqUtils.ProtParam.ProteinAnalysis(CDRH3[seq_name])
        try:
            PI_CDRH3[seq_name] = prot.isoelectric_point()
        except Exception:
            # Best effort: keep the -1 sentinel for sequences the analysis
            # rejects, but let KeyboardInterrupt / SystemExit propagate
            # (a bare ``except:`` would swallow those too).
            PI_CDRH3[seq_name] = -1

    return PI_CDRH3
315 |
316 |
317 | #################################################################################################################
318 | # function GetPIBin:
319 | # Halve the bin of pI following the binning method using sequence's pI information.
320 | #
321 | # Input: PI_CDRH3
322 | # Output: 1. a list of PITheresholds, []
323 | #################################################################################################################
324 |
def GetPIBin(PI_CDRH3):
    """Derive pI bin boundaries by repeatedly halving over-populated bins.

    Starting from [0.0, 7.0, 14.0], the first bin (scanning left to right)
    that holds more than 10% of the sequences is split at its midpoint, and
    the scan restarts. Bins narrower than twice the tolerance (0.6) are never
    split. The procedure stops when a full scan splits nothing.

    Input:  PI_CDRH3, {seq_name: pI}
    Output: 1. sorted list of thresholds PITheresholds

    NOTE(review): the upper-bound comparison on the counting line was lost in
    this copy of the file; ``<=`` is assumed here (half-open bins
    ``(low, high]``) - confirm against the upstream source.
    """
    PITheresholds = [0.0, 7.0, 14.0]
    tenPercent = 0.1 * len(PI_CDRH3)
    PITolerance = 0.3

    # Loop until one complete pass makes no split. The earlier condition
    # (``cnt > tenPercent or len(PITheresholds) == 3``) never terminated when
    # no bin qualified while there were still three thresholds, e.g. for an
    # empty input or when every pI is the -1 sentinel.
    split_made = True
    while split_made:
        split_made = False
        for i in range(1, len(PITheresholds)):
            low, high = PITheresholds[i - 1], PITheresholds[i]
            if (high - low) < (2 * PITolerance):
                continue
            cnt = sum(1 for seq in PI_CDRH3 if low < PI_CDRH3[seq] <= high)
            if cnt > tenPercent:
                PITheresholds.append((low + high) / 2.0)
                PITheresholds = sorted(PITheresholds)
                split_made = True
                break
    return PITheresholds
347 |
348 | #################################################################################################################
349 | # function GetOneHotPI:
350 | # Transform the pI values into one-hot encoded pI bin features.
351 | #
352 | # Input: CDRH3, DatasetSize, DatasetName
353 | # Output: 1. array of OneHotPI, [[seq1 onehot],
354 | # [seq2 onehot],
355 | # [seq3 onehot],
356 | # ...]
357 | # 2. list of PIFeatureNames according to one hot, [PI_bin1, PI_bin2, PI_bin3...]
358 | #################################################################################################################
359 |
def GetOneHotPI(CDRH3, DatasetSize, DatasetName):
    """One-hot encode each sequence's CDR-H3 pI into its pI bin.

    Input:  CDRH3, DatasetSize, DatasetName
    Output: 1. OneHotPI, one 0/1 row per sequence, datasets concatenated in
               DatasetName order; a row stays all-zero when the pI falls in
               no bin
            2. PIFeatureNames, 'PI_<low>-<high>' per bin
    """
    PI_CDRH3 = GetCDRH3PI(CDRH3)
    PITheresholds = GetPIBin(PI_CDRH3)

    # Consecutive threshold pairs are the bins.
    bins = list(zip(PITheresholds[:-1], PITheresholds[1:]))
    PIFeatureNames = ['PI_' + str(low) + '-' + str(high) for low, high in bins]

    OneHotPI = []
    for set_idx, name in enumerate(DatasetName):
        for seq_idx in range(int(DatasetSize[set_idx])):
            value = PI_CDRH3[name + '_' + str(seq_idx)]
            row = [0] * len(PIFeatureNames)
            # Both bounds are inclusive; the first matching bin wins.
            for col, (low, high) in enumerate(bins):
                if float(low) <= value <= float(high):
                    row[col] = 1
                    break
            OneHotPI.append(row)
    return OneHotPI, PIFeatureNames
383 |
384 | #################################################################################################################
385 | # function GetPositionalMotifFreq:
386 | # Count the frequency of each possible frequent positional motif for each dataset.
387 | #
388 | # Input: CDRH3
389 | # Output: 1. dictionary of MotifFreq, {'r1':{}, 'r2':{},'t1':{}, 't2':{}, 't3':{}, 't4':{}, 't5':{}, 't6':{}, 't7':{}, 't8':{}}
390 | #################################################################################################################
391 |
def GetPositionalMotifFreq(CDRH3):
    """Count positional motifs of CDR-H3 sequences per dataset.

    A positional motif is '<start index>_<substring>' for substrings of
    length 2 to 9.

    Input:  CDRH3, {seq_name: CDR-H3 string}; the dataset is the part of
            seq_name before the first '_'.
    Output: 1. MotifFreq, {dataset: {positional motif: count}}; the keys
               'r1', 'r2', 't1'..'t8' are always present, and any other
               dataset prefix is added on demand instead of raising KeyError
            2. MotifDict, {seq_name: [positional motifs in generation order]}
    """
    MotifFreq = {'r1': {}, 'r2': {}, 't1': {}, 't2': {}, 't3': {}, 't4': {},
                 't5': {}, 't6': {}, 't7': {}, 't8': {}}
    MotifDict = {}
    for seq_name in CDRH3:
        seq = CDRH3[seq_name]
        freq = MotifFreq.setdefault(seq_name.split('_')[0], {})
        motifs = []
        for length in range(2, 10):
            # NOTE(review): range(len(seq) - length) stops one window short
            # of the end, so the motif that ends on the last residue is never
            # generated. Kept as-is because changing it alters the feature
            # set - confirm whether this is intended.
            for start in range(len(seq) - length):
                motif = str(start) + '_' + seq[start:start + length]
                motifs.append(motif)
                freq[motif] = freq.get(motif, 0) + 1
        MotifDict[seq_name] = motifs
    return MotifFreq, MotifDict
412 |
413 | #################################################################################################################
414 | # function GetImpMotif (Version 1.0):
415 | # Take only the most 2 frequent motif in each data set, top 2 * 10 set * 9 length = 180
416 | #
417 | # Input: MotifFreq
418 | # Output: 1. list of ImpMotif, [motif1, motif2, ...]
419 | #################################################################################################################
420 |
def GetImpMotif(MotifFreq):
    """Select the most frequent positional motifs of every dataset.

    For each dataset and each prefix value 2..10, the two motifs with the
    highest count are kept (ties keep dictionary order).

    NOTE(review): the prefix compared here is the text before the first '_'
    of the motif key. GetPositionalMotifFreq builds keys as
    '<start index>_<substring>', so this groups by start position 2..10, not
    by motif length as the header comment suggests - confirm the intent.

    Input:  MotifFreq, {dataset: {positional motif: count}}
    Output: 1. sorted list of the distinct selected motifs
    """
    Top2 = 2
    selected = set()
    for f_name in MotifFreq:
        motif_dic = MotifFreq[f_name]
        for i in range(2, 11):
            prefix = str(i)
            ranked = [
                (motif, count)
                for motif, count in motif_dic.items()
                if motif.split('_')[0] == prefix
            ]
            ranked.sort(key=lambda pair: pair[1], reverse=True)
            selected.update(motif for motif, _ in ranked[:Top2])
    return sorted(selected)
438 |
439 | #################################################################################################################
440 | # function GetCDRH3Motif:
441 | # Assign present frequent motif for each sequence
442 | #
443 | # Input: ImpMotif, CDRH3
444 | # Output: 1. dictionary of Motif_CDRH3, {}
445 | #################################################################################################################
446 |
def GetCDRH3Motif(ImpMotif, CDRH3, MotifDict):
    """Flag which selected motifs occur in each sequence.

    Input:  ImpMotif, list of selected positional motifs
            CDRH3, {seq_name: ...}; only its keys are used
            MotifDict, {seq_name: positional motifs of that sequence}
    Output: 1. dictionary Motif_CDRH3, {seq_name: 0/1 list aligned with
               ImpMotif}
    """
    return {
        seq_name: [int(motif in MotifDict[seq_name]) for motif in ImpMotif]
        for seq_name in CDRH3
    }
456 |
457 | #################################################################################################################
458 | # function MultiHotMotif:
459 | # Transfer motif information for each sequence to multi-hot encoded features.
460 | #
461 | # Input: CDRH3, DatasetSize, DatasetName
462 | # Output: 1. array of MultiHotMotif, [[seq1 multihot], [seq2 multihot], [seq3 multihot],...]
463 | # 2. list of MotifFeatureNames according to multi hot, [Motif1, Motif2, ...]
464 | #################################################################################################################
465 |
def MultiHotMotif(CDRH3, DatasetSize, DatasetName):
    """Multi-hot encode the selected positional motifs of every sequence.

    Input:  CDRH3, DatasetSize, DatasetName
    Output: 1. list of multi-hot rows, one per sequence, datasets
               concatenated in DatasetName order
            2. MotifFeatureNames, 'Motif_<positional motif>' per column
    """
    MotifFreq, MotifDict = GetPositionalMotifFreq(CDRH3)
    ImpMotif = GetImpMotif(MotifFreq)
    Motif_CDRH3 = GetCDRH3Motif(ImpMotif, CDRH3, MotifDict)

    MotifFeatureNames = ["Motif_" + motif for motif in ImpMotif]

    rows = []
    for set_idx, name in enumerate(DatasetName):
        # Sequences are addressed as '<dataset name>_<index>'.
        rows.extend(
            Motif_CDRH3[name + '_' + str(seq_idx)]
            for seq_idx in range(int(DatasetSize[set_idx]))
        )
    return rows, MotifFeatureNames
486 |
487 | #################################################################################################################
488 | # function GetFeatureVectors:
489 | # Combine germline, canonical structure, pI, motif features to feature vectors
490 | #
491 | # Input: OneHotGerm, GermFeatureNames, OneHotCanon, CanonFeatureNames, OneHotPI, PIFeatureNames, MultiHotMotif, MotifFeatureNames
492 | # Output: 1. AllFeatureVectors for every sequence, [[seq1 LV, LJ, HV, HJ, L1, L2, L3, L1, L2, L3, pI, motif1, motif2, motifi...],
493 | # [seq2 LV, LJ, HV, HJ, L1, L2, L3, L1, L2, L3, pI, motif1, motif2, motifi...],
494 | # ...]
495 | #
496 | # 2. AllFeatureNames [LV, LJ, HV, HJ, L1, L2, L3, L1, L2, L3, pI, motif1, motif2, motifi...]
497 | #################################################################################################################
498 |
def GetFeatureVectors(OneHotGerm, GermFeatureNames,
                      OneHotCanon, CanonFeatureNames,
                      OneHotPI, PIFeatureNames,
                      MultiHotMotif, MotifFeatureNames):
    """Concatenate germline, canonical, pI and motif features per sequence.

    Output: 1. AllFeatureVectors, numpy array with one row per sequence
            2. AllFeatureNames, column names in the same order
            3. ExcludeIGHVVectors, the same array unless SET_NAME is 'IGHV',
               in which case columns whose name starts with
               'Germ_HV_IGHV3-23' are removed
            4. ExcludeFeatureNames, names matching output 3
    """
    AllFeatureNames = (GermFeatureNames + CanonFeatureNames
                       + PIFeatureNames + MotifFeatureNames)

    # Row i is the four per-sequence encodings laid end to end.
    AllFeatureVectors = np.array([
        [*OneHotGerm[i], *OneHotCanon[i], *OneHotPI[i], *MultiHotMotif[i]]
        for i in range(len(OneHotGerm))
    ])

    if SET_NAME != 'IGHV':
        return AllFeatureVectors, AllFeatureNames, AllFeatureVectors, AllFeatureNames

    name_index = [
        i for i, name in enumerate(AllFeatureNames)
        if not name.startswith('Germ_HV_IGHV3-23')
    ]
    ExcludeFeatureNames = [AllFeatureNames[i] for i in name_index]
    ExcludeIGHVVectors = AllFeatureVectors[:, name_index]

    return AllFeatureVectors, AllFeatureNames, ExcludeIGHVVectors, ExcludeFeatureNames
527 |
if __name__ == '__main__':
    # Script entry: load the IGHV test case, using the same directory for
    # the targeting and the reference data.
    targeting_direct = '../testCase-MMP/data/IGHV/'
    reference_direct = '../testCase-MMP/data/IGHV/'
    (Amino, Num, Germ,
     DatasetName, DatasetSize) = ReadAminoNumGerm(targeting_direct, reference_direct)
532 |
533 |
--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 |
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
162 |
163 |
164 |
165 | distance
166 | motif
167 | PostionalMotifposi
168 | jaccar
169 | corr
170 | print
171 | moti
172 | head
173 | heatmap
174 | rank
175 | less'
176 | .index
177 | open(
178 | WriteFisherFS
179 | sta
180 | heat map
181 | float(
182 | feature
183 | print(
184 | all
185 | shuffle
186 | referen
187 | set
188 | frequency
189 | _new
190 | set(
191 | startswith
192 | 2_GG
193 | mean
194 | importance
195 |
196 |
197 |
198 |
199 |
200 |
201 |
202 |
203 |
204 |
205 |
206 |
207 |
208 |
209 |
210 |
211 |
212 |
213 |
214 |
215 |
216 |
217 |
218 |
219 |
220 |
221 |
222 | true
223 | DEFINITION_ORDER
224 |
225 |
226 |
227 |
228 |
229 |
230 |
231 |
232 |
233 |
234 |
235 |
236 |
237 |
238 |
239 |
240 |
241 |
242 |
243 |
244 |
245 |
246 |
247 |
248 |
249 |
250 |
251 |
252 |
253 |
254 |
255 |
256 |
257 |
258 |
259 |
260 |
261 |
262 |
263 |
264 |
265 |
266 |
267 |
268 |
269 |
270 |
271 |
272 |
273 |
274 |
275 |
276 |
277 |
278 |
279 |
280 |
281 |
282 |
283 |
284 |
285 |
286 |
287 |
288 |
289 |
290 |
291 |
292 |
293 |
294 |
295 |
296 |
297 |
298 |
299 |
300 |
301 |
302 |
303 |
304 |
305 |
306 |
307 |
308 |
309 |
310 |
311 |
312 |
313 |
314 |
315 |
316 |
317 |
318 |
319 |
320 |
321 |
322 |
323 |
324 |
325 |
326 |
327 |
328 |
329 |
330 |
331 |
332 |
333 |
334 |
335 |
336 |
337 |
338 |
339 |
340 |
341 |
342 |
343 |
344 |
345 |
346 |
347 |
348 |
349 |
350 |
351 |
352 |
353 |
354 |
355 |
356 |
357 |
358 |
359 |
360 |
361 |
362 |
363 |
364 |
365 |
366 |
367 |
368 |
369 |
370 |
371 |
372 |
373 |
374 |
375 |
376 |
377 |
378 |
379 |
380 |
381 |
382 |
383 |
384 |
385 |
386 |
387 |
388 |
389 |
390 |
391 |
392 |
393 |
394 |
395 |
396 |
397 |
398 |
399 |
400 |
401 |
402 |
403 |
404 |
405 |
406 |
407 |
408 |
409 |
410 |
411 |
412 |
413 |
414 |
415 |
416 |
417 |
418 |
419 |
420 |
421 |
422 |
423 |
424 |
425 |
426 |
427 |
428 |
429 | project
430 |
431 |
432 |
433 |
434 |
435 |
436 |
437 |
438 |
439 |
440 |
441 |
442 |
443 |
444 |
445 |
446 |
447 |
448 |
449 |
450 |
451 |
452 |
453 |
454 |
455 |
456 |
457 |
458 |
459 |
460 |
461 |
462 |
463 |
464 |
465 |
466 |
467 |
468 |
469 |
470 |
471 |
472 |
473 |
474 |
475 | 1532624332261
476 |
477 |
478 | 1532624332261
479 |
480 |
481 |
482 |
483 |
484 |
485 |
486 |
487 |
488 |
489 |
490 |
491 |
492 |
493 |
494 |
495 |
496 |
497 |
498 |
499 |
500 |
501 |
502 |
503 |
504 |
505 |
506 |
507 |
508 |
509 |
510 |
511 |
512 |
513 |
514 |
515 |
516 |
517 |
518 |
519 |
520 |
521 |
522 |
523 |
524 |
525 |
526 |
527 |
528 |
529 |
530 |
531 |
532 |
533 |
534 |
535 |
536 |
537 |
538 |
539 |
540 |
541 |
542 |
543 |
544 |
545 |
546 |
547 |
548 |
549 |
550 |
551 |
552 |
553 |
554 |
555 |
556 |
557 |
558 |
559 |
560 |
561 |
562 |
563 |
564 |
565 |
566 |
567 |
568 |
569 |
570 |
571 |
572 |
573 |
574 |
575 |
576 |
577 |
578 |
579 |
580 |
581 |
582 |
583 |
584 |
585 |
586 |
587 |
588 |
589 |
590 |
591 |
592 |
593 |
594 |
595 |
596 |
597 |
598 |
599 |
600 |
601 |
602 |
603 |
604 |
605 |
606 |
607 |
608 |
609 |
610 |
611 |
612 |
613 |
614 |
615 |
616 |
617 |
618 |
619 |
620 |
621 |
622 |
623 |
624 |
625 |
626 |
627 |
628 |
629 |
630 |
631 |
632 |
633 |
634 |
635 |
636 |
637 |
638 |
639 |
640 |
641 |
642 |
643 |
644 |
645 |
646 |
647 |
648 |
649 |
650 |
651 |
652 |
653 |
654 |
655 |
656 |
657 |
658 |
659 |
660 |
661 |
662 |
663 |
664 |
665 |
666 |
667 |
668 |
669 |
670 |
671 |
672 |
673 |
674 |
675 |
676 |
677 |
678 |
679 |
680 |
681 |
682 |
683 |
684 |
685 |
686 |
687 |
688 |
689 |
690 |
--------------------------------------------------------------------------------
/ASAP/SequenceAndFeatureAnalysis.py:
--------------------------------------------------------------------------------
1 | import random
2 | from matplotlib import rc, rcParams
3 | import matplotlib.pyplot as plt
4 | import scipy.stats as sta
5 | from sklearn.ensemble import ExtraTreesClassifier
6 | from sklearn import svm
7 | from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
8 | from scipy import interp
9 | from sklearn.metrics import roc_curve, auc
10 | import numpy as np
11 | import pandas as pd
12 |
13 | np.random.seed(8)
14 |
15 | BLOSUM62_DIRECT = "./data/blosum62.csv"
16 |
17 | SET_NAME = 'MMP-cluster'
18 | IF_ONLY_HEAVY = False
19 | CNT_DB = 2
20 | CNT_TARGET = 1
21 | REFERENCE_PATH_TESTCASE = './testCase/MMP-cluster/reference-PDB/'
22 | TARGETING_PATH_TESTCASE = './testCase/MMP-cluster/targeting-MMP/'
23 | TARGET_DESIRE_SIZE = 166 #44 #MMP-cluster
24 |
25 |
26 |
27 | #################################################################################################################
28 | # function DuplicateSelectFeature:
29 | # Sample with replacement, each time the selection is on the total
30 | #
31 | # Input: DatasetName, DatasetSize, AllFeatureVectors
32 | # Output: 1. X_DS
33 | # 2. Y_DS
34 | # 3. SeqName_DS
35 | #################################################################################################################
def DuplicateSelectFeature(size, DatasetName, DatasetSize, AllFeatureVectors):
    """Draw a sample of sequences from every dataset.

    The first CNT_DB datasets are labelled 0 and contribute
    int(size * CNT_TARGET / CNT_DB) sequences each; the remaining datasets
    are labelled 1 and contribute ``size`` each. When a dataset is large
    enough the draw is a shuffled prefix (no repeats); otherwise indices are
    drawn one at a time with replacement.

    Input:  size, DatasetName, DatasetSize, AllFeatureVectors (rows ordered
            dataset by dataset)
    Output: 1. X_DS, sampled feature rows
            2. Y_DS, 0/1 labels
            3. SeqName_DS, '<dataset name>_<index>' of every sampled row
    """
    X_DS, Y_DS, SeqName_DS = [], [], []
    offset = 0  # row of AllFeatureVectors where the current dataset starts
    for set_idx, set_size in enumerate(DatasetSize):
        is_reference = set_idx < CNT_DB
        wanted = int(size * CNT_TARGET / CNT_DB) if is_reference else size
        label = 0 if is_reference else 1

        if wanted <= set_size:
            order = np.array(list(range(set_size)))
            np.random.shuffle(order)
            picks = (order[j] for j in range(wanted))
        else:
            # Lazy on purpose: one RNG call per sampled row, in order.
            picks = (np.random.randint(set_size) for _ in range(wanted))

        for idx in picks:
            SeqName_DS.append(DatasetName[set_idx] + '_' + str(idx))
            X_DS.append(AllFeatureVectors[offset + idx])
            Y_DS.append(label)
        offset += set_size
    return X_DS, Y_DS, SeqName_DS
73 |
74 | #################################################################################################################
75 | # function IterationDuplicateSelectFeature:
76 | # Iteratively sample with replacement
77 | #
78 | # Input: DatasetName, DatasetSize, AllFeatureVectors
79 | # Output: 1. X_DS
80 | # 2. Y_DS
81 | # 3. SeqName_DS
82 | #################################################################################################################
def IterationDuplicateSelectFeature(size, iterate, DatasetName, DatasetSize, AllFeatureVectors):
    """Repeat DuplicateSelectFeature ``iterate`` times.

    Output: 1. X_IDS, list of sampled feature sets, one per iteration
            2. Y_IDS, list of label lists, one per iteration
            3. SeqName_IDS, list of sequence-name lists, one per iteration
    """
    X_IDS, Y_IDS, SeqName_IDS = [], [], []
    for _ in range(iterate):
        X_DS, Y_DS, SeqName_DS = DuplicateSelectFeature(
            size, DatasetName, DatasetSize, AllFeatureVectors)
        X_IDS.append(X_DS)
        Y_IDS.append(Y_DS)
        SeqName_IDS.append(SeqName_DS)
    return X_IDS, Y_IDS, SeqName_IDS
93 |
94 |
95 | #################################################################################################################
96 | # function normalize:
97 | # Normalize the distance matrix
98 | #
99 | # Input: dist
100 | # Output: 1. tmp_dist
101 | #################################################################################################################
102 |
def normalize(dist):
    """Min-max scale a numpy array to the range [0, 1].

    Input:  dist, numpy array
    Output: 1. scaled array of the same shape; an array whose values are all
               equal maps to all zeros (previously 0/0 produced NaN)
    """
    tmp_min = dist.min()
    span = dist.max() - tmp_min
    if span == 0:
        return np.zeros(dist.shape, dtype=float)
    return (dist - tmp_min) / span
108 |
109 | #################################################################################################################
110 | # function Draw_heatmap:
111 | # Draw heatmap according to the input distance matrix
112 | #
113 | # Input: dist, name
114 | #################################################################################################################
def Draw_heatmap(size, dist, name, DatasetSize):
    """Render a normalized distance matrix as a heatmap and save it as PNG.

    Input:  size, number of sampled sequences per targeting set
            dist, square distance matrix (numpy array)
            name, plot title, also used in the output file name
            DatasetSize, unused; kept for interface compatibility
    Writes './results/<SET_NAME>_<name>.png'.
    """
    rc('font', size=20)
    # Bold ticks under LaTeX text rendering. The preamble must be a single
    # string: Matplotlib deprecated list values for this rcParam in 3.3.
    rcParams['text.latex.preamble'] = r'\usepackage{sfmath} \boldmath'
    fig, ax = plt.subplots(figsize=(10, 7))

    # Heatmap with colour bar.
    dist = normalize(dist)
    plt.imshow(dist, cmap='Blues', interpolation='nearest')
    plt.colorbar(ticks=np.linspace(0.0, 1.0, 6, endpoint=True))
    plt.title(name, y=1.08)

    N_DB = size * (CNT_TARGET / CNT_DB)  # rows per reference set
    N_TARGET = size                      # rows per targeting set

    # Tick positions: centre of every reference and targeting band.
    ticks = [(i + 0.5) * N_DB for i in range(CNT_DB)]
    ticks += [(i + 0.5) * N_TARGET + CNT_DB * N_DB for i in range(CNT_TARGET)]

    target_labels = [str(i + 1) for i in range(CNT_TARGET)]
    if SET_NAME == 'IGHV':
        labels = ['Reference', 'MMP-targeting']
    elif SET_NAME == 'MMP':
        labels = ['Human', 'Murine', 'MMP-targeting']
    elif SET_NAME == 'MMP-cluster':
        labels = ['Human', 'Murine'] + target_labels
    else:
        labels = ['Naive 1', 'Naive 2'] + target_labels
    plt.xticks(ticks, labels)
    plt.yticks(ticks, labels)

    ax.xaxis.tick_top()

    # Separator lines after every band except the last one; the line between
    # reference and targeting sets is drawn thicker.
    boundaries = [(i + 1) * N_DB for i in range(CNT_DB)]
    boundaries += [(i + 1) * N_TARGET + CNT_DB * N_DB for i in range(CNT_TARGET - 1)]
    for idx, position in enumerate(boundaries):
        width = 3 if idx == CNT_DB - 1 else 1
        ax.axhline(position, linestyle='-', color='black', linewidth=width)
        ax.axvline(position, linestyle='-', color='black', linewidth=width)

    fig.savefig('./results/' + SET_NAME + '_' + name + '.png')
171 |
172 | ###################### Section 3.1 Sequence and feature similarity analysis (Heat map) ##########################
173 |
174 | #################################################################################################################
175 | # function ReadBLOSUM:
176 | # Read in the BLOSUM 62 substitution matrix
177 | #
178 | # Output: 1. BLOSUM, a dictionary of pairwise permutation
179 | #################################################################################################################
def ReadBLOSUM():
    """Load the substitution matrix stored at BLOSUM62_DIRECT.

    The first CSV row lists the residue names; row i + 1 holds the scores of
    residue i against every residue, in the same order.

    Output: 1. BLOSUM, {residue pair (two names concatenated): score as read
               from the file, i.e. a string}
    """
    with open(BLOSUM62_DIRECT, "r") as fi:
        rows = [line.strip().split(',') for line in fi.readlines()]

    names = rows[0]
    BLOSUM = {}
    for i, first in enumerate(names):
        for j, second in enumerate(names):
            BLOSUM[first + second] = rows[i + 1][j]
    return BLOSUM
192 |
193 | #################################################################################################################
194 | # function CalBLOSUM:
195 | # Calculate the sequence similarity for each sequence.
196 | #
197 | # Input: SeqName_DS, Amino, Num, BLOSUM, chain
198 | # Output: 1. dist
199 | #################################################################################################################
def CalBLOSUM(SeqName_DS, Amino, Num, BLOSUM, chain):
    """Score every pair of sampled sequences with the substitution matrix.

    Residues are aligned by their position label. A position present in both
    sequences scores int(BLOSUM[residue1 + residue2]); a position missing
    from either sequence, or a residue pair missing from BLOSUM, scores -4.

    Input:  SeqName_DS, Amino, Num, BLOSUM, chain
    Output: 1. dist, (n, n) numpy array with dist[i][j] the score of
               sequence i against sequence j
    """
    # Build the position -> residue map of every sequence once, instead of
    # rebuilding the second sequence's map for every pair.
    residue_maps = []
    for seq_name in SeqName_DS:
        amino = Amino[chain][seq_name]
        num = Num[chain][seq_name]
        residue_maps.append({num[k]: amino[k] for k in range(len(amino))})

    n = len(SeqName_DS)
    dist = np.zeros((n, n))
    for i, seq1 in enumerate(residue_maps):
        for j, seq2 in enumerate(residue_maps):
            cnt = 0
            for key, residue in seq1.items():
                if key in seq2 and (residue + seq2[key]) in BLOSUM:
                    cnt += int(BLOSUM[residue + seq2[key]])
                else:
                    cnt -= 4
            # Positions only the second sequence has are gaps as well.
            cnt -= 4 * sum(1 for key in seq2 if key not in seq1)
            dist[i][j] = cnt
    return dist
221 |
222 | #################################################################################################################
223 | # function CalBLOSUMVAR:
224 | # Calculate the sequence similarity for each sequence only on non-constant region.
225 | #
226 | # Input: SeqName_DS, Amino, Num, BLOSUM, chain
227 | # Output: 1. dist
228 | #################################################################################################################
def CalBLOSUMVAR(SeqName_DS, Amino, Num, BLOSUM, chain):
    """Score every pair of sampled sequences with the substitution matrix.

    NOTE(review): the header comment says this variant covers only the
    non-constant region, but no position is filtered; the scoring is the
    same as CalBLOSUM's. Confirm whether a region filter is missing.

    Aligned positions score int(BLOSUM[residue1 + residue2]); a position
    missing from either sequence, or a pair missing from BLOSUM, scores -4.

    Input:  SeqName_DS, Amino, Num, BLOSUM, chain
    Output: 1. dist, (n, n) numpy array of pairwise scores
    """
    def position_map(seq_name):
        # position label -> residue for one sequence
        amino = Amino[chain][seq_name]
        num = Num[chain][seq_name]
        return {num[k]: amino[k] for k in range(len(amino))}

    def pair_score(seq1, seq2):
        total = 0
        for key in seq1:
            pair = seq1[key] + seq2[key] if key in seq2 else None
            if pair is not None and pair in BLOSUM:
                total += int(BLOSUM[pair])
            else:
                total += -4
        for key in seq2:
            if key not in seq1:
                total += -4
        return total

    # The maps are built once per sequence rather than once per pair.
    maps = [position_map(seq_name) for seq_name in SeqName_DS]
    dist = np.zeros((len(SeqName_DS), len(SeqName_DS)))
    for i, seq1 in enumerate(maps):
        for j, seq2 in enumerate(maps):
            dist[i][j] = pair_score(seq1, seq2)
    return dist
252 |
253 | #################################################################################################################
254 | # function HeatmapHL:
255 | # Calculate the heavy and light chain heatmap over multiple iteration, draw the first heatmap
256 | #
257 | # Input: SeqName_IDS, Amino, Num
258 | # Output: 1. H_Idist
259 | # 2. L_Idist
260 | #################################################################################################################
def HeatmapHL(size, iterate, SeqName_IDS, Amino, Num):
    """Compute heavy- and light-chain similarity matrices for sampled sets.

    The ``iterate`` argument is overridden: only the first sample is
    processed. For each processed sample the CalBLOSUM matrix is appended
    first and the CalBLOSUMVAR matrix second. Light-chain matrices are
    skipped when IF_ONLY_HEAVY is set.

    Input:  size (unused), iterate, SeqName_IDS, Amino, Num
    Output: 1. H_Idist
            2. L_Idist
    """
    iterate = 1
    BLOSUM = ReadBLOSUM()
    scorers = (CalBLOSUM, CalBLOSUMVAR)
    H_Idist, L_Idist = [], []
    for i in range(iterate):
        names = SeqName_IDS[i]
        for scorer in scorers:
            H_Idist.append(scorer(names, Amino, Num, BLOSUM, 'H'))
        if not IF_ONLY_HEAVY:
            for scorer in scorers:
                L_Idist.append(scorer(names, Amino, Num, BLOSUM, 'L'))
    return H_Idist, L_Idist
274 |
275 | #################################################################################################################
276 | # function HeatmapFeature:
277 | # Calculate the feature heatmap over multiple iteration, draw the first heatmap
278 | #
279 | # Input: X_IDS, AllFeatureNames, MotifFeatureNames
280 | # Output: 1. Idist
281 | #################################################################################################################
def HeatmapFeature(size, iterate, X_IDS, AllFeatureNames, MotifFeatureNames):
    """Compute the pairwise feature-similarity matrix of a sampled set.

    The ``iterate`` argument is overridden: only the first sample is
    processed. The motif columns are the trailing len(MotifFeatureNames)
    columns. The similarity of two sequences is the number of non-motif
    columns where both are 1, plus the Jaccard score of their motif columns
    (0 when neither has any motif).

    Input:  size (unused), iterate, X_IDS, AllFeatureNames, MotifFeatureNames
    Output: 1. Idist, list with one symmetric (n, n) numpy array
    """
    iterate = 1
    motifStart = len(AllFeatureNames) - len(MotifFeatureNames)
    sample_size = len(X_IDS[0])
    Idist = [np.zeros((sample_size, sample_size)) for _ in range(iterate)]

    for m in range(iterate):
        sample = X_IDS[m]
        for i, a in enumerate(sample):
            motif_columns = range(motifStart, len(a))
            for j in range(i + 1):
                b = sample[j]

                # Jaccard score over the motif columns.
                both = sum(1 for l in motif_columns if a[l] == 1 and b[l] == 1)
                either = sum(1 for l in motif_columns if a[l] == 1 or b[l] == 1)
                jaccard = both / (1.0 * either) if either != 0 else 0

                # Plain count of shared non-motif features.
                shared = sum(1 for k in range(motifStart) if a[k] == 1 and b[k] == 1)

                Idist[m][i][j] = shared + jaccard
                Idist[m][j][i] = Idist[m][i][j]
    return Idist
327 |
328 | ############################### Section 3.3 Similarity analysis (Statistical test) #############################
329 |
330 | #################################################################################################################
331 | # function RankTestBlock:
332 | # Use Mann-Whitney test to check if hypothesis on the within set holds
333 | #
334 | # Input: dist
335 | # Output: 1. p_value
336 | #################################################################################################################
def RankTestBlock(size, dist):
    """Rank-sum test between the top-left and bottom-right matrix blocks.

    The matrix is cut at int(len(dist) / 2); the two diagonal blocks are
    flattened and compared with scipy.stats.ranksums.

    Input:  size (unused), dist (square numpy array)
    Output: 1. the complete ranksums result (statistic and p-value), not
               only the p-value
    """
    stop = int(len(dist) / 2)
    block1 = np.reshape(dist[:stop, :stop], [-1])
    block4 = np.reshape(dist[stop:, stop:], [-1])
    # The former effect-size computations were never used and could emit
    # divide-by-zero warnings on constant blocks; they are dropped.
    return sta.ranksums(block1, block4)
349 |
350 | #################################################################################################################
351 | # function RankTestHeatMap:
352 | # Use Mann-Whitney test to check if hypothesis on the correlation between heatmaps holds
353 | #
354 | # Input: dist1, dist2
355 | # Output: 1. p_value
356 | #################################################################################################################
def RankTestHeatMap(dist1, dist2):
    """Rank-sum test between the flattened values of two matrices.

    Input:  dist1, dist2
    Output: 1. the complete scipy.stats.ranksums result (statistic and
               p-value), not only the p-value
    """
    map1 = np.reshape(dist1, [-1])
    map2 = np.reshape(dist2, [-1])
    # The former effect-size computations were never used and could emit
    # divide-by-zero warnings on constant input; they are dropped.
    return sta.ranksums(map1, map2)
368 |
369 | #################################################################################################################
370 | # function MultiRankTest:
371 | # Check if the statistical test for two hypothesis holds over multiple iterations
372 | #
373 | # Input: F_Idist, H_Idist, L_Idist
374 | #################################################################################################################
def MultiRankTest(size, iterate, F_Idist, H_Idist, L_Idist):
    """Run the block rank tests on the feature and sequence heatmaps and
    print whether they pass the 0.05 threshold.

    The ``iterate`` argument is overridden: only the first heatmap of each
    list is tested. Light-chain tests are skipped when IF_ONLY_HEAVY is set.

    NOTE(review): RankTestBlock returns the whole ranksums result, so the
    np.max(...) checks below range over the statistic as well as the
    p-value. Left unchanged because it decides what is reported - confirm
    whether only the p-value was meant.

    Input:  size, iterate, F_Idist, H_Idist, L_Idist
    """
    iterate = 1
    p_value_F, p_value_H, p_value_L, p_value_Diff = [], [], [], []
    for i in range(iterate):
        # Wilcoxon rank-sum test on the within-set blocks.
        p_value_F.append(RankTestBlock(size, F_Idist[i]))
        p_value_H.append(RankTestBlock(size, H_Idist[i]))
        if IF_ONLY_HEAVY:
            # Keep the empty placeholders for the untested light chain.
            p_value_L.append([])
            p_value_Diff.append([])
        else:
            p_value_L.append(RankTestBlock(size, L_Idist[i]))
            p_value_Diff.append(
                RankTestHeatMap(F_Idist[i] - H_Idist[i], F_Idist[i] - L_Idist[i]))
    print(p_value_F[0], p_value_H[0])

    passed = (np.max(p_value_F) < 0.05
              and np.max(p_value_H) < 0.05
              and (IF_ONLY_HEAVY or np.max(p_value_L) < 0.05))
    if passed:
        print("Statistical tests (Reference against Targeting) succeed.")
        return

    print("Statistical tests results (Reference against Targeting):")
    print('Extracted features:', p_value_F[0])
    print('Heavy chain sequence:', p_value_H[0])
    if not IF_ONLY_HEAVY:
        print('Light chain sequence:', p_value_L[0])
        print('Difference between (Feature, Heavy) and (Feature, Light):', p_value_Diff)
398 |
399 | ####################################### Section 3.4 Salient feature-value analysis ############################ #
400 |
401 |
402 | #################################################################################################################
403 | # function Fisher:
404 | # Calculate p-value with FET
405 | #
406 | # Input: X_DS, Y_DS, AllFeatureNames
407 | # Output: 1. contingency_table
408 | # 2. pvalue
409 | # 3. X_DS
410 | # 4. Y_DS
411 | #################################################################################################################
def Fisher(X_DS, Y_DS, AllFeatureNames):
    """Build a 2x2 contingency table per feature and run Fisher's exact test.

    For feature ``i`` the table is ``[[a, b], [c, d]]`` where
      a = samples with label 0 and feature value 1
      b = samples with label != 0 and feature value 1
      c = samples with label 0 and feature value != 1
      d = samples with label != 0 and feature value != 1

    Returns ``(contingency_table, pvalue, X_DS, Y_DS)``; ``pvalue[i]`` is the
    one-sided ("less") p-value for feature ``i`` and the two inputs are
    returned untouched.
    """
    contingency_table = []
    for col in range(len(AllFeatureNames)):
        ref_with = ref_without = tar_with = tar_without = 0
        for row, label in enumerate(Y_DS):
            has_feature = X_DS[row][col] == 1
            if label == 0:
                if has_feature:
                    ref_with += 1
                else:
                    ref_without += 1
            elif has_feature:
                tar_with += 1
            else:
                tar_without += 1
        contingency_table.append([[ref_with, tar_with], [ref_without, tar_without]])

    pvalue = []
    for table in contingency_table:
        # "greater" would test enrichment in the label-0 set; "less" tests depletion.
        _, p = sta.fisher_exact(table, alternative="less")
        pvalue.append(p)
    return contingency_table, pvalue, X_DS, Y_DS
434 |
435 | #################################################################################################################
436 | # function Importance:
437 | # Calculate importance score through feature selection
438 | #
439 | # Input: X_DS, Y_DS, AllFeatureNames
440 | # Output: 1. importances
441 | # 2. X_DS
442 | # 3. Y_DS
443 | #################################################################################################################
def Importance(X_DS, Y_DS, AllFeatureNames):
    """Fit an ExtraTrees classifier and return its feature importances.

    In addition, a diagnostic co-occurrence count is computed for one
    hard-coded feature pair: the number of samples where
    'Germ_HJ_IGHJ4*02' equals 1 and 'Motif_5_YY' equals 0, counted
    separately over the first half of the rows (``sum_ref``) and the second
    half (``sum_tar``). If either probe feature is absent from
    ``AllFeatureNames`` both counts are reported as 0 instead of raising
    ``ValueError``, so the function works for any feature set.

    Returns ``(importances, X_DS, Y_DS, sum_ref, sum_tar)`` where ``X_DS`` has
    been converted to a NumPy array and ``Y_DS`` is returned as given.
    """
    selector = ExtraTreesClassifier().fit(X_DS, Y_DS)
    importances = selector.feature_importances_
    X_DS = np.array(X_DS)

    # Alternative probes used previously: 'PI_3.5-3.9375', 'Canonical_H3_3',
    # 'Germ_HJ_IGHJ6*01'.
    probe_present, probe_absent = 'Germ_HJ_IGHJ4*02', 'Motif_5_YY'
    sum_ref = 0
    sum_tar = 0
    if probe_present in AllFeatureNames and probe_absent in AllFeatureNames:
        a = AllFeatureNames.index(probe_present)
        b = AllFeatureNames.index(probe_absent)
        # Rows in the first half are counted as reference, the rest as targeting.
        half = int(len(X_DS) / 2)
        hits = (X_DS[:, a] == 1) & (X_DS[:, b] == 0)
        sum_ref = int(np.count_nonzero(hits[:half]))
        sum_tar = int(np.count_nonzero(hits[half:]))

    return importances, X_DS, Y_DS, sum_ref, sum_tar
470 |
471 | #################################################################################################################
472 | # function RankFisherFS:
473 | # Sort feature values according to FET and feature selection statistics
474 | #
475 | # Input: Fpvalue, importances
476 | # Output: 1. RankFpvalue
477 | # 2. RankImportance
478 | #################################################################################################################
def RankFisherFS(Fpvalue, importances):
    """Convert p-values and importance scores into 1-based ranks.

    The smallest p-value gets rank 1; the largest importance gets rank 1.
    Ties keep their original relative order. Returns
    ``(RankFpvalue, RankImportance)``, each a list aligned with its input.
    """
    def _ranks(values, descending):
        # Positions ordered by value; the sort is stable so ties stay in input order.
        ordered_positions = sorted(range(len(values)),
                                   key=lambda pos: values[pos],
                                   reverse=descending)
        ranks = [-1] * len(values)
        for place, pos in enumerate(ordered_positions, start=1):
            ranks[pos] = place  # real rank starts from 1
        return ranks

    return _ranks(Fpvalue, False), _ranks(importances, True)
490 |
491 | #################################################################################################################
492 | # function WriteFisherFS:
493 | # Write FET and feature selection results to csv files
494 | #
495 | # Input: Fpvalue, importances, Fpvalue_std, importances_std, RankFpvalue, RankImportance, AllFeatureNames
496 | #################################################################################################################
def WriteFisherFS(Fpvalue, importances, Fpvalue_std, importances_std, RankFpvalue, RankImportance,
                  AllFeatureNames, AllFeatureVectors, DatasetSize):
    """Write the Fisher-test / feature-selection summary to a CSV file.

    The file is ``./results/<SET_NAME>_RankFisherAndFS.csv``. One row is
    written per feature with: the feature family and value (derived from the
    underscore-separated feature name), its p-value, its importance, its
    p-value rank (only when p < 0.05), its importance rank (only when the
    importance exceeds the mean importance), and its frequency in the
    reference and targeting parts of ``AllFeatureVectors``.

    ``Fpvalue_std`` and ``importances_std`` are accepted for interface
    compatibility and are not written. The first ``CNT_DB`` entries of
    ``DatasetSize`` are summed to get the number of reference rows; the
    remaining entries give the number of targeting rows.
    """
    cnt_db = int(sum(DatasetSize[:CNT_DB]))
    cnt_mmp = int(sum(DatasetSize[CNT_DB:]))
    # Computed once: it is both the header value and the per-row cut-off.
    importance_threshold = np.mean(importances)

    # The context manager guarantees the handle is closed even if a row fails.
    with open('./results/' + SET_NAME + '_RankFisherAndFS.csv', 'w') as fo:
        fo.write('Feature, Feature Value,')
        fo.write('Fisher Test p-value, Feature Selection (thereshold = '
                 + format(importance_threshold, '.4f') + '),')
        fo.write('Rank of Statistic Significancy, Rank of Feature Selection, ')
        fo.write('Frequency in Reference , Frequency in Targeting \n')

        for i, feature_name in enumerate(AllFeatureNames):
            parts = feature_name.split('_')
            family = parts[0]
            if family == 'Germ' or family == 'Canonical':
                fo.write(family + ' ' + parts[1])
                fo.write(',' + parts[2] + ',')
            elif family == 'PI':
                fo.write(family + ',' + parts[1] + ',')
            elif family == 'Motif':
                fo.write(family + ',')
                fo.write(parts[1] + '_' + parts[2] + ',')
            # Names from any other family get no name columns.

            fo.write(str(Fpvalue[i]) + ',')
            fo.write(str(importances[i]) + ',')

            # Rank cells are left empty for features that fail the cut-off.
            if Fpvalue[i] < 0.05:
                fo.write(str(RankFpvalue[i]))
            fo.write(',')
            if importances[i] > importance_threshold:
                fo.write(str(RankImportance[i]))
            fo.write(',')

            fo.write(str('{:.2f}'.format(sum(AllFeatureVectors[:cnt_db, i]) / cnt_db * 100)) + '%,')
            fo.write(str('{:.2f}'.format(sum(AllFeatureVectors[cnt_db:, i]) / cnt_mmp * 100)) + '%,')
            fo.write('\n')

        print(AllFeatureVectors.shape, cnt_db, cnt_mmp)
533 |
534 | #################################################################################################################
535 | # function MultiFisherFS:
536 | # Average p-values for FET and importance scores for feature select over multiple iterations
537 | #
538 | # Input: DatasetName, DatasetSize, AllFeatureVectors
539 | #################################################################################################################
def MultiFisherFS(iterate, X_IDS, Y_IDS, DatasetName, DatasetSize, AllFeatureVectors, AllFeatureNames):
    """Aggregate Fisher p-values over several samples and write the summary.

    For each of the ``iterate`` samples the Fisher p-values and their ranks
    are computed; both are then averaged across samples. Feature importances
    are computed once, on all samples pooled together, and ranked in
    descending order. The results are handed to ``WriteFisherFS``.
    ``DatasetName`` is accepted but not used here.
    """
    Fpvalue = []
    RankFpvalue = []
    for run in range(iterate):
        _, run_pvalues, _, _ = Fisher(X_IDS[run], Y_IDS[run], AllFeatureNames)
        # Per-run importances are not computed, so an empty list is ranked.
        run_ranks, _ = RankFisherFS(run_pvalues, [])
        Fpvalue.append(run_pvalues)
        RankFpvalue.append(run_ranks)

    # Pool every sample into one data set for the feature-selection step.
    pooled_X = []
    pooled_Y = []
    for run in range(iterate):
        pooled_X.extend(X_IDS[run])
        pooled_Y.extend(Y_IDS[run])
    pooled_X = np.array(pooled_X)
    pooled_Y = np.array(pooled_Y)

    importances_all, _, _, _, _ = Importance(pooled_X, pooled_Y, AllFeatureNames)
    descending = sorted(range(len(importances_all)),
                        key=lambda pos: importances_all[pos],
                        reverse=True)
    RankImportance_all = [-1] * len(importances_all)
    for place, pos in enumerate(descending, start=1):
        RankImportance_all[pos] = place  # real rank starts from 1

    Fpvalue_avg = np.mean(Fpvalue, axis=0)
    Fpvalue_std = np.std(Fpvalue, axis=0)
    # Average of the per-run ranks.
    RankFpvalue_avgR = np.mean(RankFpvalue, axis=0)

    # The p-value spread is passed for both "std" slots, as no importance spread exists.
    WriteFisherFS(Fpvalue_avg, importances_all, Fpvalue_std, Fpvalue_std,
                  RankFpvalue_avgR, RankImportance_all,
                  AllFeatureNames, AllFeatureVectors, DatasetSize)
579 |
580 |
581 | ####################################### Section 3.4 Classification on segments ################################
582 | #################################################################################################################
583 | # function calculate_auc:
584 | # Calculate mean AUC over ten-fold cross validation for three algorithms, SVM, random forest, AdaBoost
585 | #
586 | # Input: X, Y
587 | # Output: 1. auc(mean_fpr, mean_tpr_svm)
588 | # 2. auc(mean_fpr, mean_tpr_rf)
589 | # 3. auc(mean_fpr, mean_tpr_ada)
590 | #################################################################################################################
def calculate_auc(X, Y):
    """Return the AUC of the fold-averaged ROC curve for three classifiers.

    The samples are shuffled and split into ten folds. A linear SVM, a random
    forest and AdaBoost are trained on each training split; each test-fold
    ROC curve is interpolated onto a common 100-point FPR grid and averaged.

    Returns ``(auc_svm, auc_random_forest, auc_adaboost)``.
    """
    # Order matters for the returned tuple: SVM, random forest, AdaBoost.
    classifiers = [
        svm.SVC(kernel='linear', probability=True, random_state=0),
        RandomForestClassifier(),  # max_depth=5, n_estimators=10, max_features=1
        AdaBoostClassifier(),
    ]

    X = np.array(X)
    Y = np.array(Y)
    n_samples = len(Y)
    indices = [i for i in range(n_samples)]
    random.shuffle(indices)

    mean_fpr = np.linspace(0, 1, 100)
    fold_curves = [[] for _ in classifiers]

    for fold in range(10):
        lo = int(fold * n_samples / 10)
        hi = int((fold + 1) * n_samples / 10)
        test_i = indices[lo:hi]
        train_i = indices[:lo] + indices[hi:]
        X_train, X_test, Y_train, Y_test = X[train_i], X[test_i], Y[train_i], Y[test_i]

        # Fit all models first, then score them, in the fixed classifier order.
        for clf in classifiers:
            clf.fit(X_train, Y_train)

        for clf, curves in zip(classifiers, fold_curves):
            fpr, tpr, _ = roc_curve(Y_test, clf.predict_proba(X_test)[:, 1], pos_label=1)
            curve = interp(mean_fpr, fpr, tpr)
            curve[0] = 0.0  # pin the curve to the origin
            curves.append(curve)

    scores = []
    for curves in fold_curves:
        mean_tpr = np.mean(curves, axis=0)
        mean_tpr[-1] = 1.0  # pin the curve to (1, 1)
        scores.append(auc(mean_fpr, mean_tpr))
    return scores[0], scores[1], scores[2]
634 |
635 | #################################################################################################################
636 | # function MultiAuc:
637 | # Average AUC for three classifiers with all features over multiple iterations
638 | #
639 | # Input: X_IDS, Y_IDS
640 | #################################################################################################################
def MultiAuc(iterate, X_IDS, Y_IDS):
    """Print the AUC of each classifier averaged over ``iterate`` samples.

    ``calculate_auc`` is run on each ``(X_IDS[i], Y_IDS[i])`` pair and the
    mean of the SVM, random-forest and AdaBoost scores is printed.
    """
    svm_scores = []
    forest_scores = []
    ada_scores = []
    for run in range(iterate):
        svm_auc, forest_auc, ada_auc = calculate_auc(X_IDS[run], Y_IDS[run])
        svm_scores.append(svm_auc)
        forest_scores.append(forest_auc)
        ada_scores.append(ada_auc)

    print("Average AUC with all features: ")
    print("SVM\t\t", np.mean(svm_scores, axis=0))
    print("Random forest\t", np.mean(forest_scores, axis=0))
    print("AdaBoost\t", np.mean(ada_scores, axis=0))
652 |
653 | #################################################################################################################
654 | # function Classify:
655 | # Classify the reference and targeting set with three algorithms, SVM, random forest, AdaBoost
656 | #
657 | # Input: X, Y, roc_name
658 | #################################################################################################################
def Classify(X, Y, roc_name):
    """Plot fold-averaged ROC curves for three classifiers and save the figure.

    The samples are shuffled and split into ten folds. A linear SVM, a random
    forest and AdaBoost are trained per fold; each test-fold ROC curve is
    interpolated onto a common 100-point FPR grid and averaged. The mean
    curves, labelled with their AUC, are drawn on a new figure titled
    ``roc_name`` and saved to ``./results/<SET_NAME>_<roc_name>_ROC.png``.
    """
    # (model, legend format, line colour, extra plot keywords) in plotting order.
    models = [
        (svm.SVC(kernel='linear', probability=True, random_state=0),
         'SVM (AUC = %0.4f)', 'darkorange', {'alpha': 1}),
        (RandomForestClassifier(),  # max_depth=5, n_estimators=10, max_features=1
         'Random Forest (AUC = %0.4f)', 'green', {}),
        (AdaBoostClassifier(),
         'AdaBoost (AUC = %0.4f)', 'darkred', {}),
    ]

    X = np.array(X)
    Y = np.array(Y)
    n_samples = len(Y)
    indices = [i for i in range(n_samples)]
    random.shuffle(indices)

    mean_fpr = np.linspace(0, 1, 100)
    fold_curves = [[] for _ in models]

    # NOTE(review): the figure is never closed here; confirm whether callers
    # rely on it staying open before adding a plt.close().
    plt.figure(figsize=(10, 7))
    lw = 2
    for fold in range(10):
        lo = int(fold * n_samples / 10)
        hi = int((fold + 1) * n_samples / 10)
        test_i = indices[lo:hi]
        train_i = indices[:lo] + indices[hi:]
        X_train, X_test, Y_train, Y_test = X[train_i], X[test_i], Y[train_i], Y[test_i]

        # Fit all models first, then score them, in the fixed model order.
        for clf, _, _, _ in models:
            clf.fit(X_train, Y_train)

        for (clf, _, _, _), curves in zip(models, fold_curves):
            fpr, tpr, _ = roc_curve(Y_test, clf.predict_proba(X_test)[:, 1], pos_label=1)
            curve = interp(mean_fpr, fpr, tpr)
            curve[0] = 0.0  # pin the curve to the origin
            curves.append(curve)

    for (_, legend, colour, extra), curves in zip(models, fold_curves):
        mean_tpr = np.mean(curves, axis=0)
        mean_tpr[-1] = 1.0  # pin the curve to (1, 1)
        plt.plot(mean_fpr, mean_tpr, color=colour, lw=lw,
                 label=legend % auc(mean_fpr, mean_tpr), **extra)

    # Chance-level diagonal.
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(roc_name)
    plt.legend(loc="lower right")
    plt.savefig('./results/' + SET_NAME + '_' + roc_name + "_ROC.png")
719 |
720 | #################################################################################################################
721 | # function ROCDrawing:
722 | # Draw ROC and report AUC for classification
723 | #
724 | # Input: X_S, Y_S
725 | #################################################################################################################
def ROCDrawing(X_S, Y_S, GermFeatureNames, CanonFeatureNames, PIFeatureNames, MotifFeatureNames, AllFeatureNames):
    """Draw ROC curves for several feature subsets, excluding correlated features.

    The columns of ``X_S`` are assumed to be ordered as germline, canonical
    structure, pI and motif features, matching ``AllFeatureNames``; the
    group boundaries are derived from the lengths of the per-group name
    lists. Every column whose name starts with one of the correlated
    feature-group prefixes is dropped from every run, and ``Classify`` is
    called once per subset.

    Unlike the earlier revision, the "Except CDR Canonical Structure" and
    "Except pI" runs now really use the filtered columns; previously the
    filtered matrix was built and then discarded in favour of the unfiltered
    one, contradicting the "(Exclude Correlated)" label.
    """
    # Feature groups containing a value over 0.8 Jaccard coefficient for
    # 'MMP-cluster': 'Germ_HV_IGHV3-23*01', 'Canonical_L2_0', 'Canonical_L3_0',
    # 'Canonical_H1_1', 'Canonical_H2_6'. The whole group is excluded by prefix.
    correlated_prefixes = ('Germ_HV', 'Canonical_H2', 'Canonical_L2',
                           'Canonical_L3', 'Canonical_H1')

    X_S = np.array(X_S)
    n_features = len(X_S[0])

    Germ_E = len(GermFeatureNames)
    Canon_E = Germ_E + len(CanonFeatureNames)
    PI_E = Canon_E + len(PIFeatureNames)

    def uncorrelated(*column_ranges):
        # Stack the non-correlated columns of the given index ranges, in order.
        kept = [X_S[:, j]
                for columns in column_ranges
                for j in columns
                if not AllFeatureNames[j].startswith(correlated_prefixes)]
        return np.array(kept).T

    runs = [
        ('All Features Included (Exclude Correlated)',
         (range(n_features),)),
        ('Only Germline Features (Exclude Correlated)',
         (range(Germ_E),)),
        ('Only CDR Canonical Structure Features (Exclude Correlated)',
         (range(Germ_E, Canon_E),)),
        ('Except Germline Features (Exclude Correlated)',
         (range(Germ_E, n_features),)),
        ('Except CDR Canonical Structure Features (Exclude Correlated)',
         (range(Germ_E), range(Canon_E, n_features))),
        ('Except pI Features (Exclude Correlated)',
         (range(Canon_E), range(PI_E, n_features))),
        ('Except Frequent Positional Motif Features (Exclude Correlated)',
         (range(PI_E),)),
    ]
    for roc_name, column_ranges in runs:
        Classify(uncorrelated(*column_ranges), Y_S, roc_name)

    # Variants without the correlated-feature exclusion (currently disabled):
    # Classify(X_S[:,:Germ_E], Y_S, 'Only Germline Features')
    # Classify(X_S[:,Germ_E:Canon_E], Y_S, 'Only CDR Canonical Structure Features')
    # Classify(X_S[:,Canon_E:PI_E], Y_S, 'Only pI Features')
    # Classify(X_S[:,PI_E:], Y_S, 'Only Frequent Positional Motif Features')
    #
    # Classify(X_S[:,Germ_E:], Y_S, 'Except Germline Features')
    # Classify(np.concatenate((X_S[:,:Germ_E],X_S[:,Canon_E:]),axis=1) , Y_S, 'Except CDR Canonical Structure Features')
    # Classify(np.concatenate((X_S[:,:Canon_E],X_S[:,PI_E:]),axis=1), Y_S, 'Except pI Features')
    # Classify(X_S[:,:PI_E], Y_S, 'Except Frequent Positional Motif Features')
843 |
def _jaccard_matrix(vectors, feature_names):
    """Return the pairwise Jaccard similarity matrix of the non-motif feature columns."""
    n_features = len(feature_names)
    similarity = np.eye(n_features)
    is_motif = [name.startswith('Motif') for name in feature_names]
    # A feature value counts as "present" in a sample only when it equals 1.
    present = np.asarray(vectors) == 1

    for i in range(n_features):
        if is_motif[i]:
            continue
        for j in range(i + 1, n_features):
            if is_motif[j]:
                continue
            both = np.count_nonzero(present[:, i] & present[:, j])
            either = np.count_nonzero(present[:, i] | present[:, j])
            # Two features that never occur are given similarity 0.
            value = float(both) / either if either else 0
            similarity[i][j] = value
            similarity[j][i] = value
    return similarity


def JaccardCoefficientAnalysis(AllFeatureVectors, AllFeatureNames, DatasetSize):
    """Write pairwise Jaccard coefficients of feature values to a CSV file.

    The first ``PDB_size`` rows of ``AllFeatureVectors`` form the reference
    set and the remaining rows the targeting set. ``PDB_size`` is
    ``DatasetSize[0] + DatasetSize[1]`` for the 'MMP-cluster' set and
    ``DatasetSize[0]`` for the 'IGHV' set. For every pair of non-motif
    features the Jaccard coefficient is computed separately in each set and
    written to ``./results/<SET_NAME>_Jaccard Feature Coefficient.csv``.

    Raises:
        ValueError: if ``SET_NAME`` is neither 'MMP-cluster' nor 'IGHV', for
            which the size of the reference set is not defined.
    """
    if SET_NAME == 'MMP-cluster':
        PDB_size = DatasetSize[0] + DatasetSize[1]
    elif SET_NAME == 'IGHV':
        PDB_size = DatasetSize[0]
    else:
        raise ValueError(
            "JaccardCoefficientAnalysis does not know the reference-set size for SET_NAME="
            + repr(SET_NAME))
    # Used as a slice bound below, so it must be an integer even if sizes are floats.
    PDB_size = int(PDB_size)

    jac_sim_PDB = _jaccard_matrix(AllFeatureVectors[:PDB_size], AllFeatureNames)
    jac_sim_MMP = _jaccard_matrix(AllFeatureVectors[PDB_size:], AllFeatureNames)

    with open('./results/' + SET_NAME + '_Jaccard Feature Coefficient.csv', 'w') as fi:
        fi.write(
            'Feature value 1, Feature value 2, Jaccard coefficient for reference set, Jaccard coefficient for MMP-targeting set\n')
        for i in range(len(AllFeatureNames)):
            for j in range(i + 1, len(AllFeatureNames)):
                if AllFeatureNames[i].startswith('Motif') or AllFeatureNames[j].startswith('Motif'):
                    continue
                fi.write(AllFeatureNames[i] + ',' + AllFeatureNames[j] + ',' + str(jac_sim_PDB[i][j]) + ',' + str(
                    jac_sim_MMP[i][j]) + '\n')
907 |
--------------------------------------------------------------------------------