├── ASAP
    ├── __init__.py
    ├── .DS_Store
    ├── __pycache__
    │   ├── __init__.cpython-36.pyc
    │   ├── __init__.cpython-37.pyc
    │   ├── FeatureExtraction.cpython-36.pyc
    │   ├── FeatureExtraction.cpython-37.pyc
    │   ├── DesignRecommendation.cpython-36.pyc
    │   ├── SequenceAndFeatureAnalysis.cpython-36.pyc
    │   └── SequenceAndFeatureAnalysis.cpython-37.pyc
    ├── DesignRecommendation.py
    ├── S_SequenceInRegion.py
    ├── FeatureExtraction.py
    └── SequenceAndFeatureAnalysis.py
├── .DS_Store
├── data
    ├── .DS_Store
    ├── pigs_canonical.txt
    └── blosum62.csv
├── results
    ├── .DS_Store
    ├── MMP-IGHV
    │   ├── .DS_Store
    │   ├── IGHV_Only pI Features_ROC.png
    │   ├── IGHV_Except pI Features_ROC.png
    │   ├── IGHV_All Features Included_ROC.png
    │   ├── IGHV_Except Germline Features_ROC.png
    │   ├── IGHV_Only Germline Features_ROC.png
    │   ├── IGHV_Except CDR Canonical Structure Features_ROC.png
    │   ├── IGHV_Only CDR Canonical Structure Features_ROC.png
    │   ├── IGHV_Only Frequent Positional Motif Features_ROC.png
    │   ├── IGHV_Except Frequent Positional Motif Features_ROC.png
    │   ├── IGHV_RankFisherAndFS.csv
    │   └── IGHV_Jaccard Feature Coefficient.csv
    └── MMP-PDB
    │   ├── .DS_Store
    │   ├── MMP-cluster_DTreeAllFeature.png
    │   ├── MMP-cluster_Extracted Features.png
    │   ├── MMP-cluster_Except pI Features_ROC.png
    │   ├── MMP-cluster_Heavy Chain Sequences.png
    │   ├── MMP-cluster_Light Chain Sequences.png
    │   ├── MMP-cluster_Only pI Features_ROC.png
    │   ├── MMP-cluster_All Features Included_ROC.png
    │   ├── MMP-cluster_Only Germline Features_ROC.png
    │   ├── MMP-cluster_Except Germline Features_ROC.png
    │   ├── MMP-cluster_Only CDR Canonical Structure Features_ROC.png
    │   ├── MMP-cluster_Except CDR Canonical Structure Features_ROC.png
    │   ├── MMP-cluster_Only Frequent Positional Motif Features_ROC.png
    │   ├── MMP-cluster_All Features Included(Exclude Correlated)_ROC.png
    │   ├── MMP-cluster_Except Frequent Positional Motif Features_ROC.png
    │   ├── MMP-cluster_Only Germline Features(Exclude Correlated)_ROC.png
    │   ├── MMP-cluster_Except Germline Features(Exclude Correlated)_ROC.png
    │   ├── MMP-cluster_Only CDR Canonical Structure Features(Exclude Correlated)_ROC.png
    │   └── MMP-cluster_RankFisherAndFS.csv
├── testCase
    ├── .DS_Store
    ├── IGHV
    │   ├── .DS_Store
    │   ├── reference-IGHV
    │   │   └── .DS_Store
    │   └── targeting-MMP-IGHV
    │   │   └── .DS_Store
    └── MMP-cluster
    │   ├── .DS_Store
    │   ├── reference-PDB
    │       └── .DS_Store
    │   └── targeting-MMP
    │       └── .DS_Store
├── __pycache__
    └── ASAP.cpython-36.pyc
├── requirements.txt
├── supporting information
    ├── .DS_Store
    ├── Figure S1.png
    ├── Figure S2.png
    ├── Figure S3.png
    ├── Figure S4.png
    ├── Table S1.xlsx
    ├── Table S2.xlsx
    ├── Table S3.xlsx
    └── Table S4.xlsx
├── .idea
    ├── vcs.xml
    ├── misc.xml
    ├── inspectionProfiles
    │   └── profiles_settings.xml
    ├── modules.xml
    ├── ASAP-1.0.iml
    └── workspace.xml
├── LICENSE
├── environment.yml
├── README.md
├── ASAP.ipynb
└── .ipynb_checkpoints
    └── ASAP-checkpoint.ipynb


/ASAP/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/.DS_Store


--------------------------------------------------------------------------------
/ASAP/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/ASAP/.DS_Store


--------------------------------------------------------------------------------
/data/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/data/.DS_Store


--------------------------------------------------------------------------------
/results/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/.DS_Store


--------------------------------------------------------------------------------
/testCase/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/testCase/.DS_Store


--------------------------------------------------------------------------------
/testCase/IGHV/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/testCase/IGHV/.DS_Store


--------------------------------------------------------------------------------
/results/MMP-IGHV/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-IGHV/.DS_Store


--------------------------------------------------------------------------------
/results/MMP-PDB/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/.DS_Store


--------------------------------------------------------------------------------
/__pycache__/ASAP.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/__pycache__/ASAP.cpython-36.pyc


--------------------------------------------------------------------------------
/testCase/MMP-cluster/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/testCase/MMP-cluster/.DS_Store


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pydotplus==2.0.2
2 | scipy==0.19.1
3 | matplotlib==2.1.0
4 | numpy==1.14.1
5 | scikit-learn==0.19.2
6 | 


--------------------------------------------------------------------------------
/supporting information/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/supporting information/.DS_Store


--------------------------------------------------------------------------------
/supporting information/Figure S1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/supporting information/Figure S1.png


--------------------------------------------------------------------------------
/supporting information/Figure S2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/supporting information/Figure S2.png


--------------------------------------------------------------------------------
/supporting information/Figure S3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/supporting information/Figure S3.png


--------------------------------------------------------------------------------
/supporting information/Figure S4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/supporting information/Figure S4.png


--------------------------------------------------------------------------------
/supporting information/Table S1.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/supporting information/Table S1.xlsx


--------------------------------------------------------------------------------
/supporting information/Table S2.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/supporting information/Table S2.xlsx


--------------------------------------------------------------------------------
/supporting information/Table S3.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/supporting information/Table S3.xlsx


--------------------------------------------------------------------------------
/supporting information/Table S4.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/supporting information/Table S4.xlsx


--------------------------------------------------------------------------------
/testCase/IGHV/reference-IGHV/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/testCase/IGHV/reference-IGHV/.DS_Store


--------------------------------------------------------------------------------
/ASAP/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/ASAP/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/ASAP/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/ASAP/__pycache__/__init__.cpython-37.pyc


--------------------------------------------------------------------------------
/testCase/IGHV/targeting-MMP-IGHV/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/testCase/IGHV/targeting-MMP-IGHV/.DS_Store


--------------------------------------------------------------------------------
/testCase/MMP-cluster/reference-PDB/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/testCase/MMP-cluster/reference-PDB/.DS_Store


--------------------------------------------------------------------------------
/testCase/MMP-cluster/targeting-MMP/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/testCase/MMP-cluster/targeting-MMP/.DS_Store


--------------------------------------------------------------------------------
/results/MMP-IGHV/IGHV_Only pI Features_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-IGHV/IGHV_Only pI Features_ROC.png


--------------------------------------------------------------------------------
/ASAP/__pycache__/FeatureExtraction.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/ASAP/__pycache__/FeatureExtraction.cpython-36.pyc


--------------------------------------------------------------------------------
/ASAP/__pycache__/FeatureExtraction.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/ASAP/__pycache__/FeatureExtraction.cpython-37.pyc


--------------------------------------------------------------------------------
/results/MMP-IGHV/IGHV_Except pI Features_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-IGHV/IGHV_Except pI Features_ROC.png


--------------------------------------------------------------------------------
/results/MMP-PDB/MMP-cluster_DTreeAllFeature.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_DTreeAllFeature.png


--------------------------------------------------------------------------------
/results/MMP-IGHV/IGHV_All Features Included_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-IGHV/IGHV_All Features Included_ROC.png


--------------------------------------------------------------------------------
/results/MMP-PDB/MMP-cluster_Extracted Features.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_Extracted Features.png


--------------------------------------------------------------------------------
/ASAP/__pycache__/DesignRecommendation.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/ASAP/__pycache__/DesignRecommendation.cpython-36.pyc


--------------------------------------------------------------------------------
/results/MMP-IGHV/IGHV_Except Germline Features_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-IGHV/IGHV_Except Germline Features_ROC.png


--------------------------------------------------------------------------------
/results/MMP-IGHV/IGHV_Only Germline Features_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-IGHV/IGHV_Only Germline Features_ROC.png


--------------------------------------------------------------------------------
/results/MMP-PDB/MMP-cluster_Except pI Features_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_Except pI Features_ROC.png


--------------------------------------------------------------------------------
/results/MMP-PDB/MMP-cluster_Heavy Chain Sequences.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_Heavy Chain Sequences.png


--------------------------------------------------------------------------------
/results/MMP-PDB/MMP-cluster_Light Chain Sequences.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_Light Chain Sequences.png


--------------------------------------------------------------------------------
/results/MMP-PDB/MMP-cluster_Only pI Features_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_Only pI Features_ROC.png


--------------------------------------------------------------------------------
/ASAP/__pycache__/SequenceAndFeatureAnalysis.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/ASAP/__pycache__/SequenceAndFeatureAnalysis.cpython-36.pyc


--------------------------------------------------------------------------------
/ASAP/__pycache__/SequenceAndFeatureAnalysis.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/ASAP/__pycache__/SequenceAndFeatureAnalysis.cpython-37.pyc


--------------------------------------------------------------------------------
/results/MMP-PDB/MMP-cluster_All Features Included_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_All Features Included_ROC.png


--------------------------------------------------------------------------------
/results/MMP-PDB/MMP-cluster_Only Germline Features_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_Only Germline Features_ROC.png


--------------------------------------------------------------------------------
/results/MMP-PDB/MMP-cluster_Except Germline Features_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_Except Germline Features_ROC.png


--------------------------------------------------------------------------------
/results/MMP-IGHV/IGHV_Except CDR Canonical Structure Features_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-IGHV/IGHV_Except CDR Canonical Structure Features_ROC.png


--------------------------------------------------------------------------------
/results/MMP-IGHV/IGHV_Only CDR Canonical Structure Features_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-IGHV/IGHV_Only CDR Canonical Structure Features_ROC.png


--------------------------------------------------------------------------------
/results/MMP-IGHV/IGHV_Only Frequent Positional Motif Features_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-IGHV/IGHV_Only Frequent Positional Motif Features_ROC.png


--------------------------------------------------------------------------------
/results/MMP-IGHV/IGHV_Except Frequent Positional Motif Features_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-IGHV/IGHV_Except Frequent Positional Motif Features_ROC.png


--------------------------------------------------------------------------------
/results/MMP-PDB/MMP-cluster_Only CDR Canonical Structure Features_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_Only CDR Canonical Structure Features_ROC.png


--------------------------------------------------------------------------------
/results/MMP-PDB/MMP-cluster_Except CDR Canonical Structure Features_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_Except CDR Canonical Structure Features_ROC.png


--------------------------------------------------------------------------------
/results/MMP-PDB/MMP-cluster_Only Frequent Positional Motif Features_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_Only Frequent Positional Motif Features_ROC.png


--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="VcsDirectoryMappings">
4 |     <mapping directory="$PROJECT_DIR$" vcs="Git" />
5 |   </component>
6 | </project>


--------------------------------------------------------------------------------
/results/MMP-PDB/MMP-cluster_All Features Included(Exclude Correlated)_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_All Features Included(Exclude Correlated)_ROC.png


--------------------------------------------------------------------------------
/results/MMP-PDB/MMP-cluster_Except Frequent Positional Motif Features_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_Except Frequent Positional Motif Features_ROC.png


--------------------------------------------------------------------------------
/results/MMP-PDB/MMP-cluster_Only Germline Features(Exclude Correlated)_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_Only Germline Features(Exclude Correlated)_ROC.png


--------------------------------------------------------------------------------
/results/MMP-PDB/MMP-cluster_Except Germline Features(Exclude Correlated)_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_Except Germline Features(Exclude Correlated)_ROC.png


--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6 (homework)" project-jdk-type="Python SDK" />
4 | </project>


--------------------------------------------------------------------------------
/results/MMP-PDB/MMP-cluster_Only CDR Canonical Structure Features(Exclude Correlated)_ROC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_Only CDR Canonical Structure Features(Exclude Correlated)_ROC.png


--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 | <component name="InspectionProjectProfileManager">
2 |   <settings>
3 |     <option name="useProjectProfile" value="false" />
4 |     <option name="USE_PROJECT_PROFILE" value="false" />
5 |     <version value="1.0" />
6 |   </settings>
7 | </component>


--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="ProjectModuleManager">
4 |     <modules>
5 |       <module fileurl="file://$PROJECT_DIR$/.idea/ASAP-1.0.iml" filepath="$PROJECT_DIR$/.idea/ASAP-1.0.iml" />
6 |     </modules>
7 |   </component>
8 | </project>


--------------------------------------------------------------------------------
/.idea/ASAP-1.0.iml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <module type="PYTHON_MODULE" version="4">
 3 |   <component name="NewModuleRootManager">
 4 |     <content url="file://$MODULE_DIR$" />
 5 |     <orderEntry type="jdk" jdkName="Python 3.6 (homework)" jdkType="Python SDK" />
 6 |     <orderEntry type="sourceFolder" forTests="false" />
 7 |   </component>
 8 |   <component name="TestRunnerService">
 9 |     <option name="PROJECT_TEST_RUNNER" value="Unittests" />
10 |   </component>
11 | </module>


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright 2018 Xinmeng Li
2 | 
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4 | 
5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6 | 
7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


--------------------------------------------------------------------------------
/data/pigs_canonical.txt:
--------------------------------------------------------------------------------
 1 | L1 1 6 29 VIL
 2 | L1 2 7 29 VIL
 3 | L1 3 13 29 VIL
 4 | L1 4 12 29 VIL
 5 | L1 5 11 29 VIL
 6 | L1 6 8 29 VIL
 7 | L2 1 3 
 8 | L3 1 6 95 P 90 HNQ
 9 | L3 2 6 94 P 90 Q
10 | L3 3 5 96 P 90 Q
11 | L3 4 4 90 Q
12 | L3 5 7 95A P 90 Q
13 | L3 6 5 90 Q 94 L
14 | H1 1 7 
15 | H1 2 8 
16 | H1 3 9 
17 | H1 4 6 
18 | H2 1 3 71 AVL
19 | H2 2 3 71 RK
20 | H2 3 4 71 AVL
21 | H2 4 4 71 RK
22 | H2 5 6 71 AVL
23 | H2 6 6 71 RK
24 | H3 2 11 94 ACDEFGHILMNPQSTVYW
25 | H3 3 11 94 RK
26 | H3 1 10 
27 | H3 2 12 94 ACDEFGHILMNPQSTVYW
28 | H3 3 12 94 RK
29 | H3 2 13 94 ACDEFGHILMNPQSTVYW
30 | H3 3 13 94 RK
31 | H3 2 14 94 ACDEFGHILMNPQSTVYW
32 | H3 3 14 94 RK
33 | H3 2 15 94 ACDEFGHILMNPQSTVYW
34 | H3 3 15 94 RK
35 | H3 2 16 94 ACDEFGHILMNPQSTVYW
36 | H3 3 16 94 RK
37 | H3 2 17 94 ACDEFGHILMNPQSTVYW
38 | H3 3 17 94 RK
39 | H3 2 18 94 ACDEFGHILMNPQSTVYW
40 | H3 3 18 94 RK
41 | H3 2 19 94 ACDEFGHILMNPQSTVYW
42 | H3 3 19 94 RK
43 | H3 2 20 94 ACDEFGHILMNPQSTVYW
44 | H3 3 20 94 RK
45 | H3 2 21 94 ACDEFGHILMNPQSTVYW
46 | H3 3 21 94 RK
47 | H3 2 22 94 ACDEFGHILMNPQSTVYW
48 | H3 3 22 94 RK
49 | H3 2 23 94 ACDEFGHILMNPQSTVYW
50 | H3 3 23 94 RK
51 | H3 2 24 94 ACDEFGHILMNPQSTVYW
52 | H3 3 24 94 RK
53 | H3 2 25 94 ACDEFGHILMNPQSTVYW
54 | H3 3 25 94 RK
55 | H3 2 26 94 ACDEFGHILMNPQSTVYW
56 | H3 3 26 94 RK
57 | H3 2 27 94 ACDEFGHILMNPQSTVYW
58 | H3 3 27 94 RK


--------------------------------------------------------------------------------
/data/blosum62.csv:
--------------------------------------------------------------------------------
 1 | A,R,N,D,C,Q,E,G,H,I,L,K,M,F,P,S,T,W,Y,V,B,Z,X,_
 2 | 4,-1,-2,-2,0,-1,-1,0,-2,-1,-1,-1,-1,-2,-1,1,0,-3,-2,0,-2,-1,0,-4
 3 | -1,5,0,-2,-3,1,0,-2,0,-3,-2,2,-1,-3,-2,-1,-1,-3,-2,-3,-1,0,-1,-4
 4 | -2,0,6,1,-3,0,0,0,1,-3,-3,0,-2,-3,-2,1,0,-4,-2,-3,3,0,-1,-4
 5 | -2,-2,1,6,-3,0,2,-1,-1,-3,-4,-1,-3,-3,-1,0,-1,-4,-3,-3,4,1,-1,-4
 6 | 0,-3,-3,-3,9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-3,-3,-2,-4
 7 | -1,1,0,0,-3,5,2,-2,0,-3,-2,1,0,-3,-1,0,-1,-2,-1,-2,0,3,-1,-4
 8 | -1,0,0,2,-4,2,5,-2,0,-3,-3,1,-2,-3,-1,0,-1,-3,-2,-2,1,4,-1,-4
 9 | 0,-2,0,-1,-3,-2,-2,6,-2,-4,-4,-2,-3,-3,-2,0,-2,-2,-3,-3,-1,-2,-1,-4
10 | -2,0,1,-1,-3,0,0,-2,8,-3,-3,-1,-2,-1,-2,-1,-2,-2,2,-3,0,0,-1,-4
11 | -1,-3,-3,-3,-1,-3,-3,-4,-3,4,2,-3,1,0,-3,-2,-1,-3,-1,3,-3,-3,-1,-4
12 | -1,-2,-3,-4,-1,-2,-3,-4,-3,2,4,-2,2,0,-3,-2,-1,-2,-1,1,-4,-3,-1,-4
13 | -1,2,0,-1,-3,1,1,-2,-1,-3,-2,5,-1,-3,-1,0,-1,-3,-2,-2,0,1,-1,-4
14 | -1,-1,-2,-3,-1,0,-2,-3,-2,1,2,-1,5,0,-2,-1,-1,-1,-1,1,-3,-1,-1,-4
15 | -2,-3,-3,-3,-2,-3,-3,-3,-1,0,0,-3,0,6,-4,-2,-2,1,3,-1,-3,-3,-1,-4
16 | -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4,7,-1,-1,-4,-3,-2,-2,-1,-2,-4
17 | 1,-1,1,0,-1,0,0,0,-1,-2,-2,0,-1,-2,-1,4,1,-3,-2,-2,0,0,0,-4
18 | 0,-1,0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1,1,5,-2,-2,0,-1,-1,0,-4
19 | -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1,1,-4,-3,-2,11,2,-3,-4,-3,-2,-4
20 | -2,-2,-2,-3,-2,-1,-2,-3,2,-1,-1,-2,-1,3,-3,-2,-2,2,7,-1,-3,-2,-1,-4
21 | 0,-3,-3,-3,-1,-2,-2,-3,-3,3,1,-2,1,-1,-2,-2,0,-3,-1,4,-3,-2,-1,-4
22 | -2,-1,3,4,-3,0,1,-1,0,-3,-4,0,-3,-3,-2,0,-1,-4,-3,-3,4,1,-1,-4
23 | -1,0,0,1,-3,3,4,-2,0,-3,-3,1,-1,-3,-1,0,-1,-3,-2,-2,1,4,-1,-4
24 | 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2,0,0,-2,-1,-1,-1,-1,-1,-4
25 | -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,1


--------------------------------------------------------------------------------
/results/MMP-IGHV/IGHV_RankFisherAndFS.csv:
--------------------------------------------------------------------------------
 1 | Feature, Feature Value,Fisher Test p-value, Feature Selection (thereshold = 0.0167),Rank of Statistic Significancy, Rank of Feature Selection
 2 | Canonical H1,0,1,0.000480165,,
 3 | Canonical H1,1,0.774906367,0.000600854,,
 4 | Canonical H1,2,1,0.000175353,,
 5 | Canonical H1,3,1,0,,
 6 | Canonical H2,0,0.535049709,0.012889063,,
 7 | Canonical H2,6,0.644296474,0.010232892,,
 8 | Canonical H3,0,0.083794647,0.01479616,,
 9 | Canonical H3,1,0.234700323,0.013119923,,
10 | Canonical H3,2,0.002614369,0.033944391,7.62,8
11 | Canonical H3,3,0.999938155,0.053446578,,2
12 | Germ HJ,IGHJ1*01,0.688488693,0.004603937,,
13 | Germ HJ,IGHJ2*01,0.354049029,0.004012455,,
14 | Germ HJ,IGHJ3*01,0.576535581,0.003012808,,
15 | Germ HJ,IGHJ3*02,1.32E-05,0.047185683,1.29,3
16 | Germ HJ,IGHJ4*02,0.553358944,0.034656349,,7
17 | Germ HJ,IGHJ5*01,0.96537164,0.015574391,,
18 | Germ HJ,IGHJ5*02,0.197274561,0.018123477,,19
19 | Germ HJ,IGHJ6*01,0.999947777,0.066944116,,1
20 | Germ HJ,IGHJ6*04,0.000416969,0.032812578,6.36,9
21 | Motif,10_YY,0.07051444,0.016529828,,
22 | Motif,10_YYG,0.053349784,0.014285162,,
23 | Motif,10_YYY,0.343316239,0.011916447,,
24 | Motif,2_GG,1,0.008545674,,
25 | Motif,2_GS,0.762847397,0.007176706,,
26 | Motif,2_YG,0.005067609,0.036587558,10.51,6
27 | Motif,2_YY,0.102581789,0.018412503,,18
28 | Motif,3_SG,0.624699976,0.017795207,,20
29 | Motif,3_SS,0.617371164,0.008419981,,
30 | Motif,3_YY,0.001265975,0.032171889,6.41,11
31 | Motif,3_YYD,0.002689351,0.00459369,7.61,
32 | Motif,4_SG,0.955215392,0.009229585,,
33 | Motif,4_SS,0.305268831,0.012016952,,
34 | Motif,4_YD,0.000317996,0.026747743,3.51,14
35 | Motif,4_YDS,0.002069412,0.006805006,7.59,
36 | Motif,5_DS,0.004330063,0.007100598,10.42,
37 | Motif,5_SG,0.626303426,0.010727004,,
38 | Motif,5_YY,0.026298446,0.037488145,15.28,5
39 | Motif,6_SG,0.509513574,0.022391224,,16
40 | Motif,6_SS,0.023126499,0.004795634,15.85,
41 | Motif,6_SSG,0.002423729,0.008186862,8.61,
42 | Motif,6_YY,0.937239799,0.011581004,,
43 | Motif,7_SG,0.028197383,0.007329619,16.75,
44 | Motif,7_SGY,0.003847006,0.003018869,10.29,
45 | Motif,7_YY,0.846405186,0.014090079,,
46 | Motif,7_YYY,0.591550617,0.004151414,,
47 | Motif,8_GY,0.012402538,0.010409138,12.7,
48 | Motif,8_YY,0.364641908,0.02334784,,15
49 | Motif,8_YYY,0.27775982,0.005765425,,
50 | Motif,9_FD,0.555232757,0.019582355,,17
51 | Motif,9_YY,0.017733467,0.01198022,13.5,
52 | Motif,9_YYY,0.0259799,0.009162702,14.58,
53 | PI,0.0-3.5,0.018313867,0.032143423,13.67,12
54 | PI,3.5-3.9375,0.08010366,0.039139254,,4
55 | PI,3.9375-4.375,0.485241177,0.032641723,,10
56 | PI,4.375-4.8125,0.901919558,0.009318928,,
57 | PI,4.8125-5.25,0.936906848,0.015848292,,
58 | PI,5.25-5.6875,0.824022061,0.010279235,,
59 | PI,5.6875-6.125,0.904667929,0.027072376,,13
60 | PI,6.125-7.0,0.945280631,0.009723836,,
61 | PI,7.0-14.0,0.966549563,0.014879694,,


--------------------------------------------------------------------------------
/ASAP/DesignRecommendation.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pydotplus
 3 | from sklearn import tree
 4 | 
# Settings for the bundled MMP-cluster test case.
# Within this module only SET_NAME is read: it prefixes the file name of the
# decision-tree PNG written by MultiDecisionTree. The remaining constants are
# not referenced by any function in this module.
# NOTE(review): the first cell of ASAP.ipynb assigns the same names with the
# same values -- presumably the two copies are meant to stay in sync; confirm.
SET_NAME = 'MMP-cluster'
IF_ONLY_HEAVY = False
CNT_DB = 2
CNT_TARGET = 1
REFERENCE_PATH_TESTCASE = './testCase/MMP-cluster/reference-PDB/'
TARGETING_PATH_TESTCASE = './testCase/MMP-cluster/targeting-MMP/'
TARGET_DESIRE_SIZE = 166 #44 #MMP-cluster
12 | 
13 | 
14 | 
15 | #################################################################################################################
16 | #  function SanityFeature:
17 | #  Omit non-recommending features, such as motif features and type 0 canonical structures, as feature value for decision tree
18 | # 
19 | #  Input:  AgreeFeature, AllFeatureNames
20 | #  Output: SanityAgreeFeature, a list of indices into AllFeatureNames for the features that are kept as decision tree inputs
21 | #################################################################################################################
22 | 
def SanityFeature(AgreeFeature, AllFeatureNames):
    """Return the feature indices that may be used by the decision tree.

    A feature name is split on '_'. The index is dropped when the first
    field is 'Motif', or when the first field is 'Canonical' and the third
    field is '0'; every other index is kept, in the order given.

    Input:  AgreeFeature    - iterable of indices into AllFeatureNames
            AllFeatureNames - sequence of feature-name strings
    Output: list of the indices from AgreeFeature that were kept
    """
    kept = []
    for feature_idx in AgreeFeature:
        fields = AllFeatureNames[feature_idx].split('_')
        if fields[0] == 'Motif':
            # Motif features are never recommended.
            continue
        if fields[0] == 'Canonical' and fields[2] == '0':
            # Canonical structure of type '0' is not recommended either.
            continue
        kept.append(feature_idx)
    return kept
29 | 
30 | #################################################################################################################
31 | #  function MultiDecisionTree:
32 | #  Decision tree drawn with combined data across multiple iteration
33 | # 
34 | #  Input:   iterate, X_IDS, Y_IDS, AllFeatureNames, type
35 | #################################################################################################################
def MultiDecisionTree(iterate, X_IDS, Y_IDS, AllFeatureNames, type):
    """Fit one decision tree on data pooled over iterations and save it as a PNG.

    The feature columns are first reduced to those accepted by SanityFeature.
    The first `iterate` feature matrices and label vectors are then stacked,
    a DecisionTreeClassifier is fitted on the flipped (1 - X) features, and
    the tree is rendered to
    "./results/<SET_NAME>_DTree<type>.png".

    Input:  iterate         - number of leading entries of X_IDS / Y_IDS to pool
            X_IDS           - per-iteration feature matrices (rows = samples,
                              columns indexed like AllFeatureNames)
            Y_IDS           - per-iteration label vectors
            AllFeatureNames - feature names, one per column of each matrix
            type            - suffix used in the output file name
    Output: None (the PNG file is the only result)

    NOTE(review): assumes feature values are 0/1 indicators and labels sort as
    (Reference, Targeting) -- confirm against the feature-extraction code.
    """
    # Keep only the columns that are meaningful as design recommendations.
    AgreeFeature = list(range(len(AllFeatureNames)))
    SanityAgreeFeature = SanityFeature(AgreeFeature, AllFeatureNames)
    SanityAgreeFeatureName = [AllFeatureNames[idx] for idx in SanityAgreeFeature]

    # Convert locally instead of writing the arrays back into the caller's list.
    Sig_X_DS = [np.asarray(X_IDS[i])[:, SanityAgreeFeature] for i in range(iterate)]
    X = np.concatenate(Sig_X_DS, axis=0)

    # Pool exactly the same iterations for the labels so X and Y stay aligned
    # even when more than `iterate` label vectors are supplied.
    Y = np.concatenate(Y_IDS[:iterate], axis=0)

    # Leaves must hold about 2.5% of the samples; scikit-learn rejects an
    # integer min_samples_leaf below 1, so clamp for small data sets.
    minLeafSize = max(1, int(0.025 * len(Y)))
    clf = tree.DecisionTreeClassifier(min_samples_leaf = minLeafSize)
    # Flip the features so that the "True" branch of a rendered split
    # (flipped value <= threshold) corresponds to the original feature being set.
    clf = clf.fit(1.0 - X, Y)

    dot_data = tree.export_graphviz(clf, out_file=None, filled=True,feature_names=SanityAgreeFeatureName, class_names=['Reference', 'Targeting'], rounded=True)
    pydotplus.graph_from_dot_data(dot_data).write_png("./results/"+ SET_NAME + "_DTree"+ type +".png")
58 | 
59 | 


--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
  1 | name: asap
  2 | channels:
  3 |   - conda-forge/label/cf201901
  4 |   - anaconda
  5 |   - conda-forge
  6 |   - defaults
  7 | dependencies:
  8 |   - ca-certificates=2019.11.27=0
  9 |   - certifi=2019.11.28=py36_0
 10 |   - openssl=1.1.1=h1de35cc_0
 11 |   - pandas=0.25.3=py36h0a44026_0
 12 |   - pytz=2019.3=py_0
 13 |   - appnope=0.1.0=py36_1000
 14 |   - attrs=19.3.0=py_0
 15 |   - backcall=0.1.0=py_0
 16 |   - bleach=3.1.0=py_0
 17 |   - cycler=0.10.0=py_2
 18 |   - decorator=4.4.1=py_0
 19 |   - defusedxml=0.6.0=py_0
 20 |   - entrypoints=0.3=py36_1000
 21 |   - freetype=2.10.0=h24853df_1
 22 |   - graphviz=2.42.3=h98dfb87_0
 23 |   - icu=58.2=h0a44026_1000
 24 |   - importlib_metadata=1.3.0=py36_0
 25 |   - ipykernel=5.1.3=py36h5ca1d4c_0
 26 |   - ipython=7.11.0=py36h5ca1d4c_0
 27 |   - ipython_genutils=0.2.0=py_1
 28 |   - ipywidgets=7.5.1=py_0
 29 |   - jedi=0.15.2=py36_0
 30 |   - jinja2=2.10.3=py_0
 31 |   - joblib=0.14.1=py_0
 32 |   - jsonschema=3.2.0=py36_0
 33 |   - jupyter=1.0.0=py_2
 34 |   - jupyter_client=5.3.3=py36_1
 35 |   - jupyter_console=5.1.0=py36_0
 36 |   - jupyter_core=4.6.1=py36_0
 37 |   - kiwisolver=1.1.0=py36ha1b3eb9_0
 38 |   - libblas=3.8.0=14_openblas
 39 |   - libcblas=3.8.0=14_openblas
 40 |   - libcxx=9.0.0=h89e68fa_1
 41 |   - libffi=3.2.1=h6de7cb9_1006
 42 |   - libgfortran=4.0.0=2
 43 |   - liblapack=3.8.0=14_openblas
 44 |   - libopenblas=0.3.7=h3d69b6c_4
 45 |   - libpng=1.6.37=h2573ce8_0
 46 |   - libsodium=1.0.17=h01d97ff_0
 47 |   - libtiff=4.1.0=ha78913b_1
 48 |   - llvm-openmp=8.0.1=h770b8ee_0
 49 |   - lz4-c=1.8.3=h6de7cb9_1001
 50 |   - markupsafe=1.1.1=py36h0b31af3_0
 51 |   - matplotlib=3.1.2=py36_1
 52 |   - matplotlib-base=3.1.2=py36h11da6c2_1
 53 |   - mistune=0.8.4=py36h0b31af3_1000
 54 |   - more-itertools=8.0.2=py_0
 55 |   - nbconvert=5.6.1=py36_0
 56 |   - nbformat=4.4.0=py_1
 57 |   - ncurses=6.1=h0a44026_1002
 58 |   - notebook=6.0.1=py36_0
 59 |   - numpy=1.17.3=py36hde6bac1_0
 60 |   - pandoc=2.9.1=0
 61 |   - pandocfilters=1.4.2=py_1
 62 |   - parso=0.5.2=py_0
 63 |   - pexpect=4.7.0=py36_0
 64 |   - pickleshare=0.7.5=py36_1000
 65 |   - pip=19.3.1=py36_0
 66 |   - prometheus_client=0.7.1=py_0
 67 |   - prompt_toolkit=3.0.2=py_0
 68 |   - ptyprocess=0.6.0=py_1001
 69 |   - pydot=1.4.1=py36_1001
 70 |   - pydotplus=2.0.2=pyhd1c1de3_3
 71 |   - pygments=2.5.2=py_0
 72 |   - pyparsing=2.4.6=py_0
 73 |   - pyqt=5.6.0=py36hc26a216_1008
 74 |   - pyrsistent=0.15.6=py36h0b31af3_0
 75 |   - python=3.6.7=h8dc6b48_1004
 76 |   - python-dateutil=2.8.1=py_0
 77 |   - pyzmq=18.1.1=py36h4bf09a9_0
 78 |   - qt=5.6.2=h822fa55_1013
 79 |   - qtconsole=4.6.0=py_0
 80 |   - scikit-learn=0.21.3=py36hd4ffd6c_0
 81 |   - scipy=1.4.1=py36h82752d6_0
 82 |   - send2trash=1.5.0=py_0
 83 |   - setuptools=42.0.2=py36_0
 84 |   - sip=4.18.1=py36h0a44026_1000
 85 |   - six=1.13.0=py36_0
 86 |   - terminado=0.8.3=py36_0
 87 |   - testpath=0.4.4=py_0
 88 |   - tk=8.6.10=hbbe82c9_0
 89 |   - tornado=6.0.3=py36h0b31af3_0
 90 |   - traitlets=4.3.3=py36_0
 91 |   - wcwidth=0.1.7=py_1
 92 |   - webencodings=0.5.1=py_1
 93 |   - wheel=0.33.6=py36_0
 94 |   - widgetsnbextension=3.5.1=py36_0
 95 |   - xz=5.2.4=h1de35cc_1001
 96 |   - zeromq=4.3.2=h6de7cb9_2
 97 |   - zipp=0.6.0=py_0
 98 |   - zlib=1.2.11=h0b31af3_1006
 99 |   - zstd=1.4.4=he7fca8b_1
100 |   - biopython=1.72=py36h470a237_0
101 |   - jpeg=9c=h470a237_1
102 |   - readline=7.0=haf1bffa_1
103 |   - sqlite=3.26.0=hb1c47c0_0
104 | 
105 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # ASAP-SML: An Antibody Sequence Analysis Pipeline Using Statistical Testing and Machine Learning
 2 | 
 3 | Antibody Sequence Analysis Pipeline Using Statistical Testing and Machine Learning (ASAP-SML) is a pipeline to identify distinguishing features in a targeting antibody set when compared to a reference non-targeting set. The pipeline first extracts germline, CDR canonical structure, isoelectric point and frequent positional motif features from sequences and creates an antibody feature fingerprint. Machine-learning and statistical significance testing are applied to antibody sequences and feature fingerprints to identify distinguishing feature values and combinations thereof. When applied to an MMP-targeting set, ASAP identifies salient features and recommends features to use when designing novel MMP-targeting antibody sequences.
 4 | 
 5 | ## How to install
 6 | ### Requirements: 
 7 | An [Anaconda python environment](https://www.anaconda.com/download) is recommended.
 8 | Check the environment.yml file, but primarily:
 9 | - python >= 3.5
10 | - pandas
11 | - graphviz
12 | - jupyter
13 | - numpy
14 | - scikit-learn
15 | - scipy
16 | - biopython
17 | 
18 | Jupyter notebook is required to run the ipynb examples.
19 | 
20 | ### via Anaconda 
21 | We recommend installing using Anaconda as follows:
22 | ```
23 | conda env create --name asap --file environment.yml
24 | source activate asap
25 | ```
26 | 
27 | ## Example: Matrix Metalloproteinases (MMP) targeting and reference antibody sequence set
28 | 
29 | This repository contains an example of how to run the ASAP pipeline on the MMP-targeting and reference antibody sequence set.
30 | 
31 | To run the script, open the terminal and go to the project directory, then run:
32 | 
33 | `
34 | jupyter notebook
35 | `
36 | 
37 | Take a look at the file "ASAP.ipynb". Parameters are set based on the user's choice. Once you have set the parameters, run the notebook document step-by-step (one cell at a time) by 
38 | 
39 | - Pressing shift + enter
40 | 
41 | Or, run the whole notebook in a single step by 
42 | 
43 | - Clicking on the menu Cell -> Run All.
44 | 
45 | ## Components
46 | ASAP.ipynb : main script for running ASAP pipeline 
47 | 
48 | - **./ASAP/FeatureExtraction.py** -  functions for feature extraction on Chothia numbered antibody sequences.
49 | - **./ASAP/SequenceAndFeatureAnalysis.py** - functions for sequence and feature analysis on antibody sequences. 
50 | - **./ASAP/DesignRecommendation.py** - functions to generate design recommendation trees for specific targeting antibody sequences.
51 | 
52 | ## Data
53 | 
54 | - Data to run ASAP: [BLOSUM-62 substitution matrix](https://en.wikipedia.org/wiki/BLOSUM#cite_ref-henikoff_1-0) and [Canonical Structure Definition](http://circe.med.uniroma1.it/pigs/canonical.php)
55 | 
56 | - Data to run ASAP on MMP-targeting example: MMP-targeting and reference set. 
57 | 
58 | MMP-targeting set is composed of publicly available antibody sequence data. Reference set is from the Protein Data Bank (PDB) and it consists of human and murine antibody sequences that do not bind or inhibit MMPs. Please see our paper for details.
59 | 
60 | ## Authors:
61 | This software is written by Xinmeng Li, James Van Deventer, Soha Hassoun (Soha.Hassoun@tufts.edu). 
62 | 
63 | Publication: ["ASAP-SML: An Antibody Sequence Analysis Pipeline Using Statistical Testing and Machine Learning"](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1007779)
64 | 
65 | **Please cite our work:**
66 | 
67 | Li, Xinmeng, James A. Van Deventer, and Soha Hassoun. "ASAP-SML: An antibody sequence analysis pipeline using statistical testing and machine learning." PLoS computational biology 16.4 (2020): e1007779.
68 | 
69 | ## License
70 | 
71 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details
72 | 
73 | 


--------------------------------------------------------------------------------
/ASAP.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "#################################################################################################################\n",
 10 |     "#                                                                                                               #\n",
 11 |     "#                                      Section 1 Data Preparation                                               #\n",
 12 |     "#                                                                                                               #\n",
 13 |     "#################################################################################################################\n",
 14 |     "\n",
 15 |     "\n",
 16 |     "#########################################  User define variables  ###############################################\n",
 17 |     "\n",
 18 |     "# User Choice C_SAMPLE_Test\n",
 19 |     "# Run test case for MMP? (y/n)\n",
 20 |     "# If True: Default testCase-MMP files \n",
 21 |     "# If False: User upload Chothia-numbered sequences files to \"targeting\" and \"reference\" folders under \"./user/data/\" respectively.\n",
 22 |     "# Default: True\n",
 23 |     "\n",
 24 |     "C_SAMPLE_Test = True\n",
 25 |     "\n",
 26 |     "# User Choice C_PIGS\n",
 27 |     "# Use PIGS template for CDR canonical structure? (y/n)\n",
 28 |     "# If True: Default PIGS CDR Canonical structure template under Chothia numbering\n",
 29 |     "# If False: User upload formatted CDR Canonical structure template under \"./user/data/\"\n",
 30 |     "# Default: True\n",
 31 |     "\n",
 32 |     "C_PIGS = True\n",
 33 |     "\n",
 34 |     "# User Choice C_DesireSize\n",
 35 |     "# Use default desire size for targeting dataset? (y/n)\n",
 36 |     "# If True: Default desire size, TARGET_DESIRE_SIZE (166 for the MMP-cluster test case), 'medium' for user upload files\n",
 37 |     "# If False: User define desire size for targeting dataset\n",
 38 |     "# Default: True\n",
 39 |     "\n",
 40 |     "C_DesireSize = True\n",
 41 |     "\n",
 42 |     "# User Choice C_k\n",
 43 |     "# Use default number of iterations? (y/n)\n",
 44 |     "# If True: Default number of iterations, k = 100\n",
 45 |     "# If False: User define number of iterations\n",
 46 |     "# Default: True\n",
 47 |     "C_k = True\n",
 48 |     "\n",
 49 |     "\n",
 50 |     "#######################################  Define global variables  ###############################################\n",
 51 |     "\n",
 52 |     "\n",
 53 |     "# SET_NAME = 'IGHV'\n",
 54 |     "# IF_ONLY_HEAVY = True\n",
 55 |     "# CNT_DB = 1\n",
 56 |     "# CNT_TARGET = 1\n",
 57 |     "# REFERENCE_PATH_TESTCASE = './testCase/IGHV/reference-IGHV/'\n",
 58 |     "# TARGETING_PATH_TESTCASE = './testCase/IGHV/targeting-MMP-IGHV/'\n",
 59 |     "# TARGET_DESIRE_SIZE = 134 #44  #IGHV\n",
 60 |     "\n",
 61 |     "SET_NAME = 'MMP-cluster'\n",
 62 |     "IF_ONLY_HEAVY = False\n",
 63 |     "CNT_DB = 2\n",
 64 |     "CNT_TARGET = 1\n",
 65 |     "REFERENCE_PATH_TESTCASE = './testCase/MMP-cluster/reference-PDB/'\n",
 66 |     "TARGETING_PATH_TESTCASE = './testCase/MMP-cluster/targeting-MMP/'\n",
 67 |     "TARGET_DESIRE_SIZE = 166\n",
 68 |     "\n",
 69 |     "PIGS_PATH = './data/pigs_canonical.txt'\n",
 70 |     "TEMPLATE_PATH = './user/data/'\n",
 71 |     "\n",
 72 |     "\n",
 73 |     "\n",
 74 |     "ITERATION = 100\n",
 75 |     "\n",
 76 |     "########################################  Determine variable values  ############################################\n",
 77 |     "\n",
 78 |     "if C_SAMPLE_Test == True:\n",
 79 |     "    targeting_direct = TARGETING_PATH_TESTCASE\n",
 80 |     "    reference_direct = REFERENCE_PATH_TESTCASE\n",
 81 |     "else:\n",
 82 |     "    print(\"Each pair of light and heavy chain sequence should be in the order of LIGHT/HEAVY/LIGHT/HEAVY\")\n",
 83 |     "    targeting_direct = TARGETING_PATH\n",
 84 |     "    reference_direct = REFERENCE_PATH\n",
 85 |     "    \n",
 86 |     "if C_PIGS == True:\n",
 87 |     "    canonical_direct = PIGS_PATH\n",
 88 |     "else:\n",
 89 |     "    print(\"Upload CDR canonical structure templates. \")\n",
 90 |     "    print(\"In the template, the first column must be the L1, L2, L3, H1, H2, or H3, \")\n",
 91 |     "    print(\"the second column is the length of the region defined in the first column, \")\n",
 92 |     "    print(\"starting from the third column, it is the position and candidate amino acid on each position, such as 1 ABC 2 CDETFG.\") \n",
 93 |     "    template_name = input(\"What is the name of the template?\")\n",
 94 |     "    canonical_direct = TEMPLATE_PATH + template_name\n",
 95 |     "    \n",
 96 |     "if C_SAMPLE_Test == True and C_DesireSize == True:\n",
 97 |     "    size = TARGET_DESIRE_SIZE\n",
 98 |     "elif C_SAMPLE_Test == False and C_DesireSize == True:\n",
 99 |     "    size = 'medium'\n",
100 |     "else:\n",
101 |     "    size = int(input('What is the desire size for the targeting set?'))\n",
102 |     "    \n",
103 |     "if C_k == True:\n",
104 |     "    iterate = ITERATION\n",
105 |     "else:\n",
106 |     "    iterate = int(input(\"What is the number of iterations?\"))\n"
107 |    ]
108 |   },
109 |   {
110 |    "cell_type": "code",
111 |    "execution_count": 2,
112 |    "metadata": {},
113 |    "outputs": [
114 |     {
115 |      "name": "stdout",
116 |      "output_type": "stream",
117 |      "text": [
118 |       "Data:\n",
119 |       "r1 : 276\n",
120 |       "r2 : 219\n",
121 |       "t1 : 166\n",
122 |       "Sum: 661\n",
123 |       "\n",
124 |       "Number of feature values:\n",
125 |       "Germline: 334\n",
126 |       "CDR canonical structures: 20\n",
127 |       "Isoelectric points (pI): 8\n",
128 |       "Frequent positional motif: 42\n",
129 |       "Total: 404\n"
130 |      ]
131 |     }
132 |    ],
133 |    "source": [
134 |     "#################################################################################################################\n",
135 |     "#                                                                                                               #\n",
136 |     "#                                       Section 2 Feature Extraction                                            #\n",
137 |     "#                                                                                                               #\n",
138 |     "#################################################################################################################\n",
139 |     "\n",
140 |     "\n",
141 |     "############################################  Import libaries  ##################################################\n",
142 |     "\n",
143 |     "import ASAP.FeatureExtraction as extract\n",
144 |     "\n",
145 |     "\n",
146 |     "############################################  Function calls   ##################################################\n",
147 |     "\n",
148 |     "Amino, Num, Germ, DatasetName, DatasetSize = extract.ReadAminoNumGerm(targeting_direct, reference_direct)\n",
149 |     "\n",
150 |     "OneHotGerm, GermFeatureNames = extract.GetOneHotGerm(Germ, DatasetSize, DatasetName)\n",
151 |     "\n",
152 |     "OneHotCanon, CanonFeatureNames = extract.GetOneHotCanon(canonical_direct, Amino, Num, DatasetSize, DatasetName)\n",
153 |     "\n",
154 |     "CDRH3 = extract.GetCDRH3(Amino, Num)\n",
155 |     "\n",
156 |     "OneHotPI, PIFeatureNames = extract.GetOneHotPI(CDRH3, DatasetSize, DatasetName)\n",
157 |     "\n",
158 |     "MultiHotMotif, MotifFeatureNames = extract.MultiHotMotif(CDRH3, DatasetSize, DatasetName)\n",
159 |     "\n",
160 |     "AllFeatureVectors, AllFeatureNames, ExcludeIGHVVectors, ExcludeFeatureNames = extract.GetFeatureVectors(OneHotGerm, GermFeatureNames, OneHotCanon, CanonFeatureNames, OneHotPI, PIFeatureNames, MultiHotMotif, MotifFeatureNames)\n",
161 |     "\n",
162 |     "\n",
163 |     "############################################  Report section results  #############################################\n",
164 |     "\n",
165 |     "print(\"Data:\")\n",
166 |     "for i in range(len(DatasetSize)):\n",
167 |     "    print(DatasetName[i], \":\",DatasetSize[i],)\n",
168 |     "print(\"Sum:\", sum(DatasetSize))\n",
169 |     "\n",
170 |     "print(\"\\nNumber of feature values:\")\n",
171 |     "print(\"Germline:\", len(GermFeatureNames),)\n",
172 |     "print(\"CDR canonical structures:\", len(CanonFeatureNames),)\n",
173 |     "print(\"Isoelectric points (pI):\", len(PIFeatureNames),)\n",
174 |     "print(\"Frequent positional motif:\",len(MotifFeatureNames),)\n",
175 |     "print(\"Total:\", len(AllFeatureNames))"
176 |    ]
177 |   },
178 |   {
179 |    "cell_type": "code",
180 |    "execution_count": 3,
181 |    "metadata": {},
182 |    "outputs": [
183 |     {
184 |      "name": "stderr",
185 |      "output_type": "stream",
186 |      "text": [
187 |       "/Users/xinmeng/anaconda3/envs/homework/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n",
188 |       "  return f(*args, **kwds)\n"
189 |      ]
190 |     },
191 |     {
192 |      "name": "stdout",
193 |      "output_type": "stream",
194 |      "text": [
195 |       "RanksumsResult(statistic=-147.48812830271845, pvalue=0.0) RanksumsResult(statistic=-191.24982487069587, pvalue=0.0)\n",
196 |       "Statistical tests (Reference against Targeting) succeed.\n",
197 |       "(661, 404) 495 166\n",
198 |       "Average AUC with all features: \n",
199 |       "SVM\t\t 0.9900013753999437\n",
200 |       "Random forest\t 0.9858123420498757\n",
201 |       "AdaBoost\t 0.985921855997419\n"
202 |      ]
203 |     }
204 |    ],
205 |    "source": [
206 |     "#################################################################################################################\n",
207 |     "#                                                                                                               #\n",
208 |     "#                                   Section 3 Sequence and Feature Analysis                                     #\n",
209 |     "#                                                                                                               #\n",
210 |     "#################################################################################################################\n",
211 |     "\n",
212 |     "############################################  Import libaries  ##################################################\n",
213 |     "\n",
214 |     "import ASAP.SequenceAndFeatureAnalysis as analysis\n",
215 |     "\n",
216 |     "############################################  Function calls   ##################################################\n",
217 |     "\n",
218 |     "X_IDS, Y_IDS, SeqName_IDS = analysis.IterationDuplicateSelectFeature(size, iterate, DatasetName, \n",
219 |     "                                                                                       DatasetSize, ExcludeIGHVVectors)\n",
220 |     "\n",
221 |     "######################  Section 3.1 Sequence and feature similarity analysis (Heat map) ##########################\n",
222 |     "\n",
223 |     "H_Idist, L_Idist = analysis.HeatmapHL(size, iterate, SeqName_IDS, Amino, Num)\n",
224 |     "analysis.Draw_heatmap(size, H_Idist[1], 'Heavy Chain Sequences', DatasetSize)\n",
225 |     "if not IF_ONLY_HEAVY:\n",
226 |     "    analysis.Draw_heatmap(size, L_Idist[1], 'Light Chain Sequences', DatasetSize)\n",
227 |     "F_Idist = analysis.HeatmapFeature(size, iterate, X_IDS, ExcludeFeatureNames, MotifFeatureNames)\n",
228 |     "analysis.Draw_heatmap(size, F_Idist[0], 'Extracted Features', DatasetSize)\n",
229 |     "\n",
230 |     "###############################  Section 3.2 Similarity analysis (Statistical test) #############################\n",
231 |     "\n",
232 |     "analysis.MultiRankTest(size, iterate, F_Idist, H_Idist, L_Idist)\n",
233 |     "\n",
234 |     "#######################################  Section 3.3 Salient feature-value analysis  ############################                                     #\n",
235 |     "\n",
236 |     "analysis.MultiFisherFS(iterate, X_IDS, Y_IDS, DatasetName, DatasetSize, ExcludeIGHVVectors, \n",
237 |     "                                         ExcludeFeatureNames)\n",
238 |     "\n",
239 |     "#######################################  Section 3.4 Classification on segments  ################################  \n",
240 |     "\n",
241 |     "analysis.MultiAuc(iterate, X_IDS, Y_IDS)\n",
242 |     "analysis.ROCDrawing(X_IDS[0], Y_IDS[0], GermFeatureNames, CanonFeatureNames, PIFeatureNames, MotifFeatureNames, AllFeatureNames)\n",
243 |     "\n",
244 |     "analysis.JaccardCoefficientAnalysis(AllFeatureVectors, AllFeatureNames, DatasetSize)"
245 |    ]
246 |   },
247 |   {
248 |    "cell_type": "code",
249 |    "execution_count": 4,
250 |    "metadata": {},
251 |    "outputs": [],
252 |    "source": [
253 |     "#################################################################################################################\n",
254 |     "#                                                                                                               #\n",
255 |     "#                                         Section 4 Design Recommendation                                       #\n",
256 |     "#                                                                                                               #\n",
257 |     "#################################################################################################################\n",
258 |     "\n",
259 |     "############################################  Import libaries  ##################################################\n",
260 |     "\n",
261 |     "import ASAP.DesignRecommendation as design\n",
262 |     "\n",
263 |     "############################################  Function calls   ##################################################\n",
264 |     "\n",
265 |     "design.MultiDecisionTree(iterate, X_IDS, Y_IDS, ExcludeFeatureNames, 'AllFeature')"
266 |    ]
267 |   },
268 |   {
269 |    "cell_type": "code",
270 |    "execution_count": null,
271 |    "metadata": {},
272 |    "outputs": [],
273 |    "source": []
274 |   }
275 |  ],
276 |  "metadata": {
277 |   "kernelspec": {
278 |    "display_name": "Python 3",
279 |    "language": "python",
280 |    "name": "python3"
281 |   },
282 |   "language_info": {
283 |    "codemirror_mode": {
284 |     "name": "ipython",
285 |     "version": 3
286 |    },
287 |    "file_extension": ".py",
288 |    "mimetype": "text/x-python",
289 |    "name": "python",
290 |    "nbconvert_exporter": "python",
291 |    "pygments_lexer": "ipython3",
292 |    "version": "3.6.8"
293 |   }
294 |  },
295 |  "nbformat": 4,
296 |  "nbformat_minor": 1
297 | }
298 | 


--------------------------------------------------------------------------------
/.ipynb_checkpoints/ASAP-checkpoint.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "#################################################################################################################\n",
 10 |     "#                                                                                                               #\n",
 11 |     "#                                      Section 1 Data Preperation                                               #\n",
 12 |     "#                                                                                                               #\n",
 13 |     "#################################################################################################################\n",
 14 |     "\n",
 15 |     "\n",
 16 |     "#########################################  User define variables  ###############################################\n",
 17 |     "\n",
 18 |     "# User Choice C_MMPTest\n",
 19 |     "# Run test case for MMP? (y/n)\n",
 20 |     "# If True: Default testCase-MMP files \n",
 21 |     "# If False: User upload Chothia-numbered sequences files to \"targeting\" and \"reference\" folders under \"./user/data/\" respectively.\n",
 22 |     "# Default: True\n",
 23 |     "\n",
 24 |     "C_SAMPLE_Test = True\n",
 25 |     "\n",
 26 |     "# User Choice C_PIGS\n",
 27 |     "# Use PIGS template for CDR canonical structure? (y/n)\n",
 28 |     "# If True: Default PIGS CDR Canonical structure template under Chothia numbering\n",
 29 |     "# If False: User upload fomatted CDR Canonical structure template under \"./user/data/\"\n",
 30 |     "# Default: True\n",
 31 |     "\n",
 32 |     "C_PIGS = True\n",
 33 |     "\n",
 34 |     "# User Choice C_DesireSize\n",
 35 |     "# Use default desire size for targeting dataset? (y/n)\n",
 36 |     "# If True: Default desire size, 44 for the MMP test case, medium for user upload files\n",
 37 |     "# If False: User define desire size for targeting dataset\n",
 38 |     "# Default: True\n",
 39 |     "\n",
 40 |     "C_DesireSize = True\n",
 41 |     "\n",
 42 |     "# User Choice C_k\n",
 43 |     "# Use default number of iterations? (y/n)\n",
 44 |     "# If True: Default number of iterations, k = 100\n",
 45 |     "# If False: User define number of iterations\n",
 46 |     "# Default: True\n",
 47 |     "C_k = True\n",
 48 |     "\n",
 49 |     "\n",
 50 |     "#######################################  Define global variables  ###############################################\n",
 51 |     "\n",
 52 |     "\n",
 53 |     "# SET_NAME = 'IGHV'\n",
 54 |     "# IF_ONLY_HEAVY = True\n",
 55 |     "# CNT_DB = 1\n",
 56 |     "# CNT_TARGET = 1\n",
 57 |     "# REFERENCE_PATH_TESTCASE = './testCase/IGHV/reference-IGHV/'\n",
 58 |     "# TARGETING_PATH_TESTCASE = './testCase/IGHV/targeting-MMP-IGHV/'\n",
 59 |     "# TARGET_DESIRE_SIZE = 134 #44  #IGHV\n",
 60 |     "\n",
 61 |     "SET_NAME = 'MMP-cluster'\n",
 62 |     "IF_ONLY_HEAVY = False\n",
 63 |     "CNT_DB = 2\n",
 64 |     "CNT_TARGET = 1\n",
 65 |     "REFERENCE_PATH_TESTCASE = './testCase/MMP-cluster/reference-PDB/'\n",
 66 |     "TARGETING_PATH_TESTCASE = './testCase/MMP-cluster/targeting-MMP/'\n",
 67 |     "TARGET_DESIRE_SIZE = 166\n",
 68 |     "\n",
 69 |     "PIGS_PATH = './data/pigs_canonical.txt'\n",
 70 |     "TEMPLATE_PATH = './user/data/'\n",
 71 |     "\n",
 72 |     "\n",
 73 |     "\n",
 74 |     "ITERATION = 100\n",
 75 |     "\n",
 76 |     "########################################  Determine variable values  ############################################\n",
 77 |     "\n",
 78 |     "if C_SAMPLE_Test == True:\n",
 79 |     "    targeting_direct = TARGETING_PATH_TESTCASE\n",
 80 |     "    reference_direct = REFERENCE_PATH_TESTCASE\n",
 81 |     "else:\n",
 82 |     "    print(\"Each pair of light and heavy chain sequence should be in the order of LIGHT/HEAVY/LIGHT/HEAVY\")\n",
 83 |     "    targeting_direct = TARGETING_PATH\n",
 84 |     "    reference_direct = REFERENCE_PATH\n",
 85 |     "    \n",
 86 |     "if C_PIGS == True:\n",
 87 |     "    canonical_direct = PIGS_PATH\n",
 88 |     "else:\n",
 89 |     "    print(\"Upload CDR canonical structure templates. \")\n",
 90 |     "    print(\"In the template, the first column must be the L1, L2, L3, H1, H2, or H3, \")\n",
 91 |     "    print(\"the second column is the length of the region defined in the first column, \")\n",
 92 |     "    print(\"starting from the third column, it is the position and candidate amino acid on each position, such as 1 ABC 2 CDETFG.\") \n",
 93 |     "    template_name = input(\"What is the name of the template?\")\n",
 94 |     "    canonical_direct = TEMPLATE_PATH + template_name\n",
 95 |     "    \n",
 96 |     "if C_SAMPLE_Test == True and C_DesireSize == True:\n",
 97 |     "    size = TARGET_DESIRE_SIZE\n",
 98 |     "elif C_SAMPLE_Test == False and C_DesireSize == True:\n",
 99 |     "    size = 'medium'\n",
100 |     "else:\n",
101 |     "    size = int(input('What is the desire size for the targeting set?'))\n",
102 |     "    \n",
103 |     "if C_k == True:\n",
104 |     "    iterate = ITERATION\n",
105 |     "else:\n",
106 |     "    iterate = int(input(\"What is the number of iterations?\"))\n"
107 |    ]
108 |   },
109 |   {
110 |    "cell_type": "code",
111 |    "execution_count": 2,
112 |    "metadata": {},
113 |    "outputs": [
114 |     {
115 |      "name": "stdout",
116 |      "output_type": "stream",
117 |      "text": [
118 |       "Data:\n",
119 |       "r1 : 276\n",
120 |       "r2 : 219\n",
121 |       "t1 : 166\n",
122 |       "Sum: 661\n",
123 |       "\n",
124 |       "Number of feature values:\n",
125 |       "Germline: 334\n",
126 |       "CDR canonical structures: 20\n",
127 |       "Isoelectric points (pI): 8\n",
128 |       "Frequent positional motif: 42\n",
129 |       "Total: 404\n"
130 |      ]
131 |     }
132 |    ],
133 |    "source": [
134 |     "#################################################################################################################\n",
135 |     "#                                                                                                               #\n",
136 |     "#                                       Section 2 Feature Extraction                                            #\n",
137 |     "#                                                                                                               #\n",
138 |     "#################################################################################################################\n",
139 |     "\n",
140 |     "\n",
 141 |     "############################################  Import libraries  #################################################\n",
142 |     "\n",
143 |     "import ASAP.FeatureExtraction as extract\n",
144 |     "\n",
145 |     "\n",
146 |     "############################################  Function calls   ##################################################\n",
147 |     "\n",
148 |     "Amino, Num, Germ, DatasetName, DatasetSize = extract.ReadAminoNumGerm(targeting_direct, reference_direct)\n",
149 |     "\n",
150 |     "OneHotGerm, GermFeatureNames = extract.GetOneHotGerm(Germ, DatasetSize, DatasetName)\n",
151 |     "\n",
152 |     "OneHotCanon, CanonFeatureNames = extract.GetOneHotCanon(canonical_direct, Amino, Num, DatasetSize, DatasetName)\n",
153 |     "\n",
154 |     "CDRH3 = extract.GetCDRH3(Amino, Num)\n",
155 |     "\n",
156 |     "OneHotPI, PIFeatureNames = extract.GetOneHotPI(CDRH3, DatasetSize, DatasetName)\n",
157 |     "\n",
158 |     "MultiHotMotif, MotifFeatureNames = extract.MultiHotMotif(CDRH3, DatasetSize, DatasetName)\n",
159 |     "\n",
160 |     "AllFeatureVectors, AllFeatureNames, ExcludeIGHVVectors, ExcludeFeatureNames = extract.GetFeatureVectors(OneHotGerm, GermFeatureNames, OneHotCanon, CanonFeatureNames, OneHotPI, PIFeatureNames, MultiHotMotif, MotifFeatureNames)\n",
161 |     "\n",
162 |     "\n",
163 |     "############################################  Report section results  #############################################\n",
164 |     "\n",
165 |     "print(\"Data:\")\n",
166 |     "for i in range(len(DatasetSize)):\n",
167 |     "    print(DatasetName[i], \":\",DatasetSize[i],)\n",
168 |     "print(\"Sum:\", sum(DatasetSize))\n",
169 |     "\n",
170 |     "print(\"\\nNumber of feature values:\")\n",
171 |     "print(\"Germline:\", len(GermFeatureNames),)\n",
172 |     "print(\"CDR canonical structures:\", len(CanonFeatureNames),)\n",
173 |     "print(\"Isoelectric points (pI):\", len(PIFeatureNames),)\n",
174 |     "print(\"Frequent positional motif:\",len(MotifFeatureNames),)\n",
175 |     "print(\"Total:\", len(AllFeatureNames))"
176 |    ]
177 |   },
178 |   {
179 |    "cell_type": "code",
180 |    "execution_count": 3,
181 |    "metadata": {},
182 |    "outputs": [
183 |     {
184 |      "name": "stderr",
185 |      "output_type": "stream",
186 |      "text": [
187 |       "/Users/xinmeng/anaconda3/envs/homework/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n",
188 |       "  return f(*args, **kwds)\n"
189 |      ]
190 |     },
191 |     {
192 |      "name": "stdout",
193 |      "output_type": "stream",
194 |      "text": [
195 |       "RanksumsResult(statistic=-147.48812830271845, pvalue=0.0) RanksumsResult(statistic=-191.24982487069587, pvalue=0.0)\n",
196 |       "Statistical tests (Reference against Targeting) succeed.\n",
197 |       "(661, 404) 495 166\n",
198 |       "Average AUC with all features: \n",
199 |       "SVM\t\t 0.9900013753999437\n",
200 |       "Random forest\t 0.9858123420498757\n",
201 |       "AdaBoost\t 0.985921855997419\n"
202 |      ]
203 |     }
204 |    ],
205 |    "source": [
206 |     "#################################################################################################################\n",
207 |     "#                                                                                                               #\n",
208 |     "#                                   Section 3 Sequence and Feature Analysis                                     #\n",
209 |     "#                                                                                                               #\n",
210 |     "#################################################################################################################\n",
211 |     "\n",
 212 |     "############################################  Import libraries  #################################################\n",
213 |     "\n",
214 |     "import ASAP.SequenceAndFeatureAnalysis as analysis\n",
215 |     "\n",
216 |     "############################################  Function calls   ##################################################\n",
217 |     "\n",
218 |     "X_IDS, Y_IDS, SeqName_IDS = analysis.IterationDuplicateSelectFeature(size, iterate, DatasetName, \n",
219 |     "                                                                                       DatasetSize, ExcludeIGHVVectors)\n",
220 |     "\n",
221 |     "######################  Section 3.1 Sequence and feature similarity analysis (Heat map) ##########################\n",
222 |     "\n",
223 |     "H_Idist, L_Idist = analysis.HeatmapHL(size, iterate, SeqName_IDS, Amino, Num)\n",
224 |     "analysis.Draw_heatmap(size, H_Idist[1], 'Heavy Chain Sequences', DatasetSize)\n",
225 |     "if not IF_ONLY_HEAVY:\n",
226 |     "    analysis.Draw_heatmap(size, L_Idist[1], 'Light Chain Sequences', DatasetSize)\n",
227 |     "F_Idist = analysis.HeatmapFeature(size, iterate, X_IDS, ExcludeFeatureNames, MotifFeatureNames)\n",
228 |     "analysis.Draw_heatmap(size, F_Idist[0], 'Extracted Features', DatasetSize)\n",
229 |     "\n",
230 |     "###############################  Section 3.2 Similarity analysis (Statistical test) #############################\n",
231 |     "\n",
232 |     "analysis.MultiRankTest(size, iterate, F_Idist, H_Idist, L_Idist)\n",
233 |     "\n",
234 |     "#######################################  Section 3.3 Salient feature-value analysis  ############################                                     #\n",
235 |     "\n",
236 |     "analysis.MultiFisherFS(iterate, X_IDS, Y_IDS, DatasetName, DatasetSize, ExcludeIGHVVectors, \n",
237 |     "                                         ExcludeFeatureNames)\n",
238 |     "\n",
239 |     "#######################################  Section 3.4 Classification on segments  ################################  \n",
240 |     "\n",
241 |     "analysis.MultiAuc(iterate, X_IDS, Y_IDS)\n",
242 |     "analysis.ROCDrawing(X_IDS[0], Y_IDS[0], GermFeatureNames, CanonFeatureNames, PIFeatureNames, MotifFeatureNames, AllFeatureNames)\n",
243 |     "\n",
244 |     "analysis.JaccardCoefficientAnalysis(AllFeatureVectors, AllFeatureNames, DatasetSize)"
245 |    ]
246 |   },
247 |   {
248 |    "cell_type": "code",
249 |    "execution_count": 4,
250 |    "metadata": {},
251 |    "outputs": [],
252 |    "source": [
253 |     "#################################################################################################################\n",
254 |     "#                                                                                                               #\n",
255 |     "#                                         Section 4 Design Recommendation                                       #\n",
256 |     "#                                                                                                               #\n",
257 |     "#################################################################################################################\n",
258 |     "\n",
 259 |     "############################################  Import libraries  #################################################\n",
260 |     "\n",
261 |     "import ASAP.DesignRecommendation as design\n",
262 |     "\n",
263 |     "############################################  Function calls   ##################################################\n",
264 |     "\n",
265 |     "design.MultiDecisionTree(iterate, X_IDS, Y_IDS, ExcludeFeatureNames, 'AllFeature')"
266 |    ]
267 |   },
268 |   {
269 |    "cell_type": "code",
270 |    "execution_count": null,
271 |    "metadata": {},
272 |    "outputs": [],
273 |    "source": []
274 |   }
275 |  ],
276 |  "metadata": {
277 |   "kernelspec": {
278 |    "display_name": "Python 3",
279 |    "language": "python",
280 |    "name": "python3"
281 |   },
282 |   "language_info": {
283 |    "codemirror_mode": {
284 |     "name": "ipython",
285 |     "version": 3
286 |    },
287 |    "file_extension": ".py",
288 |    "mimetype": "text/x-python",
289 |    "name": "python",
290 |    "nbconvert_exporter": "python",
291 |    "pygments_lexer": "ipython3",
292 |    "version": "3.6.8"
293 |   }
294 |  },
295 |  "nbformat": 4,
296 |  "nbformat_minor": 1
297 | }
298 | 


--------------------------------------------------------------------------------
/ASAP/S_SequenceInRegion.py:
--------------------------------------------------------------------------------
  1 | import Bio.SeqUtils.ProtParam
  2 | import os
  3 | import ASAP.FeatureExtraction as extract
  4 | import pandas as pd
  5 | import matplotlib.pyplot as plt
  6 | import numpy as np
  7 | 
# Chothia numbering definition for CDR regions.
# Layout: chain ('L' or 'H') -> CDR index ('1', '2', '3') -> [first, last] position.
# sequence_region() treats both ends as inclusive: positions below `first` go to the
# preceding framework region, positions up to and including `last` go to the CDR.
CHOTHIA_CDR = {'L': {'1': [24, 34], '2': [50, 56], '3': [89, 97]}, 'H':{'1': [26, 32], '2': [52, 56], '3': [95, 102]}}
# Canonical-structure template file handed to extract.GetOneHotCanon in the __main__ block.
canonical_direct = '../data/pigs_canonical.txt'

# Prefix used for the output file names written under ../results/.
SET_NAME = 'IGHV'
# When True, the writers in this script skip the light-chain ('L') columns.
IF_ONLY_HEAVY = True
# NOTE(review): CNT_DB, CNT_TARGET and TARGET_DESIRE_SIZE are not referenced anywhere
# else in this script.
CNT_DB = 1
CNT_TARGET = 1
REFERENCE_PATH_TESTCASE = '../testCase/IGHV/reference-IGHV/'
TARGETING_PATH_TESTCASE = '../testCase/IGHV/targeting-MMP-IGHV/'
TARGET_DESIRE_SIZE = 134 #44  #IGHV

targeting_direct = TARGETING_PATH_TESTCASE
reference_direct = REFERENCE_PATH_TESTCASE

# Runs at import time. Amino and Num are indexed below as [chain][sequence name];
# DatasetName and DatasetSize are used as parallel sequences (one entry per dataset).
Amino, Num, Germ, DatasetName, DatasetSize = extract.ReadAminoNumGerm(targeting_direct, reference_direct)
 24 | 
 25 | seq_id = []
 26 | for i, name in enumerate(DatasetName):
 27 |     # if i<2:
 28 |     #     continue
 29 |     tmp= [[] for j in range(int(DatasetSize[i]))]
 30 |     # for every seq in that dataset
 31 |     for j in range(int(DatasetSize[i])):
 32 |         seq_name = name + '_' + str(j)
 33 |         seq_id.append(seq_name)
 34 | 
 35 | # raw sequence
 36 | def sequence_raw():
 37 |     def getSequenceHL(sname):
 38 |         SH = ''.join(Amino['H'][sname])
 39 |         SL = ''
 40 |         if not IF_ONLY_HEAVY:
 41 |             SL = ''.join(Amino['L'][sname])
 42 |             return SL, SH
 43 |         else:
 44 |             return [SH]
 45 | 
 46 |     with open('../results/'+SET_NAME +'_Sequence.csv','w') as fi:
 47 |         fi.write('sequence name, ')
 48 |         if not IF_ONLY_HEAVY:
 49 |             fi.write('light chain, ')
 50 |         fi.write('heavy chain\n')
 51 |         for sname in seq_id:
 52 |             fi.write(sname + ',' + ','.join(getSequenceHL(sname))+ '\n')
 53 | 
 54 | # sequence with numbering
 55 | def sequence_num():
 56 |     def getSequenceHL_num(sname):
 57 |         NH = ','.join(Num['H'][sname])
 58 |         SH = ','.join(Amino['H'][sname])
 59 |         NL = ','.join(Num['L'][sname])
 60 |         SL = ','.join(Amino['L'][sname])
 61 |         return NH, SH, NL, SL
 62 | 
 63 |     with open('./Sequence_numbered.csv','w') as fi:
 64 |         for sname in seq_id:
 65 |             NH, SH, NL, SL = getSequenceHL_num(sname)
 66 |             fi.write(sname + ' light num,' + NL + '\n')
 67 |             fi.write(sname + ' light seq,' + SL + '\n')
 68 |             fi.write(sname + ' heavy num,' + NH + '\n')
 69 |             fi.write(sname + ' heavy seq,' + SH + '\n')
 70 | 
 71 | # sequence with region
 72 | def sequence_region():
 73 |     def getSequenceHL_region(sname):
 74 |         NH = Num['H'][sname]
 75 | 
 76 |         HFW1, HCDR1, HFW2, HCDR2, HFW3, HCDR3, HFW4 = '', '', '', '', '', '', ''
 77 | 
 78 |         for i, number in enumerate(NH):
 79 |             if number[-1] >= 'A' and number[-1] <= 'Z':
 80 |                 num_i = int(number[:-1])
 81 |             else:
 82 |                 num_i = int(number)
 83 |             if num_i < CHOTHIA_CDR['H']['1'][0]:
 84 |                 HFW1 += Amino['H'][sname][i]
 85 |             elif num_i <= CHOTHIA_CDR['H']['1'][1]:
 86 |                 HCDR1+= Amino['H'][sname][i]
 87 |             elif num_i < CHOTHIA_CDR['H']['2'][0]:
 88 |                 HFW2 += Amino['H'][sname][i]
 89 |             elif num_i <= CHOTHIA_CDR['H']['2'][1]:
 90 |                 HCDR2 += Amino['H'][sname][i]
 91 |             elif num_i < CHOTHIA_CDR['H']['3'][0]:
 92 |                 HFW3 += Amino['H'][sname][i]
 93 |             elif num_i <= CHOTHIA_CDR['H']['3'][1]:
 94 |                 HCDR3 += Amino['H'][sname][i]
 95 |             else:
 96 |                 HFW4 += Amino['H'][sname][i]
 97 |         if IF_ONLY_HEAVY:
 98 |             return ''.join(HFW1), ''.join(HCDR1), ''.join(HFW2), ''.join(HCDR2), ''.join(HFW3), ''.join(HCDR3), ''.join(
 99 |                 HFW4)
100 |         else:
101 |             NL = Num['L'][sname]
102 |             LFW1, LCDR1, LFW2, LCDR2, LFW3, LCDR3, LFW4 = '', '', '', '', '', '', ''
103 |             for i, number in enumerate(NL):
104 |                 if number[-1] >= 'A' and number[-1] <= 'Z':
105 |                     num_i = int(number[:-1])
106 |                 else:
107 |                     num_i = int(number)
108 |                 if num_i < CHOTHIA_CDR['L']['1'][0]:
109 |                     LFW1 += Amino['L'][sname][i]
110 |                 elif num_i <= CHOTHIA_CDR['L']['1'][1]:
111 |                     LCDR1 += Amino['L'][sname][i]
112 |                 elif num_i < CHOTHIA_CDR['L']['2'][0]:
113 |                     LFW2 += Amino['L'][sname][i]
114 |                 elif num_i <= CHOTHIA_CDR['L']['2'][1]:
115 |                     LCDR2 += Amino['L'][sname][i]
116 |                 elif num_i < CHOTHIA_CDR['L']['3'][0]:
117 |                     LFW3 += Amino['L'][sname][i]
118 |                 elif num_i <= CHOTHIA_CDR['L']['3'][1]:
119 |                     LCDR3 += Amino['L'][sname][i]
120 |                 else:
121 |                     LFW4 += Amino['L'][sname][i]
122 |             return ''.join(LFW1), ''.join(LCDR1), ''.join(LFW2), ''.join(LCDR2), ''.join(LFW3), ''.join(LCDR3), ''.join(LFW4),\
123 |                    ''.join(HFW1), ''.join(HCDR1), ''.join(HFW2), ''.join(HCDR2), ''.join(HFW3), ''.join(HCDR3), ''.join(HFW4)
124 | 
125 |     with open('../results/'+SET_NAME +'_Sequence_region.csv','w') as fi:
126 |         if IF_ONLY_HEAVY:
127 |             fi.write(
128 |                 'sequence id, heavy chain FW1, heavy chain CDR1, heavy chain FW2, heavy chain CDR2, heavy chain FW3, heavy chain CDR3, heavy chain FW4\n')
129 | 
130 |         else:
131 |             fi.write('sequence id, light chain FW1, light chain CDR1, light chain FW2, light chain CDR2, light chain FW3, light chain CDR3, light chain FW4, '+
132 |                                 'heavy chain FW1, heavy chain CDR1, heavy chain FW2, heavy chain CDR2, heavy chain FW3, heavy chain CDR3, heavy chain FW4\n')
133 |         for sname in seq_id:
134 |             fi.write(sname + ',' + ','.join(getSequenceHL_region(sname)) + '\n')
135 | 
136 | 
def feature_distribution():
    """Count how often each feature value occurs and write one CSV per column.

    Rows are built the same way feature() builds them: the sequence id
    followed by the names of all features whose vector entry equals 1. For
    row columns 1..11 the values are counted and written, most frequent
    first, to './Features_distribution_<type>.csv' as '<value>,<count>' lines.
    <type> is the first two '_'-separated parts of the most frequent value,
    or only the first part for column 11.

    Previously the rows were created empty and never filled, so the first
    column lookup raised IndexError for any non-empty seq_id.

    NOTE(review): the fixed range 1..11 and the special case for column 11
    appear to assume the paired-chain column layout of feature()
    (4 germline + 6 canonical columns, then pI) - confirm before using this
    with IF_ONLY_HEAVY = True.
    """
    from collections import Counter

    rows = []
    for i, sname in enumerate(seq_id):
        row = [sname]
        row.extend(AllFeatureNames[idx]
                   for idx, f in enumerate(AllFeatureVectors[i]) if f == 1)
        rows.append(row)

    for col in range(1, 12):
        # Rows with fewer active features than expected simply do not
        # contribute to this column instead of aborting the whole run.
        values = [row[col] for row in rows if len(row) > col]
        if not values:
            continue

        # Sorted by count, descending; ties keep first-seen order.
        ranked = Counter(values).most_common()
        parts = ranked[0][0].split('_')
        if col == 11:
            feat_type = parts[0]
        else:
            feat_type = parts[0] + parts[1]
        # Separate name for the handle so it cannot be confused with the column index.
        with open('./Features_distribution_' + feat_type + '.csv', 'w') as out:
            for value, count in ranked:
                out.write(value + ',' + str(count) + '\n')
154 | 
def feature():
    """Write the active feature names of every sequence to a CSV file.

    The file is '../results/<SET_NAME>_Features.csv'. Each row starts with
    the sequence id and is followed by the name of every feature whose entry
    in that sequence's row of AllFeatureVectors equals 1. The header lists
    the light-chain columns only when IF_ONLY_HEAVY is false.
    """
    rows = []
    for pos, sid in enumerate(seq_id):
        active = [AllFeatureNames[col]
                  for col, flag in enumerate(AllFeatureVectors[pos]) if flag == 1]
        rows.append([sid] + active)

    with open('../results/' + SET_NAME + '_Features.csv', 'w') as out:
        paired = not IF_ONLY_HEAVY
        header = ['sequence id, ']
        if paired:
            header.append('light chain V region, light chain J region, ')
        header.append('heavy chain V region, heavy chain J region, ')
        if paired:
            header.append('Canonical L1, Canonical L2, Canonical L3, ')
        header.append('Canonical H1, Canonical H2, Canonical H3, ')
        header.append('PI, frequent positional motif\n')
        out.write(''.join(header))

        for row in rows:
            out.write(','.join(row) + '\n')
175 | 
176 | 
def correlation_feature():
    """Write the pairwise correlation of all feature columns to a CSV file.

    Builds a DataFrame from AllFeatureVectors (columns named by
    AllFeatureNames), takes DataFrame.corr() with its default settings and
    writes one '<feature 1>,<feature 2>,<coefficient>' line per unordered
    pair to '../results/Pearson_feature_correlation.csv'.
    """
    frame = pd.DataFrame(AllFeatureVectors, columns=AllFeatureNames)
    pearson = np.array(frame.corr())
    total = len(AllFeatureNames)

    with open('../results/Pearson_feature_correlation.csv', 'w') as out:
        out.write('Feature value 1, Feature value 2, Pearson coefficient\n')
        for row in range(total):
            first = AllFeatureNames[row]
            # Upper triangle only: each pair appears once, no self-pairs.
            for col in range(row + 1, total):
                out.write(first + ',' + AllFeatureNames[col] + ',' + str(pearson[row][col]) + '\n')
192 | 
193 | 
194 | 
195 |     # data.to_csv(r'../results/Feature_test.csv', header=True)
196 | 
197 |     # fig = plt.figure(figsize=(100, 70))
198 |     # ax = fig.add_subplot(111)
199 |     # cax = ax.matshow(corr, cmap='seismic', vmin=-1, vmax =1)
200 |     # fig.colorbar(cax)
201 |     # ticks = np.arange(0, len(data.columns),1)
202 |     # ax.set_xticks(ticks)
203 |     # plt.xticks(rotation=90)
204 |     # ax.set_yticks(ticks)
205 |     # ax.set_xticklabels(data.columns)
206 |     # ax.set_yticklabels(data.columns)
207 |     # plt.savefig('../results/feature_correlation.png')
208 |     # corr = pd.DataFrame(corr, index=AllFeatureNames, columns=AllFeatureNames)
209 |     ###### display pairwise correlation value
210 |     # au_corr = corr.where(np.triu(np.ones(corr.shape), k=1).astype(np.bool))
211 |     # au_corr = au_corr.stack().sort_values(ascending=False)
212 |     # au_corr = corr.unstack()
213 |     # au_corr.columns = [' 1', 'Feature 2', 'Pearson Correlation Value']
214 |     # au_corr = pd.DataFrame(au_corr.values, columns = ['Feature 1, Feature 2, Pearson Correlation Value'])
215 |     # au_corr.to_csv(r'../results/Pearson_feature_correlation.csv', header=True)
216 |     # print(len(au_corr))
217 | 
218 |     # print(AllFeatureVectors[:, AllFeatureNames.index('Germ_LJ_IGKJ3*01')])
219 |     # print(AllFeatureVectors[:, AllFeatureNames.index('Canonical_L2_0')])
220 | 
221 | # def JaccardCoefficientAnalysis():
222 | #     df = pd.DataFrame(AllFeatureVectors, columns=AllFeatureNames)
223 | #
224 | #     interest_feature=['Germ_HV_IGHV3-23*01', 'Canonical_H2_6', 'Germ_HJ_IGHJ4*02', 'Germ_HJ_IGHJ6*01', 'Germ_LV_IGKV1D-39*01',
225 | #                       'Canonical_H2_5', 'Germ_HJ_IGHJ4*01']
226 | #     jac_sim = np.eye(len(AllFeatureNames))
227 | #     for i in range(len(AllFeatureNames)):
228 | #         for j in range(i+1, len(AllFeatureNames)):
229 | #             if AllFeatureNames[i].startswith('Motif') or AllFeatureNames[j].startswith('Motif'):
230 | #                 continue
231 | #             a = AllFeatureVectors[:, i]
232 | #             b = AllFeatureVectors[:, j]
233 | #             aandb =0
234 | #             aorb = 0
235 | #             for k in range(len(a)):
236 | #                 if a[k]==b[k] and a[k]==1:
237 | #                     aandb +=1
238 | #                 if a[k]==1 or b[k]==1:
239 | #                     aorb +=1
240 | #             if aorb==0:
241 | #                 jac_tmp=0
242 | #             else:
243 | #                 jac_tmp = float(aandb)/aorb
244 | #             if AllFeatureNames[i] in interest_feature and AllFeatureNames[j] in interest_feature:
245 | #                 print(AllFeatureNames[i], AllFeatureNames[j], jac_tmp)
246 | #
247 | #             jac_sim[i][j]=jac_tmp
248 | #             jac_sim[j][i]=jac_tmp
249 | #
250 | #
251 | #     with open('../results/Jaccard_feature_coefficient.csv', 'w') as fi:
252 | #         fi.write('Feature value 1, Feature value 2, Jaccard coefficient\n')
253 | #         for i in range(len(AllFeatureNames)):
254 | #             for j in range(i+1, len(AllFeatureNames)):
255 | #                 if AllFeatureNames[i].startswith('Motif') or AllFeatureNames[j].startswith('Motif'):
256 | #                     continue
257 | #                 fi.write(AllFeatureNames[i]+ ','+AllFeatureNames[j]+','+ str(jac_sim[i][j])+'\n')
258 | #
259 | #
260 | #     fig = plt.figure(figsize=(100, 70))
261 | #     ax = fig.add_subplot(111)
262 | #     cax = ax.matshow(jac_sim, cmap='Blues', vmin=0, vmax =1)
263 | #     fig.colorbar(cax)
264 | #     ticks = np.arange(0, len(df.columns),1)
265 | #     ax.set_xticks(ticks)
266 | #     plt.xticks(rotation=90)
267 | #     ax.set_yticks(ticks)
268 | #     ax.set_xticklabels(df.columns)
269 | #     ax.set_yticklabels(df.columns)
270 | #     plt.savefig('../results/feature_coefficient.png')
271 | #
272 | #     # print(AllFeatureVectors[:,AllFeatureNames.index('Germ_LJ_IGKJ3*01')])
273 | #     # print(AllFeatureVectors[:,AllFeatureNames.index('Canonical_L2_0*01')])
274 | #     # where(np.triu(np.ones(jac_sim.shape), k=1).astype(np.bool))
275 | #     # au_jac = jac_sim.where(np.triu(np.ones(jac_sim.shape), k=0).astype(np.bool))
276 | #     # au_jac = au_jac.stack().sort_values(ascending=False)
277 | #     # au_jac = jac_sim.unstack()
278 | #     # print(len(au_jac))
279 | #     # au_jac.to_csv(r'../results/Jaccard_feature_coefficient.csv', header=True)
280 | 
def _jaccard_matrix(vectors, names):
    """Return the pairwise Jaccard similarity of the feature columns in `vectors`.

    A feature counts as present in a row when its entry equals 1. Entry
    (i, j) is |i and j| / |i or j|, or 0 when neither feature occurs in any
    row. The diagonal is 1, and every pair involving a feature whose name
    starts with 'Motif' is left at 0 (those pairs are skipped).
    """
    n_feat = len(names)
    hits = (np.asarray(vectors)[:, :n_feat] == 1).astype(np.int64)
    both = hits.T @ hits                      # rows where both features are present
    totals = hits.sum(axis=0)
    either = totals[:, None] + totals[None, :] - both   # rows where at least one is present

    sim = np.zeros((n_feat, n_feat))
    np.divide(both, either, out=sim, where=either > 0)  # stays 0 where the union is empty

    motif = np.array([name.startswith('Motif') for name in names], dtype=bool)
    sim[motif[:, None] | motif[None, :]] = 0
    np.fill_diagonal(sim, 1)
    return sim


def JaccardCoefficientAnalysis():
    """Write Jaccard coefficients of all non-motif feature pairs to a CSV file.

    The first DatasetSize[0] rows of AllFeatureVectors form one subset and
    the remaining rows the other; the header labels them 'reference set' and
    'MMP-targeting set'. For every unordered pair of features whose names do
    not start with 'Motif', one line
    '<feature 1>,<feature 2>,<coefficient subset 1>,<coefficient subset 2>'
    is written to '../results/<SET_NAME>_Jaccard Feature Coefficient.csv'.

    The coefficients are identical to the former per-element loops; the
    counting is done once per subset with NumPy instead of in Python.
    """
    PDB_size = DatasetSize[0]
    vectors = np.asarray(AllFeatureVectors)

    jac_sim_PDB = _jaccard_matrix(vectors[:PDB_size], AllFeatureNames)
    jac_sim_MMP = _jaccard_matrix(vectors[PDB_size:], AllFeatureNames)

    with open('../results/'+SET_NAME+'_Jaccard Feature Coefficient.csv', 'w') as fi:
        fi.write('Feature value 1, Feature value 2, Jaccard coefficient for reference set, Jaccard coefficient for MMP-targeting set\n')
        for i in range(len(AllFeatureNames)):
            for j in range(i+1, len(AllFeatureNames)):
                if AllFeatureNames[i].startswith('Motif') or AllFeatureNames[j].startswith('Motif'):
                    continue
                fi.write(AllFeatureNames[i]+ ','+AllFeatureNames[j]+','+ str(jac_sim_PDB[i][j])+','+ str(jac_sim_MMP[i][j])+'\n')
# Script entry point: export the sequences, extract the feature vectors, then export
# the per-sequence features and the Jaccard coefficients.
if __name__=='__main__':
    # These two only need Amino/Num/seq_id, which are set at module level.
    sequence_raw()
    sequence_region()
    OneHotGerm, GermFeatureNames = extract.GetOneHotGerm(Germ, DatasetSize, DatasetName)
    OneHotCanon, CanonFeatureNames = extract.GetOneHotCanon(canonical_direct, Amino, Num, DatasetSize, DatasetName)
    CDRH3 = extract.GetCDRH3(Amino, Num)
    OneHotPI, PIFeatureNames = extract.GetOneHotPI(CDRH3, DatasetSize, DatasetName)
    MultiHotMotif, MotifFeatureNames = extract.MultiHotMotif(CDRH3, DatasetSize, DatasetName)
    # AllFeatureVectors / AllFeatureNames become module globals here; feature(),
    # correlation_feature() and JaccardCoefficientAnalysis() read them by name, so they
    # must be assigned before those calls (and do not exist when this file is imported).
    AllFeatureVectors, AllFeatureNames, _, _ = extract.GetFeatureVectors(OneHotGerm, GermFeatureNames, OneHotCanon, CanonFeatureNames, OneHotPI, PIFeatureNames, MultiHotMotif, MotifFeatureNames)

    feature()
    # correlation_feature()
    JaccardCoefficientAnalysis()
357 | 
358 | 
359 | 
360 | 
361 | 
362 | 
363 | 
364 | 


--------------------------------------------------------------------------------
/results/MMP-PDB/MMP-cluster_RankFisherAndFS.csv:
--------------------------------------------------------------------------------
  1 | Feature type, Feature value,"Fisher Exact Test 
  2 | p-value"," Feature selection 
  3 | (thereshold = 0.0025)","Average rank of 
  4 | Fisher Exact Test","Rank of 
  5 | feature selection"
  6 | Canonical H1,0,1.00E+00,0.00070915,,
  7 | Canonical H1,1,1.48E-05,0.003064429,11.86,38
  8 | Canonical H1,2,1.00E+00,0.000198741,,
  9 | Canonical H1,3,1.00E+00,0.001203345,,
 10 | Canonical H1,4,1.00E+00,0,,
 11 | Canonical H2,0,1.00E+00,0.029372109,,5
 12 | Canonical H2,3,1.00E+00,0,,
 13 | Canonical H2,5,1.00E+00,0.038609987,,4
 14 | Canonical H2,6,1.71E-39,0.090003385,2,2
 15 | Canonical H3,0,9.99E-01,0.004348375,,27
 16 | Canonical H3,1,9.97E-01,0.003626023,,35
 17 | Canonical H3,2,2.79E-03,0.005589529,31.75,20
 18 | Canonical H3,3,9.33E-03,0.001775536,38.91,
 19 | Canonical L1,0,9.99E-01,0.006550694,,18
 20 | Canonical L1,2,1.00E+00,0,,
 21 | Canonical L1,3,1.00E+00,0,,
 22 | Canonical L1,4,1.45E-03,0.003968341,29.65,33
 23 | Canonical L1,5,1.12E-01,0.002855355,,44
 24 | Canonical L2,0,1.00E+00,0,,
 25 | Canonical L3,0,1.00E+00,0,,
 26 | Germ HJ,IGHJ1*01,9.89E-01,0.002300919,,
 27 | Germ HJ,IGHJ1*02,1.00E+00,0.001660937,,
 28 | Germ HJ,IGHJ1*03,1.00E+00,0,,
 29 | Germ HJ,IGHJ2*01,1.00E+00,0.005897943,,19
 30 | Germ HJ,IGHJ2*03,1.00E+00,0,,
 31 | Germ HJ,IGHJ3*01,1.00E+00,0.015279146,,8
 32 | Germ HJ,IGHJ3*02,2.20E-05,0.011174225,15.66,11
 33 | Germ HJ,IGHJ4*01,1.00E+00,0.029354208,,6
 34 | Germ HJ,IGHJ4*02,2.79E-23,0.051628515,3,3
 35 | Germ HJ,IGHJ5*01,9.97E-01,0.007708994,,15
 36 | Germ HJ,IGHJ5*02,2.20E-02,0.004523093,45.14,24
 37 | Germ HJ,IGHJ6*01,1.58E-16,0.028791732,4,7
 38 | Germ HJ,IGHJ6*02,1.00E+00,0.014648606,,9
 39 | Germ HJ,IGHJ6*04,7.96E-02,0.002245223,,
 40 | Germ HV,IGHV1-12*01,1.00E+00,0,,
 41 | Germ HV,IGHV1-14*01,1.00E+00,0,,
 42 | Germ HV,IGHV1-15*01,1.00E+00,0,,
 43 | Germ HV,IGHV1-18*01,1.00E+00,0,,
 44 | Germ HV,IGHV1-18*04,1.00E+00,0.000532851,,
 45 | Germ HV,IGHV1-19*01,1.00E+00,0,,
 46 | Germ HV,IGHV1-2*02,6.28E-01,0.003518566,,36
 47 | Germ HV,IGHV1-2*04,1.00E+00,0,,
 48 | Germ HV,IGHV1-22*01,1.00E+00,0,,
 49 | Germ HV,IGHV1-24*01,1.00E+00,0,,
 50 | Germ HV,IGHV1-26*01,1.00E+00,0,,
 51 | Germ HV,IGHV1-3*01,1.00E+00,0.000294827,,
 52 | Germ HV,IGHV1-34*01,1.00E+00,0,,
 53 | Germ HV,IGHV1-34*02,1.00E+00,0,,
 54 | Germ HV,IGHV1-37*01,1.00E+00,0,,
 55 | Germ HV,IGHV1-39*01,1.00E+00,0,,
 56 | Germ HV,IGHV1-4*01,1.00E+00,0,,
 57 | Germ HV,IGHV1-42*01,1.00E+00,0,,
 58 | Germ HV,IGHV1-46*01,1.00E+00,0.000184878,,
 59 | Germ HV,IGHV1-47*01,1.00E+00,0,,
 60 | Germ HV,IGHV1-5*01,1.00E+00,0.000178421,,
 61 | Germ HV,IGHV1-50*01,1.00E+00,0,,
 62 | Germ HV,IGHV1-53*01,1.00E+00,0,,
 63 | Germ HV,IGHV1-54*01,1.00E+00,0,,
 64 | Germ HV,IGHV1-55*01,1.00E+00,0,,
 65 | Germ HV,IGHV1-59*01,1.00E+00,0,,
 66 | Germ HV,IGHV1-6*02,1.00E+00,0,,
 67 | Germ HV,IGHV1-61*01,1.00E+00,0.000155636,,
 68 | Germ HV,IGHV1-63*02,1.00E+00,0,,
 69 | Germ HV,IGHV1-66*01,1.00E+00,0,,
 70 | Germ HV,IGHV1-67*01,1.00E+00,0,,
 71 | Germ HV,IGHV1-69-2*01,1.00E+00,0,,
 72 | Germ HV,IGHV1-69*01,1.00E+00,0,,
 73 | Germ HV,IGHV1-69*02,1.00E+00,0,,
 74 | Germ HV,IGHV1-69*08,1.00E+00,0.000118438,,
 75 | Germ HV,IGHV1-69*10,1.00E+00,0,,
 76 | Germ HV,IGHV1-69*14,1.00E+00,8.53E-06,,
 77 | Germ HV,IGHV1-69D*01,1.00E+00,5.27E-05,,
 78 | Germ HV,IGHV1-7*01,1.00E+00,0,,
 79 | Germ HV,IGHV1-72*01,1.00E+00,0,,
 80 | Germ HV,IGHV1-8*01,1.00E+00,0.000303056,,
 81 | Germ HV,IGHV1-81*01,1.00E+00,0,,
 82 | Germ HV,IGHV1-82*01,1.00E+00,2.15E-05,,
 83 | Germ HV,IGHV1-84*01,1.00E+00,0,,
 84 | Germ HV,IGHV1-85*01,1.00E+00,0,,
 85 | Germ HV,IGHV1-9*01,1.00E+00,0.001006833,,
 86 | Germ HV,IGHV10-1*02,1.00E+00,0,,
 87 | Germ HV,IGHV10-3*01,1.00E+00,0,,
 88 | Germ HV,IGHV10S3*01,1.00E+00,0,,
 89 | Germ HV,IGHV13-2*02,1.00E+00,0,,
 90 | Germ HV,IGHV14-1*01,1.00E+00,0,,
 91 | Germ HV,IGHV14-1*02,1.00E+00,0,,
 92 | Germ HV,IGHV14-3*02,1.00E+00,2.62E-05,,
 93 | Germ HV,IGHV14-4*02,1.00E+00,0.000340449,,
 94 | Germ HV,IGHV1S14*01,1.00E+00,0,,
 95 | Germ HV,IGHV1S29*02,1.00E+00,0,,
 96 | Germ HV,IGHV1S45*01,1.00E+00,0,,
 97 | Germ HV,IGHV1S53*02,1.00E+00,0.000151308,,
 98 | Germ HV,IGHV1S61*01,1.00E+00,0,,
 99 | Germ HV,IGHV1S69*01,1.00E+00,0,,
100 | Germ HV,IGHV1S7*01,1.00E+00,6.14E-05,,
101 | Germ HV,IGHV2-2*03,1.00E+00,0,,
102 | Germ HV,IGHV2-5*02,1.00E+00,0,,
103 | Germ HV,IGHV2-5*09,1.00E+00,0,,
104 | Germ HV,IGHV2-6-4*01,1.00E+00,0,,
105 | Germ HV,IGHV2-6-5*01,1.00E+00,0,,
106 | Germ HV,IGHV2-6-7*02,1.00E+00,0,,
107 | Germ HV,IGHV2-70*13,1.00E+00,0,,
108 | Germ HV,IGHV2-9*02,5.98E-01,0.002199624,,
109 | Germ HV,IGHV2S33*01,1.00E+00,0,,
110 | Germ HV,IGHV2S5*01,1.00E+00,0,,
111 | Germ HV,IGHV3-1*01,1.00E+00,0,,
112 | Germ HV,IGHV3-11*01,1.00E+00,0,,
113 | Germ HV,IGHV3-11*05,1.00E+00,0,,
114 | Germ HV,IGHV3-13*04,1.00E+00,0,,
115 | Germ HV,IGHV3-15*04,1.00E+00,0,,
116 | Germ HV,IGHV3-15*06,1.00E+00,0,,
117 | Germ HV,IGHV3-15*07,1.00E+00,0.000143431,,
118 | Germ HV,IGHV3-2*02,1.00E+00,0,,
119 | Germ HV,IGHV3-20*01,1.00E+00,0,,
120 | Germ HV,IGHV3-21*03,1.00E+00,0.00016636,,
121 | Germ HV,IGHV3-23*01,1.75E-72,0.419321293,1,1
122 | Germ HV,IGHV3-23*02,1.00E+00,0,,
123 | Germ HV,IGHV3-23*03,5.00E-01,0.003982053,,32
124 | Germ HV,IGHV3-23*04,1.00E+00,0.000206359,,
125 | Germ HV,IGHV3-30*02,1.00E+00,0,,
126 | Germ HV,IGHV3-30*03,1.00E+00,0.000711737,,
127 | Germ HV,IGHV3-30*10,1.00E+00,0,,
128 | Germ HV,IGHV3-30*11,1.00E+00,0.00283518,,45
129 | Germ HV,IGHV3-30*18,1.00E+00,1.79E-05,,
130 | Germ HV,IGHV3-33*01,6.06E-02,0.002916101,,43
131 | Germ HV,IGHV3-33*03,1.00E+00,0,,
132 | Germ HV,IGHV3-48*01,1.00E+00,0.000344457,,
133 | Germ HV,IGHV3-53*01,5.00E-01,0.002345587,,
134 | Germ HV,IGHV3-6*02,1.00E+00,0,,
135 | Germ HV,IGHV3-64*01,1.00E+00,0,,
136 | Germ HV,IGHV3-66*02,1.00E+00,0,,
137 | Germ HV,IGHV3-66*03,1.00E+00,0,,
138 | Germ HV,IGHV3-7*02,1.00E+00,0.00028511,,
139 | Germ HV,IGHV3-72*01,1.00E+00,0,,
140 | Germ HV,IGHV3-73*01,1.00E+00,0,,
141 | Germ HV,IGHV3-74*01,1.00E+00,0,,
142 | Germ HV,IGHV3-74*03,1.00E+00,1.34E-05,,
143 | Germ HV,IGHV3-8*02,1.00E+00,0,,
144 | Germ HV,IGHV3-9*01,1.00E+00,0.00076671,,
145 | Germ HV,IGHV3S1*01,1.00E+00,0,,
146 | Germ HV,IGHV4-1*02,1.00E+00,0,,
147 | Germ HV,IGHV4-2*02,1.00E+00,0,,
148 | Germ HV,IGHV4-30-4*07,1.00E+00,0,,
149 | Germ HV,IGHV4-31*02,1.00E+00,0,,
150 | Germ HV,IGHV4-31*05,1.00E+00,0,,
151 | Germ HV,IGHV4-34*01,1.00E+00,0.000527108,,
152 | Germ HV,IGHV4-38-2*01,1.00E+00,0,,
153 | Germ HV,IGHV4-38-2*02,1.00E+00,0,,
154 | Germ HV,IGHV4-39*07,1.00E+00,3.88E-05,,
155 | Germ HV,IGHV4-4*02,1.00E+00,0,,
156 | Germ HV,IGHV4-4*07,1.00E+00,0,,
157 | Germ HV,IGHV4-4*08,1.00E+00,0,,
158 | Germ HV,IGHV4-59*02,1.00E+00,0,,
159 | Germ HV,IGHV4-59*03,1.00E+00,0,,
160 | Germ HV,IGHV4-59*04,1.00E+00,0,,
161 | Germ HV,IGHV4-59*05,1.00E+00,0,,
162 | Germ HV,IGHV4-59*07,1.00E+00,0,,
163 | Germ HV,IGHV4-59*08,1.00E+00,7.30E-05,,
164 | Germ HV,IGHV5-10-1*04,1.00E+00,0.000203388,,
165 | Germ HV,IGHV5-12-2*01,1.00E+00,0,,
166 | Germ HV,IGHV5-12*01,1.00E+00,0,,
167 | Germ HV,IGHV5-15*02,1.00E+00,0,,
168 | Germ HV,IGHV5-17*02,1.00E+00,0.000348178,,
169 | Germ HV,IGHV5-4*02,1.00E+00,0,,
170 | Germ HV,IGHV5-51*01,1.00E+00,0.000351596,,
171 | Germ HV,IGHV5-6-1*01,1.00E+00,0,,
172 | Germ HV,IGHV5-6-2*01,1.00E+00,0.000314301,,
173 | Germ HV,IGHV5-6-3*01,1.00E+00,0,,
174 | Germ HV,IGHV5-9-3*01,1.00E+00,3.18E-06,,
175 | Germ HV,IGHV5-9*01,5.00E-01,0.004882342,,22
176 | Germ HV,IGHV5-9*02,1.00E+00,0.000445259,,
177 | Germ HV,IGHV5-9*03,1.00E+00,0,,
178 | Germ HV,IGHV5S4*01,1.00E+00,0,,
179 | Germ HV,IGHV5S9*01,1.00E+00,0,,
180 | Germ HV,IGHV6-3*02,1.00E+00,0,,
181 | Germ HV,IGHV6-6*01,1.00E+00,0.000228354,,
182 | Germ HV,IGHV6-6*02,1.00E+00,0,,
183 | Germ HV,IGHV6-7*02,1.00E+00,0,,
184 | Germ HV,IGHV7-3*02,1.00E+00,0,,
185 | Germ HV,IGHV7-3*04,1.00E+00,0,,
186 | Germ HV,IGHV7-4-1*02,1.00E+00,0,,
187 | Germ HV,IGHV8-12*01,1.00E+00,0,,
188 | Germ HV,IGHV8-5*01,1.00E+00,0,,
189 | Germ HV,IGHV8-8*01,1.00E+00,0.000214988,,
190 | Germ HV,IGHV9-1*02,1.00E+00,0,,
191 | Germ HV,IGHV9-2-1*01,1.00E+00,0.000557316,,
192 | Germ HV,IGHV9-3-1*01,1.00E+00,0,,
193 | Germ HV,IGHV9-3*01,1.00E+00,0,,
194 | Germ HV,IGHV9-4*02,1.00E+00,0,,
195 | Germ LJ,IGKJ1-1*03,1.00E+00,0,,
196 | Germ LJ,IGKJ1-2*02,1.00E+00,0.001764855,,
197 | Germ LJ,IGKJ1-2*03,1.00E+00,0,,
198 | Germ LJ,IGKJ1*01,9.88E-01,0.00544662,,21
199 | Germ LJ,IGKJ1*02,1.00E+00,6.77E-05,,
200 | Germ LJ,IGKJ2-1*01,1.00E+00,0,,
201 | Germ LJ,IGKJ2-3*01,1.00E+00,0,,
202 | Germ LJ,IGKJ2*01,8.87E-01,0.002790228,,46
203 | Germ LJ,IGKJ2*02,1.00E+00,0,,
204 | Germ LJ,IGKJ2*03,1.00E+00,0,,
205 | Germ LJ,IGKJ3*01,1.29E-05,0.014631149,13.14,10
206 | Germ LJ,IGKJ4*01,2.45E-03,0.000513796,31.39,
207 | Germ LJ,IGKJ4*02,1.00E+00,0,,
208 | Germ LJ,IGKJ5*01,9.68E-01,0.001509162,,
209 | Germ LJ,IGLJ1*01,4.22E-01,0.003266883,,37
210 | Germ LJ,IGLJ2*01,1.00E+00,0.000485164,,
211 | Germ LJ,IGLJ3*01,3.28E-01,0.000824574,,
212 | Germ LJ,IGLJ3*02,7.95E-01,0.000667714,,
213 | Germ LJ,IGLJ6*01,2.85E-01,0.000181587,,
214 | Germ LJ,IGLJ7*01,6.60E-01,0,,
215 | Germ LV,IGKV1-110*01,1.00E+00,0,,
216 | Germ LV,IGKV1-110*02,1.00E+00,0,,
217 | Germ LV,IGKV1-117*01,9.41E-01,0,,
218 | Germ LV,IGKV1-117*02,1.00E+00,0,,
219 | Germ LV,IGKV1-12*01,3.58E-03,0.002993798,39.91,40
220 | Germ LV,IGKV1-133*01,1.00E+00,0,,
221 | Germ LV,IGKV1-16*01,1.24E-01,0.008987967,,14
222 | Germ LV,IGKV1-17*01,5.63E-01,0.001137982,,
223 | Germ LV,IGKV1-17*03,5.00E-01,0,,
224 | Germ LV,IGKV1-27*01,5.80E-02,0.001384217,,
225 | Germ LV,IGKV1-39*01,1.00E+00,0.001736249,,
226 | Germ LV,IGKV1-5*01,1.00E+00,0,,
227 | Germ LV,IGKV1-5*03,7.09E-01,0.000669564,,
228 | Germ LV,IGKV1-6*01,2.49E-01,1.51E-05,,
229 | Germ LV,IGKV1-88*01,1.00E+00,0,,
230 | Germ LV,IGKV1-9*01,3.38E-01,0.001313416,,
231 | Germ LV,IGKV1-NL1*01,1.00E+00,0,,
232 | Germ LV,IGKV10-94*02,1.00E+00,4.83E-05,,
233 | Germ LV,IGKV10-94*05,1.00E+00,0,,
234 | Germ LV,IGKV10-96*02,1.00E+00,0.001710628,,
235 | Germ LV,IGKV10-96*04,1.00E+00,1.14E-05,,
236 | Germ LV,IGKV12-41*01,1.00E+00,4.10E-06,,
237 | Germ LV,IGKV12-44*01,1.00E+00,0,,
238 | Germ LV,IGKV12-46*01,1.00E+00,0,,
239 | Germ LV,IGKV12S24*01,1.00E+00,0,,
240 | Germ LV,IGKV13-84*01,1.00E+00,0,,
241 | Germ LV,IGKV14-100*01,1.00E+00,0,,
242 | Germ LV,IGKV14-111*01,1.00E+00,0,,
243 | Germ LV,IGKV14-126*01,1.00E+00,0,,
244 | Germ LV,IGKV16-104*01,1.00E+00,0,,
245 | Germ LV,IGKV17-121*01,1.00E+00,0,,
246 | Germ LV,IGKV17-127*01,1.00E+00,0.000376466,,
247 | Germ LV,IGKV19-93*02,1.00E+00,0,,
248 | Germ LV,IGKV1D-13*01,7.38E-01,4.45E-07,,
249 | Germ LV,IGKV1D-33*01,8.24E-01,0.000615482,,
250 | Germ LV,IGKV1D-39*01,4.88E-10,0.009239291,5.03,13
251 | Germ LV,IGKV1S10*01,1.00E+00,0,,
252 | Germ LV,IGKV1S11*01,5.75E-01,0,,
253 | Germ LV,IGKV1S12*01,1.00E+00,0,,
254 | Germ LV,IGKV1S14*01,1.00E+00,0,,
255 | Germ LV,IGKV1S15*01,1.00E+00,0,,
256 | Germ LV,IGKV1S17*01,1.00E+00,0,,
257 | Germ LV,IGKV1S2*01,8.07E-01,0,,
258 | Germ LV,IGKV1S2*02,1.00E+00,0,,
259 | Germ LV,IGKV1S22*01,1.00E+00,0,,
260 | Germ LV,IGKV1S24*01,1.00E+00,0,,
261 | Germ LV,IGKV1S3*01,1.00E+00,0,,
262 | Germ LV,IGKV1S3*02,1.00E+00,0,,
263 | Germ LV,IGKV1S5*01,1.00E+00,0,,
264 | Germ LV,IGKV2-109*01,1.00E+00,0,,
265 | Germ LV,IGKV2-109*03,1.00E+00,0,,
266 | Germ LV,IGKV2-112*01,1.00E+00,0,,
267 | Germ LV,IGKV2-137*01,1.00E+00,0,,
268 | Germ LV,IGKV2-28*01,1.36E-01,0.004282447,,28
269 | Germ LV,IGKV2-29*02,1.00E+00,0,,
270 | Germ LV,IGKV2-30*01,6.91E-01,1.29E-05,,
271 | Germ LV,IGKV22S7*01,1.00E+00,0,,
272 | Germ LV,IGKV2D-29*02,1.00E+00,0,,
273 | Germ LV,IGKV2S3*01,1.00E+00,0,,
274 | Germ LV,IGKV3-1*01,1.00E+00,5.72E-05,,
275 | Germ LV,IGKV3-10*01,1.00E+00,0,,
276 | Germ LV,IGKV3-11*01,5.67E-04,0.000541447,25.77,
277 | Germ LV,IGKV3-11*02,5.00E-01,0,,
278 | Germ LV,IGKV3-12*01,1.00E+00,0,,
279 | Germ LV,IGKV3-2*01,1.00E+00,0,,
280 | Germ LV,IGKV3-20*01,4.99E-05,0.007138248,15.98,16
281 | Germ LV,IGKV3-3*01,1.00E+00,0,,
282 | Germ LV,IGKV3-4*01,1.00E+00,0,,
283 | Germ LV,IGKV3-5*01,1.00E+00,0.00027996,,
284 | Germ LV,IGKV3-7*01,1.00E+00,0,,
285 | Germ LV,IGKV3D-11*01,1.00E+00,0,,
286 | Germ LV,IGKV3D-15*01,3.74E-01,0.002168255,,
287 | Germ LV,IGKV3D-20*01,2.49E-01,3.88E-06,,
288 | Germ LV,IGKV3S3*01,5.00E-01,0,,
289 | Germ LV,IGKV3S9*01,1.00E+00,0,,
290 | Germ LV,IGKV4-1*01,7.34E-01,0.00259438,,49
291 | Germ LV,IGKV4-53*01,1.00E+00,0,,
292 | Germ LV,IGKV4-55*01,1.00E+00,0,,
293 | Germ LV,IGKV4-57-1*01,1.00E+00,0,,
294 | Germ LV,IGKV4-57*01,1.00E+00,0,,
295 | Germ LV,IGKV4-59*01,1.00E+00,0,,
296 | Germ LV,IGKV4-61*01,1.00E+00,0,,
297 | Germ LV,IGKV4-63*01,1.00E+00,0,,
298 | Germ LV,IGKV4-68*01,1.00E+00,0,,
299 | Germ LV,IGKV4-70*01,1.00E+00,0,,
300 | Germ LV,IGKV4-72*01,1.00E+00,0,,
301 | Germ LV,IGKV4-74*01,1.00E+00,0,,
302 | Germ LV,IGKV4-79*01,1.00E+00,0,,
303 | Germ LV,IGKV4-80*01,1.00E+00,0,,
304 | Germ LV,IGKV4-81*01,1.00E+00,0,,
305 | Germ LV,IGKV4-86*01,1.00E+00,0,,
306 | Germ LV,IGKV4-91*01,1.00E+00,0,,
307 | Germ LV,IGKV5-39*01,1.00E+00,0.000331762,,
308 | Germ LV,IGKV5-43*01,1.00E+00,0,,
309 | Germ LV,IGKV5-48*01,1.00E+00,0.000960814,,
310 | Germ LV,IGKV6-14*01,1.00E+00,2.54E-05,,
311 | Germ LV,IGKV6-15*01,1.00E+00,4.80E-05,,
312 | Germ LV,IGKV6-17*01,8.76E-01,0.002060703,,
313 | Germ LV,IGKV6-20*01,1.00E+00,0.000203542,,
314 | Germ LV,IGKV6-21*01,1.00E+00,0.000638102,,
315 | Germ LV,IGKV6-21*02,1.00E+00,0,,
316 | Germ LV,IGKV6-23*01,1.00E+00,0,,
317 | Germ LV,IGKV6-25*01,1.00E+00,0,,
318 | Germ LV,IGKV6-32*01,1.00E+00,0,,
319 | Germ LV,IGKV6-32*02,1.00E+00,0,,
320 | Germ LV,IGKV8-19*01,1.00E+00,0,,
321 | Germ LV,IGKV8-21*01,1.00E+00,0,,
322 | Germ LV,IGKV8-24*01,1.00E+00,9.87E-06,,
323 | Germ LV,IGKV8-27*01,1.00E+00,0,,
324 | Germ LV,IGKV8-28*01,1.00E+00,0,,
325 | Germ LV,IGKV8-30*01,1.00E+00,0,,
326 | Germ LV,IGKV9-120*01,1.00E+00,0,,
327 | Germ LV,IGKV9-124*01,1.00E+00,0,,
328 | Germ LV,IGLV1-10*01,1.00E+00,0,,
329 | Germ LV,IGLV1-36*01,1.00E+00,0,,
330 | Germ LV,IGLV1-40*01,5.65E-01,0,,
331 | Germ LV,IGLV1-40*03,1.00E+00,0,,
332 | Germ LV,IGLV1-44*01,2.20E-01,0.000696396,,
333 | Germ LV,IGLV1-47*01,1.64E-01,0.001464818,,
334 | Germ LV,IGLV1-47*02,2.70E-01,0.001099102,,
335 | Germ LV,IGLV1-51*01,1.00E+00,0.00029127,,
336 | Germ LV,IGLV1-51*02,1.00E+00,0.000544347,,
337 | Germ LV,IGLV1*01,1.00E+00,0.000329822,,
338 | Germ LV,IGLV1*02,1.00E+00,0,,
339 | Germ LV,IGLV2-11*01,2.49E-01,0.001072122,,
340 | Germ LV,IGLV2-14*01,2.57E-01,0.002751961,,47
341 | Germ LV,IGLV2-14*02,1.00E+00,0,,
342 | Germ LV,IGLV2-23*01,5.78E-01,4.83E-06,,
343 | Germ LV,IGLV2-23*02,1.00E+00,0,,
344 | Germ LV,IGLV2-8*01,1.77E-01,0.000704107,,
345 | Germ LV,IGLV2S1*01,3.14E-01,0.000799662,,
346 | Germ LV,IGLV2S9*01,1.00E+00,0,,
347 | Germ LV,IGLV3-1*01,7.82E-01,0.00093926,,
348 | Germ LV,IGLV3-10*01,1.00E+00,0,,
349 | Germ LV,IGLV3-19*01,5.78E-01,4.91E-07,,
350 | Germ LV,IGLV3-21*01,1.00E+00,0,,
351 | Germ LV,IGLV3-21*02,2.23E-01,0.0010809,,
352 | Germ LV,IGLV3-21*03,1.00E+00,0,,
353 | Germ LV,IGLV3-25*03,1.00E+00,0,,
354 | Germ LV,IGLV3-9*02,1.00E+00,0,,
355 | Germ LV,IGLV3*01,1.00E+00,2.63E-06,,
356 | Germ LV,IGLV5S10*01,1.00E+00,0,,
357 | Germ LV,IGLV6-57*01,5.00E-01,0,,
358 | Germ LV,IGLV6-57*02,1.00E+00,0,,
359 | Germ LV,IGLV7-43*01,1.00E+00,0,,
360 | Motif,10_FD,6.83E-03,0.001842351,35.99,
361 | Motif,10_MD,2.44E-02,0.002656346,46.63,48
362 | Motif,10_NG,7.46E-04,0,26.18,
363 | Motif,10_YY,3.01E-03,0.001841431,30.18,
364 | Motif,2_AY,2.45E-03,0.000718347,31.58,
365 | Motif,2_AYG,5.19E-07,0.004093965,9.95,30
366 | Motif,2_GG,3.29E-01,0.00048807,,
367 | Motif,2_YG,9.50E-06,0.004157068,11.55,29
368 | Motif,2_YY,3.01E-03,2.95E-05,36.18,
369 | Motif,3_YG,9.50E-06,0.000856833,10.55,
370 | Motif,3_YY,3.01E-03,0.00297028,33.18,41
371 | Motif,4_GD,1.82E-03,0.000754301,28.08,
372 | Motif,4_GDY,1.03E-05,3.70E-06,12.05,
373 | Motif,4_SV,9.73E-01,0,,
374 | Motif,4_SVT,1.00E+00,1.10E-05,,
375 | Motif,4_YD,5.54E-02,0.004696185,,23
376 | Motif,4_YY,3.01E-03,0.000362391,31.18,
377 | Motif,5_DY,3.58E-04,0.000492204,22.06,
378 | Motif,5_DYV,1.67E-05,0,14.44,
379 | Motif,5_FD,6.83E-03,8.46E-06,37.99,
380 | Motif,5_YY,3.01E-03,0.001704189,29.18,
381 | Motif,6_YA,1.00E+00,0.004067616,,31
382 | Motif,6_YD,5.54E-02,0.000727645,,
383 | Motif,6_YF,9.15E-01,0.000529252,,
384 | Motif,6_YV,7.20E-03,0.004395786,37.55,25
385 | Motif,6_YVG,2.20E-05,0.007033143,15.02,17
386 | Motif,6_YY,3.01E-03,0.002312894,34.18,
387 | Motif,7_FD,6.83E-03,0.001472312,38.99,
388 | Motif,7_MD,2.44E-02,0.000534293,47.63,
389 | Motif,7_VG,1.10E-04,0.004383491,18.22,26
390 | Motif,7_VGW,5.19E-07,0,8.75,
391 | Motif,8_GW,5.89E-04,0.000457018,22.57,
392 | Motif,8_GWN,1.11E-06,0.003937073,11.95,34
393 | Motif,8_MD,2.44E-02,0.002503496,45.63,50
394 | Motif,8_SA,1.00E+00,0.000719992,,
395 | Motif,8_YF,9.15E-01,6.92E-06,,
396 | Motif,8_YY,3.01E-03,0.00200915,32.18,
397 | Motif,9_AM,1.00E+00,0,,
398 | Motif,9_AMD,1.00E+00,0,,
399 | Motif,9_FD,6.83E-03,0.000525799,36.99,
400 | Motif,9_WN,3.34E-05,1.01E-05,16.75,
401 | Motif,9_YY,3.01E-03,0.010646544,35.18,12
402 | PI,0.0-3.5,8.32E-03,0.000693414,39.16,
403 | PI,3.5-3.9375,3.53E-03,0.001465018,34.51,
404 | PI,3.9375-4.375,6.74E-01,0.002927121,,42
405 | PI,4.375-5.25,9.54E-01,0.000670093,,
406 | PI,5.25-5.6875,9.80E-01,7.98E-05,,
407 | PI,5.6875-6.125,9.15E-01,0.003030013,,39
408 | PI,6.125-7.0,9.89E-01,0.001250475,,
409 | PI,7.0-14.0,9.92E-01,0.001905193,,


--------------------------------------------------------------------------------
/results/MMP-IGHV/IGHV_Jaccard Feature Coefficient.csv:
--------------------------------------------------------------------------------
  1 | Feature value 1, Feature value 2, Jaccard coefficient for MMP-IGHV-targeting set, Jaccard coefficient for IGHV-reference set
  2 | Canonical_H1_0,Canonical_H2_0,0,0
  3 | Canonical_H1_0,Canonical_H2_6,0,0.003383277
  4 | Canonical_H1_0,Canonical_H3_0,0,0.006116208
  5 | Canonical_H1_0,Canonical_H3_1,0,0.004878049
  6 | Canonical_H1_0,Canonical_H3_2,0,0.004938272
  7 | Canonical_H1_0,Canonical_H3_3,0,0.002617801
  8 | Canonical_H1_0,PI_0.0-3.5,0,0.002770083
  9 | Canonical_H1_0,PI_3.5-3.9375,0,0.004587156
 10 | Canonical_H1_0,PI_3.9375-4.375,0,0.001090513
 11 | Canonical_H1_0,PI_4.375-4.8125,0,0
 12 | Canonical_H1_0,PI_4.8125-5.25,0,0.007092199
 13 | Canonical_H1_0,PI_5.25-5.6875,0,0.005813953
 14 | Canonical_H1_0,PI_5.6875-6.125,0,0.003496503
 15 | Canonical_H1_0,PI_6.125-7.0,0,0.004255319
 16 | Canonical_H1_0,PI_7.0-14.0,0,0
 17 | Canonical_H1_1,Canonical_H1_0,0,0
 18 | Canonical_H1_1,Canonical_H1_2,0,0
 19 | Canonical_H1_1,Canonical_H2_0,0.052238806,0.044938615
 20 | Canonical_H1_1,Canonical_H2_6,0.947761194,0.951523546
 21 | Canonical_H1_1,Canonical_H3_0,0.141791045,0.072238944
 22 | Canonical_H1_1,Canonical_H3_1,0.074626866,0.044243688
 23 | Canonical_H1_1,Canonical_H3_2,0.246268657,0.09029868
 24 | Canonical_H1_1,Canonical_H3_3,0.537313433,0.791262136
 25 | Canonical_H1_1,PI_0.0-3.5,0.194029851,0.080379893
 26 | Canonical_H1_1,PI_3.5-3.9375,0.402985075,0.299097849
 27 | Canonical_H1_1,PI_3.9375-4.375,0.21641791,0.208893006
 28 | Canonical_H1_1,PI_4.375-4.8125,0.02238806,0.052826691
 29 | Canonical_H1_1,PI_4.8125-5.25,0.02238806,0.062065771
 30 | Canonical_H1_1,PI_5.25-5.6875,0.02238806,0.03659949
 31 | Canonical_H1_1,PI_5.6875-6.125,0.074626866,0.129226494
 32 | Canonical_H1_1,PI_6.125-7.0,0.014925373,0.050949514
 33 | Canonical_H1_1,PI_7.0-14.0,0.029850746,0.079240037
 34 | Canonical_H1_2,Canonical_H1_0,0,0
 35 | Canonical_H1_2,Canonical_H2_0,0,0
 36 | Canonical_H1_2,Canonical_H2_6,0,0.000241663
 37 | Canonical_H1_2,Canonical_H3_0,0,0
 38 | Canonical_H1_2,Canonical_H3_1,0,0
 39 | Canonical_H1_2,Canonical_H3_2,0,0.002544529
 40 | Canonical_H1_2,Canonical_H3_3,0,0
 41 | Canonical_H1_2,PI_0.0-3.5,0,0
 42 | Canonical_H1_2,PI_3.5-3.9375,0,0.000769231
 43 | Canonical_H1_2,PI_3.9375-4.375,0,0
 44 | Canonical_H1_2,PI_4.375-4.8125,0,0
 45 | Canonical_H1_2,PI_4.8125-5.25,0,0
 46 | Canonical_H1_2,PI_5.25-5.6875,0,0
 47 | Canonical_H1_2,PI_5.6875-6.125,0,0
 48 | Canonical_H1_2,PI_6.125-7.0,0,0
 49 | Canonical_H1_2,PI_7.0-14.0,0,0
 50 | Canonical_H1_3,Canonical_H1_0,0,0
 51 | Canonical_H1_3,Canonical_H1_1,0,0
 52 | Canonical_H1_3,Canonical_H1_2,0,0
 53 | Canonical_H1_3,Canonical_H2_0,0,0.005102041
 54 | Canonical_H1_3,Canonical_H2_6,0,0.000241604
 55 | Canonical_H1_3,Canonical_H3_0,0,0.003164557
 56 | Canonical_H1_3,Canonical_H3_1,0,0
 57 | Canonical_H1_3,Canonical_H3_2,0,0
 58 | Canonical_H1_3,Canonical_H3_3,0,0.000291206
 59 | Canonical_H1_3,PI_0.0-3.5,0,0
 60 | Canonical_H1_3,PI_3.5-3.9375,0,0
 61 | Canonical_H1_3,PI_3.9375-4.375,0,0.001104972
 62 | Canonical_H1_3,PI_4.375-4.8125,0,0
 63 | Canonical_H1_3,PI_4.8125-5.25,0,0
 64 | Canonical_H1_3,PI_5.25-5.6875,0,0
 65 | Canonical_H1_3,PI_5.6875-6.125,0,0
 66 | Canonical_H1_3,PI_6.125-7.0,0,0.004484305
 67 | Canonical_H1_3,PI_7.0-14.0,0,0
 68 | Canonical_H2_0,Canonical_H3_0,0,0.040816327
 69 | Canonical_H2_0,Canonical_H3_1,0,0.043126685
 70 | Canonical_H2_0,Canonical_H3_2,0.052631579,0.022608696
 71 | Canonical_H2_0,Canonical_H3_3,0.067567568,0.041929925
 72 | Canonical_H2_0,PI_0.0-3.5,0.064516129,0.018761726
 73 | Canonical_H2_0,PI_3.5-3.9375,0.051724138,0.049122807
 74 | Canonical_H2_0,PI_3.9375-4.375,0.028571429,0.036792453
 75 | Canonical_H2_0,PI_4.375-4.8125,0,0.02173913
 76 | Canonical_H2_0,PI_4.8125-5.25,0,0.035634744
 77 | Canonical_H2_0,PI_5.25-5.6875,0,0.014326648
 78 | Canonical_H2_0,PI_5.6875-6.125,0,0.030013643
 79 | Canonical_H2_0,PI_6.125-7.0,0.125,0.037313433
 80 | Canonical_H2_0,PI_7.0-14.0,0,0.017045455
 81 | Canonical_H2_6,Canonical_H2_0,0,0
 82 | Canonical_H2_6,Canonical_H3_0,0.149606299,0.070947571
 83 | Canonical_H2_6,Canonical_H3_1,0.078740157,0.042368801
 84 | Canonical_H2_6,Canonical_H3_2,0.240310078,0.091544206
 85 | Canonical_H2_6,Canonical_H3_3,0.507575758,0.767273576
 86 | Canonical_H2_6,PI_0.0-3.5,0.186046512,0.081485053
 87 | Canonical_H2_6,PI_3.5-3.9375,0.392307692,0.29230038
 88 | Canonical_H2_6,PI_3.9375-4.375,0.21875,0.207086426
 89 | Canonical_H2_6,PI_4.375-4.8125,0.023622047,0.05280926
 90 | Canonical_H2_6,PI_4.8125-5.25,0.023622047,0.061145883
 91 | Canonical_H2_6,PI_5.25-5.6875,0.023622047,0.037171132
 92 | Canonical_H2_6,PI_5.6875-6.125,0.078740157,0.129326923
 93 | Canonical_H2_6,PI_6.125-7.0,0.0078125,0.049843487
 94 | Canonical_H2_6,PI_7.0-14.0,0.031496063,0.080299011
 95 | Canonical_H3_0,Canonical_H3_1,0,0
 96 | Canonical_H3_0,Canonical_H3_3,0,0
 97 | Canonical_H3_0,PI_0.0-3.5,0.022727273,0.016871166
 98 | Canonical_H3_0,PI_3.5-3.9375,0.140625,0.079545455
 99 | Canonical_H3_0,PI_3.9375-4.375,0.043478261,0.043664384
100 | Canonical_H3_0,PI_4.375-4.8125,0,0.011173184
101 | Canonical_H3_0,PI_4.8125-5.25,0.1,0.026315789
102 | Canonical_H3_0,PI_5.25-5.6875,0.1,0.025974026
103 | Canonical_H3_0,PI_5.6875-6.125,0.035714286,0.069682152
104 | Canonical_H3_0,PI_6.125-7.0,0,0.036679537
105 | Canonical_H3_0,PI_7.0-14.0,0.095238095,0.039556962
106 | Canonical_H3_1,Canonical_H3_3,0,0
107 | Canonical_H3_1,PI_0.0-3.5,0.090909091,0.018867925
108 | Canonical_H3_1,PI_3.5-3.9375,0.032258065,0.037552156
109 | Canonical_H3_1,PI_3.9375-4.375,0.026315789,0.046800382
110 | Canonical_H3_1,PI_4.375-4.8125,0,0.004784689
111 | Canonical_H3_1,PI_4.8125-5.25,0,0.01986755
112 | Canonical_H3_1,PI_5.25-5.6875,0,0.03539823
113 | Canonical_H3_1,PI_5.6875-6.125,0.25,0.034387895
114 | Canonical_H3_1,PI_6.125-7.0,0,0.027295285
115 | Canonical_H3_1,PI_7.0-14.0,0,0.038910506
116 | Canonical_H3_2,Canonical_H3_0,0,0
117 | Canonical_H3_2,Canonical_H3_1,0,0
118 | Canonical_H3_2,Canonical_H3_3,0,0
119 | Canonical_H3_2,PI_0.0-3.5,0.092592593,0.042194093
120 | Canonical_H3_2,PI_3.5-3.9375,0.225352113,0.068813131
121 | Canonical_H3_2,PI_3.9375-4.375,0.127272727,0.054471545
122 | Canonical_H3_2,PI_4.375-4.8125,0.028571429,0.019704433
123 | Canonical_H3_2,PI_4.8125-5.25,0,0.055732484
124 | Canonical_H3_2,PI_5.25-5.6875,0,0.041509434
125 | Canonical_H3_2,PI_5.6875-6.125,0.048780488,0.049559471
126 | Canonical_H3_2,PI_6.125-7.0,0.029411765,0.040609137
127 | Canonical_H3_2,PI_7.0-14.0,0.027777778,0.071428571
128 | Canonical_H3_3,PI_0.0-3.5,0.209876543,0.085246843
129 | Canonical_H3_3,PI_3.5-3.9375,0.272727273,0.274024226
130 | Canonical_H3_3,PI_3.9375-4.375,0.231707317,0.204722222
131 | Canonical_H3_3,PI_4.375-4.8125,0.02739726,0.060237475
132 | Canonical_H3_3,PI_4.8125-5.25,0.013513514,0.060423826
133 | Canonical_H3_3,PI_5.25-5.6875,0.013513514,0.032480598
134 | Canonical_H3_3,PI_5.6875-6.125,0.037974684,0.121629213
135 | Canonical_H3_3,PI_6.125-7.0,0.01369863,0.04817895
136 | Canonical_H3_3,PI_7.0-14.0,0.013333333,0.070314715
137 | Germ_HJ_IGHJ1*01,Canonical_H1_0,0,0
138 | Germ_HJ_IGHJ1*01,Canonical_H1_1,0.007462687,0.007645968
139 | Germ_HJ_IGHJ1*01,Canonical_H1_2,0,0
140 | Germ_HJ_IGHJ1*01,Canonical_H1_3,0,0
141 | Germ_HJ_IGHJ1*01,Canonical_H2_0,0,0.008849558
142 | Germ_HJ_IGHJ1*01,Canonical_H2_6,0.007874016,0.007487923
143 | Germ_HJ_IGHJ1*01,Canonical_H3_0,0,0.01754386
144 | Germ_HJ_IGHJ1*01,Canonical_H3_1,0,0.022727273
145 | Germ_HJ_IGHJ1*01,Canonical_H3_2,0.03030303,0.007092199
146 | Germ_HJ_IGHJ1*01,Canonical_H3_3,0,0.005512039
147 | Germ_HJ_IGHJ1*01,Germ_HJ_IGHJ2*01,0,0
148 | Germ_HJ_IGHJ1*01,Germ_HJ_IGHJ4*02,0,0
149 | Germ_HJ_IGHJ1*01,Germ_HJ_IGHJ5*02,0,0
150 | Germ_HJ_IGHJ1*01,Germ_HJ_IGHJ6*01,0,0
151 | Germ_HJ_IGHJ1*01,PI_0.0-3.5,0,0
152 | Germ_HJ_IGHJ1*01,PI_3.5-3.9375,0,0.00150263
153 | Germ_HJ_IGHJ1*01,PI_3.9375-4.375,0,0.002139037
154 | Germ_HJ_IGHJ1*01,PI_4.375-4.8125,0,0.007722008
155 | Germ_HJ_IGHJ1*01,PI_4.8125-5.25,0,0.027118644
156 | Germ_HJ_IGHJ1*01,PI_5.25-5.6875,0,0.010526316
157 | Germ_HJ_IGHJ1*01,PI_5.6875-6.125,0,0.003384095
158 | Germ_HJ_IGHJ1*01,PI_6.125-7.0,0,0.036585366
159 | Germ_HJ_IGHJ1*01,PI_7.0-14.0,0.25,0.016260163
160 | Germ_HJ_IGHJ2*01,Canonical_H1_0,0,0
161 | Germ_HJ_IGHJ2*01,Canonical_H1_1,0.014925373,0.003707136
162 | Germ_HJ_IGHJ2*01,Canonical_H1_2,0,0
163 | Germ_HJ_IGHJ2*01,Canonical_H1_3,0,0
164 | Germ_HJ_IGHJ2*01,Canonical_H2_0,0,0.004761905
165 | Germ_HJ_IGHJ2*01,Canonical_H2_6,0.015748031,0.003624064
166 | Germ_HJ_IGHJ2*01,Canonical_H3_0,0,0.012232416
167 | Germ_HJ_IGHJ2*01,Canonical_H3_1,0,0.009708738
168 | Germ_HJ_IGHJ2*01,Canonical_H3_2,0.060606061,0.009876543
169 | Germ_HJ_IGHJ2*01,Canonical_H3_3,0,0.001742666
170 | Germ_HJ_IGHJ2*01,Germ_HJ_IGHJ4*02,0,0
171 | Germ_HJ_IGHJ2*01,Germ_HJ_IGHJ5*02,0,0
172 | Germ_HJ_IGHJ2*01,Germ_HJ_IGHJ6*01,0,0
173 | Germ_HJ_IGHJ2*01,PI_0.0-3.5,0,0
174 | Germ_HJ_IGHJ2*01,PI_3.5-3.9375,0,0.00152207
175 | Germ_HJ_IGHJ2*01,PI_3.9375-4.375,0.068965517,0.003271538
176 | Germ_HJ_IGHJ2*01,PI_4.375-4.8125,0,0
177 | Germ_HJ_IGHJ2*01,PI_4.8125-5.25,0,0.003508772
178 | Germ_HJ_IGHJ2*01,PI_5.25-5.6875,0,0.005747126
179 | Germ_HJ_IGHJ2*01,PI_5.6875-6.125,0,0.003484321
180 | Germ_HJ_IGHJ2*01,PI_6.125-7.0,0,0.021459227
181 | Germ_HJ_IGHJ2*01,PI_7.0-14.0,0,0.005617978
182 | Germ_HJ_IGHJ3*01,Canonical_H1_0,0,0
183 | Germ_HJ_IGHJ3*01,Canonical_H1_1,0.007462687,0.002548656
184 | Germ_HJ_IGHJ3*01,Canonical_H1_2,0,0
185 | Germ_HJ_IGHJ3*01,Canonical_H1_3,0,0
186 | Germ_HJ_IGHJ3*01,Canonical_H2_0,0,0.009803922
187 | Germ_HJ_IGHJ3*01,Canonical_H2_6,0.007874016,0.002173913
188 | Germ_HJ_IGHJ3*01,Canonical_H3_0,0,0
189 | Germ_HJ_IGHJ3*01,Canonical_H3_1,0,0
190 | Germ_HJ_IGHJ3*01,Canonical_H3_2,0,0
191 | Germ_HJ_IGHJ3*01,Canonical_H3_3,0.013888889,0.003204195
192 | Germ_HJ_IGHJ3*01,Germ_HJ_IGHJ1*01,0,0
193 | Germ_HJ_IGHJ3*01,Germ_HJ_IGHJ2*01,0,0
194 | Germ_HJ_IGHJ3*01,Germ_HJ_IGHJ4*02,0,0
195 | Germ_HJ_IGHJ3*01,Germ_HJ_IGHJ5*01,0,0
196 | Germ_HJ_IGHJ3*01,Germ_HJ_IGHJ5*02,0,0
197 | Germ_HJ_IGHJ3*01,Germ_HJ_IGHJ6*01,0,0
198 | Germ_HJ_IGHJ3*01,PI_0.0-3.5,0.038461538,0.008426966
199 | Germ_HJ_IGHJ3*01,PI_3.5-3.9375,0,0.003060444
200 | Germ_HJ_IGHJ3*01,PI_3.9375-4.375,0,0.002190581
201 | Germ_HJ_IGHJ3*01,PI_4.375-4.8125,0,0.004201681
202 | Germ_HJ_IGHJ3*01,PI_4.8125-5.25,0,0
203 | Germ_HJ_IGHJ3*01,PI_5.25-5.6875,0,0
204 | Germ_HJ_IGHJ3*01,PI_5.6875-6.125,0,0
205 | Germ_HJ_IGHJ3*01,PI_6.125-7.0,0,0
206 | Germ_HJ_IGHJ3*01,PI_7.0-14.0,0,0.002840909
207 | Germ_HJ_IGHJ3*02,Canonical_H1_0,0,0
208 | Germ_HJ_IGHJ3*02,Canonical_H1_1,0.164179104,0.007877665
209 | Germ_HJ_IGHJ3*02,Canonical_H1_2,0,0
210 | Germ_HJ_IGHJ3*02,Canonical_H1_3,0,0
211 | Germ_HJ_IGHJ3*02,Canonical_H2_0,0.035714286,0.004385965
212 | Germ_HJ_IGHJ3*02,Canonical_H2_6,0.1640625,0.00797294
213 | Germ_HJ_IGHJ3*02,Canonical_H3_0,0.138888889,0.002873563
214 | Germ_HJ_IGHJ3*02,Canonical_H3_1,0.066666667,0.013452915
215 | Germ_HJ_IGHJ3*02,Canonical_H3_2,0.078431373,0.007075472
216 | Germ_HJ_IGHJ3*02,Canonical_H3_3,0.13253012,0.007848837
217 | Germ_HJ_IGHJ3*02,Germ_HJ_IGHJ1*01,0,0
218 | Germ_HJ_IGHJ3*02,Germ_HJ_IGHJ2*01,0,0
219 | Germ_HJ_IGHJ3*02,Germ_HJ_IGHJ3*01,0,0
220 | Germ_HJ_IGHJ3*02,Germ_HJ_IGHJ4*02,0,0
221 | Germ_HJ_IGHJ3*02,Germ_HJ_IGHJ5*01,0,0
222 | Germ_HJ_IGHJ3*02,Germ_HJ_IGHJ5*02,0,0
223 | Germ_HJ_IGHJ3*02,Germ_HJ_IGHJ6*01,0,0
224 | Germ_HJ_IGHJ3*02,Germ_HJ_IGHJ6*04,0,0
225 | Germ_HJ_IGHJ3*02,PI_0.0-3.5,0.090909091,0.013262599
226 | Germ_HJ_IGHJ3*02,PI_3.5-3.9375,0.151515152,0.014448669
227 | Germ_HJ_IGHJ3*02,PI_3.9375-4.375,0.159090909,0.003208556
228 | Germ_HJ_IGHJ3*02,PI_4.375-4.8125,0,0.011583012
229 | Germ_HJ_IGHJ3*02,PI_4.8125-5.25,0,0
230 | Germ_HJ_IGHJ3*02,PI_5.25-5.6875,0,0.005208333
231 | Germ_HJ_IGHJ3*02,PI_5.6875-6.125,0.032258065,0.005076142
232 | Germ_HJ_IGHJ3*02,PI_6.125-7.0,0,0
233 | Germ_HJ_IGHJ3*02,PI_7.0-14.0,0,0
234 | Germ_HJ_IGHJ4*02,Canonical_H1_0,0,0.001960784
235 | Germ_HJ_IGHJ4*02,Canonical_H1_1,0.358208955,0.350694444
236 | Germ_HJ_IGHJ4*02,Canonical_H1_2,0,0
237 | Germ_HJ_IGHJ4*02,Canonical_H1_3,0,0.000657895
238 | Germ_HJ_IGHJ4*02,Canonical_H2_0,0.057692308,0.042579075
239 | Germ_HJ_IGHJ4*02,Canonical_H2_6,0.346153846,0.344344106
240 | Germ_HJ_IGHJ4*02,Canonical_H3_0,0.155172414,0.11965812
241 | Germ_HJ_IGHJ4*02,Canonical_H3_1,0.094339623,0.079495268
242 | Germ_HJ_IGHJ4*02,Canonical_H3_2,0.08,0.06935123
243 | Germ_HJ_IGHJ4*02,Canonical_H3_3,0.304347826,0.276617685
244 | Germ_HJ_IGHJ4*02,Germ_HJ_IGHJ5*02,0,0
245 | Germ_HJ_IGHJ4*02,Germ_HJ_IGHJ6*01,0,0
246 | Germ_HJ_IGHJ4*02,PI_0.0-3.5,0.104477612,0.058390023
247 | Germ_HJ_IGHJ4*02,PI_3.5-3.9375,0.2,0.185449958
248 | Germ_HJ_IGHJ4*02,PI_3.9375-4.375,0.203125,0.168837434
249 | Germ_HJ_IGHJ4*02,PI_4.375-4.8125,0.02,0.042985075
250 | Germ_HJ_IGHJ4*02,PI_4.8125-5.25,0.0625,0.055457227
251 | Germ_HJ_IGHJ4*02,PI_5.25-5.6875,0.02,0.03198032
252 | Germ_HJ_IGHJ4*02,PI_5.6875-6.125,0.054545455,0.118343195
253 | Germ_HJ_IGHJ4*02,PI_6.125-7.0,0,0.046274038
254 | Germ_HJ_IGHJ4*02,PI_7.0-14.0,0.06122449,0.062821245
255 | Germ_HJ_IGHJ5*01,Canonical_H1_0,0,0
256 | Germ_HJ_IGHJ5*01,Canonical_H1_1,0.007462687,0.041241891
257 | Germ_HJ_IGHJ5*01,Canonical_H1_2,0,0
258 | Germ_HJ_IGHJ5*01,Canonical_H1_3,0,0
259 | Germ_HJ_IGHJ5*01,Canonical_H2_0,0,0.041899441
260 | Germ_HJ_IGHJ5*01,Canonical_H2_6,0.007874016,0.039248736
261 | Germ_HJ_IGHJ5*01,Canonical_H3_0,0,0.040084388
262 | Germ_HJ_IGHJ5*01,Canonical_H3_1,0,0.022099448
263 | Germ_HJ_IGHJ5*01,Canonical_H3_2,0.03030303,0.010619469
264 | Germ_HJ_IGHJ5*01,Canonical_H3_3,0,0.041834968
265 | Germ_HJ_IGHJ5*01,Germ_HJ_IGHJ1*01,0,0
266 | Germ_HJ_IGHJ5*01,Germ_HJ_IGHJ2*01,0,0
267 | Germ_HJ_IGHJ5*01,Germ_HJ_IGHJ4*02,0,0
268 | Germ_HJ_IGHJ5*01,Germ_HJ_IGHJ5*02,0,0
269 | Germ_HJ_IGHJ5*01,Germ_HJ_IGHJ6*01,0,0
270 | Germ_HJ_IGHJ5*01,PI_0.0-3.5,0,0.017408124
271 | Germ_HJ_IGHJ5*01,PI_3.5-3.9375,0,0.041578576
272 | Germ_HJ_IGHJ5*01,PI_3.9375-4.375,0,0.045410628
273 | Germ_HJ_IGHJ5*01,PI_4.375-4.8125,0,0.01754386
274 | Germ_HJ_IGHJ5*01,PI_4.8125-5.25,0,0.037037037
275 | Germ_HJ_IGHJ5*01,PI_5.25-5.6875,0,0.018126888
276 | Germ_HJ_IGHJ5*01,PI_5.6875-6.125,0.1,0.026425591
277 | Germ_HJ_IGHJ5*01,PI_6.125-7.0,0,0.015228426
278 | Germ_HJ_IGHJ5*01,PI_7.0-14.0,0,0.017612524
279 | Germ_HJ_IGHJ5*02,Canonical_H1_0,0,0
280 | Germ_HJ_IGHJ5*02,Canonical_H1_1,0.067164179,0.038452629
281 | Germ_HJ_IGHJ5*02,Canonical_H1_2,0,0
282 | Germ_HJ_IGHJ5*02,Canonical_H1_3,0,0.005952381
283 | Germ_HJ_IGHJ5*02,Canonical_H2_0,0.066666667,0.025495751
284 | Germ_HJ_IGHJ5*02,Canonical_H2_6,0.0625,0.038099831
285 | Germ_HJ_IGHJ5*02,Canonical_H3_0,0.037037037,0.02771855
286 | Germ_HJ_IGHJ5*02,Canonical_H3_1,0.055555556,0.022792023
287 | Germ_HJ_IGHJ5*02,Canonical_H3_2,0.024390244,0.02003643
288 | Germ_HJ_IGHJ5*02,Canonical_H3_3,0.08,0.038961039
289 | Germ_HJ_IGHJ5*02,Germ_HJ_IGHJ6*01,0,0
290 | Germ_HJ_IGHJ5*02,PI_0.0-3.5,0,0.038306452
291 | Germ_HJ_IGHJ5*02,PI_3.5-3.9375,0.016129032,0.034555712
292 | Germ_HJ_IGHJ5*02,PI_3.9375-4.375,0,0.024880383
293 | Germ_HJ_IGHJ5*02,PI_4.375-4.8125,0.090909091,0.010230179
294 | Germ_HJ_IGHJ5*02,PI_4.8125-5.25,0,0.025821596
295 | Germ_HJ_IGHJ5*02,PI_5.25-5.6875,0.2,0.01242236
296 | Germ_HJ_IGHJ5*02,PI_5.6875-6.125,0.1875,0.043041607
297 | Germ_HJ_IGHJ5*02,PI_6.125-7.0,0.222222222,0.023684211
298 | Germ_HJ_IGHJ5*02,PI_7.0-14.0,0,0.030364372
299 | Germ_HJ_IGHJ6*01,Canonical_H1_0,0,0.004627682
300 | Germ_HJ_IGHJ6*01,Canonical_H1_1,0.291044776,0.545748614
301 | Germ_HJ_IGHJ6*01,Canonical_H1_2,0,0.00042123
302 | Germ_HJ_IGHJ6*01,Canonical_H1_3,0,0
303 | Germ_HJ_IGHJ6*01,Canonical_H2_0,0.045454545,0.038399353
304 | Germ_HJ_IGHJ6*01,Canonical_H2_6,0.286821705,0.53838885
305 | Germ_HJ_IGHJ6*01,Canonical_H3_0,0.054545455,0.029085343
306 | Germ_HJ_IGHJ6*01,Canonical_H3_1,0.042553191,0.015835313
307 | Germ_HJ_IGHJ6*01,Canonical_H3_2,0.263157895,0.095841584
308 | Germ_HJ_IGHJ6*01,Canonical_H3_3,0.206521739,0.531785808
309 | Germ_HJ_IGHJ6*01,PI_0.0-3.5,0.181818182,0.083167529
310 | Germ_HJ_IGHJ6*01,PI_3.5-3.9375,0.273972603,0.245423729
311 | Germ_HJ_IGHJ6*01,PI_3.9375-4.375,0.096774194,0.167794799
312 | Germ_HJ_IGHJ6*01,PI_4.375-4.8125,0.024390244,0.056435242
313 | Germ_HJ_IGHJ6*01,PI_4.8125-5.25,0,0.055489022
314 | Germ_HJ_IGHJ6*01,PI_5.25-5.6875,0,0.038114754
315 | Germ_HJ_IGHJ6*01,PI_5.6875-6.125,0.042553191,0.107169811
316 | Germ_HJ_IGHJ6*01,PI_6.125-7.0,0,0.046774194
317 | Germ_HJ_IGHJ6*01,PI_7.0-14.0,0,0.079062376
318 | Germ_HJ_IGHJ6*04,Canonical_H1_0,0,0
319 | Germ_HJ_IGHJ6*04,Canonical_H1_1,0.082089552,0.000231696
320 | Germ_HJ_IGHJ6*04,Canonical_H1_2,0,0
321 | Germ_HJ_IGHJ6*04,Canonical_H1_3,0,0
322 | Germ_HJ_IGHJ6*04,Canonical_H2_0,0,0
323 | Germ_HJ_IGHJ6*04,Canonical_H2_6,0.086614173,0.000241663
324 | Germ_HJ_IGHJ6*04,Canonical_H3_0,0.034482759,0
325 | Germ_HJ_IGHJ6*04,Canonical_H3_1,0,0
326 | Germ_HJ_IGHJ6*04,Canonical_H3_2,0.073170732,0
327 | Germ_HJ_IGHJ6*04,Canonical_H3_3,0.092105263,0.00029129
328 | Germ_HJ_IGHJ6*04,Germ_HJ_IGHJ1*01,0,0
329 | Germ_HJ_IGHJ6*04,Germ_HJ_IGHJ2*01,0,0
330 | Germ_HJ_IGHJ6*04,Germ_HJ_IGHJ3*01,0,0
331 | Germ_HJ_IGHJ6*04,Germ_HJ_IGHJ4*02,0,0
332 | Germ_HJ_IGHJ6*04,Germ_HJ_IGHJ5*01,0,0
333 | Germ_HJ_IGHJ6*04,Germ_HJ_IGHJ5*02,0,0
334 | Germ_HJ_IGHJ6*04,Germ_HJ_IGHJ6*01,0,0
335 | Germ_HJ_IGHJ6*04,PI_0.0-3.5,0.121212121,0
336 | Germ_HJ_IGHJ6*04,PI_3.5-3.9375,0.101694915,0
337 | Germ_HJ_IGHJ6*04,PI_3.9375-4.375,0.025641026,0
338 | Germ_HJ_IGHJ6*04,PI_4.375-4.8125,0,0
339 | Germ_HJ_IGHJ6*04,PI_4.8125-5.25,0,0.003703704
340 | Germ_HJ_IGHJ6*04,PI_5.25-5.6875,0,0
341 | Germ_HJ_IGHJ6*04,PI_5.6875-6.125,0,0
342 | Germ_HJ_IGHJ6*04,PI_6.125-7.0,0,0
343 | Germ_HJ_IGHJ6*04,PI_7.0-14.0,0,0
344 | Germ_HV_IGHV3-23*01,Canonical_H1_0,0,0.003231018
345 | Germ_HV_IGHV3-23*01,Canonical_H1_1,1,0.996076621
346 | Germ_HV_IGHV3-23*01,Canonical_H1_2,0,0.000230787
347 | Germ_HV_IGHV3-23*01,Canonical_H1_3,0,0.000461574
348 | Germ_HV_IGHV3-23*01,Canonical_H2_0,0.052238806,0.045003462
349 | Germ_HV_IGHV3-23*01,Canonical_H2_6,0.947761194,0.954996538
350 | Germ_HV_IGHV3-23*01,Canonical_H3_0,0.141791045,0.0726979
351 | Germ_HV_IGHV3-23*01,Canonical_H3_1,0.074626866,0.044311101
352 | Germ_HV_IGHV3-23*01,Canonical_H3_2,0.246268657,0.090699285
353 | Germ_HV_IGHV3-23*01,Canonical_H3_3,0.537313433,0.792291715
354 | Germ_HV_IGHV3-23*01,Germ_HJ_IGHJ1*01,0.007462687,0.00761597
355 | Germ_HV_IGHV3-23*01,Germ_HJ_IGHJ2*01,0.014925373,0.003692592
356 | Germ_HV_IGHV3-23*01,Germ_HJ_IGHJ3*01,0.007462687,0.002538657
357 | Germ_HV_IGHV3-23*01,Germ_HJ_IGHJ3*02,0.164179104,0.007846757
358 | Germ_HV_IGHV3-23*01,Germ_HJ_IGHJ4*02,0.358208955,0.350565428
359 | Germ_HV_IGHV3-23*01,Germ_HJ_IGHJ5*01,0.007462687,0.041080083
360 | Germ_HV_IGHV3-23*01,Germ_HJ_IGHJ5*02,0.067164179,0.038541426
361 | Germ_HV_IGHV3-23*01,Germ_HJ_IGHJ6*01,0.291044776,0.547888299
362 | Germ_HV_IGHV3-23*01,Germ_HJ_IGHJ6*04,0.082089552,0.000230787
363 | Germ_HV_IGHV3-23*01,PI_0.0-3.5,0.194029851,0.08031387
364 | Germ_HV_IGHV3-23*01,PI_3.5-3.9375,0.402985075,0.300023079
365 | Germ_HV_IGHV3-23*01,PI_3.9375-4.375,0.21641791,0.208631433
366 | Germ_HV_IGHV3-23*01,PI_4.375-4.8125,0.02238806,0.052619432
367 | Germ_HV_IGHV3-23*01,PI_4.8125-5.25,0.02238806,0.062312486
368 | Germ_HV_IGHV3-23*01,PI_5.25-5.6875,0.02238806,0.03669513
369 | Germ_HV_IGHV3-23*01,PI_5.6875-6.125,0.074626866,0.129240711
370 | Germ_HV_IGHV3-23*01,PI_6.125-7.0,0.014925373,0.05123471
371 | Germ_HV_IGHV3-23*01,PI_7.0-14.0,0.029850746,0.078929148
372 | PI_0.0-3.5,PI_3.5-3.9375,0,0
373 | PI_0.0-3.5,PI_3.9375-4.375,0,0
374 | PI_0.0-3.5,PI_4.375-4.8125,0,0
375 | PI_0.0-3.5,PI_4.8125-5.25,0,0
376 | PI_0.0-3.5,PI_5.25-5.6875,0,0
377 | PI_0.0-3.5,PI_5.6875-6.125,0,0
378 | PI_0.0-3.5,PI_6.125-7.0,0,0
379 | PI_0.0-3.5,PI_7.0-14.0,0,0
380 | PI_3.5-3.9375,PI_3.9375-4.375,0,0
381 | PI_3.5-3.9375,PI_4.375-4.8125,0,0
382 | PI_3.5-3.9375,PI_4.8125-5.25,0,0
383 | PI_3.5-3.9375,PI_5.25-5.6875,0,0
384 | PI_3.5-3.9375,PI_5.6875-6.125,0,0
385 | PI_3.5-3.9375,PI_6.125-7.0,0,0
386 | PI_3.5-3.9375,PI_7.0-14.0,0,0
387 | PI_3.9375-4.375,PI_4.375-4.8125,0,0
388 | PI_3.9375-4.375,PI_4.8125-5.25,0,0
389 | PI_3.9375-4.375,PI_5.25-5.6875,0,0
390 | PI_3.9375-4.375,PI_5.6875-6.125,0,0
391 | PI_3.9375-4.375,PI_6.125-7.0,0,0
392 | PI_3.9375-4.375,PI_7.0-14.0,0,0
393 | PI_4.375-4.8125,PI_4.8125-5.25,0,0
394 | PI_4.375-4.8125,PI_5.25-5.6875,0,0
395 | PI_4.375-4.8125,PI_5.6875-6.125,0,0
396 | PI_4.375-4.8125,PI_6.125-7.0,0,0
397 | PI_4.375-4.8125,PI_7.0-14.0,0,0
398 | PI_4.8125-5.25,PI_5.25-5.6875,0,0
399 | PI_4.8125-5.25,PI_5.6875-6.125,0,0
400 | PI_4.8125-5.25,PI_6.125-7.0,0,0
401 | PI_4.8125-5.25,PI_7.0-14.0,0,0
402 | PI_5.25-5.6875,PI_5.6875-6.125,0,0
403 | PI_5.25-5.6875,PI_6.125-7.0,0,0
404 | PI_5.25-5.6875,PI_7.0-14.0,0,0
405 | PI_5.6875-6.125,PI_6.125-7.0,0,0
406 | PI_5.6875-6.125,PI_7.0-14.0,0,0
407 | PI_6.125-7.0,PI_7.0-14.0,0,0


--------------------------------------------------------------------------------
/ASAP/FeatureExtraction.py:
--------------------------------------------------------------------------------
  1 | import Bio.SeqUtils.ProtParam
  2 | import os
  3 | import numpy as np
  4 | 
# Name of the dataset being analysed. GetFeatureVectors only drops the
# 'Germ_HV_IGHV3-23*' columns when this equals 'IGHV'.
SET_NAME = 'MMP-cluster'
# When True the light chain ('L') is skipped by GetOneHotGerm, GetCanon and
# GetOneHotCanon, and ReadAminoNumGerm numbers every record on its own instead
# of numbering records in pairs.
IF_ONLY_HEAVY = False
# The five settings below are not referenced anywhere else in this module.
# NOTE(review): presumably read by other ASAP modules - confirm before changing.
CNT_DB = 2
CNT_TARGET = 1
REFERENCE_PATH_TESTCASE = './testCase/MMP-cluster/reference-PDB/'
TARGETING_PATH_TESTCASE = './testCase/MMP-cluster/targeting-MMP/'
TARGET_DESIRE_SIZE = 166 #44 #MMP-cluster




# Chothia numbering definition for CDR regions
# Layout: chain ('L'/'H') -> CDR index ('1'/'2'/'3') -> [first, last] position.
# Both ends are treated as inclusive by GetCanon and GetCDRH3.
CHOTHIA_CDR = {'L': {'1': [24, 34], '2': [50, 56], '3': [89, 97]}, 'H':{'1': [26, 32], '2': [52, 56], '3': [95, 102]}}
 18 | 
 19 | #################################################################################################################
 20 | #  function ReadAminoNumGerm:
 21 | #  Read in the Chothia number reference and targeting files. Store the numbering and putative germline.
 22 | #
 23 | #  Input: targeting_direct, reference_direct
 24 | #  Output:1. dictionary of Amino, {'L': {}, 'H': {}}
 25 | #         2. dictionary of Num  , {'L': {}, 'H': {}}
 26 | #         3. dictionary of Germ , {'L': {'V': {}, 'J':{}}, 'H': {'V': {}, 'J':{}}}
 27 | #         4. list of DatasetName, [dh, dm, p1,....] 
 28 | #         5. list of DatasetSize, [ , , ,...]
 29 | #################################################################################################################
 30 | 
 31 | def ReadAminoNumGerm(targeting_direct, reference_direct):
 32 |     Amino = {'L': {}, 'H': {}}
 33 |     Num ={'L': {}, 'H': {}}
 34 |     Germ =  {'L': {'V': {}, 'J':{}}, 'H': {'V': {}, 'J':{}}}
 35 |     DatasetName = []
 36 |     DatasetSize = []
 37 |     
 38 |     targeting_filenames = sorted(os.listdir(targeting_direct))
 39 |     reference_filenames = sorted(os.listdir(reference_direct))
 40 | 
 41 |     for i, name in enumerate(reference_filenames + targeting_filenames):
 42 |         if not name.endswith('.txt'):
 43 |             continue
 44 |         if i < len(reference_filenames):
 45 |             direct = reference_direct
 46 |         else:
 47 |             direct = targeting_direct
 48 |         with open(direct + name, 'r') as fi:
 49 |             data = fi.readlines()
 50 |         DatasetName.append(name.split('_')[0])
 51 |         cnt_pattern = 0
 52 |         cnt_seq =  0
 53 |         tmp_num = []
 54 |         tmp_seq = []
 55 |         tmp_germ_V = ' '
 56 |         tmp_germ_J = ' '
 57 |         buff = ''
 58 |         for j in range(len(data)):
 59 |             # if chain begin
 60 |             if data[j][0] =='L' or data[j][0] =='H':
 61 |                 L_H = data[j][0]
 62 |                 tmp_seq.append(data[j].split()[-1])
 63 |                 if len(data[j].split()) == 3:
 64 |                     tmp_num.append(data[j].split()[-2])
 65 |                 else:
 66 |                     tmp_num.append(data[j].split()[1] + data[j].split()[-2])
 67 | 
 68 |             # second time of #|, line of germline
 69 |             if data[j][0]=='#' and data[j][1] == '|':
 70 |                 cnt_pattern += 1
 71 |                 if (cnt_pattern % 4) == 0:
 72 |                     tmp_germ_V = data[j].split("|")[2]
 73 |                     tmp_germ_J = data[j].split("|")[4]
 74 | 
 75 | 
 76 |             # time of \\, ending a sequence, need \\ to present \
 77 |             if data[j][0] == '/':
 78 |                 if IF_ONLY_HEAVY:
 79 |                     seq_name = name.split('_')[0] + '_' + str(cnt_seq)
 80 |                 else:
 81 |                     seq_name = name.split('_')[0] + '_' + str(int(cnt_seq / 2))
 82 |                 cnt_seq += 1
 83 |                 Amino[L_H][seq_name] = tmp_seq
 84 |                 Num[L_H][seq_name] =tmp_num
 85 |                 Germ[L_H]['V'][seq_name] = tmp_germ_V
 86 |                 Germ[L_H]['J'][seq_name] = tmp_germ_J
 87 |                 # if not tmp_germ_V.startswith('IGHV3-23'):
 88 |                 #     print(data[j - 8])
 89 |                 #     print(seq_name)
 90 |                 #     print(tmp_germ_V, tmp_germ_J)
 91 |                 tmp_num = []
 92 |                 tmp_seq = []
 93 |                 tmp_germ_V = ' '
 94 |                 tmp_germ_J = ' '
 95 | 
 96 |         if IF_ONLY_HEAVY:
 97 |             DatasetSize.append(cnt_seq)
 98 |         else:
 99 |             DatasetSize.append(int(cnt_seq / 2))
100 |     return Amino, Num, Germ, DatasetName, DatasetSize
101 | 
102 | 
103 | #################################################################################################################
104 | #  function GetOneHotGerm:
105 | #  Transform the stored putative germline into one-hot encoded features.
106 | #
107 | #  Input:  Germ, DatasetSize, DatasetName
108 | #  Output: 1. array of OneHotGerm, [[seq1 onehot], [seq2 onehot], [seq3 onehot], ...]
109 | #          2. list of GermFeatureNames according to one hot, [LV_IGLV1*1, LV_IGLV1*2,....
110 | #                                                             LJ_XXXX, 
111 | #                                                             HV_XXXX,
112 | #                                                             HJ_XXXX ...]
113 | #################################################################################################################
114 | 
def GetOneHotGerm(Germ, DatasetSize, DatasetName):
    """One-hot encode the stored V/J germline assignment of every sequence.

    One feature is created per distinct germline value of each chain/segment
    combination, named 'Germ_<chain><segment>_<germline>'. Chains and segments
    follow the iteration order of ``Germ``; germline values are sorted. The
    light chain is left out when IF_ONLY_HEAVY is set.

    Args:
        Germ: chain -> 'V'/'J' -> sequence name -> germline string.
        DatasetSize: number of sequences per dataset, parallel to DatasetName.
        DatasetName: dataset prefixes; sequences are looked up as
            '<prefix>_<index>' for index 0 .. size-1.

    Returns:
        Tuple (OneHotGerm, GermFeatureNames): one 0/1 row per sequence, in
        dataset order, and the feature names matching the row columns.

    Raises:
        KeyError: if an expected sequence name is missing from ``Germ``.
    """
    GermFeatureNames = []
    # (chain, segment, germline) behind every feature name. Keeping the triple
    # avoids parsing the name back apart, which broke for germline values that
    # themselves contain '_'.
    feature_keys = []
    for H_L in Germ:
        if IF_ONLY_HEAVY and H_L == 'L':
            continue
        for V_J in Germ[H_L]:
            for can in sorted(set(Germ[H_L][V_J].values())):
                GermFeatureNames.append('Germ_' + H_L + V_J + '_' + can)
                feature_keys.append((H_L, V_J, can))

    OneHotGerm = []
    for i, name in enumerate(DatasetName):
        for j in range(int(DatasetSize[i])):
            seq_name = name + '_' + str(j)
            OneHotGerm.append([
                1 if Germ[H_L][V_J][seq_name] == can else 0
                for H_L, V_J, can in feature_keys
            ])

    return OneHotGerm, GermFeatureNames
146 | 
147 | 
148 | #################################################################################################################
149 | #  function ReadCanonTemp:
150 | #  Read in the template file (default PIGS) and store it.
151 | #  
152 | #  Output: 1. dictionary of CanonTemp, {'L': {'1': {'1':[]}, '2': {'1':[]}, '3': {'1':[]}}, 'H': {'1': {'1':[]}, '2': {'1':[]}, '3': {'1':[]}}}
153 | #################################################################################################################
def ReadCanonTemp(canonical_direct):
    """Load the canonical-structure template file into a nested dictionary.

    For every non-blank line, the first character selects the chain ('L'/'H'),
    the second character the CDR index ('1'/'2'/'3'), the second
    whitespace-separated token is the canonical type and all remaining tokens
    form one template for that type. Blank lines are ignored.

    Args:
        canonical_direct: path of the template file.

    Returns:
        CanonTemp: chain -> CDR index -> type -> list of templates (each a list
        of string tokens). Type '1' is always present for every chain/CDR, with
        an empty list when the file defines no template for it.

    Raises:
        KeyError: if a line does not start with a known chain/CDR pair.
        IndexError: if a non-blank line has fewer than two tokens.
    """
    CanonTemp = {'L': {'1': {'1': []}, '2': {'1': []}, '3': {'1': []}},
                 'H': {'1': {'1': []}, '2': {'1': []}, '3': {'1': []}}}
    with open(canonical_direct, 'r') as fi:
        for line in fi:
            fields = line.split()
            if not fields:
                # tolerate blank lines instead of failing on them
                continue
            per_type = CanonTemp[line[0]][line[1]]
            per_type.setdefault(fields[1], []).append(fields[2:])
    return CanonTemp
163 | 
164 | #################################################################################################################
165 | #  function GetCanon:
166 | #  Assign each sequence with the predicted type of canonical structure according to the template.
167 | #  
168 | #  Input:   Amino, Num
169 | #  Output:  1. dictionary of CanonTemp, {'L': {'1': {'1':[]}, '2': {'1':[]}, '3': {'1':[]}}, 'H': {'1': {'1':[]}, '2': {'1':[]}, '3': {'1':[]}}}
170 | #              optional: PIGS / Chothia
171 | #################################################################################################################
172 | 
def _chothia_position(label):
    """Return the integer part of a position label, dropping one trailing A-Z letter."""
    if 'A' <= label[-1] <= 'Z':
        return int(label[:-1])
    return int(label)


def _template_matches(template, length, labels, residues):
    """Tell whether a CDR satisfies one template: its length and every position rule."""
    if template[0] != str(length):
        return False
    # After the length, tokens come in (position label, allowed residues) pairs.
    for l in range(1, len(template), 2):
        pos = template[l]
        if pos not in labels:
            return False
        if residues[labels.index(pos)] not in template[l + 1]:
            return False
    return True


def GetCanon(canonical_direct, Amino, Num):
    """Assign a canonical-structure type to every CDR of every sequence.

    The CDR length is the number of position labels whose integer part lies in
    the inclusive CHOTHIA_CDR range. A template matches when its first token
    equals that length and every following (position, allowed residues) pair
    is satisfied: the position must exist in the sequence and the residue found
    there must be contained in the allowed-residue string. Earlier versions
    accepted a template as soon as its first pair passed; all pairs are now
    required.

    When several templates match, the one read last wins. Sequences without any
    match get type '0'. The light chain is skipped when IF_ONLY_HEAVY is set.

    Args:
        canonical_direct: path of the template file passed to ReadCanonTemp.
        Amino: chain -> sequence name -> list of residues.
        Num: chain -> sequence name -> list of position labels, parallel to
            the residues. Sequence names are taken from ``Num['H']``.

    Returns:
        Canon: chain -> CDR index -> sequence name -> canonical type string.
    """
    CanonTemp = ReadCanonTemp(canonical_direct)
    Canon = {'L': {'1': {}, '2': {}, '3': {}}, 'H': {'1': {}, '2': {}, '3': {}}}
    for seq_name in Num['H']:
        for L_H in Canon:
            if IF_ONLY_HEAVY and L_H == 'L':
                continue

            labels = Num[L_H][seq_name]
            residues = Amino[L_H][seq_name]
            positions = [_chothia_position(label) for label in labels]

            for j in Canon[L_H]:
                first, last = CHOTHIA_CDR[L_H][j]
                length = sum(1 for num_i in positions if first <= num_i <= last)

                # Walk every template of every type; a later match overrides an
                # earlier one, as before.
                for k in CanonTemp[L_H][j]:
                    for template in CanonTemp[L_H][j][k]:
                        if _template_matches(template, length, labels, residues):
                            Canon[L_H][j][seq_name] = k

                # no matching canonical structure found
                if seq_name not in Canon[L_H][j]:
                    Canon[L_H][j][seq_name] = '0'
    return Canon
225 | 
226 | #################################################################################################################
227 | #  function GetOneHotCanon:
228 | #  Similar to GetOneHotGerm, transform the stored canonical structure into one-hot encoded features.
229 | #
230 | #  Input:  Amino, Num, DatasetSize, DatasetName
231 | #  Output: 1. array of OneHotCanon, [[seq1 onehot], [seq2 onehot], [seq3 onehot], ...]
232 | #          2. list of CanonFeatureNames according to one hot, [Canon_L1_1, Canon_L1_2,....
233 | #                                                              Canon_L2_1, 
234 | #                                                              Canon_L3_1,
235 | #                                                              Canon_H1_1, 
236 | #                                                              Canon_H2_1,
237 | #                                                              Canon_H3_1,...]
238 | #################################################################################################################
239 | 
def GetOneHotCanon(canonical_direct, Amino, Num, DatasetSize, DatasetName):
    """One-hot encode the canonical-structure type of every CDR.

    Canonical types come from GetCanon. One feature is created per distinct
    type of each chain/CDR combination, named 'Canonical_<chain><cdr>_<type>';
    the light chain is left out when IF_ONLY_HEAVY is set.

    Args:
        canonical_direct: path of the template file, forwarded to GetCanon.
        Amino, Num: residue and numbering tables, forwarded to GetCanon.
        DatasetSize: number of sequences per dataset, parallel to DatasetName.
        DatasetName: dataset prefixes; sequences are looked up as
            '<prefix>_<index>'.

    Returns:
        Tuple (OneHotCanon, CanonFeatureNames): one 0/1 row per sequence and
        the feature names matching the row columns.
    """
    Canon = GetCanon(canonical_direct, Amino, Num)

    # Feature names: chains and CDRs in dictionary order, types sorted.
    CanonFeatureNames = []
    for chain, per_cdr in Canon.items():
        if IF_ONLY_HEAVY and chain == 'L':
            continue
        for cdr, assigned in per_cdr.items():
            for canon_type in sorted(set(assigned.values())):
                CanonFeatureNames.append('Canonical_' + chain + cdr + '_' + canon_type)

    # Split every name once into (chain, cdr, type) for the lookups below.
    parsed = []
    for feature in CanonFeatureNames:
        pieces = feature.split('_')
        parsed.append((pieces[1][0], pieces[1][1], pieces[2]))

    OneHotCanon = []
    for idx, name in enumerate(DatasetName):
        for j in range(int(DatasetSize[idx])):
            seq_name = name + '_' + str(j)
            row = []
            for chain, cdr, canon_type in parsed:
                row.append(1 if Canon[chain][cdr][seq_name] == canon_type else 0)
            OneHotCanon.append(row)

    return OneHotCanon, CanonFeatureNames
273 | 
274 | #################################################################################################################
275 | #  function GetCDRH3:
276 | #  Take the CDR-H3 of each sequence.
277 | # 
278 | #  Input:   Amino, Num
279 | #  Output: 1. dictionary of CDRH3, {}
280 | #################################################################################################################
281 | 
def GetCDRH3(Amino, Num):
    """Extract the CDR-H3 string of every heavy-chain sequence.

    A residue belongs to CDR-H3 when the integer part of its position label
    (one trailing A-Z letter removed) lies inside the inclusive
    ``CHOTHIA_CDR['H']['3']`` range.

    Args:
        Amino: chain -> sequence name -> list of residues.
        Num: chain -> sequence name -> list of position labels, parallel to
            the residues.

    Returns:
        CDRH3: sequence name -> concatenated CDR-H3 residues ('' if none).
    """
    first, last = CHOTHIA_CDR['H']['3']
    CDRH3 = {}
    for seq_name, residues in Amino['H'].items():
        picked = []
        for idx, label in enumerate(Num['H'][seq_name]):
            digits = label[:-1] if 'A' <= label[-1] <= 'Z' else label
            if first <= int(digits) <= last:
                picked.append(residues[idx])
        CDRH3[seq_name] = ''.join(picked)
    return CDRH3
295 | 
296 | #################################################################################################################
297 | #  function GetCDRH3PI:
298 | #  Calculate the pI value for each sequence
299 | # 
300 | #  Input:   CDRH3
301 | #  Output: 1. dictionary of PI, {}
302 | #################################################################################################################
303 | 
def GetCDRH3PI(CDRH3):
    """Compute the isoelectric point (pI) of every CDR-H3 sequence.

    Args:
        CDRH3: sequence name -> CDR-H3 string.

    Returns:
        PI_CDRH3: sequence name -> pI as returned by Biopython's
        ``ProteinAnalysis.isoelectric_point()``, or -1 when that call fails.
    """
    PI_CDRH3 = {}
    for seq_name, cdr in CDRH3.items():
        prot = Bio.SeqUtils.ProtParam.ProteinAnalysis(cdr)
        try:
            PI_CDRH3[seq_name] = prot.isoelectric_point()
        except Exception:
            # Deliberate best effort: sequences Biopython cannot handle keep
            # the -1 sentinel, which lies outside every pI bin. Catching
            # Exception rather than everything lets Ctrl-C and SystemExit
            # propagate.
            PI_CDRH3[seq_name] = -1

    return PI_CDRH3
315 | 
316 | 
317 | #################################################################################################################
318 | #  function GetPIBin:
319 | #  Halve the bin of pI following the binning method using sequence's pI information.
320 | # 
321 | #  Input:   PI_CDRH3
322 | #  Output: 1. a list of PITheresholds, []
323 | #################################################################################################################
324 | 
def GetPIBin(PI_CDRH3):
    """Derive pI bin edges by repeatedly halving over-populated bins.

    Starting from the edges [0.0, 7.0, 14.0], the first bin (from the left)
    that is at least 0.6 wide and strictly contains more than 10% of all
    sequences is split at its midpoint. This repeats until a full pass finds
    nothing left to split. Values lying exactly on an edge, or outside 0-14
    (such as the -1 failure sentinel), are not counted in any bin.

    Unlike the earlier implementation this always terminates: if no bin ever
    exceeds the 10% mark (empty input, or only sentinel values) the initial
    edges are returned instead of looping forever.

    Args:
        PI_CDRH3: sequence name -> pI value.

    Returns:
        Sorted list of bin edges.
    """
    PITheresholds = [0.0, 7.0, 14.0]
    tenPercent = 0.1 * len(PI_CDRH3)
    PITolerance = 0.3
    values = list(PI_CDRH3.values())

    split_made = True
    while split_made:
        split_made = False
        for i in range(1, len(PITheresholds)):
            low, high = PITheresholds[i - 1], PITheresholds[i]
            # bins narrower than twice the tolerance are never split further
            if (high - low) < (2 * PITolerance):
                continue
            cnt = sum(1 for pi in values if low < pi < high)
            if cnt > tenPercent:
                # inserting at i keeps the edge list sorted
                PITheresholds.insert(i, (low + high) / 2.0)
                split_made = True
                break
    return PITheresholds
347 | 
348 | #################################################################################################################
349 | #  function GetOneHotPI:
350 | #  Transform the pI values into one-hot encoded pI bin features.
351 | # 
352 | #  Input:   CDRH3, DatasetSize, DatasetName
353 | #  Output: 1. array of OneHotPI, [[seq1 onehot], 
354 | #                                 [seq2 onehot], 
355 | #                                 [seq3 onehot],
356 | #                                 ...]
357 | #          2. list of PIFeatureNames according to one hot, [PI_bin1, PI_bin2, PI_bin3...]
358 | #################################################################################################################
359 | 
def GetOneHotPI(CDRH3, DatasetSize, DatasetName):
    """One-hot encode the CDR-H3 pI of every sequence into pI bins.

    pI values come from GetCDRH3PI and bin edges from GetPIBin. Each pair of
    neighbouring edges forms a bin named 'PI_<low>-<high>'. A sequence is
    placed in the first bin whose closed interval contains its pI; a value
    outside all bins leaves the row all zero.

    Args:
        CDRH3: sequence name -> CDR-H3 string.
        DatasetSize: number of sequences per dataset, parallel to DatasetName.
        DatasetName: dataset prefixes; sequences are looked up as
            '<prefix>_<index>'.

    Returns:
        Tuple (OneHotPI, PIFeatureNames): one 0/1 row per sequence and the bin
        names matching the row columns.
    """
    PI_CDRH3 = GetCDRH3PI(CDRH3)
    PITheresholds = GetPIBin(PI_CDRH3)

    # consecutive edge pairs are the bins
    bins = list(zip(PITheresholds[:-1], PITheresholds[1:]))
    PIFeatureNames = ['PI_' + str(low) + '-' + str(high) for low, high in bins]

    OneHotPI = []
    for idx, name in enumerate(DatasetName):
        for j in range(int(DatasetSize[idx])):
            value = PI_CDRH3[name + '_' + str(j)]
            row = [0] * len(bins)
            for k, (low, high) in enumerate(bins):
                if float(low) <= value <= float(high):
                    row[k] = 1
                    break
            OneHotPI.append(row)
    return OneHotPI, PIFeatureNames
383 | 
384 | #################################################################################################################
385 | #  function GetPositionalMotifFreq:
386 | #  Count the frequency of each possible frequent positional motif for each dataset.
387 | # 
388 | #  Input:   CDRH3
389 | #  Output: 1. dictionary of MotifFreq, {'r1':{}, 'r2':{},'t1':{}, 't2':{}, 't3':{}, 't4':{}, 't5':{}, 't6':{}, 't7':{}, 't8':{}}
390 | #################################################################################################################
391 | 
def GetPositionalMotifFreq(CDRH3):
    """Count positional motifs of the CDR-H3 sequences per dataset.

    A positional motif is '<start>_<substring>' for substrings of length 2-9.
    The dataset of a sequence is the part of its name before the first '_'.
    Datasets other than the ten pre-declared ones ('r1', 'r2', 't1'-'t8') are
    added on demand instead of raising KeyError.

    Args:
        CDRH3: sequence name -> CDR-H3 string.

    Returns:
        Tuple (MotifFreq, MotifDict): MotifFreq maps dataset -> motif -> count
        (the ten pre-declared datasets are always present, possibly empty);
        MotifDict maps sequence name -> list of its motifs in generation
        order.
    """
    MotifFreq = {'r1': {}, 'r2': {}, 't1': {}, 't2': {}, 't3': {}, 't4': {},
                 't5': {}, 't6': {}, 't7': {}, 't8': {}}
    MotifDict = {}
    for seq_name, cdr in CDRH3.items():
        motifs = MotifDict[seq_name] = []
        counts = MotifFreq.setdefault(seq_name.split('_')[0], {})
        # length of motif
        for length in range(2, 10):
            # NOTE(review): range(len(cdr) - length) stops one window early, so
            # the motif ending on the last residue is never produced. Kept
            # as-is because changing it alters the feature set - confirm
            # whether this is intended.
            for start in range(len(cdr) - length):
                PostionalMotif = str(start) + '_' + cdr[start:start + length]
                motifs.append(PostionalMotif)
                counts[PostionalMotif] = counts.get(PostionalMotif, 0) + 1
    return MotifFreq, MotifDict
412 | 
413 | #################################################################################################################
414 | #  function GetImpMotif (Version 1.0):
415 | #  Take only the most 2 frequent motif in each data set,  top 2 * 10 set * 9 length = 180 
416 | # 
417 | #  Input:   MotifFreq
418 | #  Output: 1. list of ImpMotif, [motif1, motif2, ...]
419 | #################################################################################################################
420 | 
def GetImpMotif(MotifFreq):
    """Select the most frequent motifs of every dataset.

    For each dataset and each prefix '2' .. '10' (the text before the first
    '_' of the motif key), the two motifs with the highest count are kept;
    ties keep the order in which the motifs were stored.

    NOTE(review): the prefix written by GetPositionalMotifFreq is the start
    position, while the header comment above this function speaks of motif
    lengths - confirm which grouping is intended.

    Args:
        MotifFreq: dataset -> motif -> count.

    Returns:
        Sorted list of the distinct selected motifs.
    """
    Top2 = 2
    selected = set()
    for motif_dic in MotifFreq.values():
        for prefix in map(str, range(2, 11)):
            group = [(motif, count) for motif, count in motif_dic.items()
                     if motif.split('_')[0] == prefix]
            # stable sort: equal counts stay in insertion order
            group.sort(key=lambda pair: pair[1], reverse=True)
            selected.update(motif for motif, _ in group[:Top2])
    return sorted(selected)
438 | 
439 | #################################################################################################################
440 | #  function GetCDRH3Motif:
441 | #  Assign present frequent motif for each sequence
442 | # 
443 | #  Input:   ImpMotif, CDRH3
444 | #  Output: 1. dictionary of Motif_CDRH3, {}
445 | #################################################################################################################
446 | 
def GetCDRH3Motif(ImpMotif, CDRH3, MotifDict):
    """Flag, per sequence, which of the selected motifs it contains.

    Args:
        ImpMotif: list of selected motifs; fixes the column order.
        CDRH3: sequence name -> CDR-H3 string (only the names are used).
        MotifDict: sequence name -> collection of motifs found in it.

    Returns:
        Motif_CDRH3: sequence name -> list of 0/1 flags parallel to ImpMotif.
    """
    return {
        seq_name: [1 if motif in MotifDict[seq_name] else 0 for motif in ImpMotif]
        for seq_name in CDRH3
    }
456 | 
457 | #################################################################################################################
458 | #  function MultiHotMotif:
459 | #  Transfer motif information for each sequence to multi-hot encoded features.
460 | # 
461 | #  Input:   CDRH3, DatasetSize, DatasetName
462 | #  Output: 1. array of MultiHotMotif, [[seq1 multihot], [seq2 multihot], [seq3 multihot],...]
463 | #          2. list of MotifFeatureNames according to multi hot, [Motif1, Motif2, ...]
464 | #################################################################################################################
465 | 
def MultiHotMotif(CDRH3, DatasetSize, DatasetName):
    """Build the multi-hot motif feature rows for every sequence.

    Chains GetPositionalMotifFreq, GetImpMotif and GetCDRH3Motif, then orders
    the per-sequence flag lists by dataset and sequence index.

    Args:
        CDRH3: sequence name -> CDR-H3 string.
        DatasetSize: number of sequences per dataset, parallel to DatasetName.
        DatasetName: dataset prefixes; sequences are looked up as
            '<prefix>_<index>'.

    Returns:
        Tuple (rows, MotifFeatureNames): one 0/1 list per sequence and the
        feature names ('Motif_<motif>') matching the columns.
    """
    MotifFreq, MotifDict = GetPositionalMotifFreq(CDRH3)
    ImpMotif = GetImpMotif(MotifFreq)
    Motif_CDRH3 = GetCDRH3Motif(ImpMotif, CDRH3, MotifDict)

    MotifFeatureNames = ["Motif_" + motif for motif in ImpMotif]

    rows = []
    for idx, name in enumerate(DatasetName):
        size = int(DatasetSize[idx])
        rows.extend(Motif_CDRH3[name + '_' + str(j)] for j in range(size))
    return rows, MotifFeatureNames
486 | 
487 | #################################################################################################################
488 | #  function GetFeatureVectors:
489 | #  Combine germline, canonical structure, pI, motif features to feature vectors
490 | # 
491 | #  Input:   OneHotGerm, GermFeatureNames, OneHotCanon, CanonFeatureNames, OneHotPI, PIFeatureNames, MultiHotMotif, MotifFeatureNames
492 | #  Output: 1. AllFeatureVectors for every sequence, [[seq1 LV, LJ, HV, HJ, L1, L2, L3, L1, L2, L3, pI, motif1, motif2, motifi...],
493 | #                                                    [seq2 LV, LJ, HV, HJ, L1, L2, L3, L1, L2, L3, pI, motif1, motif2, motifi...],
494 | #                                                    ...]
495 | # 
496 | #          2. AllFeatureNames        [LV, LJ, HV, HJ, L1, L2, L3, L1, L2, L3, pI, motif1, motif2, motifi...]
497 | #################################################################################################################
498 | 
def GetFeatureVectors(OneHotGerm, GermFeatureNames,
                      OneHotCanon, CanonFeatureNames,
                      OneHotPI, PIFeatureNames,
                      MultiHotMotif, MotifFeatureNames):
    """Concatenate germline, canonical, pI and motif features per sequence.

    Rows are joined in the order germline, canonical structure, pI, motif; the
    feature names are joined the same way. When SET_NAME is 'IGHV' a second
    view without the columns whose name starts with 'Germ_HV_IGHV3-23' is
    produced; otherwise the "exclude" outputs are the full ones.

    Args:
        OneHotGerm, OneHotCanon, OneHotPI, MultiHotMotif: per-sequence feature
            rows; the number of sequences is taken from OneHotGerm.
        GermFeatureNames, CanonFeatureNames, PIFeatureNames,
        MotifFeatureNames: column names of the matching rows.

    Returns:
        Tuple (AllFeatureVectors, AllFeatureNames, ExcludeIGHVVectors,
        ExcludeFeatureNames); the vectors are numpy arrays.
    """
    AllFeatureNames = GermFeatureNames + CanonFeatureNames + PIFeatureNames + MotifFeatureNames

    blocks = (OneHotGerm, OneHotCanon, OneHotPI, MultiHotMotif)
    rows = []
    for seq_idx in range(len(OneHotGerm)):
        row = []
        for block in blocks:
            row.extend(block[seq_idx])
        rows.append(row)
    AllFeatureVectors = np.array(rows)

    if SET_NAME != 'IGHV':
        # nothing to exclude: hand back the very same objects twice
        return AllFeatureVectors, AllFeatureNames, AllFeatureVectors, AllFeatureNames

    name_index = [col for col, name in enumerate(AllFeatureNames)
                  if not name.startswith('Germ_HV_IGHV3-23')]
    ExcludeFeatureNames = [AllFeatureNames[col] for col in name_index]
    ExcludeIGHVVectors = AllFeatureVectors[:, name_index]

    return AllFeatureVectors, AllFeatureNames, ExcludeIGHVVectors, ExcludeFeatureNames
527 | 
if __name__=='__main__':
    # Manual run: parse the IGHV test-case directory. The same directory is
    # passed as both targeting and reference set, so every file is read twice.
    targeting_direct = '../testCase-MMP/data/IGHV/'
    reference_direct = '../testCase-MMP/data/IGHV/'
    Amino, Num, Germ, DatasetName, DatasetSize = ReadAminoNumGerm(targeting_direct, reference_direct)
532 | 
533 | 


--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8"?>
  2 | <project version="4">
  3 |   <component name="ChangeListManager">
  4 |     <list default="true" id="c7bdd638-3cea-4080-984e-c663a79bf139" name="Default" comment="" />
  5 |     <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
  6 |     <option name="SHOW_DIALOG" value="false" />
  7 |     <option name="HIGHLIGHT_CONFLICTS" value="true" />
  8 |     <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
  9 |     <option name="LAST_RESOLUTION" value="IGNORE" />
 10 |   </component>
 11 |   <component name="CoverageDataManager">
 12 |     <SUITE FILE_PATH="coverage/ASAP1_0_master$HL_pair.coverage" NAME="HL_pair Coverage Results" MODIFIED="1560299804554" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
 13 |     <SUITE FILE_PATH="coverage/ASAP1_0_master$blast_clust.coverage" NAME="blast_clust Coverage Results" MODIFIED="1563569357937" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
 14 |     <SUITE FILE_PATH="coverage/ASAP1_0_master$S_SequenceInRegion.coverage" NAME="S_SequenceInRegion Coverage Results" MODIFIED="1561476333072" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/ASAP" />
 15 |     <SUITE FILE_PATH="coverage/ASAP1_0_master$clean_data.coverage" NAME="clean_data Coverage Results" MODIFIED="1563224845051" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
 16 |     <SUITE FILE_PATH="coverage/ASAP1_0_master$S_SequenceInRegion__1_.coverage" NAME="S_SequenceInRegion (1) Coverage Results" MODIFIED="1566605337532" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/ASAP" />
 17 |     <SUITE FILE_PATH="coverage/ASAP1_0_master$FeatureExtraction.coverage" NAME="FeatureExtraction Coverage Results" MODIFIED="1567705223819" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/ASAP" />
 18 |     <SUITE FILE_PATH="coverage/ASAP1_0_master$SequenceAndFeatureAnalysis.coverage" NAME="SequenceAndFeatureAnalysis Coverage Results" MODIFIED="1567780970098" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/ASAP" />
 19 |     <SUITE FILE_PATH="coverage/ASAP1_0_master$onlyIGHV.coverage" NAME="onlyIGHV Coverage Results" MODIFIED="1561468526078" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
 20 |     <SUITE FILE_PATH="coverage/ASAP1_0_master$onlyheavy.coverage" NAME="onlyheavy Coverage Results" MODIFIED="1560887630264" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/testCase-MMP/data/reference-PDB" />
 21 |   </component>
 22 |   <component name="FUSProjectUsageTrigger">
 23 |     <session id="1049574298">
 24 |       <usages-collector id="statistics.lifecycle.project">
 25 |         <counts>
 26 |           <entry key="project.closed" value="17" />
 27 |           <entry key="project.open.time.0" value="2" />
 28 |           <entry key="project.open.time.1" value="1" />
 29 |           <entry key="project.open.time.2" value="1" />
 30 |           <entry key="project.open.time.4" value="2" />
 31 |           <entry key="project.open.time.5" value="7" />
 32 |           <entry key="project.open.time.6" value="2" />
 33 |           <entry key="project.open.time.7" value="2" />
 34 |           <entry key="project.opened" value="17" />
 35 |         </counts>
 36 |       </usages-collector>
 37 |       <usages-collector id="statistics.file.extensions.open">
 38 |         <counts>
 39 |           <entry key="csv" value="47" />
 40 |           <entry key="fasta" value="4" />
 41 |           <entry key="ipynb" value="1" />
 42 |           <entry key="png" value="123" />
 43 |           <entry key="py" value="51" />
 44 |           <entry key="txt" value="25" />
 45 |         </counts>
 46 |       </usages-collector>
 47 |       <usages-collector id="statistics.file.types.open">
 48 |         <counts>
 49 |           <entry key="CSV" value="47" />
 50 |           <entry key="IPNB" value="1" />
 51 |           <entry key="Image" value="123" />
 52 |           <entry key="PLAIN_TEXT" value="29" />
 53 |           <entry key="Python" value="51" />
 54 |         </counts>
 55 |       </usages-collector>
 56 |       <usages-collector id="statistics.file.extensions.edit">
 57 |         <counts>
 58 |           <entry key="Python Console" value="16" />
 59 |           <entry key="csv" value="1" />
 60 |           <entry key="py" value="12814" />
 61 |         </counts>
 62 |       </usages-collector>
 63 |       <usages-collector id="statistics.file.types.edit">
 64 |         <counts>
 65 |           <entry key="CSV" value="1" />
 66 |           <entry key="Python" value="12830" />
 67 |         </counts>
 68 |       </usages-collector>
 69 |     </session>
 70 |     <session id="-451646121">
 71 |       <usages-collector id="statistics.lifecycle.project">
 72 |         <counts>
 73 |           <entry key="project.closed" value="3" />
 74 |           <entry key="project.open.time.4" value="1" />
 75 |           <entry key="project.open.time.6" value="1" />
 76 |           <entry key="project.open.time.7" value="1" />
 77 |           <entry key="project.open.time.8" value="1" />
 78 |           <entry key="project.opened" value="4" />
 79 |         </counts>
 80 |       </usages-collector>
 81 |       <usages-collector id="statistics.file.extensions.edit">
 82 |         <counts>
 83 |           <entry key="py" value="1232" />
 84 |         </counts>
 85 |       </usages-collector>
 86 |       <usages-collector id="statistics.file.types.edit">
 87 |         <counts>
 88 |           <entry key="Python" value="1232" />
 89 |         </counts>
 90 |       </usages-collector>
 91 |       <usages-collector id="statistics.file.extensions.open">
 92 |         <counts>
 93 |           <entry key="py" value="2" />
 94 |         </counts>
 95 |       </usages-collector>
 96 |       <usages-collector id="statistics.file.types.open">
 97 |         <counts>
 98 |           <entry key="Python" value="2" />
 99 |         </counts>
100 |       </usages-collector>
101 |     </session>
102 |   </component>
103 |   <component name="FileEditorManager">
104 |     <leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
105 |       <file pinned="false" current-in-tab="true">
106 |         <entry file="file://$PROJECT_DIR$/ASAP/FeatureExtraction.py">
107 |           <provider selected="true" editor-type-id="text-editor">
108 |             <state relative-caret-position="171">
109 |               <caret line="11" selection-start-line="11" selection-end-line="11" />
110 |               <folding>
111 |                 <element signature="e#0#29#0" expanded="true" />
112 |               </folding>
113 |             </state>
114 |           </provider>
115 |         </entry>
116 |       </file>
117 |       <file pinned="false" current-in-tab="false">
118 |         <entry file="file://$PROJECT_DIR$/ASAP/SequenceAndFeatureAnalysis.py">
119 |           <provider selected="true" editor-type-id="text-editor">
120 |             <state relative-caret-position="-1503">
121 |               <caret line="19" column="14" selection-start-line="19" selection-start-column="14" selection-end-line="19" selection-end-column="14" />
122 |               <folding>
123 |                 <element signature="e#0#13#0" expanded="true" />
124 |               </folding>
125 |             </state>
126 |           </provider>
127 |         </entry>
128 |       </file>
129 |       <file pinned="false" current-in-tab="false">
130 |         <entry file="file://$PROJECT_DIR$/ASAP/DesignRecommendation.py">
131 |           <provider selected="true" editor-type-id="text-editor">
132 |             <state relative-caret-position="171">
133 |               <caret line="11" selection-start-line="11" selection-end-line="11" />
134 |               <folding>
135 |                 <element signature="e#0#18#0" expanded="true" />
136 |               </folding>
137 |             </state>
138 |           </provider>
139 |         </entry>
140 |       </file>
141 |       <file pinned="false" current-in-tab="false">
142 |         <entry file="file://$PROJECT_DIR$/ASAP/S_SequenceInRegion.py">
143 |           <provider selected="true" editor-type-id="text-editor">
144 |             <state relative-caret-position="5282">
145 |               <caret line="283" selection-start-line="283" selection-end-line="283" />
146 |               <folding>
147 |                 <element signature="e#0#29#0" expanded="true" />
148 |               </folding>
149 |             </state>
150 |           </provider>
151 |         </entry>
152 |       </file>
153 |     </leaf>
154 |   </component>
155 |   <component name="FileTemplateManagerImpl">
156 |     <option name="RECENT_TEMPLATES">
157 |       <list>
158 |         <option value="Setup Script" />
159 |         <option value="Python Script" />
160 |       </list>
161 |     </option>
162 |   </component>
163 |   <component name="FindInProjectRecents">
164 |     <findStrings>
165 |       <find>distance</find>
166 |       <find>motif</find>
167 |       <find>PostionalMotifposi</find>
168 |       <find>jaccar</find>
169 |       <find>corr</find>
170 |       <find>print</find>
171 |       <find>moti</find>
172 |       <find>head</find>
173 |       <find>heatmap</find>
174 |       <find>rank</find>
175 |       <find>less'</find>
176 |       <find>.index</find>
177 |       <find>open(</find>
178 |       <find>WriteFisherFS</find>
179 |       <find>sta</find>
180 |       <find>heat map</find>
181 |       <find>float(</find>
182 |       <find>feature</find>
183 |       <find>print(</find>
184 |       <find>all</find>
185 |       <find>shuffle</find>
186 |       <find>referen</find>
187 |       <find>set</find>
188 |       <find>frequency</find>
189 |       <find>_new</find>
190 |       <find>set(</find>
191 |       <find>startswith</find>
192 |       <find>2_GG</find>
193 |       <find>mean</find>
194 |       <find>importance</find>
195 |     </findStrings>
196 |   </component>
197 |   <component name="Git.Settings">
198 |     <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
199 |   </component>
200 |   <component name="IdeDocumentHistory">
201 |     <option name="CHANGED_PATHS">
202 |       <list>
203 |         <option value="$PROJECT_DIR$/ASAP.ipynb" />
204 |         <option value="$PROJECT_DIR$/Features_distribution_GermHV.csv" />
205 |         <option value="$PROJECT_DIR$/HL_pair.py" />
206 |         <option value="$PROJECT_DIR$/testCase-MMP/data/reference-PDB/onlyheavy.py" />
207 |         <option value="$USER_HOME$/Downloads/BoxDetection-master/box_detection.py" />
208 |         <option value="$PROJECT_DIR$/onlyIGHV.py" />
209 |         <option value="$PROJECT_DIR$/S_SequenceInRegion.py" />
210 |         <option value="$PROJECT_DIR$/clean_data.py" />
211 |         <option value="$PROJECT_DIR$/blast_clust.py" />
212 |         <option value="$PROJECT_DIR$/ASAP/S_SequenceInRegion.py" />
213 |         <option value="$PROJECT_DIR$/ASAP/FeatureExtraction.py" />
214 |         <option value="$PROJECT_DIR$/ASAP/SequenceAndFeatureAnalysis.py" />
215 |         <option value="$PROJECT_DIR$/ASAP/DesignRecommendation.py" />
216 |       </list>
217 |     </option>
218 |   </component>
219 |   <component name="JsBuildToolGruntFileManager" detection-done="true" sorting="DEFINITION_ORDER" />
220 |   <component name="JsBuildToolPackageJson" detection-done="true" sorting="DEFINITION_ORDER" />
221 |   <component name="JsGulpfileManager">
222 |     <detection-done>true</detection-done>
223 |     <sorting>DEFINITION_ORDER</sorting>
224 |   </component>
225 |   <component name="ProjectFrameBounds" fullScreen="true">
226 |     <option name="width" value="1440" />
227 |     <option name="height" value="900" />
228 |   </component>
229 |   <component name="ProjectLevelVcsManager" settingsEditedManually="true" />
230 |   <component name="ProjectView">
231 |     <navigator proportions="" version="1">
232 |       <foldersAlwaysOnTop value="true" />
233 |     </navigator>
234 |     <panes>
235 |       <pane id="ProjectPane">
236 |         <subPane>
237 |           <expand>
238 |             <path>
239 |               <item name="ASAP1.0-master" type="b2602c69:ProjectViewProjectNode" />
240 |               <item name="ASAP1.0-master" type="462c0819:PsiDirectoryNode" />
241 |             </path>
242 |             <path>
243 |               <item name="ASAP1.0-master" type="b2602c69:ProjectViewProjectNode" />
244 |               <item name="ASAP1.0-master" type="462c0819:PsiDirectoryNode" />
245 |               <item name="ASAP" type="462c0819:PsiDirectoryNode" />
246 |             </path>
247 |             <path>
248 |               <item name="ASAP1.0-master" type="b2602c69:ProjectViewProjectNode" />
249 |               <item name="ASAP1.0-master" type="462c0819:PsiDirectoryNode" />
250 |               <item name="data" type="462c0819:PsiDirectoryNode" />
251 |             </path>
252 |             <path>
253 |               <item name="ASAP1.0-master" type="b2602c69:ProjectViewProjectNode" />
254 |               <item name="ASAP1.0-master" type="462c0819:PsiDirectoryNode" />
255 |               <item name="testCase" type="462c0819:PsiDirectoryNode" />
256 |             </path>
257 |             <path>
258 |               <item name="ASAP1.0-master" type="b2602c69:ProjectViewProjectNode" />
259 |               <item name="ASAP1.0-master" type="462c0819:PsiDirectoryNode" />
260 |               <item name="testCase" type="462c0819:PsiDirectoryNode" />
261 |               <item name="IGHV" type="462c0819:PsiDirectoryNode" />
262 |             </path>
263 |             <path>
264 |               <item name="ASAP1.0-master" type="b2602c69:ProjectViewProjectNode" />
265 |               <item name="ASAP1.0-master" type="462c0819:PsiDirectoryNode" />
266 |               <item name="testCase" type="462c0819:PsiDirectoryNode" />
267 |               <item name="IGHV" type="462c0819:PsiDirectoryNode" />
268 |               <item name="reference-IGHV" type="462c0819:PsiDirectoryNode" />
269 |             </path>
270 |             <path>
271 |               <item name="ASAP1.0-master" type="b2602c69:ProjectViewProjectNode" />
272 |               <item name="ASAP1.0-master" type="462c0819:PsiDirectoryNode" />
273 |               <item name="testCase" type="462c0819:PsiDirectoryNode" />
274 |               <item name="IGHV" type="462c0819:PsiDirectoryNode" />
275 |               <item name="targeting-MMP-IGHV" type="462c0819:PsiDirectoryNode" />
276 |             </path>
277 |             <path>
278 |               <item name="ASAP1.0-master" type="b2602c69:ProjectViewProjectNode" />
279 |               <item name="External Libraries" type="cb654da1:ExternalLibrariesNode" />
280 |             </path>
281 |           </expand>
282 |           <select />
283 |         </subPane>
284 |       </pane>
285 |       <pane id="Scope" />
286 |     </panes>
287 |   </component>
288 |   <component name="PropertiesComponent">
289 |     <property name="WebServerToolWindowFactoryState" value="false" />
290 |     <property name="last_opened_file_path" value="$PROJECT_DIR$" />
291 |     <property name="run.code.analysis.last.selected.profile" value="aDefault" />
292 |     <property name="settings.editor.selected.configurable" value="project.propVCSSupport.Mappings" />
293 |   </component>
294 |   <component name="RecentsManager">
295 |     <key name="MoveFile.RECENT_KEYS">
296 |       <recent name="$PROJECT_DIR$/testCase-MMP/data/target-HIV" />
297 |       <recent name="$PROJECT_DIR$" />
298 |     </key>
299 |     <key name="CopyFile.RECENT_KEYS">
300 |       <recent name="$PROJECT_DIR$" />
301 |       <recent name="$PROJECT_DIR$/ASAP" />
302 |       <recent name="$PROJECT_DIR$/testCase-MMP/data/IGHV" />
303 |     </key>
304 |   </component>
305 |   <component name="RunDashboard">
306 |     <option name="ruleStates">
307 |       <list>
308 |         <RuleState>
309 |           <option name="name" value="ConfigurationTypeDashboardGroupingRule" />
310 |         </RuleState>
311 |         <RuleState>
312 |           <option name="name" value="StatusDashboardGroupingRule" />
313 |         </RuleState>
314 |       </list>
315 |     </option>
316 |   </component>
317 |   <component name="RunManager" selected="Python.SequenceAndFeatureAnalysis">
318 |     <configuration name="FeatureExtraction" type="PythonConfigurationType" factoryName="Python" temporary="true">
319 |       <module name="ASAP-1.0" />
320 |       <option name="INTERPRETER_OPTIONS" value="" />
321 |       <option name="PARENT_ENVS" value="true" />
322 |       <envs>
323 |         <env name="PYTHONUNBUFFERED" value="1" />
324 |       </envs>
325 |       <option name="SDK_HOME" value="" />
326 |       <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/ASAP" />
327 |       <option name="IS_MODULE_SDK" value="true" />
328 |       <option name="ADD_CONTENT_ROOTS" value="true" />
329 |       <option name="ADD_SOURCE_ROOTS" value="true" />
330 |       <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
331 |       <option name="SCRIPT_NAME" value="$PROJECT_DIR$/ASAP/FeatureExtraction.py" />
332 |       <option name="PARAMETERS" value="" />
333 |       <option name="SHOW_COMMAND_LINE" value="false" />
334 |       <option name="EMULATE_TERMINAL" value="false" />
335 |       <option name="MODULE_MODE" value="false" />
336 |       <option name="REDIRECT_INPUT" value="false" />
337 |       <option name="INPUT_FILE" value="" />
338 |       <method v="2" />
339 |     </configuration>
340 |     <configuration name="S_SequenceInRegion (1)" type="PythonConfigurationType" factoryName="Python" temporary="true">
341 |       <module name="ASAP-1.0" />
342 |       <option name="INTERPRETER_OPTIONS" value="" />
343 |       <option name="PARENT_ENVS" value="true" />
344 |       <envs>
345 |         <env name="PYTHONUNBUFFERED" value="1" />
346 |       </envs>
347 |       <option name="SDK_HOME" value="" />
348 |       <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/ASAP" />
349 |       <option name="IS_MODULE_SDK" value="true" />
350 |       <option name="ADD_CONTENT_ROOTS" value="true" />
351 |       <option name="ADD_SOURCE_ROOTS" value="true" />
352 |       <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
353 |       <option name="SCRIPT_NAME" value="$PROJECT_DIR$/ASAP/S_SequenceInRegion.py" />
354 |       <option name="PARAMETERS" value="" />
355 |       <option name="SHOW_COMMAND_LINE" value="false" />
356 |       <option name="EMULATE_TERMINAL" value="false" />
357 |       <option name="MODULE_MODE" value="false" />
358 |       <option name="REDIRECT_INPUT" value="false" />
359 |       <option name="INPUT_FILE" value="" />
360 |       <method v="2" />
361 |     </configuration>
362 |     <configuration name="SequenceAndFeatureAnalysis" type="PythonConfigurationType" factoryName="Python" temporary="true">
363 |       <module name="ASAP-1.0" />
364 |       <option name="INTERPRETER_OPTIONS" value="" />
365 |       <option name="PARENT_ENVS" value="true" />
366 |       <envs>
367 |         <env name="PYTHONUNBUFFERED" value="1" />
368 |       </envs>
369 |       <option name="SDK_HOME" value="" />
370 |       <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/ASAP" />
371 |       <option name="IS_MODULE_SDK" value="true" />
372 |       <option name="ADD_CONTENT_ROOTS" value="true" />
373 |       <option name="ADD_SOURCE_ROOTS" value="true" />
374 |       <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
375 |       <option name="SCRIPT_NAME" value="$PROJECT_DIR$/ASAP/SequenceAndFeatureAnalysis.py" />
376 |       <option name="PARAMETERS" value="" />
377 |       <option name="SHOW_COMMAND_LINE" value="false" />
378 |       <option name="EMULATE_TERMINAL" value="false" />
379 |       <option name="MODULE_MODE" value="false" />
380 |       <option name="REDIRECT_INPUT" value="false" />
381 |       <option name="INPUT_FILE" value="" />
382 |       <method v="2" />
383 |     </configuration>
384 |     <configuration name="blast_clust" type="PythonConfigurationType" factoryName="Python" temporary="true">
385 |       <module name="ASAP-1.0" />
386 |       <option name="INTERPRETER_OPTIONS" value="" />
387 |       <option name="PARENT_ENVS" value="true" />
388 |       <envs>
389 |         <env name="PYTHONUNBUFFERED" value="1" />
390 |       </envs>
391 |       <option name="SDK_HOME" value="" />
392 |       <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
393 |       <option name="IS_MODULE_SDK" value="true" />
394 |       <option name="ADD_CONTENT_ROOTS" value="true" />
395 |       <option name="ADD_SOURCE_ROOTS" value="true" />
396 |       <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
397 |       <option name="SCRIPT_NAME" value="$PROJECT_DIR$/blast_clust.py" />
398 |       <option name="PARAMETERS" value="" />
399 |       <option name="SHOW_COMMAND_LINE" value="false" />
400 |       <option name="EMULATE_TERMINAL" value="false" />
401 |       <option name="MODULE_MODE" value="false" />
402 |       <option name="REDIRECT_INPUT" value="false" />
403 |       <option name="INPUT_FILE" value="" />
404 |       <method v="2" />
405 |     </configuration>
406 |     <configuration name="clean_data" type="PythonConfigurationType" factoryName="Python" temporary="true">
407 |       <module name="ASAP-1.0" />
408 |       <option name="INTERPRETER_OPTIONS" value="" />
409 |       <option name="PARENT_ENVS" value="true" />
410 |       <envs>
411 |         <env name="PYTHONUNBUFFERED" value="1" />
412 |       </envs>
413 |       <option name="SDK_HOME" value="" />
414 |       <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
415 |       <option name="IS_MODULE_SDK" value="true" />
416 |       <option name="ADD_CONTENT_ROOTS" value="true" />
417 |       <option name="ADD_SOURCE_ROOTS" value="true" />
418 |       <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
419 |       <option name="SCRIPT_NAME" value="$PROJECT_DIR$/clean_data.py" />
420 |       <option name="PARAMETERS" value="" />
421 |       <option name="SHOW_COMMAND_LINE" value="false" />
422 |       <option name="EMULATE_TERMINAL" value="false" />
423 |       <option name="MODULE_MODE" value="false" />
424 |       <option name="REDIRECT_INPUT" value="false" />
425 |       <option name="INPUT_FILE" value="" />
426 |       <method v="2" />
427 |     </configuration>
428 |     <configuration default="true" type="js.build_tools.gulp" factoryName="Gulp.js">
429 |       <node-interpreter>project</node-interpreter>
430 |       <node-options />
431 |       <gulpfile />
432 |       <tasks />
433 |       <arguments />
434 |       <envs />
435 |       <method v="2" />
436 |     </configuration>
437 |     <configuration default="true" type="tests" factoryName="Unittests">
438 |       <module name="ASAP-1.0" />
439 |       <option name="INTERPRETER_OPTIONS" value="" />
440 |       <option name="PARENT_ENVS" value="true" />
441 |       <option name="SDK_HOME" value="" />
442 |       <option name="WORKING_DIRECTORY" value="" />
443 |       <option name="IS_MODULE_SDK" value="false" />
444 |       <option name="ADD_CONTENT_ROOTS" value="true" />
445 |       <option name="ADD_SOURCE_ROOTS" value="true" />
446 |       <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
447 |       <option name="_new_additionalArguments" value="&quot;&quot;" />
448 |       <option name="_new_target" value="&quot;.&quot;" />
449 |       <option name="_new_targetType" value="&quot;PATH&quot;" />
450 |       <method v="2" />
451 |     </configuration>
452 |     <list>
453 |       <item itemvalue="Python.SequenceAndFeatureAnalysis" />
454 |       <item itemvalue="Python.S_SequenceInRegion (1)" />
455 |       <item itemvalue="Python.clean_data" />
456 |       <item itemvalue="Python.blast_clust" />
457 |       <item itemvalue="Python.FeatureExtraction" />
458 |     </list>
459 |     <recent_temporary>
460 |       <list>
461 |         <item itemvalue="Python.SequenceAndFeatureAnalysis" />
462 |         <item itemvalue="Python.FeatureExtraction" />
463 |         <item itemvalue="Python.S_SequenceInRegion (1)" />
464 |         <item itemvalue="Python.blast_clust" />
465 |         <item itemvalue="Python.clean_data" />
466 |       </list>
467 |     </recent_temporary>
468 |   </component>
469 |   <component name="SvnConfiguration">
470 |     <configuration />
471 |   </component>
472 |   <component name="TaskManager">
473 |     <task active="true" id="Default" summary="Default task">
474 |       <changelist id="c7bdd638-3cea-4080-984e-c663a79bf139" name="Default" comment="" />
475 |       <created>1532624332261</created>
476 |       <option name="number" value="Default" />
477 |       <option name="presentableId" value="Default" />
478 |       <updated>1532624332261</updated>
479 |     </task>
480 |     <servers />
481 |   </component>
482 |   <component name="ToolWindowManager">
483 |     <frame x="0" y="0" width="1440" height="900" extended-state="0" />
484 |     <editor active="true" />
485 |     <layout>
486 |       <window_info content_ui="combo" id="Project" order="0" visible="true" weight="0.33404863" />
487 |       <window_info id="Structure" order="1" weight="0.25" />
488 |       <window_info id="Favorites" order="2" side_tool="true" />
489 |       <window_info anchor="bottom" id="Message" order="0" />
490 |       <window_info anchor="bottom" id="Find" order="1" weight="0.32891566" />
491 |       <window_info anchor="bottom" id="Run" order="2" weight="0.23012048" />
492 |       <window_info anchor="bottom" id="Debug" order="3" weight="0.4" />
493 |       <window_info anchor="bottom" id="Cvs" order="4" weight="0.25" />
494 |       <window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
495 |       <window_info anchor="bottom" id="TODO" order="6" />
496 |       <window_info anchor="bottom" id="Version Control" order="7" />
497 |       <window_info active="true" anchor="bottom" id="Terminal" order="8" visible="true" weight="0.31084338" />
498 |       <window_info anchor="bottom" id="Python Console" order="9" weight="0.32891566" />
499 |       <window_info anchor="bottom" id="Event Log" order="10" side_tool="true" />
500 |       <window_info anchor="bottom" id="Docker" order="11" show_stripe_button="false" />
501 |       <window_info anchor="bottom" id="Database Changes" order="12" show_stripe_button="false" />
502 |       <window_info anchor="bottom" id="Concurrent Activities Diagram" order="13" weight="0.32891566" />
503 |       <window_info anchor="right" id="Commander" order="0" weight="0.4" />
504 |       <window_info anchor="right" id="Ant Build" order="1" weight="0.25" />
505 |       <window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
506 |       <window_info anchor="right" id="Data View" order="3" />
507 |       <window_info anchor="right" id="Database" order="4" />
508 |       <window_info anchor="right" id="SciView" order="5" weight="0.3297568" />
509 |     </layout>
510 |     <layout-to-restore>
511 |       <window_info active="true" content_ui="combo" id="Project" order="0" visible="true" weight="0.2546495" />
512 |       <window_info id="Structure" order="1" weight="0.25" />
513 |       <window_info id="Favorites" order="2" side_tool="true" />
514 |       <window_info anchor="bottom" id="Docker" order="0" show_stripe_button="false" />
515 |       <window_info anchor="bottom" id="Database Changes" order="1" show_stripe_button="false" />
516 |       <window_info anchor="bottom" id="Message" order="2" />
517 |       <window_info anchor="bottom" id="Find" order="3" />
518 |       <window_info anchor="bottom" id="Run" order="4" visible="true" weight="0.33734939" />
519 |       <window_info anchor="bottom" id="Debug" order="5" weight="0.4" />
520 |       <window_info anchor="bottom" id="Cvs" order="6" weight="0.25" />
521 |       <window_info anchor="bottom" id="Inspection" order="7" weight="0.4" />
522 |       <window_info anchor="bottom" id="TODO" order="8" />
523 |       <window_info anchor="bottom" id="Version Control" order="9" />
524 |       <window_info anchor="bottom" id="Terminal" order="10" />
525 |       <window_info anchor="bottom" id="Python Console" order="11" />
526 |       <window_info anchor="bottom" id="Event Log" order="12" side_tool="true" />
527 |       <window_info anchor="right" id="SciView" order="0" weight="0.3297568" />
528 |       <window_info anchor="right" id="Commander" order="1" weight="0.4" />
529 |       <window_info anchor="right" id="Ant Build" order="2" weight="0.25" />
530 |       <window_info anchor="right" content_ui="combo" id="Hierarchy" order="3" weight="0.25" />
531 |       <window_info anchor="right" id="Data View" order="4" />
532 |       <window_info anchor="right" id="Database" order="5" />
533 |     </layout-to-restore>
534 |   </component>
535 |   <component name="TypeScriptGeneratedFilesManager">
536 |     <option name="version" value="1" />
537 |   </component>
538 |   <component name="VcsContentAnnotationSettings">
539 |     <option name="myLimit" value="2678400000" />
540 |   </component>
541 |   <component name="editorHistoryManager">
542 |     <entry file="file://$PROJECT_DIR$/requirements.txt">
543 |       <provider selected="true" editor-type-id="text-editor" />
544 |     </entry>
545 |     <entry file="file://$PROJECT_DIR$/testCase-MMP/data/reference-PDB/dh_chothia_heavy.txt" />
546 |     <entry file="file://$PROJECT_DIR$/testCase-MMP/data/reference-PDB/onlyheavy.py" />
547 |     <entry file="file://$PROJECT_DIR$/Features.csv" />
548 |     <entry file="file://$USER_HOME$/Downloads/BoxDetection-master/box_detection.py" />
549 |     <entry file="file://$PROJECT_DIR$/IGHV-23_combined_F.txt" />
550 |     <entry file="file://$PROJECT_DIR$/results/Except pI Features_ROC.png" />
551 |     <entry file="file://$PROJECT_DIR$/testCase-MMP/data/IGHV/p1_chothia.txt" />
552 |     <entry file="file://$PROJECT_DIR$/testCase-MMP/data/IGHV/p1_chothia_IGHV.txt" />
553 |     <entry file="file://$PROJECT_DIR$/onlyIGHV.py" />
554 |     <entry file="file://$PROJECT_DIR$/testCase-MMP/data/targeting-MMP/p2_chothia_IGHV.txt" />
555 |     <entry file="file://$PROJECT_DIR$/testCase-MMP/data/targeting-MMP/p1_chothia_IGHV.txt" />
556 |     <entry file="file://$PROJECT_DIR$/testCase-MMP/data/targeting-MMP/p3_chothia_IGHV.txt" />
557 |     <entry file="file://$PROJECT_DIR$/testCase-MMP/data/targeting-MMP/p4_chothia_IGHV.txt" />
558 |     <entry file="file://$PROJECT_DIR$/testCase-MMP/data/targeting-MMP/p5_chothia_IGHV.txt" />
559 |     <entry file="file://$PROJECT_DIR$/testCase-MMP/data/targeting-MMP/p8_chothia_IGHV.txt" />
560 |     <entry file="file://$PROJECT_DIR$/testCase-MMP/data/targeting-MMP/p7_chothia_IGHV.txt" />
561 |     <entry file="file://$PROJECT_DIR$/testCase-MMP/data/targeting-MMP/p6_chothia_IGHV.txt" />
562 |     <entry file="file://$PROJECT_DIR$/results/Only Frequent Positional Motif Features_ROC.png" />
563 |     <entry file="file://$PROJECT_DIR$/results/Only Germline Features_ROC.png" />
564 |     <entry file="file://$PROJECT_DIR$/results/Only pI Features_ROC.png" />
565 |     <entry file="file://$PROJECT_DIR$/results/Only CDR Canonical Structure Features_ROC.png" />
566 |     <entry file="file://$PROJECT_DIR$/testCase-MMP/data/targeting-MMP/p6_chothia.txt" />
567 |     <entry file="file://$PROJECT_DIR$/results/Light Chain Sequences.png" />
568 |     <entry file="file://$PROJECT_DIR$/results/Extracted Features.png" />
569 |     <entry file="file://$PROJECT_DIR$/results/Heavy Chain Sequences.png" />
570 |     <entry file="file://$PROJECT_DIR$/results/All Features Included_ROC.png" />
571 |     <entry file="file://$PROJECT_DIR$/results/DTreeAllFeature.png" />
572 |     <entry file="file://$PROJECT_DIR$/results/Except Frequent Positional Motif Features_ROC.png" />
573 |     <entry file="file://$PROJECT_DIR$/results/Except Germline Features_ROC.png" />
574 |     <entry file="file://$PROJECT_DIR$/results/Except CDR Canonical Structure Features_ROC.png" />
575 |     <entry file="file://$PROJECT_DIR$/S_SequenceInRegion.py" />
576 |     <entry file="file://$PROJECT_DIR$/NonNaiveData/toy.fasta.txt" />
577 |     <entry file="file://$PROJECT_DIR$/NonNaiveData/485_mod.fasta" />
578 |     <entry file="file://$PROJECT_DIR$/NonNaiveData/502-503_all_paired_consensus_seqs.fasta" />
579 |     <entry file="file://$PROJECT_DIR$/results/IGHV_Sequence_region.csv" />
580 |     <entry file="file://$PROJECT_DIR$/ASAP/correlation.csv" />
581 |     <entry file="file://$PROJECT_DIR$/blast_clust.py">
582 |       <provider selected="true" editor-type-id="text-editor">
583 |         <state relative-caret-position="451">
584 |           <caret line="49" column="24" selection-start-line="49" selection-start-column="24" selection-end-line="49" selection-end-column="24" />
585 |         </state>
586 |       </provider>
587 |     </entry>
588 |     <entry file="file://$PROJECT_DIR$/clean_data.py">
589 |       <provider selected="true" editor-type-id="text-editor">
590 |         <state relative-caret-position="2090">
591 |           <caret line="110" column="12" selection-start-line="110" selection-start-column="12" selection-end-line="110" selection-end-column="12" />
592 |         </state>
593 |       </provider>
594 |     </entry>
595 |     <entry file="file://$PROJECT_DIR$/testCase/IGHV/reference-IGHV/r1_chothia_mod.txt">
596 |       <provider selected="true" editor-type-id="text-editor">
597 |         <state>
598 |           <caret column="9" selection-start-column="9" selection-end-column="9" />
599 |         </state>
600 |       </provider>
601 |     </entry>
602 |     <entry file="file://$USER_HOME$/anaconda3/envs/homework/lib/python3.6/site-packages/pandas/core/frame.py">
603 |       <provider selected="true" editor-type-id="text-editor">
604 |         <state relative-caret-position="163">
605 |           <caret line="2601" column="8" selection-start-line="2601" selection-start-column="8" selection-end-line="2601" selection-end-column="8" />
606 |         </state>
607 |       </provider>
608 |     </entry>
609 |     <entry file="file://$USER_HOME$/anaconda3/envs/homework/lib/python3.6/site-packages/sklearn/metrics/pairwise.py">
610 |       <provider selected="true" editor-type-id="text-editor">
611 |         <state relative-caret-position="-33">
612 |           <caret line="1140" column="4" selection-start-line="1140" selection-start-column="4" selection-end-line="1140" selection-end-column="4" />
613 |         </state>
614 |       </provider>
615 |     </entry>
616 |     <entry file="file://$USER_HOME$/anaconda3/envs/homework/lib/python3.6/site-packages/numpy/lib/twodim_base.py">
617 |       <provider selected="true" editor-type-id="text-editor">
618 |         <state relative-caret-position="1">
619 |           <caret line="413" column="4" selection-start-line="413" selection-start-column="4" selection-end-line="413" selection-end-column="4" />
620 |         </state>
621 |       </provider>
622 |     </entry>
623 |     <entry file="file://$USER_HOME$/anaconda3/envs/homework/lib/python3.6/site-packages/numpy/core/numeric.py">
624 |       <provider selected="true" editor-type-id="text-editor">
625 |         <state relative-caret-position="67">
626 |           <caret line="155" column="4" selection-start-line="155" selection-start-column="4" selection-end-line="155" selection-end-column="4" />
627 |         </state>
628 |       </provider>
629 |     </entry>
630 |     <entry file="file://$USER_HOME$/anaconda3/envs/homework/lib/python3.6/site-packages/scipy/stats/__init__.py">
631 |       <provider selected="true" editor-type-id="text-editor">
632 |         <state relative-caret-position="-199" />
633 |       </provider>
634 |     </entry>
635 |     <entry file="file://$USER_HOME$/anaconda3/envs/homework/lib/python3.6/site-packages/sklearn/ensemble/weight_boosting.py">
636 |       <provider selected="true" editor-type-id="text-editor">
637 |         <state relative-caret-position="299">
638 |           <caret line="244" column="15" selection-start-line="244" selection-start-column="15" selection-end-line="244" selection-end-column="15" />
639 |         </state>
640 |       </provider>
641 |     </entry>
642 |     <entry file="file://$USER_HOME$/anaconda3/envs/homework/lib/python3.6/site-packages/sklearn/ensemble/forest.py">
643 |       <provider selected="true" editor-type-id="text-editor">
644 |         <state relative-caret-position="404">
645 |           <caret line="1476" column="28" selection-start-line="1476" selection-start-column="28" selection-end-line="1476" selection-end-column="28" />
646 |         </state>
647 |       </provider>
648 |     </entry>
649 |     <entry file="file://$PROJECT_DIR$/ASAP/DesignRecommendation.py">
650 |       <provider selected="true" editor-type-id="text-editor">
651 |         <state relative-caret-position="171">
652 |           <caret line="11" selection-start-line="11" selection-end-line="11" />
653 |           <folding>
654 |             <element signature="e#0#18#0" expanded="true" />
655 |           </folding>
656 |         </state>
657 |       </provider>
658 |     </entry>
659 |     <entry file="file://$PROJECT_DIR$/ASAP/S_SequenceInRegion.py">
660 |       <provider selected="true" editor-type-id="text-editor">
661 |         <state relative-caret-position="5282">
662 |           <caret line="283" selection-start-line="283" selection-end-line="283" />
663 |           <folding>
664 |             <element signature="e#0#29#0" expanded="true" />
665 |           </folding>
666 |         </state>
667 |       </provider>
668 |     </entry>
669 |     <entry file="file://$PROJECT_DIR$/ASAP/SequenceAndFeatureAnalysis.py">
670 |       <provider selected="true" editor-type-id="text-editor">
671 |         <state relative-caret-position="-1503">
672 |           <caret line="19" column="14" selection-start-line="19" selection-start-column="14" selection-end-line="19" selection-end-column="14" />
673 |           <folding>
674 |             <element signature="e#0#13#0" expanded="true" />
675 |           </folding>
676 |         </state>
677 |       </provider>
678 |     </entry>
679 |     <entry file="file://$PROJECT_DIR$/ASAP/FeatureExtraction.py">
680 |       <provider selected="true" editor-type-id="text-editor">
681 |         <state relative-caret-position="171">
682 |           <caret line="11" selection-start-line="11" selection-end-line="11" />
683 |           <folding>
684 |             <element signature="e#0#29#0" expanded="true" />
685 |           </folding>
686 |         </state>
687 |       </provider>
688 |     </entry>
689 |   </component>
690 | </project>


--------------------------------------------------------------------------------
/ASAP/SequenceAndFeatureAnalysis.py:
--------------------------------------------------------------------------------
  1 | import random
  2 | from matplotlib import rc, rcParams
  3 | import matplotlib.pyplot as plt
  4 | import scipy.stats as sta
  5 | from sklearn.ensemble import ExtraTreesClassifier
  6 | from sklearn import svm
  7 | from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
  8 | from scipy import interp
  9 | from sklearn.metrics import roc_curve, auc
 10 | import numpy as np
 11 | import pandas as pd
 12 | 
# Seed NumPy's global random generator so the NumPy-based sampling in this
# module (np.random.shuffle / np.random.randint) is reproducible.
np.random.seed(8)

# Path of the BLOSUM62 substitution matrix CSV loaded by ReadBLOSUM().
BLOSUM62_DIRECT = "./data/blosum62.csv"

# Data-set name: prefix of the files written under ./results/ and selector for
# the heat-map tick labels in Draw_heatmap().
SET_NAME = 'MMP-cluster'
# When True, the light-chain ('L') computations are skipped.
IF_ONLY_HEAVY = False
# Number of leading data sets treated as reference data (class label 0).
CNT_DB = 2
# Number of targeting groups laid out on the heat map after the reference blocks.
CNT_TARGET = 1
# NOTE(review): the two paths below are not read by the code visible in this
# part of the file; presumably they locate the input sequences -- confirm.
REFERENCE_PATH_TESTCASE = './testCase/MMP-cluster/reference-PDB/'
TARGETING_PATH_TESTCASE = './testCase/MMP-cluster/targeting-MMP/'
# NOTE(review): not referenced in the visible code; presumably the number of
# targeting sequences to sample -- confirm against the caller.
TARGET_DESIRE_SIZE = 166 #44 #MMP-cluster
 24 | 
 25 | 
 26 | 
 27 | #################################################################################################################
 28 | #  function DuplicateSelectFeature:
 29 | #  Sample each dataset without replacement when it is large enough, otherwise with replacement
 30 | # 
 31 | #  Input:   size, DatasetName, DatasetSize, AllFeatureVectors
 32 | #  Output: 1. X_DS
 33 | #          2. Y_DS
 34 | #          3. SeqName_DS
 35 | #################################################################################################################
 36 | def DuplicateSelectFeature(size, DatasetName, DatasetSize, AllFeatureVectors):
 37 |     X_DS = []
 38 |     Y_DS = []
 39 |     SeqName_DS =[]
 40 |     previous = 0
 41 |     for i in range(len(DatasetSize)):
 42 |         if i < CNT_DB:
 43 |             actual_size = int(size * CNT_TARGET / CNT_DB)
 44 |         else:
 45 |             actual_size = size
 46 | 
 47 | 
 48 |         if actual_size <= DatasetSize[i]:
 49 |             shuffle_x = np.array([sh_i for sh_i in range(DatasetSize[i])])
 50 |             np.random.shuffle(shuffle_x)
 51 |             for j in range(actual_size):
 52 |                 # idx = np.random.randint(DatasetSize[i])
 53 |                 idx = shuffle_x[j]
 54 |                 SeqName_DS.append(DatasetName[i]+'_'+str(idx))
 55 |                 X_DS.append(AllFeatureVectors[previous+idx])
 56 |                 if i < CNT_DB:
 57 |                     Y_DS.append(0)
 58 |                 else:
 59 |                     Y_DS.append(1)
 60 |             previous += DatasetSize[i]
 61 | 
 62 |         else:
 63 |             for j in range(actual_size):
 64 |                 idx = np.random.randint(DatasetSize[i])
 65 |                 SeqName_DS.append(DatasetName[i]+'_'+str(idx))
 66 |                 X_DS.append(AllFeatureVectors[previous+idx])
 67 |                 if i < CNT_DB:
 68 |                     Y_DS.append(0)
 69 |                 else:
 70 |                     Y_DS.append(1)
 71 |             previous += DatasetSize[i]
 72 |     return X_DS, Y_DS, SeqName_DS
 73 | 
 74 | #################################################################################################################
 75 | #  function IterationDuplicateSelectFeature:
 76 | #  Iteratively sample with replacement
 77 | # 
 78 | #  Input:   DatasetName, DatasetSize, AllFeatureVectors
 79 | #  Output: 1. X_DS
 80 | #          2. Y_DS
 81 | #          3. SeqName_DS
 82 | #################################################################################################################
 83 | def IterationDuplicateSelectFeature(size, iterate, DatasetName, DatasetSize, AllFeatureVectors):
 84 |     X_IDS = [[] for i in range(iterate)]
 85 |     Y_IDS = [[] for i in range(iterate)]
 86 |     SeqName_IDS = [[] for i in range(iterate)]
 87 |     for i in range(iterate):
 88 |         X_DS, Y_DS, SeqName_DS = DuplicateSelectFeature(size, DatasetName, DatasetSize, AllFeatureVectors)
 89 |         X_IDS[i] = X_DS
 90 |         Y_IDS[i] = Y_DS
 91 |         SeqName_IDS[i] = SeqName_DS
 92 |     return X_IDS, Y_IDS, SeqName_IDS
 93 | 
 94 | 
 95 | #################################################################################################################
 96 | #  function normalize:
 97 | #  Normalize the distance matrix
 98 | # 
 99 | #  Input:  dist
100 | #  Output: 1. tmp_dist
101 | #################################################################################################################
102 | 
def normalize(dist):
    """Min-max scale ``dist`` so that its values lie in [0, 1].

    Args:
        dist: numeric array-like exposing ``min()`` and ``max()`` (e.g. a
            NumPy array).

    Returns:
        ``(dist - min) / (max - min)``.  When every entry is equal the range
        is zero; the result is then all zeros instead of the NaN matrix that
        a division by zero would produce.
    """
    tmp_min = dist.min()
    span = dist.max() - tmp_min
    if span == 0:
        # Constant input: dist - tmp_min is already all zeros, so dividing by
        # one keeps the zeros (as floats) and avoids 0/0.
        span = 1
    return (dist - tmp_min) / span
108 | 
109 | #################################################################################################################
110 | #  function Draw_heatmap:
111 | #  Draw heatmap according to the input distance matrix 
112 | # 
113 | #  Input:  dist, name
114 | #################################################################################################################
def Draw_heatmap(size, dist, name, DatasetSize):
    """Render a normalized distance matrix as a heat map and save it as PNG.

    The axes are divided into ``CNT_DB`` reference blocks of
    ``size * (CNT_TARGET / CNT_DB)`` rows each, followed by ``CNT_TARGET``
    targeting blocks of ``size`` rows each.  Tick labels are centred on the
    blocks and black lines mark the block boundaries; the line closing the
    last reference block is drawn thicker.

    Args:
        size: number of rows of one targeting block.
        dist: square matrix to plot; it is passed through normalize() first.
        name: plot title, also used in the output file name.
        DatasetSize: not used by this function.

    The figure is written to ``./results/<SET_NAME>_<name>.png``.
    """
    rc('font', size=20)
    # ticks bold
    rcParams['text.latex.preamble'] = [r'\usepackage{sfmath} \boldmath']
    fig, ax = plt.subplots(figsize=(10, 7))

    # Heat map with a colour bar ticked every 0.2.
    plt.imshow(normalize(dist), cmap='Blues', interpolation='nearest')
    plt.colorbar(ticks=np.linspace(0.0, 1.0, 6, endpoint=True))
    plt.title(name, y=1.08)

    N_DB = size * (CNT_TARGET / CNT_DB)
    N_TARGET = size
    targets_start = CNT_DB * N_DB  # first row of the targeting blocks

    # Tick positions: the centre of every block, reference blocks first.
    centres = [(i + 0.5) * N_DB for i in range(CNT_DB)]
    centres += [(i + 0.5) * N_TARGET + targets_start for i in range(CNT_TARGET)]

    numbered = [str(i + 1) for i in range(CNT_TARGET)]
    if SET_NAME == 'IGHV':
        labels = ['Reference', 'MMP-targeting']
    elif SET_NAME == 'MMP':
        labels = ['Human', 'Murine', 'MMP-targeting']
    elif SET_NAME == 'MMP-cluster':
        labels = ['Human', 'Murine'] + numbered
    else:
        # 'DEKOSKY' and any other set name share the same labels.
        labels = ['Naive 1', 'Naive 2'] + numbered
    plt.xticks(centres, labels)
    plt.yticks(centres, labels)

    ax.xaxis.tick_top()

    # Block boundaries: end of every reference block, then the end of every
    # targeting block except the last one.
    boundaries = [(i + 1) * N_DB for i in range(CNT_DB)]
    boundaries += [(i + 1) * N_TARGET + targets_start for i in range(CNT_TARGET - 1)]

    for position, boundary in enumerate(boundaries):
        # The boundary between reference and targeting data is emphasised.
        width = 3 if position == CNT_DB - 1 else 1
        ax.axhline(boundary, linestyle='-', color='black', linewidth=width)
        ax.axvline(boundary, linestyle='-', color='black', linewidth=width)

    fig.savefig('./results/' + SET_NAME + '_' + name + '.png')
171 | 
172 | ######################  Section 3.1 Sequence and feature similarity analysis (Heat map) ##########################
173 | 
174 | #################################################################################################################
175 | #  function ReadBLOSUM:
176 | #  Read in the BLOSUM 62 substitution matrix
177 | # 
178 | #  Output: 1. BLOSUM, a dictionary of pairwise permutation
179 | #################################################################################################################
def ReadBLOSUM():
    """Load the substitution matrix stored at ``BLOSUM62_DIRECT``.

    The first CSV row supplies the residue names; row ``i + 1`` supplies the
    scores of residue ``i`` against every residue, in header order.

    Returns:
        Dict mapping the concatenation of two residue names (row name first)
        to the score exactly as read from the file, i.e. as a string.
    """
    with open(BLOSUM62_DIRECT, "r") as fi:
        rows = [line.strip().split(',') for line in fi.readlines()]

    header = rows[0]
    BLOSUM = {}
    for row_idx, row_name in enumerate(header):
        scores = rows[row_idx + 1]
        for col_idx, col_name in enumerate(header):
            BLOSUM[row_name + col_name] = scores[col_idx]
    return BLOSUM
192 | 
193 | #################################################################################################################
194 | #  function CalBLOSUM:
195 | #  Calculate the sequence similarity for each sequence.
196 | # 
197 | #  Input:  SeqName_DS, Amino, Num, BLOSUM, chain
198 | #  Output: 1. dist
199 | #################################################################################################################
def CalBLOSUM(SeqName_DS, Amino, Num, BLOSUM, chain):
    """Score every pair of sequences with a substitution matrix.

    Sequences are compared position by position using their numbering in
    ``Num``.  A position present in both sequences scores
    ``int(BLOSUM[res1 + res2])``; a position present in only one sequence, or
    a residue pair missing from ``BLOSUM``, scores -4.

    Args:
        SeqName_DS: sequence names; duplicates are allowed.
        Amino: ``Amino[chain][name]`` is the indexable residue sequence.
        Num: ``Num[chain][name][k]`` is the position label of residue ``k``.
        BLOSUM: dict keyed by two concatenated residues (see ReadBLOSUM).
        chain: chain key used to index ``Amino`` and ``Num``.

    Returns:
        ``len(SeqName_DS) x len(SeqName_DS)`` float array where entry
        ``[i][j]`` is the score of sequence ``i`` against sequence ``j``.
    """
    penalty = -4  # score for an unmatched position or an unknown residue pair
    n_seq = len(SeqName_DS)
    dist = np.zeros((n_seq, n_seq))

    # Build each sequence's position -> residue map once, not once per pair.
    residues_at = {}
    for seq_name in SeqName_DS:
        if seq_name not in residues_at:
            residues = Amino[chain][seq_name]
            positions = Num[chain][seq_name]
            residues_at[seq_name] = {
                positions[k]: residues[k] for k in range(len(residues))
            }

    for i, s1 in enumerate(SeqName_DS):
        seq1 = residues_at[s1]
        for j, s2 in enumerate(SeqName_DS):
            seq2 = residues_at[s2]
            score = 0
            for pos, res1 in seq1.items():
                if pos in seq2 and (res1 + seq2[pos]) in BLOSUM:
                    score += int(BLOSUM[res1 + seq2[pos]])
                else:
                    score += penalty
            # Positions that only the second sequence has.
            score += penalty * sum(1 for pos in seq2 if pos not in seq1)
            dist[i][j] = score
    return dist
221 | 
222 | #################################################################################################################
223 | #  function CalBLOSUMVAR:
224 | #  Calculate the sequence similarity for each sequence only on non-constant region.
225 | #
226 | #  Input:  SeqName_DS, Amino, Num, BLOSUM, chain
227 | #  Output: 1. dist
228 | #################################################################################################################
def CalBLOSUMVAR(SeqName_DS, Amino, Num, BLOSUM, chain):
    """Score every pair of sequences with a substitution matrix.

    NOTE(review): the banner above this function says only the non-constant
    region is scored, but the code applies no region filter and computes the
    same result as CalBLOSUM -- confirm which behaviour is intended.

    A position present in both sequences scores ``int(BLOSUM[res1 + res2])``;
    a position present in only one sequence, or a residue pair missing from
    ``BLOSUM``, scores -4.

    Args:
        SeqName_DS: sequence names; duplicates are allowed.
        Amino: ``Amino[chain][name]`` is the indexable residue sequence.
        Num: ``Num[chain][name][k]`` is the position label of residue ``k``.
        BLOSUM: dict keyed by two concatenated residues (see ReadBLOSUM).
        chain: chain key used to index ``Amino`` and ``Num``.

    Returns:
        ``len(SeqName_DS) x len(SeqName_DS)`` float array where entry
        ``[i][j]`` is the score of sequence ``i`` against sequence ``j``.
    """
    penalty = -4  # score for an unmatched position or an unknown residue pair
    n_seq = len(SeqName_DS)
    dist = np.zeros((n_seq, n_seq))

    # Build each sequence's position -> residue map once, not once per pair.
    residues_at = {}
    for seq_name in SeqName_DS:
        if seq_name not in residues_at:
            residues = Amino[chain][seq_name]
            positions = Num[chain][seq_name]
            residues_at[seq_name] = {
                positions[k]: residues[k] for k in range(len(residues))
            }

    for i, s1 in enumerate(SeqName_DS):
        seq1 = residues_at[s1]
        for j, s2 in enumerate(SeqName_DS):
            seq2 = residues_at[s2]
            score = 0
            for pos, res1 in seq1.items():
                if pos in seq2 and (res1 + seq2[pos]) in BLOSUM:
                    score += int(BLOSUM[res1 + seq2[pos]])
                else:
                    score += penalty
            # Positions that only the second sequence has.
            score += penalty * sum(1 for pos in seq2 if pos not in seq1)
            dist[i][j] = score
    return dist
252 | 
253 | #################################################################################################################
254 | #  function HeatmapHL:
255 | #  Calculate the heavy and light chain heatmap over multiple iteration, draw the first heatmap
256 | # 
257 | #  Input:  SeqName_IDS, Amino, Num
258 | #  Output: 1. H_Idist
259 | #          2. L_Idist
260 | #################################################################################################################
def HeatmapHL(size, iterate, SeqName_IDS, Amino, Num):
    """Compute the heavy- and light-chain sequence similarity matrices.

    Only the first sampling iteration is processed: the ``iterate`` argument
    is overridden with 1.  For that iteration two matrices are appended per
    chain, the CalBLOSUM result followed by the CalBLOSUMVAR result.

    Args:
        size: not used by this function.
        iterate: ignored (forced to 1).
        SeqName_IDS: per-iteration lists of sequence names.
        Amino, Num: residue and numbering tables forwarded to the scorers.

    Returns:
        ``(H_Idist, L_Idist)``.  ``L_Idist`` stays empty when
        ``IF_ONLY_HEAVY`` is true.
    """
    iterate = 1
    BLOSUM = ReadBLOSUM()
    scorers = (CalBLOSUM, CalBLOSUMVAR)
    H_Idist, L_Idist = [], []
    for round_idx in range(iterate):
        names = SeqName_IDS[round_idx]
        for scorer in scorers:
            H_Idist.append(scorer(names, Amino, Num, BLOSUM, 'H'))
        if not IF_ONLY_HEAVY:
            for scorer in scorers:
                L_Idist.append(scorer(names, Amino, Num, BLOSUM, 'L'))
    return H_Idist, L_Idist
274 | 
275 | #################################################################################################################
276 | #  function HeatmapFeature:
277 | #  Calculate the feature heatmap over multiple iteration, draw the first heatmap
278 | # 
279 | #  Input:  X_IDS, AllFeatureNames, MotifFeatureNames
280 | #  Output: 1. Idist
281 | #################################################################################################################
def HeatmapFeature(size, iterate, X_IDS, AllFeatureNames, MotifFeatureNames):
    """Compute the pairwise feature-similarity matrix of the first sample.

    Only the first sampling iteration is processed: the ``iterate`` argument
    is overridden with 1.  The motif features are taken to be the trailing
    ``len(MotifFeatureNames)`` columns.  The similarity of two vectors is

        (number of non-motif columns where both vectors equal 1)
        + (Jaccard index of the motif columns, 0 when neither has a motif).

    Args:
        size: not used by this function.
        iterate: ignored (forced to 1).
        X_IDS: per-iteration lists of feature vectors.
        AllFeatureNames: names of all feature columns.
        MotifFeatureNames: names of the motif feature columns.

    Returns:
        A one-element list holding the symmetric similarity matrix.
    """
    iterate = 1
    motifStart = len(AllFeatureNames) - len(MotifFeatureNames)
    n_seq = len(X_IDS[0])
    Idist = [np.zeros((n_seq, n_seq)) for _ in range(iterate)]
    for m in range(iterate):
        vectors = X_IDS[m]
        for i, a in enumerate(vectors):
            motif_cols = range(motifStart, len(a))
            # Only the lower triangle is computed; the upper one is mirrored.
            for j in range(i + 1):
                b = vectors[j]

                both = sum(1 for l in motif_cols if a[l] == 1 and b[l] == 1)
                either = sum(1 for l in motif_cols if a[l] == 1 or b[l] == 1)
                jaccard = both / (1.0 * either) if either != 0 else 0

                shared = sum(1 for k in range(0, motifStart)
                             if a[k] == 1 and b[k] == 1)

                # Shared non-motif feature values plus the motif Jaccard score.
                Idist[m][i][j] = shared + jaccard
                Idist[m][j][i] = Idist[m][i][j]
    return Idist
327 | 
328 | ###############################  Section 3.3 Similarity analysis (Statistical test) #############################
329 | 
330 | #################################################################################################################
331 | #  function RankTestBlock:
332 | #  Use the Mann-Whitney test to check whether the hypothesis on the within-set blocks holds
333 | # 
334 | #  Input:  size, dist
335 | #  Output: 1. p_value
336 | #################################################################################################################
def RankTestBlock(size, dist):
    """Rank-sum test between the two diagonal blocks of a square matrix.

    The matrix is split at ``int(len(dist) / 2)``; the top-left block and the
    bottom-right block are flattened and compared with
    ``scipy.stats.ranksums``.

    Args:
        size: not used by this function.
        dist: 2-D NumPy array.

    Returns:
        The ``scipy.stats.ranksums`` result, i.e. a ``(statistic, pvalue)``
        pair -- not the bare p-value.
    """
    stop = int(len(dist) / 2)
    block1 = np.reshape(dist[:stop, :stop], [-1])
    block4 = np.reshape(dist[stop:, stop:], [-1])
    # The former effect-size locals were never used and divided by a zero
    # standard deviation for constant blocks, so they are gone.
    return sta.ranksums(block1, block4)  # , alternative='less')
349 | 
350 | #################################################################################################################
351 | #  function RankTestHeatMap:
352 | #  Use the Mann-Whitney test to check whether the hypothesis on the correlation between heatmaps holds
353 | # 
354 | #  Input:  dist1, dist2
355 | #  Output: 1. p_value
356 | #################################################################################################################
def RankTestHeatMap(dist1, dist2):
    """Rank-sum test between the entries of two matrices.

    Both inputs are flattened and compared with ``scipy.stats.ranksums``.

    Args:
        dist1: first array-like of values.
        dist2: second array-like of values.

    Returns:
        The ``scipy.stats.ranksums`` result, i.e. a ``(statistic, pvalue)``
        pair -- not the bare p-value.
    """
    map1 = np.reshape(dist1, [-1])
    map2 = np.reshape(dist2, [-1])
    # The former effect-size locals were never used and divided by a zero
    # standard deviation for constant maps, so they are gone.
    return sta.ranksums(map1, map2)  # , alternative='less')
368 | 
369 | #################################################################################################################
370 | #  function MultiRankTest:
371 | #  Check if the statistical test for two hypothesis holds over multiple iterations
372 | # 
373 | #  Input:  F_Idist, H_Idist, L_Idist
374 | #################################################################################################################
def MultiRankTest(size, iterate, F_Idist, H_Idist, L_Idist):
    """Run the block rank-sum tests and report whether they are significant.

    Only the first iteration is evaluated: the ``iterate`` argument is
    overridden with 1.  The tests succeed when the p-value of the feature
    matrix, of the heavy-chain matrix and (unless ``IF_ONLY_HEAVY``) of the
    light-chain matrix are all below 0.05; otherwise the individual results
    are printed.

    Args:
        size: forwarded to RankTestBlock.
        iterate: ignored (forced to 1).
        F_Idist: per-iteration feature similarity matrices.
        H_Idist: per-iteration heavy-chain similarity matrices.
        L_Idist: per-iteration light-chain similarity matrices; only read
            when ``IF_ONLY_HEAVY`` is false.
    """
    iterate = 1
    p_value_F, p_value_H, p_value_L, p_value_Diff = [], [], [], []
    for i in range(iterate):
        # Wilcoxon rank-sum test on the diagonal blocks of each matrix.
        p_value_F.append(RankTestBlock(size, F_Idist[i]))
        p_value_H.append(RankTestBlock(size, H_Idist[i]))
        if not IF_ONLY_HEAVY:
            p_value_L.append(RankTestBlock(size, L_Idist[i]))
            p_value_Diff.append(
                RankTestHeatMap(F_Idist[i] - H_Idist[i], F_Idist[i] - L_Idist[i]))
    print(p_value_F[0], p_value_H[0])

    def _largest_pvalue(results):
        # Each result is a (statistic, pvalue) pair.  Only the p-value may be
        # compared with the significance level; taking the maximum over the
        # whole pair wrongly rejected tests with a positive statistic.
        return max(result[1] for result in results)

    significant = (
        _largest_pvalue(p_value_F) < 0.05
        and _largest_pvalue(p_value_H) < 0.05
        and (IF_ONLY_HEAVY or _largest_pvalue(p_value_L) < 0.05)
    )  # and _largest_pvalue(p_value_Diff) < 0.05
    if significant:
        print("Statistical tests (Reference against Targeting) succeed.")
    else:
        print("Statistical tests results (Reference against Targeting):")
        print('Extracted features:', p_value_F[0])
        print('Heavy chain sequence:', p_value_H[0])
        if not IF_ONLY_HEAVY:
            print('Light chain sequence:', p_value_L[0])
            print('Difference between (Feature, Heavy) and (Feature, Light):', p_value_Diff)
398 | 
399 | #######################################  Section 3.4 Salient feature-value analysis  ############################                                     #
400 | 
401 | 
402 | #################################################################################################################
403 | #  function Fisher:
404 | #  Calculate p-value with FET
405 | # 
406 | #  Input:  X_DS, Y_DS, AllFeatureNames
407 | #  Output: 1. contingency_table
408 | #          2. pvalue
409 | #          3. X_DS
410 | #          4. Y_DS
411 | #################################################################################################################
def Fisher(X_DS, Y_DS, AllFeatureNames):
    """Run a one-sided Fisher exact test for every feature column.

    For feature ``i`` the 2x2 table is ``[[a, b], [c, d]]`` where

        a = label-0 samples whose feature equals 1
        b = samples with any other label whose feature equals 1
        c = label-0 samples whose feature differs from 1
        d = samples with any other label whose feature differs from 1

    and the p-value comes from ``scipy.stats.fisher_exact`` with the "less"
    alternative.

    Args:
        X_DS: feature vectors, indexable as ``X_DS[sample][feature]``.
        Y_DS: class label of every sample.
        AllFeatureNames: one name per feature column (only its length and
            order matter here).

    Returns:
        ``(contingency_table, pvalue, X_DS, Y_DS)``; the last two are the
        inputs, returned unchanged.
    """
    contingency_table = []
    for col, _name in enumerate(AllFeatureNames):
        # counts[row][column]: row 0 = feature present, row 1 = absent;
        # column 0 = label 0, column 1 = every other label.
        counts = [[0, 0], [0, 0]]
        for row, label in enumerate(Y_DS):
            present = 0 if X_DS[row][col] == 1 else 1
            group = 0 if label == 0 else 1
            counts[present][group] += 1
        contingency_table.append(counts)

    # "less" is significant in the targeting set, "greater" in the reference set.
    pvalue = [sta.fisher_exact(table, alternative="less")[1]
              for table in contingency_table]
    return contingency_table, pvalue, X_DS, Y_DS
434 | 
435 | #################################################################################################################
436 | #  function Importance:
437 | #  Calculate importance score through feature selection
438 | # 
439 | #  Input:  X_DS, Y_DS, AllFeatureNames
440 | #  Output: 1. importances
441 | #          2. X_DS
442 | #          3. Y_DS
443 | #################################################################################################################
def Importance(X_DS, Y_DS, AllFeatureNames,
               feature_a='Germ_HJ_IGHJ4*02', feature_b='Motif_5_YY'):
    """Compute feature importances and count one feature-pair pattern.

    An ``ExtraTreesClassifier`` is fitted on ``(X_DS, Y_DS)`` and its
    ``feature_importances_`` are returned.  The function also counts, in the
    first and in the second half of the samples separately, how many samples
    have ``feature_a`` equal to 1 and ``feature_b`` equal to 0.

    Args:
        X_DS: feature vectors, one row per sample.
        Y_DS: class label of every sample.
        AllFeatureNames: name of every feature column.
        feature_a: name of the feature that must equal 1.
        feature_b: name of the feature that must equal 0.

    Returns:
        ``(importances, X_DS, Y_DS, sum_ref, sum_tar)`` where ``X_DS`` is the
        input converted to a NumPy array, ``sum_ref`` is the count over the
        first half of the rows and ``sum_tar`` the count over the rest.

    A feature name missing from ``AllFeatureNames`` is treated as a column
    of zeros (no sample carries it) instead of raising ``ValueError``, so
    data sets without the default features can still be processed.
    """
    # X_DS, Y_DS, SeqName_DS = DuplicateSelectFeature(DatasetName, DatasetSize, AllFeatureVectors, size)
    clf_featureSelect = ExtraTreesClassifier()
    clf_featureSelect = clf_featureSelect.fit(X_DS, Y_DS)
    importances = clf_featureSelect.feature_importances_
    X_DS = np.array(X_DS)

    names = list(AllFeatureNames)

    def _column(feature_name):
        # Column of the named feature, or zeros when the feature is absent.
        if feature_name in names:
            return X_DS[:, names.index(feature_name)]
        return np.zeros(len(X_DS))

    pattern = (_column(feature_a) == 1) & (_column(feature_b) == 0)
    half = int(len(X_DS) / 2)
    sum_ref = int(np.sum(pattern[:half]))
    sum_tar = int(np.sum(pattern[half:]))

    return importances, X_DS, Y_DS, sum_ref, sum_tar
470 | 
471 | #################################################################################################################
472 | #  function RankFisherFS:
473 | #  Sort feature values according to FET and feature selection statistics
474 | # 
475 | #  Input:  Fpvalue, importances
476 | #  Output: 1. RankFpvalue
477 | #          2. RankImportance
478 | #################################################################################################################
def RankFisherFS(Fpvalue, importances):
    """Turn p-values and importance scores into 1-based ranks.

    Args:
        Fpvalue: p-values; the smallest value gets rank 1.
        importances: importance scores; the largest value gets rank 1.

    Returns:
        ``(RankFpvalue, RankImportance)``: two lists aligned with the inputs.
        Equal values keep their original relative order.
    """
    def _ranks(values, descending):
        order = sorted(range(len(values)), key=lambda k: values[k],
                       reverse=descending)
        ranks = [-1] * len(values)
        # Real ranks start from 1.
        for position, idx in enumerate(order, start=1):
            ranks[idx] = position
        return ranks

    return _ranks(Fpvalue, False), _ranks(importances, True)
490 | 
491 | #################################################################################################################
492 | #  function WriteFisherFS:
493 | #  Write FET and feature selection results to csv files
494 | # 
495 | #  Input:  Fpvalue, importances, Fpvalue_std, importances_std, RankFpvalue, RankImportance, AllFeatureNames
496 | #################################################################################################################
def WriteFisherFS(Fpvalue, importances, Fpvalue_std, importances_std, RankFpvalue, RankImportance, AllFeatureNames, AllFeatureVectors, DatasetSize):
    """Write the Fisher-test and feature-selection results to a CSV file.

    The file is ``./results/<SET_NAME>_RankFisherAndFS.csv``.  Each feature
    gets one row with its name split into feature and value, its p-value,
    its importance, its p-value rank (only when p < 0.05), its importance
    rank (only when the importance exceeds the mean importance) and its
    frequency, in percent, in the reference and in the targeting rows.

    Args:
        Fpvalue: p-value of every feature.
        importances: importance score of every feature.
        Fpvalue_std, importances_std: not used by this function.
        RankFpvalue: rank of every feature by p-value.
        RankImportance: rank of every feature by importance.
        AllFeatureNames: feature names in the form ``<kind>_<...>``.
        AllFeatureVectors: 2-D NumPy array with the reference rows first.
        DatasetSize: row count of every data set; the first ``CNT_DB``
            entries are reference data sets.
    """
    cnt_db = int(sum(DatasetSize[:CNT_DB]))
    cnt_mmp = int(sum(DatasetSize[CNT_DB:]))
    # Loop-invariant selection threshold, computed once instead of per row.
    importance_threshold = np.mean(importances)

    # The context manager closes the file even if a row fails to format.
    with open('./results/'+SET_NAME+'_RankFisherAndFS.csv', 'w') as fo:
        fo.write('Feature, Feature Value,')
        fo.write('Fisher Test p-value, Feature Selection (thereshold = ' + format(importance_threshold, '.4f') + '),')
        fo.write('Rank of Statistic Significancy, Rank of Feature Selection, ')
        fo.write('Frequency in Reference , Frequency in Targeting \n')

        for i, feature_name in enumerate(AllFeatureNames):
            parts = feature_name.split('_')
            kind = parts[0]
            if kind == 'Germ' or kind == 'Canonical':
                fo.write(kind + ' ' + parts[1])
                fo.write(',' + parts[2] + ',')
            elif kind == 'PI':
                fo.write(kind + ',' + parts[1] + ',')
            elif kind == 'Motif':
                fo.write(kind + ',')
                fo.write(parts[1] + '_' + parts[2] + ',')
            fo.write(str(Fpvalue[i]) + ',')
            fo.write(str(importances[i]) + ',')

            # Ranks are reported only for features passing the criterion.
            if Fpvalue[i] < 0.05:
                fo.write(str(RankFpvalue[i]))
            fo.write(',')
            if importances[i] > importance_threshold:
                fo.write(str(RankImportance[i]))
            fo.write(',')
            fo.write(str('{:.2f}'.format(sum(AllFeatureVectors[:cnt_db, i])/cnt_db * 100)) + '%,')
            fo.write(str('{:.2f}'.format(sum(AllFeatureVectors[cnt_db:, i])/cnt_mmp * 100)) + '%,')

            fo.write('\n')
    print(AllFeatureVectors.shape, cnt_db, cnt_mmp)
533 | 
534 | #################################################################################################################
535 | #  function MultiFisherFS:
536 | #  Average p-values for FET and importance scores for feature select over multiple iterations
537 | # 
538 | #  Input:  DatasetName, DatasetSize, AllFeatureVectors
539 | #################################################################################################################
def MultiFisherFS(iterate, X_IDS, Y_IDS, DatasetName, DatasetSize, AllFeatureVectors, AllFeatureNames):
    """Aggregate Fisher tests and feature importances, then write the CSV.

    The Fisher test is run on every sampling iteration and its p-values and
    p-value ranks are averaged over the iterations.  Feature importances are
    computed once, on all iterations pooled together.  The results are
    passed to WriteFisherFS.

    Args:
        iterate: number of sampling iterations in ``X_IDS`` / ``Y_IDS``.
        X_IDS: per-iteration lists of feature vectors.
        Y_IDS: per-iteration lists of class labels.
        DatasetName: not used by this function.
        DatasetSize: forwarded to WriteFisherFS.
        AllFeatureVectors: forwarded to WriteFisherFS.
        AllFeatureNames: name of every feature column.
    """
    Fpvalue = []
    RankFpvalue = []
    for i in range(iterate):
        _, pvalues, _, _ = Fisher(X_IDS[i], Y_IDS[i], AllFeatureNames)
        # No per-iteration importances are computed, hence the empty list.
        pvalue_ranks, _ = RankFisherFS(pvalues, [])
        Fpvalue.append(pvalues)
        RankFpvalue.append(pvalue_ranks)

    # Pool the samples of every iteration for a single importance fit.
    X_IDS_all = []
    Y_IDS_all = []
    for i in range(iterate):
        X_IDS_all.extend(X_IDS[i])
        Y_IDS_all.extend(Y_IDS[i])

    importances_all, _, _, _, _ = Importance(
        np.array(X_IDS_all), np.array(Y_IDS_all), AllFeatureNames)
    # Rank 1 goes to the most important feature.
    _, RankImportance_all = RankFisherFS([], importances_all)

    Fpvalue_avg = np.mean(Fpvalue, axis=0)
    Fpvalue_std = np.std(Fpvalue, axis=0)
    RankFpvalue_avgR = np.mean(RankFpvalue, axis=0)

    # Fpvalue_std is also passed in the importances_std slot.
    WriteFisherFS(Fpvalue_avg, importances_all, Fpvalue_std, Fpvalue_std,
                  RankFpvalue_avgR, RankImportance_all, AllFeatureNames,
                  AllFeatureVectors, DatasetSize)
579 | 
580 | 
581 | #######################################  Section 3.4 Classification on segments  ################################
#################################################################################################################
#  function calculate_auc:
#  Calculate the AUC of the mean ROC curve over ten-fold cross validation for three algorithms,
#  SVM, random forest, AdaBoost
#
#  Input:  X, Y (at least 10 samples; label 1 is treated as the positive class)
#  Output: 1. auc(mean_fpr, mean_tpr_svm)
#          2. auc(mean_fpr, mean_tpr_rf)
#          3. auc(mean_fpr, mean_tpr_ada)
#  Raises: ValueError when fewer than 10 samples are given (a test fold would be empty)
#################################################################################################################
def calculate_auc(X, Y):
    # List order fixes both the fitting order and the order of the returned AUCs.
    classifiers = [
        svm.SVC(kernel='linear', probability=True, random_state=0),
        RandomForestClassifier(),  # max_depth=5, n_estimators=10, max_features=1
        AdaBoostClassifier(),
    ]

    X = np.array(X)
    Y = np.array(Y)
    n_samples = len(Y)
    if n_samples < 10:
        raise ValueError('ten-fold cross validation needs at least 10 samples, got %d' % n_samples)

    indices = list(range(n_samples))
    random.shuffle(indices)

    # Every fold's ROC curve is resampled onto this common FPR grid so the curves can be averaged.
    mean_fpr = np.linspace(0, 1, 100)
    fold_tprs = [[] for _ in classifiers]

    for fold in range(10):
        start = int(fold * n_samples / 10)
        stop = int((fold + 1) * n_samples / 10)
        test_i = indices[start:stop]
        train_i = indices[:start] + indices[stop:]
        X_train, X_test, Y_train, Y_test = X[train_i], X[test_i], Y[train_i], Y[test_i]

        # Fit all classifiers before scoring any of them, in the same order as before.
        for clf in classifiers:
            clf.fit(X_train, Y_train)

        for clf, tprs in zip(classifiers, fold_tprs):
            fpr, tpr, _ = roc_curve(Y_test, clf.predict_proba(X_test)[:, 1], pos_label=1)
            # np.interp is the function the deprecated scipy `interp` alias pointed to.
            resampled = np.interp(mean_fpr, fpr, tpr)
            resampled[0] = 0.0  # pin the curve to the origin
            tprs.append(resampled)

    aucs = []
    for tprs in fold_tprs:
        mean_tpr = np.mean(tprs, axis=0)
        mean_tpr[-1] = 1.0  # pin the curve to (1, 1)
        aucs.append(auc(mean_fpr, mean_tpr))
    return tuple(aucs)
634 | 
#################################################################################################################
#  function MultiAuc:
#  Run calculate_auc on every iteration's data and print the mean AUC of each of the three
#  classifiers (all features included)
#
#  Input:  iterate, X_IDS, Y_IDS (X_IDS[i] / Y_IDS[i] are the samples and labels of iteration i)
#################################################################################################################
def MultiAuc(iterate, X_IDS, Y_IDS):
    svm_aucs = []
    forest_aucs = []
    adaboost_aucs = []

    for run in range(iterate):
        svm_auc, forest_auc, adaboost_auc = calculate_auc(X_IDS[run], Y_IDS[run])
        svm_aucs.append(svm_auc)
        forest_aucs.append(forest_auc)
        adaboost_aucs.append(adaboost_auc)

    print("Average AUC with all features: ")
    print("SVM\t\t", np.mean(svm_aucs, axis=0))
    print("Random forest\t", np.mean(forest_aucs, axis=0))
    print("AdaBoost\t", np.mean(adaboost_aucs, axis=0))
652 |     
#################################################################################################################
#  function Classify:
#  Classify the reference and targeting set with three algorithms, SVM, random forest, AdaBoost,
#  over ten-fold cross validation, plot the mean ROC curve of each (AUC in the legend) and save the
#  figure to ./results/<SET_NAME>_<roc_name>_ROC.png
#
#  Input:  X, Y, roc_name (at least 10 samples; label 1 is treated as the positive class;
#          roc_name is used as the plot title and as part of the output file name)
#  Raises: ValueError when fewer than 10 samples are given (a test fold would be empty)
#################################################################################################################
def Classify(X, Y, roc_name):
    # (classifier, legend label template, extra line style); list order fixes fit and plot order.
    models = [
        (svm.SVC(kernel='linear', probability=True, random_state=0),
         'SVM (AUC = %0.4f)', dict(color='darkorange', alpha=1)),
        (RandomForestClassifier(),  # max_depth=5, n_estimators=10, max_features=1
         'Random Forest (AUC = %0.4f)', dict(color='green')),
        (AdaBoostClassifier(),
         'AdaBoost (AUC = %0.4f)', dict(color='darkred')),
    ]

    X = np.array(X)
    Y = np.array(Y)
    n_samples = len(Y)
    if n_samples < 10:
        raise ValueError('ten-fold cross validation needs at least 10 samples, got %d' % n_samples)

    indices = list(range(n_samples))
    random.shuffle(indices)

    # Every fold's ROC curve is resampled onto this common FPR grid so the curves can be averaged.
    mean_fpr = np.linspace(0, 1, 100)
    fold_tprs = [[] for _ in models]

    # NOTE(review): the figure is left open after saving, as before; callers that invoke this
    # many times may want to close it themselves.
    plt.figure(figsize=(10, 7))
    lw = 2
    for fold in range(10):
        start = int(fold * n_samples / 10)
        stop = int((fold + 1) * n_samples / 10)
        test_i = indices[start:stop]
        train_i = indices[:start] + indices[stop:]
        X_train, X_test, Y_train, Y_test = X[train_i], X[test_i], Y[train_i], Y[test_i]

        # Fit all classifiers before scoring any of them, in the same order as before.
        for clf, _, _ in models:
            clf.fit(X_train, Y_train)

        for (clf, _, _), tprs in zip(models, fold_tprs):
            fpr, tpr, _ = roc_curve(Y_test, clf.predict_proba(X_test)[:, 1], pos_label=1)
            # np.interp is the function the deprecated scipy `interp` alias pointed to.
            resampled = np.interp(mean_fpr, fpr, tpr)
            resampled[0] = 0.0  # pin the curve to the origin
            tprs.append(resampled)

    for (_, label, style), tprs in zip(models, fold_tprs):
        mean_tpr = np.mean(tprs, axis=0)
        mean_tpr[-1] = 1.0  # pin the curve to (1, 1)
        plt.plot(mean_fpr, mean_tpr, lw=lw, label=label % auc(mean_fpr, mean_tpr), **style)

    # Diagonal reference line.
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(roc_name)
    plt.legend(loc="lower right")
    plt.savefig('./results/'+SET_NAME +'_'+ roc_name + "_ROC.png")
719 | 
#################################################################################################################
#  function ROCDrawing:
#  Draw ROC and report AUC for classification on several feature subsets, always leaving out the
#  feature families treated as correlated (names starting with Germ_HV, Canonical_H1, Canonical_H2,
#  Canonical_L2 or Canonical_L3)
#
#  Input:  X_S, Y_S, GermFeatureNames, CanonFeatureNames, PIFeatureNames, MotifFeatureNames, AllFeatureNames
#          Columns of X_S are taken to be ordered germline, canonical structure, pI, motif; the group
#          boundaries come from the lengths of the first three name lists (MotifFeatureNames is not
#          read; the motif group is whatever follows the pI group)
#################################################################################################################
def ROCDrawing(X_S, Y_S, GermFeatureNames, CanonFeatureNames, PIFeatureNames, MotifFeatureNames, AllFeatureNames):
    # NOTE(review): the original code also assigned a per-dataset list of exact feature names
    # ('Germ_HV_IGHV3-23*01', 'Canonical_L2_0', ...) that was never read; the filter that actually
    # runs is this broader prefix match, applied for every SET_NAME -- confirm this is intended.
    correlated_prefixes = ('Germ_HV', 'Canonical_H2', 'Canonical_L2', 'Canonical_L3', 'Canonical_H1')

    X_S = np.array(X_S)
    n_features = X_S.shape[1]

    def uncorrelated(*column_ranges):
        # Columns of X_S from the given index ranges, minus those with a correlated-family name.
        keep = [j for columns in column_ranges for j in columns
                if not AllFeatureNames[j].startswith(correlated_prefixes)]
        return X_S[:, keep]

    # End offsets (exclusive) of the germline, canonical-structure and pI column groups.
    Germ_E = len(GermFeatureNames)
    Canon_E = Germ_E + len(CanonFeatureNames)
    PI_E = Canon_E + len(PIFeatureNames)

    Classify(uncorrelated(range(n_features)), Y_S,
             'All Features Included (Exclude Correlated)')

    Classify(uncorrelated(range(Germ_E)), Y_S,
             'Only Germline Features (Exclude Correlated)')
    Classify(uncorrelated(range(Germ_E, Canon_E)), Y_S,
             'Only CDR Canonical Structure Features (Exclude Correlated)')

    Classify(uncorrelated(range(Germ_E, n_features)), Y_S,
             'Except Germline Features (Exclude Correlated)')
    # The next two runs used to pass the unfiltered concatenation of X_S slices, silently keeping
    # the correlated features despite the title; they now use the filtered columns like the rest.
    Classify(uncorrelated(range(Germ_E), range(Canon_E, n_features)), Y_S,
             'Except CDR Canonical Structure Features (Exclude Correlated)')
    Classify(uncorrelated(range(Canon_E), range(PI_E, n_features)), Y_S,
             'Except pI Features (Exclude Correlated)')
    Classify(uncorrelated(range(PI_E)), Y_S,
             'Except Frequent Positional Motif Features (Exclude Correlated)')
833 | 
834 |     # Classify(X_S[:,:Germ_E], Y_S, 'Only Germline Features')
835 |     # Classify(X_S[:,Germ_E:Canon_E], Y_S, 'Only CDR Canonical Structure Features')
836 |     # Classify(X_S[:,Canon_E:PI_E], Y_S, 'Only pI Features')
837 |     # Classify(X_S[:,PI_E:], Y_S, 'Only Frequent Positional Motif Features')
838 |     #
839 |     # Classify(X_S[:,Germ_E:], Y_S, 'Except Germline Features')
840 |     # Classify(np.concatenate((X_S[:,:Germ_E],X_S[:,Canon_E:]),axis=1) , Y_S, 'Except CDR Canonical Structure Features')
841 |     # Classify(np.concatenate((X_S[:,:Canon_E],X_S[:,PI_E:]),axis=1), Y_S, 'Except pI Features')
842 |     # Classify(X_S[:,:PI_E], Y_S, 'Except Frequent Positional Motif Features')
843 | 
#################################################################################################################
#  function JaccardCoefficientAnalysis:
#  Compute the Jaccard coefficient between every pair of non-Motif features, separately for the
#  first PDB_size rows of AllFeatureVectors (written as the reference set) and for the remaining rows
#  (written as the MMP-targeting set), and write both to
#  ./results/<SET_NAME>_Jaccard Feature Coefficient.csv
#
#  A feature counts as present in a row when its value equals 1. A pair that is present in no row
#  gets coefficient 0.
#
#  Input:  AllFeatureVectors (2-D, rows = samples, columns = features in AllFeatureNames order),
#          AllFeatureNames, DatasetSize
#  Raises: ValueError when SET_NAME is neither 'MMP-cluster' nor 'IGHV', because the row split
#          is only defined for those two
#################################################################################################################
def JaccardCoefficientAnalysis(AllFeatureVectors, AllFeatureNames, DatasetSize):
    from itertools import combinations

    if SET_NAME == 'MMP-cluster':
        PDB_size = DatasetSize[0] + DatasetSize[1]
    elif SET_NAME == 'IGHV':
        PDB_size = DatasetSize[0]
    else:
        raise ValueError('JaccardCoefficientAnalysis cannot split the rows for SET_NAME %r' % (SET_NAME,))

    # Motif features are left out of the analysis entirely.
    kept = [idx for idx, name in enumerate(AllFeatureNames) if not name.startswith('Motif')]
    present = np.asarray(AllFeatureVectors)[:, kept] == 1

    jac_sim_PDB = _jaccard_matrix(present[:PDB_size])
    jac_sim_MMP = _jaccard_matrix(present[PDB_size:])

    with open('./results/' + SET_NAME + '_Jaccard Feature Coefficient.csv', 'w') as fi:
        fi.write(
            'Feature value 1, Feature value 2, Jaccard coefficient for reference set, Jaccard coefficient for MMP-targeting set\n')
        # One row per unordered pair, in feature order.
        for a, b in combinations(range(len(kept)), 2):
            fi.write(AllFeatureNames[kept[a]] + ',' + AllFeatureNames[kept[b]] + ',' + str(jac_sim_PDB[a][b]) + ',' + str(
                jac_sim_MMP[a][b]) + '\n')


def _jaccard_matrix(present):
    # Pairwise Jaccard coefficients between the columns of a boolean (rows x features) matrix.
    flags = present.astype(np.int64)
    both = flags.T.dot(flags)                            # rows where both features are present
    counts = flags.sum(axis=0)
    either = counts[:, None] + counts[None, :] - both    # rows where at least one is present
    jaccard = np.zeros(both.shape, dtype=float)
    # Pairs with an empty union keep the 0 they were initialised with.
    np.divide(both, either, out=jaccard, where=either > 0)
    return jaccard
907 | 


--------------------------------------------------------------------------------