├── magiccluster
│   ├── __init__.py
│   ├── main.py
│   ├── base.py
│   ├── cli.py
│   ├── magic_clustering.py
│   ├── clustering.py
│   └── utils.py
├── docs
│   ├── _config.yml
│   ├── images
│   │   └── magic.png
│   └── index.md
├── data
│   ├── magic.png
│   ├── participant.tsv
│   └── test_covariate.tsv
├── requirements.txt
├── .gitignore
├── install_requirements.sh
├── CITATION.cff
├── setup.py
├── LICENSE
└── README.md
/magiccluster/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/docs/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-leap-day
--------------------------------------------------------------------------------
/data/magic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anbai106/MAGIC/HEAD/data/magic.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | scikit-learn==0.21.3
3 | pandas
4 | nibabel
5 |
--------------------------------------------------------------------------------
/docs/images/magic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anbai106/MAGIC/HEAD/docs/images/magic.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /.idea/
2 | # Compiled python modules.
3 | *.pyc
4 |
5 | # Setuptools distribution folder.
6 | /dist/
7 | /build/
8 | /venv/
9 |
10 | test.py
11 |
12 | # Python egg metadata, regenerated from source files by setuptools.
13 | /*.egg-info
--------------------------------------------------------------------------------
/install_requirements.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | while read requirement;
4 | do
5 | if conda install --yes $requirement; then
6 | echo "Successfully installed: ${requirement}"
7 | else
8 | conda install --yes -c conda-forge $requirement
9 | fi
10 | done < requirements.txt
11 |
--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
1 | abstract: "This is my MAGIC software for research purposes only..."
2 | authors:
3 | - family-names: Wen
4 | given-names: Junhao
5 | orcid: "https://orcid.org/0000-0003-2077-3070"
6 | cff-version: 1.2.0
7 | version: 0.0.3
8 | date-released: "2023-09-24"
9 | keywords:
10 | - "multi-scale clustering"
11 | - research
12 | license: MIT
13 | message: "If you use this software, please cite it using these metadata."
14 | repository-code: "https://github.com/anbai106/MAGIC"
15 | title: "MAGIC"
--------------------------------------------------------------------------------
/magiccluster/main.py:
--------------------------------------------------------------------------------
1 | from magiccluster import cli
2 |
3 | __author__ = "Junhao Wen"
4 | __copyright__ = "Copyright 2023"
5 | __credits__ = ["Junhao Wen"]
6 | __license__ = "See LICENSE file"
7 | __version__ = "0.0.3"
8 | __maintainer__ = "Junhao Wen"
9 | __email__ = "junhao.wen89@gmail.com"
10 | __status__ = "Development"
11 |
12 | def main():
13 |
14 | parser = cli.parse_command_line()
15 | args = parser.parse_args()
16 | args.func(args)
17 |
18 |
19 | if __name__ == '__main__':
20 | main()
21 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 |
3 | with open("README.md", "r") as fh:
4 | long_description = fh.read()
5 |
6 | setuptools.setup(
7 | name="magiccluster",
8 | version="0.0.3",
9 | author="junhao.wen",
10 | author_email="junhao.wen89@email.com",
11 | description="Multi-scale semi-supervised clustering",
12 | long_description=long_description,
13 | long_description_content_type="text/markdown",
14 | url="https://github.com/anbai106/MAGIC",
15 | packages=setuptools.find_packages(),
16 | entry_points={
17 | 'console_scripts': [
18 | 'magiccluster = magiccluster.main:main',
19 | ],
20 | },
21 |     classifiers=[
22 |         "Programming Language :: Python :: 3",
23 |         "License :: OSI Approved :: MIT License",
24 |         "Operating System :: OS Independent",
25 |     ],
26 | )
27 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Junhao WEN
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MAGIC
2 |
3 | Multi-scAle heteroGeneity analysIs and Clustering
4 |
5 | Documentation
6 |
15 | ## `MAGIC`
16 | **MAGIC**, Multi-scAle heteroGeneity analysIs and Clustering, is a multi-scale semi-supervised clustering method that aims to derive robust clustering solutions across different scales for brain diseases.
17 |
18 | > :warning: **The documentation of this software is currently under development**
19 |
20 | ## Citing this work
21 | > :warning: Please let me know if you use this package in your publication; I will add your paper to the **Publications using MAGIC** section...
22 |
23 | > :warning: Please cite the software using the **Cite this repository** button on the right sidebar menu, as well as the original papers below ...
24 |
25 | ### Original papers
26 | > Wen J., Varol E., Chand G., Sotiras A., Davatzikos C. (2020) **MAGIC: Multi-scale Heterogeneity Analysis and Clustering for Brain Diseases**. Medical Image Computing and Computer Assisted Intervention – MICCAI 2020. MICCAI 2020. Lecture Notes in Computer Science, vol 12267. Springer, Cham. https://doi.org/10.1007/978-3-030-59728-3_66
27 |
28 | > Wen J., Varol E., Chand G., Sotiras A., Davatzikos C. (2022) **Multi-scale semi-supervised clustering of brain images: Deriving disease subtypes**. Medical Image Analysis, 2022. https://doi.org/10.1016/j.media.2021.102304 - [Link](https://www.sciencedirect.com/science/article/pii/S1361841521003492)
29 |
--------------------------------------------------------------------------------
/data/participant.tsv:
--------------------------------------------------------------------------------
1 | participant_id session_id diagnosis
2 | sub-80010 ses-M0 -1
3 | sub-80179 ses-M0 -1
4 | sub-80199 ses-M0 -1
5 | sub-80208 ses-M0 1
6 | sub-80249 ses-M0 -1
7 | sub-80265 ses-M0 1
8 | sub-80289 ses-M0 1
9 | sub-80396 ses-M0 -1
10 | sub-80425 ses-M0 1
11 | sub-80498 ses-M0 1
12 | sub-80537 ses-M0 -1
13 | sub-80557 ses-M0 -1
14 | sub-80575 ses-M0 -1
15 | sub-80607 ses-M0 1
16 | sub-80680 ses-M0 1
17 | sub-80688 ses-M0 -1
18 | sub-80765 ses-M0 -1
19 | sub-80812 ses-M0 1
20 | sub-80854 ses-M0 -1
21 | sub-80889 ses-M0 -1
22 | sub-81043 ses-M0 -1
23 | sub-81222 ses-M0 -1
24 | sub-81231 ses-M0 1
25 | sub-81287 ses-M0 -1
26 | sub-81323 ses-M0 -1
27 | sub-81353 ses-M0 -1
28 | sub-81456 ses-M0 -1
29 | sub-81528 ses-M0 -1
30 | sub-81533 ses-M0 1
31 | sub-81544 ses-M0 -1
32 | sub-81644 ses-M0 -1
33 | sub-81659 ses-M0 -1
34 | sub-81662 ses-M0 1
35 | sub-81754 ses-M0 1
36 | sub-81826 ses-M0 -1
37 | sub-81865 ses-M0 -1
38 | sub-81876 ses-M0 -1
39 | sub-81903 ses-M0 -1
40 | sub-81906 ses-M0 1
41 | sub-81989 ses-M0 -1
42 | sub-81992 ses-M0 -1
43 | sub-82003 ses-M0 1
44 | sub-82021 ses-M0 -1
45 | sub-82063 ses-M0 1
46 | sub-82066 ses-M0 1
47 | sub-82096 ses-M0 -1
48 | sub-82124 ses-M0 1
49 | sub-82155 ses-M0 1
50 | sub-82202 ses-M0 1
51 | sub-82208 ses-M0 -1
52 | sub-82217 ses-M0 -1
53 | sub-82229 ses-M0 1
54 | sub-82232 ses-M0 -1
55 | sub-82281 ses-M0 1
56 | sub-82293 ses-M0 1
57 | sub-82311 ses-M0 -1
58 | sub-82359 ses-M0 -1
59 | sub-82373 ses-M0 -1
60 | sub-82423 ses-M0 -1
61 | sub-82453 ses-M0 1
62 | sub-82458 ses-M0 -1
63 | sub-82467 ses-M0 -1
64 | sub-82492 ses-M0 1
65 | sub-82511 ses-M0 1
66 | sub-82587 ses-M0 -1
67 | sub-82674 ses-M0 1
68 | sub-82709 ses-M0 1
69 | sub-82754 ses-M0 -1
70 | sub-82784 ses-M0 1
71 | sub-82877 ses-M0 -1
72 | sub-82962 ses-M0 1
73 | sub-82982 ses-M0 -1
74 | sub-82985 ses-M0 1
75 | sub-82989 ses-M0 1
76 | sub-83010 ses-M0 1
77 | sub-83013 ses-M0 1
78 | sub-83044 ses-M0 1
79 | sub-83080 ses-M0 -1
80 | sub-83103 ses-M0 -1
81 | sub-83113 ses-M0 1
82 | sub-83207 ses-M0 -1
83 | sub-83260 ses-M0 1
84 | sub-83358 ses-M0 1
85 | sub-83372 ses-M0 -1
86 | sub-83423 ses-M0 -1
87 | sub-83429 ses-M0 1
88 | sub-83454 ses-M0 -1
89 | sub-83525 ses-M0 -1
90 | sub-83531 ses-M0 1
91 | sub-83580 ses-M0 -1
92 | sub-83612 ses-M0 1
93 | sub-83616 ses-M0 1
94 | sub-83632 ses-M0 1
95 | sub-83648 ses-M0 1
96 | sub-83835 ses-M0 -1
97 | sub-83972 ses-M0 -1
98 | sub-83987 ses-M0 -1
99 | sub-83999 ses-M0 1
100 | sub-84002 ses-M0 -1
101 |
--------------------------------------------------------------------------------
/data/test_covariate.tsv:
--------------------------------------------------------------------------------
1 | participant_id session_id diagnosis age sex
2 | sub-80010 ses-M0 -1 21.75 0
3 | sub-80179 ses-M0 -1 21.1666666666667 1
4 | sub-80199 ses-M0 -1 20.3333333333333 0
5 | sub-80208 ses-M0 1 20.5 0
6 | sub-80249 ses-M0 -1 20.8333333333333 1
7 | sub-80265 ses-M0 1 20.5 1
8 | sub-80289 ses-M0 1 20.0833333333333 0
9 | sub-80396 ses-M0 -1 20.8333333333333 0
10 | sub-80425 ses-M0 1 20 1
11 | sub-80498 ses-M0 1 20.9166666666667 0
12 | sub-80537 ses-M0 -1 20.9166666666667 1
13 | sub-80557 ses-M0 -1 21.5 1
14 | sub-80575 ses-M0 -1 21.75 0
15 | sub-80607 ses-M0 1 21 0
16 | sub-80680 ses-M0 1 21.0833333333333 0
17 | sub-80688 ses-M0 -1 21.9166666666667 1
18 | sub-80765 ses-M0 -1 20.5833333333333 1
19 | sub-80812 ses-M0 1 20.5833333333333 1
20 | sub-80854 ses-M0 -1 20.1666666666667 0
21 | sub-80889 ses-M0 -1 21.75 0
22 | sub-81043 ses-M0 -1 20.75 1
23 | sub-81222 ses-M0 -1 20.25 1
24 | sub-81231 ses-M0 1 21.75 1
25 | sub-81287 ses-M0 -1 20 1
26 | sub-81323 ses-M0 -1 20.5833333333333 1
27 | sub-81353 ses-M0 -1 20.0833333333333 0
28 | sub-81456 ses-M0 -1 21.6666666666667 1
29 | sub-81528 ses-M0 -1 19.5833333333333 1
30 | sub-81533 ses-M0 1 21.5833333333333 1
31 | sub-81544 ses-M0 -1 19.3333333333333 1
32 | sub-81644 ses-M0 -1 19.25 1
33 | sub-81659 ses-M0 -1 19.25 0
34 | sub-81662 ses-M0 1 19.3333333333333 1
35 | sub-81754 ses-M0 1 19.3333333333333 1
36 | sub-81826 ses-M0 -1 19.0833333333333 1
37 | sub-81865 ses-M0 -1 21.75 0
38 | sub-81876 ses-M0 -1 21.25 0
39 | sub-81903 ses-M0 -1 19.25 1
40 | sub-81906 ses-M0 1 21.3333333333333 1
41 | sub-81989 ses-M0 -1 19.0833333333333 1
42 | sub-81992 ses-M0 -1 21.1666666666667 0
43 | sub-82003 ses-M0 1 21.1666666666667 1
44 | sub-82021 ses-M0 -1 19.3333333333333 1
45 | sub-82063 ses-M0 1 20.1666666666667 0
46 | sub-82066 ses-M0 1 21.4166666666667 1
47 | sub-82096 ses-M0 -1 20.0833333333333 1
48 | sub-82124 ses-M0 1 20.6666666666667 1
49 | sub-82155 ses-M0 1 19.5 1
50 | sub-82202 ses-M0 1 19.5833333333333 1
51 | sub-82208 ses-M0 -1 21.1666666666667 1
52 | sub-82217 ses-M0 -1 19.5 1
53 | sub-82229 ses-M0 1 21.3333333333333 1
54 | sub-82232 ses-M0 -1 19 1
55 | sub-82281 ses-M0 1 20 1
56 | sub-82293 ses-M0 1 19.8333333333333 1
57 | sub-82311 ses-M0 -1 20.6666666666667 1
58 | sub-82359 ses-M0 -1 21.6666666666667 0
59 | sub-82373 ses-M0 -1 19.6666666666667 0
60 | sub-82423 ses-M0 -1 21.25 0
61 | sub-82453 ses-M0 1 21.4166666666667 0
62 | sub-82458 ses-M0 -1 19.0833333333333 0
63 | sub-82467 ses-M0 -1 20.0833333333333 0
64 | sub-82492 ses-M0 1 19.5 0
65 | sub-82511 ses-M0 1 19.6666666666667 1
66 | sub-82587 ses-M0 -1 19.5833333333333 1
67 | sub-82674 ses-M0 1 19.6666666666667 1
68 | sub-82709 ses-M0 1 22.5 1
69 | sub-82754 ses-M0 -1 20.8333333333333 0
70 | sub-82784 ses-M0 1 19.5833333333333 1
71 | sub-82877 ses-M0 -1 20.25 0
72 | sub-82962 ses-M0 1 19.1666666666667 0
73 | sub-82982 ses-M0 -1 18.8333333333333 1
74 | sub-82985 ses-M0 1 20.8333333333333 1
75 | sub-82989 ses-M0 1 18.8333333333333 0
76 | sub-83010 ses-M0 1 19.5833333333333 0
77 | sub-83013 ses-M0 1 20.25 1
78 | sub-83044 ses-M0 1 20.8333333333333 1
79 | sub-83080 ses-M0 -1 18.5 1
80 | sub-83103 ses-M0 -1 18.1666666666667 0
81 | sub-83113 ses-M0 1 18.6666666666667 1
82 | sub-83207 ses-M0 -1 19.25 0
83 | sub-83260 ses-M0 1 22.6666666666667 0
84 | sub-83358 ses-M0 1 19.8333333333333 1
85 | sub-83372 ses-M0 -1 18.25 0
86 | sub-83423 ses-M0 -1 20.75 1
87 | sub-83429 ses-M0 1 18.8333333333333 1
88 | sub-83454 ses-M0 -1 20.1666666666667 1
89 | sub-83525 ses-M0 -1 18.9166666666667 1
90 | sub-83531 ses-M0 1 19.9166666666667 0
91 | sub-83580 ses-M0 -1 19 0
92 | sub-83612 ses-M0 1 20 1
93 | sub-83616 ses-M0 1 18.5833333333333 1
94 | sub-83632 ses-M0 1 20.9166666666667 1
95 | sub-83648 ses-M0 1 19.5833333333333 1
96 | sub-83835 ses-M0 -1 20.6666666666667 0
97 | sub-83972 ses-M0 -1 19.0833333333333 0
98 | sub-83987 ses-M0 -1 18.4166666666667 1
99 | sub-83999 ses-M0 1 18.6666666666667 0
100 | sub-84002 ses-M0 -1 22.25 0
101 |
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | # MAGIC documentation
6 | **MAGIC**, Multi-scAle heteroGeneity analysIs and Clustering, is a multi-scale semi-supervised clustering method that aims to derive robust clustering solutions across different scales for brain diseases.
7 | Compared to the original HYDRA method, MAGIC has the following advantages:
8 | - Multi-scale feature extractions via opNMF;
9 | - Inter-scale consistent clustering solution.
10 |
11 | ## Installation
12 | ### Prerequisites
13 | In order to run MAGIC, one must have already installed and run [SOPNMF](https://github.com/anbai106/SOPNMF) on the voxel-wise image data. After this, please follow the steps below for installation.
14 |
15 | There are three ways to install MAGIC.
16 | ### Use MAGIC as a Python package
17 | We recommend using a Conda virtual environment:
18 | ```
19 | 1) conda create --name MAGIC python=3.6
20 | ```
21 | Activate the virtual environment:
22 | ```
23 | 2) source activate MAGIC
24 | ```
25 | Install the other Python package dependencies (from the root folder of MAGIC):
26 | ```
27 | 3) ./install_requirements.sh
28 | ```
29 | Finally, install MAGIC from PyPI:
30 | ```
31 | 4) pip install magiccluster==0.0.3
32 | ```
33 |
34 | ### Use MAGIC from the command line:
35 | After installing all dependencies in the **requirements.txt** file, go to the root folder of MAGIC, where **setup.py** is located:
36 | ```
37 | pip install -e .
38 | ```
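
This also exposes the `magiccluster` console command (declared under `entry_points` in **setup.py**); an example command-line invocation is shown in the Example section below.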
39 |
40 | ### Use MAGIC as a developer version:
41 | ```
42 | python -m pip install git+https://github.com/anbai106/MAGIC.git
43 | ```
44 |
45 | ## Input structure
46 | MAGIC requires a specific input structure inspired by [BIDS](https://bids.neuroimaging.io/).
47 | Conventions for the group label/diagnosis: -1 represents a healthy control (**CN**) and 1 represents a patient (**PT**). Categorical variables, such as sex, should be encoded as numbers, e.g., 0 for female and 1 for male.
48 |
49 | ### participant and covariate tsv
50 | The first 3 columns are **participant_id**, **session_id** and **diagnosis**.
51 |
52 | Example for feature tsv:
53 | ```
54 | participant_id session_id diagnosis
55 | sub-CLNC0001 ses-M00 -1 432.1
56 | sub-CLNC0002 ses-M00 1 398.2
57 | sub-CLNC0003 ses-M00 -1 412.0
58 | sub-CLNC0004 ses-M00 -1 487.4
59 | sub-CLNC0005 ses-M00 1 346.5
60 | sub-CLNC0006 ses-M00 1 443.2
61 | sub-CLNC0007 ses-M00 -1 450.2
62 | sub-CLNC0008 ses-M00 1 443.2
63 | ```
64 | Example for covariate tsv:
65 | ```
66 | participant_id session_id diagnosis age sex ...
67 | sub-CLNC0001 ses-M00 -1 56.1 0
68 | sub-CLNC0002 ses-M00 1 57.2 0
69 | sub-CLNC0003 ses-M00 -1 43.0 1
70 | sub-CLNC0004 ses-M00 -1 25.4 1
71 | sub-CLNC0005 ses-M00 1 74.5 1
72 | sub-CLNC0006 ses-M00 1 44.2 0
73 | sub-CLNC0007 ses-M00 -1 40.2 0
74 | sub-CLNC0008 ses-M00 1 43.2 1
75 | ```
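
MAGIC checks these headers when loading the files (see `OPNMF_Input` in `magiccluster/base.py`): the first three columns must use exactly the names above, and the participant and covariate tsv must list the same subjects in the same order. Below is a minimal sanity-check sketch, assuming hypothetical local paths `participant.tsv` and `covariate.tsv`:
```
import pandas as pd

# Hypothetical paths; replace with your own files.
df_feature = pd.read_csv("participant.tsv", sep="\t")
df_covariate = pd.read_csv("covariate.tsv", sep="\t")

# The first three columns must use exactly these headers.
expected = ["participant_id", "session_id", "diagnosis"]
assert list(df_feature.columns[:3]) == expected
assert list(df_covariate.columns[:3]) == expected

# Both files must list the same subjects, sessions and diagnoses in the same order.
assert df_feature[expected].equals(df_covariate[expected])

# The diagnosis column must only contain -1 (CN) and 1 (PT).
assert set(df_feature["diagnosis"].unique()) <= {-1, 1}
```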
76 |
77 | ## Example
78 | We provide a synthetic example dataset in the **MAGIC/data** folder. Users should follow the same data structure.
79 |
80 | ### Running MAGIC for clustering CN vs Subtype1 vs Subtype2 vs ...:
81 | ```
82 | from magiccluster.magic_clustering import clustering
83 | participant_tsv="MAGIC/data/participant.tsv"
84 | opnmf_dir = "PATH_OPNMF_DIR"
85 | output_dir = "PATH_OUTPUT_DIR"
86 | k_min=2
87 | k_max=8
88 | cv_repetition=100
89 | clustering(participant_tsv, opnmf_dir, output_dir, k_min, k_max, 25, 60, 5, cv_repetition)
90 | ```
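
Equivalently, if MAGIC was installed with its console entry point (see **setup.py** and `magiccluster/cli.py`), the same analysis can be launched from the command line with the `cluster` subcommand; the paths below are placeholders:
```
magiccluster cluster MAGIC/data/participant.tsv PATH_OPNMF_DIR PATH_OUTPUT_DIR 2 8 25 60 5 100 --cv_strategy hold_out --n_threads 4
```
The positional arguments map, in order, to participant_tsv, opnmf_dir, output_dir, k_min, k_max, num_components_min, num_components_max, num_components_step and cv_repetition.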
91 |
92 | ## Citing this work
93 | > :warning: Please let me know if you use this package in your publication; I will add your paper to the **Publications using MAGIC** section...
94 |
95 | > :warning: Please cite the software using the **Cite this repository** button on the right sidebar menu, as well as the original papers below ...
96 |
97 | ### Original papers
98 | > Wen J., Varol E., Chand G., Sotiras A., Davatzikos C. (2020) **MAGIC: Multi-scale Heterogeneity Analysis and Clustering for Brain Diseases**. Medical Image Computing and Computer Assisted Intervention – MICCAI 2020. MICCAI 2020. Lecture Notes in Computer Science, vol 12267. Springer, Cham. https://doi.org/10.1007/978-3-030-59728-3_66
99 |
100 | > Wen J., Varol E., Chand G., Sotiras A., Davatzikos C. (2022) **Multi-scale semi-supervised clustering of brain images: Deriving disease subtypes**. Medical Image Analysis, 2022. https://doi.org/10.1016/j.media.2021.102304 - [Link](https://www.sciencedirect.com/science/article/pii/S1361841521003492)
--------------------------------------------------------------------------------
/magiccluster/base.py:
--------------------------------------------------------------------------------
1 | import abc
2 | import pandas as pd
3 | from .utils import GLMcorrection
4 | import numpy as np
5 | import os
6 | from sklearn.preprocessing import StandardScaler
7 |
8 | __author__ = "Junhao Wen"
9 | __copyright__ = "Copyright 2023"
10 | __credits__ = ["Junhao Wen, Erdem Varol"]
11 | __license__ = "See LICENSE file"
12 | __version__ = "0.0.3"
13 | __maintainer__ = "Junhao Wen"
14 | __email__ = "junhao.wen89@gmail.com"
15 | __status__ = "Development"
16 |
17 |
18 | class WorkFlow:
19 | __metaclass__ = abc.ABCMeta
20 |
21 | @abc.abstractmethod
22 | def run(self):
23 | pass
24 |
25 |
26 | class Input:
27 | __metaclass__ = abc.ABCMeta
28 |
29 | @abc.abstractmethod
30 | def get_x(self):
31 | pass
32 |
33 | @abc.abstractmethod
34 | def get_y(self):
35 | pass
36 |
37 | class OPNMF_Input(Input):
38 |
39 | def __init__(self, opnmf_dir, participant_tsv, covariate_tsv=None):
40 | self._opnmf_dir = opnmf_dir
41 | self._participant_tsv = participant_tsv
42 | self._covariate_tsv = covariate_tsv
43 | self._x = None
44 | self._y = None
45 |
46 | ## check the participant_tsv & covariate_tsv, the header, the order of the columns, etc
47 | self._df_feature = pd.read_csv(participant_tsv, sep='\t')
48 | if ('participant_id' != list(self._df_feature.columns.values)[0]) or (
49 | 'session_id' != list(self._df_feature.columns.values)[1]) or \
50 | ('diagnosis' != list(self._df_feature.columns.values)[2]):
51 | raise Exception("the data file is not in the correct format."
52 | "Columns should include ['participant_id', 'session_id', 'diagnosis']")
53 | self._subjects = list(self._df_feature['participant_id'])
54 | self._sessions = list(self._df_feature['session_id'])
55 | self._diagnosis = list(self._df_feature['diagnosis'])
56 |
57 | def get_x(self, num_component, opnmf_dir):
58 |
59 | ## alternatively, we use here the output of pyOPNMF loading coefficient
60 | loading_coefficient_csv = os.path.join(opnmf_dir, 'NMF', 'component_' + str(num_component),
61 | 'loading_coefficient.tsv')
62 | ## read the tsv
63 | df_opnmf = pd.read_csv(loading_coefficient_csv, sep='\t')
64 | df_opnmf = df_opnmf.loc[df_opnmf['participant_id'].isin(self._df_feature['participant_id'])]
65 | ### adjust the order of the rows to match the original tsv files
66 | df_opnmf = df_opnmf.set_index('participant_id')
67 | df_opnmf = df_opnmf.reindex(index=self._df_feature['participant_id'])
68 | df_opnmf = df_opnmf.reset_index()
69 |
70 | self._x = df_opnmf[['component_' + str(i + 1) for i in range(num_component)]].to_numpy()
71 |
72 | ### normalize the data, note the normalization should be done for each component, not across component
73 | scaler = StandardScaler()
74 | self._x = scaler.fit_transform(self._x)
75 |
76 | if self._covariate_tsv is not None:
77 | df_covariate = pd.read_csv(self._covariate_tsv, sep='\t')
78 |             if ('participant_id' != list(df_covariate.columns.values)[0]) or (
79 |                     'session_id' != list(df_covariate.columns.values)[1]) or \
80 |                     ('diagnosis' != list(df_covariate.columns.values)[2]):
81 |                 raise Exception("the covariate file is not in the correct format."
82 |                                 "Columns should include ['participant_id', 'session_id', 'diagnosis']")
83 | participant_covariate = list(df_covariate['participant_id'])
84 | session_covariate = list(df_covariate['session_id'])
85 | label_covariate = list(df_covariate['diagnosis'])
86 |
87 | # check that the participant_tsv and covariate_tsv have the same orders for the first three column
88 | if (not self._subjects == participant_covariate) or (not self._sessions == session_covariate) or (
89 | not self._diagnosis == label_covariate):
90 | raise Exception(
91 | "the first three columns in the feature csv and covariate csv should be exactly the same.")
92 |
93 | ## normalize the covariate z-scoring
94 | data_covariate = df_covariate.iloc[:, 3:]
95 | data_covariate = ((data_covariate - data_covariate.mean()) / data_covariate.std()).values
96 |
97 |             ## correct for the covariates, only retaining the pathological correspondence
98 | self._x, _ = GLMcorrection(self._x, np.asarray(self._diagnosis), data_covariate, self._x, data_covariate)
99 |
100 | return self._x
101 |
102 | def get_y(self):
103 | """
104 | Do not change the label's representation
105 | :return:
106 | """
107 |
108 | if self._y is not None:
109 | return self._y
110 |
111 | self._y = np.array(self._diagnosis)
112 | return self._y
113 |
114 |
--------------------------------------------------------------------------------
/magiccluster/cli.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | __author__ = "Junhao Wen"
4 | __copyright__ = "Copyright 2023"
5 | __credits__ = ["Junhao Wen"]
6 | __license__ = "See LICENSE file"
7 | __version__ = "0.0.3"
8 | __maintainer__ = "Junhao Wen"
9 | __email__ = "junhao.wen89@gmail.com"
10 | __status__ = "Development"
11 |
12 | def magic_func(args):
13 | """
14 |     The default function to run clustering.
15 | Args:
16 | args: args from parser
17 |
18 | Returns:
19 |
20 | """
21 | from magiccluster.magic_clustering import clustering
22 | clustering(
23 | args.participant_tsv,
24 | args.opnmf_dir,
25 | args.output_dir,
26 | args.k_min,
27 | args.k_max,
28 | args.num_components_min,
29 | args.num_components_max,
30 | args.num_components_step,
31 | args.cv_repetition,
32 | args.covariate_tsv,
33 | args.cv_strategy,
34 | args.save_models,
35 | args.cluster_predefined_c,
36 | args.class_weight_balanced,
37 | args.weight_initialization_type,
38 | args.num_iteration,
39 | args.num_consensus,
40 | args.tol,
41 | args.multiscale_tol,
42 | args.n_threads,
43 | args.verbose
44 | )
45 |
46 | def parse_command_line():
47 | """
48 | Definition for the commandline parser
49 | Returns:
50 |
51 | """
52 |
53 | parser = argparse.ArgumentParser(
54 | prog='magiccluster-cluster',
55 | description='Perform multi-scale semi-supervised clustering using MAGIC...')
56 |
57 | subparser = parser.add_subparsers(
58 | title='''Task to perform...''',
59 | description='''We now only allow to use MAGIC for clustering''',
60 | dest='task',
61 | help='''****** Tasks proposed by MAGIC ******''')
62 |
63 | subparser.required = True
64 |
65 | ########################################################################################################################
66 |
67 | ## Add arguments for MAGIC clustering
68 | clustering_parser = subparser.add_parser(
69 | 'cluster',
70 | help='Perform clustering with MAGIC.')
71 |
72 | clustering_parser.add_argument(
73 | 'participant_tsv',
74 | help="Path to the tsv containing the following first columns:"
75 | "i) the first column is the participant_id. "
76 | "ii) the second column should be the session_id. "
77 | "iii) the third column should be the diagnosis. ",
78 | default=None
79 | )
80 |
81 | clustering_parser.add_argument(
82 | 'opnmf_dir',
83 | help='Path to the directory of where SOPNMF was run (the voxel-wise images should be run first with SOPNMF).',
84 | default=None
85 | )
86 |
87 | clustering_parser.add_argument(
88 | 'output_dir',
89 | help='Path to the directory of where to store the final output.',
90 | default=None
91 | )
92 |
93 | clustering_parser.add_argument(
94 | 'k_min',
95 | help='Number of cluster (k) minimum value.',
96 | default=None, type=int
97 | )
98 |
99 | clustering_parser.add_argument(
100 | 'k_max',
101 | help='Number of cluster (k) maximum value.',
102 | default=None, type=int
103 | )
104 |
105 | clustering_parser.add_argument(
106 | 'num_components_min',
107 | help='Number of the min PSC for the SOPNMF',
108 | default=None, type=int
109 | )
110 |
111 | clustering_parser.add_argument(
112 | 'num_components_max',
113 | help='Number of the max PSC for the SOPNMF',
114 | default=None, type=int
115 | )
116 |
117 | clustering_parser.add_argument(
118 | 'num_components_step',
119 | help='The step size between the min and the max PSC for the SOPNMF',
120 | default=None, type=int
121 | )
122 |
123 | clustering_parser.add_argument(
124 | 'cv_repetition',
125 | help='Number of repetitions for the chosen cross-validation (CV).',
126 | default=None, type=int
127 | )
128 |
129 | clustering_parser.add_argument(
130 | '--covariate_tsv',
131 | help="Path to the tsv containing covariates, following the BIDS convention. The first 3 columns is the same as feature_tsv",
132 | default=None,
133 | type=str
134 | )
135 |
136 | clustering_parser.add_argument(
137 | '-cs', '--cv_strategy',
138 | help='Chosen CV strategy, default is hold_out. ',
139 | type=str, default='hold_out',
140 | choices=['k_fold', 'hold_out'],
141 | )
142 |
143 | clustering_parser.add_argument(
144 | '-sm', '--save_models',
145 |         help='Whether to save models during all repetitions of CV. ',
146 | default=False, action="store_true"
147 | )
148 |
149 | clustering_parser.add_argument(
150 | '--cluster_predefined_c',
151 | type=float,
152 | default=0.25,
153 | help="Predefined hyperparameter C of SVM. Default is 0.25. "
154 | "Better choice may be guided by HYDRA global classification with nested CV for optimal C searching. "
155 | )
156 |
157 | clustering_parser.add_argument(
158 | '-cwb', '--class_weight_balanced',
159 |         help='Whether to use balanced class weights when the groups are imbalanced. ',
160 | default=False, action="store_true"
161 | )
162 |
163 | clustering_parser.add_argument(
164 | '-wit', '--weight_initialization_type',
165 | help='Strategy for initializing the weighted sample matrix of the polytope. ',
166 | type=str, default='DPP',
167 | choices=['DPP', 'random_assign'],
168 | )
169 |
170 | clustering_parser.add_argument(
171 | '--num_iteration',
172 |         help='Number of iterations to converge each SVM.',
173 | default=50, type=int
174 | )
175 |
176 | clustering_parser.add_argument(
177 | '--num_consensus',
178 |         help='Number of iterations for inner consensus clustering.',
179 | default=20, type=int
180 | )
181 |
182 | clustering_parser.add_argument(
183 | '--tol',
184 | help='Clustering stopping criterion, until the polytope becomes stable',
185 | default=1e-8, type=float
186 | )
187 |
188 | clustering_parser.add_argument(
189 | '--multiscale_tol',
190 |         help='Clustering stopping criterion, until the multi-scale clustering solution becomes stable',
191 | default=0.85, type=float
192 | )
193 |
194 | clustering_parser.add_argument(
195 | '-nt', '--n_threads',
196 | help='Number of cores used, default is 4',
197 | type=int, default=4
198 | )
199 |
200 | clustering_parser.add_argument(
201 | '-v', '--verbose',
202 | help='Increase output verbosity',
203 | default=False, action="store_true"
204 | )
205 |
206 | clustering_parser.set_defaults(func=magic_func)
207 |
208 |     return parser
209 |
--------------------------------------------------------------------------------
/magiccluster/magic_clustering.py:
--------------------------------------------------------------------------------
1 | from .clustering import DualSVM_Subtype, DualSVM_Subtype_transfer_learning
2 | from .base import OPNMF_Input
3 | import os, pickle
4 | from .utils import cluster_stability_across_resolution, summary_clustering_result_multiscale, shift_list, consensus_clustering_across_c, make_cv_partition
5 | import numpy as np
6 |
7 | __author__ = "Junhao Wen"
8 | __copyright__ = "Copyright 2023"
9 | __credits__ = ["Junhao Wen, Erdem Varol"]
10 | __license__ = "See LICENSE file"
11 | __version__ = "0.0.3"
12 | __maintainer__ = "Junhao Wen"
13 | __email__ = "junhao.wen89@gmail.com"
14 | __status__ = "Development"
15 |
16 | def clustering(participant_tsv, opnmf_dir, output_dir, k_min, k_max, num_components_min, num_components_max, num_components_step, cv_repetition, covariate_tsv=None, cv_strategy='hold_out', save_models=False,
17 | cluster_predefined_c=0.25, class_weight_balanced=True, weight_initialization_type='DPP', num_iteration=50,
18 | num_consensus=20, tol=1e-8, multiscale_tol=0.85, n_threads=8, verbose=False):
19 | """
20 |     MAGIC core function for clustering
21 | Args:
22 | participant_tsv:str, path to the participant_tsv tsv, following the BIDS convention. The tsv contains
23 | the following headers: "
24 | "i) the first column is the participant_id;"
25 | "ii) the second column should be the session_id;"
26 | "iii) the third column should be the diagnosis;"
27 | opnmf_dir: str, path to store the OPNMF results
28 | output_dir: str, path to store the clustering results
29 | k_min: int, minimum k (number of clusters)
30 | k_max: int, maximum k (number of clusters)
31 | cv_repetition: int, number of repetitions for cross-validation (CV)
32 |         covariate_tsv: str, path to the tsv containing the covariates, e.g., age or sex. The header (first 3 columns) of
33 | the tsv file is the same as the feature_tsv, following the BIDS convention.
34 | cv_strategy: str, cross validation strategy used. Default is hold_out. choices=['k_fold', 'hold_out']
35 | save_models: Bool, if save all models during CV. Default is False to save space.
36 | Set true only if you are going to apply the trained model to unseen data.
37 | cluster_predefined_c: Float, default is 0.25. The predefined best c if you do not want to perform a nested CV to
38 | find it. If used, it should be a float number
39 | class_weight_balanced: Bool, default is True. If the two groups are balanced.
40 | weight_initialization_type: str, default is DPP. The strategy for initializing the weight to control the
41 |     hyperplanes and the subpopulation of patients. choices=["random_hyperplane", "random_assign", "k_means", "DPP"]
42 | num_iteration: int, default is 50. The number of iterations to iteratively optimize the polytope.
43 | num_consensus: int, default is 20. The number of repeats for consensus clustering to eliminate the unstable clustering.
44 | tol: float, default is 1e-8. Clustering stopping criterion.
45 | multiscale_tol: float, default is 0.85. Double cyclic optimization stopping criterion.
46 | n_threads: int, default is 8. The number of threads to run model in parallel.
47 | verbose: Bool, default is False. If the output message is verbose.
48 |
49 | Returns: clustering outputs.
50 |
51 | """
52 | ### For voxel approach
53 | print('MAGIC for semi-supervised clustering...')
54 |     input_data = OPNMF_Input(opnmf_dir, participant_tsv, covariate_tsv=covariate_tsv)
58 |
59 | ## data split
60 | print('Data split was performed based on validation strategy: %s...\n' % cv_strategy)
61 | if cv_strategy == "hold_out":
62 | ## check if data split has been done, if yes, the pickle file is there
63 | if os.path.isfile(os.path.join(output_dir, 'data_split_stratified_' + str(cv_repetition) + '-holdout.pkl')):
64 | split_index = pickle.load(open(os.path.join(output_dir, 'data_split_stratified_' + str(cv_repetition) + '-holdout.pkl'), 'rb'))
65 | else:
66 | split_index, _ = make_cv_partition(input_data.get_y(), cv_strategy, output_dir, cv_repetition)
67 | elif cv_strategy == "k_fold":
68 | ## check if data split has been done, if yes, the pickle file is there
69 | if os.path.isfile(os.path.join(output_dir, 'data_split_stratified_' + str(cv_repetition) + '-fold.pkl')):
70 | split_index = pickle.load(open(os.path.join(output_dir, 'data_split_stratified_' + str(cv_repetition) + '-fold.pkl'), 'rb'))
71 | else:
72 | split_index, _ = make_cv_partition(input_data.get_y(), cv_strategy, output_dir, cv_repetition)
73 |
74 | print('Data split has been done!\n')
75 |
76 | print('Starts semi-supervised clustering...')
77 | ### Here, semi-supervised clustering with multi-scale feature reduction learning
78 | if (num_components_max - num_components_min) % num_components_step != 0:
79 |         raise Exception('The range between num_components_min and num_components_max must be divisible by num_components_step!')
80 |
81 | ## C lists
82 | C_list = list(range(num_components_min, num_components_max+num_components_step, num_components_step))
83 | ## first loop on different initial C.
84 | for i in range(len(C_list)):
85 | c_list = shift_list(C_list, i)
86 | num_run = 0
87 | loop = True
88 | print('Initialize C == %d\n' % C_list[i])
89 | while loop:
90 | for j in range(len(c_list)):
91 | if num_run == 0:
92 | num_run += 1
93 | k_continuing = np.arange(k_min, k_max+1).tolist()
94 | print('First C == %d\n' % c_list[j])
95 | output_dir_loop = os.path.join(output_dir, 'initialization_c_' + str(C_list[i]), 'clustering_run' + str(num_run))
96 | wf_clustering = DualSVM_Subtype(input_data,
97 | participant_tsv,
98 | split_index,
99 | cv_repetition,
100 | k_min,
101 | k_max,
102 | output_dir_loop,
103 | opnmf_dir,
104 | balanced=class_weight_balanced,
105 | num_consensus=num_consensus,
106 | num_iteration=num_iteration,
107 | tol=tol,
108 | predefined_c=cluster_predefined_c,
109 | weight_initialization_type=weight_initialization_type,
110 | n_threads=n_threads,
111 | num_components_min=c_list[j],
112 | num_components_max=c_list[j],
113 | num_components_step=num_components_step,
114 | save_models=save_models,
115 | verbose=verbose)
116 |
117 | wf_clustering.run()
118 | else: ## initialize the model from the former resolution
119 | num_run += 1
120 | print('Transfer learning on resolution C == %d for run == %d\n' % (c_list[j], num_run))
121 | output_dir_tl = os.path.join(output_dir, 'initialization_c_' + str(C_list[i]))
122 | wf_clustering = DualSVM_Subtype_transfer_learning(input_data,
123 | participant_tsv,
124 | split_index,
125 | cv_repetition,
126 | k_continuing,
127 | output_dir_tl,
128 | opnmf_dir,
129 | balanced=class_weight_balanced,
130 | num_iteration=num_iteration,
131 | tol=tol,
132 | predefined_c=cluster_predefined_c,
133 | weight_initialization_type=weight_initialization_type,
134 | n_threads=n_threads,
135 | num_component=c_list[j],
136 | num_component_former=c_list[j-1],
137 | num_run=num_run)
138 |
139 | wf_clustering.run()
140 |
141 | ### check the clustering stability between the current C and former C
142 | k_continuing, k_converged = cluster_stability_across_resolution(c_list[j], c_list[j-1], os.path.join(output_dir, 'initialization_c_' + str(C_list[i])), k_continuing, num_run, stop_tol=multiscale_tol)
143 |
144 | if not k_continuing:
145 | loop = False
146 | break
147 |
148 | ## After cross validate the hyperparameter k & num_components, summarize the results into a single tsv file.
149 | if not k_continuing:
150 | summary_clustering_result_multiscale(os.path.join(output_dir, 'initialization_c_' + str(C_list[i])), k_min, k_max)
151 |
152 | ## consensus learning based on different initialization Cs
153 | print('Computing the final consensus group membership!\n')
154 | consensus_clustering_across_c(output_dir, C_list, k_min, k_max)
155 | print('Finish...')
--------------------------------------------------------------------------------
/magiccluster/clustering.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | import pandas as pd
4 | from .utils import consensus_clustering, cv_cluster_stability, hydra_solver_svm_tl
5 | from .base import WorkFlow
6 | from .utils import hydra_solver_svm
7 |
8 | __author__ = "Junhao Wen"
9 | __copyright__ = "Copyright 2023"
10 | __credits__ = ["Junhao Wen, Erdem Varol"]
11 | __license__ = "See LICENSE file"
12 | __version__ = "0.0.3"
13 | __maintainer__ = "Junhao Wen"
14 | __email__ = "junhao.wen89@gmail.com"
15 | __status__ = "Development"
16 | class DualSVM_Subtype(WorkFlow):
17 |
18 | def __init__(self, input, participant_tsv, split_index, cv_repetition, k_min, k_max, output_dir, opnmf_dir, balanced=True,
19 | test_size=0.2, num_consensus=20, num_iteration=50, tol=1e-6, predefined_c=None, weight_initialization_type='DPP',
20 | n_threads=8, num_components_min=10, num_components_max=100, num_components_step=10, save_models=False,
21 | verbose=True):
22 |
23 | self._input = input
24 | self._participant_tsv = participant_tsv
25 | self._split_index = split_index
26 | self._cv_repetition = cv_repetition
27 | self._output_dir = output_dir
28 | self._opnmf_dir = opnmf_dir
29 | self._k_min = k_min
30 | self._k_max = k_max
31 | self._balanced = balanced
32 | self._test_size = test_size
33 | self._num_consensus = num_consensus
34 | self._num_iteration = num_iteration
35 | self._tol = tol
36 | self._predefined_c = predefined_c
37 | self._weight_initialization_type = weight_initialization_type
38 | self._k_range_list = list(range(k_min, k_max + 1))
39 | self._n_threads = n_threads
40 | self._num_components_min = num_components_min
41 | self._num_components_max = num_components_max
42 | self._num_components_step = num_components_step
43 | self._save_models = save_models
44 | self._verbose = verbose
45 |
46 |
47 | def run(self):
48 |
49 | ## by default, we solve the problem using dual solver with a linear kernel.
50 | for num_component in range(self._num_components_min, self._num_components_max + self._num_components_step, self._num_components_step):
51 |
52 | if os.path.exists(os.path.join(self._output_dir, 'component_' + str(num_component), "adjusted_rand_index.tsv")):
53 | print("This number of component have been trained and converged: %d" % num_component)
54 | else:
55 | x = self._input.get_x(num_component, self._opnmf_dir)
56 |                 y = self._input.get_y()
57 | data_label_folds_ks = np.zeros((y.shape[0], self._cv_repetition, self._k_max - self._k_min + 1)).astype(int)
58 |
59 | for i in range(self._cv_repetition):
60 | for j in self._k_range_list:
61 |                         print('Applying HYDRA for finding %d clusters. Repetition: %d / %d...\n' % (j, i+1, self._cv_repetition))
62 | training_final_prediction = hydra_solver_svm(i, x[self._split_index[i][0]], y[self._split_index[i][0]], j, self._output_dir,
63 | self._num_consensus, self._num_iteration, self._tol, self._balanced, self._predefined_c,
64 | self._weight_initialization_type, self._n_threads, self._save_models, self._verbose)
65 |
66 |
67 |                     # change the final prediction's label: test data to be 0, the rest training data will be updated by the model's prediction
68 | data_label_fold = y.copy()
69 | data_label_fold[self._split_index[i][1]] = 0 # all test data to be 0
70 | data_label_fold[self._split_index[i][0]] = training_final_prediction ## assign the training prediction
71 | data_label_folds_ks[:, i, j - self._k_min] = data_label_fold
72 |
73 | print('Estimating clustering stability...\n')
74 | ## for the adjusted rand index, only consider the PT results
75 | adjusted_rand_index_results = np.zeros(self._k_max - self._k_min + 1)
76 | index_pt = np.where(y == 1)[0] # index for PTs
77 | for m in range(self._k_max - self._k_min + 1):
78 | result = data_label_folds_ks[:, :, m][index_pt]
79 | adjusted_rand_index_result = cv_cluster_stability(result, self._k_range_list[m])
80 | # saving each k result into the final adjusted_rand_index_results
81 | adjusted_rand_index_results[m] = adjusted_rand_index_result
82 |
83 | print('Computing the final consensus group membership...\n')
84 |                 final_assignment_ks = -np.ones((self._input.get_y().shape[0], self._k_max - self._k_min + 1)).astype(int)
85 | for n in range(self._k_max - self._k_min + 1):
86 | result = data_label_folds_ks[:, :, n][index_pt]
87 | final_assignment_ks_pt = consensus_clustering(result, n + self._k_min)
88 | final_assignment_ks[index_pt, n] = final_assignment_ks_pt + 1
89 |
90 | print('Saving the final results...\n')
91 | # save_cluster_results(adjusted_rand_index_results, final_assignment_ks)
92 | columns = ['ari_' + str(i) + '_subtypes' for i in self._k_range_list]
93 | ari_df = pd.DataFrame(adjusted_rand_index_results[:, np.newaxis].transpose(), columns=columns)
94 | ari_df.to_csv(os.path.join(self._output_dir, 'adjusted_rand_index.tsv'), index=False, sep='\t',
95 | encoding='utf-8')
96 |
97 | # save the final assignment for consensus clustering across different folds
98 | participant_df = pd.read_csv(self._participant_tsv, sep='\t')
99 | columns = ['assignment_' + str(i) for i in self._k_range_list]
100 | cluster_df = pd.DataFrame(final_assignment_ks, columns=columns)
101 | all_df = pd.concat([participant_df, cluster_df], axis=1)
102 | all_df.to_csv(os.path.join(self._output_dir, 'clustering_assignment.tsv'), index=False,
103 | sep='\t', encoding='utf-8')
104 |
105 | class DualSVM_Subtype_transfer_learning(WorkFlow):
106 | """
107 | Instead of training from scratch, we initialize the polytope from the former C
108 | """
109 | def __init__(self, input, participant_tsv, split_index, cv_repetition, k_list, output_dir, opnmf_output, balanced=True,
110 | test_size=0.2, num_iteration=50, tol=1e-6, predefined_c=None,
111 | weight_initialization_type='DPP', n_threads=8, num_component=10, num_component_former=10, num_run=None):
112 |
113 | self._input = input
114 | self._participant_tsv = participant_tsv
115 | self._split_index = split_index
116 | self._cv_repetition = cv_repetition
117 | self._output_dir = output_dir
118 | self._opnmf_output = opnmf_output
119 | self._k_list = k_list
120 | self._balanced = balanced
121 | self._test_size = test_size
122 | self._num_iteration = num_iteration
123 | self._tol = tol
124 | self._predefined_c = predefined_c
125 | self._weight_initialization_type = weight_initialization_type
126 | self._n_threads = n_threads
127 | self._num_component = num_component
128 | self._num_component_former = num_component_former
129 | self._num_run = num_run
130 |
131 | def run(self):
132 |
133 | if os.path.exists(os.path.join(self._output_dir, 'clustering_run' + str(self._num_run), 'component_' + str(self._num_component), "adjusted_rand_index.tsv")):
134 | print("This number of component have been trained and converged: %d" % self._num_component)
135 | else:
136 | print("cross validate for num_component, running for %d components for feature selection" % self._num_component)
137 | x = self._input.get_x(self._num_component, self._opnmf_output)
138 |
139 |             y = self._input.get_y()
140 | data_label_folds_ks = np.zeros((y.shape[0], self._cv_repetition, len(self._k_list))).astype(int)
141 |
142 | for i in range(self._cv_repetition):
143 | for j in range(len(self._k_list)):
144 |                     print('Applying HYDRA for finding %d clusters. Repetition: %d / %d...\n' % (self._k_list[j], i+1, self._cv_repetition))
145 | training_final_prediction = hydra_solver_svm_tl(self._num_component, self._num_component_former, i, x[self._split_index[i][0]], y[self._split_index[i][0]], self._k_list[j], self._output_dir,
146 | self._num_iteration, self._tol, self._balanced, self._predefined_c,
147 | self._n_threads, self._num_run)
148 |
149 |
150 | # change the final prediction's label: test data to be 0, the rest training data will be updated by the model's prediction
151 | data_label_fold = y.copy()
152 | data_label_fold[self._split_index[i][1]] = 0 # all test data to be 0
153 | data_label_fold[self._split_index[i][0]] = training_final_prediction ## assign the training prediction
154 | data_label_folds_ks[:, i, j] = data_label_fold
155 |
156 | print('Finish the clustering procedure!\n')
157 |
158 | print('Estimating clustering stability!\n')
159 | ## for the adjusted rand index, only consider the PT results
160 | adjusted_rand_index_results = np.zeros(len(self._k_list))
161 | index_pt = np.where(y == 1)[0] # index for PTs
162 | for m in range(len(self._k_list)):
163 | result = data_label_folds_ks[:, :, m][index_pt] ## the result of each K during all runs of CV
164 | adjusted_rand_index_result = cv_cluster_stability(result, self._k_list[m])
165 |
166 | # saving each k result into the final adjusted_rand_index_results
167 | adjusted_rand_index_results[m] = adjusted_rand_index_result
168 | print('Done!\n')
169 |
170 | print('Computing the final consensus group membership!\n')
171 |             final_assignment_ks = -np.ones((self._input.get_y().shape[0], len(self._k_list))).astype(int)
172 | for n in range(len(self._k_list)):
173 | result = data_label_folds_ks[:, :, n][index_pt]
174 | final_assignment_ks_pt = consensus_clustering(result, n + self._k_list[0]) ## the final subtype assignment is performed with consensus clustering with KMeans
175 | final_assignment_ks[index_pt, n] = final_assignment_ks_pt + 1
176 | print('Done!\n')
177 |
178 | print('Saving the final results!\n')
179 | # save_cluster_results(adjusted_rand_index_results, final_assignment_ks)
180 | columns = ['ari_' + str(i) + '_subtypes' for i in self._k_list]
181 | ari_df = pd.DataFrame(adjusted_rand_index_results[:, np.newaxis].transpose(), columns=columns)
182 | ari_df.to_csv(os.path.join(self._output_dir, 'clustering_run' + str(self._num_run), 'component_' + str(self._num_component), 'adjusted_rand_index.tsv'), index=False, sep='\t',
183 | encoding='utf-8')
184 |
185 | # save the final assignment for consensus clustering across different folds
186 | df_feature = pd.read_csv(self._participant_tsv, sep='\t')
187 | columns = ['assignment_' + str(i) for i in self._k_list]
188 | participant_df = df_feature.iloc[:, :3]
189 | cluster_df = pd.DataFrame(final_assignment_ks, columns=columns)
190 | all_df = pd.concat([participant_df, cluster_df], axis=1)
191 | all_df.to_csv(os.path.join(self._output_dir, 'clustering_run' + str(self._num_run), 'component_' + str(self._num_component), 'clustering_assignment.tsv'), index=False,
192 | sep='\t', encoding='utf-8')
193 |
194 | print('Done!\n')
--------------------------------------------------------------------------------
/magiccluster/utils.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import numpy as np
3 | import scipy
4 | import os, pickle
5 | from sklearn.cluster import KMeans
6 | from sklearn.metrics import adjusted_rand_score
7 | from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit, KFold, ShuffleSplit
8 | from joblib import dump
9 | import pandas as pd
10 | from multiprocessing.pool import ThreadPool
11 | from sklearn.svm import SVC
12 |
13 | __author__ = "Junhao Wen"
14 | __copyright__ = "Copyright 2023"
15 | __credits__ = ["Junhao Wen, Erdem Varol"]
16 | __license__ = "See LICENSE file"
17 | __version__ = "0.0.3"
18 | __maintainer__ = "Junhao Wen"
19 | __email__ = "junhao.wen89@gmail.com"
20 | __status__ = "Development"
21 |
22 | def elem_sym_poly(lambda_value, k):
23 | """
24 | given a vector of lambdas and a maximum size k, determine the value of
25 | the elementary symmetric polynomials:
26 | E(l+1,n+1) = sum_{J \subseteq 1..n,|J| = l} prod_{i \in J} lambda(i)
27 | :param lambda_value: the corresponding eigenvalues
28 | :param k: number of clusters
29 | :return:
30 | """
31 | N = lambda_value.shape[0]
32 | E = np.zeros((k + 1, N + 1))
33 | E[0, :] = 1
34 |
35 | for i in range(1, k+1):
36 | for j in range(1, N+1):
37 | E[i, j] = E[i, j - 1] + lambda_value[j-1] * E[i - 1, j - 1]
38 |
39 | return E
40 |
41 |
42 | def sample_k(lambda_value, k):
43 | """
44 | Pick k lambdas according to p(S) \propto prod(lambda \in S)
45 | :param lambda_value: the corresponding eigenvalues
46 | :param k: the number of clusters
47 | :return:
48 | """
49 |
50 | ## compute elementary symmetric polynomials
51 | E = elem_sym_poly(lambda_value, k)
52 |
53 |     ## iterate over the lambda values
54 | num = lambda_value.shape[0]
55 | remaining = k
56 | S = np.zeros(k)
57 | while remaining > 0:
58 | #compute marginal of num given that we choose remaining values from 0:num-1
59 | if num == remaining:
60 | marg = 1
61 | else:
62 | marg = lambda_value[num-1] * E[remaining-1, num-1] / E[remaining, num]
63 |
64 | # sample marginal
65 | if np.random.rand(1) < marg:
66 | S[remaining-1] = num
67 | remaining = remaining - 1
68 | num = num - 1
69 | return S
70 |
71 | def sample_dpp(evalue, evector, k=None):
72 | """
73 | sample a set Y from a dpp. evalue, evector are a decomposed kernel, and k is (optionally) the size of the set to return
74 | :param evalue: eigenvalue
75 | :param evector: normalized eigenvector
76 | :param k: number of cluster
77 | :return:
78 | """
79 |     if k is None:
80 | # choose eigenvectors randomly
81 | evalue = np.divide(evalue, (1 + evalue))
82 | evector = np.where(np.random.random(evalue.shape[0]) <= evalue)[0]
83 | else:
84 | v = sample_k(evalue, k) ## v here is a 1d array with size: k
85 |
86 | k = v.shape[0]
87 | v = v.astype(int)
88 | v = [i - 1 for i in v.tolist()] ## due to the index difference between matlab & python, here, the element of v is for matlab
89 | V = evector[:, v]
90 |
91 | ## iterate
92 | y = np.zeros(k)
93 | for i in range(k, 0, -1):
94 | ## compute probabilities for each item
95 | P = np.sum(np.square(V), axis=1)
96 | P = P / np.sum(P)
97 |
98 | # choose a new item to include
99 | y[i-1] = np.where(np.random.rand(1) < np.cumsum(P))[0][0]
100 | y = y.astype(int)
101 |
102 | # choose a vector to eliminate
103 | j = np.where(V[y[i-1], :])[0][0]
104 | Vj = V[:, j]
105 | V = np.delete(V, j, 1)
106 |
107 | ## Update V
108 | if V.size == 0:
109 | pass
110 | else:
111 | V = np.subtract(V, np.multiply(Vj, (V[y[i-1], :] / Vj[y[i-1]])[:, np.newaxis]).transpose()) ## watch out the dimension here
112 |
113 | ## orthogonalize
114 | for m in range(i - 1):
115 | for n in range(m):
116 | V[:, m] = np.subtract(V[:, m], np.matmul(V[:, m].transpose(), V[:, n]) * V[:, n])
117 |
118 | V[:, m] = V[:, m] / np.linalg.norm(V[:, m])
119 |
120 | y = np.sort(y)
121 |
122 | return y
123 |
124 | def proportional_assign(l, d):
125 | """
126 | Proportional assignment based on margin
127 |     :param l: 2D array, non-positive part of the margins (element-wise min(margin, 0))
128 |     :param d: 2D array, raw margins
129 | :return:
130 | """
131 | np.seterr(divide='ignore', invalid='ignore')
132 | invL = np.divide(1, l)
133 | idx = np.isinf(invL)
134 | invL[idx] = d[idx]
135 |
136 | for i in range(l.shape[0]):
137 | pos = np.where(invL[i, :] > 0)[0]
138 | neg = np.where(invL[i, :] < 0)[0]
139 | if pos.size != 0:
140 | invL[i, neg] = 0
141 | else:
142 | invL[i, :] = np.divide(invL[i, :], np.amin(invL[i, :]))
143 | invL[i, invL[i, :] < 1] = 0
144 |
145 | S = np.multiply(invL, np.divide(1, np.sum(invL, axis=1))[:, np.newaxis])
146 |
147 | return S
148 |
149 | def random_init_dirichlet(k, num_pt):
150 | """
151 | a sample from a dirichlet distribution
152 | :param k: number of clusters
153 | :param num_pt: number of PT
154 | :return:
155 | """
156 | a = np.ones(k)
157 | s = np.random.dirichlet(a, num_pt)
158 |
159 | return s
160 |
161 | def hydra_init_weight(X, y, k, index_pt, index_cn, weight_initialization_type):
162 | """
163 |     Function performs the initialization of the polytope for MAGIC
164 | Args:
165 | X: the input features
166 | y: the label
167 | k: number of predefined clusters
168 | index_pt: list, the index for patient subjects
169 | index_cn: list, the index for control subjects
170 | weight_initialization_type: the type of chosen initialization method
171 | Returns:
172 |
173 | """
174 | if weight_initialization_type == "DPP": ##
175 | num_subject = y.shape[0]
176 | W = np.zeros((num_subject, X.shape[1]))
177 | for j in range(num_subject):
178 | ipt = np.random.randint(index_pt.shape[0])
179 | icn = np.random.randint(index_cn.shape[0])
180 | W[j, :] = X[index_pt[ipt], :] - X[index_cn[icn], :]
181 |
182 | KW = np.matmul(W, W.transpose())
183 | KW = np.divide(KW, np.sqrt(np.multiply(np.diag(KW)[:, np.newaxis], np.diag(KW)[:, np.newaxis].transpose())))
184 | evalue, evector = np.linalg.eig(KW)
185 | Widx = sample_dpp(np.real(evalue), np.real(evector), k)
186 | prob = np.zeros((len(index_pt), k)) # only consider the PTs
187 |
188 | for i in range(k):
189 | prob[:, i] = np.matmul(np.multiply(X[index_pt, :], np.divide(1, np.linalg.norm(X[index_pt, :], axis=1))[:, np.newaxis]), W[Widx[i], :].transpose())
190 |
191 | l = np.minimum(prob - 1, 0)
192 | d = prob - 1
193 | S = proportional_assign(l, d)
194 |
195 | elif weight_initialization_type == "random_hyperplane":
196 | print("TODO")
197 |
198 | elif weight_initialization_type == "random_assign":
199 | S = random_init_dirichlet(k, len(index_pt))
200 |
201 | elif weight_initialization_type == "k_means":
202 | print("TODO")
203 | else:
204 | raise Exception("Not implemented yet!")
205 |
206 | return S
207 |
208 | def hydra_solver_svm(num_repetition, X, y, k, output_dir, num_consensus, num_iteration, tol, balanced, predefined_c,
209 | weight_initialization_type, n_threads, save_models, verbose):
210 | """
211 | This is the main function of HYDRA, which find the convex polytope using a supervised classification fashion.
212 | Args:
213 | num_repetition: int, number of repetitions for CV
214 | X: input matrix for features
215 | y: input for group label
216 | k: number of clusters
217 | output_dir: the path for output
218 | num_consensus: int, number of runs for consensus clustering
219 | num_iteration: int, number of maximum iterations for running HYDRA
220 | tol: float, tolerance value for model convergence
221 | balanced: if sample imbalance should be considered during model optimization
222 | predefined_c: predefined c for SVM for clustering
223 | weight_initialization_type: the type of initialization of the weighted sample matrix
224 | n_threads: number of threads used
225 | save_models: if save all models during CV
226 | verbose: if output is verbose
227 |
228 | Returns:
229 |
230 | """
231 | censensus_assignment = np.zeros((y[y == 1].shape[0], num_consensus)) ## only consider the PTs
232 |
233 | index_pt = np.where(y == 1)[0] # index for PTs
234 | index_cn = np.where(y == -1)[0] # index for CNs
235 |
236 | for i in range(num_consensus):
237 | weight_sample = np.ones((y.shape[0], k)) / k
238 | ## depending on the weight initialization strategy, random hyperplanes were initialized with maximum diversity to constitute the convex polytope
239 | weight_sample_pt = hydra_init_weight(X, y, k, index_pt, index_cn, weight_initialization_type)
240 | weight_sample[index_pt] = weight_sample_pt ## only replace the sample weight of the PT group
241 | ## cluster assignment is based on this svm scores across different SVM/hyperplanes
242 | svm_scores = np.zeros((weight_sample.shape[0], weight_sample.shape[1]))
243 | update_weights_pool = ThreadPool(processes=n_threads)
244 |
245 | for j in range(num_iteration):
246 | for m in range(k):
247 | sample_weight = np.ascontiguousarray(weight_sample[:, m])
248 | if np.count_nonzero(sample_weight[index_pt]) == 0:
249 | if verbose:
250 | print(
251 | "Cluster dropped, meaning that all PTs have been assigned to a single hyperplane in iteration: %d" % (
252 | j - 1))
253 | print(
254 | "Be careful, this could cause problems because of the ill-posed solution, especially when k==2")
255 | else:
256 | results = update_weights_pool.apply_async(launch_svc,
257 | args=(X, y, predefined_c, sample_weight, balanced))
258 | weight_coef = results.get()[0]
259 | intercept = results.get()[1]
260 | ## Apply the trained model to the data again to get the final SVM scores
261 | svm_scores[:, m] = (np.matmul(weight_coef, X.transpose()) + intercept).transpose().squeeze()
262 |
263 | cluster_index = np.argmax(svm_scores[index_pt], axis=1)
264 |
265 | ## decide the convergence of the polytope based on the tolerance
266 | weight_sample_hold = weight_sample.copy()
267 | # after each iteration, first set the weight of patient rows to be 0
268 | weight_sample[index_pt, :] = 0
269 | # then set the pt's weight to be 1 for the assigned hyperplane
270 | for n in range(len(index_pt)):
271 | weight_sample[index_pt[n], cluster_index[n]] = 1
272 |
273 | ## check the loss compared to the tolerance for the stopping criterion
274 | loss = np.linalg.norm(np.subtract(weight_sample, weight_sample_hold), ord='fro')
275 | if verbose:
276 | print("The loss is: %f" % loss)
277 | if loss < tol:
278 | if verbose:
279 | print(
280 | "The polytope has converged at iteration %d in finding %d clusters in consensus run: %d" % (
281 | j, k, i))
282 | break
283 | update_weights_pool.close()
284 | update_weights_pool.join()
285 |
286 | ## update the cluster index for the consensus clustering
287 | consensus_assignment[:, i] = cluster_index + 1
288 |
289 | ## do consensus clustering
290 | final_predict = consensus_clustering(consensus_assignment.astype(int), k)
291 |
292 | ## after deciding the final convex polytope, we refit the training data once to save the best model
293 | weight_sample_final = np.zeros((y.shape[0], k))
294 | ## change the weight of PTs to be 1, CNs to be 1/k
295 |
296 | # then set the pt's weight to be 1 for the assigned hyperplane
297 | for n in range(len(index_pt)):
298 | weight_sample_final[index_pt[n], final_predict[n]] = 1
299 |
300 | weight_sample_final[index_cn] = 1 / k
301 | update_weights_pool_final = ThreadPool(processes=n_threads)
302 | ## create the final polytope by applying all weighted subjects
303 | for o in range(k):
304 | sample_weight = np.ascontiguousarray(weight_sample_final[:, o])
305 | results = update_weights_pool_final.apply_async(launch_svc, args=(X, y, predefined_c, sample_weight, balanced))
306 |
307 | if not os.path.exists(os.path.join(output_dir, str(k) + '_clusters', 'models')):
308 | os.makedirs(os.path.join(output_dir, str(k) + '_clusters', 'models'))
309 |
310 | ## save the final model for the k SVMs/hyperplanes
311 | if save_models:
312 | if not os.path.exists(os.path.join(output_dir, str(k) + '_clusters', 'models')):
313 | os.makedirs(os.path.join(output_dir, str(k) + '_clusters', 'models'))
314 |
315 | dump(results.get()[2], os.path.join(output_dir, str(k) + '_clusters', 'models',
316 | 'svm-' + str(o) + '_cv_' + str(num_repetition) + '.joblib'))
317 | else:
318 | ## only save the last repetition
319 | if not os.path.isfile(os.path.join(output_dir, str(k) + '_clusters', 'models',
320 | 'svm-' + str(o) + '_last_repetition.joblib')):
321 | dump(results.get()[2], os.path.join(output_dir, str(k) + '_clusters', 'models',
322 | 'svm-' + str(o) + '_last_repetition.joblib'))
323 | update_weights_pool_final.close()
324 | update_weights_pool_final.join()
325 |
326 | y[index_pt] = final_predict + 1
327 |
328 | if not os.path.exists(os.path.join(output_dir, str(k) + '_clusters', 'tsv')):
329 | os.makedirs(os.path.join(output_dir, str(k) + '_clusters', 'tsv'))
330 |
331 | ### save results also in tsv file for each repetition
332 | ## save the assigned weight for each subject across k-fold
333 | columns = ['hyperplane' + str(i) for i in range(k)]
334 | weight_sample_df = pd.DataFrame(weight_sample_final, columns=columns)
335 | weight_sample_df.to_csv(
336 | os.path.join(output_dir, str(k) + '_clusters', 'tsv', 'weight_sample_cv_' + str(num_repetition) + '.tsv'),
337 | index=False, sep='\t', encoding='utf-8')
338 |
339 | ## save the final_predict_all
340 | columns = ['y_hat']
341 | y_hat_df = pd.DataFrame(y, columns=columns)
342 | y_hat_df.to_csv(os.path.join(output_dir, str(k) + '_clusters', 'tsv', 'y_hat_cv_' + str(num_repetition) + '.tsv'),
343 | index=False, sep='\t', encoding='utf-8')
344 |
345 | ## save the pt index
346 | columns = ['pt_index']
347 | pt_df = pd.DataFrame(index_pt, columns=columns)
348 | pt_df.to_csv(os.path.join(output_dir, str(k) + '_clusters', 'tsv', 'pt_index_cv_' + str(num_repetition) + '.tsv'),
349 | index=False, sep='\t', encoding='utf-8')
350 |
351 | return y
352 |
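## Illustrative call sketch (hypothetical values): one CV repetition of HYDRA looking for 3 clusters,
## assuming X, y and output_dir are already defined:
##     y_relabelled = hydra_solver_svm(num_repetition=0, X=X, y=y, k=3, output_dir=output_dir,
##                                     num_consensus=20, num_iteration=50, tol=1e-6, balanced=True,
##                                     predefined_c=0.25, weight_initialization_type="DPP",
##                                     n_threads=8, save_models=False, verbose=True)
## Patients in the returned vector are relabelled by their assigned cluster (1..k); controls keep -1.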
353 | def GLMcorrection(X_train, Y_train, covar_train, X_test, covar_test):
354 | """
355 | Eliminate the confounding effect of covariates, such as age and sex, from the disease-related changes.
356 | Ref: "Age Correction in Dementia Matching to a Healthy Brain"
357 | :param X_train: array, training features
358 | :param Y_train: array, training labels
359 | :param covar_train: array, training covariate data
360 | :param X_test: array, test features
361 | :param covar_test: array, test covariate data
362 | :return: corrected training & test feature data
363 | """
364 | Yc = X_train[Y_train == -1]
365 | Xc = covar_train[Y_train == -1]
366 | Xc = np.concatenate((Xc, np.ones((Xc.shape[0], 1))), axis=1)
367 | beta = np.matmul(np.matmul(Yc.transpose(), Xc), np.linalg.inv(np.matmul(Xc.transpose(), Xc)))
368 | num_col = beta.shape[1]
369 | X_train_cor = (X_train.transpose() - np.matmul(beta[:, : num_col - 1], covar_train.transpose())).transpose()
370 | X_test_cor = (X_test.transpose() - np.matmul(beta[:, : num_col - 1], covar_test.transpose())).transpose()
371 |
372 | return X_train_cor, X_test_cor
373 |
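## Illustrative usage sketch (hypothetical arrays): the regression coefficients beta are estimated on
## the control subjects only (Y_train == -1) and the covariate effect is then removed from both sets:
##     X_train_cor, X_test_cor = GLMcorrection(X_train, Y_train, covar_train, X_test, covar_test)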
374 | def launch_svc(X, y, predefined_c, sample_weight, balanced):
375 | """
376 | Lauch svc classifier of sklearn
377 | Args:
378 | X: input matrix for features
379 | y: input matrix for label
380 | predefined_c: predefined C
381 | sample_weight: the weighted sample matrix
382 | balanced:
383 |
384 | Returns:
385 |
386 | """
387 | if not balanced:
388 | model = SVC(kernel='linear', C=predefined_c)
389 | else:
390 | model = SVC(kernel='linear', C=predefined_c, class_weight='balanced')
391 |
392 | ## fit the different SVM/hyperplanes
393 | model.fit(X, y, sample_weight=sample_weight)
394 |
395 | weight_coef = model.coef_
396 | intercept = model.intercept_
397 |
398 | return weight_coef, intercept, model
399 |
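## Illustrative usage sketch (hypothetical values): fit one weighted linear SVM and recompute its scores,
## mirroring how the solver above uses the returned coefficients and intercept:
##     weight_coef, intercept, model = launch_svc(X, y, 0.25, np.ones(y.shape[0]), balanced=False)
##     svm_scores = np.matmul(weight_coef, X.transpose()) + intercept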
400 | def check_symmetric(a, rtol=1e-05, atol=1e-08):
401 | """
402 | Check if the numpy array is symmetric or not
403 | Args:
404 | a:
405 | rtol:
406 | atol:
407 |
408 | Returns:
409 |
410 | """
411 | result = np.allclose(a, a.T, rtol=rtol, atol=atol)
412 | return result
413 |
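## Small sanity-check examples:
##     check_symmetric(np.array([[1.0, 2.0], [2.0, 1.0]]))  # -> True
##     check_symmetric(np.array([[1.0, 2.0], [3.0, 1.0]]))  # -> False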
414 | def make_cv_partition(diagnosis, cv_strategy, output_dir, cv_repetition, seed=None):
415 | """
416 | Randomly generate the data split indices for the different CV strategies.
417 |
418 | :param diagnosis: the list of labels
419 | :param cv_strategy: the CV strategy, either 'k_fold' or 'hold_out'
420 | :param output_dir: the output folder path
421 | :param cv_repetition: the number of repetitions (or folds) for CV
422 | :param seed: random seed for the sklearn split generator. Default is None
423 | :return: the list of split indices and the path of the pickle file in which they are saved
424 | """
425 | unique = list(set(diagnosis))
426 | y = np.array(diagnosis)
427 | if len(unique) == 2: ### CV for classification and clustering
428 | if cv_strategy == 'k_fold':
429 | splits_indices_pickle = os.path.join(output_dir, 'data_split_stratified_' + str(cv_repetition) + '-fold.pkl')
430 | ## check whether the split has already been generated
431 | if os.path.isfile(splits_indices_pickle):
432 | splits_indices = pickle.load(open(splits_indices_pickle, 'rb'))
433 | else:
434 | splits = StratifiedKFold(n_splits=cv_repetition, random_state=seed)
435 | splits_indices = list(splits.split(np.zeros(len(y)), y))
436 | elif cv_strategy == 'hold_out':
437 | splits_indices_pickle = os.path.join(output_dir, 'data_split_stratified_' + str(cv_repetition) + '-holdout.pkl')
438 | ## check whether the split has already been generated
439 | if os.path.isfile(splits_indices_pickle):
440 | splits_indices = pickle.load(open(splits_indices_pickle, 'rb'))
441 | else:
442 | splits = StratifiedShuffleSplit(n_splits=cv_repetition, test_size=0.2, random_state=seed)
443 | splits_indices = list(splits.split(np.zeros(len(y)), y))
444 | else:
445 | raise Exception("this cross validation strategy has not been implemented!")
446 | elif len(unique) == 1:
447 | raise Exception("Diagnosis cannot be the same for all participants...")
448 | else: ### CV for regression, no need to be stratified
449 | if cv_strategy == 'k_fold':
450 | splits_indices_pickle = os.path.join(output_dir, 'data_split_' + str(cv_repetition) + '-fold.pkl')
451 |
452 | ## check whether the split has already been generated
453 | if os.path.isfile(splits_indices_pickle):
454 | splits_indices = pickle.load(open(splits_indices_pickle, 'rb'))
455 | else:
456 | splits = KFold(n_splits=cv_repetition, random_state=seed)
457 | splits_indices = list(splits.split(np.zeros(len(y)), y))
458 | elif cv_strategy == 'hold_out':
459 | splits_indices_pickle = os.path.join(output_dir, 'data_split_' + str(cv_repetition) + '-holdout.pkl')
460 | ## check whether the split has already been generated
461 | if os.path.isfile(splits_indices_pickle):
462 | splits_indices = pickle.load(open(splits_indices_pickle, 'rb'))
463 | else:
464 | splits = ShuffleSplit(n_splits=cv_repetition, test_size=0.2, random_state=seed)
465 | splits_indices = list(splits.split(np.zeros(len(y)), y))
466 | else:
467 | raise Exception("this cross validation strategy has not been implemented!")
468 |
469 | with open(splits_indices_pickle, 'wb') as s:
470 | pickle.dump(splits_indices, s)
471 |
472 | return splits_indices, splits_indices_pickle
473 |
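## Illustrative usage sketch (hypothetical values): a stratified hold-out split repeated 100 times,
## reusing the pickled indices if they already exist in output_dir:
##     splits_indices, splits_pickle = make_cv_partition(diagnosis, 'hold_out', output_dir, 100, seed=0)
##     for train_index, test_index in splits_indices:
##         X_train, X_test = X[train_index], X[test_index]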
474 | def consensus_clustering(clustering_results, k):
475 | """
476 | This function performs consensus clustering on a co-occurrence matrix.
477 | :param clustering_results: an array containing all the clustering assignments across the different
478 | consensus runs, one column per run
479 | :param k: the number of clusters
480 | :return: the final cluster assignment for each subject
481 | """
482 |
483 | num_pt = clustering_results.shape[0]
484 | cooccurence_matrix = np.zeros((num_pt, num_pt))
485 |
486 | for i in range(num_pt - 1):
487 | for j in range(i + 1, num_pt):
488 | cooccurence_matrix[i, j] = sum(clustering_results[i, :] == clustering_results[j, :])
489 |
490 | cooccurence_matrix = np.add(cooccurence_matrix, cooccurence_matrix.transpose())
491 | ## compute the Laplacian matrix
492 | Laplacian = np.subtract(np.diag(np.sum(cooccurence_matrix, axis=1)), cooccurence_matrix)
493 |
494 | Laplacian_norm = np.subtract(np.eye(num_pt), np.matmul(np.matmul(np.diag(1 / np.sqrt(np.sum(cooccurence_matrix, axis=1))), cooccurence_matrix), np.diag(1 / np.sqrt(np.sum(cooccurence_matrix, axis=1)))))
495 | ## replace the nan with 0
496 | Laplacian_norm = np.nan_to_num(Laplacian_norm)
497 |
498 | ## check whether the normalized Laplacian is symmetric, because the matlab eig function automatically checks this, but numpy and scipy do not
499 | if check_symmetric(Laplacian_norm):
500 | ## extract the eigen value and vector
501 | ## matlab eig equivalence is eigh, not eig from numpy or scipy, see this post: https://stackoverflow.com/questions/8765310/scipy-linalg-eig-return-complex-eigenvalues-for-covariance-matrix
502 | ## Note, the eigenvectors are not unique, thus matlab and python may return different eigenvectors, but this does not affect the results.
503 | evalue, evector = scipy.linalg.eigh(Laplacian_norm)
504 | else:
505 | # evalue, evector = np.linalg.eig(Laplacian_norm)
506 | raise Exception("The Laplacian matrix should be symmetric here...")
507 |
508 | ## check if the eigenvectors are complex
509 | if np.any(np.iscomplex(evector)):
510 | evalue, evector = scipy.linalg.eigh(Laplacian)
511 |
512 | ## run k-means with sklearn on the first k eigenvectors
513 | kmeans = KMeans(n_clusters=k, n_init=20).fit(evector.real[:, 0: k])
514 | final_predict = kmeans.labels_
515 |
516 | return final_predict
517 |
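## Intuition sketch (toy, hypothetical input): each column is one consensus run, each row one patient.
##     clustering_results = np.array([[1, 1, 1],
##                                    [1, 1, 2],
##                                    [2, 2, 2]])
## Patients 0 and 1 co-occur in two of the three runs while patient 2 almost never joins them, so the
## spectral clustering above would typically return one cluster for {0, 1} and another for {2} when k = 2.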
518 | def cv_cluster_stability(result, k):
519 | """
520 | Compute the adjusted rand index (ARI) across the different pairs of folds of the CV.
521 | :param result: array of cluster assignments, one column per fold/repetition
522 | :return: the mean adjusted rand index across all pairs of folds (set to 0 when k == 1)
523 | """
524 |
525 | num_pair = 0
526 | aris = []
527 | if k == 1:
528 | adjusted_rand_index = 0 ## note, we manually set it to 0 because the ARI is not meaningful when k==1. TODO: clarify whether there is real heterogeneity in the data, i.e., k == 1 or k > 1
529 | else:
530 | for i in range(result.shape[1] - 1):
531 | for j in range(i+1, result.shape[1]):
532 | num_pair += 1
533 | non_zero_index = np.all(result[:, [i, j]], axis=1)
534 | pair_result = result[:, [i, j]][non_zero_index]
535 | ari = adjusted_rand_score(pair_result[:, 0], pair_result[:, 1])
536 | aris.append(ari)
537 |
538 | adjusted_rand_index = np.mean(np.asarray(aris))
539 |
540 | return adjusted_rand_index
541 |
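## Illustrative example (hypothetical input): two folds with identical assignments give perfect stability.
##     result = np.array([[1, 1], [1, 1], [2, 2], [2, 2]])  # 4 patients x 2 folds
##     cv_cluster_stability(result, 2)  # -> 1.0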
542 | def hydra_solver_svm_tl(num_component, num_component_former, num_repetition, X, y, k, output_dir, num_iteration, tol, balanced, predefined_c, n_threads, num_run):
543 | """
544 | This is the main function of HYDRA, which find the convex polytope using a supervised classification fashion.
545 | :param num_repetition: the number of iteration of CV currently. This is helpful to reconstruct the model and also moniter the processing
546 | :param X: corrected training data feature
547 | :param y: traing data label
548 | :param k: hyperparameter for desired number of clusters in patients
549 | :param options: commandline parameters
550 | :return: the optimal model
551 | """
552 | index_pt = np.where(y == 1)[0] # index for PTs
553 | index_cn = np.where(y == -1)[0] # index for CNs
554 |
555 | ### initialize the final weight for the polytope from the former C
556 | weight_file = os.path.join(output_dir, 'clustering_run' + str(num_run-1), 'component_' + str(num_component_former), str(k) + '_clusters', 'tsv', 'weight_sample_cv_' + str(num_repetition) + '.tsv')
557 | weight_sample = pd.read_csv(weight_file, sep='\t').to_numpy()
558 |
559 | ## cluster assignment is based on the SVM scores across the different SVMs/hyperplanes
560 | svm_scores = np.zeros((weight_sample.shape[0], weight_sample.shape[1]))
561 | update_weights_pool = ThreadPool(n_threads)
562 | for j in range(num_iteration):
563 | for m in range(k):
564 | sample_weight = np.ascontiguousarray(weight_sample[:, m])
565 |
566 | if np.count_nonzero(sample_weight[index_pt]) == 0:
567 | print("Cluster dropped, meaning that all PT has been assigned to one single hyperplane in iteration: %d" % (j-1))
568 | svm_scores[:, m] = np.asarray([np.NINF] * (y.shape[0]))
569 | else:
570 |
571 | results = update_weights_pool.apply_async(launch_svc, args=(X, y, predefined_c, sample_weight, balanced))
572 | weight_coef = results.get()[0]
573 | intercept = results.get()[1]
574 | ## Apply the trained model to the data again to get the final SVM scores
575 | svm_scores[:, m] = (np.matmul(weight_coef, X.transpose()) + intercept).transpose().squeeze()
576 |
577 |
578 | final_predict = np.argmax(svm_scores[index_pt], axis=1)
579 |
580 | ## decide the convergence of the polytope based on the tolerance
581 | weight_sample_hold = weight_sample.copy()
582 | # after each iteration, first set the weight of patient rows to be 0
583 | weight_sample[index_pt, :] = 0
584 | # then set the pt's weight to be 1 for the assigned hyperplane
585 | for n in range(len(index_pt)):
586 | weight_sample[index_pt[n], final_predict[n]] = 1
587 |
588 | ## check the loss compared to the tolerance for the stopping criterion
589 | loss = np.linalg.norm(np.subtract(weight_sample, weight_sample_hold), ord='fro')
590 | print("The loss is: %f" % loss)
591 | if loss < tol:
592 | print("The polytope has been converged for iteration %d in finding %d clusters" % (j, k))
593 | break
594 | update_weights_pool.close()
595 | update_weights_pool.join()
596 |
597 | ## after deciding the final convex polytope, we refit the training data once to save the best model
598 | weight_sample_final = np.zeros((y.shape[0], k))
599 | ## change the weight of PTs to be 1, CNs to be 1/k
600 |
601 | # then set the pt's weight to be 1 for the assigned hyperplane
602 | for n in range(len(index_pt)):
603 | weight_sample_final[index_pt[n], final_predict[n]] = 1
604 |
605 | weight_sample_final[index_cn] = 1 / k
606 | update_weights_pool_final = ThreadPool(n_threads)
607 |
608 | for o in range(k):
609 | sample_weight = np.ascontiguousarray(weight_sample_final[:, o])
610 | if np.count_nonzero(sample_weight[index_pt]) == 0:
611 | print("Cluster dropped, meaning that the %d th hyperplane is useless!" % (o))
612 | else:
613 | results = update_weights_pool_final.apply_async(launch_svc, args=(X, y, predefined_c, sample_weight, balanced))
614 |
615 | ## save the final model for the k SVMs/hyperplanes
616 | if not os.path.exists(
617 | os.path.join(output_dir, 'clustering_run' + str(num_run), 'component_' + str(num_component),
618 | str(k) + '_clusters', 'models')):
619 | os.makedirs(os.path.join(output_dir, 'clustering_run' + str(num_run), 'component_' + str(num_component),
620 | str(k) + '_clusters', 'models'))
621 |
622 | dump(results.get()[2],
623 | os.path.join(output_dir, 'clustering_run' + str(num_run), 'component_' + str(num_component),
624 | str(k) + '_clusters', 'models',
625 | 'svm-' + str(o) + '_last_repetition.joblib'))
626 |
627 | update_weights_pool_final.close()
628 | update_weights_pool_final.join()
629 |
630 | y[index_pt] = final_predict + 1
631 |
632 | if not os.path.exists(os.path.join(output_dir, 'clustering_run' + str(num_run), 'component_' + str(num_component), str(k) + '_clusters', 'tsv')):
633 | os.makedirs(os.path.join(output_dir, 'clustering_run' + str(num_run), 'component_' + str(num_component), str(k) + '_clusters', 'tsv'))
634 |
635 | ## save the assigned weight for each subject across k-fold
636 | columns = ['hyperplane' + str(i) for i in range(k)]
637 | weight_sample_df = pd.DataFrame(weight_sample_final, columns=columns)
638 | weight_sample_df.to_csv(os.path.join(output_dir, 'clustering_run' + str(num_run), 'component_' + str(num_component), str(k) + '_clusters', 'tsv', 'weight_sample_cv_' + str(num_repetition) + '.tsv'), index=False, sep='\t', encoding='utf-8')
639 |
640 | ## save the final_predict_all
641 | columns = ['y_hat']
642 | y_hat_df = pd.DataFrame(y, columns=columns)
643 | y_hat_df.to_csv(os.path.join(output_dir, 'clustering_run' + str(num_run), 'component_' + str(num_component), str(k) + '_clusters', 'tsv', 'y_hat_cv_' + str(num_repetition) + '.tsv'), index=False, sep='\t', encoding='utf-8')
644 |
645 | ## save the pt index
646 | columns = ['pt_index']
647 | pt_df = pd.DataFrame(index_pt, columns=columns)
648 | pt_df.to_csv(os.path.join(output_dir, 'clustering_run' + str(num_run), 'component_' + str(num_component), str(k) + '_clusters', 'tsv', 'pt_index_cv_' + str(num_repetition) + '.tsv'), index=False, sep='\t', encoding='utf-8')
649 |
650 | return y
651 |
652 | def cluster_stability_across_resolution(c, c_former, output_dir, k_continuing, num_run, stop_tol=0.98):
653 | """
654 | To evaluate the stability of clustering across two different C for stopping criterion.
655 | Args:
656 | c:
657 | c_former:
658 | output_dir:
659 | k_continuing:
660 | num_run:
661 | stop_tol:
662 | max_num_iter:
663 |
664 | Returns:
665 |
666 | """
667 | ## read the output of current C and former Cs
668 | cluster_ass1 = os.path.join(output_dir, 'clustering_run' + str(num_run), 'component_' + str(c), 'clustering_assignment.tsv')
669 | ass1_df = pd.read_csv(cluster_ass1, sep='\t')
670 | ass1_df = ass1_df.loc[ass1_df['diagnosis'] == 1]
671 |
672 | cluster_ass2 = os.path.join(output_dir, 'clustering_run' + str(num_run-1), 'component_' + str(c_former), 'clustering_assignment.tsv')
673 | ass2_df = pd.read_csv(cluster_ass2, sep='\t')
674 | ass2_df = ass2_df.loc[ass2_df['diagnosis'] == 1]
675 |
676 | df_final = pd.DataFrame(columns=['C', 'K', 'num_run'])
677 |
678 | k_continuing_update = []
679 | k_converged = []
680 | for i in k_continuing:
681 | ari = adjusted_rand_score(ass1_df['assignment_' + str(i)], ass2_df['assignment_' + str(i)])
682 | print("For k == %d, run %d got ARI == %f compared to former run" % (i, num_run, ari))
683 | if ari < stop_tol and num_run:
684 | k_continuing_update.append(i)
685 | else:
686 | print("Model has been converged or stop at the max iteration: C == %d, K == %d and run == %d" % (c, i, num_run))
687 | k_converged.append(i)
688 | df_row = pd.DataFrame(columns=['C', 'K', 'num_run'])
689 | df_row.loc[len(df_row)] = [c, i, num_run]
690 | df_final = df_final.append(df_row)
691 |
692 | if len(k_converged) != 0:
693 | df_final.to_csv(os.path.join(output_dir, 'results_convergence_run' + str(num_run) + '.tsv'), index=False, sep='\t', encoding='utf-8')
694 |
695 | return k_continuing_update, k_converged
696 |
697 | def summary_clustering_result_multiscale(output_dir, k_min, k_max):
698 | """
699 | This is a function to summarize the clustering results
700 | :param num_components_min:
701 | :param num_components_max:
702 | :param num_components_step:
703 | :param output_dir:
704 | :return:
705 | """
706 | clu_col_list = ['assignment_' + str(e) for e in range(k_min, k_max)]
707 | df_clusters = pd.DataFrame(columns=clu_col_list)
708 |
709 | ## read the convergence tsv
710 | convergence_tsvs = [f for f in glob.glob(output_dir + "/results_convergence_*.tsv", recursive=True)]
711 |
712 | for tsv in convergence_tsvs:
713 | df_convergence = pd.read_csv(tsv, sep='\t')
714 |
715 | ## sort by K
716 | df_convergence = df_convergence.sort_values(by=['K'])
717 |
718 | for i in range(df_convergence.shape[0]):
719 | k = df_convergence['K'].tolist()[i]
720 | num_run = df_convergence['num_run'].tolist()[i]
721 | C = df_convergence['C'].tolist()[i]
722 | cluster_file = os.path.join(output_dir, 'clustering_run' + str(num_run), 'component_' + str(C), 'clustering_assignment.tsv')
723 |
724 | df_cluster = pd.read_csv(cluster_file, sep='\t')
725 | if i == 0:
726 | df_header = df_cluster.iloc[:, 0:3]
727 | assign = df_cluster['assignment_' + str(k)]
728 | df_clusters['assignment_' + str(k)] = assign
729 |
730 | ## concatenate the header
731 | df_assignment = pd.concat((df_header, df_clusters), axis=1)
732 |
733 | ## save the result
734 | df_assignment.to_csv(os.path.join(output_dir, 'results_cluster_assignment_final.tsv'), index=False, sep='\t', encoding='utf-8')
735 |
736 | def shift_list(c_list, index):
737 | """
738 | This is a function to reorder a list to have all posibility by putting each element in the first place
739 | Args:
740 | c_list: list to shift
741 | index: the index of which element to shift
742 |
743 | Returns:
744 |
745 | """
746 | new_list = c_list[index:] + c_list[:index]
747 |
748 | return new_list
749 |
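## For example: shift_list([2, 3, 4, 5], 2) -> [4, 5, 2, 3]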
750 | def consensus_clustering_across_c(output_dir, c_list, k_min, k_max):
751 | """
752 | This is for consensus learning at the end across different Cs
753 | Args:
754 | output_dir:
755 | c_list:
756 |
757 | Returns:
758 |
759 | """
760 | k_list = list(range(k_min, k_max+1))
761 | for k in k_list:
762 | for i in c_list:
763 | clu_col_list = ['c_' + str(i) + '_assignment_' + str(e) for e in k_list]
764 | df_clusters = pd.DataFrame(columns=clu_col_list)
765 |
766 | tsv = os.path.join(output_dir, 'initialization_c_' + str(i), 'results_cluster_assignment_final.tsv')
767 | df = pd.read_csv(tsv, sep='\t')
768 |
769 | if i == c_list[0]:
770 | df_header = df.iloc[:, 0:3]
771 | df_clusters['c_' + str(i) + '_assignment_' + str(k)] = df['assignment_' + str(k)]
772 | if i == c_list[0]:
773 | df_final = df_clusters
774 | else:
775 | df_final = pd.concat([df_final, df_clusters], axis=1)
776 |
777 | ## concatenate the header and the results
778 | df_final = pd.concat([df_header, df_final], axis=1)
779 | df_final_pt = df_final.loc[df_final['diagnosis'] == 1]
780 | df_final_cn = df_final.loc[df_final['diagnosis'] == -1]
781 | num_cn = df_final_cn.shape[0]
782 |
783 | ## create the final dataframe to store the final assignment
784 | col_list = ['assignment_' + str(e) for e in k_list]
785 | df_final_assign = pd.DataFrame(columns=col_list)
786 |
787 | ## read the final clustering assignment for each C
788 | for m in k_list:
789 | columns_names = ['c_' + str(e) + '_assignment_' + str(m) for e in c_list]
790 | assignment_pt = df_final_pt[columns_names]
791 | final_predict_pt = consensus_clustering(assignment_pt.to_numpy(), m)
792 | final_predict_cn = -2 * np.ones(num_cn)
793 | final_predict = np.concatenate((final_predict_cn, final_predict_pt)).astype(int)
794 | df_final_assign['assignment_' + str(m)] = final_predict + 1
795 |
796 | df_final_assign = pd.concat([df_header, df_final_assign], axis=1)
797 | ## save the final results into tsv file.
798 | df_final_assign.to_csv(os.path.join(output_dir, 'results_cluster_assignment_final.tsv'), index=False, sep='\t',
799 | encoding='utf-8')
--------------------------------------------------------------------------------