├── magiccluster ├── __init__.py ├── main.py ├── base.py ├── cli.py ├── magic_clustering.py ├── clustering.py └── utils.py ├── docs ├── _config.yml ├── images │ └── magic.png └── index.md ├── data ├── magic.png ├── participant.tsv └── test_covariate.tsv ├── requirements.txt ├── .gitignore ├── install_requirements.sh ├── CITATION.cff ├── setup.py ├── LICENSE └── README.md /magiccluster/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-leap-day -------------------------------------------------------------------------------- /data/magic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anbai106/MAGIC/HEAD/data/magic.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | scikit-learn==0.21.3 3 | pandas 4 | nibabel 5 | -------------------------------------------------------------------------------- /docs/images/magic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anbai106/MAGIC/HEAD/docs/images/magic.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.idea/ 2 | # Compiled python modules. 3 | *.pyc 4 | 5 | # Setuptools distribution folder. 6 | /dist/ 7 | /build/ 8 | /venv/ 9 | 10 | test.py 11 | 12 | # Python egg metadata, regenerated from source files by setuptools. 13 | /*.egg-info -------------------------------------------------------------------------------- /install_requirements.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | while read requirement; 4 | do 5 | if conda install --yes $requirement; then 6 | echo "Successfully install: ${requirement}" 7 | else 8 | conda install --yes -c conda-forge $requirement 9 | fi 10 | done < requirements.txt 11 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | abstract: "This is my MAGIC software for research purposes only..." 2 | authors: 3 | - family-names: Wen 4 | given-names: Junhao 5 | orcid: "https://orcid.org/0000-0003-2077-3070" 6 | cff-version: 1.2.0 7 | version: 0.0.3 8 | date-released: "2023-09-24" 9 | keywords: 10 | - "multi-scale clustering" 11 | - research 12 | license: MIT 13 | message: "If you use this software, please cite it using these metadata." 
14 | repository-code: "https://github.com/anbai106/MAGIC" 15 | title: "MAGIC" -------------------------------------------------------------------------------- /magiccluster/main.py: -------------------------------------------------------------------------------- 1 | from magiccluster import cli 2 | 3 | __author__ = "Junhao Wen" 4 | __copyright__ = "Copyright 2023" 5 | __credits__ = ["Junhao Wen"] 6 | __license__ = "See LICENSE file" 7 | __version__ = "0.0.3" 8 | __maintainer__ = "Junhao Wen" 9 | __email__ = "junhao.wen89@gmail.com" 10 | __status__ = "Development" 11 | 12 | def main(): 13 | 14 | parser = cli.parse_command_line() 15 | args = parser.parse_args() 16 | args.func(args) 17 | 18 | 19 | if __name__ == '__main__': 20 | main() 21 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r") as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name="magiccluster", 8 | version="0.0.3", 9 | author="junhao.wen", 10 | author_email="junhao.wen89@email.com", 11 | description="Multi-scale semi-supervised clustering", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/anbai106/MAGIC", 15 | packages=setuptools.find_packages(), 16 | entry_points={ 17 | 'console_scripts': [ 18 | 'magiccluster = magiccluster.main:main', 19 | ], 20 | }, 21 | classifiers=( 22 | "Programming Language :: Python :: 3", 23 | "License :: OSI Approved :: MIT License", 24 | "Operating System :: OS Independent", 25 | ), 26 | ) 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Junhao WEN 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 
3 | ![magic logo](https://raw.githubusercontent.com/anbai106/MAGIC/HEAD/docs/images/magic.png)
4 | 
5 | 
6 | # MAGIC
7 | 
8 | 
9 | **Multi-scAle heteroGeneity analysIs and Clustering**
10 | 
11 | 
12 | [Documentation](docs/index.md)
13 | 
14 | 15 | ## `MAGIC` 16 | **MAGIC**, Multi-scAle heteroGeneity analysIs and Clustering, is a multi-scale semi-supervised clustering method that aims to derive robust clustering solutions across different scales for brain diseases. 17 | 18 | > :warning: **The documentation of this software is currently under development** 19 | 20 | ## Citing this work 21 | > :warning: Please let me know if you use this package for your publication; I will update your papers in the section of **Publication using MAGIC**... 22 | 23 | > :warning: Please cite the software using the **Cite this repository** button on the right sidebar menu, as well as the original papers below ... 24 | 25 | ### Original papers 26 | > Wen J., Varol E., Chand G., Sotiras A., Davatzikos C. (2020) **MAGIC: Multi-scale Heterogeneity Analysis and Clustering for Brain Diseases**. Medical Image Computing and Computer Assisted Intervention – MICCAI 2020. MICCAI 2020. Lecture Notes in Computer Science, vol 12267. Springer, Cham. https://doi.org/10.1007/978-3-030-59728-3_66 27 | 28 | > Wen J., Varol E., Chand G., Sotiras A., Davatzikos C. (2022) **Multi-scale semi-supervised clustering of brain images: Deriving disease subtypes**. Medical Image Analysis, 2022. https://doi.org/10.1016/j.media.2021.102304 - [Link](https://www.sciencedirect.com/science/article/pii/S1361841521003492) 29 | -------------------------------------------------------------------------------- /data/participant.tsv: -------------------------------------------------------------------------------- 1 | participant_id session_id diagnosis 2 | sub-80010 ses-M0 -1 3 | sub-80179 ses-M0 -1 4 | sub-80199 ses-M0 -1 5 | sub-80208 ses-M0 1 6 | sub-80249 ses-M0 -1 7 | sub-80265 ses-M0 1 8 | sub-80289 ses-M0 1 9 | sub-80396 ses-M0 -1 10 | sub-80425 ses-M0 1 11 | sub-80498 ses-M0 1 12 | sub-80537 ses-M0 -1 13 | sub-80557 ses-M0 -1 14 | sub-80575 ses-M0 -1 15 | sub-80607 ses-M0 1 16 | sub-80680 ses-M0 1 17 | sub-80688 ses-M0 -1 18 | sub-80765 ses-M0 -1 19 | sub-80812 ses-M0 1 20 | sub-80854 ses-M0 -1 21 | sub-80889 ses-M0 -1 22 | sub-81043 ses-M0 -1 23 | sub-81222 ses-M0 -1 24 | sub-81231 ses-M0 1 25 | sub-81287 ses-M0 -1 26 | sub-81323 ses-M0 -1 27 | sub-81353 ses-M0 -1 28 | sub-81456 ses-M0 -1 29 | sub-81528 ses-M0 -1 30 | sub-81533 ses-M0 1 31 | sub-81544 ses-M0 -1 32 | sub-81644 ses-M0 -1 33 | sub-81659 ses-M0 -1 34 | sub-81662 ses-M0 1 35 | sub-81754 ses-M0 1 36 | sub-81826 ses-M0 -1 37 | sub-81865 ses-M0 -1 38 | sub-81876 ses-M0 -1 39 | sub-81903 ses-M0 -1 40 | sub-81906 ses-M0 1 41 | sub-81989 ses-M0 -1 42 | sub-81992 ses-M0 -1 43 | sub-82003 ses-M0 1 44 | sub-82021 ses-M0 -1 45 | sub-82063 ses-M0 1 46 | sub-82066 ses-M0 1 47 | sub-82096 ses-M0 -1 48 | sub-82124 ses-M0 1 49 | sub-82155 ses-M0 1 50 | sub-82202 ses-M0 1 51 | sub-82208 ses-M0 -1 52 | sub-82217 ses-M0 -1 53 | sub-82229 ses-M0 1 54 | sub-82232 ses-M0 -1 55 | sub-82281 ses-M0 1 56 | sub-82293 ses-M0 1 57 | sub-82311 ses-M0 -1 58 | sub-82359 ses-M0 -1 59 | sub-82373 ses-M0 -1 60 | sub-82423 ses-M0 -1 61 | sub-82453 ses-M0 1 62 | sub-82458 ses-M0 -1 63 | sub-82467 ses-M0 -1 64 | sub-82492 ses-M0 1 65 | sub-82511 ses-M0 1 66 | sub-82587 ses-M0 -1 67 | sub-82674 ses-M0 1 68 | sub-82709 ses-M0 1 69 | sub-82754 ses-M0 -1 70 | sub-82784 ses-M0 1 71 | sub-82877 ses-M0 -1 72 | sub-82962 ses-M0 1 73 | sub-82982 ses-M0 -1 74 | sub-82985 ses-M0 1 75 | sub-82989 ses-M0 1 76 | sub-83010 ses-M0 1 77 | sub-83013 ses-M0 1 78 | sub-83044 ses-M0 1 79 | sub-83080 ses-M0 -1 80 | sub-83103 ses-M0 -1 81 | sub-83113 ses-M0 1 82 | sub-83207 
ses-M0 -1 83 | sub-83260 ses-M0 1 84 | sub-83358 ses-M0 1 85 | sub-83372 ses-M0 -1 86 | sub-83423 ses-M0 -1 87 | sub-83429 ses-M0 1 88 | sub-83454 ses-M0 -1 89 | sub-83525 ses-M0 -1 90 | sub-83531 ses-M0 1 91 | sub-83580 ses-M0 -1 92 | sub-83612 ses-M0 1 93 | sub-83616 ses-M0 1 94 | sub-83632 ses-M0 1 95 | sub-83648 ses-M0 1 96 | sub-83835 ses-M0 -1 97 | sub-83972 ses-M0 -1 98 | sub-83987 ses-M0 -1 99 | sub-83999 ses-M0 1 100 | sub-84002 ses-M0 -1 101 | -------------------------------------------------------------------------------- /data/test_covariate.tsv: -------------------------------------------------------------------------------- 1 | participant_id session_id diagnosis age sex 2 | sub-80010 ses-M0 -1 21.75 0 3 | sub-80179 ses-M0 -1 21.1666666666667 1 4 | sub-80199 ses-M0 -1 20.3333333333333 0 5 | sub-80208 ses-M0 1 20.5 0 6 | sub-80249 ses-M0 -1 20.8333333333333 1 7 | sub-80265 ses-M0 1 20.5 1 8 | sub-80289 ses-M0 1 20.0833333333333 0 9 | sub-80396 ses-M0 -1 20.8333333333333 0 10 | sub-80425 ses-M0 1 20 1 11 | sub-80498 ses-M0 1 20.9166666666667 0 12 | sub-80537 ses-M0 -1 20.9166666666667 1 13 | sub-80557 ses-M0 -1 21.5 1 14 | sub-80575 ses-M0 -1 21.75 0 15 | sub-80607 ses-M0 1 21 0 16 | sub-80680 ses-M0 1 21.0833333333333 0 17 | sub-80688 ses-M0 -1 21.9166666666667 1 18 | sub-80765 ses-M0 -1 20.5833333333333 1 19 | sub-80812 ses-M0 1 20.5833333333333 1 20 | sub-80854 ses-M0 -1 20.1666666666667 0 21 | sub-80889 ses-M0 -1 21.75 0 22 | sub-81043 ses-M0 -1 20.75 1 23 | sub-81222 ses-M0 -1 20.25 1 24 | sub-81231 ses-M0 1 21.75 1 25 | sub-81287 ses-M0 -1 20 1 26 | sub-81323 ses-M0 -1 20.5833333333333 1 27 | sub-81353 ses-M0 -1 20.0833333333333 0 28 | sub-81456 ses-M0 -1 21.6666666666667 1 29 | sub-81528 ses-M0 -1 19.5833333333333 1 30 | sub-81533 ses-M0 1 21.5833333333333 1 31 | sub-81544 ses-M0 -1 19.3333333333333 1 32 | sub-81644 ses-M0 -1 19.25 1 33 | sub-81659 ses-M0 -1 19.25 0 34 | sub-81662 ses-M0 1 19.3333333333333 1 35 | sub-81754 ses-M0 1 19.3333333333333 1 36 | sub-81826 ses-M0 -1 19.0833333333333 1 37 | sub-81865 ses-M0 -1 21.75 0 38 | sub-81876 ses-M0 -1 21.25 0 39 | sub-81903 ses-M0 -1 19.25 1 40 | sub-81906 ses-M0 1 21.3333333333333 1 41 | sub-81989 ses-M0 -1 19.0833333333333 1 42 | sub-81992 ses-M0 -1 21.1666666666667 0 43 | sub-82003 ses-M0 1 21.1666666666667 1 44 | sub-82021 ses-M0 -1 19.3333333333333 1 45 | sub-82063 ses-M0 1 20.1666666666667 0 46 | sub-82066 ses-M0 1 21.4166666666667 1 47 | sub-82096 ses-M0 -1 20.0833333333333 1 48 | sub-82124 ses-M0 1 20.6666666666667 1 49 | sub-82155 ses-M0 1 19.5 1 50 | sub-82202 ses-M0 1 19.5833333333333 1 51 | sub-82208 ses-M0 -1 21.1666666666667 1 52 | sub-82217 ses-M0 -1 19.5 1 53 | sub-82229 ses-M0 1 21.3333333333333 1 54 | sub-82232 ses-M0 -1 19 1 55 | sub-82281 ses-M0 1 20 1 56 | sub-82293 ses-M0 1 19.8333333333333 1 57 | sub-82311 ses-M0 -1 20.6666666666667 1 58 | sub-82359 ses-M0 -1 21.6666666666667 0 59 | sub-82373 ses-M0 -1 19.6666666666667 0 60 | sub-82423 ses-M0 -1 21.25 0 61 | sub-82453 ses-M0 1 21.4166666666667 0 62 | sub-82458 ses-M0 -1 19.0833333333333 0 63 | sub-82467 ses-M0 -1 20.0833333333333 0 64 | sub-82492 ses-M0 1 19.5 0 65 | sub-82511 ses-M0 1 19.6666666666667 1 66 | sub-82587 ses-M0 -1 19.5833333333333 1 67 | sub-82674 ses-M0 1 19.6666666666667 1 68 | sub-82709 ses-M0 1 22.5 1 69 | sub-82754 ses-M0 -1 20.8333333333333 0 70 | sub-82784 ses-M0 1 19.5833333333333 1 71 | sub-82877 ses-M0 -1 20.25 0 72 | sub-82962 ses-M0 1 19.1666666666667 0 73 | sub-82982 ses-M0 -1 18.8333333333333 1 74 | sub-82985 ses-M0 1 
20.8333333333333 1 75 | sub-82989 ses-M0 1 18.8333333333333 0 76 | sub-83010 ses-M0 1 19.5833333333333 0 77 | sub-83013 ses-M0 1 20.25 1 78 | sub-83044 ses-M0 1 20.8333333333333 1 79 | sub-83080 ses-M0 -1 18.5 1 80 | sub-83103 ses-M0 -1 18.1666666666667 0 81 | sub-83113 ses-M0 1 18.6666666666667 1 82 | sub-83207 ses-M0 -1 19.25 0 83 | sub-83260 ses-M0 1 22.6666666666667 0 84 | sub-83358 ses-M0 1 19.8333333333333 1 85 | sub-83372 ses-M0 -1 18.25 0 86 | sub-83423 ses-M0 -1 20.75 1 87 | sub-83429 ses-M0 1 18.8333333333333 1 88 | sub-83454 ses-M0 -1 20.1666666666667 1 89 | sub-83525 ses-M0 -1 18.9166666666667 1 90 | sub-83531 ses-M0 1 19.9166666666667 0 91 | sub-83580 ses-M0 -1 19 0 92 | sub-83612 ses-M0 1 20 1 93 | sub-83616 ses-M0 1 18.5833333333333 1 94 | sub-83632 ses-M0 1 20.9166666666667 1 95 | sub-83648 ses-M0 1 19.5833333333333 1 96 | sub-83835 ses-M0 -1 20.6666666666667 0 97 | sub-83972 ses-M0 -1 19.0833333333333 0 98 | sub-83987 ses-M0 -1 18.4166666666667 1 99 | sub-83999 ses-M0 1 18.6666666666667 0 100 | sub-84002 ses-M0 -1 22.25 0 101 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 |

2 | 3 |

4 | 
5 | # MAGIC documentation
6 | **MAGIC**, Multi-scAle heteroGeneity analysIs and Clustering, is a multi-scale semi-supervised clustering method that aims to derive robust clustering solutions across different scales for brain diseases.
7 | Compared to the original HYDRA method, MAGIC has the following advantages:
8 | - Multi-scale feature extraction via opNMF;
9 | - Inter-scale consistent clustering solutions.
10 | 
11 | ## Installation
12 | ### Prerequisites
13 | In order to run MAGIC, one must first have installed and run [SOPNMF](https://github.com/anbai106/SOPNMF) on the voxel-wise image data. Once that is done, follow the steps below for installation.
14 | 
15 | There are three ways to install MAGIC.
16 | ### Use MAGIC as a Python package
17 | We recommend using a Conda virtual environment:
18 | ```
19 | 1) conda create --name MAGIC python=3.6
20 | ```
21 | Activate the virtual environment:
22 | ```
23 | 2) source activate MAGIC
24 | ```
25 | Install the other Python package dependencies (from the root folder of MAGIC):
26 | ```
27 | 3) ./install_requirements.sh
28 | ```
29 | Finally, install MAGIC from PyPI:
30 | ```
31 | 4) pip install magiccluster==0.0.3
32 | ```
33 | 
34 | ### Use MAGIC from the command line:
35 | After installing all dependencies in the **requirements.txt** file, go to the root folder of MAGIC, where **setup.py** is located:
36 | ```
37 | pip install -e .
38 | ```
39 | 
40 | ### Use MAGIC as a developer version:
41 | ```
42 | python -m pip install git+https://github.com/anbai106/MAGIC.git
43 | ```
44 | 
45 | ## Input structure
46 | MAGIC requires a specific input structure inspired by [BIDS](https://bids.neuroimaging.io/).
47 | Some conventions for the group label/diagnosis: -1 represents healthy controls (**CN**) and 1 represents patients (**PT**); categorical covariates, such as sex, should be encoded as numbers, for instance 0 for female and 1 for male.
48 | 
49 | ### Participant and covariate tsv
50 | The first 3 columns are **participant_id**, **session_id** and **diagnosis**.
51 | 
52 | Example of a participant tsv:
53 | ```
54 | participant_id    session_id    diagnosis
55 | sub-CLNC0001      ses-M00       -1
56 | sub-CLNC0002      ses-M00       1
57 | sub-CLNC0003      ses-M00       -1
58 | sub-CLNC0004      ses-M00       -1
59 | sub-CLNC0005      ses-M00       1
60 | sub-CLNC0006      ses-M00       1
61 | sub-CLNC0007      ses-M00       -1
62 | sub-CLNC0008      ses-M00       1
63 | ```
64 | Example of a covariate tsv:
65 | ```
66 | participant_id    session_id    diagnosis    age     sex    ...
67 | sub-CLNC0001      ses-M00       -1           56.1    0
68 | sub-CLNC0002      ses-M00       1            57.2    0
69 | sub-CLNC0003      ses-M00       -1           43.0    1
70 | sub-CLNC0004      ses-M00       -1           25.4    1
71 | sub-CLNC0005      ses-M00       1            74.5    1
72 | sub-CLNC0006      ses-M00       1            44.2    0
73 | sub-CLNC0007      ses-M00       -1           40.2    0
74 | sub-CLNC0008      ses-M00       1            43.2    1
75 | ```
76 | 
77 | ## Example
78 | We provide a synthetic example dataset in the **MAGIC/data** folder. Users should follow the same data structure.
79 | 
80 | ### Running MAGIC for clustering CN vs Subtype1 vs Subtype2 vs ...:
81 | ```
82 | from magiccluster.magic_clustering import clustering
83 | participant_tsv = "MAGIC/data/participant.tsv"
84 | opnmf_dir = "PATH_OPNMF_DIR"
85 | output_dir = "PATH_OUTPUT_DIR"
86 | k_min = 2
87 | k_max = 8
88 | cv_repetition = 100
89 | clustering(participant_tsv, opnmf_dir, output_dir, k_min, k_max, 25, 60, 5, cv_repetition)  # 25, 60, 5 = num_components_min, num_components_max, num_components_step
90 | ```
91 | 
92 | ## Citing this work
93 | > :warning: Please let me know if you use this package in your publication; I will list your paper in the **Publications using MAGIC** section...
94 | 95 | > :warning: Please cite the software using the **Cite this repository** button on the right sidebar menu, as well as the original papers below ... 96 | 97 | ### Original papers 98 | > Wen J., Varol E., Chand G., Sotiras A., Davatzikos C. (2020) **MAGIC: Multi-scale Heterogeneity Analysis and Clustering for Brain Diseases**. Medical Image Computing and Computer Assisted Intervention – MICCAI 2020. MICCAI 2020. Lecture Notes in Computer Science, vol 12267. Springer, Cham. https://doi.org/10.1007/978-3-030-59728-3_66 99 | 100 | > Wen J., Varol E., Chand G., Sotiras A., Davatzikos C. (2022) **Multi-scale semi-supervised clustering of brain images: Deriving disease subtypes**. Medical Image Analysis, 2022. https://doi.org/10.1016/j.media.2021.102304 - [Link](https://www.sciencedirect.com/science/article/pii/S1361841521003492) -------------------------------------------------------------------------------- /magiccluster/base.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import pandas as pd 3 | from utils import GLMcorrection 4 | import numpy as np 5 | import os 6 | from sklearn.preprocessing import StandardScaler 7 | 8 | __author__ = "Junhao Wen" 9 | __copyright__ = "Copyright 2023" 10 | __credits__ = ["Junhao Wen, Erdem Varol"] 11 | __license__ = "See LICENSE file" 12 | __version__ = "0.0.3" 13 | __maintainer__ = "Junhao Wen" 14 | __email__ = "junhao.wen89@gmail.com" 15 | __status__ = "Development" 16 | 17 | 18 | class WorkFlow: 19 | __metaclass__ = abc.ABCMeta 20 | 21 | @abc.abstractmethod 22 | def run(self): 23 | pass 24 | 25 | 26 | class Input: 27 | __metaclass__ = abc.ABCMeta 28 | 29 | @abc.abstractmethod 30 | def get_x(self): 31 | pass 32 | 33 | @abc.abstractmethod 34 | def get_y(self): 35 | pass 36 | 37 | class OPNMF_Input(Input): 38 | 39 | def __init__(self, opnmf_dir, participant_tsv, covariate_tsv=None): 40 | self._opnmf_dir = opnmf_dir 41 | self._participant_tsv = participant_tsv 42 | self._covariate_tsv = covariate_tsv 43 | self._x = None 44 | self._y = None 45 | 46 | ## check the participant_tsv & covariate_tsv, the header, the order of the columns, etc 47 | self._df_feature = pd.read_csv(participant_tsv, sep='\t') 48 | if ('participant_id' != list(self._df_feature.columns.values)[0]) or ( 49 | 'session_id' != list(self._df_feature.columns.values)[1]) or \ 50 | ('diagnosis' != list(self._df_feature.columns.values)[2]): 51 | raise Exception("the data file is not in the correct format." 
52 | "Columns should include ['participant_id', 'session_id', 'diagnosis']") 53 | self._subjects = list(self._df_feature['participant_id']) 54 | self._sessions = list(self._df_feature['session_id']) 55 | self._diagnosis = list(self._df_feature['diagnosis']) 56 | 57 | def get_x(self, num_component, opnmf_dir): 58 | 59 | ## alternatively, we use here the output of pyOPNMF loading coefficient 60 | loading_coefficient_csv = os.path.join(opnmf_dir, 'NMF', 'component_' + str(num_component), 61 | 'loading_coefficient.tsv') 62 | ## read the tsv 63 | df_opnmf = pd.read_csv(loading_coefficient_csv, sep='\t') 64 | df_opnmf = df_opnmf.loc[df_opnmf['participant_id'].isin(self._df_feature['participant_id'])] 65 | ### adjust the order of the rows to match the original tsv files 66 | df_opnmf = df_opnmf.set_index('participant_id') 67 | df_opnmf = df_opnmf.reindex(index=self._df_feature['participant_id']) 68 | df_opnmf = df_opnmf.reset_index() 69 | 70 | self._x = df_opnmf[['component_' + str(i + 1) for i in range(num_component)]].to_numpy() 71 | 72 | ### normalize the data, note the normalization should be done for each component, not across component 73 | scaler = StandardScaler() 74 | self._x = scaler.fit_transform(self._x) 75 | 76 | if self._covariate_tsv is not None: 77 | df_covariate = pd.read_csv(self._covariate_tsv, sep='\t') 78 | if ('participant_id' != list(self._df_feature.columns.values)[0]) or ( 79 | 'session_id' != list(self._df_feature.columns.values)[1]) or \ 80 | ('diagnosis' != list(self._df_feature.columns.values)[2]): 81 | raise Exception("the data file is not in the correct format." 82 | "Columns should include ['participant_id', 'session_id', 'diagnosis']") 83 | participant_covariate = list(df_covariate['participant_id']) 84 | session_covariate = list(df_covariate['session_id']) 85 | label_covariate = list(df_covariate['diagnosis']) 86 | 87 | # check that the participant_tsv and covariate_tsv have the same orders for the first three column 88 | if (not self._subjects == participant_covariate) or (not self._sessions == session_covariate) or ( 89 | not self._diagnosis == label_covariate): 90 | raise Exception( 91 | "the first three columns in the feature csv and covariate csv should be exactly the same.") 92 | 93 | ## normalize the covariate z-scoring 94 | data_covariate = df_covariate.iloc[:, 3:] 95 | data_covariate = ((data_covariate - data_covariate.mean()) / data_covariate.std()).values 96 | 97 | ## correction for the covariate, only retain the pathodological correspondance 98 | self._x, _ = GLMcorrection(self._x, np.asarray(self._diagnosis), data_covariate, self._x, data_covariate) 99 | 100 | return self._x 101 | 102 | def get_y(self): 103 | """ 104 | Do not change the label's representation 105 | :return: 106 | """ 107 | 108 | if self._y is not None: 109 | return self._y 110 | 111 | self._y = np.array(self._diagnosis) 112 | return self._y 113 | 114 | -------------------------------------------------------------------------------- /magiccluster/cli.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | __author__ = "Junhao Wen" 4 | __copyright__ = "Copyright 2023" 5 | __credits__ = ["Junhao Wen"] 6 | __license__ = "See LICENSE file" 7 | __version__ = "0.0.3" 8 | __maintainer__ = "Junhao Wen" 9 | __email__ = "junhao.wen89@gmail.com" 10 | __status__ = "Development" 11 | 12 | def magic_func(args): 13 | """ 14 | The default function to run classification. 
15 | Args: 16 | args: args from parser 17 | 18 | Returns: 19 | 20 | """ 21 | from magiccluster.magic_clustering import clustering 22 | clustering( 23 | args.participant_tsv, 24 | args.opnmf_dir, 25 | args.output_dir, 26 | args.k_min, 27 | args.k_max, 28 | args.num_components_min, 29 | args.num_components_max, 30 | args.num_components_step, 31 | args.cv_repetition, 32 | args.covariate_tsv, 33 | args.cv_strategy, 34 | args.save_models, 35 | args.cluster_predefined_c, 36 | args.class_weight_balanced, 37 | args.weight_initialization_type, 38 | args.num_iteration, 39 | args.num_consensus, 40 | args.tol, 41 | args.multiscale_tol, 42 | args.n_threads, 43 | args.verbose 44 | ) 45 | 46 | def parse_command_line(): 47 | """ 48 | Definition for the commandline parser 49 | Returns: 50 | 51 | """ 52 | 53 | parser = argparse.ArgumentParser( 54 | prog='magiccluster-cluster', 55 | description='Perform multi-scale semi-supervised clustering using MAGIC...') 56 | 57 | subparser = parser.add_subparsers( 58 | title='''Task to perform...''', 59 | description='''We now only allow to use MAGIC for clustering''', 60 | dest='task', 61 | help='''****** Tasks proposed by MAGIC ******''') 62 | 63 | subparser.required = True 64 | 65 | ######################################################################################################################## 66 | 67 | ## Add arguments for ADML ROI classification 68 | clustering_parser = subparser.add_parser( 69 | 'cluster', 70 | help='Perform clustering with MAGIC.') 71 | 72 | clustering_parser.add_argument( 73 | 'participant_tsv', 74 | help="Path to the tsv containing the following first columns:" 75 | "i) the first column is the participant_id. " 76 | "ii) the second column should be the session_id. " 77 | "iii) the third column should be the diagnosis. ", 78 | default=None 79 | ) 80 | 81 | clustering_parser.add_argument( 82 | 'opnmf_dir', 83 | help='Path to the directory of where SOPNMF was run (the voxel-wise images should be run first with SOPNMF).', 84 | default=None 85 | ) 86 | 87 | clustering_parser.add_argument( 88 | 'output_dir', 89 | help='Path to the directory of where to store the final output.', 90 | default=None 91 | ) 92 | 93 | clustering_parser.add_argument( 94 | 'k_min', 95 | help='Number of cluster (k) minimum value.', 96 | default=None, type=int 97 | ) 98 | 99 | clustering_parser.add_argument( 100 | 'k_max', 101 | help='Number of cluster (k) maximum value.', 102 | default=None, type=int 103 | ) 104 | 105 | clustering_parser.add_argument( 106 | 'num_components_min', 107 | help='Number of the min PSC for the SOPNMF', 108 | default=None, type=int 109 | ) 110 | 111 | clustering_parser.add_argument( 112 | 'num_components_max', 113 | help='Number of the max PSC for the SOPNMF', 114 | default=None, type=int 115 | ) 116 | 117 | clustering_parser.add_argument( 118 | 'num_components_step', 119 | help='The step size between the min and the max PSC for the SOPNMF', 120 | default=None, type=int 121 | ) 122 | 123 | clustering_parser.add_argument( 124 | 'cv_repetition', 125 | help='Number of repetitions for the chosen cross-validation (CV).', 126 | default=None, type=int 127 | ) 128 | 129 | clustering_parser.add_argument( 130 | '--covariate_tsv', 131 | help="Path to the tsv containing covariates, following the BIDS convention. The first 3 columns is the same as feature_tsv", 132 | default=None, 133 | type=str 134 | ) 135 | 136 | clustering_parser.add_argument( 137 | '-cs', '--cv_strategy', 138 | help='Chosen CV strategy, default is hold_out. 
', 139 | type=str, default='hold_out', 140 | choices=['k_fold', 'hold_out'], 141 | ) 142 | 143 | clustering_parser.add_argument( 144 | '-sm', '--save_models', 145 | help='If save modles during all repetitions of CV. ', 146 | default=False, action="store_true" 147 | ) 148 | 149 | clustering_parser.add_argument( 150 | '--cluster_predefined_c', 151 | type=float, 152 | default=0.25, 153 | help="Predefined hyperparameter C of SVM. Default is 0.25. " 154 | "Better choice may be guided by HYDRA global classification with nested CV for optimal C searching. " 155 | ) 156 | 157 | clustering_parser.add_argument( 158 | '-cwb', '--class_weight_balanced', 159 | help='If group samples are balanced, default is True. ', 160 | default=False, action="store_true" 161 | ) 162 | 163 | clustering_parser.add_argument( 164 | '-wit', '--weight_initialization_type', 165 | help='Strategy for initializing the weighted sample matrix of the polytope. ', 166 | type=str, default='DPP', 167 | choices=['DPP', 'random_assign'], 168 | ) 169 | 170 | clustering_parser.add_argument( 171 | '--num_iteration', 172 | help='Number of iteration to converge each SVM.', 173 | default=50, type=int 174 | ) 175 | 176 | clustering_parser.add_argument( 177 | '--num_consensus', 178 | help='Number of iteration for inner consensus clusetering.', 179 | default=20, type=int 180 | ) 181 | 182 | clustering_parser.add_argument( 183 | '--tol', 184 | help='Clustering stopping criterion, until the polytope becomes stable', 185 | default=1e-8, type=float 186 | ) 187 | 188 | clustering_parser.add_argument( 189 | '--multiscale_tol', 190 | help='Clustering stopping criterion, until the multi-scale clustering solution stable', 191 | default=0.85, type=float 192 | ) 193 | 194 | clustering_parser.add_argument( 195 | '-nt', '--n_threads', 196 | help='Number of cores used, default is 4', 197 | type=int, default=4 198 | ) 199 | 200 | clustering_parser.add_argument( 201 | '-v', '--verbose', 202 | help='Increase output verbosity', 203 | default=False, action="store_true" 204 | ) 205 | 206 | clustering_parser.set_defaults(func=magic_func) 207 | 208 | 209 | 210 | -------------------------------------------------------------------------------- /magiccluster/magic_clustering.py: -------------------------------------------------------------------------------- 1 | from .clustering import DualSVM_Subtype, DualSVM_Subtype_transfer_learning 2 | from .base import OPNMF_Input 3 | import os, pickle 4 | from .utils import cluster_stability_across_resolution, summary_clustering_result_multiscale, shift_list, consensus_clustering_across_c, make_cv_partition 5 | import numpy as np 6 | 7 | __author__ = "Junhao Wen" 8 | __copyright__ = "Copyright 2023" 9 | __credits__ = ["Junhao Wen, Erdem Varol"] 10 | __license__ = "See LICENSE file" 11 | __version__ = "0.0.3" 12 | __maintainer__ = "Junhao Wen" 13 | __email__ = "junhao.wen89@gmail.com" 14 | __status__ = "Development" 15 | 16 | def clustering(participant_tsv, opnmf_dir, output_dir, k_min, k_max, num_components_min, num_components_max, num_components_step, cv_repetition, covariate_tsv=None, cv_strategy='hold_out', save_models=False, 17 | cluster_predefined_c=0.25, class_weight_balanced=True, weight_initialization_type='DPP', num_iteration=50, 18 | num_consensus=20, tol=1e-8, multiscale_tol=0.85, n_threads=8, verbose=False): 19 | """ 20 | pyhydra core function for clustering 21 | Args: 22 | participant_tsv:str, path to the participant_tsv tsv, following the BIDS convention. 
The tsv contains 23 | the following headers: " 24 | "i) the first column is the participant_id;" 25 | "ii) the second column should be the session_id;" 26 | "iii) the third column should be the diagnosis;" 27 | opnmf_dir: str, path to store the OPNMF results 28 | output_dir: str, path to store the clustering results 29 | k_min: int, minimum k (number of clusters) 30 | k_max: int, maximum k (number of clusters) 31 | cv_repetition: int, number of repetitions for cross-validation (CV) 32 | covariate_tsv: str, path to the tsv containing the covaria`tes, eg., age or sex. The header (first 3 columns) of 33 | the tsv file is the same as the feature_tsv, following the BIDS convention. 34 | cv_strategy: str, cross validation strategy used. Default is hold_out. choices=['k_fold', 'hold_out'] 35 | save_models: Bool, if save all models during CV. Default is False to save space. 36 | Set true only if you are going to apply the trained model to unseen data. 37 | cluster_predefined_c: Float, default is 0.25. The predefined best c if you do not want to perform a nested CV to 38 | find it. If used, it should be a float number 39 | class_weight_balanced: Bool, default is True. If the two groups are balanced. 40 | weight_initialization_type: str, default is DPP. The strategy for initializing the weight to control the 41 | hyperplances and the subpopulation of patients. choices=["random_hyperplane", "random_assign", "k_means", "DPP"] 42 | num_iteration: int, default is 50. The number of iterations to iteratively optimize the polytope. 43 | num_consensus: int, default is 20. The number of repeats for consensus clustering to eliminate the unstable clustering. 44 | tol: float, default is 1e-8. Clustering stopping criterion. 45 | multiscale_tol: float, default is 0.85. Double cyclic optimization stopping criterion. 46 | n_threads: int, default is 8. The number of threads to run model in parallel. 47 | verbose: Bool, default is False. If the output message is verbose. 48 | 49 | Returns: clustering outputs. 
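
    Example (an illustrative sketch only; the paths below are placeholders, and the numeric
    arguments mirror the usage example in docs/index.md):

        from magiccluster.magic_clustering import clustering
        clustering("MAGIC/data/participant.tsv", "PATH_OPNMF_DIR", "PATH_OUTPUT_DIR",
                   2, 8, 25, 60, 5, 100,
                   covariate_tsv="MAGIC/data/test_covariate.tsv")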
50 | 51 | """ 52 | ### For voxel approach 53 | print('MAGIC for semi-supervised clustering...') 54 | if covariate_tsv == None: 55 | input_data = OPNMF_Input(opnmf_dir, participant_tsv, covariate_tsv=None) 56 | else: 57 | input_data = OPNMF_Input(opnmf_dir, participant_tsv, covariate_tsv=covariate_tsv) 58 | 59 | ## data split 60 | print('Data split was performed based on validation strategy: %s...\n' % cv_strategy) 61 | if cv_strategy == "hold_out": 62 | ## check if data split has been done, if yes, the pickle file is there 63 | if os.path.isfile(os.path.join(output_dir, 'data_split_stratified_' + str(cv_repetition) + '-holdout.pkl')): 64 | split_index = pickle.load(open(os.path.join(output_dir, 'data_split_stratified_' + str(cv_repetition) + '-holdout.pkl'), 'rb')) 65 | else: 66 | split_index, _ = make_cv_partition(input_data.get_y(), cv_strategy, output_dir, cv_repetition) 67 | elif cv_strategy == "k_fold": 68 | ## check if data split has been done, if yes, the pickle file is there 69 | if os.path.isfile(os.path.join(output_dir, 'data_split_stratified_' + str(cv_repetition) + '-fold.pkl')): 70 | split_index = pickle.load(open(os.path.join(output_dir, 'data_split_stratified_' + str(cv_repetition) + '-fold.pkl'), 'rb')) 71 | else: 72 | split_index, _ = make_cv_partition(input_data.get_y(), cv_strategy, output_dir, cv_repetition) 73 | 74 | print('Data split has been done!\n') 75 | 76 | print('Starts semi-supervised clustering...') 77 | ### Here, semi-supervised clustering with multi-scale feature reduction learning 78 | if (num_components_max - num_components_min) % num_components_step != 0: 79 | raise Exception('Number of componnets step should be divisible!') 80 | 81 | ## C lists 82 | C_list = list(range(num_components_min, num_components_max+num_components_step, num_components_step)) 83 | ## first loop on different initial C. 
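    ## Illustrative note (an assumption based on how shift_list is used below): with, e.g.,
    ## num_components_min=25, num_components_max=60 and num_components_step=5, C_list is
    ## [25, 30, 35, 40, 45, 50, 55, 60]; each outer iteration rotates this list so that the
    ## double-cyclic optimization starts from a different initial resolution C, and every
    ## subsequent resolution is warm-started from the previous one (transfer-learning branch below).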
84 | for i in range(len(C_list)): 85 | c_list = shift_list(C_list, i) 86 | num_run = 0 87 | loop = True 88 | print('Initialize C == %d\n' % C_list[i]) 89 | while loop: 90 | for j in range(len(c_list)): 91 | if num_run == 0: 92 | num_run += 1 93 | k_continuing = np.arange(k_min, k_max+1).tolist() 94 | print('First C == %d\n' % c_list[j]) 95 | output_dir_loop = os.path.join(output_dir, 'initialization_c_' + str(C_list[i]), 'clustering_run' + str(num_run)) 96 | wf_clustering = DualSVM_Subtype(input_data, 97 | participant_tsv, 98 | split_index, 99 | cv_repetition, 100 | k_min, 101 | k_max, 102 | output_dir_loop, 103 | opnmf_dir, 104 | balanced=class_weight_balanced, 105 | num_consensus=num_consensus, 106 | num_iteration=num_iteration, 107 | tol=tol, 108 | predefined_c=cluster_predefined_c, 109 | weight_initialization_type=weight_initialization_type, 110 | n_threads=n_threads, 111 | num_components_min=c_list[j], 112 | num_components_max=c_list[j], 113 | num_components_step=num_components_step, 114 | save_models=save_models, 115 | verbose=verbose) 116 | 117 | wf_clustering.run() 118 | else: ## initialize the model from the former resolution 119 | num_run += 1 120 | print('Transfer learning on resolution C == %d for run == %d\n' % (c_list[j], num_run)) 121 | output_dir_tl = os.path.join(output_dir, 'initialization_c_' + str(C_list[i])) 122 | wf_clustering = DualSVM_Subtype_transfer_learning(input_data, 123 | participant_tsv, 124 | split_index, 125 | cv_repetition, 126 | k_continuing, 127 | output_dir_tl, 128 | opnmf_dir, 129 | balanced=class_weight_balanced, 130 | num_iteration=num_iteration, 131 | tol=tol, 132 | predefined_c=cluster_predefined_c, 133 | weight_initialization_type=weight_initialization_type, 134 | n_threads=n_threads, 135 | num_component=c_list[j], 136 | num_component_former=c_list[j-1], 137 | num_run=num_run) 138 | 139 | wf_clustering.run() 140 | 141 | ### check the clustering stability between the current C and former C 142 | k_continuing, k_converged = cluster_stability_across_resolution(c_list[j], c_list[j-1], os.path.join(output_dir, 'initialization_c_' + str(C_list[i])), k_continuing, num_run, stop_tol=multiscale_tol) 143 | 144 | if not k_continuing: 145 | loop = False 146 | break 147 | 148 | ## After cross validate the hyperparameter k & num_components, summarize the results into a single tsv file. 
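        ## (k_continuing is updated by cluster_stability_across_resolution above; once it is
        ## empty, every k in [k_min, k_max] has stabilized across consecutive resolutions.)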
149 | if not k_continuing: 150 | summary_clustering_result_multiscale(os.path.join(output_dir, 'initialization_c_' + str(C_list[i])), k_min, k_max) 151 | 152 | ## consensus learning based on different initialization Cs 153 | print('Computing the final consensus group membership!\n') 154 | consensus_clustering_across_c(output_dir, C_list, k_min, k_max) 155 | print('Finish...') -------------------------------------------------------------------------------- /magiccluster/clustering.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import pandas as pd 4 | from .utils import consensus_clustering, cv_cluster_stability, hydra_solver_svm_tl 5 | from .base import WorkFlow 6 | from utils import hydra_solver_svm 7 | 8 | __author__ = "Junhao Wen" 9 | __copyright__ = "Copyright 2023" 10 | __credits__ = ["Junhao Wen, Erdem Varol"] 11 | __license__ = "See LICENSE file" 12 | __version__ = "0.0.3" 13 | __maintainer__ = "Junhao Wen" 14 | __email__ = "junhao.wen89@gmail.com" 15 | __status__ = "Development" 16 | class DualSVM_Subtype(WorkFlow): 17 | 18 | def __init__(self, input, participant_tsv, split_index, cv_repetition, k_min, k_max, output_dir, opnmf_dir, balanced=True, 19 | test_size=0.2, num_consensus=20, num_iteration=50, tol=1e-6, predefined_c=None, weight_initialization_type='DPP', 20 | n_threads=8, num_components_min=10, num_components_max=100, num_components_step=10, save_models=False, 21 | verbose=True): 22 | 23 | self._input = input 24 | self._participant_tsv = participant_tsv 25 | self._split_index = split_index 26 | self._cv_repetition = cv_repetition 27 | self._output_dir = output_dir 28 | self._opnmf_dir = opnmf_dir 29 | self._k_min = k_min 30 | self._k_max = k_max 31 | self._balanced = balanced 32 | self._test_size = test_size 33 | self._num_consensus = num_consensus 34 | self._num_iteration = num_iteration 35 | self._tol = tol 36 | self._predefined_c = predefined_c 37 | self._weight_initialization_type = weight_initialization_type 38 | self._k_range_list = list(range(k_min, k_max + 1)) 39 | self._n_threads = n_threads 40 | self._num_components_min = num_components_min 41 | self._num_components_max = num_components_max 42 | self._num_components_step = num_components_step 43 | self._save_models = save_models 44 | self._verbose = verbose 45 | 46 | 47 | def run(self): 48 | 49 | ## by default, we solve the problem using dual solver with a linear kernel. 50 | for num_component in range(self._num_components_min, self._num_components_max + self._num_components_step, self._num_components_step): 51 | 52 | if os.path.exists(os.path.join(self._output_dir, 'component_' + str(num_component), "adjusted_rand_index.tsv")): 53 | print("This number of component have been trained and converged: %d" % num_component) 54 | else: 55 | x = self._input.get_x(num_component, self._opnmf_dir) 56 | y = self._input.get_y_raw() 57 | data_label_folds_ks = np.zeros((y.shape[0], self._cv_repetition, self._k_max - self._k_min + 1)).astype(int) 58 | 59 | for i in range(self._cv_repetition): 60 | for j in self._k_range_list: 61 | print('Applying pyHRDRA for finding %d clusters. 
Repetition: %d / %d...\n' % (j, i+1, self._cv_repetition)) 62 | training_final_prediction = hydra_solver_svm(i, x[self._split_index[i][0]], y[self._split_index[i][0]], j, self._output_dir, 63 | self._num_consensus, self._num_iteration, self._tol, self._balanced, self._predefined_c, 64 | self._weight_initialization_type, self._n_threads, self._save_models, self._verbose) 65 | 66 | 67 | # change the final prediction's label: test data to be 0, the rest training data will b e updated by the model's prediction 68 | data_label_fold = y.copy() 69 | data_label_fold[self._split_index[i][1]] = 0 # all test data to be 0 70 | data_label_fold[self._split_index[i][0]] = training_final_prediction ## assign the training prediction 71 | data_label_folds_ks[:, i, j - self._k_min] = data_label_fold 72 | 73 | print('Estimating clustering stability...\n') 74 | ## for the adjusted rand index, only consider the PT results 75 | adjusted_rand_index_results = np.zeros(self._k_max - self._k_min + 1) 76 | index_pt = np.where(y == 1)[0] # index for PTs 77 | for m in range(self._k_max - self._k_min + 1): 78 | result = data_label_folds_ks[:, :, m][index_pt] 79 | adjusted_rand_index_result = cv_cluster_stability(result, self._k_range_list[m]) 80 | # saving each k result into the final adjusted_rand_index_results 81 | adjusted_rand_index_results[m] = adjusted_rand_index_result 82 | 83 | print('Computing the final consensus group membership...\n') 84 | final_assignment_ks = -np.ones((self._input.get_y_raw().shape[0], self._k_max - self._k_min + 1)).astype(int) 85 | for n in range(self._k_max - self._k_min + 1): 86 | result = data_label_folds_ks[:, :, n][index_pt] 87 | final_assignment_ks_pt = consensus_clustering(result, n + self._k_min) 88 | final_assignment_ks[index_pt, n] = final_assignment_ks_pt + 1 89 | 90 | print('Saving the final results...\n') 91 | # save_cluster_results(adjusted_rand_index_results, final_assignment_ks) 92 | columns = ['ari_' + str(i) + '_subtypes' for i in self._k_range_list] 93 | ari_df = pd.DataFrame(adjusted_rand_index_results[:, np.newaxis].transpose(), columns=columns) 94 | ari_df.to_csv(os.path.join(self._output_dir, 'adjusted_rand_index.tsv'), index=False, sep='\t', 95 | encoding='utf-8') 96 | 97 | # save the final assignment for consensus clustering across different folds 98 | participant_df = pd.read_csv(self._participant_tsv, sep='\t') 99 | columns = ['assignment_' + str(i) for i in self._k_range_list] 100 | cluster_df = pd.DataFrame(final_assignment_ks, columns=columns) 101 | all_df = pd.concat([participant_df, cluster_df], axis=1) 102 | all_df.to_csv(os.path.join(self._output_dir, 'clustering_assignment.tsv'), index=False, 103 | sep='\t', encoding='utf-8') 104 | 105 | class DualSVM_Subtype_transfer_learning(WorkFlow): 106 | """ 107 | Instead of training from scratch, we initialize the polytope from the former C 108 | """ 109 | def __init__(self, input, participant_tsv, split_index, cv_repetition, k_list, output_dir, opnmf_output, balanced=True, 110 | test_size=0.2, num_iteration=50, tol=1e-6, predefined_c=None, 111 | weight_initialization_type='DPP', n_threads=8, num_component=10, num_component_former=10, num_run=None): 112 | 113 | self._input = input 114 | self._participant_tsv = participant_tsv 115 | self._split_index = split_index 116 | self._cv_repetition = cv_repetition 117 | self._output_dir = output_dir 118 | self._opnmf_output = opnmf_output 119 | self._k_list = k_list 120 | self._balanced = balanced 121 | self._test_size = test_size 122 | self._num_iteration = num_iteration 
123 | self._tol = tol 124 | self._predefined_c = predefined_c 125 | self._weight_initialization_type = weight_initialization_type 126 | self._n_threads = n_threads 127 | self._num_component = num_component 128 | self._num_component_former = num_component_former 129 | self._num_run = num_run 130 | 131 | def run(self): 132 | 133 | if os.path.exists(os.path.join(self._output_dir, 'clustering_run' + str(self._num_run), 'component_' + str(self._num_component), "adjusted_rand_index.tsv")): 134 | print("This number of component have been trained and converged: %d" % self._num_component) 135 | else: 136 | print("cross validate for num_component, running for %d components for feature selection" % self._num_component) 137 | x = self._input.get_x(self._num_component, self._opnmf_output) 138 | 139 | y = self._input.get_y_raw() 140 | data_label_folds_ks = np.zeros((y.shape[0], self._cv_repetition, len(self._k_list))).astype(int) 141 | 142 | for i in range(self._cv_repetition): 143 | for j in range(len(self._k_list)): 144 | print('Applying HRDRA for finding %d clusters. Repetition: %d / %d...\n' % (self._k_list[j], i+1, self._cv_repetition)) 145 | training_final_prediction = hydra_solver_svm_tl(self._num_component, self._num_component_former, i, x[self._split_index[i][0]], y[self._split_index[i][0]], self._k_list[j], self._output_dir, 146 | self._num_iteration, self._tol, self._balanced, self._predefined_c, 147 | self._n_threads, self._num_run) 148 | 149 | 150 | # change the final prediction's label: test data to be 0, the rest training data will be updated by the model's prediction 151 | data_label_fold = y.copy() 152 | data_label_fold[self._split_index[i][1]] = 0 # all test data to be 0 153 | data_label_fold[self._split_index[i][0]] = training_final_prediction ## assign the training prediction 154 | data_label_folds_ks[:, i, j] = data_label_fold 155 | 156 | print('Finish the clustering procedure!\n') 157 | 158 | print('Estimating clustering stability!\n') 159 | ## for the adjusted rand index, only consider the PT results 160 | adjusted_rand_index_results = np.zeros(len(self._k_list)) 161 | index_pt = np.where(y == 1)[0] # index for PTs 162 | for m in range(len(self._k_list)): 163 | result = data_label_folds_ks[:, :, m][index_pt] ## the result of each K during all runs of CV 164 | adjusted_rand_index_result = cv_cluster_stability(result, self._k_list[m]) 165 | 166 | # saving each k result into the final adjusted_rand_index_results 167 | adjusted_rand_index_results[m] = adjusted_rand_index_result 168 | print('Done!\n') 169 | 170 | print('Computing the final consensus group membership!\n') 171 | final_assignment_ks = -np.ones((self._input.get_y_raw().shape[0], len(self._k_list))).astype(int) 172 | for n in range(len(self._k_list)): 173 | result = data_label_folds_ks[:, :, n][index_pt] 174 | final_assignment_ks_pt = consensus_clustering(result, n + self._k_list[0]) ## the final subtype assignment is performed with consensus clustering with KMeans 175 | final_assignment_ks[index_pt, n] = final_assignment_ks_pt + 1 176 | print('Done!\n') 177 | 178 | print('Saving the final results!\n') 179 | # save_cluster_results(adjusted_rand_index_results, final_assignment_ks) 180 | columns = ['ari_' + str(i) + '_subtypes' for i in self._k_list] 181 | ari_df = pd.DataFrame(adjusted_rand_index_results[:, np.newaxis].transpose(), columns=columns) 182 | ari_df.to_csv(os.path.join(self._output_dir, 'clustering_run' + str(self._num_run), 'component_' + str(self._num_component), 'adjusted_rand_index.tsv'), index=False, 
sep='\t', 183 | encoding='utf-8') 184 | 185 | # save the final assignment for consensus clustering across different folds 186 | df_feature = pd.read_csv(self._participant_tsv, sep='\t') 187 | columns = ['assignment_' + str(i) for i in self._k_list] 188 | participant_df = df_feature.iloc[:, :3] 189 | cluster_df = pd.DataFrame(final_assignment_ks, columns=columns) 190 | all_df = pd.concat([participant_df, cluster_df], axis=1) 191 | all_df.to_csv(os.path.join(self._output_dir, 'clustering_run' + str(self._num_run), 'component_' + str(self._num_component), 'clustering_assignment.tsv'), index=False, 192 | sep='\t', encoding='utf-8') 193 | 194 | print('Done!\n') -------------------------------------------------------------------------------- /magiccluster/utils.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import numpy as np 3 | import scipy 4 | import os, pickle 5 | from sklearn.cluster import KMeans 6 | from sklearn.metrics import adjusted_rand_score 7 | from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit, KFold, ShuffleSplit 8 | from joblib import dump 9 | import pandas as pd 10 | from multiprocessing.pool import ThreadPool 11 | from sklearn.svm import SVC 12 | 13 | __author__ = "Junhao Wen" 14 | __copyright__ = "Copyright 2023" 15 | __credits__ = ["Junhao Wen, Erdem Varol"] 16 | __license__ = "See LICENSE file" 17 | __version__ = "0.0.3" 18 | __maintainer__ = "Junhao Wen" 19 | __email__ = "junhao.wen89@gmail.com" 20 | __status__ = "Development" 21 | 22 | def elem_sym_poly(lambda_value, k): 23 | """ 24 | given a vector of lambdas and a maximum size k, determine the value of 25 | the elementary symmetric polynomials: 26 | E(l+1,n+1) = sum_{J \subseteq 1..n,|J| = l} prod_{i \in J} lambda(i) 27 | :param lambda_value: the corresponding eigenvalues 28 | :param k: number of clusters 29 | :return: 30 | """ 31 | N = lambda_value.shape[0] 32 | E = np.zeros((k + 1, N + 1)) 33 | E[0, :] = 1 34 | 35 | for i in range(1, k+1): 36 | for j in range(1, N+1): 37 | E[i, j] = E[i, j - 1] + lambda_value[j-1] * E[i - 1, j - 1] 38 | 39 | return E 40 | 41 | 42 | def sample_k(lambda_value, k): 43 | """ 44 | Pick k lambdas according to p(S) \propto prod(lambda \in S) 45 | :param lambda_value: the corresponding eigenvalues 46 | :param k: the number of clusters 47 | :return: 48 | """ 49 | 50 | ## compute elementary symmetric polynomials 51 | E = elem_sym_poly(lambda_value, k) 52 | 53 | ## ietrate over the lambda value 54 | num = lambda_value.shape[0] 55 | remaining = k 56 | S = np.zeros(k) 57 | while remaining > 0: 58 | #compute marginal of num given that we choose remaining values from 0:num-1 59 | if num == remaining: 60 | marg = 1 61 | else: 62 | marg = lambda_value[num-1] * E[remaining-1, num-1] / E[remaining, num] 63 | 64 | # sample marginal 65 | if np.random.rand(1) < marg: 66 | S[remaining-1] = num 67 | remaining = remaining - 1 68 | num = num - 1 69 | return S 70 | 71 | def sample_dpp(evalue, evector, k=None): 72 | """ 73 | sample a set Y from a dpp. 
evalue, evector are a decomposed kernel, and k is (optionally) the size of the set to return 74 | :param evalue: eigenvalue 75 | :param evector: normalized eigenvector 76 | :param k: number of cluster 77 | :return: 78 | """ 79 | if k == None: 80 | # choose eigenvectors randomly 81 | evalue = np.divide(evalue, (1 + evalue)) 82 | evector = np.where(np.random.random(evalue.shape[0]) <= evalue)[0] 83 | else: 84 | v = sample_k(evalue, k) ## v here is a 1d array with size: k 85 | 86 | k = v.shape[0] 87 | v = v.astype(int) 88 | v = [i - 1 for i in v.tolist()] ## due to the index difference between matlab & python, here, the element of v is for matlab 89 | V = evector[:, v] 90 | 91 | ## iterate 92 | y = np.zeros(k) 93 | for i in range(k, 0, -1): 94 | ## compute probabilities for each item 95 | P = np.sum(np.square(V), axis=1) 96 | P = P / np.sum(P) 97 | 98 | # choose a new item to include 99 | y[i-1] = np.where(np.random.rand(1) < np.cumsum(P))[0][0] 100 | y = y.astype(int) 101 | 102 | # choose a vector to eliminate 103 | j = np.where(V[y[i-1], :])[0][0] 104 | Vj = V[:, j] 105 | V = np.delete(V, j, 1) 106 | 107 | ## Update V 108 | if V.size == 0: 109 | pass 110 | else: 111 | V = np.subtract(V, np.multiply(Vj, (V[y[i-1], :] / Vj[y[i-1]])[:, np.newaxis]).transpose()) ## watch out the dimension here 112 | 113 | ## orthogonalize 114 | for m in range(i - 1): 115 | for n in range(m): 116 | V[:, m] = np.subtract(V[:, m], np.matmul(V[:, m].transpose(), V[:, n]) * V[:, n]) 117 | 118 | V[:, m] = V[:, m] / np.linalg.norm(V[:, m]) 119 | 120 | y = np.sort(y) 121 | 122 | return y 123 | 124 | def proportional_assign(l, d): 125 | """ 126 | Proportional assignment based on margin 127 | :param l: int 128 | :param d: int 129 | :return: 130 | """ 131 | np.seterr(divide='ignore', invalid='ignore') 132 | invL = np.divide(1, l) 133 | idx = np.isinf(invL) 134 | invL[idx] = d[idx] 135 | 136 | for i in range(l.shape[0]): 137 | pos = np.where(invL[i, :] > 0)[0] 138 | neg = np.where(invL[i, :] < 0)[0] 139 | if pos.size != 0: 140 | invL[i, neg] = 0 141 | else: 142 | invL[i, :] = np.divide(invL[i, :], np.amin(invL[i, :])) 143 | invL[i, invL[i, :] < 1] = 0 144 | 145 | S = np.multiply(invL, np.divide(1, np.sum(invL, axis=1))[:, np.newaxis]) 146 | 147 | return S 148 | 149 | def random_init_dirichlet(k, num_pt): 150 | """ 151 | a sample from a dirichlet distribution 152 | :param k: number of clusters 153 | :param num_pt: number of PT 154 | :return: 155 | """ 156 | a = np.ones(k) 157 | s = np.random.dirichlet(a, num_pt) 158 | 159 | return s 160 | 161 | def hydra_init_weight(X, y, k, index_pt, index_cn, weight_initialization_type): 162 | """ 163 | Function performs initialization for the polytope of mlni 164 | Args: 165 | X: the input features 166 | y: the label 167 | k: number of predefined clusters 168 | index_pt: list, the index for patient subjects 169 | index_cn: list, the index for control subjects 170 | weight_initialization_type: the type of chosen initialization method 171 | Returns: 172 | 173 | """ 174 | if weight_initialization_type == "DPP": ## 175 | num_subject = y.shape[0] 176 | W = np.zeros((num_subject, X.shape[1])) 177 | for j in range(num_subject): 178 | ipt = np.random.randint(index_pt.shape[0]) 179 | icn = np.random.randint(index_cn.shape[0]) 180 | W[j, :] = X[index_pt[ipt], :] - X[index_cn[icn], :] 181 | 182 | KW = np.matmul(W, W.transpose()) 183 | KW = np.divide(KW, np.sqrt(np.multiply(np.diag(KW)[:, np.newaxis], np.diag(KW)[:, np.newaxis].transpose()))) 184 | evalue, evector = np.linalg.eig(KW) 185 | Widx = 
sample_dpp(np.real(evalue), np.real(evector), k) 186 | prob = np.zeros((len(index_pt), k)) # only consider the PTs 187 | 188 | for i in range(k): 189 | prob[:, i] = np.matmul(np.multiply(X[index_pt, :], np.divide(1, np.linalg.norm(X[index_pt, :], axis=1))[:, np.newaxis]), W[Widx[i], :].transpose()) 190 | 191 | l = np.minimum(prob - 1, 0) 192 | d = prob - 1 193 | S = proportional_assign(l, d) 194 | 195 | elif weight_initialization_type == "random_hyperplane": 196 | print("TODO") 197 | 198 | elif weight_initialization_type == "random_assign": 199 | S = random_init_dirichlet(k, len(index_pt)) 200 | 201 | elif weight_initialization_type == "k_means": 202 | print("TODO") 203 | else: 204 | raise Exception("Not implemented yet!") 205 | 206 | return S 207 | 208 | def hydra_solver_svm(num_repetition, X, y, k, output_dir, num_consensus, num_iteration, tol, balanced, predefined_c, 209 | weight_initialization_type, n_threads, save_models, verbose): 210 | """ 211 | This is the main function of HYDRA, which find the convex polytope using a supervised classification fashion. 212 | Args: 213 | num_repetition: int, number of repetitions for CV 214 | X: input matrix for features 215 | y: input for group label 216 | k: number of clusters 217 | output_dir: the path for output 218 | num_consensus: int, number of runs for consensus clustering 219 | num_iteration: int, number of maximum iterations for running HYDRA 220 | tol: float, tolerance value for model convergence 221 | balanced: if sample imbalance should be considered during model optimization 222 | predefined_c: predefined c for SVM for clustering 223 | weight_initialization_type: the type of initialization of the weighted sample matrix 224 | n_threads: number of threads used 225 | save_models: if save all models during CV 226 | verbose: if output is verbose 227 | 228 | Returns: 229 | 230 | """ 231 | censensus_assignment = np.zeros((y[y == 1].shape[0], num_consensus)) ## only consider the PTs 232 | 233 | index_pt = np.where(y == 1)[0] # index for PTs 234 | index_cn = np.where(y == -1)[0] # index for CNs 235 | 236 | for i in range(num_consensus): 237 | weight_sample = np.ones((y.shape[0], k)) / k 238 | ## depending on the weight initialization strategy, random hyperplanes were initialized with maximum diversity to constitute the convex polytope 239 | weight_sample_pt = hydra_init_weight(X, y, k, index_pt, index_cn, weight_initialization_type) 240 | weight_sample[index_pt] = weight_sample_pt ## only replace the sample weight of the PT group 241 | ## cluster assignment is based on this svm scores across different SVM/hyperplanes 242 | svm_scores = np.zeros((weight_sample.shape[0], weight_sample.shape[1])) 243 | update_weights_pool = ThreadPool(processes=n_threads) 244 | 245 | for j in range(num_iteration): 246 | for m in range(k): 247 | sample_weight = np.ascontiguousarray(weight_sample[:, m]) 248 | if np.count_nonzero(sample_weight[index_pt]) == 0: 249 | if verbose == True: 250 | print( 251 | "Cluster dropped, meaning that all PT has been assigned to one single hyperplane in iteration: %d" % ( 252 | j - 1)) 253 | print( 254 | "Be careful, this could cause problem because of the ill-posed solution. 
Especially when k==2") 255 | else: 256 | results = update_weights_pool.apply_async(launch_svc, 257 | args=(X, y, predefined_c, sample_weight, balanced)) 258 | weight_coef = results.get()[0] 259 | intesept = results.get()[1] 260 | ## Apply the data again the trained model to get the final SVM scores 261 | svm_scores[:, m] = (np.matmul(weight_coef, X.transpose()) + intesept).transpose().squeeze() 262 | 263 | cluster_index = np.argmax(svm_scores[index_pt], axis=1) 264 | 265 | ## decide the converge of the polytope based on the toleration 266 | weight_sample_hold = weight_sample.copy() 267 | # after each iteration, first set the weight of patient rows to be 0 268 | weight_sample[index_pt, :] = 0 269 | # then set the pt's weight to be 1 for the assigned hyperplane 270 | for n in range(len(index_pt)): 271 | weight_sample[index_pt[n], cluster_index[n]] = 1 272 | 273 | ## check the loss comparted to the tolorence for stopping criteria 274 | loss = np.linalg.norm(np.subtract(weight_sample, weight_sample_hold), ord='fro') 275 | if verbose == True: 276 | print("The loss is: %f" % loss) 277 | if loss < tol: 278 | if verbose == True: 279 | print( 280 | "The polytope has been converged for iteration %d in finding %d clusters in consensus running: %d" % ( 281 | j, k, i)) 282 | break 283 | update_weights_pool.close() 284 | update_weights_pool.join() 285 | 286 | ## update the cluster index for the consensus clustering 287 | censensus_assignment[:, i] = cluster_index + 1 288 | 289 | ## do censensus clustering 290 | final_predict = consensus_clustering(censensus_assignment.astype(int), k) 291 | 292 | ## after deciding the final convex polytope, we refit the training data once to save the best model 293 | weight_sample_final = np.zeros((y.shape[0], k)) 294 | ## change the weight of PTs to be 1, CNs to be 1/k 295 | 296 | # then set the pt's weight to be 1 for the assigned hyperplane 297 | for n in range(len(index_pt)): 298 | weight_sample_final[index_pt[n], final_predict[n]] = 1 299 | 300 | weight_sample_final[index_cn] = 1 / k 301 | update_weights_pool_final = ThreadPool(processes=n_threads) 302 | ## create the final polytope by applying all weighted subjects 303 | for o in range(k): 304 | sample_weight = np.ascontiguousarray(weight_sample_final[:, o]) 305 | results = update_weights_pool_final.apply_async(launch_svc, args=(X, y, predefined_c, sample_weight, balanced)) 306 | 307 | if not os.path.exists(os.path.join(output_dir, str(k) + '_clusters', 'models')): 308 | os.makedirs(os.path.join(output_dir, str(k) + '_clusters', 'models')) 309 | 310 | ## save the final model for the k SVMs/hyperplanes 311 | if save_models == True: 312 | if not os.path.exists(os.path.join(output_dir, str(k) + '_clusters', 'models')): 313 | os.makedirs(os.path.join(output_dir, str(k) + '_clusters', 'models')) 314 | 315 | dump(results.get()[2], os.path.join(output_dir, str(k) + '_clusters', 'models', 316 | 'svm-' + str(o) + '_cv_' + str(num_repetition) + '.joblib')) 317 | else: 318 | ## only save the last repetition 319 | if not os.path.isfile(os.path.join(output_dir, str(k) + '_clusters', 'models', 320 | 'svm-' + str(o) + '_last_repetition.joblib')): 321 | dump(results.get()[2], os.path.join(output_dir, str(k) + '_clusters', 'models', 322 | 'svm-' + str(o) + '_last_repetition.joblib')) 323 | update_weights_pool_final.close() 324 | update_weights_pool_final.join() 325 | 326 | y[index_pt] = final_predict + 1 327 | 328 | if not os.path.exists(os.path.join(output_dir, str(k) + '_clusters', 'tsv')): 329 | 
os.makedirs(os.path.join(output_dir, str(k) + '_clusters', 'tsv')) 330 | 331 | ### also save the results in tsv files for each repetition 332 | ## save the assigned weight for each subject across the k hyperplanes 333 | columns = ['hyperplane' + str(i) for i in range(k)] 334 | weight_sample_df = pd.DataFrame(weight_sample_final, columns=columns) 335 | weight_sample_df.to_csv( 336 | os.path.join(output_dir, str(k) + '_clusters', 'tsv', 'weight_sample_cv_' + str(num_repetition) + '.tsv'), 337 | index=False, sep='\t', encoding='utf-8') 338 | 339 | ## save the final predicted labels (y_hat) 340 | columns = ['y_hat'] 341 | y_hat_df = pd.DataFrame(y, columns=columns) 342 | y_hat_df.to_csv(os.path.join(output_dir, str(k) + '_clusters', 'tsv', 'y_hat_cv_' + str(num_repetition) + '.tsv'), 343 | index=False, sep='\t', encoding='utf-8') 344 | 345 | ## save the patient (PT) indices 346 | columns = ['pt_index'] 347 | pt_df = pd.DataFrame(index_pt, columns=columns) 348 | pt_df.to_csv(os.path.join(output_dir, str(k) + '_clusters', 'tsv', 'pt_index_cv_' + str(num_repetition) + '.tsv'), 349 | index=False, sep='\t', encoding='utf-8') 350 | 351 | return y 352 | 353 | def GLMcorrection(X_train, Y_train, covar_train, X_test, covar_test): 354 | """ 355 | Remove the confounding effect of covariates, such as age and sex, from the disease-related changes. 356 | Ref: "Age Correction in Dementia Matching to a Healthy Brain" 357 | :param X_train: array, training features 358 | :param Y_train: array, training labels 359 | :param covar_train: array, training covariate data 360 | :param X_test: array, test features 361 | :param covar_test: array, test covariate data 362 | :return: corrected training & test feature data 363 | """ 364 | Yc = X_train[Y_train == -1] 365 | Xc = covar_train[Y_train == -1] 366 | Xc = np.concatenate((Xc, np.ones((Xc.shape[0], 1))), axis=1) 367 | beta = np.matmul(np.matmul(Yc.transpose(), Xc), np.linalg.inv(np.matmul(Xc.transpose(), Xc))) 368 | num_col = beta.shape[1] 369 | X_train_cor = (X_train.transpose() - np.matmul(beta[:, : num_col - 1], covar_train.transpose())).transpose() 370 | X_test_cor = (X_test.transpose() - np.matmul(beta[:, : num_col - 1], covar_test.transpose())).transpose() 371 | 372 | return X_train_cor, X_test_cor 373 | 374 | def launch_svc(X, y, predefined_c, sample_weight, balanced): 375 | """ 376 | Launch the linear SVC classifier of scikit-learn 377 | Args: 378 | X: input matrix for features 379 | y: input vector of group labels 380 | predefined_c: predefined C for the SVM 381 | sample_weight: per-sample weights (one column of the weighted sample matrix) 382 | balanced: if True, use class_weight='balanced' to account for class imbalance 383 | 384 | Returns: 385 | the hyperplane coefficients, the intercept and the fitted SVC model 386 | """ 387 | if not balanced: 388 | model = SVC(kernel='linear', C=predefined_c) 389 | else: 390 | model = SVC(kernel='linear', C=predefined_c, class_weight='balanced') 391 | 392 | ## fit one of the k SVMs/hyperplanes 393 | model.fit(X, y, sample_weight=sample_weight) 394 | 395 | weight_coef = model.coef_ 396 | intercept = model.intercept_ 397 | 398 | return weight_coef, intercept, model 399 | 400 | def check_symmetric(a, rtol=1e-05, atol=1e-08): 401 | """ 402 | Check whether a numpy array is symmetric 403 | Args: 404 | a: the array to check 405 | rtol: relative tolerance 406 | atol: absolute tolerance 407 | 408 | Returns: 409 | True if the array is symmetric within the given tolerances, False otherwise 410 | """ 411 | result = np.allclose(a, a.T, rtol=rtol, atol=atol) 412 | return result 413 | 414 | def make_cv_partition(diagnosis, cv_strategy, output_dir, cv_repetition, seed=None): 415 | """ 416 | Randomly generate the data split indices for the chosen CV strategy. 
417 | 418 | :param diagnosis: the list for labels 419 | :param cv_repetition: the number of repetitions or folds 420 | :param output_dir: the output folder path 421 | :param cv_repetition: the number of repetitions for CV 422 | :param seed: random seed for sklearn split generator. Default is None 423 | :return: 424 | """ 425 | unique = list(set(diagnosis)) 426 | y = np.array(diagnosis) 427 | if len(unique) == 2: ### CV for classification and clustering 428 | if cv_strategy == 'k_fold': 429 | splits_indices_pickle = os.path.join(output_dir, 'data_split_stratified_' + str(cv_repetition) + '-fold.pkl') 430 | ## try to see if the shuffle has been done 431 | if os.path.isfile(splits_indices_pickle): 432 | splits_indices = pickle.load(open(splits_indices_pickle, 'rb')) 433 | else: 434 | splits = StratifiedKFold(n_splits=cv_repetition, random_state=seed) 435 | splits_indices = list(splits.split(np.zeros(len(y)), y)) 436 | elif cv_strategy == 'hold_out': 437 | splits_indices_pickle = os.path.join(output_dir, 'data_split_stratified_' + str(cv_repetition) + '-holdout.pkl') 438 | ## try to see if the shuffle has been done 439 | if os.path.isfile(splits_indices_pickle): 440 | splits_indices = pickle.load(open(splits_indices_pickle, 'rb')) 441 | else: 442 | splits = StratifiedShuffleSplit(n_splits=cv_repetition, test_size=0.2, random_state=seed) 443 | splits_indices = list(splits.split(np.zeros(len(y)), y)) 444 | else: 445 | raise Exception("this cross validation strategy has not been implemented!") 446 | elif len(unique) == 1: 447 | raise Exception("Diagnosis cannot be the same for all participants...") 448 | else: ### CV for regression, no need to be stratified 449 | if cv_strategy == 'k_fold': 450 | splits_indices_pickle = os.path.join(output_dir, 'data_split_' + str(cv_repetition) + '-fold.pkl') 451 | 452 | ## try to see if the shuffle has been done 453 | if os.path.isfile(splits_indices_pickle): 454 | splits_indices = pickle.load(open(splits_indices_pickle, 'rb')) 455 | else: 456 | splits = KFold(n_splits=cv_repetition, random_state=seed) 457 | splits_indices = list(splits.split(np.zeros(len(y)), y)) 458 | elif cv_strategy == 'hold_out': 459 | splits_indices_pickle = os.path.join(output_dir, 'data_split_' + str(cv_repetition) + '-holdout.pkl') 460 | ## try to see if the shuffle has been done 461 | if os.path.isfile(splits_indices_pickle): 462 | splits_indices = pickle.load(open(splits_indices_pickle, 'rb')) 463 | else: 464 | splits = ShuffleSplit(n_splits=cv_repetition, test_size=0.2, random_state=seed) 465 | splits_indices = list(splits.split(np.zeros(len(y)), y)) 466 | else: 467 | raise Exception("this cross validation strategy has not been implemented!") 468 | 469 | with open(splits_indices_pickle, 'wb') as s: 470 | pickle.dump(splits_indices, s) 471 | 472 | return splits_indices, splits_indices_pickle 473 | 474 | def consensus_clustering(clustering_results, k): 475 | """ 476 | This function performs consensus clustering on a co-occurence matrix 477 | :param clustering_results: an array containing all the clustering results across different iterations, in order to 478 | perform 479 | :param k: 480 | :return: 481 | """ 482 | 483 | num_pt = clustering_results.shape[0] 484 | cooccurence_matrix = np.zeros((num_pt, num_pt)) 485 | 486 | for i in range(num_pt - 1): 487 | for j in range(i + 1, num_pt): 488 | cooccurence_matrix[i, j] = sum(clustering_results[i, :] == clustering_results[j, :]) 489 | 490 | cooccurence_matrix = np.add(cooccurence_matrix, cooccurence_matrix.transpose()) 491 | ## here is to 
compute the Laplacian matrix 492 | Laplacian = np.subtract(np.diag(np.sum(cooccurence_matrix, axis=1)), cooccurence_matrix) 493 | 494 | Laplacian_norm = np.subtract(np.eye(num_pt), np.matmul(np.matmul(np.diag(1 / np.sqrt(np.sum(cooccurence_matrix, axis=1))), cooccurence_matrix), np.diag(1 / np.sqrt(np.sum(cooccurence_matrix, axis=1))))) 495 | ## replace the nan with 0 496 | Laplacian_norm = np.nan_to_num(Laplacian_norm) 497 | 498 | ## check if the Laplacian norm is symmetric or not, because matlab eig function will automatically check this, but not in numpy or scipy 499 | if check_symmetric(Laplacian_norm): 500 | ## extract the eigen value and vector 501 | ## matlab eig equivalence is eigh, not eig from numpy or scipy, see this post: https://stackoverflow.com/questions/8765310/scipy-linalg-eig-return-complex-eigenvalues-for-covariance-matrix 502 | ## Note, the eigenvector is not unique, thus the matlab and python eigenvector may be different, but this will not affect the results. 503 | evalue, evector = scipy.linalg.eigh(Laplacian_norm) 504 | else: 505 | # evalue, evector = np.linalg.eig(Laplacian_norm) 506 | raise Exception("The Laplacian matrix should be symmetric here...") 507 | 508 | ## check if the eigen vector is complex 509 | if np.any(np.iscomplex(evector)): 510 | evalue, evector = scipy.linalg.eigh(Laplacian) 511 | 512 | ## create the kmean algorithm with sklearn 513 | kmeans = KMeans(n_clusters=k, n_init=20).fit(evector.real[:, 0: k]) 514 | final_predict = kmeans.labels_ 515 | 516 | return final_predict 517 | 518 | def cv_cluster_stability(result, k): 519 | """ 520 | To compute the adjusted rand index across different pair of 2 folds cross CV 521 | :param result: 522 | :return: 523 | """ 524 | 525 | num_pair = 0 526 | aris = [] 527 | if k == 1: 528 | adjusted_rand_index = 0 ## note, here, we manually set it to be 0, because it does not make sense when k==1. TODO, need to clarify if there is really heterogeneity in the data, i.e., k == 1 or k>1 529 | else: 530 | for i in range(result.shape[1] - 1): 531 | for j in range(i+1, result.shape[1]): 532 | num_pair += 1 533 | non_zero_index = np.all(result[:, [i, j]], axis=1) 534 | pair_result = result[:, [i, j]][non_zero_index] 535 | ari = adjusted_rand_score(pair_result[:, 0], pair_result[:, 1]) 536 | aris.append(ari) 537 | 538 | adjusted_rand_index = np.mean(np.asarray(aris)) 539 | 540 | return adjusted_rand_index 541 | 542 | def hydra_solver_svm_tl(num_component, num_component_former, num_repetition, X, y, k, output_dir, num_iteration, tol, balanced, predefined_c, n_threads, num_run): 543 | """ 544 | This is the main function of HYDRA, which find the convex polytope using a supervised classification fashion. 545 | :param num_repetition: the number of iteration of CV currently. 
This is helpful to reconstruct the model and also moniter the processing 546 | :param X: corrected training data feature 547 | :param y: traing data label 548 | :param k: hyperparameter for desired number of clusters in patients 549 | :param options: commandline parameters 550 | :return: the optimal model 551 | """ 552 | index_pt = np.where(y == 1)[0] # index for PTs 553 | index_cn = np.where(y == -1)[0] # index for CNs 554 | 555 | ### initialize the final weight for the polytope from the former C 556 | weight_file = os.path.join(output_dir, 'clustering_run' + str(num_run-1), 'component_' + str(num_component_former), str(k) + '_clusters', 'tsv', 'weight_sample_cv_' + str(num_repetition) + '.tsv') 557 | weight_sample = pd.read_csv(weight_file, sep='\t').to_numpy() 558 | 559 | ## cluster assignment is based on this svm scores across different SVM/hyperplanes 560 | svm_scores = np.zeros((weight_sample.shape[0], weight_sample.shape[1])) 561 | update_weights_pool = ThreadPool(n_threads) 562 | for j in range(num_iteration): 563 | for m in range(k): 564 | sample_weight = np.ascontiguousarray(weight_sample[:, m]) 565 | 566 | if np.count_nonzero(sample_weight[index_pt]) == 0: 567 | print("Cluster dropped, meaning that all PT has been assigned to one single hyperplane in iteration: %d" % (j-1)) 568 | svm_scores[:, m] = np.asarray([np.NINF] * (y.shape[0])) 569 | else: 570 | 571 | results = update_weights_pool.apply_async(launch_svc, args=(X, y, predefined_c, sample_weight, balanced)) 572 | weight_coef = results.get()[0] 573 | intesept = results.get()[1] 574 | ## Apply the data again the trained model to get the final SVM scores 575 | svm_scores[:, m] = (np.matmul(weight_coef, X.transpose()) + intesept).transpose().squeeze() 576 | 577 | 578 | final_predict = np.argmax(svm_scores[index_pt], axis=1) 579 | 580 | ## decide the converge of the polytope based on the toleration 581 | weight_sample_hold = weight_sample.copy() 582 | # after each iteration, first set the weight of patient rows to be 0 583 | weight_sample[index_pt, :] = 0 584 | # then set the pt's weight to be 1 for the assigned hyperplane 585 | for n in range(len(index_pt)): 586 | weight_sample[index_pt[n], final_predict[n]] = 1 587 | 588 | ## check the loss comparted to the tolorence for stopping criteria 589 | loss = np.linalg.norm(np.subtract(weight_sample, weight_sample_hold), ord='fro') 590 | print("The loss is: %f" % loss) 591 | if loss < tol: 592 | print("The polytope has been converged for iteration %d in finding %d clusters" % (j, k)) 593 | break 594 | update_weights_pool.close() 595 | update_weights_pool.join() 596 | 597 | ## after deciding the final convex polytope, we refit the training data once to save the best model 598 | weight_sample_final = np.zeros((y.shape[0], k)) 599 | ## change the weight of PTs to be 1, CNs to be 1/k 600 | 601 | # then set the pt's weight to be 1 for the assigned hyperplane 602 | for n in range(len(index_pt)): 603 | weight_sample_final[index_pt[n], final_predict[n]] = 1 604 | 605 | weight_sample_final[index_cn] = 1 / k 606 | update_weights_pool_final = ThreadPool(n_threads) 607 | 608 | for o in range(k): 609 | sample_weight = np.ascontiguousarray(weight_sample_final[:, o]) 610 | if np.count_nonzero(sample_weight[index_pt]) == 0: 611 | print("Cluster dropped, meaning that the %d th hyperplane is useless!" 
% (o)) 612 | else: 613 | results = update_weights_pool_final.apply_async(launch_svc, args=(X, y, predefined_c, sample_weight, balanced)) 614 | 615 | ## save the final model for the k SVMs/hyperplanes 616 | if not os.path.exists( 617 | os.path.join(output_dir, 'clustering_run' + str(num_run), 'component_' + str(num_component), 618 | str(k) + '_clusters', 'models')): 619 | os.makedirs(os.path.join(output_dir, 'clustering_run' + str(num_run), 'component_' + str(num_component), 620 | str(k) + '_clusters', 'models')) 621 | 622 | dump(results.get()[2], 623 | os.path.join(output_dir, 'clustering_run' + str(num_run), 'component_' + str(num_component), 624 | str(k) + '_clusters', 'models', 625 | 'svm-' + str(o) + '_last_repetition.joblib')) 626 | 627 | update_weights_pool_final.close() 628 | update_weights_pool_final.join() 629 | 630 | y[index_pt] = final_predict + 1 631 | 632 | if not os.path.exists(os.path.join(output_dir, 'clustering_run' + str(num_run), 'component_' + str(num_component), str(k) + '_clusters', 'tsv')): 633 | os.makedirs(os.path.join(output_dir, 'clustering_run' + str(num_run), 'component_' + str(num_component), str(k) + '_clusters', 'tsv')) 634 | 635 | ## save the assigned weight for each subject across k-fold 636 | columns = ['hyperplane' + str(i) for i in range(k)] 637 | weight_sample_df = pd.DataFrame(weight_sample_final, columns=columns) 638 | weight_sample_df.to_csv(os.path.join(output_dir, 'clustering_run' + str(num_run), 'component_' + str(num_component), str(k) + '_clusters', 'tsv', 'weight_sample_cv_' + str(num_repetition) + '.tsv'), index=False, sep='\t', encoding='utf-8') 639 | 640 | ## save the final_predict_all 641 | columns = ['y_hat'] 642 | y_hat_df = pd.DataFrame(y, columns=columns) 643 | y_hat_df.to_csv(os.path.join(output_dir, 'clustering_run' + str(num_run), 'component_' + str(num_component), str(k) + '_clusters', 'tsv', 'y_hat_cv_' + str(num_repetition) + '.tsv'), index=False, sep='\t', encoding='utf-8') 644 | 645 | ## save the pt index 646 | columns = ['pt_index'] 647 | pt_df = pd.DataFrame(index_pt, columns=columns) 648 | pt_df.to_csv(os.path.join(output_dir, 'clustering_run' + str(num_run), 'component_' + str(num_component), str(k) + '_clusters', 'tsv', 'pt_index_cv_' + str(num_repetition) + '.tsv'), index=False, sep='\t', encoding='utf-8') 649 | 650 | return y 651 | 652 | def cluster_stability_across_resolution(c, c_former, output_dir, k_continuing, num_run, stop_tol=0.98): 653 | """ 654 | To evaluate the stability of clustering across two different C for stopping criterion. 
655 | Args: 656 | c: int, the current value of C (number of components) 657 | c_former: int, the value of C used in the former run 658 | output_dir: the path to the output folder 659 | k_continuing: list, the K values that have not yet converged 660 | num_run: int, the index of the current clustering run 661 | stop_tol: float, ARI threshold used as the stopping criterion 662 | 663 | 664 | Returns: 665 | the updated list of K values that continue to the next run and the list of converged K values 666 | """ 667 | ## read the outputs of the current C and the former C 668 | cluster_ass1 = os.path.join(output_dir, 'clustering_run' + str(num_run), 'component_' + str(c), 'clustering_assignment.tsv') 669 | ass1_df = pd.read_csv(cluster_ass1, sep='\t') 670 | ass1_df = ass1_df.loc[ass1_df['diagnosis'] == 1] 671 | 672 | cluster_ass2 = os.path.join(output_dir, 'clustering_run' + str(num_run-1), 'component_' + str(c_former), 'clustering_assignment.tsv') 673 | ass2_df = pd.read_csv(cluster_ass2, sep='\t') 674 | ass2_df = ass2_df.loc[ass2_df['diagnosis'] == 1] 675 | 676 | df_final = pd.DataFrame(columns=['C', 'K', 'num_run']) 677 | 678 | k_continuing_update = [] 679 | k_converged = [] 680 | for i in k_continuing: 681 | ari = adjusted_rand_score(ass1_df['assignment_' + str(i)], ass2_df['assignment_' + str(i)]) 682 | print("For k == %d, run %d obtained ARI == %f compared to the former run" % (i, num_run, ari)) 683 | if ari < stop_tol and num_run: 684 | k_continuing_update.append(i) 685 | else: 686 | print("Model converged or stopped at the maximum number of runs: C == %d, K == %d and run == %d" % (c, i, num_run)) 687 | k_converged.append(i) 688 | df_row = pd.DataFrame(columns=['C', 'K', 'num_run']) 689 | df_row.loc[len(['C', 'K', 'num_run'])] = [c, i, num_run] 690 | df_final = df_final.append(df_row) 691 | 692 | if len(k_converged) != 0: 693 | df_final.to_csv(os.path.join(output_dir, 'results_convergence_run' + str(num_run) + '.tsv'), index=False, sep='\t', encoding='utf-8') 694 | 695 | return k_continuing_update, k_converged 696 | 697 | def summary_clustering_result_multiscale(output_dir, k_min, k_max): 698 | """ 699 | This function summarizes the final clustering assignments across all converged runs 700 | :param output_dir: the path to the output folder 701 | :param k_min: the minimum number of clusters 702 | :param k_max: the maximum number of clusters 703 | :return: None; the merged assignments are written to results_cluster_assignment_final.tsv 704 | 705 | """ 706 | clu_col_list = ['assignment_' + str(e) for e in range(k_min, k_max)] 707 | df_clusters = pd.DataFrame(columns=clu_col_list) 708 | 709 | ## read the convergence tsv files 710 | convergence_tsvs = [f for f in glob.glob(output_dir + "/results_convergence_*.tsv", recursive=True)] 711 | 712 | for tsv in convergence_tsvs: 713 | df_convergence = pd.read_csv(tsv, sep='\t') 714 | 715 | ## sort by K 716 | df_convergence = df_convergence.sort_values(by=['K']) 717 | 718 | for i in range(df_convergence.shape[0]): 719 | k = df_convergence['K'].tolist()[i] 720 | num_run = df_convergence['num_run'].tolist()[i] 721 | C = df_convergence['C'].tolist()[i] 722 | cluster_file = os.path.join(output_dir, 'clustering_run' + str(num_run), 'component_' + str(C), 'clustering_assignment.tsv') 723 | 724 | df_cluster = pd.read_csv(cluster_file, sep='\t') 725 | if i == 0: 726 | df_header = df_cluster.iloc[:, 0:3] 727 | assign = df_cluster['assignment_' + str(k)] 728 | df_clusters['assignment_' + str(k)] = assign 729 | 730 | ## concatenate the header 731 | df_assignment = pd.concat((df_header, df_clusters), axis=1) 732 | 733 | ## save the result 734 | df_assignment.to_csv(os.path.join(output_dir, 'results_cluster_assignment_final.tsv'), index=False, sep='\t', encoding='utf-8') 735 | 736 | def shift_list(c_list, index): 737 | """ 738 | Rotate a list so that the element at the given index comes first; this is used to enumerate orderings that put each element in the first place 739 | Args: 740 | c_list: the list to rotate 741 | index: the index of the element to move to the front 742 | 743 | Returns: 744 | the rotated list 745 | """ 746 | new_list = 
c_list[index:] + c_list[:index] 747 | 748 | return new_list 749 | 750 | def consensus_clustering_across_c(output_dir, c_list, k_min, k_max): 751 | """ 752 | This is for consensus learning at the end across different Cs 753 | Args: 754 | output_dir: 755 | c_list: 756 | 757 | Returns: 758 | 759 | """ 760 | k_list = list(range(k_min, k_max+1)) 761 | for k in k_list: 762 | for i in c_list: 763 | clu_col_list = ['c_' + str(i) + '_assignment_' + str(e) for e in k_list] 764 | df_clusters = pd.DataFrame(columns=clu_col_list) 765 | 766 | tsv = os.path.join(output_dir, 'initialization_c_' + str(i), 'results_cluster_assignment_final.tsv') 767 | df = pd.read_csv(tsv, sep='\t') 768 | 769 | if i == c_list[0]: 770 | df_header = df.iloc[:, 0:3] 771 | df_clusters['c_' + str(i) + '_assignment_' + str(k)] = df['assignment_' + str(k)] 772 | if i == c_list[0]: 773 | df_final = df_clusters 774 | else: 775 | df_final = pd.concat([df_final, df_clusters], axis=1) 776 | 777 | ## concatenate the header and the results 778 | df_final = pd.concat([df_header, df_final], axis=1) 779 | df_final_pt = df_final.loc[df_final['diagnosis'] == 1] 780 | df_final_cn = df_final.loc[df_final['diagnosis'] == -1] 781 | num_cn = df_final_cn.shape[0] 782 | 783 | ## create the final dataframe to store the final assignment 784 | col_list = ['assignment_' + str(e) for e in k_list] 785 | df_final_assign = pd.DataFrame(columns=col_list) 786 | 787 | ## read the final clustering assignment for each C 788 | for m in k_list: 789 | columns_names = ['c_' + str(e) + '_assignment_' + str(m) for e in c_list] 790 | assignment_pt = df_final_pt[columns_names] 791 | final_predict_pt = consensus_clustering(assignment_pt.to_numpy(), m) 792 | final_predict_cn = -2 * np.ones(num_cn) 793 | final_predict = np.concatenate((final_predict_cn, final_predict_pt)).astype(int) 794 | df_final_assign['assignment_' + str(m)] = final_predict + 1 795 | 796 | df_final_assign = pd.concat([df_header, df_final_assign], axis=1) 797 | ## save the final results into tsv file. 798 | df_final_assign.to_csv(os.path.join(output_dir, 'results_cluster_assignment_final.tsv'), index=False, sep='\t', 799 | encoding='utf-8') --------------------------------------------------------------------------------
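The GLMcorrection helper defined in the file above implements a standard covariate-correction trick: a linear model of the features on the covariates is fitted using only the control subjects (label -1), and the fitted covariate contribution is then subtracted from every subject, train and test alike. The snippet below is a hedged usage sketch on synthetic data, not part of the repository; the import path magiccluster.utils is an assumption and should be pointed at whichever module in this package actually defines GLMcorrection.

```python
# Hedged usage sketch for GLMcorrection on synthetic data (illustration only).
# NOTE: the import path is an assumption; adjust it to the module that defines GLMcorrection.
import numpy as np
from magiccluster.utils import GLMcorrection  # hypothetical module path

rng = np.random.RandomState(0)
n_train, n_test, n_feat = 100, 40, 20
covar_train = rng.normal(size=(n_train, 2))             # e.g. age and sex, encoded numerically
covar_test = rng.normal(size=(n_test, 2))
y_train = np.concatenate([-np.ones(50), np.ones(50)])   # -1 = control (CN), 1 = patient (PT)

# inject the same linear covariate effect into the training and test features
effect = rng.normal(size=(2, n_feat))
X_train = rng.normal(size=(n_train, n_feat)) + covar_train @ effect
X_test = rng.normal(size=(n_test, n_feat)) + covar_test @ effect

# the correction is estimated on controls only and applied to both splits
X_train_cor, X_test_cor = GLMcorrection(X_train, y_train, covar_train, X_test, covar_test)
print(X_train_cor.shape, X_test_cor.shape)              # (100, 20) (40, 20)
```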
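The consensus_clustering and cv_cluster_stability helpers above follow a common recipe: count how often each pair of patients is assigned to the same cluster across repetitions, build the symmetric normalized graph Laplacian of that co-occurrence matrix, embed the patients with the eigenvectors of its smallest eigenvalues, run k-means on that embedding, and summarize stability as the mean pairwise adjusted Rand index. The sketch below is a self-contained, hypothetical illustration of that recipe on synthetic assignments using only numpy, scipy and scikit-learn; the helper name consensus_from_assignments is invented for the example and is not part of the package.

```python
# Minimal, self-contained sketch of co-occurrence consensus clustering and ARI stability
# (illustration only; it mirrors the approach of consensus_clustering / cv_cluster_stability).
import numpy as np
import scipy.linalg
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

def consensus_from_assignments(assignments, k, seed=0):
    """assignments: (n_subjects, n_runs) integer labels; returns one consensus label per subject."""
    n = assignments.shape[0]
    cooc = np.zeros((n, n))
    for i in range(n - 1):
        for j in range(i + 1, n):
            cooc[i, j] = np.sum(assignments[i, :] == assignments[j, :])
    cooc = cooc + cooc.T
    # symmetric normalized Laplacian: I - D^(-1/2) W D^(-1/2)
    d_inv_sqrt = np.diag(1.0 / np.sqrt(np.sum(cooc, axis=1)))
    lap = np.nan_to_num(np.eye(n) - d_inv_sqrt @ cooc @ d_inv_sqrt)
    _, evec = scipy.linalg.eigh(lap)      # eigh: symmetric matrix, eigenvalues in ascending order
    embedding = evec[:, :k]               # eigenvectors of the k smallest eigenvalues
    return KMeans(n_clusters=k, n_init=20, random_state=seed).fit(embedding).labels_

# toy example: 60 "patients", 10 noisy repetitions of a 2-cluster assignment
rng = np.random.RandomState(42)
truth = np.repeat([0, 1], 30)
runs = np.stack([np.where(rng.rand(60) < 0.1, 1 - truth, truth) for _ in range(10)], axis=1)

consensus = consensus_from_assignments(runs, k=2)
# stability across repetitions, analogous to cv_cluster_stability: mean pairwise ARI
aris = [adjusted_rand_score(runs[:, i], runs[:, j])
        for i in range(runs.shape[1] - 1) for j in range(i + 1, runs.shape[1])]
print("consensus vs ground truth ARI:", adjusted_rand_score(truth, consensus))
print("mean pairwise ARI across runs :", float(np.mean(aris)))
```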