├── examples
│   ├── __init__.py
│   ├── reduce_mnist.py
│   ├── reduce_twomoons.py
│   ├── cluster_mnist.py
│   ├── cluster_twomoons.py
│   └── data.py
├── src
│   ├── tests
│   │   └── __init__.py
│   └── spectralnet
│       ├── _trainers
│       │   ├── _trainer.py
│       │   ├── __init__.py
│       │   ├── _ae_trainer.py
│       │   ├── _spectralnet_trainer.py
│       │   └── _siamesenet_trainer.py
│       ├── _losses
│       │   ├── __init__.py
│       │   ├── _spectralnet_loss.py
│       │   └── _siamese_loss.py
│       ├── _models
│       │   ├── __init__.py
│       │   ├── _siamesenet_model.py
│       │   ├── _ae_model.py
│       │   └── _spectralnet_model.py
│       ├── __init__.py
│       ├── _metrics.py
│       ├── _cluster.py
│       ├── _reduction.py
│       └── _utils.py
├── setup.py
├── docs
│   ├── paper.png
│   ├── twomoons.png
│   └── index.html
├── figures
│   └── twomoons.png
├── req.txt
├── .gitignore
├── pyproject.toml
├── setup.cfg
├── data
│   └── Reuters
│       ├── get_reuters_data.sh
│       └── make_reuters.py
├── LICENSE.md
└── README.md

/examples/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 |
3 | setuptools.setup()
4 |
--------------------------------------------------------------------------------
/docs/paper.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shaham-lab/SpectralNet/HEAD/docs/paper.png
--------------------------------------------------------------------------------
/docs/twomoons.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shaham-lab/SpectralNet/HEAD/docs/twomoons.png
--------------------------------------------------------------------------------
/figures/twomoons.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shaham-lab/SpectralNet/HEAD/figures/twomoons.png
--------------------------------------------------------------------------------
/src/spectralnet/_trainers/_trainer.py:
--------------------------------------------------------------------------------
1 | class Trainer:
2 |     def __init__(self):
3 |         pass
4 |
--------------------------------------------------------------------------------
/src/spectralnet/_losses/__init__.py:
--------------------------------------------------------------------------------
1 | from ._siamese_loss import ContrastiveLoss
2 | from ._spectralnet_loss import SpectralNetLoss
--------------------------------------------------------------------------------
/req.txt:
--------------------------------------------------------------------------------
1 | torch==2.0.0
2 | torchvision==0.15.1
3 | h5py
4 | numpy
5 | annoy
6 | scipy
7 | munkres
8 | matplotlib
9 | scikit-learn
10 |
--------------------------------------------------------------------------------
/src/spectralnet/_models/__init__.py:
--------------------------------------------------------------------------------
1 | from ._ae_model import AEModel
2 | from ._siamesenet_model import SiameseNetModel
3 | from ._spectralnet_model import SpectralNetModel
--------------------------------------------------------------------------------
/src/spectralnet/_trainers/__init__.py:
--------------------------------------------------------------------------------
1 | from ._ae_trainer import AETrainer
2 | from ._siamesenet_trainer import SiameseTrainer
3 | from ._spectralnet_trainer import SpectralTrainer
4 |
--------------------------------------------------------------------------------
/src/spectralnet/__init__.py:
--------------------------------------------------------------------------------
1 | from ._metrics import Metrics
2 | from ._cluster import SpectralNet
3 | from ._reduction import SpectralReduction
4 | from ._utils import *
5 |
6 | __all__ = [
7 |     "Metrics",
8 |     "SpectralNet",
9 |     "SpectralReduction",
10 | ]
11 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | __pycache__
3 | /data/MNIST
4 | dist
5 | spectralnet.egg-info
6 | /cluster_mnist.py
7 | /cluster_twomoons.py
8 | /data.py
9 | /src/data.py
10 | /src/reduce_mnist.py
11 | /src/reduce_twomoons.py
12 | /src/spectralnet/_reduction.py
13 | /src/spectralnet/_trainers/weights
14 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | # Minimum requirements for the build system to execute.
3 | requires = [
4 |     "setuptools",
5 |     "wheel",
6 |     "torch>=2.0.0",
7 |     "torchvision>=0.15.1",
8 |     "h5py>=3.8.0",
9 |     "numpy>=1.24",
10 |     "annoy>=1.17.1",
11 |     "scipy>=1.10.1",
12 |     "munkres",
13 |     "matplotlib",
14 |     "scikit-learn>=1.2.2"
15 | ]
16 | build-backend = "setuptools.build_meta"
--------------------------------------------------------------------------------
/examples/reduce_mnist.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 |
4 | from data import load_data
5 |
6 | from spectralnet import Metrics
7 | from spectralnet import SpectralReduction
8 |
9 |
10 | def main():
11 |     x_train, x_test, y_train, y_test = load_data("mnist")
12 |     X = torch.cat([x_train, x_test])
13 |
14 |     if y_train is not None:
15 |         y = torch.cat([y_train, y_test])
16 |     else:
17 |         y = None
18 |
19 |     spectralreduction = SpectralReduction(
20 |         n_components=3,
21 |         should_use_ae=True,
22 |         should_use_siamese=True,
23 |         spectral_hiddens=[512, 512, 2048, 3],
24 |     )
25 |
26 |     X_new = spectralreduction.fit_transform(X)
27 |     spectralreduction.visualize(X_new, y, n_components=2)
28 |
29 |
30 | if __name__ == "__main__":
31 |     main()
32 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = spectralnet
3 | version = 0.1.2
4 | author = Amitai
5 | description = Spectral Clustering Using Deep Neural Networks
6 | long_description = file: README.md
7 | long_description_content_type = text/markdown
8 | url = https://github.com/shaham-lab/SpectralNet.git
9 | project_urls =
10 |     Bug Tracker = https://github.com/shaham-lab/SpectralNet/issues
11 | classifiers =
12 |     Programming Language :: Python :: 3
13 |     License :: OSI Approved :: MIT License
14 |     Operating System :: OS Independent
15 |
16 | [options]
17 | package_dir =
18 |     = src
19 | packages = find:
20 | python_requires = >=3.11
21 | install_requires =
22 |     setuptools
23 |     wheel
24 |     torch>=2.0.0
25 |     torchvision>=0.15.1
26 |     h5py>=3.8.0
27 |     numpy>=1.24
28 |     annoy>=1.17.1
29 |     scipy>=1.10.1
30 |     munkres
31 |     matplotlib
32 |     scikit-learn>=1.2.2
33 |
34 | [options.packages.find]
35 | where = src
--------------------------------------------------------------------------------
/data/Reuters/get_reuters_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | wget http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a12-token-files/lyrl2004_tokens_test_pt0.dat.gz
3 | gunzip lyrl2004_tokens_test_pt0.dat.gz
4 | wget http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a12-token-files/lyrl2004_tokens_test_pt1.dat.gz
5 | gunzip lyrl2004_tokens_test_pt1.dat.gz
6 | wget http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a12-token-files/lyrl2004_tokens_test_pt2.dat.gz
7 | gunzip lyrl2004_tokens_test_pt2.dat.gz
8 | wget http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a12-token-files/lyrl2004_tokens_test_pt3.dat.gz
9 | gunzip lyrl2004_tokens_test_pt3.dat.gz
10 | wget http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a12-token-files/lyrl2004_tokens_train.dat.gz
11 | gunzip lyrl2004_tokens_train.dat.gz
12 | wget http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a08-topic-qrels/rcv1-v2.topics.qrels.gz
13 | gunzip rcv1-v2.topics.qrels.gz
--------------------------------------------------------------------------------
/examples/reduce_twomoons.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 |
4 | from data import load_data
5 | from spectralnet import SpectralReduction
6 |
7 |
8 | def main():
9 |     x_train, x_test, y_train, y_test = load_data("twomoons")
10 |     X = torch.cat([x_train, x_test])
11 |
12 |     if y_train is not None:
13 |         y = torch.cat([y_train, y_test])
14 |     else:
15 |         y = None
16 |
17 |     spectralreduction = SpectralReduction(
18 |         n_components=2,
19 |         should_use_ae=False,
20 |         should_use_siamese=False,
21 |         spectral_batch_size=712,
22 |         spectral_epochs=40,
23 |         spectral_is_local_scale=False,
24 |         spectral_n_nbg=8,
25 |         spectral_scale_k=2,
26 |         spectral_lr=1e-2,
27 |         spectral_hiddens=[128, 128, 2],
28 |     )
29 |
30 |     X_new = spectralreduction.fit_transform(X)
31 |     spectralreduction.visualize(X_new, y, n_components=1)
32 |
33 |
34 | if __name__ == "__main__":
35 |     main()
36 |
--------------------------------------------------------------------------------
/src/spectralnet/_models/_siamesenet_model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 |
5 | class SiameseNetModel(nn.Module):
6 |     def __init__(self, architecture: list, input_dim: int):
7 |         super(SiameseNetModel, self).__init__()
8 |         self.architecture = architecture
9 |         self.layers = nn.ModuleList()
10 |
11 |         current_dim = input_dim
12 |         for layer in self.architecture:
13 |             next_dim = layer
14 |             self.layers.append(
15 |                 nn.Sequential(nn.Linear(current_dim, next_dim), nn.ReLU())
16 |             )
17 |             current_dim = next_dim
18 |
19 |     def forward_once(self, x: torch.Tensor) -> torch.Tensor:
20 |         for layer in self.layers:
21 |             x = layer(x)
22 |         return x
23 |
24 |     def forward(self, x1: torch.Tensor, x2: torch.Tensor) -> tuple:
25 |         output1 = self.forward_once(x1)
26 |         output2 = self.forward_once(x2)
27 |         return output1, output2
28 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 Uri Shaham, Amitai Yacobi
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/examples/cluster_mnist.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 |
4 | from data import load_data
5 |
6 | from spectralnet import Metrics
7 | from spectralnet import SpectralNet
8 |
9 |
10 | def main():
11 |     x_train, x_test, y_train, y_test = load_data("mnist")
12 |
13 |     X = torch.cat([x_train, x_test])
14 |
15 |     if y_train is not None:
16 |         y = torch.cat([y_train, y_test])
17 |     else:
18 |         y = None
19 |
20 |     spectralnet = SpectralNet(
21 |         n_clusters=10,
22 |         should_use_ae=True,
23 |         should_use_siamese=True,
24 |     )
25 |     spectralnet.fit(X, y)
26 |     cluster_assignments = spectralnet.predict(X)
27 |     embeddings = spectralnet.embeddings_
28 |
29 |     if y is not None:
30 |         y = y.detach().cpu().numpy()
31 |         acc_score = Metrics.acc_score(cluster_assignments, y, n_clusters=10)
32 |         nmi_score = Metrics.nmi_score(cluster_assignments, y)
33 |         print(f"ACC: {np.round(acc_score, 3)}")
34 |         print(f"NMI: {np.round(nmi_score, 3)}")
35 |
36 |     return embeddings, cluster_assignments
37 |
38 |
39 | if __name__ == "__main__":
40 |     embeddings, assignments = main()
41 |
--------------------------------------------------------------------------------
/src/spectralnet/_losses/_spectralnet_loss.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 |
5 | class SpectralNetLoss(nn.Module):
6 |     def __init__(self):
7 |         super(SpectralNetLoss, self).__init__()
8 |
9 |     def forward(
10 |         self, W: torch.Tensor, Y: torch.Tensor, is_normalized: bool = False
11 |     ) -> torch.Tensor:
12 |         """
13 |         This function computes the loss of the SpectralNet model.
14 |         The loss is the Rayleigh quotient of the graph Laplacian obtained from W,
15 |         evaluated at the orthonormalized output of the network.
16 |
17 |         Args:
18 |             W (torch.Tensor): Affinity matrix
19 |             Y (torch.Tensor): Output of the network
20 |             is_normalized (bool, optional): Whether to use the normalized Laplacian matrix or not.
21 |
22 |         Returns:
23 |             torch.Tensor: The loss
24 |         """
25 |         m = Y.size(0)
26 |         if is_normalized:
27 |             D = torch.sum(W, dim=1)
28 |             Y = Y / torch.sqrt(D)[:, None]
29 |
30 |         Dy = torch.cdist(Y, Y)
31 |         loss = torch.sum(W * Dy.pow(2)) / (2 * m)
32 |
33 |         return loss
34 |
--------------------------------------------------------------------------------
/examples/cluster_twomoons.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 |
4 | from data import load_data
5 |
6 | from spectralnet import Metrics
7 | from spectralnet import SpectralNet
8 |
9 |
10 | def main():
11 |     x_train, x_test, y_train, y_test = load_data("twomoons")
12 |     X = torch.cat([x_train, x_test])
13 |
14 |     if y_train is not None:
15 |         y = torch.cat([y_train, y_test])
16 |     else:
17 |         y = None
18 |
19 |     spectralnet = SpectralNet(
20 |         n_clusters=2,
21 |         should_use_ae=False,
22 |         should_use_siamese=False,
23 |         spectral_batch_size=712,
24 |         spectral_epochs=40,
25 |         spectral_is_local_scale=False,
26 |         spectral_n_nbg=8,
27 |         spectral_scale_k=2,
28 |         spectral_lr=1e-2,
29 |         spectral_hiddens=[128, 128, 2],
30 |     )
31 |
32 |     spectralnet.fit(X, y)
33 |     cluster_assignments = spectralnet.predict(X)
34 |     embeddings = spectralnet.embeddings_
35 |
36 |     if y is not None:
37 |         y = y.detach().cpu().numpy()
38 |         acc_score = Metrics.acc_score(cluster_assignments, y, n_clusters=2)
39 |         nmi_score = Metrics.nmi_score(cluster_assignments, y)
40 |         print(f"ACC: {np.round(acc_score, 3)}")
41 |         print(f"NMI: {np.round(nmi_score, 3)}")
42 |
43 |     return embeddings, cluster_assignments
44 |
45 |
46 | if __name__ == "__main__":
47 |     embeddings, assignments = main()
48 |
--------------------------------------------------------------------------------
/src/spectralnet/_models/_ae_model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 |
5 | class AEModel(nn.Module):
6 |     def __init__(self, architecture: list, input_dim: int):
7 |         super(AEModel, self).__init__()
8 |         self.architecture = architecture
9 |         self.encoder = nn.ModuleList()
10 |         self.decoder = nn.ModuleList()
11 |
12 |         current_dim = input_dim
13 |         for i, layer in enumerate(self.architecture):
14 |             next_dim = layer
15 |             if i == len(self.architecture) - 1:
16 |                 self.encoder.append(nn.Sequential(nn.Linear(current_dim, next_dim)))
17 |             else:
18 |                 self.encoder.append(
19 |                     nn.Sequential(nn.Linear(current_dim, next_dim), nn.ReLU())
20 |                 )
21 |             current_dim = next_dim
22 |
23 |         last_dim = input_dim
24 |         current_dim = self.architecture[-1]
25 |         for i, layer in enumerate(reversed(self.architecture[:-1])):
26 |             next_dim = layer
27 |             self.decoder.append(
28 |                 nn.Sequential(nn.Linear(current_dim, next_dim), nn.ReLU())
29 |             )
30 |             current_dim = next_dim
31 |         self.decoder.append(nn.Sequential(nn.Linear(current_dim, last_dim)))
32 |
33 |     def encode(self, x: torch.Tensor) -> torch.Tensor:
34 |         for layer in self.encoder:
35 |             x = layer(x)
36 |         return x
37 |
38 |     def decode(self, x: torch.Tensor) -> torch.Tensor:
39 |         for layer in self.decoder:
40 |             x = layer(x)
41 |         return x
42 |
43 |     def forward(self, x: torch.Tensor) -> torch.Tensor:
44 |         x = self.encode(x)
45 |         x = self.decode(x)
46 |         return x
47 |
--------------------------------------------------------------------------------
/src/spectralnet/_losses/_siamese_loss.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 |
5 | class ContrastiveLoss(nn.Module):
6 |     def __init__(self, margin: float = 1.0):
7 |         super(ContrastiveLoss, self).__init__()
8 |         self.margin = margin
9 |
10 |     def forward(
11 |         self, output1: torch.Tensor, output2: torch.Tensor, label: torch.Tensor
12 |     ) -> torch.Tensor:
13 |         """
14 |         Compute the contrastive loss between the two outputs of the siamese network.
15 |
16 |         Parameters
17 |         ----------
18 |         output1 : torch.Tensor
19 |             The first output of the siamese network.
20 |         output2 : torch.Tensor
21 |             The second output of the siamese network.
22 |         label : torch.Tensor
23 |             The label indicating whether the two outputs are similar (1) or not (0).
24 |
25 |         Returns
26 |         -------
27 |         torch.Tensor
28 |             The computed contrastive loss value.
29 |
30 |         Notes
31 |         -----
32 |         This function takes the two outputs `output1` and `output2` of the siamese network,
33 |         along with the corresponding `label` indicating whether the outputs are similar (1) or not (0).
34 |         The contrastive loss is computed based on the Euclidean distance between the outputs and the label,
35 |         and the computed loss value is returned.
36 |         """
37 |
38 |         euclidean = nn.functional.pairwise_distance(output1, output2)
39 |         positive_distance = torch.pow(euclidean, 2)
40 |         negative_distance = torch.pow(torch.clamp(self.margin - euclidean, min=0.0), 2)
41 |         loss = torch.mean(
42 |             (label * positive_distance) + ((1 - label) * negative_distance)
43 |         )
44 |         return loss
45 |
--------------------------------------------------------------------------------
/data/Reuters/make_reuters.py:
--------------------------------------------------------------------------------
1 | import random
2 |
3 | import numpy as np
4 |
5 |
6 | # from dec (https://github.com/piiswrong/dec/tree/master/dec)
7 | def save_hdf5(X, y, name):
8 |     import h5py
9 |     with h5py.File('./{}.h5'.format(name), 'w') as f:
10 |         f['data'] = X
11 |         f['labels'] = y
12 |
13 |
14 | def make_reuters_data():
15 |     np.random.seed(1234)
16 |     random.seed(1234)
17 |     from sklearn.feature_extraction.text import CountVectorizer
18 |     did_to_cat = {}
19 |     cat_list = ['CCAT', 'GCAT', 'MCAT', 'ECAT']
20 |     with open('../Reuters/rcv1-v2.topics.qrels') as fin:
21 |         for line in fin.readlines():
22 |             line = line.strip().split(' ')
23 |             cat = line[0]
24 |             did = int(line[1])
25 |             if cat in cat_list:
26 |                 did_to_cat[did] = did_to_cat.get(did, []) + [cat]
27 |     for did in list(did_to_cat):
28 |         if len(did_to_cat[did]) > 1:
29 |             del did_to_cat[did]
30 |
31 |     dat_list = ['lyrl2004_tokens_test_pt0.dat',
32 |                 'lyrl2004_tokens_test_pt1.dat',
33 |                 'lyrl2004_tokens_test_pt2.dat',
34 |                 'lyrl2004_tokens_test_pt3.dat',
35 |                 'lyrl2004_tokens_train.dat']
36 |     data = []
37 |     target = []
38 |     cat_to_cid = {'CCAT': 0, 'GCAT': 1, 'MCAT': 2, 'ECAT': 3}
39 |     del did
40 |     for dat in dat_list:
41 |         with open('../Reuters/' + dat) as fin:
42 |             for line in fin.readlines():
43 |                 if line.startswith('.I'):
44 |                     if 'did' in locals():
45 |                         assert doc != ''
46 |                         if did in did_to_cat:
47 |                             data.append(doc)
48 |                             target.append(cat_to_cid[did_to_cat[did][0]])
49 |                     did = int(line.strip().split(' ')[1])
50 |                     doc = ''
51 |                 elif line.startswith('.W'):
52 |                     assert doc == ''
53 |                 else:
54 |                     doc += line
55 |
56 |     assert len(data) == len(did_to_cat)
57 |
58 |     X = CountVectorizer(dtype=np.float64, max_features=2000).fit_transform(data)
59 |     Y = np.asarray(target)
60 |
61 |     from sklearn.feature_extraction.text import TfidfTransformer
62 |     X = TfidfTransformer(norm='l2', sublinear_tf=True).fit_transform(X)
63 |     X = np.asarray(X.todense()) * np.sqrt(X.shape[1])
64 |
65 |     p = np.random.permutation(X.shape[0])
66 |     X = X[p]
67 |     Y = Y[p]
68 |
69 |     N = X.shape[0]
70 |     save_hdf5(X[:int(N * 4 / 5)], Y[:int(N * 4 / 5)], 'reutersidf_train')  # first 4/5 of the data
71 |     save_hdf5(X[int(N * 4 / 5):N], Y[int(N * 4 / 5):N], 'reutersidf_test')  # last 1/5 of the data
72 |     save_hdf5(X[:N], Y[:N], 'reutersidf_total')
73 |
74 |
75 | if __name__ == '__main__':
76 |     make_reuters_data()
77 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SpectralNet
2 |
4 |
5 |
6 | SpectralNet is a Python package that performs spectral clustering with deep neural networks.
7 | This package is based on the following paper: [SpectralNet](https://openreview.net/pdf?id=HJ_aoCyRZ).
8 |
9 | ## Installation
10 |
11 | You can install the latest version of the package via:
12 |
13 | ```bash
14 | pip install spectralnet
15 | ```
16 |
17 | ## Usage
18 |
19 | ### Clustering
20 |
21 | The basic functionality is quite intuitive and easy to use, e.g.,
22 |
23 | ```python
24 | from spectralnet import SpectralNet
25 |
26 | spectralnet = SpectralNet(n_clusters=10)
27 | spectralnet.fit(X)  # X is the dataset; it should be a torch.Tensor
28 | cluster_assignments = spectralnet.predict(X) # Get the final assignments to clusters
29 | ```
30 |
31 | If your dataset has labels and you want to measure ACC and NMI, you can do the following:
32 |
33 | ```python
34 | from spectralnet import SpectralNet
35 | from spectralnet import Metrics
36 | import numpy as np
37 |
38 | spectralnet = SpectralNet(n_clusters=2)
39 | spectralnet.fit(X, y)  # X is the dataset; it should be a torch.Tensor
40 | cluster_assignments = spectralnet.predict(X) # Get the final assignments to clusters
41 |
42 | y = y.detach().cpu().numpy()  # In case your labels are a torch.Tensor.
43 | acc_score = Metrics.acc_score(cluster_assignments, y, n_clusters=2)
44 | nmi_score = Metrics.nmi_score(cluster_assignments, y)
45 | print(f"ACC: {np.round(acc_score, 3)}")
46 | print(f"NMI: {np.round(nmi_score, 3)}")
47 | ```
48 |
49 | You can read the code docs for more information and additional functionality.
50 |
51 | #### Running examples
52 |
53 | To run the model on the two-moons or MNIST datasets, first `cd` into the `examples` folder and then run:
54 | `python3 cluster_twomoons.py`
55 | or
56 | `python3 cluster_mnist.py`
57 |
58 |
73 |
74 |
75 |
76 | ## Citation
77 |
78 | ```
79 |
80 | @inproceedings{shaham2018,
81 | author = {Uri Shaham and Kelly Stanton and Henry Li and Boaz Nadler and Ronen Basri and Yuval Kluger},
82 | title = {SpectralNet: Spectral Clustering Using Deep Neural Networks},
83 | booktitle = {Proc. ICLR 2018},
84 | year = {2018}
85 | }
86 |
87 | ```
88 |
--------------------------------------------------------------------------------
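
The package also exposes `SpectralReduction` for dimensionality reduction (exported from `spectralnet/__init__.py` and exercised in `examples/reduce_mnist.py` and `examples/reduce_twomoons.py` above). A minimal sketch mirroring those example scripts; the random `X` here is only a placeholder for a real dataset:

```python
import torch

from spectralnet import SpectralReduction

X = torch.randn(1000, 2)  # placeholder data; any (n_samples, n_features) torch.Tensor

spectralreduction = SpectralReduction(
    n_components=2,                  # target dimensionality
    should_use_ae=False,             # optionally pre-embed the data with an autoencoder
    should_use_siamese=False,        # optionally learn the affinity with a siamese net
    spectral_hiddens=[128, 128, 2],  # last layer matches n_components, as in the examples
)

X_new = spectralreduction.fit_transform(X)  # spectral embedding of X
```
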
/src/spectralnet/_models/_spectralnet_model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | import torch.nn as nn
4 |
5 |
6 | class SpectralNetModel(nn.Module):
7 | def __init__(self, architecture: list, input_dim: int):
8 | super(SpectralNetModel, self).__init__()
9 | self.architecture = architecture
10 | self.layers = nn.ModuleList()
11 | self.input_dim = input_dim
12 |
13 | current_dim = self.input_dim
14 | for i, layer in enumerate(self.architecture):
15 | next_dim = layer
16 | if i == len(self.architecture) - 1:
17 | self.layers.append(
18 | nn.Sequential(nn.Linear(current_dim, next_dim), nn.Tanh())
19 | )
20 | else:
21 | self.layers.append(
22 | nn.Sequential(nn.Linear(current_dim, next_dim), nn.LeakyReLU())
23 | )
24 | current_dim = next_dim
25 |
26 | def _make_orthonorm_weights(self, Y: torch.Tensor) -> torch.Tensor:
27 | """
28 | Orthonormalize the output of the network using the QR decomposition.
29 |
30 | Parameters
31 | ----------
32 | Y : torch.Tensor
33 | The output of the network.
34 |
35 | Returns
36 | -------
37 | torch.Tensor
38 | The orthonormalized output.
39 |
40 | Notes
41 | -----
42 | This function applies QR decomposition to orthonormalize the output (`Y`) of the network.
43 | The inverse of the R factor, scaled by sqrt(m), is returned as the orthonormalization weights.
44 | """
45 |
46 | m = Y.shape[0]
47 | _, R = torch.linalg.qr(Y)
48 | orthonorm_weights = np.sqrt(m) * torch.inverse(R)
49 | return orthonorm_weights
50 |
51 | def forward(
52 | self, x: torch.Tensor, should_update_orth_weights: bool = True
53 | ) -> torch.Tensor:
54 | """
55 | Perform the forward pass of the model.
56 |
57 | Parameters
58 | ----------
59 | x : torch.Tensor
60 | The input tensor.
61 | should_update_orth_weights : bool, optional
62 | Whether to update the orthonormalization weights using the QR decomposition or not.
63 |
64 | Returns
65 | -------
66 | torch.Tensor
67 | The output tensor.
68 |
69 | Notes
70 | -----
71 | This function takes an input tensor `x` and computes the forward pass of the model.
72 | If `should_update_orth_weights` is set to True, the orthonormalization weights are updated
73 | using the QR decomposition. The output tensor is returned.
74 | """
75 |
76 | for layer in self.layers:
77 | x = layer(x)
78 |
79 | Y_tilde = x
80 | if should_update_orth_weights:
81 | self.orthonorm_weights = self._make_orthonorm_weights(Y_tilde)
82 |
83 | Y = Y_tilde @ self.orthonorm_weights
84 | return Y
85 |
--------------------------------------------------------------------------------
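
A quick numerical check of the Notes in `_make_orthonorm_weights` above: if `Y = QR`, then `Y @ (sqrt(m) * R^{-1}) = sqrt(m) * Q`, so the rescaled output satisfies `(1/m) * Y^T Y = I`. A minimal standalone sketch, with random data standing in for the network output:

```python
import torch

torch.manual_seed(0)
m, k = 1024, 10
Y_tilde = torch.randn(m, k)  # stand-in for the raw network output

# Same computation as SpectralNetModel._make_orthonorm_weights:
_, R = torch.linalg.qr(Y_tilde)
orthonorm_weights = (m ** 0.5) * torch.inverse(R)

Y = Y_tilde @ orthonorm_weights
# The columns of Y are orthogonal with squared norm m, i.e. (1/m) * Y^T Y = I.
print(torch.allclose(Y.T @ Y / m, torch.eye(k), atol=1e-4))  # True
```
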
/src/spectralnet/_metrics.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import sklearn.metrics as metrics
3 |
4 | from munkres import Munkres
5 | from sklearn.metrics import normalized_mutual_info_score as nmi
6 |
7 | from spectralnet._utils import *
8 |
9 |
10 | class Metrics:
11 | @staticmethod
12 | def acc_score(
13 | cluster_assignments: np.ndarray, y: np.ndarray, n_clusters: int
14 | ) -> float:
15 | """
16 | Compute the accuracy score of the clustering algorithm.
17 |
18 | Parameters
19 | ----------
20 | cluster_assignments : np.ndarray
21 | Cluster assignments for each data point.
22 | y : np.ndarray
23 | Ground truth labels.
24 | n_clusters : int
25 | Number of clusters.
26 |
27 | Returns
28 | -------
29 | float
30 | The computed accuracy score.
31 |
32 | Notes
33 | -----
34 | This function takes the `cluster_assignments` which represent the assigned clusters for each data point,
35 | the ground truth labels `y`, and the number of clusters `n_clusters`. It computes the accuracy score of the
36 | clustering algorithm by comparing the cluster assignments with the ground truth labels. The accuracy score
37 | is returned as a floating-point value.
38 | """
39 |
40 | confusion_matrix = metrics.confusion_matrix(y, cluster_assignments, labels=None)
41 | cost_matrix = calculate_cost_matrix(confusion_matrix, n_clusters=n_clusters)
42 | indices = Munkres().compute(cost_matrix)
43 | kmeans_to_true_cluster_labels = get_cluster_labels_from_indices(indices)
44 | y_pred = kmeans_to_true_cluster_labels[cluster_assignments]
45 | print(metrics.confusion_matrix(y, y_pred))
46 | accuracy = np.mean(y_pred == y)
47 | return accuracy
48 |
49 | @staticmethod
50 | def nmi_score(cluster_assignments: np.ndarray, y: np.ndarray) -> float:
51 | """
52 | Compute the normalized mutual information score of the clustering algorithm.
53 |
54 | Parameters
55 | ----------
56 | cluster_assignments : np.ndarray
57 | Cluster assignments for each data point.
58 | y : np.ndarray
59 | Ground truth labels.
60 |
61 | Returns
62 | -------
63 | float
64 | The computed normalized mutual information score.
65 |
66 | Notes
67 | -----
68 | This function takes the `cluster_assignments` which represent the assigned clusters for each data point
69 | and the ground truth labels `y`. It computes the normalized mutual information (NMI) score of the clustering
70 | algorithm. NMI measures the mutual dependence between the cluster assignments and the ground truth labels,
71 | normalized by the entropy of both variables. The NMI score ranges between 0 and 1, where a higher score
72 | indicates a better clustering performance. The computed NMI score is returned as a floating-point value.
73 | """
74 | return nmi(cluster_assignments, y)
75 |
--------------------------------------------------------------------------------
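
`calculate_cost_matrix` and `get_cluster_labels_from_indices` come from `_utils`, which is not included in this section. For intuition, the optimal cluster-to-label matching behind `acc_score` can be reproduced with `scipy.optimize.linear_sum_assignment`; a self-contained sketch (not the package's implementation) that assumes cluster ids and labels both range over 0..k-1:

```python
import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import confusion_matrix


def clustering_accuracy(cluster_assignments: np.ndarray, y: np.ndarray) -> float:
    """Accuracy under the best one-to-one mapping of clusters to labels."""
    cm = confusion_matrix(y, cluster_assignments)
    # Maximize the number of correctly matched samples over all permutations.
    row_ind, col_ind = linear_sum_assignment(-cm)
    cluster_to_label = {col: row for row, col in zip(row_ind, col_ind)}
    y_pred = np.array([cluster_to_label[c] for c in cluster_assignments])
    return float(np.mean(y_pred == y))


# Clusters are labeled "backwards", but the matching recovers a perfect score:
print(clustering_accuracy(np.array([1, 1, 0, 0]), np.array([0, 0, 1, 1])))  # 1.0
```
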
/examples/data.py:
--------------------------------------------------------------------------------
1 | import h5py
2 | import torch
3 | import numpy as np
4 | import scipy.io
5 |
6 |
7 | from torch.utils.data import Dataset, Subset
8 | from sklearn.datasets import make_moons
9 | from torchvision import datasets, transforms
10 | from sklearn.preprocessing import StandardScaler
11 | from sklearn.model_selection import train_test_split
12 |
13 |
14 | def load_mnist() -> tuple:
15 | tensor_transform = transforms.Compose([transforms.ToTensor()])
16 | train_set = datasets.MNIST(
17 | root="../data", train=True, download=True, transform=tensor_transform
18 | )
19 | test_set = datasets.MNIST(
20 | root="../data", train=False, download=True, transform=tensor_transform
21 | )
22 |
23 | x_train, y_train = zip(*train_set)
24 | x_train, y_train = torch.cat(x_train), torch.Tensor(y_train)
25 | x_test, y_test = zip(*test_set)
26 | x_test, y_test = torch.cat(x_test), torch.Tensor(y_test)
27 |
28 | return x_train, y_train, x_test, y_test
29 |
30 |
31 | def load_twomoon() -> tuple:
32 | data, y = make_moons(n_samples=7000, shuffle=True, noise=0.075, random_state=None)
33 | scaler = StandardScaler()
34 | data = scaler.fit_transform(data)
35 | x_train, x_test, y_train, y_test = train_test_split(
36 | data, y, test_size=0.33, random_state=42
37 | )
38 | x_train, x_test = torch.Tensor(x_train), torch.Tensor(x_test)
39 | y_train, y_test = torch.Tensor(y_train), torch.Tensor(y_test)
40 | return x_train, y_train, x_test, y_test
41 |
42 |
43 | def load_reuters() -> tuple:
44 | with h5py.File("../data/Reuters/reutersidf_total.h5", "r") as f:
45 | x = np.asarray(f.get("data"), dtype="float32")
46 | y = np.asarray(f.get("labels"), dtype="float32")
47 |
48 | n_train = int(0.9 * len(x))
49 | x_train, x_test = x[:n_train], x[n_train:]
50 | y_train, y_test = y[:n_train], y[n_train:]
51 |
52 | x_train, x_test = torch.from_numpy(x_train), torch.from_numpy(x_test)
53 | y_train, y_test = torch.from_numpy(y_train), torch.from_numpy(y_test)
54 |
55 | return x_train, y_train, x_test, y_test
56 |
57 |
58 | def load_from_path(dpath: str, lpath: str = None) -> tuple:
59 | X = np.loadtxt(dpath, delimiter=",", dtype=np.float32)
60 | n_train = int(0.9 * len(X))
61 |
62 | x_train, x_test = X[:n_train], X[n_train:]
63 | x_train, x_test = torch.from_numpy(x_train), torch.from_numpy(x_test)
64 |
65 | if lpath is not None:
66 | y = np.loadtxt(lpath, delimiter=",", dtype=np.float32)
67 | y_train, y_test = y[:n_train], y[n_train:]
68 | y_train, y_test = torch.from_numpy(y_train), torch.from_numpy(y_test)
69 |
70 | else:
71 | y_train, y_test = None, None
72 |
73 | return x_train, y_train, x_test, y_test
74 |
75 |
76 | def load_data(dataset: str) -> tuple:
77 | """
78 | This function loads the specified dataset.
79 |
80 |
81 | Args:
82 | dataset (str or dict): One of "mnist", "twomoons" or "reuters", or,
83 | in case you want to load your own dataset, a dictionary with the path
84 | to the data file under the key "dpath" (and to the labels file under "lpath", if applicable).
85 |
86 | Raises:
87 | ValueError: If the dataset path cannot be resolved.
88 |
89 | Returns:
90 | tuple: A tuple containing the train and test data and labels.
91 | """
92 |
93 | if dataset == "mnist":
94 | x_train, y_train, x_test, y_test = load_mnist()
95 | elif dataset == "twomoons":
96 | x_train, y_train, x_test, y_test = load_twomoon()
97 | elif dataset == "reuters":
98 | x_train, y_train, x_test, y_test = load_reuters()
99 | else:
100 | try:
101 | data_path = dataset["dpath"]
102 | if "lpath" in dataset:
103 | label_path = dataset["lpath"]
104 | else:
105 | label_path = None
106 | except (KeyError, TypeError):
107 | raise ValueError("Could not find dataset path. Check your config file.")
108 | x_train, y_train, x_test, y_test = load_from_path(data_path, label_path)
109 |
110 | return x_train, x_test, y_train, y_test
111 |
--------------------------------------------------------------------------------
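
As the `load_data` docstring above describes, a custom dataset can be passed as a dictionary with a `dpath` key (and an optional `lpath`), which routes to `load_from_path`. A minimal sketch; the CSV file names are hypothetical:

```python
import numpy as np

from data import load_data

# Hypothetical comma-separated files: one row per sample / per label.
np.savetxt("my_data.csv", np.random.rand(100, 4), delimiter=",")
np.savetxt("my_labels.csv", np.random.randint(0, 2, size=100), delimiter=",")

x_train, x_test, y_train, y_test = load_data(
    {"dpath": "my_data.csv", "lpath": "my_labels.csv"}
)
print(x_train.shape, x_test.shape)  # 90/10 train/test split
```
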
/src/spectralnet/_trainers/_ae_trainer.py:
--------------------------------------------------------------------------------
1 | import os
2 | import torch
3 | import torch.nn as nn
4 | import torch.optim as optim
5 |
6 | from tqdm import trange
7 | from ._trainer import Trainer
8 | from .._models import AEModel
9 | from torch.utils.data import DataLoader, random_split
10 |
11 |
12 | class AETrainer:
13 | def __init__(self, config: dict, device: torch.device):
14 | self.device = device
15 | self.ae_config = config
16 | self.lr = self.ae_config["lr"]
17 | self.epochs = self.ae_config["epochs"]
18 | self.min_lr = self.ae_config["min_lr"]
19 | self.lr_decay = self.ae_config["lr_decay"]
20 | self.patience = self.ae_config["patience"]
21 | self.architecture = self.ae_config["hiddens"]
22 | self.batch_size = self.ae_config["batch_size"]
23 | self.weights_dir = "spectralnet/_trainers/weights"
24 | self.weights_path = "spectralnet/_trainers/weights/ae_weights.pth"
25 | if not os.path.exists(self.weights_dir):
26 | os.makedirs(self.weights_dir)
27 |
28 | def train(self, X: torch.Tensor) -> AEModel:
29 | self.X = X.view(X.size(0), -1)
30 | self.criterion = nn.MSELoss()
31 |
32 | self.ae_net = AEModel(self.architecture, input_dim=self.X.shape[1]).to(
33 | self.device
34 | )
35 |
36 | self.optimizer = optim.Adam(self.ae_net.parameters(), lr=self.lr)
37 |
38 | self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(
39 | self.optimizer, mode="min", factor=self.lr_decay, patience=self.patience
40 | )
41 |
42 | if os.path.exists(self.weights_path):
43 | self.ae_net.load_state_dict(torch.load(self.weights_path))
44 | return self.ae_net
45 |
46 | train_loader, valid_loader = self._get_data_loader()
47 |
48 | print("Training Autoencoder:")
49 | t = trange(self.epochs, leave=True)
50 | for epoch in t:
51 | train_loss = 0.0
52 | for batch_x in train_loader:
53 | batch_x = batch_x.to(self.device)
54 | batch_x = batch_x.view(batch_x.size(0), -1)
55 | self.optimizer.zero_grad()
56 | output = self.ae_net(batch_x)
57 | loss = self.criterion(output, batch_x)
58 | loss.backward()
59 | self.optimizer.step()
60 | train_loss += loss.item()
61 |
62 | train_loss /= len(train_loader)
63 | valid_loss = self.validate(valid_loader)
64 | self.scheduler.step(valid_loss)
65 | current_lr = self.optimizer.param_groups[0]["lr"]
66 |
67 | if current_lr <= self.min_lr:
68 | break
69 |
70 | t.set_description(
71 | "Train Loss: {:.7f}, Valid Loss: {:.7f}, LR: {:.6f}".format(
72 | train_loss, valid_loss, current_lr
73 | )
74 | )
75 | t.refresh()
76 |
77 | torch.save(self.ae_net.state_dict(), self.weights_path)
78 | return self.ae_net
79 |
80 | def validate(self, valid_loader: DataLoader) -> float:
81 | self.ae_net.eval()
82 | valid_loss = 0.0
83 | with torch.no_grad():
84 | for batch_x in valid_loader:
85 | batch_x = batch_x.to(self.device)
86 | batch_x = batch_x.view(batch_x.size(0), -1)
87 | output = self.ae_net(batch_x)
88 | loss = self.criterion(output, batch_x)
89 | valid_loss += loss.item()
90 | valid_loss /= len(valid_loader)
91 | return valid_loss
92 |
93 | def embed(self, X: torch.Tensor) -> torch.Tensor:
94 | print("Embedding data ...")
95 | self.ae_net.eval()
96 | with torch.no_grad():
97 | X = X.view(X.size(0), -1)
98 | encoded_data = self.ae_net.encode(X.to(self.device))
99 | return encoded_data
100 |
101 | def _get_data_loader(self) -> tuple:
102 | trainset_len = int(len(self.X) * 0.9)
103 | validset_len = len(self.X) - trainset_len
104 | trainset, validset = random_split(self.X, [trainset_len, validset_len])
105 | train_loader = DataLoader(
106 | trainset, batch_size=self.ae_config["batch_size"], shuffle=True
107 | )
108 | valid_loader = DataLoader(
109 | validset, batch_size=self.ae_config["batch_size"], shuffle=False
110 | )
111 | return train_loader, valid_loader
112 |
--------------------------------------------------------------------------------
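
`AETrainer` is driven by a plain config dictionary whose keys match what its `__init__` reads (`lr`, `epochs`, `min_lr`, `lr_decay`, `patience`, `hiddens`, `batch_size`); this is how `SpectralNet` and `SpectralReduction` invoke it internally. A standalone sketch with illustrative values taken from the defaults in `_cluster.py`:

```python
import torch

from spectralnet._trainers import AETrainer

ae_config = {
    "epochs": 40,
    "lr": 1e-3,
    "lr_decay": 0.1,
    "min_lr": 1e-7,
    "patience": 10,
    "hiddens": [512, 512, 2048, 10],  # last entry is the embedding size
    "batch_size": 256,
}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
trainer = AETrainer(config=ae_config, device=device)

X = torch.randn(2048, 784)    # placeholder data
ae_net = trainer.train(X)     # note: reuses cached weights if ae_weights.pth exists
embedding = trainer.embed(X)  # encoder output of shape (2048, 10)
```
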
/src/spectralnet/_trainers/_spectralnet_trainer.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.optim as optim
4 | from torch.utils.data import DataLoader, random_split, TensorDataset
5 | from sklearn.neighbors import kneighbors_graph
6 | from tqdm import trange
7 | from spectralnet._utils import *
8 | from ._trainer import Trainer
9 | from .._losses import SpectralNetLoss
10 | from .._models import SpectralNetModel
11 |
12 |
13 | class SpectralTrainer:
14 | def __init__(self, config: dict, device: torch.device, is_sparse: bool = False):
15 | """
16 | Initialize the SpectralNet model trainer.
17 |
18 | Parameters
19 | ----------
20 | config : dict
21 | The configuration dictionary.
22 | device : torch.device
23 | The device to use for training.
24 | is_sparse : bool, optional
25 | Whether the graph-laplacian obtained from a mini-batch is sparse or not.
26 | If True, the batch is constructed by taking 1/5 of the original random batch
27 | and adding 4 of its nearest neighbors to each sample. Defaults to False.
28 |
29 | Notes
30 | -----
31 | This class is responsible for training the SpectralNet model.
32 | The configuration dictionary (`config`) contains various settings for training.
33 | The device (`device`) specifies the device (CPU or GPU) to be used for training.
34 | The `is_sparse` flag is used to determine the construction of the batch when the graph Laplacian is sparse.
35 | """
36 |
37 | self.device = device
38 | self.is_sparse = is_sparse
39 | self.spectral_config = config
40 | self.lr = self.spectral_config["lr"]
41 | self.n_nbg = self.spectral_config["n_nbg"]
42 | self.min_lr = self.spectral_config["min_lr"]
43 | self.epochs = self.spectral_config["epochs"]
44 | self.scale_k = self.spectral_config["scale_k"]
45 | self.lr_decay = self.spectral_config["lr_decay"]
46 | self.patience = self.spectral_config["patience"]
47 | self.architecture = self.spectral_config["hiddens"]
48 | self.batch_size = self.spectral_config["batch_size"]
49 | self.is_local_scale = self.spectral_config["is_local_scale"]
50 |
51 | def train(
52 | self, X: torch.Tensor, y: torch.Tensor, siamese_net: nn.Module = None
53 | ) -> SpectralNetModel:
54 | """
55 | Train the SpectralNet model.
56 |
57 | Parameters
58 | ----------
59 | X : torch.Tensor
60 | The dataset to train on.
61 | y : torch.Tensor, optional
62 | The labels of the dataset in case there are any.
63 | siamese_net : nn.Module, optional
64 | The siamese network to use for computing the affinity matrix.
65 |
66 | Returns
67 | -------
68 | SpectralNetModel
69 | The trained SpectralNet model.
70 |
71 | Notes
72 | -----
73 | This function trains the SpectralNet model using the provided dataset (`X`) and labels (`y`).
74 | If labels are not provided (`y` is None), unsupervised training is performed.
75 | The siamese network (`siamese_net`) is an optional parameter used for computing the affinity matrix.
76 | The trained SpectralNet model is returned as the output.
77 | """
78 |
79 | self.X = X.view(X.size(0), -1)
80 | self.y = y
81 | self.counter = 0
82 | self.siamese_net = siamese_net
83 | self.criterion = SpectralNetLoss()
84 | self.spectral_net = SpectralNetModel(
85 | self.architecture, input_dim=self.X.shape[1]
86 | ).to(self.device)
87 |
88 | self.optimizer = optim.Adam(self.spectral_net.parameters(), lr=self.lr)
89 |
90 | self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(
91 | self.optimizer, mode="min", factor=self.lr_decay, patience=self.patience
92 | )
93 |
94 | train_loader, ortho_loader, valid_loader = self._get_data_loader()
95 |
96 | print("Training SpectralNet:")
97 | t = trange(self.epochs, leave=True)
98 | for epoch in t:
99 | train_loss = 0.0
100 | for (X_grad, _), (X_orth, _) in zip(train_loader, ortho_loader):
101 | X_grad = X_grad.to(device=self.device)
102 | X_grad = X_grad.view(X_grad.size(0), -1)
103 | X_orth = X_orth.to(device=self.device)
104 | X_orth = X_orth.view(X_orth.size(0), -1)
105 |
106 | if self.is_sparse:
107 | X_grad = make_batch_for_sparse_grapsh(X_grad)
108 | X_orth = make_batch_for_sparse_grapsh(X_orth)
109 |
110 | # Orthogonalization step
111 | self.spectral_net.eval()
112 | self.spectral_net(X_orth, should_update_orth_weights=True)
113 |
114 | # Gradient step
115 | self.spectral_net.train()
116 | self.optimizer.zero_grad()
117 |
118 | Y = self.spectral_net(X_grad, should_update_orth_weights=False)
119 | if self.siamese_net is not None:
120 | with torch.no_grad():
121 | X_grad = self.siamese_net.forward_once(X_grad)
122 |
123 | W = self._get_affinity_matrix(X_grad)
124 |
125 | loss = self.criterion(W, Y)
126 | loss.backward()
127 | self.optimizer.step()
128 | train_loss += loss.item()
129 |
130 | train_loss /= len(train_loader)
131 |
132 | # Validation step
133 | valid_loss = self.validate(valid_loader)
134 | self.scheduler.step(valid_loss)
135 |
136 | current_lr = self.optimizer.param_groups[0]["lr"]
137 | if current_lr <= self.spectral_config["min_lr"]:
138 | break
139 | t.set_description(
140 | "Train Loss: {:.7f}, Valid Loss: {:.7f}, LR: {:.6f}".format(
141 | train_loss, valid_loss, current_lr
142 | )
143 | )
144 | t.refresh()
145 |
146 | return self.spectral_net
147 |
148 | def validate(self, valid_loader: DataLoader) -> float:
149 | valid_loss = 0.0
150 | self.spectral_net.eval()
151 | with torch.no_grad():
152 | for batch in valid_loader:
153 | X, y = batch
154 | X, y = X.to(self.device), y.to(self.device)
155 |
156 | if self.is_sparse:
157 | X = make_batch_for_sparse_grapsh(X)
158 |
159 | Y = self.spectral_net(X, should_update_orth_weights=False)
160 | with torch.no_grad():
161 | if self.siamese_net is not None:
162 | X = self.siamese_net.forward_once(X)
163 |
164 | W = self._get_affinity_matrix(X)
165 |
166 | loss = self.criterion(W, Y)
167 | valid_loss += loss.item()
168 |
169 | valid_loss /= len(valid_loader)
170 | return valid_loss
171 |
172 | def _get_affinity_matrix(self, X: torch.Tensor) -> torch.Tensor:
173 | """
174 | This function computes the affinity matrix W using the Gaussian kernel.
175 |
176 | Args:
177 | X (torch.Tensor): The input data
178 |
179 | Returns:
180 | torch.Tensor: The affinity matrix W
181 | """
182 |
183 | is_local = self.is_local_scale
184 | n_neighbors = self.n_nbg
185 | scale_k = self.scale_k
186 | Dx = torch.cdist(X, X)
187 | Dis, indices = get_nearest_neighbors(X, k=n_neighbors + 1)
188 | scale = compute_scale(Dis, k=scale_k, is_local=is_local)
189 | W = get_gaussian_kernel(
190 | Dx, scale, indices, device=self.device, is_local=is_local
191 | )
192 | return W
193 |
194 | def _get_data_loader(self) -> tuple:
195 | """
196 | This function returns the data loaders for the gradient step, the orthogonalization step and validation.
197 |
198 | Returns:
199 | tuple: The data loaders
200 | """
201 | if self.y is None:
202 | self.y = torch.zeros(len(self.X))
203 | train_size = int(0.9 * len(self.X))
204 | valid_size = len(self.X) - train_size
205 | dataset = TensorDataset(self.X, self.y)
206 | train_dataset, valid_dataset = random_split(dataset, [train_size, valid_size])
207 | train_loader = DataLoader(
208 | train_dataset, batch_size=self.batch_size, shuffle=True
209 | )
210 | ortho_loader = DataLoader(
211 | train_dataset, batch_size=self.batch_size, shuffle=True
212 | )
213 | valid_loader = DataLoader(
214 | valid_dataset, batch_size=self.batch_size, shuffle=False
215 | )
216 | return train_loader, ortho_loader, valid_loader
217 |
--------------------------------------------------------------------------------
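
`_get_affinity_matrix` above delegates to `get_nearest_neighbors`, `compute_scale` and `get_gaussian_kernel` from `_utils`, which are not shown in this section. Conceptually, it evaluates a Gaussian (RBF) kernel on pairwise distances and keeps only entries between nearest neighbors. A self-contained sketch of that idea, using a single global scale rather than the exact `_utils` logic:

```python
import torch


def gaussian_affinity(X: torch.Tensor, n_neighbors: int = 30, scale_k: int = 15) -> torch.Tensor:
    """Symmetrized k-NN Gaussian affinity with a global scale (illustrative only)."""
    Dx = torch.cdist(X, X)  # pairwise Euclidean distances
    knn_dist, knn_idx = Dx.topk(n_neighbors + 1, largest=False)  # includes self
    scale = knn_dist[:, scale_k].median()  # median distance to the scale_k-th neighbor
    W = torch.exp(-Dx.pow(2) / (2 * scale**2))
    mask = torch.zeros_like(W)
    mask.scatter_(1, knn_idx, 1.0)  # zero out everything outside the k-NN graph
    W = W * mask
    return (W + W.T) / 2  # symmetrize


W = gaussian_affinity(torch.randn(256, 2))
print(W.shape)  # torch.Size([256, 256])
```
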
/src/spectralnet/_trainers/_siamesenet_trainer.py:
--------------------------------------------------------------------------------
1 | import os
2 | import torch
3 | import numpy as np
4 | import torch.optim as optim
5 |
6 | from tqdm import trange
7 | from annoy import AnnoyIndex
8 | from sklearn.neighbors import NearestNeighbors
9 | from torch.utils.data import DataLoader, random_split
10 |
11 | from ._trainer import Trainer
12 | from .._models import SiameseNetModel
13 | from .._losses import ContrastiveLoss
14 |
15 |
16 | class SiameseDataset:
17 | def __init__(self, pairs: list):
18 | """
19 | Initializes a Siamese dataset.
20 |
21 | Parameters
22 | ----------
23 | pairs : list
24 | A list of tuples containing the pairs of data
25 | and their labels.
26 | """
27 | self.pairs = pairs
28 |
29 | def __getitem__(self, index: int):
30 | x1 = self.pairs[index][0]
31 | x2 = self.pairs[index][1]
32 | label = self.pairs[index][2]
33 | return x1, x2, label
34 |
35 | def __len__(self):
36 | return len(self.pairs)
37 |
38 |
39 | class SiameseTrainer:
40 | def __init__(self, config: dict, device: torch.device):
41 | self.device = device
42 | self.siamese_config = config
43 | self.lr = self.siamese_config["lr"]
44 | self.n_nbg = self.siamese_config["n_nbg"]
45 | self.min_lr = self.siamese_config["min_lr"]
46 | self.epochs = self.siamese_config["epochs"]
47 | self.lr_decay = self.siamese_config["lr_decay"]
48 | self.patience = self.siamese_config["patience"]
49 | self.architecture = self.siamese_config["hiddens"]
50 | self.batch_size = self.siamese_config["batch_size"]
51 | self.use_approx = self.siamese_config["use_approx"]
52 | self.weights_path = "spectralnet/_trainers/weights/siamese_weights.pth"
53 |
54 | def train(self, X: torch.Tensor) -> SiameseNetModel:
55 | self.X = X.view(X.size(0), -1)
56 | # self.X = X
57 |
58 | self.criterion = ContrastiveLoss()
59 | self.siamese_net = SiameseNetModel(
60 | self.architecture, input_dim=self.X.shape[1]
61 | ).to(self.device)
62 |
63 | self.optimizer = optim.Adam(self.siamese_net.parameters(), lr=self.lr)
64 |
65 | self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(
66 | self.optimizer, mode="min", factor=self.lr_decay, patience=self.patience
67 | )
68 |
69 | if os.path.exists(self.weights_path):
70 | self.siamese_net.load_state_dict(torch.load(self.weights_path))
71 | return self.siamese_net
72 |
73 | train_loader, valid_loader = self._get_data_loader()
74 |
75 | print("Training Siamese Network:")
76 | t = trange(self.epochs, leave=True)
77 | self.siamese_net.train()
78 | for epoch in t:
79 | train_loss = 0.0
80 | for x1, x2, label in train_loader:
81 | x1 = x1.to(self.device)
82 | x1 = x1.view(x1.size(0), -1)
83 | x2 = x2.to(self.device)
84 | x2 = x2.view(x2.size(0), -1)
85 | label = label.to(self.device)
86 | self.optimizer.zero_grad()
87 | output1, output2 = self.siamese_net(x1, x2)
88 | loss = self.criterion(output1, output2, label)
89 | loss.backward()
90 | self.optimizer.step()
91 | train_loss += loss.item()
92 |
93 | train_loss /= len(train_loader)
94 | valid_loss = self.validate(valid_loader)
95 | self.scheduler.step(valid_loss)
96 | current_lr = self.optimizer.param_groups[0]["lr"]
97 |
98 | if current_lr <= self.min_lr:
99 | break
100 | t.set_description(
101 | "Train Loss: {:.7f}, Valid Loss: {:.7f}, LR: {:.6f}".format(
102 | train_loss, valid_loss, current_lr
103 | )
104 | )
105 | t.refresh()
106 |
107 | torch.save(self.siamese_net.state_dict(), self.weights_path)
108 | return self.siamese_net
109 |
110 | def validate(self, valid_loader: DataLoader) -> float:
111 | valid_loss = 0.0
112 | self.siamese_net.eval()
113 | with torch.no_grad():
114 | for x1, x2, label in valid_loader:
115 | x1 = x1.to(self.device)
116 | x1 = x1.view(x1.size(0), -1)
117 | x2 = x2.to(self.device)
118 | x2 = x2.view(x2.size(0), -1)
119 | label = label.to(self.device)
120 | output1, output2 = self.siamese_net(x1, x2)
121 | loss = self.criterion(output1, output2, label)
122 | valid_loss += loss.item()
123 | valid_loss /= len(valid_loader)
124 | return valid_loss
125 |
126 | def _get_knn_pairs(self) -> list:
127 | """Gets the pairs of data points to be used for training the siamese network.
128 |
129 | Parameters
130 | ----------
131 | None
132 |
133 | Returns
134 | -------
135 | list
136 | A list of pairs of data points.
137 |
138 | Notes
139 | -----
140 | The pairs are chosen such that each data point has n_neighbors positive pairs
141 | and n_neighbors negative pairs where the neighbors are chosen using KNN.
142 | """
143 |
144 | pairs = []
145 | X = self.X.detach().cpu().numpy()
146 | data_indices = np.arange(len(X))
147 | n_neighbors = self.n_nbg
148 | nbrs = NearestNeighbors(n_neighbors=n_neighbors + 1, algorithm="ball_tree").fit(
149 | X
150 | )
151 | _, neighbors_indices = nbrs.kneighbors(X)
152 |
153 | for i in range(len(X)):
154 | non_neighbors_indices = np.delete(data_indices, neighbors_indices[i])
155 | non_neighbors_random_chosen_indices = np.random.choice(
156 | non_neighbors_indices, n_neighbors
157 | )
158 |
159 | positive_pairs = [
160 | [self.X[i], self.X[n], 1]
161 | for n in neighbors_indices[i][1 : n_neighbors + 1]
162 | ]
163 | negative_pairs = [
164 | [self.X[i], self.X[n], 0] for n in non_neighbors_random_chosen_indices
165 | ]
166 |
167 | pairs += positive_pairs
168 | pairs += negative_pairs
169 |
170 | return pairs
171 |
172 | def _get_approx_nn_pairs(self) -> list:
173 | """Gets the pairs of data points to be used for training the siamese network.
174 |
175 | Parameters
176 | ----------
177 | None
178 |
179 | Returns
180 | -------
181 | list
182 | A list of pairs of data points.
183 |
184 | Notes
185 | -----
186 | The pairs are chosen such that each data point has 1 neighbor from its nearest n_neighbors
187 | neighbors and 1 neighbor from the rest of the data points. The neighbors are chosen using
188 | approximate nearest neighbors using the Annoy library.
189 | """
190 |
191 | pairs = []
192 | n_samples = self.X.shape[0]
193 | n_neighbors = self.n_nbg
194 | indices = torch.randperm(self.X.shape[0])[:n_samples]
195 | x_train = self.X[indices]
196 | X_numpy = self.X[indices].detach().cpu().numpy()
197 | data_indices = np.arange(len(x_train))
198 |
199 | ann = AnnoyIndex(X_numpy.shape[1], "euclidean")
200 | for i, x_ in enumerate(X_numpy):
201 | ann.add_item(i, x_)
202 | ann.build(50)
203 |
204 | neighbors_indices = np.empty((len(X_numpy), n_neighbors + 1))
205 | for i in range(len(X_numpy)):
206 | nn_i = ann.get_nns_by_item(i, n_neighbors + 1, include_distances=False)
207 | neighbors_indices[i, :] = np.array(nn_i)
208 | neighbors_indices = neighbors_indices.astype(int)
209 |
210 | print("Building dataset for the siamese network ...")
211 | for i in range(len(X_numpy)):
212 | non_neighbors_indices = np.delete(data_indices, neighbors_indices[i])
213 |
214 | neighbor_idx = np.random.choice(neighbors_indices[i][1:], 1)
215 | non_nbr_idx = np.random.choice(non_neighbors_indices, 1)
216 |
217 | positive_pairs = [[x_train[i], x_train[neighbor_idx], 1]]
218 | negative_pairs = [[x_train[i], x_train[non_nbr_idx], 0]]
219 |
220 | pairs += positive_pairs
221 | pairs += negative_pairs
222 |
223 | return pairs
224 |
225 | def _get_pairs(self) -> list:
226 | """Gets the pairs of data points to be used for training the siamese network.
227 |
228 | Parameters
229 | ----------
230 | None
231 |
232 | Returns
233 | -------
234 | list
235 | A list of pairs of data points.
236 |
237 | Notes
238 | -----
239 | This method internally calls either _get_knn_pairs() or _get_approx_nn_pairs() based on the value
240 | of the 'use_approx' attribute.
241 | """
242 |
243 | should_use_approx = self.use_approx
244 | if should_use_approx:
245 | return self._get_approx_nn_pairs()
246 | else:
247 | return self._get_knn_pairs()
248 |
249 | def _get_data_loader(self) -> tuple:
250 | """
251 | Splits the data into train and validation sets and returns the corresponding data loaders.
252 |
253 | Parameters
254 | ----------
255 | None
256 |
257 | Returns
258 | -------
259 | tuple
260 | A tuple containing the train and validation data loaders.
261 |
262 | Notes
263 | -----
264 | This function splits the data into train and validation sets and creates data loaders for them.
265 | The train and validation sets are obtained by randomly splitting the siamese dataset.
266 | The train and validation data loaders are created using DataLoader from the PyTorch library.
267 | """
268 |
269 | pairs = self._get_pairs()
270 | siamese_dataset = SiameseDataset(pairs)
271 | siamese_trainset_len = int(len(siamese_dataset) * 0.9)
272 | siamese_validset_len = len(siamese_dataset) - siamese_trainset_len
273 | siamese_trainset, siamese_validset = random_split(
274 | siamese_dataset, [siamese_trainset_len, siamese_validset_len]
275 | )
276 | siamese_trainloader = DataLoader(
277 | siamese_trainset, batch_size=self.siamese_config["batch_size"], shuffle=True
278 | )
279 | siamese_validloader = DataLoader(
280 | siamese_validset,
281 | batch_size=self.siamese_config["batch_size"],
282 | shuffle=False,
283 | )
284 | return siamese_trainloader, siamese_validloader
285 |
--------------------------------------------------------------------------------
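
Like `AETrainer`, `SiameseTrainer` takes a config dictionary; the keys below are the ones its `__init__` reads, with values mirroring the defaults in `_cluster.py`. A minimal standalone sketch on placeholder data:

```python
import torch

from spectralnet._trainers import SiameseTrainer

siamese_config = {
    "epochs": 30,
    "lr": 1e-3,
    "lr_decay": 0.1,
    "min_lr": 1e-7,
    "patience": 10,
    "n_nbg": 2,           # nearest neighbors used to form positive pairs
    "use_approx": False,  # True -> Annoy-based approximate nearest neighbors
    "hiddens": [1024, 1024, 512, 10],
    "batch_size": 128,
}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
trainer = SiameseTrainer(config=siamese_config, device=device)

X = torch.randn(1024, 784)      # placeholder data
siamese_net = trainer.train(X)  # builds KNN pairs, then trains with ContrastiveLoss
with torch.no_grad():
    embeddings = siamese_net.forward_once(X.to(device))
```
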
/src/spectralnet/_cluster.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 |
4 | from ._utils import *
5 | from sklearn.cluster import KMeans
6 | from ._trainers import SpectralTrainer, SiameseTrainer, AETrainer
7 |
8 |
9 | class SpectralNet:
10 | def __init__(
11 | self,
12 | n_clusters: int,
13 | should_use_ae: bool = False,
14 | should_use_siamese: bool = False,
15 | is_sparse_graph: bool = False,
16 | ae_hiddens: list = [512, 512, 2048, 10],
17 | ae_epochs: int = 40,
18 | ae_lr: float = 1e-3,
19 | ae_lr_decay: float = 0.1,
20 | ae_min_lr: float = 1e-7,
21 | ae_patience: int = 10,
22 | ae_batch_size: int = 256,
23 | siamese_hiddens: list = [1024, 1024, 512, 10],
24 | siamese_epochs: int = 30,
25 | siamese_lr: float = 1e-3,
26 | siamese_lr_decay: float = 0.1,
27 | siamese_min_lr: float = 1e-7,
28 | siamese_patience: int = 10,
29 | siamese_n_nbg: int = 2,
30 | siamese_use_approx: bool = False,
31 | siamese_batch_size: int = 128,
32 | spectral_hiddens: list = [1024, 1024, 512, 10],
33 | spectral_epochs: int = 30,
34 | spectral_lr: float = 1e-3,
35 | spectral_lr_decay: float = 0.1,
36 | spectral_min_lr: float = 1e-8,
37 | spectral_patience: int = 10,
38 | spectral_batch_size: int = 1024,
39 | spectral_n_nbg: int = 30,
40 | spectral_scale_k: int = 15,
41 | spectral_is_local_scale: bool = True,
42 | ):
43 | """SpectralNet is a class for implementing a Deep learning model that performs spectral clustering.
44 | This model optionally utilizes Autoencoders (AE) and Siamese networks for training.
45 |
46 | Parameters
47 | ----------
48 | n_clusters : int
49 | The number of clusters to be generated by the SpectralNet algorithm.
50 | Also used as the dimension of the projection subspace.
51 |
52 | should_use_ae : bool, optional (default=False)
53 | Specifies whether to use the Autoencoder (AE) network as part of the training process.
54 |
55 | should_use_siamese : bool, optional (default=False)
56 | Specifies whether to use the Siamese network as part of the training process.
57 |
58 | is_sparse_graph : bool, optional (default=False)
59 | Specifies whether the graph Laplacian created from the data is sparse.
60 |
61 | ae_hiddens : list, optional (default=[512, 512, 2048, 10])
62 | The number of hidden units in each layer of the Autoencoder network.
63 |
64 | ae_epochs : int, optional (default=40)
65 | The number of epochs to train the Autoencoder network.
66 |
67 | ae_lr : float, optional (default=1e-3)
68 | The learning rate for the Autoencoder network.
69 |
70 | ae_lr_decay : float, optional (default=0.1)
71 | The learning rate decay factor for the Autoencoder network.
72 |
73 | ae_min_lr : float, optional (default=1e-7)
74 | The minimum learning rate for the Autoencoder network.
75 |
76 | ae_patience : int, optional (default=10)
77 | The number of epochs to wait before reducing the learning rate for the Autoencoder network.
78 |
79 | ae_batch_size : int, optional (default=256)
80 | The batch size used during training of the Autoencoder network.
81 |
82 | siamese_hiddens : list, optional (default=[1024, 1024, 512, 10])
83 | The number of hidden units in each layer of the Siamese network.
84 |
85 | siamese_epochs : int, optional (default=30)
86 | The number of epochs to train the Siamese network.
87 |
88 | siamese_lr : float, optional (default=1e-3)
89 | The learning rate for the Siamese network.
90 |
91 | siamese_lr_decay : float, optional (default=0.1)
92 | The learning rate decay factor for the Siamese network.
93 |
94 | siamese_min_lr : float, optional (default=1e-7)
95 | The minimum learning rate for the Siamese network.
96 |
97 | siamese_patience : int, optional (default=10)
98 | The number of epochs to wait before reducing the learning rate for the Siamese network.
99 |
100 | siamese_n_nbg : int, optional (default=2)
101 | The number of nearest neighbors to consider as 'positive' pairs by the Siamese network.
102 |
103 | siamese_use_approx : bool, optional (default=False)
104 | Specifies whether to use Annoy instead of KNN for computing nearest neighbors,
105 | particularly useful for large datasets.
106 |
107 | siamese_batch_size : int, optional (default=128)
108 | The batch size used during training of the Siamese network.
109 |
110 | spectral_hiddens : list, optional (default=[1024, 1024, 512, 10])
111 | The number of hidden units in each layer of the Spectral network.
112 |
113 | spectral_epochs : int, optional (default=30)
114 | The number of epochs to train the Spectral network.
115 |
116 | spectral_lr : float, optional (default=1e-3)
117 | The learning rate for the Spectral network.
118 |
119 | spectral_lr_decay : float, optional (default=0.1)
120 | The learning rate decay factor for the Spectral network. The remaining spectral_* parameters (min_lr, patience, batch_size) mirror their AE/Siamese counterparts; spectral_n_nbg sets the number of neighbors used for the affinity matrix, and spectral_scale_k / spectral_is_local_scale control the scale of the Gaussian affinities."""
121 |
122 | self.n_clusters = n_clusters
123 | self.should_use_ae = should_use_ae
124 | self.should_use_siamese = should_use_siamese
125 | self.is_sparse_graph = is_sparse_graph
126 | self.ae_hiddens = ae_hiddens
127 | self.ae_epochs = ae_epochs
128 | self.ae_lr = ae_lr
129 | self.ae_lr_decay = ae_lr_decay
130 | self.ae_min_lr = ae_min_lr
131 | self.ae_patience = ae_patience
132 | self.ae_batch_size = ae_batch_size
133 | self.siamese_hiddens = siamese_hiddens
134 | self.siamese_epochs = siamese_epochs
135 | self.siamese_lr = siamese_lr
136 | self.siamese_lr_decay = siamese_lr_decay
137 | self.siamese_min_lr = siamese_min_lr
138 | self.siamese_patience = siamese_patience
139 | self.siamese_n_nbg = siamese_n_nbg
140 | self.siamese_use_approx = siamese_use_approx
141 | self.siamese_batch_size = siamese_batch_size
142 | self.spectral_hiddens = spectral_hiddens
143 | self.spectral_epochs = spectral_epochs
144 | self.spectral_lr = spectral_lr
145 | self.spectral_lr_decay = spectral_lr_decay
146 | self.spectral_min_lr = spectral_min_lr
147 | self.spectral_patience = spectral_patience
148 | self.spectral_n_nbg = spectral_n_nbg
149 | self.spectral_scale_k = spectral_scale_k
150 | self.spectral_is_local_scale = spectral_is_local_scale
151 | self.spectral_batch_size = spectral_batch_size
152 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
153 |
154 | self._validate_spectral_hiddens()
155 |
156 | def _validate_spectral_hiddens(self):
157 | """Validates the number of hidden units in each layer of the Spectral network."""
158 |
159 | if self.spectral_hiddens[-1] != self.n_clusters:
160 | raise ValueError(
161 | "The number of units in the last layer of spectral_hiddens must equal n_clusters (the number of clusters or components)."
162 | )
163 |
164 | def fit(self, X: torch.Tensor, y: torch.Tensor = None):
165 | """Performs the main training loop for the SpectralNet model.
166 |
167 | Parameters
168 | ----------
169 | X : torch.Tensor
170 | Data to train the networks on.
171 |
172 | y : torch.Tensor, optional
173 | Labels in case there are any. Defaults to None.
174 | """
175 | self._X = X
176 | ae_config = {
177 | "hiddens": self.ae_hiddens,
178 | "epochs": self.ae_epochs,
179 | "lr": self.ae_lr,
180 | "lr_decay": self.ae_lr_decay,
181 | "min_lr": self.ae_min_lr,
182 | "patience": self.ae_patience,
183 | "batch_size": self.ae_batch_size,
184 | }
185 |
186 | siamese_config = {
187 | "hiddens": self.siamese_hiddens,
188 | "epochs": self.siamese_epochs,
189 | "lr": self.siamese_lr,
190 | "lr_decay": self.siamese_lr_decay,
191 | "min_lr": self.siamese_min_lr,
192 | "patience": self.siamese_patience,
193 | "n_nbg": self.siamese_n_nbg,
194 | "use_approx": self.siamese_use_approx,
195 | "batch_size": self.siamese_batch_size,
196 | }
197 |
198 | spectral_config = {
199 | "hiddens": self.spectral_hiddens,
200 | "epochs": self.spectral_epochs,
201 | "lr": self.spectral_lr,
202 | "lr_decay": self.spectral_lr_decay,
203 | "min_lr": self.spectral_min_lr,
204 | "patience": self.spectral_patience,
205 | "n_nbg": self.spectral_n_nbg,
206 | "scale_k": self.spectral_scale_k,
207 | "is_local_scale": self.spectral_is_local_scale,
208 | "batch_size": self.spectral_batch_size,
209 | }
210 |
211 | if self.should_use_ae:
212 | self.ae_trainer = AETrainer(config=ae_config, device=self.device)
213 | self.ae_net = self.ae_trainer.train(X)
214 | X = self.ae_trainer.embed(X)
215 |
216 | if self.should_use_siamese:
217 | self.siamese_trainer = SiameseTrainer(
218 | config=siamese_config, device=self.device
219 | )
220 | self.siamese_net = self.siamese_trainer.train(X)
221 | else:
222 | self.siamese_net = None
223 |
224 | is_sparse = self.is_sparse_graph
225 | if is_sparse:
226 | build_ann(X)
227 |
228 | self.spectral_trainer = SpectralTrainer(
229 | config=spectral_config, device=self.device, is_sparse=is_sparse
230 | )
231 | self.spec_net = self.spectral_trainer.train(X, y, self.siamese_net)
232 |
233 | def predict(self, X: torch.Tensor) -> np.ndarray:
234 | """Predicts the cluster assignments for the given data.
235 |
236 | Parameters
237 | ----------
238 | X : torch.Tensor
239 | Data to be clustered.
240 |
241 | Returns
242 | -------
243 | np.ndarray
244 | The cluster assignments for the given data.
245 | """
246 | X = X.view(X.size(0), -1)
247 | X = X.to(self.device)
248 |
249 | with torch.no_grad():
250 | if self.should_use_ae:
251 | X = self.ae_net.encode(X)
252 | self.embeddings_ = self.spec_net(X, should_update_orth_weights=False)
253 | self.embeddings_ = self.embeddings_.detach().cpu().numpy()
254 |
255 | cluster_assignments = self._get_clusters_by_kmeans(self.embeddings_)
256 | return cluster_assignments
257 |
258 | def get_random_batch(self, batch_size: int = 1024) -> tuple:
259 | """Get a batch of the input data.
260 |
261 | Parameters
262 | ----------
263 | batch_size : int
264 | The size of the batch to use.
265 |
266 | Returns
267 | -------
268 | tuple
269 | The raw batch and the encoded batch.
270 |
271 | """
272 | permuted_indices = torch.randperm(self._X.size(0))[:batch_size]  # sample batch_size random rows, not just a shuffle of the first batch_size
273 | X_raw = self._X.view(self._X.size(0), -1)
274 | X_encoded = X_raw
275 |
276 | if self.should_use_ae:
277 | X_encoded = self.ae_trainer.embed(self._X)
278 |
279 | if self.should_use_siamese:
280 | X_encoded = self.siamese_net.forward_once(X_encoded)
281 |
282 | X_encoded = X_encoded[permuted_indices]
283 | X_raw = X_raw[permuted_indices]
284 | X_encoded = X_encoded.to(self.device)
285 | return X_raw, X_encoded
286 |
287 | def _get_clusters_by_kmeans(self, embeddings: np.ndarray) -> np.ndarray:
288 | """Performs k-means clustering on the spectral-embedding space.
289 |
290 | Parameters
291 | ----------
292 | embeddings : np.ndarray
293 | The spectral-embedding space.
294 |
295 | Returns
296 | -------
297 | np.ndarray
298 | The cluster assignments for the given data.
299 | """
300 |
301 | kmeans = KMeans(n_clusters=self.n_clusters, n_init=10).fit(embeddings)
302 | cluster_assignments = kmeans.predict(embeddings)
303 | return cluster_assignments
304 |
--------------------------------------------------------------------------------
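A minimal usage sketch for the clustering API defined above (illustrative only: it assumes the package exposes SpectralNet at the top level, and uses untuned toy hyperparameters on scikit-learn's two-moons data):

    import torch
    from sklearn.datasets import make_moons
    from spectralnet import SpectralNet  # assumed top-level export

    X, _ = make_moons(n_samples=1000, noise=0.075)
    X = torch.from_numpy(X).float()

    # The last entry of spectral_hiddens must equal n_clusters
    # (enforced by _validate_spectral_hiddens).
    model = SpectralNet(n_clusters=2, spectral_hiddens=[128, 128, 2])
    model.fit(X)                    # trains the (optional AE/Siamese and) spectral networks
    assignments = model.predict(X)  # k-means on the learned spectral embedding
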
/src/spectralnet/_reduction.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 |
5 | from ._utils import *
6 | from ._cluster import SpectralNet
7 | from sklearn.cluster import KMeans
8 | from ._metrics import Metrics
9 |
10 |
11 | class SpectralReduction:
12 | def __init__(
13 | self,
14 | n_components: int,
15 | should_use_ae: bool = False,
16 | should_use_siamese: bool = False,
17 | is_sparse_graph: bool = False,
18 | ae_hiddens: list = [512, 512, 2048, 10],
19 | ae_epochs: int = 40,
20 | ae_lr: float = 1e-3,
21 | ae_lr_decay: float = 0.1,
22 | ae_min_lr: float = 1e-7,
23 | ae_patience: int = 10,
24 | ae_batch_size: int = 256,
25 | siamese_hiddens: list = [1024, 1024, 512, 10],
26 | siamese_epochs: int = 30,
27 | siamese_lr: float = 1e-3,
28 | siamese_lr_decay: float = 0.1,
29 | siamese_min_lr: float = 1e-7,
30 | siamese_patience: int = 10,
31 | siamese_n_nbg: int = 2,
32 | siamese_use_approx: bool = False,
33 | siamese_batch_size: int = 128,
34 | spectral_hiddens: list = [1024, 1024, 512, 10],
35 | spectral_epochs: int = 30,
36 | spectral_lr: float = 1e-3,
37 | spectral_lr_decay: float = 0.1,
38 | spectral_min_lr: float = 1e-8,
39 | spectral_patience: int = 10,
40 | spectral_batch_size: int = 1024,
41 | spectral_n_nbg: int = 30,
42 | spectral_scale_k: int = 15,
43 | spectral_is_local_scale: bool = True,
44 | ):
45 | """SpectralReduction is a class for dimensionality reduction: it embeds the input data into the leading eigenspace of its graph Laplacian using the SpectralNet model.
46 | This model optionally utilizes Autoencoders (AE) and Siamese networks for training.
47 |
48 | Parameters
49 | ----------
50 | n_components : int
51 | The number of components to keep.
52 |
53 | should_use_ae : bool, optional (default=False)
54 | Specifies whether to use the Autoencoder (AE) network as part of the training process.
55 |
56 | should_use_siamese : bool, optional (default=False)
57 | Specifies whether to use the Siamese network as part of the training process.
58 |
59 | is_sparse_graph : bool, optional (default=False)
60 | Specifies whether the graph Laplacian created from the data is sparse.
61 |
62 | ae_hiddens : list, optional (default=[512, 512, 2048, 10])
63 | The number of hidden units in each layer of the Autoencoder network.
64 |
65 | ae_epochs : int, optional (default=40)
66 | The number of epochs to train the Autoencoder network.
67 |
68 | ae_lr : float, optional (default=1e-3)
69 | The learning rate for the Autoencoder network.
70 |
71 | ae_lr_decay : float, optional (default=0.1)
72 | The learning rate decay factor for the Autoencoder network.
73 |
74 | ae_min_lr : float, optional (default=1e-7)
75 | The minimum learning rate for the Autoencoder network.
76 |
77 | ae_patience : int, optional (default=10)
78 | The number of epochs to wait before reducing the learning rate for the Autoencoder network.
79 |
80 | ae_batch_size : int, optional (default=256)
81 | The batch size used during training of the Autoencoder network.
82 |
83 | siamese_hiddens : list, optional (default=[1024, 1024, 512, 10])
84 | The number of hidden units in each layer of the Siamese network.
85 |
86 | siamese_epochs : int, optional (default=30)
87 | The number of epochs to train the Siamese network.
88 |
89 | siamese_lr : float, optional (default=1e-3)
90 | The learning rate for the Siamese network.
91 |
92 | siamese_lr_decay : float, optional (default=0.1)
93 | The learning rate decay factor for the Siamese network.
94 |
95 | siamese_min_lr : float, optional (default=1e-7)
96 | The minimum learning rate for the Siamese network.
97 |
98 | siamese_patience : int, optional (default=10)
99 | The number of epochs to wait before reducing the learning rate for the Siamese network.
100 |
101 | siamese_n_nbg : int, optional (default=2)
102 | The number of nearest neighbors to consider as 'positive' pairs by the Siamese network.
103 |
104 | siamese_use_approx : bool, optional (default=False)
105 | Specifies whether to use Annoy instead of KNN for computing nearest neighbors,
106 | particularly useful for large datasets.
107 |
108 | siamese_batch_size : int, optional (default=128)
109 | The batch size used during training of the Siamese network.
110 |
111 | spectral_hiddens : list, optional (default=[1024, 1024, 512, 10])
112 | The number of hidden units in each layer of the Spectral network.
113 |
114 | spectral_epochs : int, optional (default=30)
115 | The number of epochs to train the Spectral network.
116 |
117 | spectral_lr : float, optional (default=1e-3)
118 | The learning rate for the Spectral network.
119 |
120 | spectral_lr_decay : float, optional (default=0.1)
121 | The learning rate decay factor for the Spectral network. The remaining spectral_* parameters (min_lr, patience, batch_size) mirror their AE/Siamese counterparts; spectral_n_nbg sets the number of neighbors used for the affinity matrix, and spectral_scale_k / spectral_is_local_scale control the scale of the Gaussian affinities."""
122 |
123 | self.n_components = n_components
124 | self.should_use_ae = should_use_ae
125 | self.should_use_siamese = should_use_siamese
126 | self.is_sparse_graph = is_sparse_graph
127 | self.ae_hiddens = ae_hiddens
128 | self.ae_epochs = ae_epochs
129 | self.ae_lr = ae_lr
130 | self.ae_lr_decay = ae_lr_decay
131 | self.ae_min_lr = ae_min_lr
132 | self.ae_patience = ae_patience
133 | self.ae_batch_size = ae_batch_size
134 | self.siamese_hiddens = siamese_hiddens
135 | self.siamese_epochs = siamese_epochs
136 | self.siamese_lr = siamese_lr
137 | self.siamese_lr_decay = siamese_lr_decay
138 | self.siamese_min_lr = siamese_min_lr
139 | self.siamese_patience = siamese_patience
140 | self.siamese_n_nbg = siamese_n_nbg
141 | self.siamese_use_approx = siamese_use_approx
142 | self.siamese_batch_size = siamese_batch_size
143 | self.spectral_hiddens = spectral_hiddens
144 | self.spectral_epochs = spectral_epochs
145 | self.spectral_lr = spectral_lr
146 | self.spectral_lr_decay = spectral_lr_decay
147 | self.spectral_min_lr = spectral_min_lr
148 | self.spectral_patience = spectral_patience
149 | self.spectral_n_nbg = spectral_n_nbg
150 | self.spectral_scale_k = spectral_scale_k
151 | self.spectral_is_local_scale = spectral_is_local_scale
152 | self.spectral_batch_size = spectral_batch_size
153 | self.X_new = None
154 |
155 | def _fit(self, X: torch.Tensor, y: torch.Tensor) -> None:
156 | """Fit the SpectralNet model to the input data.
157 |
158 | Parameters
159 | ----------
160 | X : torch.Tensor
161 | The input data of shape (n_samples, n_features).
162 |
163 | y: torch.Tensor
164 | The labels of the input data of shape (n_samples,).
165 |
166 | Returns
167 | -------
168 | None
169 | The fitted SpectralNet model is stored in self._spectralnet.
170 | """
171 | self._spectralnet = SpectralNet(
172 | n_clusters=self.n_components,
173 | should_use_ae=self.should_use_ae,
174 | should_use_siamese=self.should_use_siamese,
175 | is_sparse_graph=self.is_sparse_graph,
176 | ae_hiddens=self.ae_hiddens,
177 | ae_epochs=self.ae_epochs,
178 | ae_lr=self.ae_lr,
179 | ae_lr_decay=self.ae_lr_decay,
180 | ae_min_lr=self.ae_min_lr,
181 | ae_patience=self.ae_patience,
182 | ae_batch_size=self.ae_batch_size,
183 | siamese_hiddens=self.siamese_hiddens,
184 | siamese_epochs=self.siamese_epochs,
185 | siamese_lr=self.siamese_lr,
186 | siamese_lr_decay=self.siamese_lr_decay,
187 | siamese_min_lr=self.siamese_min_lr,
188 | siamese_patience=self.siamese_patience,
189 | siamese_n_nbg=self.siamese_n_nbg,
190 | siamese_use_approx=self.siamese_use_approx,
191 | siamese_batch_size=self.siamese_batch_size,
192 | spectral_hiddens=self.spectral_hiddens,
193 | spectral_epochs=self.spectral_epochs,
194 | spectral_lr=self.spectral_lr,
195 | spectral_lr_decay=self.spectral_lr_decay,
196 | spectral_min_lr=self.spectral_min_lr,
197 | spectral_patience=self.spectral_patience,
198 | spectral_n_nbg=self.spectral_n_nbg,
199 | spectral_scale_k=self.spectral_scale_k,
200 | spectral_is_local_scale=self.spectral_is_local_scale,
201 | spectral_batch_size=self.spectral_batch_size,
202 | )
203 |
204 | self._spectralnet.fit(X, y)
205 |
206 | def _predict(self, X: torch.Tensor) -> np.ndarray:
207 | """Predict embeddings for the input data using the fitted SpectralNet model.
208 |
209 | Parameters
210 | ----------
211 | X : torch.Tensor
212 | The input data of shape (n_samples, n_features).
213 |
214 | Returns
215 | -------
216 | np.ndarray
217 | The predicted embeddings of shape (n_samples, n_components).
218 | """
219 | self._spectralnet.predict(X)
220 | return self._spectralnet.embeddings_
221 |
222 | def _transform(self, X: torch.Tensor) -> np.ndarray:
223 | """Transform the input data into embeddings using the fitted SpectralNet model.
224 |
225 | Parameters
226 | ----------
227 | X : torch.Tensor
228 | The input data of shape (n_samples, n_features).
229 |
230 | Returns
231 | -------
232 | np.ndarray
233 | The transformed embeddings of shape (n_samples, n_components).
234 | """
235 | return self._predict(X)
236 |
237 | def fit_transform(self, X: torch.Tensor, y: torch.Tensor = None) -> np.ndarray:
238 | """Fit the SpectralNet model to the input data and transform it into embeddings.
239 |
240 | This is a convenience method that combines the fit and transform steps.
241 |
242 | Parameters
243 | ----------
244 | X : torch.Tensor
245 | The input data of shape (n_samples, n_features).
246 |
247 | y : torch.Tensor, optional
248 | The labels of the input data of shape (n_samples,). Defaults to None.
249 |
250 | Returns
251 | -------
252 | np.ndarray
253 | The fitted and transformed embeddings of shape (n_samples, n_components).
254 | """
255 | self._fit(X, y)
256 | return self._transform(X)
257 |
258 | def _get_laplacian_of_small_batch(self, batch: torch.Tensor) -> np.ndarray:
259 | """Get the Laplacian of a small batch of the input data
260 |
261 | Parameters
262 | ----------
263 |
264 | batch : torch.Tensor
265 | A small batch of the input data of shape (batch_size, n_features).
266 |
267 | Returns
268 | -------
269 | np.ndarray
270 | The Laplacian of the small batch of the input data.
271 |
272 |
273 |
274 | """
275 |
276 | W = get_affinity_matrix(batch, self.spectral_n_nbg, self._spectralnet.device)
277 | L = get_laplacian(W)
278 | return L
279 |
280 | def _remove_smallest_eigenvector(self, V: np.ndarray) -> np.ndarray:
281 | """Remove the constant eigenvector from the eigenvectors of the Laplacian of a small batch of the input data.
282 |
283 |
284 | Parameters
285 | ----------
286 | V : np.ndarray
287 | The eigenvectors of the Laplacian of a small batch of the input data.
288 |
289 |
290 | Returns
291 | -------
292 | np.ndarray
293 | The eigenvectors of the Laplacian of a small batch of the input data without the constant eigenvector.
294 | """
295 |
296 | batch_raw, batch_encoded = self._spectralnet.get_random_batch()
297 | L_batch = self._get_laplacian_of_small_batch(batch_encoded)
298 | V_batch = self._predict(batch_raw)
299 | eigenvalues = np.diag(V_batch.T @ L_batch @ V_batch)
300 | indices = np.argsort(eigenvalues)
301 |
302 | # indices[0] corresponds to the (near-)constant eigenvector of the Laplacian;
303 | # keep the eigenvectors with the next two smallest eigenvalues. Selecting
304 | # columns of the original V directly avoids the index shift that would occur
305 | # if the smallest column were removed first.
306 | V = V[:, [indices[1], indices[2]]]
307 |
308 |
309 | return V
310 |
311 | def visualize(
312 | self, V: np.ndarray, y: torch.Tensor = None, n_components: int = 1
313 | ) -> None:
314 | """Visualize the embeddings of the input data using the fitted SpectralNet model.
315 |
316 | Parameters
317 | ----------
318 | V : torch.Tensor
319 | The reduced data of shape (n_samples, n_features) to be visualized.
320 | y : torch.Tensor
321 | The input labels of shape (n_samples,).
322 | """
323 | V = self._remove_smallest_eigenvector(V)
324 |
325 | if y is not None:
326 | plot_laplacian_eigenvectors(V, y)
327 | cluster_labels = self._get_clusters_by_kmeans(V)
328 | acc = Metrics.acc_score(cluster_labels, y.detach().cpu().numpy(), n_clusters=10)  # n_clusters is hard-coded to 10 here
329 | print("acc with 2 components:", acc)
330 |
331 | if n_components > 1:
332 | x_axis = V[:, 0]
333 | y_axis = V[:, 1]
334 |
335 | elif n_components == 1:
336 | x_axis = V
337 | y_axis = np.zeros_like(V)
338 |
339 | else:
340 | raise ValueError(
341 | "n_components must be a positive integer (greater than 0))"
342 | )
343 |
344 | if y is None:
345 | plt.scatter(x_axis, y_axis)
346 | else:
347 | plt.scatter(x_axis, y_axis, c=y, cmap="tab10", s=3)
348 |
349 | plt.show()
350 |
351 | def _get_clusters_by_kmeans(self, embeddings: np.ndarray) -> np.ndarray:
352 | """Performs k-means clustering on the spectral-embedding space.
353 |
354 | Parameters
355 | ----------
356 | embeddings : np.ndarray
357 | The spectral-embedding space.
358 |
359 | Returns
360 | -------
361 | np.ndarray
362 | The cluster assignments for the given data.
363 | """
364 |
365 | kmeans = KMeans(n_clusters=self.n_components, n_init=10).fit(embeddings)
366 | cluster_assignments = kmeans.predict(embeddings)
367 | return cluster_assignments
368 |
--------------------------------------------------------------------------------
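A corresponding sketch for the reduction API (again assuming a top-level SpectralReduction export; n_components=3 is used because visualize later drops the constant eigenvector and keeps two components):

    import torch
    from sklearn.datasets import make_moons
    from spectralnet import SpectralReduction  # assumed top-level export

    X, y = make_moons(n_samples=1000, noise=0.075)
    X, y = torch.from_numpy(X).float(), torch.from_numpy(y)

    reducer = SpectralReduction(n_components=3, spectral_hiddens=[128, 128, 3])
    V = reducer.fit_transform(X, y)          # (n_samples, n_components) embedding
    reducer.visualize(V, y, n_components=2)  # 2D scatter of the kept eigenvectors
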
/src/spectralnet/_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import torch
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 | import matplotlib.colors as colors
6 |
7 | from annoy import AnnoyIndex
8 | from sklearn.neighbors import NearestNeighbors
9 |
10 |
11 | def build_ann(X: torch.Tensor):
12 | """
13 | Builds approximate-nearest-neighbors object
14 | that can be used to calculate the k-nearest neighbors of a data-point
15 |
16 | Parameters
17 | ----------
18 | X : torch.Tensor
19 | Dataset.
20 |
21 | Returns
22 | -------
23 | None
24 | """
25 |
26 | X = X.view(X.size(0), -1)
27 | t = AnnoyIndex(X[0].shape[0], "euclidean")
28 | for i, x_i in enumerate(X):
29 | t.add_item(i, x_i)
30 |
31 | t.build(50)
32 | t.save("ann_index.ann")
33 |
34 |
35 | def make_batch_for_sparse_grapsh(batch_x: torch.Tensor) -> torch.Tensor:
36 | """
37 | Computes a new batch of data points from the given batch (batch_x)
38 | in case the graph Laplacian obtained from the given batch is sparse.
39 | The new batch consists of the first fifth of the given batch together
40 | with the approximate nearest neighbors of each of those points.
41 |
42 | Parameters
43 | ----------
44 | batch_x : torch.Tensor
45 | Batch of data points.
46 |
47 | Returns
48 | -------
49 | torch.Tensor
50 | New batch of data points.
51 | """
52 |
53 | batch_size = batch_x.shape[0]
54 | batch_size //= 5
55 | new_batch_x = batch_x[:batch_size]
56 | batch_x = new_batch_x
57 | n_neighbors = 5
58 |
59 | u = AnnoyIndex(batch_x[0].shape[0], "euclidean")
60 | u.load("ann_index.ann")
61 | for x in batch_x:
62 | x = x.detach().cpu().numpy()
63 | nn_indices = u.get_nns_by_vector(x, n_neighbors)
64 | nn_tensors = [u.get_item_vector(i) for i in nn_indices[1:]]
65 | nn_tensors = torch.tensor(nn_tensors, device=batch_x.device)
66 | new_batch_x = torch.cat((new_batch_x, nn_tensors))
67 |
68 | return new_batch_x
69 |
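
A sketch of the intended call order for the two Annoy helpers above (the index is built once over the full dataset and saved to the hard-coded path "ann_index.ann"; each sparse-graph batch is then densified with approximate neighbors):

    import torch

    X = torch.randn(1000, 16)
    build_ann(X)  # writes ann_index.ann to the working directory

    batch = X[torch.randperm(X.size(0))[:256]]
    dense_batch = make_batch_for_sparse_grapsh(batch)
    # dense_batch holds ~1/5 of the original batch plus 4 approximate
    # neighbors of each kept point.
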
70 |
71 | def get_laplacian(W: torch.Tensor) -> np.ndarray:
72 | """
73 | Computes the unnormalized Laplacian matrix, given the affinity matrix W.
74 |
75 | Parameters
76 | ----------
77 | W : torch.Tensor
78 | Affinity matrix.
79 |
80 | Returns
81 | -------
82 | np.ndarray
83 | Laplacian matrix.
84 | """
85 |
86 | W = W.detach().cpu().numpy()
87 | D = np.diag(W.sum(axis=1))
88 | L = D - W
89 | return L
90 |
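
A tiny hand-checkable example of the formula L = D - W implemented above (not from the repo; D is the diagonal degree matrix of row sums):

    import numpy as np
    import torch

    W = torch.tensor([[0.0, 1.0, 0.0],
                      [1.0, 0.0, 0.5],
                      [0.0, 0.5, 0.0]])
    L = get_laplacian(W)  # D = diag(1.0, 1.5, 0.5)
    assert np.allclose(L, np.array([[ 1.0, -1.0,  0.0],
                                    [-1.0,  1.5, -0.5],
                                    [ 0.0, -0.5,  0.5]]))
    # Each row of L sums to zero, so the constant vector is an eigenvector
    # with eigenvalue 0 -- the eigenvector that SpectralReduction discards.
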
91 |
92 | def sort_laplacian(L: np.ndarray, y: np.ndarray) -> np.ndarray:
93 | """
94 | Sorts the columns and rows of the Laplacian by the true labels in order
95 | to see whether the sorted Laplacian is a block diagonal matrix.
96 |
97 | Parameters
98 | ----------
99 | L : np.ndarray
100 | Laplacian matrix.
101 | y : np.ndarray
102 | Labels.
103 |
104 | Returns
105 | -------
106 | np.ndarray
107 | Sorted Laplacian.
108 | """
109 |
110 | i = np.argsort(y)
111 | L = L[i, :]
112 | L = L[:, i]
113 | return L
114 |
115 |
116 | def sort_matrix_rows(A: np.ndarray, y: np.ndarray) -> np.ndarray:
117 | """
118 | Sorts the rows of a matrix by a given order.
119 |
120 | Parameters
121 | ----------
122 | A : np.ndarray
123 | Numpy ndarray.
124 | y : np.ndarray
125 | True labels.
126 |
127 | Returns
128 | -------
129 | np.ndarray
130 | Sorted matrix.
131 | """
132 |
133 | i = np.argsort(y)
134 | A = A[i, :]
135 | return A
136 |
137 |
138 | def get_eigenvalues(A: np.ndarray) -> np.ndarray:
139 | """
140 | Computes the eigenvalues of a given matrix A (via SVD, so A is assumed symmetric positive semi-definite, e.g. a graph Laplacian) and sorts them in increasing order.
141 |
142 | Parameters
143 | ----------
144 | A : np.ndarray
145 | Numpy ndarray.
146 |
147 | Returns
148 | -------
149 | np.ndarray
150 | Sorted eigenvalues.
151 | """
152 |
153 | _, vals, _ = np.linalg.svd(A)
154 | sorted_vals = vals[np.argsort(vals)]
155 | return sorted_vals
156 |
157 |
158 | def get_eigenvectors(A: np.ndarray) -> np.ndarray:
159 | """
160 | Computes the eigenvectors of a given matrix A (via SVD; valid for symmetric PSD matrices such as the Laplacian) and sorts them by increasing eigenvalue.
161 |
162 | Parameters
163 | ----------
164 | A : np.ndarray
165 | Numpy ndarray.
166 |
167 | Returns
168 | -------
169 | np.ndarray
170 | Sorted eigenvectors.
171 | """
172 |
173 | vecs, vals, _ = np.linalg.svd(A)
174 | vecs = vecs[:, np.argsort(vals)]
175 | return vecs
176 |
177 |
178 | def plot_eigenvalues(vals: np.ndarray):
179 | """
180 | Plot the eigenvalues of the Laplacian.
181 |
182 | Parameters
183 | ----------
184 | vals : np.ndarray
185 | Eigenvalues.
186 | """
187 |
188 | rang = range(len(vals))
189 | plt.plot(rang, vals)
190 | plt.show()
191 |
192 |
193 | def get_laplacian_eigenvectors(V: torch.Tensor, y: np.ndarray) -> tuple:
194 | """
195 | Returns eigenvectors of the Laplacian when the data is sorted in increasing
196 | order by the true label.
197 |
198 | Parameters
199 | ----------
200 | V : torch.Tensor
201 | Eigenvectors matrix.
202 | y : np.ndarray
203 | True labels.
204 |
205 | Returns
206 | -------
207 | tuple
208 | The sorted eigenvectors matrix and the corresponding index range.
209 |
210 | """
211 |
212 | V = sort_matrix_rows(V, y)
213 | rang = range(len(y))
214 | return V, rang
215 |
216 |
217 | def plot_laplacian_eigenvectors(V: np.ndarray, y: np.ndarray):
218 | """
219 | Plot the eigenvectors of the Laplacian when the data is sorted in increasing
220 | order by the true label.
221 |
222 | Parameters
223 | ----------
224 | V : np.ndarray
225 | Eigenvectors matrix.
226 | y : np.ndarray
227 | True labels.
228 |
229 | Returns
230 | -------
231 | matplotlib.pyplot
232 | The pyplot module, returned to allow further customization of the figure.
233 | """
234 |
235 | V = sort_matrix_rows(V, y)
236 | rang = range(len(y))
237 | plt.plot(rang, V)
238 | plt.show()
239 | return plt
240 |
241 |
242 | def plot_sorted_laplacian(W: torch.Tensor, y: np.ndarray):
243 | """
244 | Plot the block diagonal matrix obtained from the sorted Laplacian.
245 |
246 | Parameters
247 | ----------
248 | W : torch.Tensor
249 | Affinity matrix.
250 | y : np.ndarray
251 | True labels.
252 | """
253 | L = get_laplacian(W)
254 | L = sort_laplacian(L, y)
255 | # A binary-ish "flag" colormap makes the block-diagonal structure stand out.
256 | plt.imshow(L, cmap="flag")
257 | plt.show()
258 |
259 |
260 | def get_nearest_neighbors(
261 | X: torch.Tensor, Y: torch.Tensor = None, k: int = 3
262 | ) -> tuple[np.ndarray, np.ndarray]:
263 | """
264 | Computes the distances and the indices of the k nearest neighbors of each data point.
265 |
266 | Parameters
267 | ----------
268 | X : torch.Tensor
269 | Batch of data points.
270 | Y : torch.Tensor, optional
271 | Defaults to None.
272 | k : int, optional
273 | Number of nearest neighbors to calculate. Defaults to 3.
274 |
275 | Returns
276 | -------
277 | tuple[np.ndarray, np.ndarray]
278 | Distances and indices of each data point.
279 | """
280 | if Y is None:
281 | Y = X
282 | if len(X) < k:
283 | k = len(X)
284 | X = X.cpu().detach().numpy()
285 | Y = Y.cpu().detach().numpy()
286 | nbrs = NearestNeighbors(n_neighbors=k).fit(Y)  # search within Y (equal to X when Y is None)
287 | Dis, Ids = nbrs.kneighbors(X)
288 | return Dis, Ids
289 |
290 |
291 | def get_grassman_distance(A: np.ndarray, B: np.ndarray) -> float:
292 | """
293 | Computes the Grassmann distance between the subspaces spanned by the columns of A and B. The columns of A and B are assumed to be orthonormal.
294 |
295 | Parameters
296 | ----------
297 | A : np.ndarray
298 | Numpy ndarray.
299 | B : np.ndarray
300 | Numpy ndarray.
301 |
302 | Returns
303 | -------
304 | float
305 | The Grassmann distance.
306 | """
307 |
308 | M = np.dot(np.transpose(A), B)
309 | _, s, _ = np.linalg.svd(M, full_matrices=False)
310 | s = 1 - np.square(s)
311 | grassmann = np.sum(s)
312 | return grassmann
313 |
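
A quick sanity check for the Grassmann distance (columns must be orthonormal; identical subspaces give 0, orthogonal one-dimensional subspaces give 1):

    import numpy as np

    A = np.linalg.qr(np.random.randn(6, 2))[0]  # orthonormal basis of a random 2-D subspace
    assert np.isclose(get_grassman_distance(A, A), 0.0)

    e1 = np.array([[1.0], [0.0]])
    e2 = np.array([[0.0], [1.0]])
    assert np.isclose(get_grassman_distance(e1, e2), 1.0)
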
314 |
315 | def compute_scale(
316 | Dis: np.ndarray, k: int = 2, med: bool = True, is_local: bool = True
317 | ) -> np.ndarray:
318 | """
319 | Computes the scale for the Gaussian similarity function.
320 |
321 | Parameters
322 | ----------
323 | Dis : np.ndarray
324 | Distances of the k nearest neighbors of each data point.
325 | k : int, optional
326 | Number of nearest neighbors for the scale calculation. Relevant for global scale only.
327 | med : bool, optional
328 | Scale calculation method. Can be calculated by the median distance from a data point to its neighbors,
329 | or by the maximum distance. Defaults to True.
330 | is_local : bool, optional
331 | Local distance (different for each data point), or global distance. Defaults to True.
332 |
333 | Returns
334 | -------
335 | np.ndarray
336 | Scale (global or local).
337 | """
338 |
339 | if is_local:
340 | if not med:
341 | scale = np.max(Dis, axis=1)
342 | else:
343 | scale = np.median(Dis, axis=1)
344 | else:
345 | if not med:
346 | scale = np.max(Dis[:, k - 1])
347 | else:
348 | scale = np.median(Dis[:, k - 1])
349 | return scale
350 |
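
How the two scale modes above are typically obtained from the k-NN distances (a sketch; k=3 and the toy data are arbitrary):

    import torch

    X = torch.randn(100, 2)
    Dis, Ids = get_nearest_neighbors(X, k=3)
    local_scale = compute_scale(Dis, is_local=True)         # one scale per point (median neighbor distance)
    global_scale = compute_scale(Dis, k=2, is_local=False)  # single scale: median distance to the 2nd neighbor
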
351 |
352 | def get_gaussian_kernel(
353 | D: torch.Tensor, scale, Ids: np.ndarray, device: torch.device, is_local: bool = True
354 | ) -> torch.Tensor:
355 | """
356 | Computes the Gaussian similarity function according to a given distance matrix D and a given scale.
357 |
358 | Parameters
359 | ----------
360 | D : torch.Tensor
361 | Distance matrix.
362 | scale : float or np.ndarray
363 | Global scale (a scalar) or local scales (one per data point).
364 | Ids : np.ndarray
365 | Indices of the k nearest neighbors of each sample.
366 | device : torch.device
367 | The device on which the kernel matrix is computed.
368 | is_local : bool, optional
369 | Determines whether the given scale is global or local. Defaults to True.
370 |
371 | Returns
372 | -------
373 | torch.Tensor
374 | Matrix W with Gaussian similarities.
375 | """
376 |
377 | if not is_local:
378 | # global scale
379 | W = torch.exp(-torch.pow(D, 2) / (scale**2))
380 | else:
381 | # local scales
382 | W = torch.exp(
383 | -torch.pow(D, 2).to(device)
384 | / (torch.tensor(scale).float().to(device).clamp_min(1e-7) ** 2)
385 | )
386 | if Ids is not None:
387 | n, k = Ids.shape
388 | mask = torch.zeros([n, n]).to(device=device)
389 | for i in range(len(Ids)):
390 | mask[i, Ids[i]] = 1
391 | W = W * mask
392 | sym_W = (W + torch.t(W)) / 2.0
393 | return sym_W
394 |
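
Putting the pieces together for a Gaussian affinity matrix (a sketch that mirrors get_affinity_matrix below, but with the Gaussian kernel in place of the t kernel):

    import torch

    device = torch.device("cpu")
    X = torch.randn(100, 2)
    Dx = torch.cdist(X, X)                     # dense pairwise distances
    Dis, Ids = get_nearest_neighbors(X, k=8)
    scale = compute_scale(Dis, is_local=True)
    W = get_gaussian_kernel(Dx, scale, Ids, device=device, is_local=True)
    # W is symmetric and zero outside the k-NN mask.
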
395 |
396 | def get_t_kernel(
397 | D: torch.Tensor, Ids: np.ndarray, device: torch.device, is_local: bool = True
398 | ) -> torch.Tensor:
399 | """
400 | Computes the t similarity function according to a given distance matrix D and a given scale.
401 |
402 | Parameters
403 | ----------
404 | D : torch.Tensor
405 | Distance matrix.
406 | Ids : np.ndarray
407 | Indices of the k nearest neighbors of each sample.
408 | device : torch.device
409 | The device on which the kernel matrix is computed.
410 | is_local : bool, optional
411 | Unused; kept for signature compatibility with get_gaussian_kernel.
412 |
413 | Returns
414 | -------
415 | torch.Tensor
416 | Matrix W with t similarities.
417 | """
418 |
419 | W = torch.pow(1 + torch.pow(D, 2), -1)
420 | if Ids is not None:
421 | n, k = Ids.shape
422 | mask = torch.zeros([n, n]).to(device=device)
423 | for i in range(len(Ids)):
424 | mask[i, Ids[i]] = 1
425 | W = W * mask
426 | sym_W = (W + W.T) / 2.0
427 | return sym_W
428 |
429 |
430 | def get_affinity_matrix(
431 | X: torch.Tensor, n_neighbors: int, device: torch.device
432 | ) -> torch.Tensor:
433 | """
434 | Computes the affinity matrix for the data X.
435 |
436 | Parameters
437 | ----------
438 | X : torch.Tensor
439 | Data.
440 | n_neighbors : int
441 | Number of nearest neighbors to calculate.
442 | device : torch.device
443 | The device on which the affinity matrix is computed.
444 |
445 | Returns
446 | -------
447 | torch.Tensor
448 | Affinity matrix.
449 | """
450 |
451 | Dx = torch.cdist(X, X)
452 | _, indices = get_nearest_neighbors(X, k=n_neighbors + 1)  # distances are recomputed densely above
453 | W = get_t_kernel(Dx, indices, device=device)
454 | return W
455 |
456 |
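
The affinity matrix is usually inspected through its Laplacian spectrum; a sketch of that round trip on toy data:

    import torch

    X = torch.randn(200, 2)
    W = get_affinity_matrix(X, n_neighbors=10, device=torch.device("cpu"))
    L = get_laplacian(W)
    vals = get_eigenvalues(L)
    plot_eigenvalues(vals)  # the number of near-zero eigenvalues hints at the number of clusters
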
457 | def plot_data_by_assignments(X, assignments: np.ndarray):
458 | """
459 | Plots the data with the assignments obtained from SpectralNet. Relevant only for 2D data.
460 |
461 | Parameters
462 | ----------
463 | X : np.ndarray
464 | 2D data of shape (n_samples, 2).
465 | assignments : np.ndarray
466 | Cluster assignments.
467 | """
468 |
469 | plt.scatter(X[:, 0], X[:, 1], c=assignments)
470 | plt.show()
471 |
472 |
473 | def calculate_cost_matrix(C: np.ndarray, n_clusters: int) -> np.ndarray:
474 | """
475 | Calculates the cost matrix for the Munkres algorithm.
476 |
477 | Parameters
478 | ----------
479 | C : np.ndarray
480 | Confusion matrix.
481 | n_clusters : int
482 | Number of clusters.
483 |
484 | Returns
485 | -------
486 | np.ndarray
487 | Cost matrix.
488 | """
489 |
490 | cost_matrix = np.zeros((n_clusters, n_clusters))
491 | # cost_matrix[i,j] will be the cost of assigning cluster i to label j
492 | for j in range(n_clusters):
493 | s = np.sum(C[:, j])  # number of examples in cluster j
494 | for i in range(n_clusters):
495 | t = C[i, j]
496 | cost_matrix[j, i] = s - t
497 | return cost_matrix
498 |
499 |
500 | def get_cluster_labels_from_indices(indices: np.ndarray) -> np.ndarray:
501 | """
502 | Gets the cluster labels from their indices.
503 |
504 | Parameters
505 | ----------
506 | indices : np.ndarray
507 | Indices of the clusters.
508 |
509 | Returns
510 | -------
511 | np.ndarray
512 | Cluster labels.
513 | """
514 |
515 | num_clusters = len(indices)
516 | cluster_labels = np.zeros(num_clusters)
517 | for i in range(num_clusters):
518 | cluster_labels[i] = indices[i][1]
519 | return cluster_labels
520 |
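
The two helpers above are meant to be combined with the munkres package listed in req.txt; a sketch, assuming C is a confusion matrix whose [i, j] entry counts points of true label i assigned to cluster j:

    import numpy as np
    from munkres import Munkres

    def align_cluster_labels(C: np.ndarray, n_clusters: int) -> np.ndarray:
        cost = calculate_cost_matrix(C, n_clusters)
        indices = Munkres().compute(cost.tolist())  # optimal assignment as (row, column) pairs
        return get_cluster_labels_from_indices(np.array(indices))
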
521 |
522 | def write_assignments_to_file(assignments: np.ndarray):
523 | """
524 | Saves SpectralNet cluster assignments to a file.
525 |
526 | Parameters
527 | ----------
528 | assignments : np.ndarray
529 | The assignments obtained from SpectralNet.
530 | """
531 |
532 | np.savetxt(
533 | "cluster_assignments.csv", assignments.astype(int), fmt="%i", delimiter=","
534 | )
535 |
536 |
537 | def create_weights_dir():
538 | """
539 | Creates a directory for the weights of the Autoencoder and the Siamese network
540 | """
541 | if not os.path.exists("weights"):
542 | os.makedirs("weights")
543 |
--------------------------------------------------------------------------------
/docs/index.html:
--------------------------------------------------------------------------------
(HTML markup omitted; the recoverable text content of the project page follows.)

Abstract: Spectral clustering is a leading and popular technique in unsupervised data
analysis. Two of its major limitations are scalability and generalization of the
spectral embedding (i.e., out-of-sample extension). In this paper we introduce a deep
learning approach to spectral clustering that overcomes the above shortcomings. Our
network, which we call SpectralNet, learns a map that embeds input data points into the
eigenspace of their associated graph Laplacian matrix and subsequently clusters them.
We train SpectralNet using a procedure that involves constrained stochastic
optimization. Stochastic optimization allows it to scale to large datasets, while the
constraints, which are implemented using a special-purpose output layer, allow us to
keep the network output orthogonal. Moreover, the map learned by SpectralNet naturally
generalizes the spectral embedding to unseen data points. To further improve the
quality of the clustering, we replace the standard pairwise Gaussian affinities with
affinities learned from the given unlabeled data using a Siamese network. Additional
improvement of the resulting clustering can be achieved by applying the network to code
representations produced, e.g., by standard autoencoders. Our end-to-end learning
procedure is fully unsupervised. In addition, we apply VC dimension theory to derive a
lower bound on the size of SpectralNet. State-of-the-art clustering results are
reported on the Reuters dataset.

Citation:

@inproceedings{shaham2018,
  author    = {Uri Shaham and Kelly Stanton and Henri Li and Boaz Nadler and Ronen Basri and Yuval Kluger},
  title     = {SpectralNet: Spectral Clustering Using Deep Neural Networks},
  booktitle = {Proc. ICLR 2018},
  year      = {2018}
}

License: MIT License. Feel free to use any of the material in your own work, as long as
you give us appropriate credit by mentioning the title and author list of our paper.
595 | 596 |