├── examples
│   ├── __init__.py
│   ├── reduce_mnist.py
│   ├── reduce_twomoons.py
│   ├── cluster_mnist.py
│   ├── cluster_twomoons.py
│   └── data.py
├── src
│   ├── tests
│   │   └── __init__.py
│   └── spectralnet
│       ├── _trainers
│       │   ├── _trainer.py
│       │   ├── __init__.py
│       │   ├── _ae_trainer.py
│       │   ├── _spectralnet_trainer.py
│       │   └── _siamesenet_trainer.py
│       ├── _losses
│       │   ├── __init__.py
│       │   ├── _spectralnet_loss.py
│       │   └── _siamese_loss.py
│       ├── _models
│       │   ├── __init__.py
│       │   ├── _siamesenet_model.py
│       │   ├── _ae_model.py
│       │   └── _spectralnet_model.py
│       ├── __init__.py
│       ├── _metrics.py
│       ├── _cluster.py
│       ├── _reduction.py
│       └── _utils.py
├── setup.py
├── docs
│   ├── paper.png
│   ├── twomoons.png
│   └── index.html
├── figures
│   └── twomoons.png
├── req.txt
├── .gitignore
├── pyproject.toml
├── setup.cfg
├── data
│   └── Reuters
│       ├── get_reuters_data.sh
│       └── make_reuters.py
├── LICENSE.md
└── README.md

/examples/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 | 
3 | setuptools.setup()
4 | 
--------------------------------------------------------------------------------
/docs/paper.png:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/shaham-lab/SpectralNet/HEAD/docs/paper.png
--------------------------------------------------------------------------------
/docs/twomoons.png:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/shaham-lab/SpectralNet/HEAD/docs/twomoons.png
--------------------------------------------------------------------------------
/figures/twomoons.png:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/shaham-lab/SpectralNet/HEAD/figures/twomoons.png
--------------------------------------------------------------------------------
/src/spectralnet/_trainers/_trainer.py:
--------------------------------------------------------------------------------
1 | class Trainer:
2 |     def __init__(self):
3 |         pass
4 | 
--------------------------------------------------------------------------------
/src/spectralnet/_losses/__init__.py:
--------------------------------------------------------------------------------
1 | from ._siamese_loss import ContrastiveLoss
2 | from ._spectralnet_loss import SpectralNetLoss
--------------------------------------------------------------------------------
/req.txt:
--------------------------------------------------------------------------------
1 | torch==2.0.0
2 | torchvision==0.15.1
3 | h5py
4 | numpy
5 | annoy
6 | scipy
7 | munkres
8 | matplotlib
9 | scikit-learn
10 | 
--------------------------------------------------------------------------------
/src/spectralnet/_models/__init__.py:
--------------------------------------------------------------------------------
1 | from ._ae_model import AEModel
2 | from ._siamesenet_model import SiameseNetModel
3 | from ._spectralnet_model import SpectralNetModel
--------------------------------------------------------------------------------
/src/spectralnet/_trainers/__init__.py:
--------------------------------------------------------------------------------
1 | from ._ae_trainer import AETrainer 2 | from ._siamesenet_trainer import SiameseTrainer 3 | from ._spectralnet_trainer import SpectralTrainer 4 | -------------------------------------------------------------------------------- /src/spectralnet/__init__.py: -------------------------------------------------------------------------------- 1 | from ._metrics import Metrics 2 | from ._cluster import SpectralNet 3 | from ._reduction import SpectralReduction 4 | from ._utils import * 5 | 6 | __all__ = [ 7 | "Metrics", 8 | "SpectralNet", 9 | "SpectralReduction", 10 | ] 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | __pycache__ 3 | /data/MNIST 4 | dist 5 | spectralnet.egg-info 6 | /cluster_mnist.py 7 | /cluster_twomoons.py 8 | /data.py 9 | /src/data.py 10 | /src/reduce_mnist.py 11 | /src/reduce_twomoons.py 12 | /src/spectralnet/_reduction.py 13 | /src/spectralnet/_trainers/weights 14 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | # Minimum requirements for the build system to execute. 3 | requires = [ 4 | "setuptools", 5 | "wheel", 6 | "torch>=2.0.0", 7 | "torchvision>=0.15.1", 8 | "h5py>=3.8.0", 9 | "numpy>=1.24", 10 | "annoy>=1.17.1", 11 | "scipy>=1.10.1", 12 | "munkres", 13 | "matplotlib", 14 | "scikit-learn>=1.2.2" 15 | ] 16 | build-backend = "setuptools.build_meta" -------------------------------------------------------------------------------- /examples/reduce_mnist.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | from data import load_data 5 | 6 | from spectralnet import Metrics 7 | from spectralnet import SpectralReduction 8 | 9 | 10 | def main(): 11 | x_train, x_test, y_train, y_test = load_data("mnist") 12 | X = torch.cat([x_train, x_test]) 13 | 14 | if y_train is not None: 15 | y = torch.cat([y_train, y_test]) 16 | else: 17 | y = None 18 | 19 | spectralreduction = SpectralReduction( 20 | n_components=3, 21 | should_use_ae=True, 22 | should_use_siamese=True, 23 | spectral_hiddens=[512, 512, 2048, 3], 24 | ) 25 | 26 | X_new = spectralreduction.fit_transform(X) 27 | spectralreduction.visualize(X_new, y, n_components=2) 28 | 29 | 30 | if __name__ == "__main__": 31 | main() 32 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = spectralnet 3 | version = 0.1.2 4 | author = Amitai 5 | description = Spectral Clustering Using Deep Neural Networks 6 | long_description = file: README.md 7 | long_description_content_type = text/markdown 8 | url = https://github.com/shaham-lab/SpectralNet.git 9 | project_urls = 10 | Bug Tracker = https://github.com/shaham-lab/SpectralNet/issues 11 | classifiers = 12 | Programming Language :: Python :: 3 13 | License :: OSI Approved :: MIT License 14 | Operating System :: OS Independent 15 | 16 | [options] 17 | package_dir = 18 | = src 19 | packages = find: 20 | python_requires = >=3.11 21 | install_requires = 22 | setuptools 23 | wheel 24 | torch>=2.0.0 25 | torchvision>=0.15.1 26 | h5py>=3.8.0 27 | numpy>=1.24 28 | annoy>=1.17.1 29 | scipy>=1.10.1 30 | munkres 31 | matplotlib 32 | scikit-learn>=1.2.2 33 | 34 | 
[options.packages.find] 35 | where = src -------------------------------------------------------------------------------- /data/Reuters/get_reuters_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | wget http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a12-token-files/lyrl2004_tokens_test_pt0.dat.gz 3 | gunzip lyrl2004_tokens_test_pt0.dat.gz 4 | wget http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a12-token-files/lyrl2004_tokens_test_pt1.dat.gz 5 | gunzip lyrl2004_tokens_test_pt1.dat.gz 6 | wget http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a12-token-files/lyrl2004_tokens_test_pt2.dat.gz 7 | gunzip lyrl2004_tokens_test_pt2.dat.gz 8 | wget http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a12-token-files/lyrl2004_tokens_test_pt3.dat.gz 9 | gunzip lyrl2004_tokens_test_pt3.dat.gz 10 | wget http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a12-token-files/lyrl2004_tokens_train.dat.gz 11 | gunzip lyrl2004_tokens_train.dat.gz 12 | wget http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a08-topic-qrels/rcv1-v2.topics.qrels.gz 13 | gunzip rcv1-v2.topics.qrels.gz -------------------------------------------------------------------------------- /examples/reduce_twomoons.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | from data import load_data 5 | from spectralnet import SpectralReduction 6 | 7 | 8 | def main(): 9 | x_train, x_test, y_train, y_test = load_data("twomoons") 10 | X = torch.cat([x_train, x_test]) 11 | 12 | if y_train is not None: 13 | y = torch.cat([y_train, y_test]) 14 | else: 15 | y = None 16 | 17 | spectralreduction = SpectralReduction( 18 | n_components=2, 19 | should_use_ae=False, 20 | should_use_siamese=False, 21 | spectral_batch_size=712, 22 | spectral_epochs=40, 23 | spectral_is_local_scale=False, 24 | spectral_n_nbg=8, 25 | spectral_scale_k=2, 26 | spectral_lr=1e-2, 27 | spectral_hiddens=[128, 128, 2], 28 | ) 29 | 30 | X_new = spectralreduction.fit_transform(X) 31 | spectralreduction.visualize(X_new, y, n_components=1) 32 | 33 | 34 | if __name__ == "__main__": 35 | main() 36 | -------------------------------------------------------------------------------- /src/spectralnet/_models/_siamesenet_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class SiameseNetModel(nn.Module): 6 | def __init__(self, architecture: dict, input_dim: int): 7 | super(SiameseNetModel, self).__init__() 8 | self.architecture = architecture 9 | self.layers = nn.ModuleList() 10 | 11 | current_dim = input_dim 12 | for layer in self.architecture: 13 | next_dim = layer 14 | self.layers.append( 15 | nn.Sequential(nn.Linear(current_dim, next_dim), nn.ReLU()) 16 | ) 17 | current_dim = next_dim 18 | 19 | def forward_once(self, x: torch.Tensor) -> torch.Tensor: 20 | for layer in self.layers: 21 | x = layer(x) 22 | return x 23 | 24 | def forward(self, x1: torch.Tensor, x2: torch.Tensor) -> tuple: 25 | output1 = self.forward_once(x1) 26 | output2 = self.forward_once(x2) 27 | return output1, output2 28 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Uri Shaham, Amitai Yacobi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 
a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /examples/cluster_mnist.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | from data import load_data 5 | 6 | from spectralnet import Metrics 7 | from spectralnet import SpectralNet 8 | 9 | 10 | def main(): 11 | x_train, x_test, y_train, y_test = load_data("mnist") 12 | 13 | X = torch.cat([x_train, x_test]) 14 | 15 | if y_train is not None: 16 | y = torch.cat([y_train, y_test]) 17 | else: 18 | y = None 19 | 20 | spectralnet = SpectralNet( 21 | n_clusters=10, 22 | should_use_ae=True, 23 | should_use_siamese=True, 24 | ) 25 | spectralnet.fit(X, y) 26 | cluster_assignments = spectralnet.predict(X) 27 | embeddings = spectralnet.embeddings_ 28 | 29 | if y is not None: 30 | y = y.detach().cpu().numpy() 31 | acc_score = Metrics.acc_score(cluster_assignments, y, n_clusters=10) 32 | nmi_score = Metrics.nmi_score(cluster_assignments, y) 33 | print(f"ACC: {np.round(acc_score, 3)}") 34 | print(f"NMI: {np.round(nmi_score, 3)}") 35 | 36 | return embeddings, cluster_assignments 37 | 38 | 39 | if __name__ == "__main__": 40 | embeddings, assignments = main() 41 | -------------------------------------------------------------------------------- /src/spectralnet/_losses/_spectralnet_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class SpectralNetLoss(nn.Module): 6 | def __init__(self): 7 | super(SpectralNetLoss, self).__init__() 8 | 9 | def forward( 10 | self, W: torch.Tensor, Y: torch.Tensor, is_normalized: bool = False 11 | ) -> torch.Tensor: 12 | """ 13 | This function computes the loss of the SpectralNet model. 14 | The loss is the rayleigh quotient of the Laplacian matrix obtained from W, 15 | and the orthonormalized output of the network. 16 | 17 | Args: 18 | W (torch.Tensor): Affinity matrix 19 | Y (torch.Tensor): Output of the network 20 | is_normalized (bool, optional): Whether to use the normalized Laplacian matrix or not. 
21 | 22 | Returns: 23 | torch.Tensor: The loss 24 | """ 25 | m = Y.size(0) 26 | if is_normalized: 27 | D = torch.sum(W, dim=1) 28 | Y = Y / torch.sqrt(D)[:, None] 29 | 30 | Dy = torch.cdist(Y, Y) 31 | loss = torch.sum(W * Dy.pow(2)) / (2 * m) 32 | 33 | return loss 34 | -------------------------------------------------------------------------------- /examples/cluster_twomoons.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | from data import load_data 5 | 6 | from spectralnet import Metrics 7 | from spectralnet import SpectralNet 8 | 9 | 10 | def main(): 11 | x_train, x_test, y_train, y_test = load_data("twomoons") 12 | X = torch.cat([x_train, x_test]) 13 | 14 | if y_train is not None: 15 | y = torch.cat([y_train, y_test]) 16 | else: 17 | y = None 18 | 19 | spectralnet = SpectralNet( 20 | n_clusters=2, 21 | should_use_ae=False, 22 | should_use_siamese=False, 23 | spectral_batch_size=712, 24 | spectral_epochs=40, 25 | spectral_is_local_scale=False, 26 | spectral_n_nbg=8, 27 | spectral_scale_k=2, 28 | spectral_lr=1e-2, 29 | spectral_hiddens=[128, 128, 2], 30 | ) 31 | 32 | spectralnet.fit(X, y) 33 | cluster_assignments = spectralnet.predict(X) 34 | embeddings = spectralnet.embeddings_ 35 | 36 | if y is not None: 37 | y = y.detach().cpu().numpy() 38 | acc_score = Metrics.acc_score(cluster_assignments, y, n_clusters=2) 39 | nmi_score = Metrics.nmi_score(cluster_assignments, y) 40 | print(f"ACC: {np.round(acc_score, 3)}") 41 | print(f"NMI: {np.round(nmi_score, 3)}") 42 | 43 | return embeddings, cluster_assignments 44 | 45 | 46 | if __name__ == "__main__": 47 | embeddings, assignments = main() 48 | -------------------------------------------------------------------------------- /src/spectralnet/_models/_ae_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class AEModel(nn.Module): 6 | def __init__(self, architecture: dict, input_dim: int): 7 | super(AEModel, self).__init__() 8 | self.architecture = architecture 9 | self.encoder = nn.ModuleList() 10 | self.decoder = nn.ModuleList() 11 | 12 | current_dim = input_dim 13 | for i, layer in enumerate(self.architecture): 14 | next_dim = layer 15 | if i == len(self.architecture) - 1: 16 | self.encoder.append(nn.Sequential(nn.Linear(current_dim, next_dim))) 17 | else: 18 | self.encoder.append( 19 | nn.Sequential(nn.Linear(current_dim, next_dim), nn.ReLU()) 20 | ) 21 | current_dim = next_dim 22 | 23 | last_dim = input_dim 24 | current_dim = self.architecture[-1] 25 | for i, layer in enumerate(reversed(self.architecture[:-1])): 26 | next_dim = layer 27 | self.decoder.append( 28 | nn.Sequential(nn.Linear(current_dim, next_dim), nn.ReLU()) 29 | ) 30 | current_dim = next_dim 31 | self.decoder.append(nn.Sequential(nn.Linear(current_dim, last_dim))) 32 | 33 | def encode(self, x: torch.Tensor) -> torch.Tensor: 34 | for layer in self.encoder: 35 | x = layer(x) 36 | return x 37 | 38 | def decode(self, x: torch.Tensor) -> torch.Tensor: 39 | for layer in self.decoder: 40 | x = layer(x) 41 | return x 42 | 43 | def forward(self, x: torch.Tensor) -> torch.Tensor: 44 | x = self.encode(x) 45 | x = self.decode(x) 46 | return x 47 | -------------------------------------------------------------------------------- /src/spectralnet/_losses/_siamese_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class 
ContrastiveLoss(nn.Module): 6 | def __init__(self, margin: float = 1.0): 7 | super(ContrastiveLoss, self).__init__() 8 | self.margin = margin 9 | 10 | def forward( 11 | self, output1: torch.Tensor, output2: torch.Tensor, label: torch.Tensor 12 | ) -> torch.Tensor: 13 | """ 14 | Compute the contrastive loss between the two outputs of the siamese network. 15 | 16 | Parameters 17 | ---------- 18 | output1 : torch.Tensor 19 | The first output of the siamese network. 20 | output2 : torch.Tensor 21 | The second output of the siamese network. 22 | label : torch.Tensor 23 | The label indicating whether the two outputs are similar (1) or not (0). 24 | 25 | Returns 26 | ------- 27 | torch.Tensor 28 | The computed contrastive loss value. 29 | 30 | Notes 31 | ----- 32 | This function takes the two outputs `output1` and `output2` of the siamese network, 33 | along with the corresponding `label` indicating whether the outputs are similar (1) or not (0). 34 | The contrastive loss is computed based on the Euclidean distance between the outputs and the label, 35 | and the computed loss value is returned. 36 | """ 37 | 38 | euclidean = nn.functional.pairwise_distance(output1, output2) 39 | positive_distance = torch.pow(euclidean, 2) 40 | negative_distance = torch.pow(torch.clamp(self.margin - euclidean, min=0.0), 2) 41 | loss = torch.mean( 42 | (label * positive_distance) + ((1 - label) * negative_distance) 43 | ) 44 | return loss 45 | -------------------------------------------------------------------------------- /data/Reuters/make_reuters.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | 5 | 6 | # from dec (https://github.com/piiswrong/dec/tree/master/dec) 7 | def save_hdf5(X, y, name): 8 | import h5py 9 | with h5py.File('./{}.h5'.format(name), 'w') as f: 10 | f['data'] = X 11 | f['labels'] = y 12 | 13 | 14 | def make_reuters_data(): 15 | np.random.seed(1234) 16 | random.seed(1234) 17 | from sklearn.feature_extraction.text import CountVectorizer 18 | did_to_cat = {} 19 | cat_list = ['CCAT', 'GCAT', 'MCAT', 'ECAT'] 20 | with open('../Reuters/rcv1-v2.topics.qrels') as fin: 21 | for line in fin.readlines(): 22 | line = line.strip().split(' ') 23 | cat = line[0] 24 | did = int(line[1]) 25 | if cat in cat_list: 26 | did_to_cat[did] = did_to_cat.get(did, []) + [cat] 27 | for did in list(did_to_cat): 28 | if len(did_to_cat[did]) > 1: 29 | del did_to_cat[did] 30 | 31 | dat_list = ['lyrl2004_tokens_test_pt0.dat', 32 | 'lyrl2004_tokens_test_pt1.dat', 33 | 'lyrl2004_tokens_test_pt2.dat', 34 | 'lyrl2004_tokens_test_pt3.dat', 35 | 'lyrl2004_tokens_train.dat'] 36 | data = [] 37 | target = [] 38 | cat_to_cid = {'CCAT': 0, 'GCAT': 1, 'MCAT': 2, 'ECAT': 3} 39 | del did 40 | for dat in dat_list: 41 | with open('../Reuters/' + dat) as fin: 42 | for line in fin.readlines(): 43 | if line.startswith('.I'): 44 | if 'did' in locals(): 45 | assert doc != '' 46 | if did in did_to_cat: 47 | data.append(doc) 48 | target.append(cat_to_cid[did_to_cat[did][0]]) 49 | did = int(line.strip().split(' ')[1]) 50 | doc = '' 51 | elif line.startswith('.W'): 52 | assert doc == '' 53 | else: 54 | doc += line 55 | 56 | assert len(data) == len(did_to_cat) 57 | 58 | X = CountVectorizer(dtype=np.float64, max_features=2000).fit_transform(data) 59 | Y = np.asarray(target) 60 | 61 | from sklearn.feature_extraction.text import TfidfTransformer 62 | X = TfidfTransformer(norm='l2', sublinear_tf=True).fit_transform(X) 63 | X = np.asarray(X.todense()) * np.sqrt(X.shape[1]) 
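    # Note: TfidfTransformer(norm='l2') above normalizes each row to unit L2 norm,
    # so multiplying by np.sqrt(X.shape[1]) rescales every row to a squared norm
    # equal to the feature count (max_features=2000 above).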
64 | 
65 |     p = np.random.permutation(X.shape[0])
66 |     X = X[p]
67 |     Y = Y[p]
68 | 
69 |     N = X.shape[0]
70 |     save_hdf5(X[:int(N * 4 / 5)], Y[:int(N * 4 / 5)], 'reutersidf_train')
71 |     save_hdf5(X[int(N * 4 / 5):N], Y[int(N * 4 / 5):N], 'reutersidf_test')
72 |     save_hdf5(X[:N], Y[:N], 'reutersidf_total')
73 | 
74 | 
75 | if __name__ == '__main__':
76 |     make_reuters_data()
77 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SpectralNet
2 | 
3 | 

4 | 5 | 6 | SpectralNet is a Python package that performs spectral clustering with deep neural networks.

7 | This package is based on the following paper - [SpectralNet](https://openreview.net/pdf?id=HJ_aoCyRZ)
8 | 
9 | ## Installation
10 | 
11 | You can install the latest package version via
12 | 
13 | ```bash
14 | pip install spectralnet
15 | ```
16 | 
17 | ## Usage
18 | 
19 | ### Clustering
20 | 
21 | The basic functionality is quite intuitive and easy to use, e.g.,
22 | 
23 | ```python
24 | from spectralnet import SpectralNet
25 | 
26 | spectralnet = SpectralNet(n_clusters=10)
27 | spectralnet.fit(X)  # X is the dataset and it should be a torch.Tensor
28 | cluster_assignments = spectralnet.predict(X)  # Get the final assignments to clusters
29 | ```
30 | 
31 | If you have labels for your dataset and you want to measure ACC and NMI, you can do the following:
32 | 
33 | ```python
34 | from spectralnet import SpectralNet
35 | from spectralnet import Metrics
36 | import numpy as np
37 | 
38 | spectralnet = SpectralNet(n_clusters=2)
39 | spectralnet.fit(X, y)  # X is the dataset and it should be a torch.Tensor
40 | cluster_assignments = spectralnet.predict(X)  # Get the final assignments to clusters
41 | 
42 | y = y.detach().cpu().numpy()  # In case your labels are of torch.Tensor type.
43 | acc_score = Metrics.acc_score(cluster_assignments, y, n_clusters=2)
44 | nmi_score = Metrics.nmi_score(cluster_assignments, y)
45 | print(f"ACC: {np.round(acc_score, 3)}")
46 | print(f"NMI: {np.round(nmi_score, 3)}")
47 | ```
48 | 
49 | You can read the code docs for more information and functionality.
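### Dimensionality reduction

SpectralNet can also be used as a dimensionality-reduction technique through the `SpectralReduction` class. A minimal sketch, following `examples/reduce_mnist.py` (note that the last entry of `spectral_hiddens` must equal `n_components`):

```python
import torch
from spectralnet import SpectralReduction

spectralreduction = SpectralReduction(
    n_components=3,
    should_use_ae=True,
    should_use_siamese=True,
    spectral_hiddens=[512, 512, 2048, 3],
)

X_new = spectralreduction.fit_transform(X)  # X is a torch.Tensor
spectralreduction.visualize(X_new, y, n_components=2)  # y may be None
```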
50 | 51 | #### Running examples 52 | 53 | In order to run the model on twomoons or MNIST datasets, you should first cd to the examples folder and then run:
54 | `python3 cluster_twomoons.py`
55 | or
56 | `python3 cluster_mnist.py`
57 | 
58 | ## Citation
59 | 
60 | ```
61 | 
62 | @inproceedings{shaham2018,
63 |   author = {Uri Shaham and Kelly Stanton and Henri Li and Boaz Nadler and Ronen Basri and Yuval Kluger},
64 |   title = {SpectralNet: Spectral Clustering Using Deep Neural Networks},
65 |   booktitle = {Proc. ICLR 2018},
66 |   year = {2018}
67 | }
68 | 
69 | ```
70 | 
--------------------------------------------------------------------------------
/src/spectralnet/_models/_spectralnet_model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | import torch.nn as nn
4 | 
5 | 
6 | class SpectralNetModel(nn.Module):
7 |     def __init__(self, architecture: dict, input_dim: int):
8 |         super(SpectralNetModel, self).__init__()
9 |         self.architecture = architecture
10 |         self.layers = nn.ModuleList()
11 |         self.input_dim = input_dim
12 | 
13 |         current_dim = self.input_dim
14 |         for i, layer in enumerate(self.architecture):
15 |             next_dim = layer
16 |             if i == len(self.architecture) - 1:
17 |                 self.layers.append(
18 |                     nn.Sequential(nn.Linear(current_dim, next_dim), nn.Tanh())
19 |                 )
20 |             else:
21 |                 self.layers.append(
22 |                     nn.Sequential(nn.Linear(current_dim, next_dim), nn.LeakyReLU())
23 |                 )
24 |             current_dim = next_dim
25 | 
26 |     def _make_orthonorm_weights(self, Y: torch.Tensor) -> torch.Tensor:
27 |         """
28 |         Orthonormalize the output of the network using QR decomposition.
29 | 
30 |         Parameters
31 |         ----------
32 |         Y : torch.Tensor
33 |             The output of the network.
34 | 
35 |         Returns
36 |         -------
37 |         torch.Tensor
38 |             The orthonormalization weights.
39 | 
40 |         Notes
41 |         -----
42 |         This function applies QR decomposition to orthonormalize the output (`Y`) of the network.
43 |         The inverse of the R matrix, scaled by sqrt(m), is returned as the orthonormalization weights.
44 |         """
45 | 
46 |         m = Y.shape[0]
47 |         _, R = torch.linalg.qr(Y)
48 |         orthonorm_weights = np.sqrt(m) * torch.inverse(R)
49 |         return orthonorm_weights
50 | 
51 |     def forward(
52 |         self, x: torch.Tensor, should_update_orth_weights: bool = True
53 |     ) -> torch.Tensor:
54 |         """
55 |         Perform the forward pass of the model.
56 | 
57 |         Parameters
58 |         ----------
59 |         x : torch.Tensor
60 |             The input tensor.
61 |         should_update_orth_weights : bool, optional
62 |             Whether to update the orthonormalization weights using QR decomposition or not.
63 | 
64 |         Returns
65 |         -------
66 |         torch.Tensor
67 |             The output tensor.
68 | 
69 |         Notes
70 |         -----
71 |         This function takes an input tensor `x` and computes the forward pass of the model.
72 |         If `should_update_orth_weights` is set to True, the orthonormalization weights are updated
73 |         using the QR decomposition. The output tensor is returned.
74 | """ 75 | 76 | for layer in self.layers: 77 | x = layer(x) 78 | 79 | Y_tilde = x 80 | if should_update_orth_weights: 81 | self.orthonorm_weights = self._make_orthonorm_weights(Y_tilde) 82 | 83 | Y = Y_tilde @ self.orthonorm_weights 84 | return Y 85 | -------------------------------------------------------------------------------- /src/spectralnet/_metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sklearn.metrics as metrics 3 | 4 | from munkres import Munkres 5 | from sklearn.metrics import normalized_mutual_info_score as nmi 6 | 7 | from spectralnet._utils import * 8 | 9 | 10 | class Metrics: 11 | @staticmethod 12 | def acc_score( 13 | cluster_assignments: np.ndarray, y: np.ndarray, n_clusters: int 14 | ) -> float: 15 | """ 16 | Compute the accuracy score of the clustering algorithm. 17 | 18 | Parameters 19 | ---------- 20 | cluster_assignments : np.ndarray 21 | Cluster assignments for each data point. 22 | y : np.ndarray 23 | Ground truth labels. 24 | n_clusters : int 25 | Number of clusters. 26 | 27 | Returns 28 | ------- 29 | float 30 | The computed accuracy score. 31 | 32 | Notes 33 | ----- 34 | This function takes the `cluster_assignments` which represent the assigned clusters for each data point, 35 | the ground truth labels `y`, and the number of clusters `n_clusters`. It computes the accuracy score of the 36 | clustering algorithm by comparing the cluster assignments with the ground truth labels. The accuracy score 37 | is returned as a floating-point value. 38 | """ 39 | 40 | confusion_matrix = metrics.confusion_matrix(y, cluster_assignments, labels=None) 41 | cost_matrix = calculate_cost_matrix(confusion_matrix, n_clusters=n_clusters) 42 | indices = Munkres().compute(cost_matrix) 43 | kmeans_to_true_cluster_labels = get_cluster_labels_from_indices(indices) 44 | y_pred = kmeans_to_true_cluster_labels[cluster_assignments] 45 | print(metrics.confusion_matrix(y, y_pred)) 46 | accuracy = np.mean(y_pred == y) 47 | return accuracy 48 | 49 | @staticmethod 50 | def nmi_score(cluster_assignments: np.ndarray, y: np.ndarray) -> float: 51 | """ 52 | Compute the normalized mutual information score of the clustering algorithm. 53 | 54 | Parameters 55 | ---------- 56 | cluster_assignments : np.ndarray 57 | Cluster assignments for each data point. 58 | y : np.ndarray 59 | Ground truth labels. 60 | 61 | Returns 62 | ------- 63 | float 64 | The computed normalized mutual information score. 65 | 66 | Notes 67 | ----- 68 | This function takes the `cluster_assignments` which represent the assigned clusters for each data point 69 | and the ground truth labels `y`. It computes the normalized mutual information (NMI) score of the clustering 70 | algorithm. NMI measures the mutual dependence between the cluster assignments and the ground truth labels, 71 | normalized by the entropy of both variables. The NMI score ranges between 0 and 1, where a higher score 72 | indicates a better clustering performance. The computed NMI score is returned as a floating-point value. 
73 | """ 74 | return nmi(cluster_assignments, y) 75 | -------------------------------------------------------------------------------- /examples/data.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | import torch 3 | import numpy as np 4 | import scipy.io 5 | 6 | 7 | from torch.utils.data import Dataset, Subset 8 | from sklearn.datasets import make_moons 9 | from torchvision import datasets, transforms 10 | from sklearn.preprocessing import StandardScaler 11 | from sklearn.model_selection import train_test_split 12 | 13 | 14 | def load_mnist() -> tuple: 15 | tensor_transform = transforms.Compose([transforms.ToTensor()]) 16 | train_set = datasets.MNIST( 17 | root="../data", train=True, download=True, transform=tensor_transform 18 | ) 19 | test_set = datasets.MNIST( 20 | root="../data", train=False, download=True, transform=tensor_transform 21 | ) 22 | 23 | x_train, y_train = zip(*train_set) 24 | x_train, y_train = torch.cat(x_train), torch.Tensor(y_train) 25 | x_test, y_test = zip(*test_set) 26 | x_test, y_test = torch.cat(x_test), torch.Tensor(y_test) 27 | 28 | return x_train, y_train, x_test, y_test 29 | 30 | 31 | def load_twomoon() -> tuple: 32 | data, y = make_moons(n_samples=7000, shuffle=True, noise=0.075, random_state=None) 33 | scaler = StandardScaler() 34 | data = scaler.fit_transform(data) 35 | x_train, x_test, y_train, y_test = train_test_split( 36 | data, y, test_size=0.33, random_state=42 37 | ) 38 | x_train, x_test = torch.Tensor(x_train), torch.Tensor(x_test) 39 | y_train, y_test = torch.Tensor(y_train), torch.Tensor(y_test) 40 | return x_train, y_train, x_test, y_test 41 | 42 | 43 | def load_reuters() -> tuple: 44 | with h5py.File("../data/Reuters/reutersidf_total.h5", "r") as f: 45 | x = np.asarray(f.get("data"), dtype="float32") 46 | y = np.asarray(f.get("labels"), dtype="float32") 47 | 48 | n_train = int(0.9 * len(x)) 49 | x_train, x_test = x[:n_train], x[n_train:] 50 | y_train, y_test = y[:n_train], y[n_train:] 51 | 52 | x_train, x_test = torch.from_numpy(x_train), torch.from_numpy(x_test) 53 | y_train, y_test = torch.from_numpy(y_train), torch.from_numpy(y_test) 54 | 55 | return x_train, y_train, x_test, y_test 56 | 57 | 58 | def load_from_path(dpath: str, lpath: str = None) -> tuple: 59 | X = np.loadtxt(dpath, delimiter=",", dtype=np.float32) 60 | n_train = int(0.9 * len(X)) 61 | 62 | x_train, x_test = X[:n_train], X[n_train:] 63 | x_train, x_test = torch.from_numpy(x_train), torch.from_numpy(x_test) 64 | 65 | if lpath is not None: 66 | y = np.loadtxt(lpath, delimiter=",", dtype=np.float32) 67 | y_train, y_test = y[:n_train], y[n_train:] 68 | y_train, y_test = torch.from_numpy(y_train), torch.from_numpy(y_test) 69 | 70 | else: 71 | y_train, y_test = None, None 72 | 73 | return x_train, y_train, x_test, y_test 74 | 75 | 76 | def load_data(dataset: str) -> tuple: 77 | """ 78 | This function loads the dataset specified in the config file. 79 | 80 | 81 | Args: 82 | dataset (str or dictionary): In case you want to load your own dataset, 83 | you should specify the path to the data (and label if applicable) 84 | files in the config file in a dictionary fashion under the key "dataset". 85 | 86 | Raises: 87 | ValueError: If the dataset is not found in the config file. 88 | 89 | Returns: 90 | tuple: A tuple containing the train and test data and labels. 
91 | """ 92 | 93 | if dataset == "mnist": 94 | x_train, y_train, x_test, y_test = load_mnist() 95 | elif dataset == "twomoons": 96 | x_train, y_train, x_test, y_test = load_twomoon() 97 | elif dataset == "reuters": 98 | x_train, y_train, x_test, y_test = load_reuters() 99 | else: 100 | try: 101 | data_path = dataset["dpath"] 102 | if "lpath" in dataset: 103 | label_path = dataset["lpath"] 104 | else: 105 | label_path = None 106 | except: 107 | raise ValueError("Could not find dataset path. Check your config file.") 108 | x_train, y_train, x_test, y_test = load_from_path(data_path, label_path) 109 | 110 | return x_train, x_test, y_train, y_test 111 | -------------------------------------------------------------------------------- /src/spectralnet/_trainers/_ae_trainer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import torch.optim as optim 5 | 6 | from tqdm import trange 7 | from ._trainer import Trainer 8 | from .._models import AEModel 9 | from torch.utils.data import DataLoader, random_split 10 | 11 | 12 | class AETrainer: 13 | def __init__(self, config: dict, device: torch.device): 14 | self.device = device 15 | self.ae_config = config 16 | self.lr = self.ae_config["lr"] 17 | self.epochs = self.ae_config["epochs"] 18 | self.min_lr = self.ae_config["min_lr"] 19 | self.lr_decay = self.ae_config["lr_decay"] 20 | self.patience = self.ae_config["patience"] 21 | self.architecture = self.ae_config["hiddens"] 22 | self.batch_size = self.ae_config["batch_size"] 23 | self.weights_dir = "spectralnet/_trainers/weights" 24 | self.weights_path = "spectralnet/_trainers/weights/ae_weights.pth" 25 | if not os.path.exists(self.weights_dir): 26 | os.makedirs(self.weights_dir) 27 | 28 | def train(self, X: torch.Tensor) -> AEModel: 29 | self.X = X.view(X.size(0), -1) 30 | self.criterion = nn.MSELoss() 31 | 32 | self.ae_net = AEModel(self.architecture, input_dim=self.X.shape[1]).to( 33 | self.device 34 | ) 35 | 36 | self.optimizer = optim.Adam(self.ae_net.parameters(), lr=self.lr) 37 | 38 | self.scheduler = optim.lr_scheduler.ReduceLROnPlateau( 39 | self.optimizer, mode="min", factor=self.lr_decay, patience=self.patience 40 | ) 41 | 42 | if os.path.exists(self.weights_path): 43 | self.ae_net.load_state_dict(torch.load(self.weights_path)) 44 | return self.ae_net 45 | 46 | train_loader, valid_loader = self._get_data_loader() 47 | 48 | print("Training Autoencoder:") 49 | t = trange(self.epochs, leave=True) 50 | for epoch in t: 51 | train_loss = 0.0 52 | for batch_x in train_loader: 53 | batch_x = batch_x.to(self.device) 54 | batch_x = batch_x.view(batch_x.size(0), -1) 55 | self.optimizer.zero_grad() 56 | output = self.ae_net(batch_x) 57 | loss = self.criterion(output, batch_x) 58 | loss.backward() 59 | self.optimizer.step() 60 | train_loss += loss.item() 61 | 62 | train_loss /= len(train_loader) 63 | valid_loss = self.validate(valid_loader) 64 | self.scheduler.step(valid_loss) 65 | current_lr = self.optimizer.param_groups[0]["lr"] 66 | 67 | if current_lr <= self.min_lr: 68 | break 69 | 70 | t.set_description( 71 | "Train Loss: {:.7f}, Valid Loss: {:.7f}, LR: {:.6f}".format( 72 | train_loss, valid_loss, current_lr 73 | ) 74 | ) 75 | t.refresh() 76 | 77 | torch.save(self.ae_net.state_dict(), self.weights_path) 78 | return self.ae_net 79 | 80 | def validate(self, valid_loader: DataLoader) -> float: 81 | self.ae_net.eval() 82 | valid_loss = 0.0 83 | with torch.no_grad(): 84 | for batch_x in valid_loader: 85 | 
batch_x = batch_x.to(self.device) 86 | batch_x = batch_x.view(batch_x.size(0), -1) 87 | output = self.ae_net(batch_x) 88 | loss = self.criterion(output, batch_x) 89 | valid_loss += loss.item() 90 | valid_loss /= len(valid_loader) 91 | return valid_loss 92 | 93 | def embed(self, X: torch.Tensor) -> torch.Tensor: 94 | print("Embedding data ...") 95 | self.ae_net.eval() 96 | with torch.no_grad(): 97 | X = X.view(X.size(0), -1) 98 | encoded_data = self.ae_net.encode(X.to(self.device)) 99 | return encoded_data 100 | 101 | def _get_data_loader(self) -> tuple: 102 | trainset_len = int(len(self.X) * 0.9) 103 | validset_len = len(self.X) - trainset_len 104 | trainset, validset = random_split(self.X, [trainset_len, validset_len]) 105 | train_loader = DataLoader( 106 | trainset, batch_size=self.ae_config["batch_size"], shuffle=True 107 | ) 108 | valid_loader = DataLoader( 109 | validset, batch_size=self.ae_config["batch_size"], shuffle=False 110 | ) 111 | return train_loader, valid_loader 112 | -------------------------------------------------------------------------------- /src/spectralnet/_trainers/_spectralnet_trainer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.optim as optim 4 | from torch.utils.data import DataLoader, random_split, TensorDataset 5 | from sklearn.neighbors import kneighbors_graph 6 | from tqdm import trange 7 | from spectralnet._utils import * 8 | from ._trainer import Trainer 9 | from .._losses import SpectralNetLoss 10 | from .._models import SpectralNetModel 11 | 12 | 13 | class SpectralTrainer: 14 | def __init__(self, config: dict, device: torch.device, is_sparse: bool = False): 15 | """ 16 | Initialize the SpectralNet model trainer. 17 | 18 | Parameters 19 | ---------- 20 | config : dict 21 | The configuration dictionary. 22 | device : torch.device 23 | The device to use for training. 24 | is_sparse : bool, optional 25 | Whether the graph-laplacian obtained from a mini-batch is sparse or not. 26 | If True, the batch is constructed by taking 1/5 of the original random batch 27 | and adding 4 of its nearest neighbors to each sample. Defaults to False. 28 | 29 | Notes 30 | ----- 31 | This class is responsible for training the SpectralNet model. 32 | The configuration dictionary (`config`) contains various settings for training. 33 | The device (`device`) specifies the device (CPU or GPU) to be used for training. 34 | The `is_sparse` flag is used to determine the construction of the batch when the graph-laplacian is sparse. 35 | """ 36 | 37 | self.device = device 38 | self.is_sparse = is_sparse 39 | self.spectral_config = config 40 | self.lr = self.spectral_config["lr"] 41 | self.n_nbg = self.spectral_config["n_nbg"] 42 | self.min_lr = self.spectral_config["min_lr"] 43 | self.epochs = self.spectral_config["epochs"] 44 | self.scale_k = self.spectral_config["scale_k"] 45 | self.lr_decay = self.spectral_config["lr_decay"] 46 | self.patience = self.spectral_config["patience"] 47 | self.architecture = self.spectral_config["hiddens"] 48 | self.batch_size = self.spectral_config["batch_size"] 49 | self.is_local_scale = self.spectral_config["is_local_scale"] 50 | 51 | def train( 52 | self, X: torch.Tensor, y: torch.Tensor, siamese_net: nn.Module = None 53 | ) -> SpectralNetModel: 54 | """ 55 | Train the SpectralNet model. 56 | 57 | Parameters 58 | ---------- 59 | X : torch.Tensor 60 | The dataset to train on. 
61 | y : torch.Tensor, optional 62 | The labels of the dataset in case there are any. 63 | siamese_net : nn.Module, optional 64 | The siamese network to use for computing the affinity matrix. 65 | 66 | Returns 67 | ------- 68 | SpectralNetModel 69 | The trained SpectralNet model. 70 | 71 | Notes 72 | ----- 73 | This function trains the SpectralNet model using the provided dataset (`X`) and labels (`y`). 74 | If labels are not provided (`y` is None), unsupervised training is performed. 75 | The siamese network (`siamese_net`) is an optional parameter used for computing the affinity matrix. 76 | The trained SpectralNet model is returned as the output. 77 | """ 78 | 79 | self.X = X.view(X.size(0), -1) 80 | self.y = y 81 | self.counter = 0 82 | self.siamese_net = siamese_net 83 | self.criterion = SpectralNetLoss() 84 | self.spectral_net = SpectralNetModel( 85 | self.architecture, input_dim=self.X.shape[1] 86 | ).to(self.device) 87 | 88 | self.optimizer = optim.Adam(self.spectral_net.parameters(), lr=self.lr) 89 | 90 | self.scheduler = optim.lr_scheduler.ReduceLROnPlateau( 91 | self.optimizer, mode="min", factor=self.lr_decay, patience=self.patience 92 | ) 93 | 94 | train_loader, ortho_loader, valid_loader = self._get_data_loader() 95 | 96 | print("Training SpectralNet:") 97 | t = trange(self.epochs, leave=True) 98 | for epoch in t: 99 | train_loss = 0.0 100 | for (X_grad, _), (X_orth, _) in zip(train_loader, ortho_loader): 101 | X_grad = X_grad.to(device=self.device) 102 | X_grad = X_grad.view(X_grad.size(0), -1) 103 | X_orth = X_orth.to(device=self.device) 104 | X_orth = X_orth.view(X_orth.size(0), -1) 105 | 106 | if self.is_sparse: 107 | X_grad = make_batch_for_sparse_grapsh(X_grad) 108 | X_orth = make_batch_for_sparse_grapsh(X_orth) 109 | 110 | # Orthogonalization step 111 | self.spectral_net.eval() 112 | self.spectral_net(X_orth, should_update_orth_weights=True) 113 | 114 | # Gradient step 115 | self.spectral_net.train() 116 | self.optimizer.zero_grad() 117 | 118 | Y = self.spectral_net(X_grad, should_update_orth_weights=False) 119 | if self.siamese_net is not None: 120 | with torch.no_grad(): 121 | X_grad = self.siamese_net.forward_once(X_grad) 122 | 123 | W = self._get_affinity_matrix(X_grad) 124 | 125 | loss = self.criterion(W, Y) 126 | loss.backward() 127 | self.optimizer.step() 128 | train_loss += loss.item() 129 | 130 | train_loss /= len(train_loader) 131 | 132 | # Validation step 133 | valid_loss = self.validate(valid_loader) 134 | self.scheduler.step(valid_loss) 135 | 136 | current_lr = self.optimizer.param_groups[0]["lr"] 137 | if current_lr <= self.spectral_config["min_lr"]: 138 | break 139 | t.set_description( 140 | "Train Loss: {:.7f}, Valid Loss: {:.7f}, LR: {:.6f}".format( 141 | train_loss, valid_loss, current_lr 142 | ) 143 | ) 144 | t.refresh() 145 | 146 | return self.spectral_net 147 | 148 | def validate(self, valid_loader: DataLoader) -> float: 149 | valid_loss = 0.0 150 | self.spectral_net.eval() 151 | with torch.no_grad(): 152 | for batch in valid_loader: 153 | X, y = batch 154 | X, y = X.to(self.device), y.to(self.device) 155 | 156 | if self.is_sparse: 157 | X = make_batch_for_sparse_grapsh(X) 158 | 159 | Y = self.spectral_net(X, should_update_orth_weights=False) 160 | with torch.no_grad(): 161 | if self.siamese_net is not None: 162 | X = self.siamese_net.forward_once(X) 163 | 164 | W = self._get_affinity_matrix(X) 165 | 166 | loss = self.criterion(W, Y) 167 | valid_loss += loss.item() 168 | 169 | valid_loss /= len(valid_loader) 170 | return valid_loss 171 | 172 | def 
_get_affinity_matrix(self, X: torch.Tensor) -> torch.Tensor: 173 | """ 174 | This function computes the affinity matrix W using the Gaussian kernel. 175 | 176 | Args: 177 | X (torch.Tensor): The input data 178 | 179 | Returns: 180 | torch.Tensor: The affinity matrix W 181 | """ 182 | 183 | is_local = self.is_local_scale 184 | n_neighbors = self.n_nbg 185 | scale_k = self.scale_k 186 | Dx = torch.cdist(X, X) 187 | Dis, indices = get_nearest_neighbors(X, k=n_neighbors + 1) 188 | scale = compute_scale(Dis, k=scale_k, is_local=is_local) 189 | W = get_gaussian_kernel( 190 | Dx, scale, indices, device=self.device, is_local=is_local 191 | ) 192 | return W 193 | 194 | def _get_data_loader(self) -> tuple: 195 | """ 196 | This function returns the data loaders for training, validation and testing. 197 | 198 | Returns: 199 | tuple: The data loaders 200 | """ 201 | if self.y is None: 202 | self.y = torch.zeros(len(self.X)) 203 | train_size = int(0.9 * len(self.X)) 204 | valid_size = len(self.X) - train_size 205 | dataset = TensorDataset(self.X, self.y) 206 | train_dataset, valid_dataset = random_split(dataset, [train_size, valid_size]) 207 | train_loader = DataLoader( 208 | train_dataset, batch_size=self.batch_size, shuffle=True 209 | ) 210 | ortho_loader = DataLoader( 211 | train_dataset, batch_size=self.batch_size, shuffle=True 212 | ) 213 | valid_loader = DataLoader( 214 | valid_dataset, batch_size=self.batch_size, shuffle=False 215 | ) 216 | return train_loader, ortho_loader, valid_loader 217 | -------------------------------------------------------------------------------- /src/spectralnet/_trainers/_siamesenet_trainer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import numpy as np 4 | import torch.optim as optim 5 | 6 | from tqdm import trange 7 | from annoy import AnnoyIndex 8 | from sklearn.neighbors import NearestNeighbors 9 | from torch.utils.data import DataLoader, random_split 10 | 11 | from ._trainer import Trainer 12 | from .._models import SiameseNetModel 13 | from .._losses import ContrastiveLoss 14 | 15 | 16 | class SiameseDataset: 17 | def __init__(self, pairs: list): 18 | """ 19 | Initializes a Siamese dataset. 20 | 21 | Parameters 22 | ---------- 23 | pairs : list 24 | A list of tuples containing the pairs of data 25 | and their labels. 
26 | """ 27 | self.pairs = pairs 28 | 29 | def __getitem__(self, index: int): 30 | x1 = self.pairs[index][0] 31 | x2 = self.pairs[index][1] 32 | label = self.pairs[index][2] 33 | return x1, x2, label 34 | 35 | def __len__(self): 36 | return len(self.pairs) 37 | 38 | 39 | class SiameseTrainer: 40 | def __init__(self, config: dict, device: torch.device): 41 | self.device = device 42 | self.siamese_config = config 43 | self.lr = self.siamese_config["lr"] 44 | self.n_nbg = self.siamese_config["n_nbg"] 45 | self.min_lr = self.siamese_config["min_lr"] 46 | self.epochs = self.siamese_config["epochs"] 47 | self.lr_decay = self.siamese_config["lr_decay"] 48 | self.patience = self.siamese_config["patience"] 49 | self.architecture = self.siamese_config["hiddens"] 50 | self.batch_size = self.siamese_config["batch_size"] 51 | self.use_approx = self.siamese_config["use_approx"] 52 | self.weights_path = "spectralnet/_trainers/weights/siamese_weights.pth" 53 | 54 | def train(self, X: torch.Tensor) -> SiameseNetModel: 55 | self.X = X.view(X.size(0), -1) 56 | # self.X = X 57 | 58 | self.criterion = ContrastiveLoss() 59 | self.siamese_net = SiameseNetModel( 60 | self.architecture, input_dim=self.X.shape[1] 61 | ).to(self.device) 62 | 63 | self.optimizer = optim.Adam(self.siamese_net.parameters(), lr=self.lr) 64 | 65 | self.scheduler = optim.lr_scheduler.ReduceLROnPlateau( 66 | self.optimizer, mode="min", factor=self.lr_decay, patience=self.patience 67 | ) 68 | 69 | if os.path.exists(self.weights_path): 70 | self.siamese_net.load_state_dict(torch.load(self.weights_path)) 71 | return self.siamese_net 72 | 73 | train_loader, valid_loader = self._get_data_loader() 74 | 75 | print("Training Siamese Network:") 76 | t = trange(self.epochs, leave=True) 77 | self.siamese_net.train() 78 | for epoch in t: 79 | train_loss = 0.0 80 | for x1, x2, label in train_loader: 81 | x1 = x1.to(self.device) 82 | x1 = x1.view(x1.size(0), -1) 83 | x2 = x2.to(self.device) 84 | x2 = x2.view(x2.size(0), -1) 85 | label = label.to(self.device) 86 | self.optimizer.zero_grad() 87 | output1, output2 = self.siamese_net(x1, x2) 88 | loss = self.criterion(output1, output2, label) 89 | loss.backward() 90 | self.optimizer.step() 91 | train_loss += loss.item() 92 | 93 | train_loss /= len(train_loader) 94 | valid_loss = self.validate(valid_loader) 95 | self.scheduler.step(valid_loss) 96 | current_lr = self.optimizer.param_groups[0]["lr"] 97 | 98 | if current_lr <= self.min_lr: 99 | break 100 | t.set_description( 101 | "Train Loss: {:.7f}, Valid Loss: {:.7f}, LR: {:.6f}".format( 102 | train_loss, valid_loss, current_lr 103 | ) 104 | ) 105 | t.refresh() 106 | 107 | torch.save(self.siamese_net.state_dict(), self.weights_path) 108 | return self.siamese_net 109 | 110 | def validate(self, valid_loader: DataLoader) -> float: 111 | valid_loss = 0.0 112 | self.siamese_net.eval() 113 | with torch.no_grad(): 114 | for x1, x2, label in valid_loader: 115 | x1 = x1.to(self.device) 116 | x1 = x1.view(x1.size(0), -1) 117 | x2 = x2.to(self.device) 118 | x2 = x2.view(x2.size(0), -1) 119 | label = label.to(self.device) 120 | output1, output2 = self.siamese_net(x1, x2) 121 | loss = self.criterion(output1, output2, label) 122 | valid_loss += loss.item() 123 | valid_loss /= len(valid_loader) 124 | return valid_loss 125 | 126 | def _get_knn_pairs(self) -> list: 127 | """Gets the pairs of data points to be used for training the siamese network. 
128 | 129 | Parameters 130 | ---------- 131 | None 132 | 133 | Returns 134 | ------- 135 | list 136 | A list of pairs of data points. 137 | 138 | Notes 139 | ----- 140 | The pairs are chosen such that each data point has n_neighbors positive pairs 141 | and n_neighbors negative pairs where the neighbors are chosen using KNN. 142 | """ 143 | 144 | pairs = [] 145 | X = self.X.detach().cpu().numpy() 146 | data_indices = np.arange(len(X)) 147 | n_neighbors = self.n_nbg 148 | nbrs = NearestNeighbors(n_neighbors=n_neighbors + 1, algorithm="ball_tree").fit( 149 | X 150 | ) 151 | _, neighbors_indices = nbrs.kneighbors(X) 152 | 153 | for i in range(len(X)): 154 | non_neighbors_indices = np.delete(data_indices, neighbors_indices[i]) 155 | non_neighbors_random_chosen_indices = np.random.choice( 156 | non_neighbors_indices, n_neighbors 157 | ) 158 | 159 | positive_pairs = [ 160 | [self.X[i], self.X[n], 1] 161 | for n in neighbors_indices[i][1 : n_neighbors + 1] 162 | ] 163 | negative_pairs = [ 164 | [self.X[i], self.X[n], 0] for n in non_neighbors_random_chosen_indices 165 | ] 166 | 167 | pairs += positive_pairs 168 | pairs += negative_pairs 169 | 170 | return pairs 171 | 172 | def _get_approx_nn_pairs(self) -> list: 173 | """Gets the pairs of data points to be used for training the siamese network. 174 | 175 | Parameters 176 | ---------- 177 | None 178 | 179 | Returns 180 | ------- 181 | list 182 | A list of pairs of data points. 183 | 184 | Notes 185 | ----- 186 | The pairs are chosen such that each data point has 1 neighbor from its nearest n_neighbors 187 | neighbors and 1 neighbor from the rest of the data points. The neighbors are chosen using 188 | approximate nearest neighbors using the Annoy library. 189 | """ 190 | 191 | pairs = [] 192 | n_samples = self.X.shape[0] 193 | n_neighbors = self.n_nbg 194 | indices = torch.randperm(self.X.shape[0])[:n_samples] 195 | x_train = self.X[indices] 196 | X_numpy = self.X[indices].detach().cpu().numpy() 197 | data_indices = np.arange(len(x_train)) 198 | 199 | ann = AnnoyIndex(X_numpy.shape[1], "euclidean") 200 | for i, x_ in enumerate(X_numpy): 201 | ann.add_item(i, x_) 202 | ann.build(50) 203 | 204 | neighbors_indices = np.empty((len(X_numpy), n_neighbors + 1)) 205 | for i in range(len(X_numpy)): 206 | nn_i = ann.get_nns_by_item(i, n_neighbors + 1, include_distances=False) 207 | neighbors_indices[i, :] = np.array(nn_i) 208 | neighbors_indices = neighbors_indices.astype(int) 209 | 210 | print("Building dataset for the siamese network ...") 211 | for i in range(len(X_numpy)): 212 | non_neighbors_indices = np.delete(data_indices, neighbors_indices[i]) 213 | 214 | neighbor_idx = np.random.choice(neighbors_indices[i][1:], 1) 215 | non_nbr_idx = np.random.choice(non_neighbors_indices, 1) 216 | 217 | positive_pairs = [[x_train[i], x_train[neighbor_idx], 1]] 218 | negative_pairs = [[x_train[i], x_train[non_nbr_idx], 0]] 219 | 220 | pairs += positive_pairs 221 | pairs += negative_pairs 222 | 223 | return pairs 224 | 225 | def _get_pairs(self) -> list: 226 | """Gets the pairs of data points to be used for training the siamese network. 227 | 228 | Parameters 229 | ---------- 230 | None 231 | 232 | Returns 233 | ------- 234 | list 235 | A list of pairs of data points. 236 | 237 | Notes 238 | ----- 239 | This method internally calls either _get_knn_pairs() or _get_approx_nn_pairs() based on the value 240 | of the 'use_approx' attribute. 
241 |         """
242 | 
243 |         should_use_approx = self.use_approx
244 |         if should_use_approx:
245 |             return self._get_approx_nn_pairs()
246 |         else:
247 |             return self._get_knn_pairs()
248 | 
249 |     def _get_data_loader(self) -> tuple:
250 |         """
251 |         Splits the data into train and validation sets and returns the corresponding data loaders.
252 | 
253 |         Parameters
254 |         ----------
255 |         None
256 | 
257 |         Returns
258 |         -------
259 |         tuple
260 |             A tuple containing the train and validation data loaders.
261 | 
262 |         Notes
263 |         -----
264 |         This function splits the data into train and validation sets and creates data loaders for them.
265 |         The train and validation sets are obtained by randomly splitting the siamese dataset.
266 |         The train and validation data loaders are created using DataLoader from the PyTorch library.
267 |         """
268 | 
269 |         pairs = self._get_pairs()
270 |         siamese_dataset = SiameseDataset(pairs)
271 |         siamese_trainset_len = int(len(siamese_dataset) * 0.9)
272 |         siamese_validset_len = len(siamese_dataset) - siamese_trainset_len
273 |         siamese_trainset, siamese_validset = random_split(
274 |             siamese_dataset, [siamese_trainset_len, siamese_validset_len]
275 |         )
276 |         siamese_trainloader = DataLoader(
277 |             siamese_trainset, batch_size=self.siamese_config["batch_size"], shuffle=True
278 |         )
279 |         siamese_validloader = DataLoader(
280 |             siamese_validset,
281 |             batch_size=self.siamese_config["batch_size"],
282 |             shuffle=False,
283 |         )
284 |         return siamese_trainloader, siamese_validloader
285 | 
--------------------------------------------------------------------------------
/src/spectralnet/_cluster.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | 
4 | from ._utils import *
5 | from sklearn.cluster import KMeans
6 | from ._trainers import SpectralTrainer, SiameseTrainer, AETrainer
7 | 
8 | 
9 | class SpectralNet:
10 |     def __init__(
11 |         self,
12 |         n_clusters: int,
13 |         should_use_ae: bool = False,
14 |         should_use_siamese: bool = False,
15 |         is_sparse_graph: bool = False,
16 |         ae_hiddens: list = [512, 512, 2048, 10],
17 |         ae_epochs: int = 40,
18 |         ae_lr: float = 1e-3,
19 |         ae_lr_decay: float = 0.1,
20 |         ae_min_lr: float = 1e-7,
21 |         ae_patience: int = 10,
22 |         ae_batch_size: int = 256,
23 |         siamese_hiddens: list = [1024, 1024, 512, 10],
24 |         siamese_epochs: int = 30,
25 |         siamese_lr: float = 1e-3,
26 |         siamese_lr_decay: float = 0.1,
27 |         siamese_min_lr: float = 1e-7,
28 |         siamese_patience: int = 10,
29 |         siamese_n_nbg: int = 2,
30 |         siamese_use_approx: bool = False,
31 |         siamese_batch_size: int = 128,
32 |         spectral_hiddens: list = [1024, 1024, 512, 10],
33 |         spectral_epochs: int = 30,
34 |         spectral_lr: float = 1e-3,
35 |         spectral_lr_decay: float = 0.1,
36 |         spectral_min_lr: float = 1e-8,
37 |         spectral_patience: int = 10,
38 |         spectral_batch_size: int = 1024,
39 |         spectral_n_nbg: int = 30,
40 |         spectral_scale_k: int = 15,
41 |         spectral_is_local_scale: bool = True,
42 |     ):
43 |         """SpectralNet is a class for implementing a deep learning model that performs spectral clustering.
44 |         This model optionally utilizes Autoencoders (AE) and Siamese networks for training.
45 | 
46 |         Parameters
47 |         ----------
48 |         n_clusters : int
49 |             The number of clusters to be generated by the SpectralNet algorithm.
50 |             Also used as the dimension of the projection subspace.
51 | 
52 |         should_use_ae : bool, optional (default=False)
53 |             Specifies whether to use the Autoencoder (AE) network as part of the training process.
54 | 
55 |         should_use_siamese : bool, optional (default=False)
56 |             Specifies whether to use the Siamese network as part of the training process.
57 | 
58 |         is_sparse_graph : bool, optional (default=False)
59 |             Specifies whether the graph Laplacian created from the data is sparse.
60 | 
61 |         ae_hiddens : list, optional (default=[512, 512, 2048, 10])
62 |             The number of hidden units in each layer of the Autoencoder network.
63 | 
64 |         ae_epochs : int, optional (default=40)
65 |             The number of epochs to train the Autoencoder network.
66 | 
67 |         ae_lr : float, optional (default=1e-3)
68 |             The learning rate for the Autoencoder network.
69 | 
70 |         ae_lr_decay : float, optional (default=0.1)
71 |             The learning rate decay factor for the Autoencoder network.
72 | 
73 |         ae_min_lr : float, optional (default=1e-7)
74 |             The minimum learning rate for the Autoencoder network.
75 | 
76 |         ae_patience : int, optional (default=10)
77 |             The number of epochs to wait before reducing the learning rate for the Autoencoder network.
78 | 
79 |         ae_batch_size : int, optional (default=256)
80 |             The batch size used during training of the Autoencoder network.
81 | 
82 |         siamese_hiddens : list, optional (default=[1024, 1024, 512, 10])
83 |             The number of hidden units in each layer of the Siamese network.
84 | 
85 |         siamese_epochs : int, optional (default=30)
86 |             The number of epochs to train the Siamese network.
87 | 
88 |         siamese_lr : float, optional (default=1e-3)
89 |             The learning rate for the Siamese network.
90 | 
91 |         siamese_lr_decay : float, optional (default=0.1)
92 |             The learning rate decay factor for the Siamese network.
93 | 
94 |         siamese_min_lr : float, optional (default=1e-7)
95 |             The minimum learning rate for the Siamese network.
96 | 
97 |         siamese_patience : int, optional (default=10)
98 |             The number of epochs to wait before reducing the learning rate for the Siamese network.
99 | 
100 |         siamese_n_nbg : int, optional (default=2)
101 |             The number of nearest neighbors to consider as 'positive' pairs by the Siamese network.
102 | 
103 |         siamese_use_approx : bool, optional (default=False)
104 |             Specifies whether to use Annoy instead of KNN for computing nearest neighbors,
105 |             particularly useful for large datasets.
106 | 
107 |         siamese_batch_size : int, optional (default=128)
108 |             The batch size used during training of the Siamese network.
109 | 
110 |         spectral_hiddens : list, optional (default=[1024, 1024, 512, 10])
111 |             The number of hidden units in each layer of the Spectral network.
112 | 
113 |         spectral_epochs : int, optional (default=30)
114 |             The number of epochs to train the Spectral network.
115 | 
116 |         spectral_lr : float, optional (default=1e-3)
117 |             The learning rate for the Spectral network.
118 | 
119 |         spectral_lr_decay : float, optional (default=0.1)
120 |             The learning rate decay factor for the Spectral network. The remaining ``spectral_*`` parameters (min_lr, patience, batch_size, n_nbg, scale_k, is_local_scale) control the optimizer schedule and the affinity-matrix construction, analogously to the parameters above."""
121 | 
122 |         self.n_clusters = n_clusters
123 |         self.should_use_ae = should_use_ae
124 |         self.should_use_siamese = should_use_siamese
125 |         self.is_sparse_graph = is_sparse_graph
126 |         self.ae_hiddens = ae_hiddens
127 |         self.ae_epochs = ae_epochs
128 |         self.ae_lr = ae_lr
129 |         self.ae_lr_decay = ae_lr_decay
130 |         self.ae_min_lr = ae_min_lr
131 |         self.ae_patience = ae_patience
132 |         self.ae_batch_size = ae_batch_size
133 |         self.siamese_hiddens = siamese_hiddens
134 |         self.siamese_epochs = siamese_epochs
135 |         self.siamese_lr = siamese_lr
136 |         self.siamese_lr_decay = siamese_lr_decay
137 |         self.siamese_min_lr = siamese_min_lr
138 |         self.siamese_patience = siamese_patience
139 |         self.siamese_n_nbg = siamese_n_nbg
140 |         self.siamese_use_approx = siamese_use_approx
141 |         self.siamese_batch_size = siamese_batch_size
142 |         self.spectral_hiddens = spectral_hiddens
143 |         self.spectral_epochs = spectral_epochs
144 |         self.spectral_lr = spectral_lr
145 |         self.spectral_lr_decay = spectral_lr_decay
146 |         self.spectral_min_lr = spectral_min_lr
147 |         self.spectral_patience = spectral_patience
148 |         self.spectral_n_nbg = spectral_n_nbg
149 |         self.spectral_scale_k = spectral_scale_k
150 |         self.spectral_is_local_scale = spectral_is_local_scale
151 |         self.spectral_batch_size = spectral_batch_size
152 |         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
153 | 
154 |         self._validate_spectral_hiddens()
155 | 
156 |     def _validate_spectral_hiddens(self):
157 |         """Validates the number of hidden units in each layer of the Spectral network."""
158 | 
159 |         if self.spectral_hiddens[-1] != self.n_clusters:
160 |             raise ValueError(
161 |                 "The number of units in the last layer of spectral_hiddens must be equal to the number of clusters or components."
162 |             )
163 | 
164 |     def fit(self, X: torch.Tensor, y: torch.Tensor = None):
165 |         """Performs the main training loop for the SpectralNet model.
166 | 
167 |         Parameters
168 |         ----------
169 |         X : torch.Tensor
170 |             Data to train the networks on.
171 | 
172 |         y : torch.Tensor, optional
173 |             Labels in case there are any. Defaults to None.
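
        Examples
        --------
        A minimal usage sketch, assuming the package exposes ``SpectralNet``
        at its root (the last entry of ``spectral_hiddens`` must equal
        ``n_clusters``):

        >>> import torch
        >>> from spectralnet import SpectralNet
        >>> X = torch.randn(1000, 16)  # placeholder data
        >>> model = SpectralNet(n_clusters=2, spectral_hiddens=[64, 64, 2])
        >>> model.fit(X)
        >>> assignments = model.predict(X)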
174 | """ 175 | self._X = X 176 | ae_config = { 177 | "hiddens": self.ae_hiddens, 178 | "epochs": self.ae_epochs, 179 | "lr": self.ae_lr, 180 | "lr_decay": self.ae_lr_decay, 181 | "min_lr": self.ae_min_lr, 182 | "patience": self.ae_patience, 183 | "batch_size": self.ae_batch_size, 184 | } 185 | 186 | siamese_config = { 187 | "hiddens": self.siamese_hiddens, 188 | "epochs": self.siamese_epochs, 189 | "lr": self.siamese_lr, 190 | "lr_decay": self.siamese_lr_decay, 191 | "min_lr": self.siamese_min_lr, 192 | "patience": self.siamese_patience, 193 | "n_nbg": self.siamese_n_nbg, 194 | "use_approx": self.siamese_use_approx, 195 | "batch_size": self.siamese_batch_size, 196 | } 197 | 198 | spectral_config = { 199 | "hiddens": self.spectral_hiddens, 200 | "epochs": self.spectral_epochs, 201 | "lr": self.spectral_lr, 202 | "lr_decay": self.spectral_lr_decay, 203 | "min_lr": self.spectral_min_lr, 204 | "patience": self.spectral_patience, 205 | "n_nbg": self.spectral_n_nbg, 206 | "scale_k": self.spectral_scale_k, 207 | "is_local_scale": self.spectral_is_local_scale, 208 | "batch_size": self.spectral_batch_size, 209 | } 210 | 211 | if self.should_use_ae: 212 | self.ae_trainer = AETrainer(config=ae_config, device=self.device) 213 | self.ae_net = self.ae_trainer.train(X) 214 | X = self.ae_trainer.embed(X) 215 | 216 | if self.should_use_siamese: 217 | self.siamese_trainer = SiameseTrainer( 218 | config=siamese_config, device=self.device 219 | ) 220 | self.siamese_net = self.siamese_trainer.train(X) 221 | else: 222 | self.siamese_net = None 223 | 224 | is_sparse = self.is_sparse_graph 225 | if is_sparse: 226 | build_ann(X) 227 | 228 | self.spectral_trainer = SpectralTrainer( 229 | config=spectral_config, device=self.device, is_sparse=is_sparse 230 | ) 231 | self.spec_net = self.spectral_trainer.train(X, y, self.siamese_net) 232 | 233 | def predict(self, X: torch.Tensor) -> np.ndarray: 234 | """Predicts the cluster assignments for the given data. 235 | 236 | Parameters 237 | ---------- 238 | X : torch.Tensor 239 | Data to be clustered. 240 | 241 | Returns 242 | ------- 243 | np.ndarray 244 | The cluster assignments for the given data. 245 | """ 246 | X = X.view(X.size(0), -1) 247 | X = X.to(self.device) 248 | 249 | with torch.no_grad(): 250 | if self.should_use_ae: 251 | X = self.ae_net.encode(X) 252 | self.embeddings_ = self.spec_net(X, should_update_orth_weights=False) 253 | self.embeddings_ = self.embeddings_.detach().cpu().numpy() 254 | 255 | cluster_assignments = self._get_clusters_by_kmeans(self.embeddings_) 256 | return cluster_assignments 257 | 258 | def get_random_batch(self, batch_size: int = 1024) -> tuple: 259 | """Get a batch of the input data. 260 | 261 | Parameters 262 | ---------- 263 | batch_size : int 264 | The size of the batch to use. 265 | 266 | Returns 267 | ------- 268 | tuple 269 | The raw batch and the encoded batch. 270 | 271 | """ 272 | permuted_indices = torch.randperm(batch_size) 273 | X_raw = self._X.view(self._X.size(0), -1) 274 | X_encoded = X_raw 275 | 276 | if self.should_use_ae: 277 | X_encoded = self.ae_trainer.embed(self._X) 278 | 279 | if self.should_use_siamese: 280 | X_encoded = self.siamese_net.forward_once(X_encoded) 281 | 282 | X_encoded = X_encoded[permuted_indices] 283 | X_raw = X_raw[permuted_indices] 284 | X_encoded = X_encoded.to(self.device) 285 | return X_raw, X_encoded 286 | 287 | def _get_clusters_by_kmeans(self, embeddings: np.ndarray) -> np.ndarray: 288 | """Performs k-means clustering on the spectral-embedding space. 
289 | 
290 |         Parameters
291 |         ----------
292 |         embeddings : np.ndarray
293 |             The spectral-embedding space.
294 | 
295 |         Returns
296 |         -------
297 |         np.ndarray
298 |             The cluster assignments for the given data.
299 |         """
300 | 
301 |         kmeans = KMeans(n_clusters=self.n_clusters, n_init=10).fit(embeddings)
302 |         cluster_assignments = kmeans.predict(embeddings)
303 |         return cluster_assignments
304 | 
--------------------------------------------------------------------------------
/src/spectralnet/_reduction.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | 
5 | from ._utils import *
6 | from ._cluster import SpectralNet
7 | from sklearn.cluster import KMeans
8 | from ._metrics import Metrics
9 | 
10 | 
11 | class SpectralReduction:
12 |     def __init__(
13 |         self,
14 |         n_components: int,
15 |         should_use_ae: bool = False,
16 |         should_use_siamese: bool = False,
17 |         is_sparse_graph: bool = False,
18 |         ae_hiddens: list = [512, 512, 2048, 10],
19 |         ae_epochs: int = 40,
20 |         ae_lr: float = 1e-3,
21 |         ae_lr_decay: float = 0.1,
22 |         ae_min_lr: float = 1e-7,
23 |         ae_patience: int = 10,
24 |         ae_batch_size: int = 256,
25 |         siamese_hiddens: list = [1024, 1024, 512, 10],
26 |         siamese_epochs: int = 30,
27 |         siamese_lr: float = 1e-3,
28 |         siamese_lr_decay: float = 0.1,
29 |         siamese_min_lr: float = 1e-7,
30 |         siamese_patience: int = 10,
31 |         siamese_n_nbg: int = 2,
32 |         siamese_use_approx: bool = False,
33 |         siamese_batch_size: int = 128,
34 |         spectral_hiddens: list = [1024, 1024, 512, 10],
35 |         spectral_epochs: int = 30,
36 |         spectral_lr: float = 1e-3,
37 |         spectral_lr_decay: float = 0.1,
38 |         spectral_min_lr: float = 1e-8,
39 |         spectral_patience: int = 10,
40 |         spectral_batch_size: int = 1024,
41 |         spectral_n_nbg: int = 30,
42 |         spectral_scale_k: int = 15,
43 |         spectral_is_local_scale: bool = True,
44 |     ):
45 |         """SpectralReduction is a class for dimensionality reduction: it learns a deep model that embeds the input data into the eigenspace of its graph Laplacian.
46 |         This model optionally utilizes Autoencoders (AE) and Siamese networks for training.
47 | 
48 |         Parameters
49 |         ----------
50 |         n_components : int
51 |             The number of components to keep.
52 | 
53 |         should_use_ae : bool, optional (default=False)
54 |             Specifies whether to use the Autoencoder (AE) network as part of the training process.
55 | 
56 |         should_use_siamese : bool, optional (default=False)
57 |             Specifies whether to use the Siamese network as part of the training process.
58 | 
59 |         is_sparse_graph : bool, optional (default=False)
60 |             Specifies whether the graph Laplacian created from the data is sparse.
61 | 
62 |         ae_hiddens : list, optional (default=[512, 512, 2048, 10])
63 |             The number of hidden units in each layer of the Autoencoder network.
64 | 
65 |         ae_epochs : int, optional (default=40)
66 |             The number of epochs to train the Autoencoder network.
67 | 
68 |         ae_lr : float, optional (default=1e-3)
69 |             The learning rate for the Autoencoder network.
70 | 
71 |         ae_lr_decay : float, optional (default=0.1)
72 |             The learning rate decay factor for the Autoencoder network.
73 | 
74 |         ae_min_lr : float, optional (default=1e-7)
75 |             The minimum learning rate for the Autoencoder network.
76 | 
77 |         ae_patience : int, optional (default=10)
78 |             The number of epochs to wait before reducing the learning rate for the Autoencoder network.
79 | 
80 |         ae_batch_size : int, optional (default=256)
81 |             The batch size used during training of the Autoencoder network.
82 | 
83 |         siamese_hiddens : list, optional (default=[1024, 1024, 512, 10])
84 |             The number of hidden units in each layer of the Siamese network.
85 | 
86 |         siamese_epochs : int, optional (default=30)
87 |             The number of epochs to train the Siamese network.
88 | 
89 |         siamese_lr : float, optional (default=1e-3)
90 |             The learning rate for the Siamese network.
91 | 
92 |         siamese_lr_decay : float, optional (default=0.1)
93 |             The learning rate decay factor for the Siamese network.
94 | 
95 |         siamese_min_lr : float, optional (default=1e-7)
96 |             The minimum learning rate for the Siamese network.
97 | 
98 |         siamese_patience : int, optional (default=10)
99 |             The number of epochs to wait before reducing the learning rate for the Siamese network.
100 | 
101 |         siamese_n_nbg : int, optional (default=2)
102 |             The number of nearest neighbors to consider as 'positive' pairs by the Siamese network.
103 | 
104 |         siamese_use_approx : bool, optional (default=False)
105 |             Specifies whether to use Annoy instead of KNN for computing nearest neighbors,
106 |             particularly useful for large datasets.
107 | 
108 |         siamese_batch_size : int, optional (default=128)
109 |             The batch size used during training of the Siamese network.
110 | 
111 |         spectral_hiddens : list, optional (default=[1024, 1024, 512, 10])
112 |             The number of hidden units in each layer of the Spectral network.
113 | 
114 |         spectral_epochs : int, optional (default=30)
115 |             The number of epochs to train the Spectral network.
116 | 
117 |         spectral_lr : float, optional (default=1e-3)
118 |             The learning rate for the Spectral network.
119 | 
120 |         spectral_lr_decay : float, optional (default=0.1)
121 |             The learning rate decay factor for the Spectral network. The remaining ``spectral_*`` parameters are as in ``SpectralNet``."""
122 | 
123 |         self.n_components = n_components
124 |         self.should_use_ae = should_use_ae
125 |         self.should_use_siamese = should_use_siamese
126 |         self.is_sparse_graph = is_sparse_graph
127 |         self.ae_hiddens = ae_hiddens
128 |         self.ae_epochs = ae_epochs
129 |         self.ae_lr = ae_lr
130 |         self.ae_lr_decay = ae_lr_decay
131 |         self.ae_min_lr = ae_min_lr
132 |         self.ae_patience = ae_patience
133 |         self.ae_batch_size = ae_batch_size
134 |         self.siamese_hiddens = siamese_hiddens
135 |         self.siamese_epochs = siamese_epochs
136 |         self.siamese_lr = siamese_lr
137 |         self.siamese_lr_decay = siamese_lr_decay
138 |         self.siamese_min_lr = siamese_min_lr
139 |         self.siamese_patience = siamese_patience
140 |         self.siamese_n_nbg = siamese_n_nbg
141 |         self.siamese_use_approx = siamese_use_approx
142 |         self.siamese_batch_size = siamese_batch_size
143 |         self.spectral_hiddens = spectral_hiddens
144 |         self.spectral_epochs = spectral_epochs
145 |         self.spectral_lr = spectral_lr
146 |         self.spectral_lr_decay = spectral_lr_decay
147 |         self.spectral_min_lr = spectral_min_lr
148 |         self.spectral_patience = spectral_patience
149 |         self.spectral_n_nbg = spectral_n_nbg
150 |         self.spectral_scale_k = spectral_scale_k
151 |         self.spectral_is_local_scale = spectral_is_local_scale
152 |         self.spectral_batch_size = spectral_batch_size
153 |         self.X_new = None
154 | 
155 |     def _fit(self, X: torch.Tensor, y: torch.Tensor) -> None:
156 |         """Fit the SpectralNet model to the input data.
157 | 
158 |         Parameters
159 |         ----------
160 |         X : torch.Tensor
161 |             The input data of shape (n_samples, n_features).
162 | 
163 |         y : torch.Tensor, optional
164 |             The labels of the input data of shape (n_samples,).
165 | 
166 |         Returns
167 |         -------
168 |         None
169 |             The underlying SpectralNet model is trained in place.
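
        Notes
        -----
        This method only trains the underlying SpectralNet model; the
        embeddings are obtained through ``_transform`` or the public
        ``fit_transform``. A minimal sketch of the public API, assuming
        ``SpectralReduction`` is exposed at the package root:

        >>> import torch
        >>> from spectralnet import SpectralReduction
        >>> X = torch.randn(1000, 16)  # placeholder data
        >>> reducer = SpectralReduction(n_components=2, spectral_hiddens=[64, 64, 2])
        >>> V = reducer.fit_transform(X)  # shape (1000, 2)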
170 | """ 171 | self._spectralnet = SpectralNet( 172 | n_clusters=self.n_components, 173 | should_use_ae=self.should_use_ae, 174 | should_use_siamese=self.should_use_siamese, 175 | is_sparse_graph=self.is_sparse_graph, 176 | ae_hiddens=self.ae_hiddens, 177 | ae_epochs=self.ae_epochs, 178 | ae_lr=self.ae_lr, 179 | ae_lr_decay=self.ae_lr_decay, 180 | ae_min_lr=self.ae_min_lr, 181 | ae_patience=self.ae_patience, 182 | ae_batch_size=self.ae_batch_size, 183 | siamese_hiddens=self.siamese_hiddens, 184 | siamese_epochs=self.siamese_epochs, 185 | siamese_lr=self.siamese_lr, 186 | siamese_lr_decay=self.siamese_lr_decay, 187 | siamese_min_lr=self.siamese_min_lr, 188 | siamese_patience=self.siamese_patience, 189 | siamese_n_nbg=self.siamese_n_nbg, 190 | siamese_use_approx=self.siamese_use_approx, 191 | siamese_batch_size=self.siamese_batch_size, 192 | spectral_hiddens=self.spectral_hiddens, 193 | spectral_epochs=self.spectral_epochs, 194 | spectral_lr=self.spectral_lr, 195 | spectral_lr_decay=self.spectral_lr_decay, 196 | spectral_min_lr=self.spectral_min_lr, 197 | spectral_patience=self.spectral_patience, 198 | spectral_n_nbg=self.spectral_n_nbg, 199 | spectral_scale_k=self.spectral_scale_k, 200 | spectral_is_local_scale=self.spectral_is_local_scale, 201 | spectral_batch_size=self.spectral_batch_size, 202 | ) 203 | 204 | self._spectralnet.fit(X, y) 205 | 206 | def _predict(self, X: torch.Tensor) -> np.ndarray: 207 | """Predict embeddings for the input data using the fitted SpectralNet model. 208 | 209 | Parameters 210 | ---------- 211 | X : torch.Tensor 212 | The input data of shape (n_samples, n_features). 213 | 214 | Returns 215 | ------- 216 | np.ndarray 217 | The predicted embeddings of shape (n_samples, n_components). 218 | """ 219 | self._spectralnet.predict(X) 220 | return self._spectralnet.embeddings_ 221 | 222 | def _transform(self, X: torch.Tensor) -> np.ndarray: 223 | """Transform the input data into embeddings using the fitted SpectralNet model. 224 | 225 | Parameters 226 | ---------- 227 | X : torch.Tensor 228 | The input data of shape (n_samples, n_features). 229 | 230 | Returns 231 | ------- 232 | np.ndarray 233 | The transformed embeddings of shape (n_samples, n_components). 234 | """ 235 | return self._predict(X) 236 | 237 | def fit_transform(self, X: torch.Tensor, y: torch.Tensor = None) -> np.ndarray: 238 | """Fit the SpectralNet model to the input data and transform it into embeddings. 239 | 240 | This is a convenience method that combines the fit and transform steps. 241 | 242 | Parameters 243 | ---------- 244 | X : torch.Tensor 245 | The input data of shape (n_samples, n_features). 246 | 247 | y: torch.Tensor 248 | The labels of the input data of shape (n_samples,). 249 | 250 | Returns 251 | ------- 252 | np.ndarray 253 | The fitted and transformed embeddings of shape (n_samples, n_components). 254 | """ 255 | self._fit(X, y) 256 | return self._transform(X) 257 | 258 | def _get_laplacian_of_small_batch(self, batch: torch.Tensor) -> np.ndarray: 259 | """Get the Laplacian of a small batch of the input data 260 | 261 | Parameters 262 | ---------- 263 | 264 | batch : torch.Tensor 265 | A small batch of the input data of shape (batch_size, n_features). 266 | 267 | Returns 268 | ------- 269 | np.ndarray 270 | The Laplacian of the small batch of the input data. 
271 | 
272 | 
273 | 
274 |         """
275 | 
276 |         W = get_affinity_matrix(batch, self.spectral_n_nbg, self._spectralnet.device)
277 |         L = get_laplacian(W)
278 |         return L
279 | 
280 |     def _remove_smallest_eigenvector(self, V: np.ndarray) -> np.ndarray:
281 |         """Remove the constant eigenvector from the eigenvectors of the Laplacian of a small batch of the input data.
282 | 
283 | 
284 |         Parameters
285 |         ----------
286 |         V : np.ndarray
287 |             The eigenvectors of the Laplacian of a small batch of the input data.
288 | 
289 | 
290 |         Returns
291 |         -------
292 |         np.ndarray
293 |             The eigenvectors of the Laplacian of a small batch of the input data without the constant eigenvector.
294 |         """
295 | 
296 |         batch_raw, batch_encoded = self._spectralnet.get_random_batch()
297 |         L_batch = self._get_laplacian_of_small_batch(batch_encoded)
298 |         V_batch = self._predict(batch_raw)
299 |         eigenvalues = np.diag(V_batch.T @ L_batch @ V_batch)
300 |         indices = np.argsort(eigenvalues)
301 |         smallest_index = indices[0]
302 |         V = V[:, np.arange(V.shape[1]) != smallest_index]  # drop the (near-)constant eigenvector
303 |         V = V[
304 |             :,
305 |             (np.arange(V.shape[1]) == indices[1])
306 |             | (np.arange(V.shape[1]) == indices[2]),
307 |         ]
308 | 
309 |         return V
310 | 
311 |     def visualize(
312 |         self, V: np.ndarray, y: torch.Tensor = None, n_components: int = 1
313 |     ) -> None:
314 |         """Visualize the embeddings of the input data using the fitted SpectralNet model.
315 | 
316 |         Parameters
317 |         ----------
318 |         V : torch.Tensor
319 |             The reduced data of shape (n_samples, n_features) to be visualized.
320 |         y : torch.Tensor, optional
321 |             The input labels of shape (n_samples,). Defaults to None.
322 |         """
323 |         V = self._remove_smallest_eigenvector(V)
324 |         print(V.shape)
325 | 
326 |         if y is not None:  # these diagnostics require ground-truth labels
327 |             plot_laplacian_eigenvectors(V, y)
328 |             cluster_labels = self._get_clusters_by_kmeans(V)
329 |             print("acc with 2 components: ", Metrics.acc_score(cluster_labels, y.detach().cpu().numpy(), n_clusters=10))
330 | 
331 |         if n_components > 1:
332 |             x_axis = V[:, 0]
333 |             y_axis = V[:, 1]
334 | 
335 |         elif n_components == 1:
336 |             x_axis = V
337 |             y_axis = np.zeros_like(V)
338 | 
339 |         else:
340 |             raise ValueError(
341 |                 "n_components must be a positive integer (greater than 0)"
342 |             )
343 | 
344 |         if y is None:
345 |             plt.scatter(x_axis, y_axis)
346 |         else:
347 |             plt.scatter(x_axis, y_axis, c=y, cmap="tab10", s=3)
348 | 
349 |         plt.show()
350 | 
351 |     def _get_clusters_by_kmeans(self, embeddings: np.ndarray) -> np.ndarray:
352 |         """Performs k-means clustering on the spectral-embedding space.
353 | 
354 |         Parameters
355 |         ----------
356 |         embeddings : np.ndarray
357 |             The spectral-embedding space.
358 | 
359 |         Returns
360 |         -------
361 |         np.ndarray
362 |             The cluster assignments for the given data.
363 |         """
364 | 
365 |         kmeans = KMeans(n_clusters=self.n_components, n_init=10).fit(embeddings)
366 |         cluster_assignments = kmeans.predict(embeddings)
367 |         return cluster_assignments
368 | 
--------------------------------------------------------------------------------
/src/spectralnet/_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import torch
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 | import matplotlib.colors as colors
6 | 
7 | from annoy import AnnoyIndex
8 | from sklearn.neighbors import NearestNeighbors
9 | 
10 | 
11 | def build_ann(X: torch.Tensor):
12 |     """
13 |     Builds an approximate-nearest-neighbors object
14 |     that can be used to calculate the k-nearest neighbors of a data point.
15 | 
16 |     Parameters
17 |     ----------
18 |     X : torch.Tensor
19 |         Dataset.
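        The tensor is flattened internally to shape (n_samples, n_features)
        before indexing.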
20 | 
21 |     Returns
22 |     -------
23 |     None
24 |     """
25 | 
26 |     X = X.view(X.size(0), -1)
27 |     t = AnnoyIndex(X[0].shape[0], "euclidean")
28 |     for i, x_i in enumerate(X):
29 |         t.add_item(i, x_i)
30 | 
31 |     t.build(50)
32 |     t.save("ann_index.ann")
33 | 
34 | 
35 | def make_batch_for_sparse_grapsh(batch_x: torch.Tensor) -> torch.Tensor:
36 |     """
37 |     Computes a new batch of data points from the given batch (batch_x)
38 |     in case the graph Laplacian obtained from the given batch is sparse.
39 |     The new batch is computed based on the nearest neighbors of one fifth
40 |     of the given batch.
41 | 
42 |     Parameters
43 |     ----------
44 |     batch_x : torch.Tensor
45 |         Batch of data points.
46 | 
47 |     Returns
48 |     -------
49 |     torch.Tensor
50 |         New batch of data points.
51 |     """
52 | 
53 |     batch_size = batch_x.shape[0]
54 |     batch_size //= 5
55 |     new_batch_x = batch_x[:batch_size]
56 |     batch_x = new_batch_x
57 |     n_neighbors = 5
58 | 
59 |     u = AnnoyIndex(batch_x[0].shape[0], "euclidean")
60 |     u.load("ann_index.ann")
61 |     for x in batch_x:
62 |         x = x.detach().cpu().numpy()
63 |         nn_indices = u.get_nns_by_vector(x, n_neighbors)
64 |         nn_tensors = [u.get_item_vector(i) for i in nn_indices[1:]]
65 |         nn_tensors = torch.tensor(nn_tensors, device=batch_x.device)
66 |         new_batch_x = torch.cat((new_batch_x, nn_tensors))
67 | 
68 |     return new_batch_x
69 | 
70 | 
71 | def get_laplacian(W: torch.Tensor) -> np.ndarray:
72 |     """
73 |     Computes the unnormalized Laplacian matrix, given the affinity matrix W.
74 | 
75 |     Parameters
76 |     ----------
77 |     W : torch.Tensor
78 |         Affinity matrix.
79 | 
80 |     Returns
81 |     -------
82 |     np.ndarray
83 |         Laplacian matrix.
84 |     """
85 | 
86 |     W = W.detach().cpu().numpy()
87 |     D = np.diag(W.sum(axis=1))
88 |     L = D - W
89 |     return L
90 | 
91 | 
92 | def sort_laplacian(L: np.ndarray, y: np.ndarray) -> np.ndarray:
93 |     """
94 |     Sorts the columns and rows of the Laplacian by the true labels in order
95 |     to see whether the sorted Laplacian is a block diagonal matrix.
96 | 
97 |     Parameters
98 |     ----------
99 |     L : np.ndarray
100 |         Laplacian matrix.
101 |     y : np.ndarray
102 |         Labels.
103 | 
104 |     Returns
105 |     -------
106 |     np.ndarray
107 |         Sorted Laplacian.
108 |     """
109 | 
110 |     i = np.argsort(y)
111 |     L = L[i, :]
112 |     L = L[:, i]
113 |     return L
114 | 
115 | 
116 | def sort_matrix_rows(A: np.ndarray, y: np.ndarray) -> np.ndarray:
117 |     """
118 |     Sorts the rows of a matrix by a given order.
119 | 
120 |     Parameters
121 |     ----------
122 |     A : np.ndarray
123 |         Numpy ndarray.
124 |     y : np.ndarray
125 |         True labels.
126 | 
127 |     Returns
128 |     -------
129 |     np.ndarray
130 |         Sorted matrix.
131 |     """
132 | 
133 |     i = np.argsort(y)
134 |     A = A[i, :]
135 |     return A
136 | 
137 | 
138 | def get_eigenvalues(A: np.ndarray) -> np.ndarray:
139 |     """
140 |     Computes the eigenvalues of a given matrix A (via SVD, which matches the eigenvalues for a symmetric PSD matrix such as the Laplacian) and sorts them in increasing order.
141 | 
142 |     Parameters
143 |     ----------
144 |     A : np.ndarray
145 |         Numpy ndarray.
146 | 
147 |     Returns
148 |     -------
149 |     np.ndarray
150 |         Sorted eigenvalues.
151 |     """
152 | 
153 |     _, vals, _ = np.linalg.svd(A)
154 |     sorted_vals = vals[np.argsort(vals)]
155 |     return sorted_vals
156 | 
157 | 
158 | def get_eigenvectors(A: np.ndarray) -> np.ndarray:
159 |     """
160 |     Computes the eigenvectors of a given matrix A and sorts them by the eigenvalues.
161 | 
162 |     Parameters
163 |     ----------
164 |     A : np.ndarray
165 |         Numpy ndarray.
166 | 
167 |     Returns
168 |     -------
169 |     np.ndarray
170 |         Sorted eigenvectors.
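
    Notes
    -----
    ``np.linalg.svd`` is used instead of an eigendecomposition: for a
    symmetric positive semi-definite matrix such as the unnormalized
    Laplacian, the singular values and vectors coincide with the
    eigenvalues and eigenvectors.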
171 | """ 172 | 173 | vecs, vals, _ = np.linalg.svd(A) 174 | vecs = vecs[:, np.argsort(vals)] 175 | return vecs 176 | 177 | 178 | def plot_eigenvalues(vals: np.ndarray): 179 | """ 180 | Plot the eigenvalues of the Laplacian. 181 | 182 | Parameters 183 | ---------- 184 | vals : np.ndarray 185 | Eigenvalues. 186 | """ 187 | 188 | rang = range(len(vals)) 189 | plt.plot(rang, vals) 190 | plt.show() 191 | 192 | 193 | def get_laplacian_eigenvectors(V: torch.Tensor, y: np.ndarray) -> np.ndarray: 194 | """ 195 | Returns eigenvectors of the Laplacian when the data is sorted in increasing 196 | order by the true label. 197 | 198 | Parameters 199 | ---------- 200 | V : torch.Tensor 201 | Eigenvectors matrix. 202 | y : np.ndarray 203 | True labels. 204 | 205 | Returns 206 | ------- 207 | np.ndarray 208 | Sorted eigenvectors matrix and range. 209 | 210 | """ 211 | 212 | V = sort_matrix_rows(V, y) 213 | rang = range(len(y)) 214 | return V, rang 215 | 216 | 217 | def plot_laplacian_eigenvectors(V: np.ndarray, y: np.ndarray): 218 | """ 219 | Plot the eigenvectors of the Laplacian when the data is sorted in increasing 220 | order by the true label. 221 | 222 | Parameters 223 | ---------- 224 | V : np.ndarray 225 | Eigenvectors matrix. 226 | y : np.ndarray 227 | True labels. 228 | 229 | Returns 230 | ------- 231 | plt.Axes 232 | The matplotlib Axes object containing the plot. 233 | """ 234 | 235 | V = sort_matrix_rows(V, y) 236 | rang = range(len(y)) 237 | plt.plot(rang, V) 238 | plt.show() 239 | return plt 240 | 241 | 242 | def plot_sorted_laplacian(W: torch.Tensor, y: np.ndarray): 243 | """ 244 | Plot the block diagonal matrix obtained from the sorted Laplacian. 245 | 246 | Parameters 247 | ---------- 248 | W : torch.Tensor 249 | Affinity matrix. 250 | y : np.ndarray 251 | True labels. 252 | """ 253 | L = get_laplacian(W) 254 | L = sort_laplacian(L, y) 255 | plt.imshow(L, cmap="hot", norm=colors.LogNorm()) 256 | plt.imshow(L, cmap="flag") 257 | plt.show() 258 | 259 | 260 | def get_nearest_neighbors( 261 | X: torch.Tensor, Y: torch.Tensor = None, k: int = 3 262 | ) -> tuple[np.ndarray, np.ndarray]: 263 | """ 264 | Computes the distances and the indices of the k nearest neighbors of each data point. 265 | 266 | Parameters 267 | ---------- 268 | X : torch.Tensor 269 | Batch of data points. 270 | Y : torch.Tensor, optional 271 | Defaults to None. 272 | k : int, optional 273 | Number of nearest neighbors to calculate. Defaults to 3. 274 | 275 | Returns 276 | ------- 277 | tuple[np.ndarray, np.ndarray] 278 | Distances and indices of each data point. 279 | """ 280 | if Y is None: 281 | Y = X 282 | if len(X) < k: 283 | k = len(X) 284 | X = X.cpu().detach().numpy() 285 | Y = Y.cpu().detach().numpy() 286 | nbrs = NearestNeighbors(n_neighbors=k).fit(X) 287 | Dis, Ids = nbrs.kneighbors(X) 288 | return Dis, Ids 289 | 290 | 291 | def get_grassman_distance(A: np.ndarray, B: np.ndarray) -> float: 292 | """ 293 | Computes the Grassmann distance between the subspaces spanned by the columns of A and B. 294 | 295 | Parameters 296 | ---------- 297 | A : np.ndarray 298 | Numpy ndarray. 299 | B : np.ndarray 300 | Numpy ndarray. 301 | 302 | Returns 303 | ------- 304 | float 305 | The Grassmann distance. 
306 | """ 307 | 308 | M = np.dot(np.transpose(A), B) 309 | _, s, _ = np.linalg.svd(M, full_matrices=False) 310 | s = 1 - np.square(s) 311 | grassmann = np.sum(s) 312 | return grassmann 313 | 314 | 315 | def compute_scale( 316 | Dis: np.ndarray, k: int = 2, med: bool = True, is_local: bool = True 317 | ) -> np.ndarray: 318 | """ 319 | Computes the scale for the Gaussian similarity function. 320 | 321 | Parameters 322 | ---------- 323 | Dis : np.ndarray 324 | Distances of the k nearest neighbors of each data point. 325 | k : int, optional 326 | Number of nearest neighbors for the scale calculation. Relevant for global scale only. 327 | med : bool, optional 328 | Scale calculation method. Can be calculated by the median distance from a data point to its neighbors, 329 | or by the maximum distance. Defaults to True. 330 | is_local : bool, optional 331 | Local distance (different for each data point), or global distance. Defaults to True. 332 | 333 | Returns 334 | ------- 335 | np.ndarray 336 | Scale (global or local). 337 | """ 338 | 339 | if is_local: 340 | if not med: 341 | scale = np.max(Dis, axis=1) 342 | else: 343 | scale = np.median(Dis, axis=1) 344 | else: 345 | if not med: 346 | scale = np.max(Dis[:, k - 1]) 347 | else: 348 | scale = np.median(Dis[:, k - 1]) 349 | return scale 350 | 351 | 352 | def get_gaussian_kernel( 353 | D: torch.Tensor, scale, Ids: np.ndarray, device: torch.device, is_local: bool = True 354 | ) -> torch.Tensor: 355 | """ 356 | Computes the Gaussian similarity function according to a given distance matrix D and a given scale. 357 | 358 | Parameters 359 | ---------- 360 | D : torch.Tensor 361 | Distance matrix. 362 | scale : 363 | Scale. 364 | Ids : np.ndarray 365 | Indices of the k nearest neighbors of each sample. 366 | device : torch.device 367 | Defaults to torch.device("cpu"). 368 | is_local : bool, optional 369 | Determines whether the given scale is global or local. Defaults to True. 370 | 371 | Returns 372 | ------- 373 | torch.Tensor 374 | Matrix W with Gaussian similarities. 375 | """ 376 | 377 | if not is_local: 378 | # global scale 379 | W = torch.exp(-torch.pow(D, 2) / (scale**2)) 380 | else: 381 | # local scales 382 | W = torch.exp( 383 | -torch.pow(D, 2).to(device) 384 | / (torch.tensor(scale).float().to(device).clamp_min(1e-7) ** 2) 385 | ) 386 | if Ids is not None: 387 | n, k = Ids.shape 388 | mask = torch.zeros([n, n]).to(device=device) 389 | for i in range(len(Ids)): 390 | mask[i, Ids[i]] = 1 391 | W = W * mask 392 | sym_W = (W + torch.t(W)) / 2.0 393 | return sym_W 394 | 395 | 396 | def get_t_kernel( 397 | D: torch.Tensor, Ids: np.ndarray, device: torch.device, is_local: bool = True 398 | ) -> torch.Tensor: 399 | """ 400 | Computes the t similarity function according to a given distance matrix D and a given scale. 401 | 402 | Parameters 403 | ---------- 404 | D : torch.Tensor 405 | Distance matrix. 406 | Ids : np.ndarray 407 | Indices of the k nearest neighbors of each sample. 408 | device : torch.device 409 | Defaults to torch.device("cpu"). 410 | is_local : bool, optional 411 | Determines whether the given scale is global or local. Defaults to True. 412 | 413 | Returns 414 | ------- 415 | torch.Tensor 416 | Matrix W with t similarities. 
417 | """ 418 | 419 | W = torch.pow(1 + torch.pow(D, 2), -1) 420 | if Ids is not None: 421 | n, k = Ids.shape 422 | mask = torch.zeros([n, n]).to(device=device) 423 | for i in range(len(Ids)): 424 | mask[i, Ids[i]] = 1 425 | W = W * mask 426 | sym_W = (W + W.T) / 2.0 427 | return sym_W 428 | 429 | 430 | def get_affinity_matrix( 431 | X: torch.Tensor, n_neighbors: int, device: torch.device 432 | ) -> torch.Tensor: 433 | """ 434 | Computes the affinity matrix for the data X. 435 | 436 | Parameters 437 | ---------- 438 | X : torch.Tensor 439 | Data. 440 | n_neighbors : int 441 | Number of nearest neighbors to calculate. 442 | device : torch.device 443 | Defaults to torch.device("cpu"). 444 | 445 | Returns 446 | ------- 447 | torch.Tensor 448 | Affinity matrix. 449 | """ 450 | 451 | Dx = torch.cdist(X, X) 452 | Dis, indices = get_nearest_neighbors(X, k=n_neighbors + 1) 453 | W = get_t_kernel(Dx, indices, device=device) 454 | return W 455 | 456 | 457 | def plot_data_by_assignments(X, assignments: np.ndarray): 458 | """ 459 | Plots the data with the assignments obtained from SpectralNet. Relevant only for 2D data. 460 | 461 | Parameters 462 | ---------- 463 | X : 464 | Data. 465 | assignments : np.ndarray 466 | Cluster assignments. 467 | """ 468 | 469 | plt.scatter(X[:, 0], X[:, 1], c=assignments) 470 | plt.show() 471 | 472 | 473 | def calculate_cost_matrix(C: np.ndarray, n_clusters: int) -> np.ndarray: 474 | """ 475 | Calculates the cost matrix for the Munkres algorithm. 476 | 477 | Parameters 478 | ---------- 479 | C : np.ndarray 480 | Confusion matrix. 481 | n_clusters : int 482 | Number of clusters. 483 | 484 | Returns 485 | ------- 486 | np.ndarray 487 | Cost matrix. 488 | """ 489 | 490 | cost_matrix = np.zeros((n_clusters, n_clusters)) 491 | # cost_matrix[i,j] will be the cost of assigning cluster i to label j 492 | for j in range(n_clusters): 493 | s = np.sum(C[:, j]) # number of examples in cluster i 494 | for i in range(n_clusters): 495 | t = C[i, j] 496 | cost_matrix[j, i] = s - t 497 | return cost_matrix 498 | 499 | 500 | def get_cluster_labels_from_indices(indices: np.ndarray) -> np.ndarray: 501 | """ 502 | Gets the cluster labels from their indices. 503 | 504 | Parameters 505 | ---------- 506 | indices : np.ndarray 507 | Indices of the clusters. 508 | 509 | Returns 510 | ------- 511 | np.ndarray 512 | Cluster labels. 513 | """ 514 | 515 | num_clusters = len(indices) 516 | cluster_labels = np.zeros(num_clusters) 517 | for i in range(num_clusters): 518 | cluster_labels[i] = indices[i][1] 519 | return cluster_labels 520 | 521 | 522 | def write_assignments_to_file(assignments: np.ndarray): 523 | """ 524 | Saves SpectralNet cluster assignments to a file. 525 | 526 | Parameters 527 | ---------- 528 | assignments : np.ndarray 529 | The assignments that obtained from SpectralNet. 530 | """ 531 | 532 | np.savetxt( 533 | "cluster_assignments.csv", assignments.astype(int), fmt="%i", delimiter="," 534 | ) 535 | 536 | 537 | def create_weights_dir(): 538 | """ 539 | Creates a directory for the weights of the Autoencoder and the Siamese network 540 | """ 541 | if not os.path.exists("weights"): 542 | os.makedirs("weights") 543 | -------------------------------------------------------------------------------- /docs/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | SpectralNet: Spectral Clustering Using Deep Neural Networks 9 | 10 | 11 | 12 | 455 | 456 | 457 | 458 | 459 | 460 | 461 | 462 |

463 |
464 |
465 |

466 | SpectralNet: Spectral Clustering Using Deep Neural Networks 467 |

468 |
469 |
470 |
471 | 510 | 524 |
525 |
526 |
527 |
528 | 529 |
530 |

531 | Abstract 532 |

533 |

534 | Spectral clustering is a leading and popular technique in unsupervised data analysis. Two of its major
535 | limitations are scalability and generalization of the spectral embedding (i.e., out-of-sample-extension). In
536 | this paper we introduce a deep
537 | learning approach to spectral clustering that overcomes the above shortcomings.
538 | Our network, which we call SpectralNet, learns a map that embeds input data
539 | points into the eigenspace of their associated graph Laplacian matrix and subsequently clusters them. We
540 | train SpectralNet using a procedure that involves
541 | constrained stochastic optimization. Stochastic optimization allows it to scale
542 | to large datasets, while the constraints, which are implemented using a special-purpose output layer, allow
543 | us to keep the network output orthogonal. Moreover, the map learned by SpectralNet naturally generalizes the
544 | spectral embedding to unseen data points. To further improve the quality of the clustering, we
545 | replace the standard pairwise Gaussian affinities with affinities learned from the
546 | given unlabeled data using a Siamese network. Additional improvement of the
547 | resulting clustering can be achieved by applying the network to code representations produced, e.g., by
548 | standard autoencoders. Our end-to-end learning procedure is fully unsupervised. In addition, we apply VC
549 | dimension theory to derive a lower bound on the size of SpectralNet. State-of-the-art clustering results
550 | are reported on the Reuters dataset.
551 | 

552 | 555 |
556 |
557 | 558 | 559 | 560 |
561 | 575 |
576 | 577 |

578 | Citation 579 |

580 |
@inproceedings{shaham2018,
581 |   author = {Uri Shaham and Kelly Stanton and Henry Li and Boaz Nadler and Ronen Basri and Yuval Kluger},
582 |   title = {SpectralNet: Spectral Clustering Using Deep Neural Networks},
583 |   booktitle = {Proc. ICLR 2018},
584 |   year = {2018}
585 | }
586 | 
587 |

588 | License 589 |

590 |

591 | MIT License. 592 | Feel free to use any of the material in your own work, as long as you give us appropriate credit by 593 | mentioning the title and author list of our paper. 594 |

595 | 596 |
597 | 599 | 600 | 601 | --------------------------------------------------------------------------------