├── __init__.py ├── som.pyc ├── som_plot.pyc ├── visuals ├── som_latice.png └── 2d_projection.png ├── setup.py ├── README.md ├── tests.py ├── grid_search.py ├── .gitignore ├── sample_run.py ├── rsom.py └── som_theano.py /__init__.py: -------------------------------------------------------------------------------- 1 | from .som import SOM 2 | -------------------------------------------------------------------------------- /som.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erogol/RSOM/HEAD/som.pyc -------------------------------------------------------------------------------- /som_plot.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erogol/RSOM/HEAD/som_plot.pyc -------------------------------------------------------------------------------- /visuals/som_latice.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erogol/RSOM/HEAD/visuals/som_latice.png -------------------------------------------------------------------------------- /visuals/2d_projection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erogol/RSOM/HEAD/visuals/2d_projection.png -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | with open("README.md", "r", encoding="utf-8") as fh: 4 | long_description = fh.read() 5 | 6 | setup( 7 | name="RSOM", 8 | version="0.1.0", 9 | author="Eren Gölge", 10 | author_email="", 11 | description="A Rectifying Self-Organizing Map (RSOM) implementation", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/erogol/RSOM", 15 | packages=find_packages(exclude=["tests*", "examples*"]), 16 | classifiers=[ 17 | "Development Status :: 3 - Alpha", 18 | "Intended Audience :: Developers", 19 | "License :: OSI Approved :: MIT License", 20 | ], 21 | python_requires=">=3.6", 22 | install_requires=[ 23 | "numpy", 24 | "matplotlib", 25 | "torch", 26 | "scikit-learn", 27 | ], 28 | ) 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Rectifying Self Organizing Map (RSOM) 2 | =============================== 3 | 4 | 📎 **Paper 1:** Gölge, E., & Duygulu, P. 2013. Rectifying Self Organizing Maps for Automatic Concept Learning from Web Images 5 | 6 | 📎 **Paper 2:** Gölge, E., & Duygulu, P.. ConceptMap:Mining noisy web data for concept learning , The European Conference on Computer Vision (ECCV) 2014. 7 | 8 | RSOM is an algorithm as an extension of well-known Self Organizing Map (SOM). It mimics SOM clustering and additionally detects outliers in the given dataset in the cluster level or instance level. 9 | It is mainly used with image tasks but works as good with any other type of data. 10 | 11 | ## Installation 12 | 13 | ``` 14 | git clone https://github.com/erogol/RSOM.git 15 | cd RSOM 16 | python setup.py install 17 | ``` 18 | 19 | or 20 | 21 | ``` 22 | pip install git+https://github.com/erogol/RSOM.git 23 | ``` 24 | 25 | ## Usage 26 | 27 | Check ```sample_run.py``` for more. 28 | 29 | ```python 30 | from rsom import RSOM 31 | 32 | # Load Iris dataset 33 | data = load_digits().data 34 | data = torch.from_numpy(data).float() 35 | print(data.shape) 36 | 37 | # Initialize SOM 38 | som = RSOM(data, alpha_max=0.05, num_units=49) 39 | 40 | # Train SOM 41 | som.train_batch(num_epoch=1000, verbose=True) 42 | 43 | # Get salient instances and units 44 | salient_insts = som.salient_insts() 45 | salient_units = som.salient_units() 46 | ``` 47 | 48 | 49 | 50 | ## Citation 51 | 52 | ``` 53 | @misc{golge2013rectifyingselforganizingmaps, 54 | title={Rectifying Self Organizing Maps for Automatic Concept Learning from Web Images}, 55 | author={Eren Golge and Pinar Duygulu}, 56 | year={2013}, 57 | eprint={1312.4384}, 58 | archivePrefix={arXiv}, 59 | primaryClass={cs.CV}, 60 | url={https://arxiv.org/abs/1312.4384}, 61 | } 62 | ``` 63 | 64 | ## Example Visuals 65 | 66 | Visuals are generated using ```sample_run.py``` and digits dataset. 67 | 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /tests.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import torch 4 | 5 | from rsom import RSOM 6 | 7 | 8 | class TestSOM(unittest.TestCase): 9 | def setUp(self): 10 | self.data = torch.randn(100, 10) 11 | self.som = RSOM(self.data, num_units=25, height=5, width=5) 12 | 13 | def test_init(self): 14 | self.assertEqual(self.som.num_units, 25) 15 | self.assertEqual(self.som.height, 5) 16 | self.assertEqual(self.som.width, 5) 17 | self.assertEqual(self.som.W.shape, (25, 10)) 18 | 19 | def test_normalize_weights(self): 20 | self.som._normalize_weights() 21 | norms = torch.norm(self.som.W.data, dim=1) 22 | self.assertTrue(torch.allclose(norms, torch.ones_like(norms), atol=1e-6)) 23 | 24 | def test_unit_cords(self): 25 | self.assertEqual(self.som.unit_cords(7), (2, 1)) 26 | self.assertEqual(self.som.unit_cords(24), (4, 4)) 27 | 28 | def test_euq_dist(self): 29 | X = self.data[:5] 30 | X2 = (X**2).sum(1).unsqueeze(1) 31 | D = self.som._euq_dist(X2, X) 32 | self.assertEqual(D.shape, (25, 5)) 33 | 34 | def test_find_neighbors(self): 35 | neighbors = self.som.find_neighbors(12, 1) 36 | self.assertEqual(neighbors.shape, (1, 25)) 37 | self.assertEqual(neighbors[0, 12].item(), 0) 38 | 39 | def test_best_match(self): 40 | X = self.data[:5] 41 | BMU, D = self.som.best_match(X) 42 | self.assertEqual(BMU.shape, (5, 25)) 43 | self.assertEqual(D.shape, (25, 5)) 44 | 45 | def test_assing_to_units(self): 46 | self.som.assing_to_units() 47 | self.assertEqual(self.som.ins_unit_assign.shape, (100,)) 48 | self.assertEqual(self.som.ins_unit_dist.shape, (100,)) 49 | 50 | def test_set_params(self): 51 | U = self.som.set_params(10) 52 | self.assertEqual(len(U["alphas"]), 10) 53 | self.assertEqual(len(U["H_maps"]), 10) 54 | self.assertEqual(len(U["radiuses"]), 10) 55 | 56 | def test_train_batch(self): 57 | self.som.train_batch(num_epoch=5, batch_size=20, verbose=False) 58 | self.assertIsNotNone(self.som.ins_unit_assign) 59 | self.assertIsNotNone(self.som.ins_unit_dist) 60 | 61 | def test_update_unit_saliency(self): 62 | win_counts = torch.ones(25) 63 | update_rate = torch.ones(25, 25) 64 | self.som._update_unit_saliency(win_counts, update_rate, 0.1) 65 | self.assertGreater(self.som.unit_saliency_coeffs.sum().item(), 0) 66 | 67 | 68 | if __name__ == "__main__": 69 | unittest.main() 70 | -------------------------------------------------------------------------------- /grid_search.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from matplotlib import pyplot as plt 4 | from sklearn.datasets import load_digits 5 | from sklearn.model_selection import train_test_split 6 | from tqdm import tqdm 7 | 8 | from som import SOM 9 | 10 | 11 | def quantization_error(som, data): 12 | _, distances = som.best_match(data) 13 | return torch.mean(torch.min(distances, dim=0)[0]) 14 | 15 | 16 | def grid_search_som(data, unit_range, epochs=1000, alpha_max=0.05, trials=3): 17 | results = [] 18 | 19 | for num_units in tqdm(unit_range, desc="Grid Search"): 20 | trial_errors = [] 21 | for _ in range(trials): 22 | som = SOM(data, num_units=num_units, alpha_max=alpha_max) 23 | som.train_batch(num_epoch=epochs, verbose=False) 24 | error = quantization_error(som, data) 25 | trial_errors.append(error.item()) 26 | 27 | avg_error = np.mean(trial_errors) 28 | std_error = np.std(trial_errors) 29 | results.append((num_units, avg_error, std_error)) 30 | 31 | print( 32 | f"Units: {num_units}, Avg Error: {avg_error:.4f}, Std Error: {std_error:.4f}" 33 | ) 34 | 35 | return results 36 | 37 | 38 | def find_elbow(x, y): 39 | # Normalize the data 40 | x = np.array(x) 41 | y = np.array(y) 42 | x_norm = (x - min(x)) / (max(x) - min(x)) 43 | y_norm = (y - min(y)) / (max(y) - min(y)) 44 | 45 | # Calculate the distances from each point to the line connecting the first and last points 46 | coords = np.vstack([x_norm, y_norm]).T 47 | first = coords[0] 48 | line_vec = coords[-1] - coords[0] 49 | line_vec_norm = line_vec / np.sqrt(np.sum(line_vec**2)) 50 | vec_from_first = coords - first 51 | scalar_proj = np.dot(vec_from_first, line_vec_norm) 52 | proj = np.outer(scalar_proj, line_vec_norm) 53 | distances = np.sqrt(np.sum((vec_from_first - proj) ** 2, axis=1)) 54 | 55 | # Find the elbow point (maximum distance) 56 | elbow_index = np.argmax(distances) 57 | return x[elbow_index], y[elbow_index] 58 | 59 | 60 | if __name__ == "__main__": 61 | # Load Digits dataset 62 | digits = load_digits() 63 | data = torch.from_numpy(digits.data).float() 64 | 65 | # Normalize the data 66 | data = (data - data.min()) / (data.max() - data.min()) 67 | 68 | # Split the data into train and test sets 69 | X_train, X_test = train_test_split(data, test_size=0.2, random_state=42) 70 | 71 | # Define the range of units to search 72 | unit_range = [9, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 196] 73 | 74 | # Perform grid search 75 | results = grid_search_som( 76 | X_train, unit_range, epochs=1000, alpha_max=0.05, trials=3 77 | ) 78 | 79 | # Extract units and errors 80 | units = [r[0] for r in results] 81 | errors = [r[1] for r in results] 82 | error_stds = [r[2] for r in results] 83 | 84 | # Find the elbow point 85 | elbow_units, elbow_error = find_elbow(units, errors) 86 | 87 | print(f"\nElbow point: {elbow_units:.0f} units, Error: {elbow_error:.4f}") 88 | 89 | # Plot the results 90 | plt.figure(figsize=(10, 6)) 91 | plt.errorbar(units, errors, yerr=error_stds, fmt="o-", capsize=5) 92 | plt.plot(elbow_units, elbow_error, "ro", markersize=10, label="Elbow point") 93 | plt.xlabel("Number of Units") 94 | plt.ylabel("Quantization Error") 95 | plt.title("SOM Grid Search Results") 96 | plt.xscale("log") 97 | plt.grid(True) 98 | plt.legend() 99 | plt.show() 100 | 101 | # Train the SOM with the elbow point number of units 102 | best_som = SOM(data, num_units=int(elbow_units), alpha_max=0.05) 103 | best_som.train_batch(num_epoch=1000, verbose=True) 104 | 105 | # Evaluate on test set 106 | test_error = quantization_error(best_som, X_test) 107 | print(f"\nTest set quantization error: {test_error:.4f}") 108 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 162 | #.idea/ 163 | 164 | .png -------------------------------------------------------------------------------- /sample_run.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import torch 6 | from PIL import Image 7 | from sklearn.datasets import load_digits 8 | from sklearn.decomposition import PCA 9 | 10 | from rsom import RSOM 11 | 12 | 13 | def get_node_coordinates(som, pca): 14 | coords = [] 15 | for i in range(som.height): 16 | for j in range(som.width): 17 | node_index = i * som.width + j 18 | node_weights = som.W[node_index].detach().numpy() 19 | coord = pca.transform([node_weights])[0] 20 | coords.append(coord) 21 | return np.array(coords) 22 | 23 | 24 | # Load Iris dataset 25 | data = load_digits().data 26 | data = torch.from_numpy(data).float() 27 | print(data.shape) 28 | 29 | # standardize the data 30 | data = (data - data.mean(axis=0)) / (data.std(axis=0) - 1e-8) 31 | 32 | # Initialize SOM 33 | som = RSOM( 34 | data, 35 | alpha_max=0.001, 36 | alpha_min=0.0005, 37 | num_units=49, 38 | ) 39 | 40 | # Train batch SOM 41 | som.train_batch(num_epoch=10000, verbose=True, batch_size=128) 42 | 43 | # Get salient instances and units 44 | salient_insts = som.salient_insts() 45 | salient_units = som.salient_units() 46 | 47 | # Perform PCA to reduce data to 2D for visualization 48 | pca = PCA(n_components=2) 49 | data_2d = pca.fit_transform(som.X.numpy()) 50 | units_2d = pca.transform(som.W.detach().numpy()) 51 | 52 | # Get node coordinates 53 | node_coords = get_node_coordinates(som, pca) 54 | 55 | # Create a plot 56 | plt.figure(figsize=(12, 8)) 57 | 58 | # Plot data points 59 | salient_mask = som.inst_saliency.numpy() 60 | plt.scatter( 61 | data_2d[salient_mask, 0], 62 | data_2d[salient_mask, 1], 63 | c=som.ins_unit_assign[salient_mask], 64 | cmap="viridis", 65 | alpha=0.6, 66 | label="Salient Samples", 67 | ) 68 | plt.scatter( 69 | data_2d[~salient_mask, 0], 70 | data_2d[~salient_mask, 1], 71 | c="red", 72 | marker="x", 73 | alpha=0.6, 74 | label="Outlier Samples", 75 | ) 76 | 77 | # Plot SOM units 78 | salient_units_mask = som.unit_saliency.numpy() 79 | plt.scatter( 80 | node_coords[salient_units_mask, 0], 81 | node_coords[salient_units_mask, 1], 82 | c="black", 83 | marker="s", 84 | s=50, 85 | label="Salient Units", 86 | ) 87 | plt.scatter( 88 | node_coords[~salient_units_mask, 0], 89 | node_coords[~salient_units_mask, 1], 90 | c="red", 91 | marker="s", 92 | s=50, 93 | label="Outlier Units", 94 | ) 95 | 96 | # Draw lattice lines 97 | for i in range(som.height): 98 | for j in range(som.width): 99 | node_index = i * som.width + j 100 | if j < som.width - 1: # Horizontal line 101 | next_node_index = node_index + 1 102 | plt.plot( 103 | [node_coords[node_index, 0], node_coords[next_node_index, 0]], 104 | [node_coords[node_index, 1], node_coords[next_node_index, 1]], 105 | "gray", 106 | alpha=0.5, 107 | ) 108 | if i < som.height - 1: # Vertical line 109 | next_node_index = node_index + som.width 110 | plt.plot( 111 | [node_coords[node_index, 0], node_coords[next_node_index, 0]], 112 | [node_coords[node_index, 1], node_coords[next_node_index, 1]], 113 | "gray", 114 | alpha=0.5, 115 | ) 116 | 117 | # Add labels and title 118 | plt.xlabel("First Principal Component") 119 | plt.ylabel("Second Principal Component") 120 | plt.title("SOM Units and Data Samples with Outliers and Lattice") 121 | plt.legend() 122 | 123 | # Show the plot 124 | plt.show() 125 | 126 | # Optional: Print some statistics 127 | print(f"Number of salient samples: {salient_mask.sum()}") 128 | print(f"Number of outlier samples: {(~salient_mask).sum()}") 129 | print(f"Number of salient units: {salient_units_mask.sum()}") 130 | print(f"Number of outlier units: {(~salient_units_mask).sum()}") 131 | 132 | # Create a new figure for the perfect 2D lattice plot 133 | plt.figure(figsize=(12, 12)) 134 | 135 | # Create a perfect 2D grid for SOM nodes 136 | grid_x, grid_y = np.meshgrid(np.arange(som.width), np.arange(som.height)) 137 | grid_x = grid_x.flatten() 138 | grid_y = grid_y.flatten() 139 | 140 | # Plot the perfect grid 141 | plt.scatter(grid_x, grid_y, c="lightgray", s=200, marker="s") 142 | 143 | # Draw grid lines 144 | for x in range(som.width): 145 | plt.axvline(x, color="lightgray", linestyle="--") 146 | for y in range(som.height): 147 | plt.axhline(y, color="lightgray", linestyle="--") 148 | 149 | # Get the unit assignments for each sample 150 | unit_assignments = som.ins_unit_assign.numpy() 151 | 152 | # Calculate the positions of samples on the grid 153 | sample_x = grid_x[unit_assignments].astype(float) 154 | sample_y = grid_y[unit_assignments].astype(float) 155 | 156 | # Add some jitter to prevent complete overlap 157 | jitter = 0.2 158 | sample_x += np.random.uniform(-jitter, jitter, sample_x.shape) 159 | sample_y += np.random.uniform(-jitter, jitter, sample_y.shape) 160 | 161 | # Plot the samples on the grid 162 | scatter = plt.scatter( 163 | sample_x, sample_y, c=som.ins_unit_assign, cmap="viridis", alpha=0.6 164 | ) 165 | 166 | # Highlight outlier samples 167 | outlier_mask = ~som.inst_saliency.numpy() 168 | plt.scatter( 169 | sample_x[outlier_mask], 170 | sample_y[outlier_mask], 171 | facecolors="none", 172 | edgecolors="red", 173 | s=50, 174 | linewidths=2, 175 | ) 176 | 177 | # Highlight outlier units 178 | for unit in np.where(~som.unit_saliency.numpy())[0]: 179 | unit_x, unit_y = som.unit_cords(unit) 180 | plt.gca().add_patch( 181 | plt.Circle((unit_x, unit_y), 0.4, fill=False, edgecolor="red", linewidth=2) 182 | ) 183 | 184 | # Set labels and title 185 | plt.xlabel("SOM Width") 186 | plt.ylabel("SOM Height") 187 | plt.title("Samples Mapped to Perfect 2D SOM Lattice") 188 | 189 | # Set tick labels 190 | plt.xticks(range(som.width)) 191 | plt.yticks(range(som.height)) 192 | 193 | # Add colorbar 194 | cbar = plt.colorbar(scatter) 195 | cbar.set_label("Unit Assignment") 196 | 197 | # Adjust plot limits 198 | plt.xlim(-0.5, som.width - 0.5) 199 | plt.ylim(-0.5, som.height - 0.5) 200 | 201 | # Show the plot 202 | plt.tight_layout() 203 | plt.show() 204 | 205 | # Create a folder to save outlier images 206 | output_folder = "outlier_digits" 207 | os.makedirs(output_folder, exist_ok=True) 208 | 209 | # Get the original digit images and their labels 210 | digits = load_digits() 211 | images = digits.images 212 | labels = digits.target 213 | 214 | # Find the indices of outlier samples 215 | outlier_indices = np.where(~salient_mask)[0] 216 | 217 | # Save outlier images 218 | for i, idx in enumerate(outlier_indices): 219 | img = images[idx] 220 | label = labels[idx] 221 | 222 | # Normalize the image to 0-255 range 223 | img_normalized = ((img - img.min()) / (img.max() - img.min()) * 255).astype( 224 | np.uint8 225 | ) 226 | 227 | # Create a PIL Image 228 | pil_img = Image.fromarray(img_normalized) 229 | 230 | # Save the image 231 | filename = f"outlier_{i}_label_{label}.png" 232 | pil_img.save(os.path.join(output_folder, filename)) 233 | 234 | print(f"Saved {len(outlier_indices)} outlier images to '{output_folder}' folder.") 235 | 236 | # Find samples closest to salient units 237 | salient_folder = "salient_digits" 238 | os.makedirs(salient_folder, exist_ok=True) 239 | salient_unit_indices = np.where(som.unit_saliency.numpy())[0] 240 | 241 | for i, unit_idx in enumerate(salient_unit_indices): 242 | # Find the sample closest to this salient unit 243 | unit_weights = som.W[unit_idx].detach().numpy() 244 | distances = np.linalg.norm(data.numpy() - unit_weights, axis=1) 245 | closest_sample_idx = np.argmin(distances) 246 | 247 | img = images[closest_sample_idx] 248 | label = labels[closest_sample_idx] 249 | 250 | # Normalize the image to 0-255 range 251 | img_normalized = ((img - img.min()) / (img.max() - img.min()) * 255).astype( 252 | np.uint8 253 | ) 254 | 255 | # Create a PIL Image 256 | pil_img = Image.fromarray(img_normalized) 257 | 258 | # Save the image 259 | filename = f"salient_unit_{i}_label_{label}.png" 260 | pil_img.save(os.path.join(salient_folder, filename)) 261 | 262 | print( 263 | f"Saved {len(salient_unit_indices)} salient unit images to '{salient_folder}' folder." 264 | ) 265 | -------------------------------------------------------------------------------- /rsom.py: -------------------------------------------------------------------------------- 1 | """ 2 | Rectifying Self Organazing Maps a.k.a RSOM 3 | 4 | RSOM is a clustering and outlier detection method that is predicated with 5 | old Self Organazing Maps. 6 | 7 | It includes Batch and Stochastic learning rules. There are two different 8 | implementations. One is based on Numpy and tthe other is Theano. If you have 9 | tall and wide data matrix, we suggest to use Theano version. Otherwise 10 | Numpy version is faster. You can also use GPU with Theano but you need to 11 | set Theano configurations. 12 | 13 | For more detail about RSOM refer to http://arxiv.org/abs/1312.4384 14 | 15 | AUTHOR: 16 | Eren Golge 17 | erengolge@gmail.com 18 | www.erengolge.com 19 | """ 20 | 21 | """ 22 | TO DO: 23 | -> Try dot product distance instead of Euclidean 24 | -> Normzalize only updated weight vectors in that epoch 25 | -> compare code with https://github.com/JustGlowing/minisom/blob/master/minisom.py 26 | -> print resulting objective values 27 | -> write bookeeping for best objective value 28 | -> learning rate is already decreasing so radius might be good to keep it constant 29 | -> UPDATE only winners 30 | """ 31 | 32 | import logging 33 | from typing import Optional, Tuple 34 | 35 | import numpy as np 36 | import torch 37 | 38 | # Set up logging 39 | logging.basicConfig( 40 | level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" 41 | ) 42 | logger = logging.getLogger(__name__) 43 | 44 | 45 | class RSOM(torch.nn.Module): 46 | def __init__( 47 | self, 48 | data: torch.Tensor, 49 | num_units: int = 10, 50 | height: Optional[int] = None, 51 | width: Optional[int] = None, 52 | alpha_max: float = 0.05, 53 | alpha_min: float = 0.001, 54 | set_count_activations: bool = True, 55 | set_outlier_unit_det: bool = True, 56 | set_inunit_outlier_det: bool = True, 57 | outlier_unit_thresh: float = 0.5, 58 | inunit_outlier_thresh: float = 95, 59 | dist: str = "euclidean", 60 | log_full_data_cost: bool = False, 61 | steps_to_full_data_cost: int = -1, # TODO: number of steps to compute full data cost 62 | ): 63 | """Rectifying Self Organizing Maps. RSOM is a clustering and outlier detection method that is predicated with old Self Organizing Maps. 64 | 65 | Args: 66 | data: Input data. 67 | num_units: Number of units. 68 | height: Height of the map. 69 | width: Width of the map. 70 | alpha_max: Maximum learning rate. 71 | alpha_min: Minimum learning rate. 72 | set_count_activations: Whether to count activations. 73 | set_outlier_unit_det: Whether to detect outlier units. 74 | set_inunit_outlier_det: Whether to detect in-unit outliers. 75 | outlier_unit_thresh: Threshold for outlier unit detection. 76 | inunit_outlier_thresh: Threshold for in-unit outlier detection. 77 | dist: Distance metric. Can be "euclidean" or "cosine". 78 | log_full_data_cost: Whether to log full data cost or use exponential moving average. Computing full data 79 | cost is expensive and might cause OOM. 80 | 81 | """ 82 | 83 | super(RSOM, self).__init__() 84 | self.X = data 85 | self.num_units = num_units 86 | self.height = height 87 | self.width = width 88 | self.alpha_max = alpha_max 89 | self.alpha_min = alpha_min 90 | self.set_count_activations = set_count_activations 91 | self.set_outlier_unit_det = set_outlier_unit_det 92 | self.set_inunit_outlier_det = set_inunit_outlier_det 93 | self.outlier_unit_thresh = outlier_unit_thresh 94 | self.inunit_outlier_thresh = inunit_outlier_thresh 95 | self.log_full_data_cost = log_full_data_cost 96 | 97 | self._estimate_map_shape() 98 | self.data_dim = self.X.shape[1] 99 | 100 | self.W = torch.nn.Parameter(torch.randn(self.num_units, self.data_dim)) 101 | self._normalize_weights() 102 | 103 | self.distance_metric = dist 104 | if self.distance_metric not in ["euclidean", "cosine"]: 105 | raise ValueError("distance_metric must be either 'euclidean' or 'cosine'") 106 | 107 | self.activations = torch.zeros(self.num_units) 108 | self.unit_saliency_coeffs = torch.zeros(self.num_units) 109 | self.unit_saliency = torch.ones(self.num_units, dtype=torch.bool) 110 | self.inst_saliency = torch.tensor([]) 111 | self.ins_unit_assign = torch.tensor([]) 112 | self.ins_unit_dist = torch.tensor([]) 113 | self.unit_coher = torch.tensor([]) 114 | 115 | def _normalize_weights(self): 116 | self.W.data = self.W.data / torch.norm(self.W.data, dim=1, keepdim=True) 117 | 118 | def _estimate_map_shape(self): 119 | if self.height is None or self.width is None: 120 | u, s, v = torch.svd(self.X) 121 | ratio = s[0] / s[1] 122 | self.height = min( 123 | self.num_units, int(np.ceil(np.sqrt(self.num_units / ratio))) 124 | ) 125 | self.width = int(np.ceil(self.num_units / self.height)) 126 | self.num_units = self.height * self.width 127 | logging.info( 128 | f"Estimated map size is -> height = {self.height}, width = {self.width}" 129 | ) 130 | 131 | def unit_cords(self, index: int) -> Tuple[int, int]: 132 | return index % self.width, index // self.width 133 | 134 | def _calc_distance(self, X, X2=None): 135 | if self.distance_metric == "euclidean": 136 | return self._euclidean_distance(X, X2) 137 | elif self.distance_metric == "cosine": 138 | return self._cosine_distance(X) 139 | 140 | def _euclidean_distance(self, X, X2=None): 141 | if X2 is None: 142 | X2 = (X**2).sum(1)[:, None] 143 | W2 = (self.W**2).sum(1)[:, None] 144 | return -2 * torch.mm(self.W, X.t()) + W2 + X2.t() 145 | 146 | def _cosine_distance(self, X): 147 | X_norm = X / torch.norm(X, dim=1, keepdim=True) 148 | W_norm = self.W / torch.norm(self.W, dim=1, keepdim=True) 149 | return 1 - torch.mm(W_norm, X_norm.t()) 150 | 151 | def find_neighbors(self, unit_id: int, radius: int) -> torch.Tensor: 152 | neighbors = torch.zeros(1, self.num_units) 153 | unit_x, unit_y = self.unit_cords(unit_id) 154 | 155 | min_y = max(int(unit_y - radius), 0) 156 | max_y = min(int(unit_y + radius), self.height - 1) 157 | min_x = max(int(unit_x - radius), 0) 158 | max_x = min(int(unit_x + radius), self.width - 1) 159 | 160 | for y in range(min_y, max_y + 1): 161 | for x in range(min_x, max_x + 1): 162 | dist = abs(y - unit_y) + abs(x - unit_x) 163 | neighbors[0, x + (y * self.width)] = dist 164 | 165 | return neighbors 166 | 167 | def best_match(self, X: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: 168 | if X.dim() == 1: 169 | X = X.unsqueeze(0) 170 | X2 = (X**2).sum(1).unsqueeze(1) 171 | D = -2 * torch.mm(self.W, X.t()) + (self.W**2).sum(1).unsqueeze(1) + X2.t() 172 | BMU = (D == D.min(0)[0]).float().t() 173 | return BMU, D 174 | 175 | def assing_to_units(self, X=None): 176 | if X is None: 177 | D = self._calc_distance(self.X) 178 | self.ins_unit_assign = D.argmin(axis=0) 179 | self.ins_unit_dist = D[self.ins_unit_assign, torch.arange(self.X.shape[0])] 180 | else: 181 | D = self._calc_distance(X) 182 | ins_unit_assign = D.argmin(axis=0) 183 | ins_unit_dist = D[ins_unit_assign, torch.arange(X.shape[0])] 184 | return ins_unit_assign, ins_unit_dist 185 | 186 | def set_params(self, num_epoch: int) -> dict: 187 | U = {"alphas": [], "H_maps": [], "radiuses": []} 188 | 189 | dist_map = torch.zeros(self.num_units, self.num_units) 190 | radius = np.ceil(1 + np.floor(min(self.width, self.height) - 1) / 2) - 1 191 | for u in range(self.num_units): 192 | dist_map[u, :] = self.find_neighbors(u, self.num_units) 193 | 194 | for epoch in range(num_epoch): 195 | alpha = self.alpha_max - self.alpha_min 196 | alpha = alpha * (num_epoch - epoch) / num_epoch + self.alpha_min 197 | radius = np.ceil(1 + np.floor(min(self.width, self.height) - 1) / 2) - 1 198 | radius = radius * (num_epoch - epoch) / (num_epoch - 1) - 1 199 | radius = max(radius, 0) 200 | 201 | neigh_updt_map = alpha * (1 - dist_map / float((1 + radius))) 202 | neigh_updt_map[dist_map > radius] = 0 203 | 204 | U["H_maps"].append(neigh_updt_map) 205 | U["alphas"].append(alpha) 206 | U["radiuses"].append(radius) 207 | 208 | return U 209 | 210 | def train_batch( 211 | self, 212 | num_epoch: Optional[int] = None, 213 | batch_size: Optional[int] = None, 214 | verbose: bool = True, 215 | ): 216 | """ 217 | Args: 218 | num_epoch: number of epochs to train 219 | batch_size: number of samples to train in one batch 220 | verbose: if True, print the progress of training 221 | """ 222 | if num_epoch is None: 223 | num_epoch = 500 * self.num_units 224 | 225 | if batch_size is None: 226 | batch_size = self.X.shape[0] 227 | 228 | logger.info("Learning...") 229 | U = self.set_params(num_epoch) 230 | 231 | X2 = None 232 | if batch_size == self.X.shape[0]: 233 | X2 = (self.X**2).sum(1).unsqueeze(1) 234 | 235 | for epoch in range(num_epoch): 236 | logger.info(f"Epoch --- {epoch}") 237 | update_rate = U["H_maps"][epoch] 238 | learn_rate = U["alphas"][epoch] 239 | 240 | shuffle_indices = torch.randperm(self.X.shape[0]) 241 | win_counts = torch.zeros(self.num_units) 242 | 243 | batches = torch.split(shuffle_indices, batch_size) 244 | num_steps = len(batches) 245 | 246 | for step, batch_indices in enumerate(batches): 247 | batch_data = self.X[batch_indices, :] 248 | D = self._calc_distance(batch_data, X2) 249 | BMU = (D == D.min(0)[0][None, :]).float().t() 250 | 251 | win_counts += BMU.sum(dim=0) 252 | 253 | if self.set_count_activations: 254 | self.activations += win_counts 255 | 256 | A = torch.mm(BMU, update_rate) 257 | S = A.sum(0) 258 | non_zeros = S.nonzero().squeeze() 259 | 260 | self.W.data[non_zeros] = torch.mm(A[:, non_zeros].t(), batch_data) / S[ 261 | non_zeros 262 | ].unsqueeze(1) 263 | 264 | self._print_cost(X2, D, epoch, num_epoch, step, num_steps) 265 | 266 | if self.set_outlier_unit_det: 267 | self._update_unit_saliency(win_counts, update_rate, learn_rate) 268 | 269 | if self.set_count_activations: 270 | self.activations /= self.activations.sum() 271 | 272 | self.assing_to_units() 273 | 274 | if self.set_outlier_unit_det: 275 | self._find_outlier_units() 276 | 277 | if self.set_inunit_outlier_det: 278 | self._find_inunit_outliers() 279 | 280 | def _print_cost( 281 | self, 282 | X2: torch.Tensor, 283 | D: torch.Tensor, 284 | epoch: int, 285 | num_epoch: int, 286 | step: int, 287 | num_steps: int, 288 | ): 289 | batch_cost = D.min(0)[0].mean() 290 | 291 | if self.log_full_data_cost: 292 | # TODO: handle when data is too large 293 | if self.distance_metric == "euclidean": 294 | D = self._calc_distance(self.X, X2) 295 | cost = torch.norm(D.min(0)[0], p=1) / self.X.shape[0] 296 | elif self.distance_metric == "cosine": 297 | D = self._calc_distance(self.X) 298 | cost = D.min(0)[0].mean() # Average minimum cosine distance 299 | else: 300 | # exponential moving average cost 301 | if not hasattr(self, "avg_cost"): 302 | self.avg_cost = batch_cost 303 | else: 304 | self.avg_cost = self.avg_cost * 0.01 + batch_cost * 0.99 305 | cost = self.avg_cost 306 | 307 | logger.info( 308 | f"epoch {epoch} / {num_epoch} -- step {step} / {num_steps} -- avg-cost: {cost.item():.6f} -- batch-cost: {batch_cost.item():.6f}" 309 | ) 310 | 311 | def _update_unit_saliency( 312 | self, win_counts: torch.Tensor, update_rate: torch.Tensor, learn_rate: float 313 | ): 314 | excitations = (update_rate * win_counts.unsqueeze(1)).sum(dim=0) / learn_rate 315 | excitations = excitations / excitations.sum() 316 | single_excitations = win_counts * learn_rate 317 | single_excitations = single_excitations / single_excitations.sum() 318 | self.unit_saliency_coeffs += excitations + single_excitations 319 | 320 | def _find_outlier_units(self): 321 | self.unit_saliency_coeffs /= self.unit_saliency_coeffs.sum() 322 | self.unit_saliency = ( 323 | self.unit_saliency_coeffs > self.outlier_unit_thresh / self.num_units 324 | ) 325 | 326 | self.inst_saliency = torch.ones(self.X.shape[0], dtype=torch.bool) 327 | outlier_units = torch.where(self.unit_saliency == False)[0] 328 | for i in outlier_units: 329 | self.inst_saliency[torch.where(self.ins_unit_assign == i)[0]] = False 330 | 331 | def _find_inunit_outliers(self): 332 | if self.inst_saliency.numel() == 0: 333 | self.inst_saliency = torch.ones(self.X.shape[0], dtype=torch.bool) 334 | 335 | for i in torch.unique(self.ins_unit_assign): 336 | indices = torch.where(self.ins_unit_assign == i)[0] 337 | unit_thresh = torch.quantile( 338 | self.ins_unit_dist[indices], self.inunit_outlier_thresh / 100 339 | ) 340 | outlier_insts = indices[self.ins_unit_dist[indices] > unit_thresh] 341 | self.inst_saliency[outlier_insts] = False 342 | 343 | def salient_inst_index(self) -> torch.Tensor: 344 | return torch.where(self.inst_saliency == True)[0] 345 | 346 | def salient_unit_index(self) -> torch.Tensor: 347 | return torch.where(self.unit_saliency == True)[0] 348 | 349 | def salient_insts(self) -> torch.Tensor: 350 | return self.X[self.inst_saliency] 351 | 352 | def salient_units(self) -> torch.Tensor: 353 | return self.W[self.unit_saliency] 354 | 355 | def inst_to_unit_mapping(self) -> torch.Tensor: 356 | return torch.stack((torch.arange(self.X.shape[0]), self.ins_unit_assign)) 357 | 358 | 359 | if __name__ == "__main__": 360 | import matplotlib.pyplot as plt 361 | import som_plot 362 | import torch 363 | from sklearn.datasets import load_digits 364 | from sklearn.preprocessing import StandardScaler 365 | 366 | # Load the digits dataset 367 | digits = load_digits() 368 | X = digits.data 369 | y = digits.target 370 | 371 | # Preprocess the data 372 | scaler = StandardScaler() 373 | X_scaled = scaler.fit_transform(X) 374 | 375 | # Convert to PyTorch tensor 376 | X_tensor = torch.tensor(X_scaled, dtype=torch.float32) 377 | 378 | # Initialize and train SOM 379 | som = SOM(X_tensor, num_units=100, alpha_max=0.05, alpha_min=0.01) 380 | som.train_batch(num_epoch=1000, batch_size=32, verbose=True) 381 | 382 | # Get the weights and assign instances to units 383 | W = som.W.detach().numpy() 384 | som.assing_to_units() 385 | 386 | # Plot scatter plot 387 | som_plot.som_plot_scatter(W, X_scaled, som.activations.numpy()) 388 | 389 | # Plot outlier scatter plot 390 | som_plot.som_plot_outlier_scatter( 391 | W, 392 | X_scaled, 393 | som.unit_saliency.numpy(), 394 | som.inst_saliency.numpy(), 395 | som.activations.numpy(), 396 | ) 397 | 398 | # Plot mapping 399 | distance_map = ( 400 | som._euq_dist(torch.sum(X_tensor**2, dim=1).unsqueeze(1), X_tensor) 401 | .detach() 402 | .numpy() 403 | ) 404 | distance_map = distance_map.reshape(som.height, som.width) 405 | som_plot.som_plot_mapping(distance_map) 406 | 407 | plt.show() 408 | -------------------------------------------------------------------------------- /som_theano.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | ''' 3 | Rectifying Self Organazing Maps a.k.a RSOM 4 | 5 | RSOM is a clustering and outlier detection method that is predicated with 6 | old Self Organazing Maps. 7 | 8 | It includes Batch and Stochastic learning rules. There are two different 9 | implementations. One is based on Numpy and tthe other is Theano. If you have 10 | tall and wide data matrix, we suggest to use Theano version. Otherwise 11 | Numpy version is faster. You can also use GPU with Theano but you need to 12 | set Theano configurations. 13 | 14 | For more detail about RSOM refer to http://arxiv.org/abs/1312.4384 15 | 16 | AUTHOR: 17 | Eren Golge 18 | erengolge@gmail.com 19 | www.erengolge.com 20 | ''' 21 | 22 | """ 23 | TO DO: 24 | -> Try dot product distance instead of Euclidean 25 | -> Normzalize only updated weight vectors in that epoch 26 | -> compare code with https://github.com/JustGlowing/minisom/blob/master/minisom.py 27 | -> print resulting objective values 28 | -> write bookeeping for best objective value 29 | -> learning rate is already decreasing so radius might be good to keep it constant 30 | -> UPDATE only winners 31 | """ 32 | 33 | import warnings 34 | from random import * 35 | from math import * 36 | import sys 37 | import scipy 38 | import numpy as np 39 | from numpy import linalg 40 | from som_plot import * 41 | import theano 42 | import theano.tensor as T 43 | from theano import function, config, shared, sandbox 44 | from theano import ProfileMode 45 | from collections import Counter 46 | #from theano import ProfileMode 47 | 48 | EPS = 2.2204e-16; 49 | 50 | class SOM(object): 51 | 52 | def __init__(self, DATA=None, num_units = 10, height=None, width=None, \ 53 | alpha_max=0.05, alpha_min=0.001, set_count_activations = True, \ 54 | set_outlier_unit_det = True, set_inunit_outlier_det = True, outlier_unit_thresh = 0.5,\ 55 | inunit_outlier_thresh = 95): 56 | 57 | ''' 58 | CONSTRUCTOR PARAMETERS: 59 | 60 | DATA --- data matrix with shape nxm n is number of instances and 61 | m is number of variables 62 | num_units --- number of som units. This can be changes a bit after 63 | 2D lattice shape is computed by eigen heuristic, if its shape 64 | paramters are not given already. 65 | height --- height of the 2D lattice of SOM 66 | width --- width of the 2D lattice of SOM. height * width = num_inst 67 | alpha_max --- is the maximum learning rate that is gradually 68 | decreasing up to alpha_min 69 | alpha_min --- is the minimum learning rate attined at the last epoch 70 | set_count_activations --- whether count the activation of each unit 71 | set_outlier_unit_det --- whether outlier units are detected. If a unit 72 | is detected as outlier, all of the assigned items signed as outlier as well 73 | set_inunit_outlier_det --- wheter in-unit outlier instances are detected 74 | outlier_unit_thresh --- default value 0.5 works good for many cases 75 | inunit_outlier_thresh --- is the upper whisker percentage. 76 | ''' 77 | 78 | self.X = DATA 79 | self.num_units = num_units 80 | if height == None or width == None: 81 | self._estimate_map_shape() 82 | self.num_units = self.height * self.width 83 | else: 84 | self.height = height 85 | self.width = width 86 | 87 | if self.height * self.width != self.num_units: 88 | print "Number of units is not conforming to lattice size so it is set num_units = width + heigth" 89 | self.num_units = self.height * self.width 90 | print "New number of units : ",self.num_units 91 | raw_input("Press Enter to continue...") 92 | 93 | self.data_dim = DATA.shape[1] 94 | 95 | # normalize data and save mean and std values 96 | self.data_mean = 0 97 | self.data_std = 0 98 | #self._norm_data() 99 | 100 | # optimization parameters 101 | self.alpha_max = alpha_max 102 | self.alpha_min = alpha_min 103 | 104 | self.W = np.random.random((self.num_units , self.data_dim)) 105 | self.W = np.array([v/linalg.norm(v) for v in self.W]) # normalizat 106 | 107 | # book keeping 108 | self.best_W = self.W 109 | self.best_W_obj = 0 110 | 111 | # unit statistics 112 | self.set_count_activations = set_count_activations 113 | self.activations = np.zeros((self.num_units)) 114 | self.set_outlier_unit_det = set_outlier_unit_det 115 | self.set_inunit_outlier_det = set_inunit_outlier_det 116 | self.unit_saliency_coeffs = np.zeros((self.num_units)) 117 | self.unit_saliency = np.ones((self.num_units), dtype=bool) 118 | self.inst_saliency = np.array(()) 119 | self.outlier_unit_thresh = outlier_unit_thresh 120 | self.inunit_outlier_thresh = inunit_outlier_thresh 121 | self.ins_unit_assign = np.array(()) 122 | self.ins_unit_dist = np.array(()) 123 | self.unit_coher = np.array(()) 124 | 125 | unit_x = lambda self, index, width : index % width 126 | unit_y = lambda self, index, width : np.floor( index / width ) 127 | 128 | def unit_cords(self, index): 129 | return self.unit_x(index, self.width), self.unit_y(index, self.width) 130 | 131 | # Euclidean distance with pre-computed data square X2 132 | def _euq_dist(self, X2, X): 133 | return -2*np.dot(self.W, X.T) + (self.W**2).sum(1)[:, None] + X2.T 134 | 135 | # Print function for Numpy based optimization functions 136 | def _print_cost(self,X2, epoch, num_epoch): 137 | D = self._euq_dist(X2, self.X) 138 | print "epoch", epoch, "of", num_epoch, " cost: ", np.linalg.norm(D.min(0), ord=1) / self.X.shape[0] 139 | 140 | 141 | def set_params(self, num_epoch): 142 | 143 | ''' 144 | Before starting to learning, all imperative parameters are set regarding 145 | corresponding epoch. It wastes some additional memory but proposes faster 146 | learning speed. 147 | 148 | Outputs: 149 | U --- is a dictionary including all necessary parameter structures 150 | 151 | U['alphas'] -- learning rates for each epoch 152 | U['H_maps'] -- matrix array of neighboorhood masks 153 | U['radiuses'] -- neighboor radiuses for each epoch 154 | 155 | ''' 156 | 157 | U = {'alphas':[], 'H_maps':[], 'radiuses':[]} 158 | alphas = [None]*num_epoch 159 | H_maps = [None]*num_epoch 160 | radiuses = [None]*num_epoch 161 | 162 | dist_map = np.zeros((self.num_units, self.num_units)) 163 | radius = np.ceil(1 + floor(min(self.width, self.height)-1)/2)-1 164 | for u in range(int(self.num_units)): 165 | #for r in range(1,int(radius)+1,1): 166 | dist_map[u,:] = self.find_neighbors(u,self.num_units) 167 | 168 | for epoch in range(0,num_epoch,1): 169 | alpha = self.alpha_max - self.alpha_min 170 | alpha = alpha * (num_epoch - epoch) 171 | alpha = alpha / num_epoch + self.alpha_min 172 | radius = np.ceil(1 + floor(min(self.width, self.height)-1)/2)-1 173 | radius = radius * (num_epoch - epoch) 174 | radius = ceil(radius / (num_epoch - 1))-1 175 | if radius < 0 : 176 | radius = 0 177 | neigh_updt_map = alpha * (1 - dist_map/float((1 + radius))) 178 | # neigh_updt_map[dist_map == 0] = 1 179 | neigh_updt_map[dist_map > radius] = 0 # Optimize this part 180 | H_maps[epoch] = neigh_updt_map 181 | alphas[epoch] = alpha 182 | radiuses[epoch] = radius 183 | 184 | U['alphas'] = alphas 185 | U['H_maps'] = H_maps 186 | U['radiuses'] = radiuses 187 | return U 188 | 189 | def train_stoch(self, num_epoch, verbose =True): 190 | 191 | ''' 192 | Numpy based stochastic training where each instance is take individually 193 | and weight are updatesd in terms of winner neuron. 194 | 195 | Generally faster than Theano version 196 | ''' 197 | 198 | if num_epoch == None: 199 | num_epoch = 500 * self.num_units # Kohonen's suggestion 200 | 201 | U = self.set_params(num_epoch) 202 | X2 = (self.X**2).sum(1)[:, None] 203 | 204 | for epoch in range(num_epoch): 205 | shuffle_indices = np.random.permutation(self.X.shape[0]) 206 | 207 | update_rate = U['H_maps'][epoch] 208 | learn_rate = U['alphas'][epoch] 209 | win_counts = np.zeros((self.num_units)) 210 | for i in shuffle_indices: 211 | instance = self.X[i,:] 212 | D = self._euq_dist(X2[i][None,:], instance[None,:]) 213 | BMU_indx = np.argmin(D) 214 | 215 | win_counts[BMU_indx] += 1 216 | if self.set_count_activations: 217 | self.activations[BMU_indx] += 1 218 | 219 | self.W = self.W + learn_rate * update_rate[...,BMU_indx,None]* (instance - self.W) 220 | ## Normalization is not imperative unless given input instances are normalized 221 | # self.W = self.W / np.linalg.norm(self.W) 222 | 223 | if verbose and (epoch % 1) == 0: 224 | self._print_cost(X2, epoch, num_epoch) 225 | 226 | if self.set_outlier_unit_det: 227 | self._update_unit_saliency(win_counts, update_rate, learn_rate) 228 | 229 | # Normalize activation counts 230 | if self.set_count_activations: 231 | total_act = self.activations.sum() 232 | self.activations = self.activations / total_act 233 | 234 | self.assing_to_units() # final unit assignments 235 | 236 | if self.set_outlier_unit_det: 237 | self._find_outlier_units() 238 | 239 | if self.set_inunit_outlier_det: 240 | self._find_inunit_outliers() 241 | 242 | 243 | 244 | def train_stoch_theano(self, num_epoch = None, verbose =True): 245 | 246 | ''' 247 | Theano based stochastic learning 248 | ''' 249 | 250 | warnings.simplefilter("ignore", DeprecationWarning) 251 | warnings.filterwarnings("ignore") 252 | 253 | if num_epoch == None: 254 | num_epoch = 500 * self.X.shape[0] 255 | 256 | # Symmbol variables 257 | X = T.dmatrix('X') 258 | WIN = T.dmatrix('WIN') 259 | H = T.dmatrix('H') 260 | 261 | # Init weights random 262 | W = theano.shared(self.W, name="W") 263 | #W = theano.shared(rng.randn(cluster_num, data.shape[1]).astype(theano.config.floatX), name="W") 264 | 265 | # Find winner unit 266 | D = (W**2).sum(axis=1, keepdims=True) + (X**2).sum(axis=1, keepdims=True).T - 2*T.dot(W, X.T) 267 | bmu = (D).argmin(axis=0) 268 | dist = T.dot(WIN.T, X) - WIN.sum(0)[:, None] * W 269 | err = D.min(0).norm(1)/X.shape[0] 270 | 271 | update = function([X,WIN, H],outputs=err,updates=[(W, W + T.addbroadcast(H,1)*dist)]) 272 | find_bmu = function([X], bmu) 273 | 274 | # Update 275 | U = self.set_params(num_epoch) 276 | for epoch in range(num_epoch): 277 | update_rate = U['H_maps'][epoch] 278 | learn_rate = U['alphas'][epoch] 279 | win_counts = np.zeros((self.num_units)) 280 | shuff_indx = np.random.permutation(self.X.shape[0]) 281 | for i in shuff_indx: 282 | ins = self.X[i, :][None,:] 283 | D = find_bmu(ins) 284 | S = np.zeros([ins.shape[0],self.num_units]) 285 | #S = np.zeros([batch,cluster_num], theano.config.floatX) 286 | S[:,D] = 1 287 | win_counts[D] += 1 288 | h = update_rate[D,:].sum(0)[:,None] 289 | cost = update(ins,S,h) 290 | 291 | if verbose: 292 | print "Avg. centroid distance -- ", cost,"\t EPOCH : ",epoch , " of ", num_epoch 293 | if self.set_count_activations: 294 | self.activations += win_counts 295 | 296 | if self.set_outlier_unit_det: 297 | self._update_unit_saliency(win_counts, update_rate, learn_rate) 298 | 299 | # get the data from shared theano variable 300 | self.W = W.get_value() 301 | 302 | # Normalize activation counts 303 | if self.set_count_activations: 304 | total_act = self.activations.sum() 305 | self.activations = self.activations / total_act 306 | 307 | self.assing_to_units() # final unit assignments 308 | 309 | if self.set_outlier_unit_det: 310 | self._find_outlier_units() 311 | 312 | if self.set_inunit_outlier_det: 313 | self._find_inunit_outliers() 314 | 315 | 316 | def train_batch_theano(self, num_epoch = None, batch_size = None, verbose=True): 317 | ''' 318 | Theano based batch learning. If you don't define batch size, then all the 319 | instances are fed for each epoch. 320 | 321 | It is preferred to use batch learning initially then fine tune with 322 | stochastic version 323 | 324 | In general Theano version is faster if the data is not very small. 325 | ''' 326 | 327 | if num_epoch == None: 328 | num_epoch = 500 * self.X.shape[0] 329 | 330 | if batch_size == None: 331 | batch_size = self.X.shape[0] 332 | 333 | # Symmbol variables 334 | X = T.dmatrix('X') 335 | WIN = T.dmatrix('WIN') 336 | alpha = T.dscalar('learn_rate') 337 | H = T.dmatrix('update_rate') 338 | 339 | # Init weights random 340 | W = theano.shared(self.W, name='W') 341 | W_old = W.get_value() 342 | 343 | # Find winner unit 344 | D = (W**2).sum(axis=1, keepdims=True) + (X**2).sum(axis=1, keepdims=True).T - 2*T.dot(W, X.T) 345 | BMU = (T.eq(D,D.min(axis=0, keepdims=True))).T 346 | dist = T.dot(BMU.T, X) - BMU.sum(0)[:, None] * W 347 | err = D.min(0).sum().norm(1)/X.shape[0] 348 | 349 | #update = function([X,WIN,alpha],outputs=err,updates=[(W, W + alpha * dist)]) 350 | 351 | A = T.dot(BMU, H) 352 | S = A.sum(axis=0) 353 | update_neigh_no_verbose = function([X, H],outputs=BMU, updates=[(W, T.where((S[:,None] > 0) ,T.dot(A.T, X), W) / T.where((S > 0), S, 1)[:,None])]) 354 | update_neigh = function([X, H],outputs=[err, BMU], updates=[(W, T.where((S[:,None] > 0) ,T.dot(A.T, X), W) / T.where((S > 0), S, 1)[:,None])]) 355 | find_bmu = function([X], BMU) 356 | 357 | # if any([x.op.__class__.__name__ in ['Gemv', 'CGemv', 'Gemm', 'CGemm'] for x in 358 | # update_neigh.maker.fgraph.toposort()]): 359 | # print 'Used the cpu' 360 | # elif any([x.op.__class__.__name__ in ['GpuGemm', 'GpuGemv'] for x in 361 | # update_neigh.maker.fgraph.toposort()]): 362 | # print 'Used the gpu' 363 | # else: 364 | # print 'ERROR, not able to tell if theano used the cpu or the gpu' 365 | # print update_neigh.maker.fgraph.toposort() 366 | 367 | U = self.set_params(num_epoch) 368 | for epoch in range(num_epoch): 369 | print 'Epoch --- ', epoch 370 | update_rate = U['H_maps'][epoch] 371 | learn_rate = U['alphas'][epoch] 372 | win_counts = np.zeros((self.num_units)) 373 | for i in range(0, self.X.shape[0], batch_size): 374 | batch_data = self.X[i:i+batch_size, :] 375 | #temp = find_bmu(batch_data) 376 | if verbose and epoch % 5 == 0: 377 | cost, winners = update_neigh(batch_data, update_rate) 378 | else: 379 | winners = update_neigh_no_verbose(batch_data, update_rate) 380 | win_counts =+ winners.sum(axis=0) 381 | ## Normalization is not imperative unless given input instances are normalized 382 | # self.W = self.W / np.linalg.norm(self.W) 383 | 384 | 385 | if verbose and epoch % 5 == 0: 386 | print "Avg. centroid distance -- ", cost,"\t EPOCH : ", epoch, " of ", num_epoch 387 | 388 | if self.set_count_activations: 389 | self.activations += win_counts 390 | 391 | if self.set_outlier_unit_det: 392 | self._update_unit_saliency(win_counts, update_rate, learn_rate) 393 | 394 | # get the data from shared theano variable 395 | self.W = W.get_value() 396 | 397 | # Normalize activation counts 398 | if self.set_count_activations: 399 | total_act = self.activations.sum() 400 | self.activations = self.activations / total_act 401 | 402 | self.assing_to_units() # final unit assignments 403 | 404 | if self.set_outlier_unit_det: 405 | self._find_outlier_units() 406 | 407 | if self.set_inunit_outlier_det: 408 | self._find_inunit_outliers() 409 | 410 | 411 | def train_batch(self, num_epoch = None, batch_size = None, verbose=True): 412 | 413 | ''' 414 | Numpy version of batch learning 415 | ''' 416 | 417 | if num_epoch == None: 418 | num_epoch = 500 * self.num_units # Kohonen's suggestion 419 | 420 | if batch_size == None: 421 | batch_size = self.X.shape[0] 422 | 423 | print 'Learning ... ' 424 | U = self.set_params(num_epoch) 425 | X2 = (self.X**2).sum(1)[:, None] 426 | for epoch in range(num_epoch): 427 | print 'Epoch --- ', epoch 428 | update_rate = U['H_maps'][epoch] 429 | learn_rate = U['alphas'][epoch] 430 | # randomize batch order 431 | shuffle_indices = np.random.permutation(self.X.shape[0]) 432 | win_counts = np.zeros((self.num_units)) 433 | for batch_indices in np.array_split(shuffle_indices, self.X.shape[0]/batch_size): 434 | batch_data = self.X[batch_indices,:] 435 | D = self._euq_dist(X2[batch_indices,:], batch_data) 436 | BMU = (D==D.min(0)[None,:]).astype("float32").T 437 | 438 | win_counts += BMU.sum(axis=0) 439 | #print win_counts 440 | 441 | if self.set_count_activations: 442 | self.activations += win_counts 443 | 444 | # batch learning 445 | A = np.dot(BMU, update_rate) 446 | S = A.sum(0) 447 | non_zeros = S.nonzero()[0] 448 | self.W[non_zeros, ...] = np.dot(A[:,non_zeros].T, batch_data) / S[non_zeros][..., None] 449 | 450 | # normalize weight vector 451 | ## Normalization is not imperative unless given input instances are normalized 452 | # self.W = self.W / np.linalg.norm(self.W) 453 | #self.W = self.W / np.linalg.norm(self.W) 454 | 455 | if self.set_outlier_unit_det: 456 | self._update_unit_saliency(win_counts, update_rate, learn_rate) 457 | 458 | if verbose and ((epoch % 1) == 0): 459 | self._print_cost(X2, epoch, num_epoch) 460 | 461 | # Normalize activation counts 462 | if self.set_count_activations: 463 | total_act = self.activations.sum() 464 | self.activations = self.activations / total_act 465 | 466 | self.assing_to_units() # final unit assignments 467 | 468 | if self.set_outlier_unit_det: 469 | self._find_outlier_units() 470 | 471 | if self.set_inunit_outlier_det: 472 | self._find_inunit_outliers() 473 | 474 | 475 | # Uses the Chessboard distance 476 | # Find the neighbooring units to given unit 477 | vis_neigh = lambda neigh_map, indx : neigh_map[indx].reshape((self.height, self.width)) 478 | def find_neighbors(self, unit_id, radius): 479 | neighbors = np.zeros((1,self.num_units)) 480 | test_neig = np.zeros((self.height, self.width)) 481 | unit_x, unit_y = self.unit_cords(unit_id) 482 | 483 | min_y = max(int(unit_y - radius), 0) 484 | max_y = min(int(unit_y + radius), self.height-1) 485 | min_x = max(int(unit_x - radius), 0) 486 | max_x = min(int(unit_x + radius), self.width-1) 487 | for y in range(min_y, max_y+1,1): 488 | for x in range(min_x, max_x+1,1): 489 | dist = abs(y-unit_y) + abs(x-unit_x) 490 | neighbors[0, x + ( y * self.width )] = dist 491 | test_neig[y,x] = dist 492 | return neighbors 493 | 494 | # find BMUs and between-distances for given set of instances 495 | def best_match(self, X): 496 | if len(X.shape) == 1: 497 | 498 | X = X.reshape((1,2)) 499 | X2 = (self.X**2).sum(1)[:, None] 500 | D = -2*np.dot(self.W, X.T)[None,:] + (self.W**2).sum(1)[:, None] + X2.T 501 | BMU = (D==D.min(0)[None,:]).astype("float32").T 502 | return BMU, D 503 | 504 | # structure the unit weight to be shown at U map 505 | def som_map(self): 506 | print('Som mapping is being computed...') 507 | sqrt_weigths = np.reshape(self.W,(self.height, self.width, self.data_dim)) 508 | um = np.zeros((sqrt_weigths.shape[0],sqrt_weigths.shape[1])) 509 | it = np.nditer(um, flags=['multi_index']) 510 | while not it.finished: 511 | for ii in range(it.multi_index[0]-1,it.multi_index[0]+2): 512 | for jj in range(it.multi_index[1]-1,it.multi_index[1]+2): 513 | if ii >= 0 and ii < sqrt_weigths.shape[0] and jj >= 0 and jj < sqrt_weigths.shape[1]: 514 | um[it.multi_index] += np.linalg.norm(sqrt_weigths[ii,jj,:]-sqrt_weigths[it.multi_index]) 515 | it.iternext() 516 | um = um/um.max() 517 | print("Mapping finished...!") 518 | return um 519 | 520 | 521 | # set the ratio of width and height of the map by the 522 | # ratio between largest 2 eigenvalues, computed from data 523 | def _estimate_map_shape(self): 524 | #num_instances = self.X.shape[0] 525 | u,s,v = np.linalg.svd(self.X ,full_matrices = False) 526 | s_sorted = np.sort(s)[::-1] 527 | ratio = s_sorted[0] / s_sorted[1] 528 | self.height = int(min(self.num_units, np.ceil(np.sqrt(self.num_units / ratio)))) 529 | self.width = int(np.ceil(self.num_units / self.height)) 530 | # self.height = int(np.round(np.sqrt(num_instances))) 531 | # self.width = int(np.round(num_instances / self.height)) 532 | print 'Estimated map size is -> height = ', self.height, ' width = ',self.width 533 | 534 | # assign instances to matching BMUs 535 | def assing_to_units(self, X=None): 536 | if X == None: 537 | X2 = (self.X**2).sum(1)[:, None] 538 | D = -2*np.dot(self.W, self.X.T) + (self.W**2).sum(1)[:, None] + X2.T 539 | 540 | self.ins_unit_assign = D.argmin(axis=0) 541 | self.ins_unit_dist = D[self.ins_unit_assign, np.arange(self.X.shape[0])] 542 | else: 543 | X2 = (X**2).sum(1)[:, None] 544 | D = -2*np.dot(self.W, X.T) + (self.W**2).sum(1)[:, None] + X2.T 545 | ins_unit_assign = D.argmin(axis=0) 546 | ins_unit_dist = D[ins_unit_assign, np.arange(X.shape[0])] 547 | return ins_unit_assign , ins_unit_dist 548 | 549 | 550 | def find_units_coherence(self): 551 | 552 | ''' 553 | Find individually coherence of each unit by looking to avg. distance 554 | between unit weight and the assigned instances 555 | ''' 556 | 557 | self.unit_coher = np.zeros((self.num_units)) 558 | for i in np.unique(self.ins_unit_assign): 559 | indices = np.where(self.ins_unit_assign == i) 560 | self.unit_coher[i] = np.sum(self.ins_unit_dist[indices]) / indices[0].size 561 | 562 | # return BMU, BMU distance, saliency by already trained params 563 | def process_new_data(self, X): 564 | BMU,dist = self.assing_to_units(X) 565 | 566 | # find outlier instanes in outlier units 567 | ins_saliency= np.ones((X.shape[0]), dtype=bool) 568 | outlier_units = np.where(self.unit_saliency == False)[0] 569 | for i in outlier_units: 570 | ins_saliency[np.where(BMU == i)] = False 571 | 572 | # find salient unit outliers 573 | for i in np.unique(BMU): 574 | indices = np.where(BMU == i)[0] 575 | unit_thresh = scipy.stats.scoreatpercentile(dist[indices], self.inunit_outlier_thresh) 576 | outlier_insts = indices[dist[indices] > unit_thresh] 577 | ins_saliency[outlier_insts] = False; 578 | 579 | return BMU, dist, ins_saliency 580 | 581 | 582 | 583 | def _update_unit_saliency(self, win_counts, update_rate, learn_rate): 584 | 585 | ''' 586 | It is called after each epoch of the learning. It compute the 587 | unit saliencies with the paper formula. At the end, those values 588 | defines the outlier and salient units 589 | ''' 590 | 591 | excitations = (update_rate * win_counts).sum(axis=0) / learn_rate 592 | excitations = excitations / excitations.sum() 593 | single_excitations = win_counts * learn_rate 594 | single_excitations = single_excitations / single_excitations.sum() 595 | self.unit_saliency_coeffs += excitations + single_excitations 596 | 597 | def _find_outlier_units(self): 598 | 599 | ''' 600 | After we compute unit saliencies, this function detects the outlier 601 | units by the paper heuristic 602 | ''' 603 | 604 | # find outlier units 605 | self.unit_saliency_coeffs /= self.unit_saliency_coeffs.sum() 606 | self.unit_saliency = self.unit_saliency_coeffs > self.outlier_unit_thresh/self.num_units 607 | 608 | # sign outlier instances 609 | self.inst_saliency = np.ones((self.X.shape[0]), dtype=bool) 610 | outlier_units = np.where(self.unit_saliency == False)[0] 611 | for i in outlier_units: 612 | self.inst_saliency[np.where(self.ins_unit_assign == i)] = False 613 | 614 | def _find_inunit_outliers(self): 615 | 616 | ''' 617 | Find the poor instances at the salient units. It uses an upper whisker 618 | assigned to the distances of the unit weight to unit instances. given the threshold, 619 | outside of the whisker is detedted as outlier. 620 | ''' 621 | 622 | # #remove outlier units 623 | # int_units = np.array(range(self.num_units)) 624 | # if self.unit_saliency.size > 0 and self.set_inunit_outlier_det: 625 | # int_units = int_units[self.unit_saliency] 626 | if self.inst_saliency.size == 0: 627 | self.inst_saliency = np.ones((self.X.shape[0]), dtype=bool) 628 | 629 | for i in np.unique(self.ins_unit_assign): 630 | indices = np.where(self.ins_unit_assign == i)[0] 631 | unit_thresh = scipy.stats.scoreatpercentile(self.ins_unit_dist[indices], self.inunit_outlier_thresh) 632 | outlier_insts = indices[self.ins_unit_dist[indices] > unit_thresh] 633 | self.inst_saliency[outlier_insts] = False; 634 | 635 | # Returns indices of salient instances 636 | def salient_inst_index(self): 637 | return np.where(self.inst_saliency == True)[0] 638 | 639 | def salient_unit_index(self): 640 | return np.where(self.unit_saliency == True)[0] 641 | 642 | def salient_insts(self): 643 | return self.X[np.where(self.inst_saliency == True)] 644 | 645 | def salient_units(self): 646 | return self.W[np.where(self.unit_saliency == True)] 647 | 648 | ## Returns instance to unit mapping. First row is instances. 649 | def inst_to_unit_mapping(self): 650 | return np.concatenate((np.arange(self.X.shape[0])[None,:], self.ins_unit_assign[None, :])) 651 | 652 | def salient_inst_to_unit_mapping(self): 653 | mapping = self.inst_to_unit_mapping() 654 | 655 | 656 | def _norm_data(self, X = None): 657 | 658 | ''' 659 | Take the norm of the given data matrix and save std and mean 660 | for future purposes 661 | ''' 662 | 663 | if X == None: 664 | self.data_mean = self.X.mean(axis=0) 665 | self.data_std = self.X.std(axis=0, ddof=1) 666 | self.X = (self.X - self.data_mean) / (self.data_std + EPS) 667 | else: 668 | data_mean = X.mean(axis=0) 669 | data_std = X.std(axis=0, ddof=1) 670 | X = (X - data_mean) / data_std 671 | return X, data_mean, data_std 672 | 673 | 674 | ''' 675 | DEMO CODE 676 | ''' 677 | if __name__ == "__main__": 678 | from sklearn import datasets 679 | import time 680 | 681 | data = datasets.load_digits().data 682 | 683 | som = SOM(DATA = data, alpha_max=0.05, num_units=100, height = 10, width = 10) 684 | #som.train_batch(100) 685 | #start = time.time() 686 | #som.train_stoch_theano(10) 687 | som.train_batch_theano(num_epoch=100) 688 | #som.train_stoch(10) 689 | #clusters = som.ins_unit_assign 690 | #print clusters 691 | #stop = time.time() 692 | # 693 | print som.unit_saliency 694 | 695 | #som_plot_scatter(som.W, som.X, som.activations) 696 | #som_plot_outlier_scatter(som.W, som.X, som.unit_saliency, som.inst_saliency, som.activations) 697 | #som_mapping = som.som_map() 698 | #som_plot_mapping(som_mapping) 699 | print "Demo finished!" 700 | #print "Pass time : ", stop - start 701 | --------------------------------------------------------------------------------