├── __init__.py
├── som.pyc
├── som_plot.pyc
├── visuals
├── som_latice.png
└── 2d_projection.png
├── setup.py
├── README.md
├── tests.py
├── grid_search.py
├── .gitignore
├── sample_run.py
├── rsom.py
└── som_theano.py
/__init__.py:
--------------------------------------------------------------------------------
1 | from .som import SOM
2 |
--------------------------------------------------------------------------------
/som.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/erogol/RSOM/HEAD/som.pyc
--------------------------------------------------------------------------------
/som_plot.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/erogol/RSOM/HEAD/som_plot.pyc
--------------------------------------------------------------------------------
/visuals/som_latice.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/erogol/RSOM/HEAD/visuals/som_latice.png
--------------------------------------------------------------------------------
/visuals/2d_projection.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/erogol/RSOM/HEAD/visuals/2d_projection.png
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import find_packages, setup

# The README doubles as the long description shown on package indexes.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

setup(
    name="RSOM",
    version="0.1.0",
    author="Eren Gölge",
    author_email="",
    description="A Rectifying Self-Organizing Map (RSOM) implementation",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/erogol/RSOM",
    packages=find_packages(exclude=["tests*", "examples*"]),
    # rsom.py is a single top-level module rather than a package directory,
    # so it has to be listed explicitly for `from rsom import RSOM` to work
    # after installation.
    py_modules=["rsom"],
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: MIT License",
    ],
    python_requires=">=3.6",
    install_requires=[
        "numpy",
        "matplotlib",
        "torch",
        "scikit-learn",
    ],
)
29 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Rectifying Self Organizing Map (RSOM)
2 | ===============================
3 |
4 | 📎 **Paper 1:** Gölge, E., & Duygulu, P. 2013. Rectifying Self Organizing Maps for Automatic Concept Learning from Web Images
5 |
6 | 📎 **Paper 2:** Gölge, E., & Duygulu, P. ConceptMap: Mining noisy web data for concept learning, The European Conference on Computer Vision (ECCV) 2014.
7 |
8 | RSOM is an extension of the well-known Self Organizing Map (SOM). It mimics SOM clustering and additionally detects outliers in the given dataset at the cluster level or the instance level.
9 | It is mainly used for image tasks but works equally well with any other type of data.
10 |
11 | ## Installation
12 |
13 | ```
14 | git clone https://github.com/erogol/RSOM.git
15 | cd RSOM
16 | python setup.py install
17 | ```
18 |
19 | or
20 |
21 | ```
22 | pip install git+https://github.com/erogol/RSOM.git
23 | ```
24 |
25 | ## Usage
26 |
27 | Check ```sample_run.py``` for more.
28 |
29 | ```python
30 | from rsom import RSOM
31 |
32 | # Load Digits dataset
33 | data = load_digits().data
34 | data = torch.from_numpy(data).float()
35 | print(data.shape)
36 |
37 | # Initialize SOM
38 | som = RSOM(data, alpha_max=0.05, num_units=49)
39 |
40 | # Train SOM
41 | som.train_batch(num_epoch=1000, verbose=True)
42 |
43 | # Get salient instances and units
44 | salient_insts = som.salient_insts()
45 | salient_units = som.salient_units()
46 | ```
47 |
48 |
49 |
50 | ## Citation
51 |
52 | ```
53 | @misc{golge2013rectifyingselforganizingmaps,
54 | title={Rectifying Self Organizing Maps for Automatic Concept Learning from Web Images},
55 | author={Eren Golge and Pinar Duygulu},
56 | year={2013},
57 | eprint={1312.4384},
58 | archivePrefix={arXiv},
59 | primaryClass={cs.CV},
60 | url={https://arxiv.org/abs/1312.4384},
61 | }
62 | ```
63 |
64 | ## Example Visuals
65 |
66 | Visuals are generated using ```sample_run.py``` and digits dataset.
67 |
68 |
69 |
70 |
71 |
--------------------------------------------------------------------------------
/tests.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import torch
4 |
5 | from rsom import RSOM
6 |
7 |
class TestSOM(unittest.TestCase):
    """Shape and bookkeeping checks for ``RSOM`` on a 5x5 map of random data."""

    def setUp(self):
        # 100 samples with 10 features; height/width are given explicitly so
        # the map shape does not depend on the SVD-based estimate.
        self.data = torch.randn(100, 10)
        self.som = RSOM(self.data, num_units=25, height=5, width=5)

    def test_init(self):
        self.assertEqual(self.som.num_units, 25)
        self.assertEqual(self.som.height, 5)
        self.assertEqual(self.som.width, 5)
        self.assertEqual(self.som.W.shape, (25, 10))

    def test_normalize_weights(self):
        self.som._normalize_weights()
        norms = torch.norm(self.som.W.data, dim=1)
        self.assertTrue(torch.allclose(norms, torch.ones_like(norms), atol=1e-6))

    def test_unit_cords(self):
        # unit_cords returns (index % width, index // width).
        self.assertEqual(self.som.unit_cords(7), (2, 1))
        self.assertEqual(self.som.unit_cords(24), (4, 4))

    def test_euq_dist(self):
        X = self.data[:5]
        X2 = (X**2).sum(1).unsqueeze(1)
        # RSOM exposes the routine as `_euclidean_distance(X, X2)`; the data
        # comes first and the precomputed squared norms second.
        D = self.som._euclidean_distance(X, X2)
        self.assertEqual(D.shape, (25, 5))

    def test_find_neighbors(self):
        neighbors = self.som.find_neighbors(12, 1)
        self.assertEqual(neighbors.shape, (1, 25))
        # A unit is at distance 0 from itself.
        self.assertEqual(neighbors[0, 12].item(), 0)

    def test_best_match(self):
        X = self.data[:5]
        BMU, D = self.som.best_match(X)
        self.assertEqual(BMU.shape, (5, 25))
        self.assertEqual(D.shape, (25, 5))

    def test_assing_to_units(self):
        self.som.assing_to_units()
        self.assertEqual(self.som.ins_unit_assign.shape, (100,))
        self.assertEqual(self.som.ins_unit_dist.shape, (100,))

    def test_set_params(self):
        U = self.som.set_params(10)
        self.assertEqual(len(U["alphas"]), 10)
        self.assertEqual(len(U["H_maps"]), 10)
        self.assertEqual(len(U["radiuses"]), 10)

    def test_train_batch(self):
        self.som.train_batch(num_epoch=5, batch_size=20, verbose=False)
        self.assertIsNotNone(self.som.ins_unit_assign)
        self.assertIsNotNone(self.som.ins_unit_dist)

    def test_update_unit_saliency(self):
        win_counts = torch.ones(25)
        update_rate = torch.ones(25, 25)
        self.som._update_unit_saliency(win_counts, update_rate, 0.1)
        self.assertGreater(self.som.unit_saliency_coeffs.sum().item(), 0)
66 |
67 |
68 | if __name__ == "__main__":
69 | unittest.main()
70 |
--------------------------------------------------------------------------------
/grid_search.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | from matplotlib import pyplot as plt
4 | from sklearn.datasets import load_digits
5 | from sklearn.model_selection import train_test_split
6 | from tqdm import tqdm
7 |
8 | from som import SOM
9 |
10 |
def quantization_error(som, data):
    """Return the mean best-matching-unit distance of ``data`` on ``som``.

    ``som.best_match(data)`` must return a pair whose second item is a
    distance matrix. The smallest entry of every column (minimum along
    dimension 0) is taken and those minima are averaged.
    """
    distance_matrix = som.best_match(data)[1]
    closest, _ = torch.min(distance_matrix, dim=0)
    return closest.mean()
14 |
15 |
def grid_search_som(data, unit_range, epochs=1000, alpha_max=0.05, trials=3):
    """Train several SOMs per candidate map size and summarise their error.

    Args:
        data: Training data handed to every ``SOM`` instance.
        unit_range: Iterable of candidate unit counts.
        epochs: Number of epochs passed to ``train_batch``.
        alpha_max: Maximum learning rate passed to ``SOM``.
        trials: How many independently initialised maps to train per size.

    Returns:
        A list of ``(num_units, mean_error, std_error)`` tuples, one per
        entry of ``unit_range``, in iteration order.
    """
    summary = []

    for num_units in tqdm(unit_range, desc="Grid Search"):
        errors = []
        for _trial in range(trials):
            model = SOM(data, num_units=num_units, alpha_max=alpha_max)
            model.train_batch(num_epoch=epochs, verbose=False)
            errors.append(quantization_error(model, data).item())

        avg_error, std_error = np.mean(errors), np.std(errors)
        summary.append((num_units, avg_error, std_error))

        print(
            f"Units: {num_units}, Avg Error: {avg_error:.4f}, Std Error: {std_error:.4f}"
        )

    return summary
36 |
37 |
def _min_max_scale(values):
    """Scale ``values`` to [0, 1]; a constant array maps to all zeros."""
    lowest = values.min()
    span = values.max() - lowest
    if span == 0:
        # Avoid 0/0: a constant axis carries no shape information.
        return np.zeros(values.shape, dtype=float)
    return (values - lowest) / span


def find_elbow(x, y):
    """Locate the elbow of a curve as the point farthest from its chord.

    Both axes are min-max normalised, then every point's perpendicular
    distance to the straight line joining the first and last points is
    computed; the point with the largest distance is the elbow.

    Args:
        x: Sequence of x values.
        y: Sequence of y values, same length as ``x``.

    Returns:
        ``(x[i], y[i])`` for the elbow index ``i``. When the curve is
        degenerate (single point, or first and last points coincide after
        normalisation) the first point is returned.

    Raises:
        ValueError: If ``x`` or ``y`` is empty.
    """
    x = np.asarray(x)
    y = np.asarray(y)
    if x.size == 0 or y.size == 0:
        raise ValueError("find_elbow requires at least one (x, y) point")

    coords = np.vstack([_min_max_scale(x), _min_max_scale(y)]).T
    first = coords[0]
    line_vec = coords[-1] - first
    line_len = np.sqrt(np.sum(line_vec**2))
    if line_len == 0:
        # No chord to measure against; every distance would be undefined.
        return x[0], y[0]

    line_vec_norm = line_vec / line_len
    vec_from_first = coords - first
    # Component of each point along the chord; the remainder is the
    # perpendicular offset whose length we maximise.
    scalar_proj = np.dot(vec_from_first, line_vec_norm)
    proj = np.outer(scalar_proj, line_vec_norm)
    distances = np.sqrt(np.sum((vec_from_first - proj) ** 2, axis=1))

    elbow_index = np.argmax(distances)
    return x[elbow_index], y[elbow_index]
58 |
59 |
if __name__ == "__main__":
    # Load Digits dataset
    digits = load_digits()
    data = torch.from_numpy(digits.data).float()

    # Normalize the data to [0, 1] using the global min and max
    data = (data - data.min()) / (data.max() - data.min())

    # Split the data into train and test sets
    X_train, X_test = train_test_split(data, test_size=0.2, random_state=42)

    # Define the range of units to search
    unit_range = [9, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 196]

    # Perform grid search on the training split only
    results = grid_search_som(
        X_train, unit_range, epochs=1000, alpha_max=0.05, trials=3
    )

    # Extract units and errors
    units = [r[0] for r in results]
    errors = [r[1] for r in results]
    error_stds = [r[2] for r in results]

    # Find the elbow point
    elbow_units, elbow_error = find_elbow(units, errors)

    print(f"\nElbow point: {elbow_units:.0f} units, Error: {elbow_error:.4f}")

    # Plot the results
    plt.figure(figsize=(10, 6))
    plt.errorbar(units, errors, yerr=error_stds, fmt="o-", capsize=5)
    plt.plot(elbow_units, elbow_error, "ro", markersize=10, label="Elbow point")
    plt.xlabel("Number of Units")
    plt.ylabel("Quantization Error")
    plt.title("SOM Grid Search Results")
    plt.xscale("log")
    plt.grid(True)
    plt.legend()
    plt.show()

    # Train the SOM with the elbow point number of units. Only the training
    # split is used so that X_test stays unseen and the error reported below
    # is a genuine held-out estimate.
    best_som = SOM(X_train, num_units=int(elbow_units), alpha_max=0.05)
    best_som.train_batch(num_epoch=1000, verbose=True)

    # Evaluate on test set
    test_error = quantization_error(best_som, X_test)
    print(f"\nTest set quantization error: {test_error:.4f}")
108 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110 | .pdm.toml
111 | .pdm-python
112 | .pdm-build/
113 |
114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115 | __pypackages__/
116 |
117 | # Celery stuff
118 | celerybeat-schedule
119 | celerybeat.pid
120 |
121 | # SageMath parsed files
122 | *.sage.py
123 |
124 | # Environments
125 | .env
126 | .venv
127 | env/
128 | venv/
129 | ENV/
130 | env.bak/
131 | venv.bak/
132 |
133 | # Spyder project settings
134 | .spyderproject
135 | .spyproject
136 |
137 | # Rope project settings
138 | .ropeproject
139 |
140 | # mkdocs documentation
141 | /site
142 |
143 | # mypy
144 | .mypy_cache/
145 | .dmypy.json
146 | dmypy.json
147 |
148 | # Pyre type checker
149 | .pyre/
150 |
151 | # pytype static type analyzer
152 | .pytype/
153 |
154 | # Cython debug symbols
155 | cython_debug/
156 |
157 | # PyCharm
158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160 | # and can be added to the global gitignore or merged into this file. For a more nuclear
161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162 | #.idea/
163 |
164 | .png
--------------------------------------------------------------------------------
/sample_run.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 | import torch
6 | from PIL import Image
7 | from sklearn.datasets import load_digits
8 | from sklearn.decomposition import PCA
9 |
10 | from rsom import RSOM
11 |
12 |
def get_node_coordinates(som, pca):
    """Project every SOM unit's weight vector through ``pca``.

    Units are visited in row-major lattice order (index ``i * width + j``),
    each weight vector is transformed on its own, and the results are
    stacked into one array with a row per unit.
    """
    total_nodes = som.height * som.width
    projected = [
        pca.transform([som.W[node].detach().numpy()])[0]
        for node in range(total_nodes)
    ]
    return np.array(projected)
22 |
23 |
# Load the Digits dataset
data = load_digits().data
data = torch.from_numpy(data).float()
print(data.shape)

# Standardize each feature. The epsilon is added (not subtracted) so that a
# constant feature, whose std is 0, is divided by a small positive number
# instead of a small negative one.
data = (data - data.mean(axis=0)) / (data.std(axis=0) + 1e-8)

# Initialize SOM
som = RSOM(
    data,
    alpha_max=0.001,
    alpha_min=0.0005,
    num_units=49,
)

# Train batch SOM
som.train_batch(num_epoch=10000, verbose=True, batch_size=128)

# Get salient instances and units
salient_insts = som.salient_insts()
salient_units = som.salient_units()
46 |
47 | # Perform PCA to reduce data to 2D for visualization
48 | pca = PCA(n_components=2)
49 | data_2d = pca.fit_transform(som.X.numpy())
50 | units_2d = pca.transform(som.W.detach().numpy())
51 |
52 | # Get node coordinates
53 | node_coords = get_node_coordinates(som, pca)
54 |
55 | # Create a plot
56 | plt.figure(figsize=(12, 8))
57 |
58 | # Plot data points
59 | salient_mask = som.inst_saliency.numpy()
60 | plt.scatter(
61 | data_2d[salient_mask, 0],
62 | data_2d[salient_mask, 1],
63 | c=som.ins_unit_assign[salient_mask],
64 | cmap="viridis",
65 | alpha=0.6,
66 | label="Salient Samples",
67 | )
68 | plt.scatter(
69 | data_2d[~salient_mask, 0],
70 | data_2d[~salient_mask, 1],
71 | c="red",
72 | marker="x",
73 | alpha=0.6,
74 | label="Outlier Samples",
75 | )
76 |
77 | # Plot SOM units
78 | salient_units_mask = som.unit_saliency.numpy()
79 | plt.scatter(
80 | node_coords[salient_units_mask, 0],
81 | node_coords[salient_units_mask, 1],
82 | c="black",
83 | marker="s",
84 | s=50,
85 | label="Salient Units",
86 | )
87 | plt.scatter(
88 | node_coords[~salient_units_mask, 0],
89 | node_coords[~salient_units_mask, 1],
90 | c="red",
91 | marker="s",
92 | s=50,
93 | label="Outlier Units",
94 | )
95 |
96 | # Draw lattice lines
97 | for i in range(som.height):
98 | for j in range(som.width):
99 | node_index = i * som.width + j
100 | if j < som.width - 1: # Horizontal line
101 | next_node_index = node_index + 1
102 | plt.plot(
103 | [node_coords[node_index, 0], node_coords[next_node_index, 0]],
104 | [node_coords[node_index, 1], node_coords[next_node_index, 1]],
105 | "gray",
106 | alpha=0.5,
107 | )
108 | if i < som.height - 1: # Vertical line
109 | next_node_index = node_index + som.width
110 | plt.plot(
111 | [node_coords[node_index, 0], node_coords[next_node_index, 0]],
112 | [node_coords[node_index, 1], node_coords[next_node_index, 1]],
113 | "gray",
114 | alpha=0.5,
115 | )
116 |
117 | # Add labels and title
118 | plt.xlabel("First Principal Component")
119 | plt.ylabel("Second Principal Component")
120 | plt.title("SOM Units and Data Samples with Outliers and Lattice")
121 | plt.legend()
122 |
123 | # Show the plot
124 | plt.show()
125 |
126 | # Optional: Print some statistics
127 | print(f"Number of salient samples: {salient_mask.sum()}")
128 | print(f"Number of outlier samples: {(~salient_mask).sum()}")
129 | print(f"Number of salient units: {salient_units_mask.sum()}")
130 | print(f"Number of outlier units: {(~salient_units_mask).sum()}")
131 |
132 | # Create a new figure for the perfect 2D lattice plot
133 | plt.figure(figsize=(12, 12))
134 |
135 | # Create a perfect 2D grid for SOM nodes
136 | grid_x, grid_y = np.meshgrid(np.arange(som.width), np.arange(som.height))
137 | grid_x = grid_x.flatten()
138 | grid_y = grid_y.flatten()
139 |
140 | # Plot the perfect grid
141 | plt.scatter(grid_x, grid_y, c="lightgray", s=200, marker="s")
142 |
143 | # Draw grid lines
144 | for x in range(som.width):
145 | plt.axvline(x, color="lightgray", linestyle="--")
146 | for y in range(som.height):
147 | plt.axhline(y, color="lightgray", linestyle="--")
148 |
149 | # Get the unit assignments for each sample
150 | unit_assignments = som.ins_unit_assign.numpy()
151 |
152 | # Calculate the positions of samples on the grid
153 | sample_x = grid_x[unit_assignments].astype(float)
154 | sample_y = grid_y[unit_assignments].astype(float)
155 |
156 | # Add some jitter to prevent complete overlap
157 | jitter = 0.2
158 | sample_x += np.random.uniform(-jitter, jitter, sample_x.shape)
159 | sample_y += np.random.uniform(-jitter, jitter, sample_y.shape)
160 |
161 | # Plot the samples on the grid
162 | scatter = plt.scatter(
163 | sample_x, sample_y, c=som.ins_unit_assign, cmap="viridis", alpha=0.6
164 | )
165 |
166 | # Highlight outlier samples
167 | outlier_mask = ~som.inst_saliency.numpy()
168 | plt.scatter(
169 | sample_x[outlier_mask],
170 | sample_y[outlier_mask],
171 | facecolors="none",
172 | edgecolors="red",
173 | s=50,
174 | linewidths=2,
175 | )
176 |
177 | # Highlight outlier units
178 | for unit in np.where(~som.unit_saliency.numpy())[0]:
179 | unit_x, unit_y = som.unit_cords(unit)
180 | plt.gca().add_patch(
181 | plt.Circle((unit_x, unit_y), 0.4, fill=False, edgecolor="red", linewidth=2)
182 | )
183 |
184 | # Set labels and title
185 | plt.xlabel("SOM Width")
186 | plt.ylabel("SOM Height")
187 | plt.title("Samples Mapped to Perfect 2D SOM Lattice")
188 |
189 | # Set tick labels
190 | plt.xticks(range(som.width))
191 | plt.yticks(range(som.height))
192 |
193 | # Add colorbar
194 | cbar = plt.colorbar(scatter)
195 | cbar.set_label("Unit Assignment")
196 |
197 | # Adjust plot limits
198 | plt.xlim(-0.5, som.width - 0.5)
199 | plt.ylim(-0.5, som.height - 0.5)
200 |
201 | # Show the plot
202 | plt.tight_layout()
203 | plt.show()
204 |
205 | # Create a folder to save outlier images
206 | output_folder = "outlier_digits"
207 | os.makedirs(output_folder, exist_ok=True)
208 |
209 | # Get the original digit images and their labels
210 | digits = load_digits()
211 | images = digits.images
212 | labels = digits.target
213 |
214 | # Find the indices of outlier samples
215 | outlier_indices = np.where(~salient_mask)[0]
216 |
217 | # Save outlier images
218 | for i, idx in enumerate(outlier_indices):
219 | img = images[idx]
220 | label = labels[idx]
221 |
222 | # Normalize the image to 0-255 range
223 | img_normalized = ((img - img.min()) / (img.max() - img.min()) * 255).astype(
224 | np.uint8
225 | )
226 |
227 | # Create a PIL Image
228 | pil_img = Image.fromarray(img_normalized)
229 |
230 | # Save the image
231 | filename = f"outlier_{i}_label_{label}.png"
232 | pil_img.save(os.path.join(output_folder, filename))
233 |
234 | print(f"Saved {len(outlier_indices)} outlier images to '{output_folder}' folder.")
235 |
236 | # Find samples closest to salient units
237 | salient_folder = "salient_digits"
238 | os.makedirs(salient_folder, exist_ok=True)
239 | salient_unit_indices = np.where(som.unit_saliency.numpy())[0]
240 |
241 | for i, unit_idx in enumerate(salient_unit_indices):
242 | # Find the sample closest to this salient unit
243 | unit_weights = som.W[unit_idx].detach().numpy()
244 | distances = np.linalg.norm(data.numpy() - unit_weights, axis=1)
245 | closest_sample_idx = np.argmin(distances)
246 |
247 | img = images[closest_sample_idx]
248 | label = labels[closest_sample_idx]
249 |
250 | # Normalize the image to 0-255 range
251 | img_normalized = ((img - img.min()) / (img.max() - img.min()) * 255).astype(
252 | np.uint8
253 | )
254 |
255 | # Create a PIL Image
256 | pil_img = Image.fromarray(img_normalized)
257 |
258 | # Save the image
259 | filename = f"salient_unit_{i}_label_{label}.png"
260 | pil_img.save(os.path.join(salient_folder, filename))
261 |
262 | print(
263 | f"Saved {len(salient_unit_indices)} salient unit images to '{salient_folder}' folder."
264 | )
265 |
--------------------------------------------------------------------------------
/rsom.py:
--------------------------------------------------------------------------------
1 | """
2 | Rectifying Self Organizing Maps a.k.a. RSOM
3 |
4 | RSOM is a clustering and outlier detection method that is predicated on
5 | the classic Self Organizing Maps.
6 |
7 | It includes Batch and Stochastic learning rules. There are two different
8 | implementations. One is based on Numpy and the other is Theano. If you have
9 | a tall and wide data matrix, we suggest using the Theano version. Otherwise the
10 | Numpy version is faster. You can also use a GPU with Theano but you need to
11 | set the Theano configuration.
12 |
13 | For more detail about RSOM refer to http://arxiv.org/abs/1312.4384
14 |
15 | AUTHOR:
16 | Eren Golge
17 | erengolge@gmail.com
18 | www.erengolge.com
19 | """
20 |
21 | """
22 | TO DO:
23 | -> Try dot product distance instead of Euclidean
24 | -> Normalize only updated weight vectors in that epoch
25 | -> compare code with https://github.com/JustGlowing/minisom/blob/master/minisom.py
26 | -> print resulting objective values
27 | -> write bookkeeping for best objective value
28 | -> learning rate is already decreasing so radius might be good to keep it constant
29 | -> UPDATE only winners
30 | """
31 |
32 | import logging
33 | from typing import Optional, Tuple
34 |
35 | import numpy as np
36 | import torch
37 |
38 | # Set up logging
39 | logging.basicConfig(
40 | level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
41 | )
42 | logger = logging.getLogger(__name__)
43 |
44 |
45 | class RSOM(torch.nn.Module):
def __init__(
    self,
    data: torch.Tensor,
    num_units: int = 10,
    height: Optional[int] = None,
    width: Optional[int] = None,
    alpha_max: float = 0.05,
    alpha_min: float = 0.001,
    set_count_activations: bool = True,
    set_outlier_unit_det: bool = True,
    set_inunit_outlier_det: bool = True,
    outlier_unit_thresh: float = 0.5,
    inunit_outlier_thresh: float = 95,
    dist: str = "euclidean",
    log_full_data_cost: bool = False,
    steps_to_full_data_cost: int = -1,  # TODO: number of steps to compute full data cost
):
    """Rectifying Self Organizing Map: SOM clustering plus outlier detection.

    Args:
        data: Input data; its second dimension is taken as the feature size.
        num_units: Number of units.
        height: Height of the map.
        width: Width of the map.
        alpha_max: Maximum learning rate.
        alpha_min: Minimum learning rate.
        set_count_activations: Whether to count activations.
        set_outlier_unit_det: Whether to detect outlier units.
        set_inunit_outlier_det: Whether to detect in-unit outliers.
        outlier_unit_thresh: Threshold for outlier unit detection.
        inunit_outlier_thresh: Threshold for in-unit outlier detection.
        dist: Distance metric. Can be "euclidean" or "cosine".
        log_full_data_cost: Whether to log full data cost or use exponential
            moving average. Computing full data cost is expensive and might
            cause OOM.
        steps_to_full_data_cost: Accepted but not stored or used by the
            constructor (see the TODO on the parameter).

    Raises:
        ValueError: If ``dist`` is neither "euclidean" nor "cosine".
    """
    # Fail fast: reject a bad metric before the SVD-based shape estimate and
    # the weight allocation below.
    if dist not in ["euclidean", "cosine"]:
        raise ValueError("distance_metric must be either 'euclidean' or 'cosine'")

    super(RSOM, self).__init__()
    self.X = data
    self.num_units = num_units
    self.height = height
    self.width = width
    self.alpha_max = alpha_max
    self.alpha_min = alpha_min
    self.set_count_activations = set_count_activations
    self.set_outlier_unit_det = set_outlier_unit_det
    self.set_inunit_outlier_det = set_inunit_outlier_det
    self.outlier_unit_thresh = outlier_unit_thresh
    self.inunit_outlier_thresh = inunit_outlier_thresh
    self.log_full_data_cost = log_full_data_cost
    self.distance_metric = dist

    # May overwrite height, width and num_units.
    self._estimate_map_shape()
    self.data_dim = self.X.shape[1]

    # One weight row per unit, randomly initialised and scaled to unit norm.
    self.W = torch.nn.Parameter(torch.randn(self.num_units, self.data_dim))
    self._normalize_weights()

    # Per-unit and per-instance bookkeeping; the empty tensors are
    # placeholders that are replaced later (e.g. by assing_to_units).
    self.activations = torch.zeros(self.num_units)
    self.unit_saliency_coeffs = torch.zeros(self.num_units)
    self.unit_saliency = torch.ones(self.num_units, dtype=torch.bool)
    self.inst_saliency = torch.tensor([])
    self.ins_unit_assign = torch.tensor([])
    self.ins_unit_dist = torch.tensor([])
    self.unit_coher = torch.tensor([])
113 | self.unit_coher = torch.tensor([])
114 |
def _normalize_weights(self):
    """Rescale every row of ``self.W`` to unit L2 norm (writes ``W.data``)."""
    weights = self.W.data
    row_norms = weights.norm(dim=1, keepdim=True)
    self.W.data = weights / row_norms
117 |
def _estimate_map_shape(self):
    """Resolve ``height``, ``width`` and ``num_units`` into a consistent lattice.

    * Both dimensions given: they are kept as they are.
    * Exactly one given: it is kept and the other becomes
      ``ceil(num_units / given)``.
    * Neither given: the height is derived from the ratio of the two largest
      singular values of ``self.X`` and the width follows from it.

    In every case ``num_units`` ends up equal to ``height * width``.
    """
    derived = self.height is None or self.width is None

    if self.height is None and self.width is None:
        _, s, _ = torch.svd(self.X)
        if s.numel() > 1 and s[1] > 0:
            units_over_ratio = float(self.num_units / (s[0] / s[1]))
        else:
            # Fewer than two usable singular values: treat the ratio as
            # infinite, which yields a single-row map below.
            units_over_ratio = 0.0
        height = int(np.ceil(np.sqrt(units_over_ratio)))
        # Clamp to at least 1 so the width computation cannot divide by zero.
        self.height = max(1, min(self.num_units, height))
        self.width = int(np.ceil(self.num_units / self.height))
    elif self.height is None:
        self.height = max(1, int(np.ceil(self.num_units / self.width)))
    elif self.width is None:
        self.width = max(1, int(np.ceil(self.num_units / self.height)))

    # Keep the unit count in step with the lattice so that lattice indices
    # never exceed the number of weight rows.
    self.num_units = self.height * self.width

    if derived:
        logging.info(
            f"Estimated map size is -> height = {self.height}, width = {self.width}"
        )
130 |
def unit_cords(self, index: int) -> Tuple[int, int]:
    """Return the ``(x, y)`` lattice position of unit ``index``.

    Units are laid out row by row, so ``x`` is the remainder and ``y`` the
    quotient of ``index`` divided by ``self.width``.
    """
    x = index % self.width
    y = index // self.width
    return x, y
133 |
def _calc_distance(self, X, X2=None):
    """Dispatch to the distance routine named by ``self.distance_metric``.

    ``X2`` is forwarded only to the Euclidean routine; the cosine routine
    does not take it. Any other metric name yields ``None``.
    """
    metric = self.distance_metric
    if metric == "cosine":
        return self._cosine_distance(X)
    if metric == "euclidean":
        return self._euclidean_distance(X, X2)
    return None
139 |
def _euclidean_distance(self, X, X2=None):
    """Squared Euclidean distance between every unit and every row of ``X``.

    Uses the expansion ``-2 * W X^T + |W|^2 + |X|^2`` (no square root is
    taken). The result has one row per unit and one column per sample.

    Args:
        X: Sample matrix, one sample per row.
        X2: Optional precomputed squared row norms of ``X``; when omitted
            they are computed here as a column vector.
            NOTE(review): a supplied ``X2`` is assumed to have that same
            column layout - confirm at call sites.
    """
    sample_sq = (X**2).sum(1)[:, None] if X2 is None else X2
    unit_sq = (self.W**2).sum(1)[:, None]
    cross = torch.mm(self.W, X.t())
    return -2 * cross + unit_sq + sample_sq.t()
145 |
def _cosine_distance(self, X):
    """Cosine distance (1 - cosine similarity) from every unit to every row of ``X``.

    Rows of ``self.W`` and of ``X`` are scaled to unit length first; the
    result has one row per unit and one column per sample.
    """
    unit_dirs = self.W / self.W.norm(dim=1, keepdim=True)
    sample_dirs = X / X.norm(dim=1, keepdim=True)
    similarity = torch.mm(unit_dirs, sample_dirs.t())
    return 1 - similarity
150 |
def find_neighbors(self, unit_id: int, radius: int) -> torch.Tensor:
    """Manhattan lattice distances from ``unit_id`` to the units around it.

    Only units inside the square window of half-width ``radius`` (clipped to
    the map borders) are written. Every other entry keeps the initial value
    0, which is also the value stored for ``unit_id`` itself.

    Returns:
        A ``(1, num_units)`` tensor indexed by flat unit index
        ``x + y * width``.
    """
    center_x, center_y = self.unit_cords(unit_id)

    # Window bounds, clipped to the lattice.
    row_lo = max(int(center_y - radius), 0)
    row_hi = min(int(center_y + radius), self.height - 1)
    col_lo = max(int(center_x - radius), 0)
    col_hi = min(int(center_x + radius), self.width - 1)

    grid_dist = torch.zeros(1, self.num_units)
    for row in range(row_lo, row_hi + 1):
        row_gap = abs(row - center_y)
        for col in range(col_lo, col_hi + 1):
            grid_dist[0, col + (row * self.width)] = row_gap + abs(col - center_x)

    return grid_dist
166 |
def best_match(self, X: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
    """Find the best matching unit(s) for every sample in ``X``.

    Distances come from ``self._calc_distance`` so that the configured
    metric is honoured here exactly as it is during training and in
    ``assing_to_units``.

    Args:
        X: One sample (1-D tensor) or a matrix with one sample per row.

    Returns:
        ``(BMU, D)`` where ``D`` is the unit-by-sample distance matrix and
        ``BMU`` is a sample-by-unit 0/1 float matrix marking each sample's
        minimum-distance unit (ties mark several units).
    """
    if X.dim() == 1:
        # Promote a single sample to a one-row batch.
        X = X.unsqueeze(0)
    D = self._calc_distance(X)
    BMU = (D == D.min(0)[0]).float().t()
    return BMU, D
174 |
def assing_to_units(self, X=None):
    """Assign every sample to its minimum-distance unit.

    With no argument the training data ``self.X`` is used, the results are
    stored in ``self.ins_unit_assign`` / ``self.ins_unit_dist`` and nothing
    is returned. With an explicit ``X`` the instance is left untouched and
    ``(assignments, distances)`` is returned instead.
    """
    use_training_data = X is None
    samples = self.X if use_training_data else X

    D = self._calc_distance(samples)
    winners = D.argmin(axis=0)
    # Pick, for each sample (column), the distance to its winning unit.
    winner_dist = D[winners, torch.arange(samples.shape[0])]

    if use_training_data:
        self.ins_unit_assign = winners
        self.ins_unit_dist = winner_dist
        return None
    return winners, winner_dist
185 |
def set_params(self, num_epoch: int) -> dict:
    """Precompute the per-epoch learning rate, radius and neighbourhood map.

    The learning rate decays linearly from ``alpha_max`` towards
    ``alpha_min``; the radius shrinks linearly and is floored at 0. For each
    epoch the neighbourhood map is ``alpha * (1 - dist / (1 + radius))``
    with entries farther than ``radius`` set to 0.

    Args:
        num_epoch: Number of epochs to build schedules for.

    Returns:
        Dict with the lists ``"alphas"``, ``"H_maps"`` and ``"radiuses"``,
        each of length ``num_epoch``.
    """
    U = {"alphas": [], "H_maps": [], "radiuses": []}

    # Pairwise lattice distances; the radius passed here is large enough to
    # cover the whole map.
    dist_map = torch.zeros(self.num_units, self.num_units)
    for u in range(self.num_units):
        dist_map[u, :] = self.find_neighbors(u, self.num_units)

    # Loop invariants.
    initial_radius = np.ceil(1 + np.floor(min(self.width, self.height) - 1) / 2) - 1
    alpha_span = self.alpha_max - self.alpha_min
    # With a single epoch ``num_epoch - 1`` is 0; dividing by it would give
    # an inf/NaN radius and poison the neighbourhood map.
    radius_steps = max(num_epoch - 1, 1)

    for epoch in range(num_epoch):
        remaining = num_epoch - epoch
        alpha = alpha_span * remaining / num_epoch + self.alpha_min
        radius = initial_radius * remaining / radius_steps - 1
        radius = max(radius, 0)

        neigh_updt_map = alpha * (1 - dist_map / float((1 + radius)))
        neigh_updt_map[dist_map > radius] = 0

        U["H_maps"].append(neigh_updt_map)
        U["alphas"].append(alpha)
        U["radiuses"].append(radius)

    return U
209 |
def train_batch(
    self,
    num_epoch: Optional[int] = None,
    batch_size: Optional[int] = None,
    verbose: bool = True,
):
    """Train the map with the batch SOM update rule.

    Each epoch shuffles the data, splits it into batches and, per batch,
    replaces every affected unit's weights by the neighbourhood-weighted
    mean of the batch samples. After training, instances are assigned to
    units and the enabled outlier detectors are run.

    Args:
        num_epoch: Number of epochs; defaults to ``500 * num_units``.
        batch_size: Samples per batch; defaults to the whole data set.
        verbose: Accepted for API compatibility; this method does not read
            it, so progress is always logged.
    """
    if num_epoch is None:
        num_epoch = 500 * self.num_units

    if batch_size is None:
        batch_size = self.X.shape[0]

    logger.info("Learning...")
    U = self.set_params(num_epoch)

    # Squared row norms of the full data set, cached only when every batch
    # is the whole data set.
    X2 = None
    if batch_size == self.X.shape[0]:
        X2 = (self.X**2).sum(1).unsqueeze(1)

    for epoch in range(num_epoch):
        logger.info(f"Epoch --- {epoch}")
        update_rate = U["H_maps"][epoch]
        learn_rate = U["alphas"][epoch]

        shuffle_indices = torch.randperm(self.X.shape[0])
        win_counts = torch.zeros(self.num_units)

        batches = torch.split(shuffle_indices, batch_size)
        num_steps = len(batches)

        for step, batch_indices in enumerate(batches):
            batch_data = self.X[batch_indices, :]
            # The cached norms are in the original row order; reorder them to
            # match the shuffled batch so each distance uses its own norm.
            batch_X2 = None if X2 is None else X2[batch_indices]
            D = self._calc_distance(batch_data, batch_X2)
            BMU = (D == D.min(0)[0][None, :]).float().t()

            batch_wins = BMU.sum(dim=0)
            win_counts += batch_wins

            if self.set_count_activations:
                # Add this batch's wins only; adding the running epoch total
                # here would count earlier batches again on every step.
                self.activations += batch_wins

            A = torch.mm(BMU, update_rate)
            S = A.sum(0)
            # Always a 1-D index tensor, even when exactly one unit is
            # affected, so the slices below stay 2-D for torch.mm.
            non_zeros = S.nonzero(as_tuple=True)[0]

            # Neighbourhood-weighted mean of the batch for each affected unit.
            self.W.data[non_zeros] = torch.mm(
                A[:, non_zeros].t(), batch_data
            ) / S[non_zeros].unsqueeze(1)

            self._print_cost(X2, D, epoch, num_epoch, step, num_steps)

        if self.set_outlier_unit_det:
            self._update_unit_saliency(win_counts, update_rate, learn_rate)

        if self.set_count_activations:
            self.activations /= self.activations.sum()

    self.assing_to_units()

    if self.set_outlier_unit_det:
        self._find_outlier_units()

    if self.set_inunit_outlier_det:
        self._find_inunit_outliers()
279 |
def _print_cost(
    self,
    X2: torch.Tensor,
    D: torch.Tensor,
    epoch: int,
    num_epoch: int,
    step: int,
    num_steps: int,
):
    """Log the cost of the current training step.

    Args:
        X2: value forwarded to ``_calc_distance`` for the full data set in
            the euclidean case (may be None).
        D: unit-by-instance distance matrix of the current batch.
        epoch: current epoch index.
        num_epoch: total number of epochs.
        step: current step index within the epoch.
        num_steps: number of steps in the epoch.
    """
    batch_cost = D.min(0)[0].mean()

    if self.log_full_data_cost:
        # TODO: handle when data is too large
        if self.distance_metric == "euclidean":
            D = self._calc_distance(self.X, X2)
            cost = torch.norm(D.min(0)[0], p=1) / self.X.shape[0]
        elif self.distance_metric == "cosine":
            D = self._calc_distance(self.X)
            cost = D.min(0)[0].mean()  # Average minimum cosine distance
        else:
            # Unknown metric: report the batch cost instead of failing on an
            # unbound ``cost``.
            cost = batch_cost
    else:
        # Running average of the batch cost, stored on the instance.
        # NOTE(review): history is weighted 0.01 and the new batch 0.99, so
        # this tracks the batch cost almost exactly; confirm the weights are
        # not swapped.
        if not hasattr(self, "avg_cost"):
            self.avg_cost = batch_cost
        else:
            self.avg_cost = self.avg_cost * 0.01 + batch_cost * 0.99
        cost = self.avg_cost

    logger.info(
        f"epoch {epoch} / {num_epoch} -- step {step} / {num_steps} -- avg-cost: {cost.item():.6f} -- batch-cost: {batch_cost.item():.6f}"
    )
310 |
def _update_unit_saliency(
    self, win_counts: torch.Tensor, update_rate: torch.Tensor, learn_rate: float
):
    """Accumulate this epoch's saliency evidence for every unit.

    Two normalised terms are added to ``self.unit_saliency_coeffs``: the
    win-count-weighted column sums of ``update_rate`` (divided by
    ``learn_rate``) and the units' own win counts.
    """
    weighted = update_rate * win_counts.unsqueeze(1)
    received = weighted.sum(dim=0) / learn_rate
    received = received / received.sum()

    own = win_counts * learn_rate
    own = own / own.sum()

    self.unit_saliency_coeffs += received + own
319 |
def _find_outlier_units(self):
    """Flag outlier units and every instance assigned to one of them.

    The accumulated coefficients are normalised in place; a unit is salient
    when its share exceeds ``outlier_unit_thresh / num_units``.
    """
    self.unit_saliency_coeffs /= self.unit_saliency_coeffs.sum()
    cutoff = self.outlier_unit_thresh / self.num_units
    self.unit_saliency = self.unit_saliency_coeffs > cutoff

    # Start with every instance salient, then clear those on outlier units.
    self.inst_saliency = torch.ones(self.X.shape[0], dtype=torch.bool)
    for unit in torch.where(self.unit_saliency == False)[0]:
        members = torch.where(self.ins_unit_assign == unit)[0]
        self.inst_saliency[members] = False
330 |
def _find_inunit_outliers(self):
    """Flag instances that lie unusually far from their own unit.

    Within each unit, instances whose distance exceeds the
    ``inunit_outlier_thresh`` percentile of that unit's distances are marked
    non-salient.
    """
    # No unit-level pass has run yet: start with everything salient.
    if self.inst_saliency.numel() == 0:
        self.inst_saliency = torch.ones(self.X.shape[0], dtype=torch.bool)

    level = self.inunit_outlier_thresh / 100
    for unit in torch.unique(self.ins_unit_assign):
        members = torch.where(self.ins_unit_assign == unit)[0]
        member_dist = self.ins_unit_dist[members]
        cutoff = torch.quantile(member_dist, level)
        self.inst_saliency[members[member_dist > cutoff]] = False
342 |
def salient_inst_index(self) -> torch.Tensor:
    """Return the indices of the instances flagged as salient."""
    is_salient = self.inst_saliency == True
    (indices,) = torch.where(is_salient)
    return indices
345 |
def salient_unit_index(self) -> torch.Tensor:
    """Return the indices of the units flagged as salient."""
    is_salient = self.unit_saliency == True
    (indices,) = torch.where(is_salient)
    return indices
348 |
def salient_insts(self) -> torch.Tensor:
    """Return the rows of ``self.X`` selected by ``self.inst_saliency``."""
    keep = self.inst_saliency
    return self.X[keep]
351 |
def salient_units(self) -> torch.Tensor:
    """Return the rows of ``self.W`` selected by ``self.unit_saliency``."""
    keep = self.unit_saliency
    return self.W[keep]
354 |
def inst_to_unit_mapping(self) -> torch.Tensor:
    """Return a 2-row tensor: instance indices on top, assigned unit below."""
    instance_ids = torch.arange(self.X.shape[0])
    unit_ids = self.ins_unit_assign
    return torch.stack([instance_ids, unit_ids], dim=0)
357 |
358 |
if __name__ == "__main__":
    # Demo: train a SOM on the scikit-learn digits data set and show the
    # scatter, outlier and mapping plots from som_plot.
    import matplotlib.pyplot as plt
    import som_plot
    import torch
    from sklearn.datasets import load_digits
    from sklearn.preprocessing import StandardScaler

    # Load the digits dataset
    digits = load_digits()
    X = digits.data
    y = digits.target  # labels are loaded but not used below

    # Preprocess the data
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Convert to PyTorch tensor
    X_tensor = torch.tensor(X_scaled, dtype=torch.float32)

    # Initialize and train SOM
    som = SOM(X_tensor, num_units=100, alpha_max=0.05, alpha_min=0.01)
    som.train_batch(num_epoch=1000, batch_size=32, verbose=True)

    # Get the weights and assign instances to units
    # (train_batch already calls assing_to_units once at the end of training)
    W = som.W.detach().numpy()
    som.assing_to_units()

    # Plot scatter plot
    som_plot.som_plot_scatter(W, X_scaled, som.activations.numpy())

    # Plot outlier scatter plot
    som_plot.som_plot_outlier_scatter(
        W,
        X_scaled,
        som.unit_saliency.numpy(),
        som.inst_saliency.numpy(),
        som.activations.numpy(),
    )

    # Plot mapping
    # NOTE(review): if _euq_dist returns one distance per (unit, instance)
    # pair, the reshape to (height, width) below cannot succeed - confirm the
    # intended input for som_plot_mapping.
    distance_map = (
        som._euq_dist(torch.sum(X_tensor**2, dim=1).unsqueeze(1), X_tensor)
        .detach()
        .numpy()
    )
    distance_map = distance_map.reshape(som.height, som.width)
    som_plot.som_plot_mapping(distance_map)

    plt.show()
408 |
--------------------------------------------------------------------------------
/som_theano.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | '''
3 | Rectifying Self Organazing Maps a.k.a RSOM
4 |
5 | RSOM is a clustering and outlier detection method that is predicated with
6 | old Self Organazing Maps.
7 |
8 | It includes Batch and Stochastic learning rules. There are two different
9 | implementations. One is based on Numpy and tthe other is Theano. If you have
10 | tall and wide data matrix, we suggest to use Theano version. Otherwise
11 | Numpy version is faster. You can also use GPU with Theano but you need to
12 | set Theano configurations.
13 |
14 | For more detail about RSOM refer to http://arxiv.org/abs/1312.4384
15 |
16 | AUTHOR:
17 | Eren Golge
18 | erengolge@gmail.com
19 | www.erengolge.com
20 | '''
21 |
22 | """
23 | TO DO:
24 | -> Try dot product distance instead of Euclidean
25 | -> Normalize only the weight vectors updated in that epoch
26 | -> compare code with https://github.com/JustGlowing/minisom/blob/master/minisom.py
27 | -> print resulting objective values
28 | -> write bookkeeping for the best objective value
29 | -> learning rate is already decreasing, so it might be good to keep the radius constant
30 | -> UPDATE only winners
31 | """
32 |
33 | import warnings
34 | from random import *
35 | from math import *
36 | import sys
37 | import scipy
38 | import numpy as np
39 | from numpy import linalg
40 | from som_plot import *
41 | import theano
42 | import theano.tensor as T
43 | from theano import function, config, shared, sandbox
44 | from theano import ProfileMode
45 | from collections import Counter
46 | #from theano import ProfileMode
47 |
# Small constant added to denominators to avoid division by zero.
EPS = 2.2204e-16
49 |
50 | class SOM(object):
51 |
def __init__(self, DATA=None, num_units=10, height=None, width=None,
             alpha_max=0.05, alpha_min=0.001, set_count_activations=True,
             set_outlier_unit_det=True, set_inunit_outlier_det=True,
             outlier_unit_thresh=0.5, inunit_outlier_thresh=95):
    '''
    CONSTRUCTOR PARAMETERS:

    DATA --- data matrix with shape n x m, where n is the number of
        instances and m the number of variables. Required in practice:
        its shape is read during construction.
    num_units --- number of SOM units. It may change slightly once the 2D
        lattice shape is estimated by the eigen heuristic (used when
        height or width is not given).
    height --- height of the 2D lattice of the SOM
    width --- width of the 2D lattice of the SOM. height * width = num_units
    alpha_max --- maximum learning rate, gradually decreased towards
        alpha_min
    alpha_min --- minimum learning rate
    set_count_activations --- whether to count the activations of each unit
    set_outlier_unit_det --- whether outlier units are detected. If a unit
        is detected as an outlier, all instances assigned to it are marked
        as outliers as well
    set_inunit_outlier_det --- whether in-unit outlier instances are detected
    outlier_unit_thresh --- default value 0.5 works well for many cases
    inunit_outlier_thresh --- the upper whisker percentage
    '''

    self.X = DATA
    self.num_units = num_units
    if height is None or width is None:
        self._estimate_map_shape()
        self.num_units = self.height * self.width
    else:
        self.height = height
        self.width = width

    if self.height * self.width != self.num_units:
        self.num_units = self.height * self.width
        # Warn instead of printing and blocking on stdin: a constructor must
        # stay usable from non-interactive code.
        warnings.warn(
            "Number of units does not match the lattice size; "
            "num_units is set to height * width = %d" % self.num_units
        )

    self.data_dim = DATA.shape[1]

    # mean / std placeholders filled in by _norm_data (not called here)
    self.data_mean = 0
    self.data_std = 0

    # optimization parameters
    self.alpha_max = alpha_max
    self.alpha_min = alpha_min

    # random weights, each row scaled to unit L2 norm
    self.W = np.random.random((self.num_units, self.data_dim))
    self.W = self.W / np.linalg.norm(self.W, axis=1, keepdims=True)

    # book keeping (copy so in-place weight updates do not alter it)
    self.best_W = self.W.copy()
    self.best_W_obj = 0

    # unit statistics
    self.set_count_activations = set_count_activations
    self.activations = np.zeros((self.num_units))
    self.set_outlier_unit_det = set_outlier_unit_det
    self.set_inunit_outlier_det = set_inunit_outlier_det
    self.unit_saliency_coeffs = np.zeros((self.num_units))
    self.unit_saliency = np.ones((self.num_units), dtype=bool)
    self.inst_saliency = np.array(())
    self.outlier_unit_thresh = outlier_unit_thresh
    self.inunit_outlier_thresh = inunit_outlier_thresh
    self.ins_unit_assign = np.array(())
    self.ins_unit_dist = np.array(())
    self.unit_coher = np.array(())
124 |
def unit_x(self, index, width):
    """Return the lattice column of unit ``index`` for a map ``width`` wide."""
    return index % width

def unit_y(self, index, width):
    """Return the lattice row of unit ``index`` (a float, via ``np.floor``)."""
    return np.floor(index / width)

def unit_cords(self, index):
    """Return the ``(x, y)`` lattice coordinates of unit ``index``."""
    col = self.unit_x(index, self.width)
    row = self.unit_y(index, self.width)
    return col, row
130 |
131 | # Euclidean distance with pre-computed data square X2
def _euq_dist(self, X2, X):
    """Return the (units x instances) squared Euclidean distance matrix.

    X2 holds the pre-computed squared row norms of X as a column vector.
    """
    cross_term = -2*np.dot(self.W, X.T)
    weight_norms = (self.W**2).sum(1)[:, None]
    return cross_term + weight_norms + X2.T
134 |
135 | # Print function for Numpy based optimization functions
def _print_cost(self, X2, epoch, num_epoch):
    """Print the mean squared distance of every instance to its closest unit.

    X2 holds the pre-computed squared row norms of ``self.X``.
    """
    D = self._euq_dist(X2, self.X)
    cost = np.linalg.norm(D.min(0), ord=1) / self.X.shape[0]
    # Single-argument print emits the same text on Python 2 and 3.
    print("epoch %s of %s  cost:  %s" % (epoch, num_epoch, cost))
139 |
140 |
def set_params(self, num_epoch):

    '''
    Precompute, before learning starts, every parameter that depends only
    on the epoch. This costs some extra memory but speeds up learning.

    Outputs:
        U --- a dictionary with all necessary parameter structures

        U['alphas'] -- learning rate for each epoch
        U['H_maps'] -- neighbourhood update matrix for each epoch
        U['radiuses'] -- neighbourhood radius for each epoch

    '''

    # unit-to-unit lattice distances, one row per unit
    dist_map = np.zeros((self.num_units, self.num_units))
    for u in range(int(self.num_units)):
        dist_map[u, :] = self.find_neighbors(u, self.num_units)

    # loop invariants
    max_radius = np.ceil(1 + np.floor(min(self.width, self.height) - 1) / 2) - 1
    alpha_span = self.alpha_max - self.alpha_min
    # a one-epoch schedule would otherwise divide by zero
    decay_steps = max(num_epoch - 1, 1)

    alphas, H_maps, radiuses = [], [], []
    for epoch in range(num_epoch):
        remaining = num_epoch - epoch
        alpha = alpha_span * remaining / num_epoch + self.alpha_min
        radius = max(np.ceil(max_radius * remaining / decay_steps) - 1, 0)

        # linear fall-off with distance, zero outside the radius
        neigh_updt_map = alpha * (1 - dist_map / float(1 + radius))
        neigh_updt_map[dist_map > radius] = 0

        H_maps.append(neigh_updt_map)
        alphas.append(alpha)
        radiuses.append(radius)

    return {'alphas': alphas, 'H_maps': H_maps, 'radiuses': radiuses}
188 |
def train_stoch(self, num_epoch=None, verbose=True):

    '''
    Numpy based stochastic training: instances are presented one at a time
    and the weights are updated with respect to the winner unit.

    num_epoch --- number of epochs; defaults to 500 * num_units
    verbose --- if True, print the cost after every epoch
    '''

    if num_epoch is None:
        num_epoch = 500 * self.num_units  # Kohonen's suggestion

    U = self.set_params(num_epoch)
    X2 = (self.X**2).sum(1)[:, None]

    for epoch in range(num_epoch):
        shuffle_indices = np.random.permutation(self.X.shape[0])

        update_rate = U['H_maps'][epoch]
        learn_rate = U['alphas'][epoch]
        win_counts = np.zeros((self.num_units))
        for i in shuffle_indices:
            instance = self.X[i, :]
            D = self._euq_dist(X2[i][None, :], instance[None, :])
            BMU_indx = np.argmin(D)

            win_counts[BMU_indx] += 1
            if self.set_count_activations:
                self.activations[BMU_indx] += 1

            # move every unit towards the instance, scaled by its
            # neighbourhood weight relative to the winner
            self.W = self.W + learn_rate * update_rate[..., BMU_indx, None] * (instance - self.W)

        if verbose:
            self._print_cost(X2, epoch, num_epoch)

        if self.set_outlier_unit_det:
            self._update_unit_saliency(win_counts, update_rate, learn_rate)

    # Normalize activation counts
    if self.set_count_activations:
        total_act = self.activations.sum()
        self.activations = self.activations / total_act

    self.assing_to_units()  # final unit assignments

    if self.set_outlier_unit_det:
        self._find_outlier_units()

    if self.set_inunit_outlier_det:
        self._find_inunit_outliers()
241 |
242 |
243 |
def train_stoch_theano(self, num_epoch = None, verbose =True):

    '''
    Theano based stochastic learning: instances are presented one at a
    time and the shared weight matrix is updated by a compiled Theano
    function.

    num_epoch --- number of epochs; defaults to 500 * number of instances
    verbose --- if True, print the last cost value after every epoch
    '''

    # NOTE: this silences all warnings process-wide, not only Theano's
    warnings.simplefilter("ignore", DeprecationWarning)
    warnings.filterwarnings("ignore")

    if num_epoch == None:
        num_epoch = 500 * self.X.shape[0]

    # Symbolic variables
    X = T.dmatrix('X')
    WIN = T.dmatrix('WIN')
    H = T.dmatrix('H')

    # Shared weights, initialised from the current self.W
    W = theano.shared(self.W, name="W")
    #W = theano.shared(rng.randn(cluster_num, data.shape[1]).astype(theano.config.floatX), name="W")

    # Find winner unit: D is the (units x instances) squared distance matrix
    D = (W**2).sum(axis=1, keepdims=True) + (X**2).sum(axis=1, keepdims=True).T - 2*T.dot(W, X.T)
    bmu = (D).argmin(axis=0)
    # WIN is fed a one-hot winner matrix below, so dist is non-zero only in
    # the winner's row
    dist = T.dot(WIN.T, X) - WIN.sum(0)[:, None] * W
    err = D.min(0).norm(1)/X.shape[0]

    update = function([X,WIN, H],outputs=err,updates=[(W, W + T.addbroadcast(H,1)*dist)])
    find_bmu = function([X], bmu)

    # Update
    U = self.set_params(num_epoch)
    for epoch in range(num_epoch):
        update_rate = U['H_maps'][epoch]
        learn_rate = U['alphas'][epoch]
        win_counts = np.zeros((self.num_units))
        shuff_indx = np.random.permutation(self.X.shape[0])
        for i in shuff_indx:
            ins = self.X[i, :][None,:]
            D = find_bmu(ins)  # index of the winner unit for this instance
            S = np.zeros([ins.shape[0],self.num_units])
            #S = np.zeros([batch,cluster_num], theano.config.floatX)
            S[:,D] = 1  # one-hot winner indicator
            win_counts[D] += 1
            # neighbourhood weights of the winner as a column vector
            h = update_rate[D,:].sum(0)[:,None]
            cost = update(ins,S,h)

        if verbose:
            print "Avg. centroid distance -- ", cost,"\t EPOCH : ",epoch , " of ", num_epoch
        if self.set_count_activations:
            self.activations += win_counts

        if self.set_outlier_unit_det:
            self._update_unit_saliency(win_counts, update_rate, learn_rate)

    # get the data from shared theano variable
    self.W = W.get_value()

    # Normalize activation counts
    if self.set_count_activations:
        total_act = self.activations.sum()
        self.activations = self.activations / total_act

    self.assing_to_units() # final unit assignments

    if self.set_outlier_unit_det:
        self._find_outlier_units()

    if self.set_inunit_outlier_det:
        self._find_inunit_outliers()
314 |
315 |
def train_batch_theano(self, num_epoch = None, batch_size = None, verbose=True):
    '''
    Theano based batch learning. If batch_size is not given, all instances
    are fed at once in every epoch.

    It is preferred to use batch learning initially and then fine tune with
    the stochastic version.

    num_epoch --- number of epochs; defaults to 500 * number of instances
    batch_size --- instances per batch; defaults to the whole data set
    verbose --- if True, print the cost every fifth epoch
    '''

    if num_epoch is None:
        num_epoch = 500 * self.X.shape[0]

    if batch_size is None:
        batch_size = self.X.shape[0]

    # Symbolic variables
    X = T.dmatrix('X')
    H = T.dmatrix('update_rate')

    # Shared weights, initialised from the current self.W
    W = theano.shared(self.W, name='W')

    # Find winner unit: D is the (units x instances) squared distance matrix
    D = (W**2).sum(axis=1, keepdims=True) + (X**2).sum(axis=1, keepdims=True).T - 2*T.dot(W, X.T)
    BMU = (T.eq(D,D.min(axis=0, keepdims=True))).T
    err = D.min(0).sum().norm(1)/X.shape[0]

    # Batch rule: excited units take the neighbourhood-weighted mean of the
    # batch, all other units keep their weights.
    A = T.dot(BMU, H)
    S = A.sum(axis=0)
    new_W = T.where((S[:,None] > 0) ,T.dot(A.T, X), W) / T.where((S > 0), S, 1)[:,None]
    update_neigh_no_verbose = function([X, H],outputs=BMU, updates=[(W, new_W)])
    update_neigh = function([X, H],outputs=[err, BMU], updates=[(W, new_W)])

    U = self.set_params(num_epoch)
    for epoch in range(num_epoch):
        print('Epoch ---  %s' % epoch)
        update_rate = U['H_maps'][epoch]
        learn_rate = U['alphas'][epoch]
        win_counts = np.zeros((self.num_units))
        for i in range(0, self.X.shape[0], batch_size):
            batch_data = self.X[i:i+batch_size, :]
            if verbose and epoch % 5 == 0:
                cost, winners = update_neigh(batch_data, update_rate)
            else:
                winners = update_neigh_no_verbose(batch_data, update_rate)
            # accumulate over batches (the original `=+` overwrote the count)
            win_counts += winners.sum(axis=0)

        if verbose and epoch % 5 == 0:
            print("Avg. centroid distance --  %s \t EPOCH :  %s  of  %s" % (cost, epoch, num_epoch))

        if self.set_count_activations:
            self.activations += win_counts

        if self.set_outlier_unit_det:
            self._update_unit_saliency(win_counts, update_rate, learn_rate)

    # get the data from shared theano variable
    self.W = W.get_value()

    # Normalize activation counts
    if self.set_count_activations:
        total_act = self.activations.sum()
        self.activations = self.activations / total_act

    self.assing_to_units() # final unit assignments

    if self.set_outlier_unit_det:
        self._find_outlier_units()

    if self.set_inunit_outlier_det:
        self._find_inunit_outliers()
409 |
410 |
def train_batch(self, num_epoch = None, batch_size = None, verbose=True):

    '''
    Numpy version of batch learning.

    num_epoch --- number of epochs; defaults to 500 * num_units
    batch_size --- instances per batch; defaults to the whole data set
    verbose --- if True, print the cost after every epoch
    '''

    if num_epoch is None:
        num_epoch = 500 * self.num_units # Kohonen's suggestion

    num_inst = self.X.shape[0]
    if batch_size is None:
        batch_size = num_inst

    # array_split needs a positive integer section count, even when
    # batch_size exceeds the number of instances
    num_batches = max(1, int(num_inst // batch_size))

    print('Learning ... ')
    U = self.set_params(num_epoch)
    X2 = (self.X**2).sum(1)[:, None]
    for epoch in range(num_epoch):
        print('Epoch ---  %s' % epoch)
        update_rate = U['H_maps'][epoch]
        learn_rate = U['alphas'][epoch]
        # randomize batch order
        shuffle_indices = np.random.permutation(num_inst)
        win_counts = np.zeros((self.num_units))
        for batch_indices in np.array_split(shuffle_indices, num_batches):
            batch_data = self.X[batch_indices,:]
            D = self._euq_dist(X2[batch_indices,:], batch_data)
            BMU = (D==D.min(0)[None,:]).astype("float32").T

            batch_wins = BMU.sum(axis=0)
            win_counts += batch_wins

            if self.set_count_activations:
                # add this batch only: win_counts is cumulative over the
                # epoch and would be counted repeatedly otherwise
                self.activations += batch_wins

            # batch learning: excited units take the neighbourhood-weighted
            # mean of the batch instances
            A = np.dot(BMU, update_rate)
            S = A.sum(0)
            non_zeros = S.nonzero()[0]
            self.W[non_zeros, ...] = np.dot(A[:,non_zeros].T, batch_data) / S[non_zeros][..., None]

        if self.set_outlier_unit_det:
            self._update_unit_saliency(win_counts, update_rate, learn_rate)

        if verbose:
            self._print_cost(X2, epoch, num_epoch)

    # Normalize activation counts
    if self.set_count_activations:
        total_act = self.activations.sum()
        self.activations = self.activations / total_act

    self.assing_to_units() # final unit assignments

    if self.set_outlier_unit_det:
        self._find_outlier_units()

    if self.set_inunit_outlier_det:
        self._find_inunit_outliers()
473 |
474 |
# Neighbourhood helpers. The search window is a square of half-width
# `radius` around the unit; the distance stored for units inside it is the
# Manhattan (city-block) distance on the lattice.
def vis_neigh(self, neigh_map, indx):
    """Return row ``indx`` of ``neigh_map`` reshaped to (height, width)."""
    return neigh_map[indx].reshape((self.height, self.width))

def find_neighbors(self, unit_id, radius):
    """Return the lattice distances from ``unit_id`` to every unit.

    The result has shape (1, num_units). Units outside the square window of
    half-width ``radius`` keep the value 0, the same value as the unit
    itself, so callers that need a full distance map must pass a radius
    covering the whole lattice.
    """
    unit_x, unit_y = self.unit_cords(unit_id)

    # window bounds, clipped to the lattice
    min_y = max(int(unit_y - radius), 0)
    max_y = min(int(unit_y + radius), self.height-1)
    min_x = max(int(unit_x - radius), 0)
    max_x = min(int(unit_x + radius), self.width-1)

    # lattice coordinates of every unit (row-major layout)
    idx = np.arange(self.num_units)
    xs = idx % self.width
    ys = idx // self.width

    in_window = (xs >= min_x) & (xs <= max_x) & (ys >= min_y) & (ys <= max_y)
    dist = np.abs(ys - unit_y) + np.abs(xs - unit_x)
    return np.where(in_window, dist, 0.0)[None, :]
493 |
494 | # find BMUs and between-distances for given set of instances
def best_match(self, X):
    """Return the BMU indicator matrix and the distances for ``X``.

    X --- one instance (1-D) or a matrix with one instance per row

    Returns (BMU, D): D is the (units x instances) squared Euclidean
    distance matrix and BMU the (instances x units) float32 matrix holding
    1 where a unit attains the minimum distance for that instance.
    """
    if len(X.shape) == 1:
        # a single instance becomes a one-row matrix of any dimensionality
        X = X.reshape((1, -1))
    # squared norms of the instances being matched (not the training data)
    X2 = (X**2).sum(1)[:, None]
    D = -2*np.dot(self.W, X.T) + (self.W**2).sum(1)[:, None] + X2.T
    BMU = (D==D.min(0)[None,:]).astype("float32").T
    return BMU, D
503 |
504 | # structure the unit weight to be shown at U map
def som_map(self):
    """Return the (height x width) U-map of the trained weights.

    Each cell holds the summed distance between a unit's weight vector and
    those of its lattice neighbours (3x3 window), scaled by the map maximum.
    """
    print('Som mapping is being computed...')
    lattice = np.reshape(self.W,(self.height, self.width, self.data_dim))
    n_rows, n_cols = lattice.shape[0], lattice.shape[1]
    um = np.zeros((n_rows, n_cols))
    for row in range(n_rows):
        for col in range(n_cols):
            centre = lattice[row, col]
            for ii in range(row - 1, row + 2):
                for jj in range(col - 1, col + 2):
                    # skip neighbours that fall off the lattice
                    if 0 <= ii < n_rows and 0 <= jj < n_cols:
                        um[row, col] += np.linalg.norm(lattice[ii, jj, :] - centre)
    um = um/um.max()
    print("Mapping finished...!")
    return um
519 |
520 |
521 | # set the ratio of width and height of the map by the
522 | # ratio between largest 2 eigenvalues, computed from data
def _estimate_map_shape(self):
    """Set ``self.height`` and ``self.width`` from the data's singular values.

    The height/width ratio follows the ratio between the two largest
    singular values of ``self.X``. Data with a single or a vanishing second
    singular value yields a one-row map.
    """
    # only the singular values are needed
    s = np.linalg.svd(self.X, full_matrices=False, compute_uv=False)
    s_sorted = np.sort(s)[::-1]
    if s_sorted.size < 2 or s_sorted[1] <= 0:
        # no second direction of variation: fall back to a 1 x num_units map
        self.height = 1
    else:
        ratio = s_sorted[0] / s_sorted[1]
        self.height = int(min(self.num_units, np.ceil(np.sqrt(self.num_units / ratio))))
        # guard the division below against a zero height
        self.height = max(self.height, 1)
    self.width = int(np.ceil(self.num_units / self.height))
    print('Estimated map size is -> height =  %s  width =  %s' % (self.height, self.width))
533 |
534 | # assign instances to matching BMUs
def assing_to_units(self, X=None):
    """Assign instances to their best matching units.

    X --- optional data matrix. When omitted, the training data is used and
        the result is stored in ``self.ins_unit_assign`` and
        ``self.ins_unit_dist`` (nothing is returned). Otherwise the pair
        (assignments, distances) for X is returned and the instance is left
        untouched.
    """
    # `X == None` on an array compares element-wise; test identity instead
    data = self.X if X is None else X

    X2 = (data**2).sum(1)[:, None]
    D = -2*np.dot(self.W, data.T) + (self.W**2).sum(1)[:, None] + X2.T
    unit_assign = D.argmin(axis=0)
    unit_dist = D[unit_assign, np.arange(data.shape[0])]

    if X is None:
        self.ins_unit_assign = unit_assign
        self.ins_unit_dist = unit_dist
        return None
    return unit_assign, unit_dist
548 |
549 |
def find_units_coherence(self):

    '''
    Compute the coherence of each unit: the average distance between the
    unit and the instances assigned to it. Units without instances keep 0.
    '''

    coherence = np.zeros((self.num_units))
    for unit in np.unique(self.ins_unit_assign):
        members = np.where(self.ins_unit_assign == unit)
        total_dist = np.sum(self.ins_unit_dist[members])
        coherence[unit] = total_dist / members[0].size
    self.unit_coher = coherence
561 |
562 | # return BMU, BMU distance, saliency by already trained params
def process_new_data(self, X):
    """Map new data onto the trained SOM.

    X --- data matrix with one instance per row

    Returns (BMU, dist, ins_saliency): the assigned unit and its distance
    for every instance, and a boolean array that is False for instances
    assigned to an outlier unit or lying beyond the
    ``inunit_outlier_thresh`` percentile of their unit's distances.
    """
    BMU, dist = self.assing_to_units(X)

    # instances assigned to outlier units
    ins_saliency = np.ones((X.shape[0]), dtype=bool)
    outlier_units = np.where(self.unit_saliency == False)[0]
    for i in outlier_units:
        ins_saliency[np.where(BMU == i)] = False

    # in-unit outliers: np.percentile uses the same linear interpolation as
    # scipy.stats.scoreatpercentile and does not need the scipy.stats
    # submodule to be imported
    for i in np.unique(BMU):
        indices = np.where(BMU == i)[0]
        unit_thresh = np.percentile(dist[indices], self.inunit_outlier_thresh)
        outlier_insts = indices[dist[indices] > unit_thresh]
        ins_saliency[outlier_insts] = False

    return BMU, dist, ins_saliency
580 |
581 |
582 |
def _update_unit_saliency(self, win_counts, update_rate, learn_rate):

    '''
    Called after each learning epoch. Adds two normalised terms to
    self.unit_saliency_coeffs: the column sums of update_rate scaled by the
    win counts (divided by learn_rate), and the win counts themselves.
    These coefficients later decide which units are outliers.
    '''

    scaled = update_rate * win_counts
    excitations = scaled.sum(axis=0) / learn_rate
    excitations = excitations / excitations.sum()

    own_wins = win_counts * learn_rate
    own_wins = own_wins / own_wins.sum()

    self.unit_saliency_coeffs += excitations + own_wins
596 |
def _find_outlier_units(self):

    '''
    Detect outlier units from the accumulated saliency coefficients and
    mark every instance assigned to such a unit as an outlier.
    '''

    # normalise in place; a unit is salient when its share exceeds
    # outlier_unit_thresh / num_units
    self.unit_saliency_coeffs /= self.unit_saliency_coeffs.sum()
    cutoff = self.outlier_unit_thresh/self.num_units
    self.unit_saliency = self.unit_saliency_coeffs > cutoff

    # start with every instance salient, then clear those on outlier units
    self.inst_saliency = np.ones((self.X.shape[0]), dtype=bool)
    for unit in np.where(self.unit_saliency == False)[0]:
        members = np.where(self.ins_unit_assign == unit)
        self.inst_saliency[members] = False
613 |
def _find_inunit_outliers(self):

    '''
    Find the poor instances inside each unit. An upper whisker is placed at
    the inunit_outlier_thresh percentile of the distances between the unit
    and its instances; instances beyond it are marked as outliers.
    '''

    # no unit-level pass has run yet: start with everything salient
    if self.inst_saliency.size == 0:
        self.inst_saliency = np.ones((self.X.shape[0]), dtype=bool)

    for i in np.unique(self.ins_unit_assign):
        indices = np.where(self.ins_unit_assign == i)[0]
        # np.percentile uses the same linear interpolation as
        # scipy.stats.scoreatpercentile and does not need the scipy.stats
        # submodule to be imported
        unit_thresh = np.percentile(self.ins_unit_dist[indices], self.inunit_outlier_thresh)
        outlier_insts = indices[self.ins_unit_dist[indices] > unit_thresh]
        self.inst_saliency[outlier_insts] = False
634 |
635 | # Returns indices of salient instances
def salient_inst_index(self):
    """Return the indices of the instances flagged as salient."""
    is_salient = self.inst_saliency == True
    (indices,) = np.where(is_salient)
    return indices
638 |
def salient_unit_index(self):
    """Return the indices of the units flagged as salient."""
    is_salient = self.unit_saliency == True
    (indices,) = np.where(is_salient)
    return indices
641 |
def salient_insts(self):
    """Return the rows of ``self.X`` whose instances are flagged as salient."""
    row_selector = np.where(self.inst_saliency == True)
    return self.X[row_selector]
644 |
def salient_units(self):
    """Return the rows of ``self.W`` whose units are flagged as salient."""
    row_selector = np.where(self.unit_saliency == True)
    return self.W[row_selector]
647 |
648 | ## Returns instance to unit mapping. First row is instances.
def inst_to_unit_mapping(self):
    """Return a 2-row array: instance indices on top, assigned unit below."""
    instance_row = np.arange(self.X.shape[0])[None, :]
    unit_row = self.ins_unit_assign[None, :]
    return np.concatenate((instance_row, unit_row), axis=0)
651 |
def salient_inst_to_unit_mapping(self):
    """Return the instance-to-unit mapping restricted to salient instances.

    The result has the layout of ``inst_to_unit_mapping`` (first row
    instance indices, second row unit indices) with the columns of
    non-salient instances removed. When no saliency has been computed yet
    (empty ``inst_saliency``) the full mapping is returned.
    """
    mapping = self.inst_to_unit_mapping()
    if self.inst_saliency.size == 0:
        return mapping
    return mapping[:, np.where(self.inst_saliency == True)[0]]
654 |
655 |
def _norm_data(self, X = None):

    '''
    Standardise a data matrix column-wise (zero mean, unit sample std).

    X --- optional data matrix. When omitted, self.X is normalised in place
        and its mean and std are stored in self.data_mean / self.data_std
        (nothing is returned). Otherwise (X_normalised, mean, std) is
        returned and the instance is left untouched.
    '''

    # `X == None` on an array compares element-wise; test identity instead
    if X is None:
        self.data_mean = self.X.mean(axis=0)
        self.data_std = self.X.std(axis=0, ddof=1)
        self.X = (self.X - self.data_mean) / (self.data_std + EPS)
        return None

    data_mean = X.mean(axis=0)
    data_std = X.std(axis=0, ddof=1)
    # EPS keeps constant columns from dividing by zero, as in the branch above
    X = (X - data_mean) / (data_std + EPS)
    return X, data_mean, data_std
672 |
673 |
# Demo: train the Theano batch SOM on the scikit-learn digits data set and
# print which units were found salient.
if __name__ == "__main__":
    from sklearn import datasets

    data = datasets.load_digits().data

    som = SOM(DATA = data, alpha_max=0.05, num_units=100, height = 10, width = 10)
    # Other trainers on this class: train_batch, train_stoch and
    # train_stoch_theano.
    som.train_batch_theano(num_epoch=100)

    # Parenthesised single-argument prints behave the same on Python 2 and 3.
    print(som.unit_saliency)

    # Plotting helpers come from som_plot (star-imported at the top of the file):
    #   som_plot_scatter(som.W, som.X, som.activations)
    #   som_plot_outlier_scatter(som.W, som.X, som.unit_saliency, som.inst_saliency, som.activations)
    #   som_plot_mapping(som.som_map())
    print("Demo finished!")
701 |
--------------------------------------------------------------------------------