├── __init__.py
├── som.pyc
├── som_plot.pyc
├── visuals
    ├── som_latice.png
    └── 2d_projection.png
├── setup.py
├── README.md
├── tests.py
├── grid_search.py
├── .gitignore
├── sample_run.py
├── rsom.py
└── som_theano.py


/__init__.py:
--------------------------------------------------------------------------------
1 | from .som import SOM
2 | 


--------------------------------------------------------------------------------
/som.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/erogol/RSOM/HEAD/som.pyc


--------------------------------------------------------------------------------
/som_plot.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/erogol/RSOM/HEAD/som_plot.pyc


--------------------------------------------------------------------------------
/visuals/som_latice.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/erogol/RSOM/HEAD/visuals/som_latice.png


--------------------------------------------------------------------------------
/visuals/2d_projection.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/erogol/RSOM/HEAD/visuals/2d_projection.png


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import find_packages, setup
 2 | 
# Read the README once so it can be passed to setup() as the long description.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

# Distribution metadata and dependencies for the RSOM package.
setup(
    name="RSOM",
    version="0.1.0",
    author="Eren Gölge",
    author_email="",
    description="A Rectifying Self-Organizing Map (RSOM) implementation",
    long_description=long_description,
    # The README is Markdown, so declare the matching content type.
    long_description_content_type="text/markdown",
    url="https://github.com/erogol/RSOM",
    # Discover packages automatically, leaving out anything named tests*/examples*.
    packages=find_packages(exclude=["tests*", "examples*"]),
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: MIT License",
    ],
    python_requires=">=3.6",
    # Runtime dependencies; no version constraints are declared.
    install_requires=[
        "numpy",
        "matplotlib",
        "torch",
        "scikit-learn",
    ],
)
29 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Rectifying Self Organizing Map (RSOM)
 2 | ===============================
 3 | 
 4 | 📎 **Paper 1:** Gölge, E., & Duygulu, P. 2013. Rectifying Self Organizing Maps for Automatic Concept Learning from Web Images
 5 | 
 6 | 📎 **Paper 2:** Gölge, E., & Duygulu, P. ConceptMap: Mining noisy web data for concept learning, The European Conference on Computer Vision (ECCV) 2014.
 7 | 
 8 | RSOM is an extension of the well-known Self Organizing Map (SOM). It mimics SOM clustering and additionally detects outliers in the given dataset at the cluster level or the instance level.
 9 | It is mainly used for image tasks but works just as well with any other type of data.
10 | 
11 | ## Installation
12 | 
13 | ```
14 | git clone https://github.com/erogol/RSOM.git
15 | cd RSOM
16 | python setup.py install
17 | ```
18 | 
19 | or
20 | 
21 | ```
22 | pip install git+https://github.com/erogol/RSOM.git
23 | ```
24 | 
25 | ## Usage
26 | 
27 | Check ```sample_run.py``` for more.
28 | 
29 | ```python
30 | from rsom import RSOM
31 | 
32 | # Load digits dataset
33 | data = load_digits().data
34 | data = torch.from_numpy(data).float()
35 | print(data.shape)
36 | 
37 | # Initialize SOM
38 | som = RSOM(data, alpha_max=0.05, num_units=49)
39 | 
40 | # Train SOM
41 | som.train_batch(num_epoch=1000, verbose=True)
42 | 
43 | # Get salient instances and units
44 | salient_insts = som.salient_insts()
45 | salient_units = som.salient_units()
46 | ```
47 | 
48 | 
49 | 
50 | ## Citation
51 | 
52 | ```
53 | @misc{golge2013rectifyingselforganizingmaps,
54 |       title={Rectifying Self Organizing Maps for Automatic Concept Learning from Web Images},
55 |       author={Eren Golge and Pinar Duygulu},
56 |       year={2013},
57 |       eprint={1312.4384},
58 |       archivePrefix={arXiv},
59 |       primaryClass={cs.CV},
60 |       url={https://arxiv.org/abs/1312.4384},
61 | }
62 | ```
63 | 
64 | ## Example Visuals
65 | 
66 | Visuals are generated using ```sample_run.py``` and the digits dataset.
67 | 
68 | <img src="https://github.com/erogol/RSOM/blob/master/visuals/2d_projection.png" width="600">
69 | 
70 | <img src="https://github.com/erogol/RSOM/blob/master/visuals/som_latice.png" width="600">
71 | 


--------------------------------------------------------------------------------
/tests.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | import torch
 4 | 
 5 | from rsom import RSOM
 6 | 
 7 | 
# Unit tests for RSOM on a small random dataset (100 samples, 10 features)
# mapped onto an explicitly sized 5x5 lattice (25 units).
class TestSOM(unittest.TestCase):
    def setUp(self):
        # Fresh random data and a fresh map for every test; height/width are
        # given explicitly so the map shape is not estimated from the data.
        self.data = torch.randn(100, 10)
        self.som = RSOM(self.data, num_units=25, height=5, width=5)

    def test_init(self):
        # Constructor must keep the requested lattice and allocate one weight
        # row per unit with one column per input feature.
        self.assertEqual(self.som.num_units, 25)
        self.assertEqual(self.som.height, 5)
        self.assertEqual(self.som.width, 5)
        self.assertEqual(self.som.W.shape, (25, 10))

    def test_normalize_weights(self):
        # After normalisation every weight row should have L2 norm 1.
        self.som._normalize_weights()
        norms = torch.norm(self.som.W.data, dim=1)
        self.assertTrue(torch.allclose(norms, torch.ones_like(norms), atol=1e-6))

    def test_unit_cords(self):
        # Flat index -> (x, y) on a width-5 lattice: 7 = 1 * 5 + 2, 24 = 4 * 5 + 4.
        self.assertEqual(self.som.unit_cords(7), (2, 1))
        self.assertEqual(self.som.unit_cords(24), (4, 4))

    def test_euq_dist(self):
        # NOTE(review): the part of RSOM visible alongside this test defines
        # `_euclidean_distance(X, X2=None)`, not `_euq_dist(X2, X)` - confirm
        # that `_euq_dist` still exists with this argument order.
        X = self.data[:5]
        X2 = (X**2).sum(1).unsqueeze(1)
        D = self.som._euq_dist(X2, X)
        # Expected layout: one row per unit, one column per sample.
        self.assertEqual(D.shape, (25, 5))

    def test_find_neighbors(self):
        # Unit 12 is the centre of the 5x5 lattice; its distance to itself is 0.
        neighbors = self.som.find_neighbors(12, 1)
        self.assertEqual(neighbors.shape, (1, 25))
        self.assertEqual(neighbors[0, 12].item(), 0)

    def test_best_match(self):
        # BMU is indexed (sample, unit) while D is indexed (unit, sample).
        X = self.data[:5]
        BMU, D = self.som.best_match(X)
        self.assertEqual(BMU.shape, (5, 25))
        self.assertEqual(D.shape, (25, 5))

    def test_assing_to_units(self):
        # Called without arguments the method stores its results on the map:
        # one assigned unit and one distance per training sample.
        self.som.assing_to_units()
        self.assertEqual(self.som.ins_unit_assign.shape, (100,))
        self.assertEqual(self.som.ins_unit_dist.shape, (100,))

    def test_set_params(self):
        # The schedule dictionary must hold one entry per epoch for each key.
        U = self.som.set_params(10)
        self.assertEqual(len(U["alphas"]), 10)
        self.assertEqual(len(U["H_maps"]), 10)
        self.assertEqual(len(U["radiuses"]), 10)

    def test_train_batch(self):
        # Smoke test: a short training run must complete.
        # NOTE(review): the RSOM constructor already sets both attributes to
        # empty tensors, so these two checks pass even if training never
        # assigns anything - consider asserting on shapes instead.
        self.som.train_batch(num_epoch=5, batch_size=20, verbose=False)
        self.assertIsNotNone(self.som.ins_unit_assign)
        self.assertIsNotNone(self.som.ins_unit_dist)

    def test_update_unit_saliency(self):
        # NOTE(review): `_update_unit_saliency` is not part of the RSOM code
        # visible next to this test - verify its signature against the class.
        win_counts = torch.ones(25)
        update_rate = torch.ones(25, 25)
        self.som._update_unit_saliency(win_counts, update_rate, 0.1)
        self.assertGreater(self.som.unit_saliency_coeffs.sum().item(), 0)
66 | 
67 | 
68 | if __name__ == "__main__":
69 |     unittest.main()
70 | 


--------------------------------------------------------------------------------
/grid_search.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import torch
  3 | from matplotlib import pyplot as plt
  4 | from sklearn.datasets import load_digits
  5 | from sklearn.model_selection import train_test_split
  6 | from tqdm import tqdm
  7 | 
  8 | from som import SOM
  9 | 
 10 | 
 11 | def quantization_error(som, data):
 12 |     _, distances = som.best_match(data)
 13 |     return torch.mean(torch.min(distances, dim=0)[0])
 14 | 
 15 | 
 16 | def grid_search_som(data, unit_range, epochs=1000, alpha_max=0.05, trials=3):
 17 |     results = []
 18 | 
 19 |     for num_units in tqdm(unit_range, desc="Grid Search"):
 20 |         trial_errors = []
 21 |         for _ in range(trials):
 22 |             som = SOM(data, num_units=num_units, alpha_max=alpha_max)
 23 |             som.train_batch(num_epoch=epochs, verbose=False)
 24 |             error = quantization_error(som, data)
 25 |             trial_errors.append(error.item())
 26 | 
 27 |         avg_error = np.mean(trial_errors)
 28 |         std_error = np.std(trial_errors)
 29 |         results.append((num_units, avg_error, std_error))
 30 | 
 31 |         print(
 32 |             f"Units: {num_units}, Avg Error: {avg_error:.4f}, Std Error: {std_error:.4f}"
 33 |         )
 34 | 
 35 |     return results
 36 | 
 37 | 
 38 | def find_elbow(x, y):
 39 |     # Normalize the data
 40 |     x = np.array(x)
 41 |     y = np.array(y)
 42 |     x_norm = (x - min(x)) / (max(x) - min(x))
 43 |     y_norm = (y - min(y)) / (max(y) - min(y))
 44 | 
 45 |     # Calculate the distances from each point to the line connecting the first and last points
 46 |     coords = np.vstack([x_norm, y_norm]).T
 47 |     first = coords[0]
 48 |     line_vec = coords[-1] - coords[0]
 49 |     line_vec_norm = line_vec / np.sqrt(np.sum(line_vec**2))
 50 |     vec_from_first = coords - first
 51 |     scalar_proj = np.dot(vec_from_first, line_vec_norm)
 52 |     proj = np.outer(scalar_proj, line_vec_norm)
 53 |     distances = np.sqrt(np.sum((vec_from_first - proj) ** 2, axis=1))
 54 | 
 55 |     # Find the elbow point (maximum distance)
 56 |     elbow_index = np.argmax(distances)
 57 |     return x[elbow_index], y[elbow_index]
 58 | 
 59 | 
 60 | if __name__ == "__main__":
 61 |     # Load Digits dataset
 62 |     digits = load_digits()
 63 |     data = torch.from_numpy(digits.data).float()
 64 | 
 65 |     # Normalize the data
 66 |     data = (data - data.min()) / (data.max() - data.min())
 67 | 
 68 |     # Split the data into train and test sets
 69 |     X_train, X_test = train_test_split(data, test_size=0.2, random_state=42)
 70 | 
 71 |     # Define the range of units to search
 72 |     unit_range = [9, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 196]
 73 | 
 74 |     # Perform grid search
 75 |     results = grid_search_som(
 76 |         X_train, unit_range, epochs=1000, alpha_max=0.05, trials=3
 77 |     )
 78 | 
 79 |     # Extract units and errors
 80 |     units = [r[0] for r in results]
 81 |     errors = [r[1] for r in results]
 82 |     error_stds = [r[2] for r in results]
 83 | 
 84 |     # Find the elbow point
 85 |     elbow_units, elbow_error = find_elbow(units, errors)
 86 | 
 87 |     print(f"\nElbow point: {elbow_units:.0f} units, Error: {elbow_error:.4f}")
 88 | 
 89 |     # Plot the results
 90 |     plt.figure(figsize=(10, 6))
 91 |     plt.errorbar(units, errors, yerr=error_stds, fmt="o-", capsize=5)
 92 |     plt.plot(elbow_units, elbow_error, "ro", markersize=10, label="Elbow point")
 93 |     plt.xlabel("Number of Units")
 94 |     plt.ylabel("Quantization Error")
 95 |     plt.title("SOM Grid Search Results")
 96 |     plt.xscale("log")
 97 |     plt.grid(True)
 98 |     plt.legend()
 99 |     plt.show()
100 | 
101 |     # Train the SOM with the elbow point number of units
102 |     best_som = SOM(data, num_units=int(elbow_units), alpha_max=0.05)
103 |     best_som.train_batch(num_epoch=1000, verbose=True)
104 | 
105 |     # Evaluate on test set
106 |     test_error = quantization_error(best_som, X_test)
107 |     print(f"\nTest set quantization error: {test_error:.4f}")
108 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # poetry
 98 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 | 
104 | # pdm
105 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | #   in version control.
109 | #   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110 | .pdm.toml
111 | .pdm-python
112 | .pdm-build/
113 | 
114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115 | __pypackages__/
116 | 
117 | # Celery stuff
118 | celerybeat-schedule
119 | celerybeat.pid
120 | 
121 | # SageMath parsed files
122 | *.sage.py
123 | 
124 | # Environments
125 | .env
126 | .venv
127 | env/
128 | venv/
129 | ENV/
130 | env.bak/
131 | venv.bak/
132 | 
133 | # Spyder project settings
134 | .spyderproject
135 | .spyproject
136 | 
137 | # Rope project settings
138 | .ropeproject
139 | 
140 | # mkdocs documentation
141 | /site
142 | 
143 | # mypy
144 | .mypy_cache/
145 | .dmypy.json
146 | dmypy.json
147 | 
148 | # Pyre type checker
149 | .pyre/
150 | 
151 | # pytype static type analyzer
152 | .pytype/
153 | 
154 | # Cython debug symbols
155 | cython_debug/
156 | 
157 | # PyCharm
158 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
161 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
162 | #.idea/
163 | 
164 | .png


--------------------------------------------------------------------------------
/sample_run.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | 
  3 | import matplotlib.pyplot as plt
  4 | import numpy as np
  5 | import torch
  6 | from PIL import Image
  7 | from sklearn.datasets import load_digits
  8 | from sklearn.decomposition import PCA
  9 | 
 10 | from rsom import RSOM
 11 | 
 12 | 
 13 | def get_node_coordinates(som, pca):
 14 |     coords = []
 15 |     for i in range(som.height):
 16 |         for j in range(som.width):
 17 |             node_index = i * som.width + j
 18 |             node_weights = som.W[node_index].detach().numpy()
 19 |             coord = pca.transform([node_weights])[0]
 20 |             coords.append(coord)
 21 |     return np.array(coords)
 22 | 
 23 | 
 24 | # Load Iris dataset
 25 | data = load_digits().data
 26 | data = torch.from_numpy(data).float()
 27 | print(data.shape)
 28 | 
 29 | # standardize the data
 30 | data = (data - data.mean(axis=0)) / (data.std(axis=0) - 1e-8)
 31 | 
 32 | # Initialize SOM
 33 | som = RSOM(
 34 |     data,
 35 |     alpha_max=0.001,
 36 |     alpha_min=0.0005,
 37 |     num_units=49,
 38 | )
 39 | 
 40 | # Train batch SOM
 41 | som.train_batch(num_epoch=10000, verbose=True, batch_size=128)
 42 | 
 43 | # Get salient instances and units
 44 | salient_insts = som.salient_insts()
 45 | salient_units = som.salient_units()
 46 | 
 47 | # Perform PCA to reduce data to 2D for visualization
 48 | pca = PCA(n_components=2)
 49 | data_2d = pca.fit_transform(som.X.numpy())
 50 | units_2d = pca.transform(som.W.detach().numpy())
 51 | 
 52 | # Get node coordinates
 53 | node_coords = get_node_coordinates(som, pca)
 54 | 
 55 | # Create a plot
 56 | plt.figure(figsize=(12, 8))
 57 | 
 58 | # Plot data points
 59 | salient_mask = som.inst_saliency.numpy()
 60 | plt.scatter(
 61 |     data_2d[salient_mask, 0],
 62 |     data_2d[salient_mask, 1],
 63 |     c=som.ins_unit_assign[salient_mask],
 64 |     cmap="viridis",
 65 |     alpha=0.6,
 66 |     label="Salient Samples",
 67 | )
 68 | plt.scatter(
 69 |     data_2d[~salient_mask, 0],
 70 |     data_2d[~salient_mask, 1],
 71 |     c="red",
 72 |     marker="x",
 73 |     alpha=0.6,
 74 |     label="Outlier Samples",
 75 | )
 76 | 
 77 | # Plot SOM units
 78 | salient_units_mask = som.unit_saliency.numpy()
 79 | plt.scatter(
 80 |     node_coords[salient_units_mask, 0],
 81 |     node_coords[salient_units_mask, 1],
 82 |     c="black",
 83 |     marker="s",
 84 |     s=50,
 85 |     label="Salient Units",
 86 | )
 87 | plt.scatter(
 88 |     node_coords[~salient_units_mask, 0],
 89 |     node_coords[~salient_units_mask, 1],
 90 |     c="red",
 91 |     marker="s",
 92 |     s=50,
 93 |     label="Outlier Units",
 94 | )
 95 | 
 96 | # Draw lattice lines
 97 | for i in range(som.height):
 98 |     for j in range(som.width):
 99 |         node_index = i * som.width + j
100 |         if j < som.width - 1:  # Horizontal line
101 |             next_node_index = node_index + 1
102 |             plt.plot(
103 |                 [node_coords[node_index, 0], node_coords[next_node_index, 0]],
104 |                 [node_coords[node_index, 1], node_coords[next_node_index, 1]],
105 |                 "gray",
106 |                 alpha=0.5,
107 |             )
108 |         if i < som.height - 1:  # Vertical line
109 |             next_node_index = node_index + som.width
110 |             plt.plot(
111 |                 [node_coords[node_index, 0], node_coords[next_node_index, 0]],
112 |                 [node_coords[node_index, 1], node_coords[next_node_index, 1]],
113 |                 "gray",
114 |                 alpha=0.5,
115 |             )
116 | 
117 | # Add labels and title
118 | plt.xlabel("First Principal Component")
119 | plt.ylabel("Second Principal Component")
120 | plt.title("SOM Units and Data Samples with Outliers and Lattice")
121 | plt.legend()
122 | 
123 | # Show the plot
124 | plt.show()
125 | 
126 | # Optional: Print some statistics
127 | print(f"Number of salient samples: {salient_mask.sum()}")
128 | print(f"Number of outlier samples: {(~salient_mask).sum()}")
129 | print(f"Number of salient units: {salient_units_mask.sum()}")
130 | print(f"Number of outlier units: {(~salient_units_mask).sum()}")
131 | 
132 | # Create a new figure for the perfect 2D lattice plot
133 | plt.figure(figsize=(12, 12))
134 | 
135 | # Create a perfect 2D grid for SOM nodes
136 | grid_x, grid_y = np.meshgrid(np.arange(som.width), np.arange(som.height))
137 | grid_x = grid_x.flatten()
138 | grid_y = grid_y.flatten()
139 | 
140 | # Plot the perfect grid
141 | plt.scatter(grid_x, grid_y, c="lightgray", s=200, marker="s")
142 | 
143 | # Draw grid lines
144 | for x in range(som.width):
145 |     plt.axvline(x, color="lightgray", linestyle="--")
146 | for y in range(som.height):
147 |     plt.axhline(y, color="lightgray", linestyle="--")
148 | 
149 | # Get the unit assignments for each sample
150 | unit_assignments = som.ins_unit_assign.numpy()
151 | 
152 | # Calculate the positions of samples on the grid
153 | sample_x = grid_x[unit_assignments].astype(float)
154 | sample_y = grid_y[unit_assignments].astype(float)
155 | 
156 | # Add some jitter to prevent complete overlap
157 | jitter = 0.2
158 | sample_x += np.random.uniform(-jitter, jitter, sample_x.shape)
159 | sample_y += np.random.uniform(-jitter, jitter, sample_y.shape)
160 | 
161 | # Plot the samples on the grid
162 | scatter = plt.scatter(
163 |     sample_x, sample_y, c=som.ins_unit_assign, cmap="viridis", alpha=0.6
164 | )
165 | 
166 | # Highlight outlier samples
167 | outlier_mask = ~som.inst_saliency.numpy()
168 | plt.scatter(
169 |     sample_x[outlier_mask],
170 |     sample_y[outlier_mask],
171 |     facecolors="none",
172 |     edgecolors="red",
173 |     s=50,
174 |     linewidths=2,
175 | )
176 | 
177 | # Highlight outlier units
178 | for unit in np.where(~som.unit_saliency.numpy())[0]:
179 |     unit_x, unit_y = som.unit_cords(unit)
180 |     plt.gca().add_patch(
181 |         plt.Circle((unit_x, unit_y), 0.4, fill=False, edgecolor="red", linewidth=2)
182 |     )
183 | 
184 | # Set labels and title
185 | plt.xlabel("SOM Width")
186 | plt.ylabel("SOM Height")
187 | plt.title("Samples Mapped to Perfect 2D SOM Lattice")
188 | 
189 | # Set tick labels
190 | plt.xticks(range(som.width))
191 | plt.yticks(range(som.height))
192 | 
193 | # Add colorbar
194 | cbar = plt.colorbar(scatter)
195 | cbar.set_label("Unit Assignment")
196 | 
197 | # Adjust plot limits
198 | plt.xlim(-0.5, som.width - 0.5)
199 | plt.ylim(-0.5, som.height - 0.5)
200 | 
201 | # Show the plot
202 | plt.tight_layout()
203 | plt.show()
204 | 
def _save_digit_image(image, path):
    """Stretch ``image`` to the 0-255 range and save it to ``path`` via PIL."""
    low = image.min()
    value_range = image.max() - low
    if value_range == 0:
        # A constant image has no contrast to stretch; write zeros instead of
        # dividing by zero and casting NaNs to uint8.
        scaled = np.zeros_like(image, dtype=np.uint8)
    else:
        scaled = ((image - low) / value_range * 255).astype(np.uint8)
    Image.fromarray(scaled).save(path)


# Create a folder to save outlier images
output_folder = "outlier_digits"
os.makedirs(output_folder, exist_ok=True)

# Get the original digit images and their labels
digits = load_digits()
images = digits.images
labels = digits.target

# Find the indices of outlier samples
outlier_indices = np.where(~salient_mask)[0]

# Save outlier images, one file per outlier sample
for i, idx in enumerate(outlier_indices):
    filename = f"outlier_{i}_label_{labels[idx]}.png"
    _save_digit_image(images[idx], os.path.join(output_folder, filename))

print(f"Saved {len(outlier_indices)} outlier images to '{output_folder}' folder.")

# Find samples closest to salient units
salient_folder = "salient_digits"
os.makedirs(salient_folder, exist_ok=True)
salient_unit_indices = np.where(som.unit_saliency.numpy())[0]

for i, unit_idx in enumerate(salient_unit_indices):
    # Find the sample closest (Euclidean distance) to this salient unit
    unit_weights = som.W[unit_idx].detach().numpy()
    distances = np.linalg.norm(data.numpy() - unit_weights, axis=1)
    closest_sample_idx = np.argmin(distances)

    filename = f"salient_unit_{i}_label_{labels[closest_sample_idx]}.png"
    _save_digit_image(
        images[closest_sample_idx], os.path.join(salient_folder, filename)
    )

print(
    f"Saved {len(salient_unit_indices)} salient unit images to '{salient_folder}' folder."
)
265 | 


--------------------------------------------------------------------------------
/rsom.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Rectifying Self Organizing Maps a.k.a. RSOM
  3 | 
  4 | RSOM is a clustering and outlier detection method that is built on
  5 | classic Self Organizing Maps.
  6 | 
  7 | It includes Batch and Stochastic learning rules. There are two different
  8 | implementations. One is based on Numpy and the other on Theano. If you have
  9 | a tall and wide data matrix, we suggest using the Theano version. Otherwise
 10 | the Numpy version is faster. You can also use a GPU with Theano, but you
 11 | need to set the Theano configuration.
 12 | 
 13 | For more detail about RSOM refer to http://arxiv.org/abs/1312.4384
 14 | 
 15 | AUTHOR:
 16 |     Eren Golge
 17 |     erengolge@gmail.com
 18 |     www.erengolge.com
 19 | """
 20 | 
 21 | """
 22 | TO DO:
 23 | -> Try dot product distance instead of Euclidean
 24 | -> Normalize only updated weight vectors in that epoch
 25 | -> compare code with https://github.com/JustGlowing/minisom/blob/master/minisom.py
 26 | -> print resulting objective values
 27 | -> write bookkeeping for best objective value
 28 | -> learning rate is already decreasing so radius might be good to keep it constant
 29 | -> UPDATE only winners
 30 | """
 31 | 
 32 | import logging
 33 | from typing import Optional, Tuple
 34 | 
 35 | import numpy as np
 36 | import torch
 37 | 
 38 | # Set up logging
 39 | logging.basicConfig(
 40 |     level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
 41 | )
 42 | logger = logging.getLogger(__name__)
 43 | 
 44 | 
 45 | class RSOM(torch.nn.Module):
    def __init__(
        self,
        data: torch.Tensor,
        num_units: int = 10,
        height: Optional[int] = None,
        width: Optional[int] = None,
        alpha_max: float = 0.05,
        alpha_min: float = 0.001,
        set_count_activations: bool = True,
        set_outlier_unit_det: bool = True,
        set_inunit_outlier_det: bool = True,
        outlier_unit_thresh: float = 0.5,
        inunit_outlier_thresh: float = 95,
        dist: str = "euclidean",
        log_full_data_cost: bool = False,
        steps_to_full_data_cost: int = -1,  # TODO: number of steps to compute full data cost
    ):
        """Set up a Rectifying Self Organizing Map (RSOM) for ``data``.

        RSOM is a clustering and outlier detection method built on the
        classic Self Organizing Map.

        Args:
            data: Input data. ``data.shape[1]`` is taken as the feature
                dimension, so a 2-D tensor is expected.
            num_units: Number of units. May be replaced by
                ``height * width`` when the map shape is estimated.
            height: Height of the map. If ``height`` or ``width`` is None,
                both are estimated from ``data``.
            width: Width of the map. See ``height``.
            alpha_max: Maximum learning rate.
            alpha_min: Minimum learning rate.
            set_count_activations: Whether to count activations.
            set_outlier_unit_det: Whether to detect outlier units.
            set_inunit_outlier_det: Whether to detect in-unit outliers.
            outlier_unit_thresh: Threshold for outlier unit detection.
            inunit_outlier_thresh: Threshold for in-unit outlier detection.
            dist: Distance metric. Can be "euclidean" or "cosine".
            log_full_data_cost: Whether to log full data cost or use exponential moving average. Computing full data
                cost is expensive and might cause OOM.
            steps_to_full_data_cost: Accepted but neither stored nor used by
                this constructor (see the TODO on the parameter).

        Raises:
            ValueError: If ``dist`` is neither "euclidean" nor "cosine".

        """

        super(RSOM, self).__init__()
        # Keep a reference to the data and the raw configuration values.
        self.X = data
        self.num_units = num_units
        self.height = height
        self.width = width
        self.alpha_max = alpha_max
        self.alpha_min = alpha_min
        self.set_count_activations = set_count_activations
        self.set_outlier_unit_det = set_outlier_unit_det
        self.set_inunit_outlier_det = set_inunit_outlier_det
        self.outlier_unit_thresh = outlier_unit_thresh
        self.inunit_outlier_thresh = inunit_outlier_thresh
        self.log_full_data_cost = log_full_data_cost

        # Must run before W is created: it may change height, width and num_units.
        self._estimate_map_shape()
        self.data_dim = self.X.shape[1]

        # One weight row per unit, drawn from a standard normal and then
        # rescaled to unit L2 norm.
        self.W = torch.nn.Parameter(torch.randn(self.num_units, self.data_dim))
        self._normalize_weights()

        # NOTE(review): the metric is validated only after the map shape has
        # been estimated and the weights allocated.
        self.distance_metric = dist
        if self.distance_metric not in ["euclidean", "cosine"]:
            raise ValueError("distance_metric must be either 'euclidean' or 'cosine'")

        # Per-unit state: zero-initialised, with every unit starting as salient.
        self.activations = torch.zeros(self.num_units)
        self.unit_saliency_coeffs = torch.zeros(self.num_units)
        self.unit_saliency = torch.ones(self.num_units, dtype=torch.bool)
        # Placeholders (empty tensors) that are filled in by other methods.
        self.inst_saliency = torch.tensor([])
        self.ins_unit_assign = torch.tensor([])
        self.ins_unit_dist = torch.tensor([])
        self.unit_coher = torch.tensor([])
114 | 
115 |     def _normalize_weights(self):
116 |         self.W.data = self.W.data / torch.norm(self.W.data, dim=1, keepdim=True)
117 | 
118 |     def _estimate_map_shape(self):
119 |         if self.height is None or self.width is None:
120 |             u, s, v = torch.svd(self.X)
121 |             ratio = s[0] / s[1]
122 |             self.height = min(
123 |                 self.num_units, int(np.ceil(np.sqrt(self.num_units / ratio)))
124 |             )
125 |             self.width = int(np.ceil(self.num_units / self.height))
126 |             self.num_units = self.height * self.width
127 |         logging.info(
128 |             f"Estimated map size is -> height = {self.height}, width = {self.width}"
129 |         )
130 | 
131 |     def unit_cords(self, index: int) -> Tuple[int, int]:
132 |         return index % self.width, index // self.width
133 | 
134 |     def _calc_distance(self, X, X2=None):
135 |         if self.distance_metric == "euclidean":
136 |             return self._euclidean_distance(X, X2)
137 |         elif self.distance_metric == "cosine":
138 |             return self._cosine_distance(X)
139 | 
140 |     def _euclidean_distance(self, X, X2=None):
141 |         if X2 is None:
142 |             X2 = (X**2).sum(1)[:, None]
143 |         W2 = (self.W**2).sum(1)[:, None]
144 |         return -2 * torch.mm(self.W, X.t()) + W2 + X2.t()
145 | 
146 |     def _cosine_distance(self, X):
147 |         X_norm = X / torch.norm(X, dim=1, keepdim=True)
148 |         W_norm = self.W / torch.norm(self.W, dim=1, keepdim=True)
149 |         return 1 - torch.mm(W_norm, X_norm.t())
150 | 
151 |     def find_neighbors(self, unit_id: int, radius: int) -> torch.Tensor:
152 |         neighbors = torch.zeros(1, self.num_units)
153 |         unit_x, unit_y = self.unit_cords(unit_id)
154 | 
155 |         min_y = max(int(unit_y - radius), 0)
156 |         max_y = min(int(unit_y + radius), self.height - 1)
157 |         min_x = max(int(unit_x - radius), 0)
158 |         max_x = min(int(unit_x + radius), self.width - 1)
159 | 
160 |         for y in range(min_y, max_y + 1):
161 |             for x in range(min_x, max_x + 1):
162 |                 dist = abs(y - unit_y) + abs(x - unit_x)
163 |                 neighbors[0, x + (y * self.width)] = dist
164 | 
165 |         return neighbors
166 | 
167 |     def best_match(self, X: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
168 |         if X.dim() == 1:
169 |             X = X.unsqueeze(0)
170 |         X2 = (X**2).sum(1).unsqueeze(1)
171 |         D = -2 * torch.mm(self.W, X.t()) + (self.W**2).sum(1).unsqueeze(1) + X2.t()
172 |         BMU = (D == D.min(0)[0]).float().t()
173 |         return BMU, D
174 | 
175 |     def assing_to_units(self, X=None):
176 |         if X is None:
177 |             D = self._calc_distance(self.X)
178 |             self.ins_unit_assign = D.argmin(axis=0)
179 |             self.ins_unit_dist = D[self.ins_unit_assign, torch.arange(self.X.shape[0])]
180 |         else:
181 |             D = self._calc_distance(X)
182 |             ins_unit_assign = D.argmin(axis=0)
183 |             ins_unit_dist = D[ins_unit_assign, torch.arange(X.shape[0])]
184 |             return ins_unit_assign, ins_unit_dist
185 | 
186 |     def set_params(self, num_epoch: int) -> dict:
187 |         U = {"alphas": [], "H_maps": [], "radiuses": []}
188 | 
189 |         dist_map = torch.zeros(self.num_units, self.num_units)
190 |         radius = np.ceil(1 + np.floor(min(self.width, self.height) - 1) / 2) - 1
191 |         for u in range(self.num_units):
192 |             dist_map[u, :] = self.find_neighbors(u, self.num_units)
193 | 
194 |         for epoch in range(num_epoch):
195 |             alpha = self.alpha_max - self.alpha_min
196 |             alpha = alpha * (num_epoch - epoch) / num_epoch + self.alpha_min
197 |             radius = np.ceil(1 + np.floor(min(self.width, self.height) - 1) / 2) - 1
198 |             radius = radius * (num_epoch - epoch) / (num_epoch - 1) - 1
199 |             radius = max(radius, 0)
200 | 
201 |             neigh_updt_map = alpha * (1 - dist_map / float((1 + radius)))
202 |             neigh_updt_map[dist_map > radius] = 0
203 | 
204 |             U["H_maps"].append(neigh_updt_map)
205 |             U["alphas"].append(alpha)
206 |             U["radiuses"].append(radius)
207 | 
208 |         return U
209 | 
210 |     def train_batch(
211 |         self,
212 |         num_epoch: Optional[int] = None,
213 |         batch_size: Optional[int] = None,
214 |         verbose: bool = True,
215 |     ):
216 |         """
217 |         Args:
218 |             num_epoch: number of epochs to train
219 |             batch_size: number of samples to train in one batch
220 |             verbose: if True, print the progress of training
221 |         """
222 |         if num_epoch is None:
223 |             num_epoch = 500 * self.num_units
224 | 
225 |         if batch_size is None:
226 |             batch_size = self.X.shape[0]
227 | 
228 |         logger.info("Learning...")
229 |         U = self.set_params(num_epoch)
230 | 
231 |         X2 = None
232 |         if batch_size == self.X.shape[0]:
233 |             X2 = (self.X**2).sum(1).unsqueeze(1)
234 | 
235 |         for epoch in range(num_epoch):
236 |             logger.info(f"Epoch --- {epoch}")
237 |             update_rate = U["H_maps"][epoch]
238 |             learn_rate = U["alphas"][epoch]
239 | 
240 |             shuffle_indices = torch.randperm(self.X.shape[0])
241 |             win_counts = torch.zeros(self.num_units)
242 | 
243 |             batches = torch.split(shuffle_indices, batch_size)
244 |             num_steps = len(batches)
245 | 
246 |             for step, batch_indices in enumerate(batches):
247 |                 batch_data = self.X[batch_indices, :]
248 |                 D = self._calc_distance(batch_data, X2)
249 |                 BMU = (D == D.min(0)[0][None, :]).float().t()
250 | 
251 |                 win_counts += BMU.sum(dim=0)
252 | 
253 |                 if self.set_count_activations:
254 |                     self.activations += win_counts
255 | 
256 |                 A = torch.mm(BMU, update_rate)
257 |                 S = A.sum(0)
258 |                 non_zeros = S.nonzero().squeeze()
259 | 
260 |                 self.W.data[non_zeros] = torch.mm(A[:, non_zeros].t(), batch_data) / S[
261 |                     non_zeros
262 |                 ].unsqueeze(1)
263 | 
264 |                 self._print_cost(X2, D, epoch, num_epoch, step, num_steps)
265 | 
266 |             if self.set_outlier_unit_det:
267 |                 self._update_unit_saliency(win_counts, update_rate, learn_rate)
268 | 
269 |         if self.set_count_activations:
270 |             self.activations /= self.activations.sum()
271 | 
272 |         self.assing_to_units()
273 | 
274 |         if self.set_outlier_unit_det:
275 |             self._find_outlier_units()
276 | 
277 |         if self.set_inunit_outlier_det:
278 |             self._find_inunit_outliers()
279 | 
280 |     def _print_cost(
281 |         self,
282 |         X2: torch.Tensor,
283 |         D: torch.Tensor,
284 |         epoch: int,
285 |         num_epoch: int,
286 |         step: int,
287 |         num_steps: int,
288 |     ):
289 |         batch_cost = D.min(0)[0].mean()
290 | 
291 |         if self.log_full_data_cost:
292 |             # TODO: handle when data is too large
293 |             if self.distance_metric == "euclidean":
294 |                 D = self._calc_distance(self.X, X2)
295 |                 cost = torch.norm(D.min(0)[0], p=1) / self.X.shape[0]
296 |             elif self.distance_metric == "cosine":
297 |                 D = self._calc_distance(self.X)
298 |                 cost = D.min(0)[0].mean()  # Average minimum cosine distance
299 |         else:
300 |             # exponential moving average cost
301 |             if not hasattr(self, "avg_cost"):
302 |                 self.avg_cost = batch_cost
303 |             else:
304 |                 self.avg_cost = self.avg_cost * 0.01 + batch_cost * 0.99
305 |             cost = self.avg_cost
306 | 
307 |         logger.info(
308 |             f"epoch {epoch} / {num_epoch} -- step {step} / {num_steps} -- avg-cost: {cost.item():.6f} -- batch-cost: {batch_cost.item():.6f}"
309 |         )
310 | 
311 |     def _update_unit_saliency(
312 |         self, win_counts: torch.Tensor, update_rate: torch.Tensor, learn_rate: float
313 |     ):
314 |         excitations = (update_rate * win_counts.unsqueeze(1)).sum(dim=0) / learn_rate
315 |         excitations = excitations / excitations.sum()
316 |         single_excitations = win_counts * learn_rate
317 |         single_excitations = single_excitations / single_excitations.sum()
318 |         self.unit_saliency_coeffs += excitations + single_excitations
319 | 
320 |     def _find_outlier_units(self):
321 |         self.unit_saliency_coeffs /= self.unit_saliency_coeffs.sum()
322 |         self.unit_saliency = (
323 |             self.unit_saliency_coeffs > self.outlier_unit_thresh / self.num_units
324 |         )
325 | 
326 |         self.inst_saliency = torch.ones(self.X.shape[0], dtype=torch.bool)
327 |         outlier_units = torch.where(self.unit_saliency == False)[0]
328 |         for i in outlier_units:
329 |             self.inst_saliency[torch.where(self.ins_unit_assign == i)[0]] = False
330 | 
331 |     def _find_inunit_outliers(self):
332 |         if self.inst_saliency.numel() == 0:
333 |             self.inst_saliency = torch.ones(self.X.shape[0], dtype=torch.bool)
334 | 
335 |         for i in torch.unique(self.ins_unit_assign):
336 |             indices = torch.where(self.ins_unit_assign == i)[0]
337 |             unit_thresh = torch.quantile(
338 |                 self.ins_unit_dist[indices], self.inunit_outlier_thresh / 100
339 |             )
340 |             outlier_insts = indices[self.ins_unit_dist[indices] > unit_thresh]
341 |             self.inst_saliency[outlier_insts] = False
342 | 
343 |     def salient_inst_index(self) -> torch.Tensor:
344 |         return torch.where(self.inst_saliency == True)[0]
345 | 
346 |     def salient_unit_index(self) -> torch.Tensor:
347 |         return torch.where(self.unit_saliency == True)[0]
348 | 
349 |     def salient_insts(self) -> torch.Tensor:
350 |         return self.X[self.inst_saliency]
351 | 
352 |     def salient_units(self) -> torch.Tensor:
353 |         return self.W[self.unit_saliency]
354 | 
355 |     def inst_to_unit_mapping(self) -> torch.Tensor:
356 |         return torch.stack((torch.arange(self.X.shape[0]), self.ins_unit_assign))
357 | 
358 | 
if __name__ == "__main__":
    # Demo: train a SOM on the scikit-learn digits data set and plot the
    # result with the project's som_plot helpers.
    import matplotlib.pyplot as plt
    import som_plot
    import torch
    from sklearn.datasets import load_digits
    from sklearn.preprocessing import StandardScaler

    # Load the digits dataset
    digits = load_digits()
    X = digits.data
    y = digits.target

    # Preprocess the data
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Convert to PyTorch tensor
    X_tensor = torch.tensor(X_scaled, dtype=torch.float32)

    # Initialize and train SOM
    som = SOM(X_tensor, num_units=100, alpha_max=0.05, alpha_min=0.01)
    som.train_batch(num_epoch=1000, batch_size=32, verbose=True)

    # Get the weights and assign instances to units
    W = som.W.detach().numpy()
    som.assing_to_units()

    # Plot scatter plot
    som_plot.som_plot_scatter(W, X_scaled, som.activations.numpy())

    # Plot outlier scatter plot
    som_plot.som_plot_outlier_scatter(
        W,
        X_scaled,
        som.unit_saliency.numpy(),
        som.inst_saliency.numpy(),
        som.activations.numpy(),
    )

    # Plot mapping.  The unit-by-instance distance matrix cannot be laid out on
    # the lattice as is, so it is reduced to one value per unit first.
    # NOTE(review): the mean distance to all instances is used as that value;
    # confirm this is what som_plot_mapping is meant to display.
    unit_to_inst = som._euclidean_distance(X_tensor).detach()
    distance_map = unit_to_inst.mean(dim=1).numpy()
    distance_map = distance_map.reshape(som.height, som.width)
    som_plot.som_plot_mapping(distance_map)

    plt.show()
408 | 


--------------------------------------------------------------------------------
/som_theano.py:
--------------------------------------------------------------------------------
  1 | from __future__ import division
  2 | '''
    Rectifying Self Organizing Maps a.k.a. RSOM
    
    RSOM is a clustering and outlier detection method that builds on the
    classic Self Organizing Map.
    
    It includes Batch and Stochastic learning rules. There are two different
    implementations. One is based on Numpy and the other on Theano. If you have
    a tall and wide data matrix, we suggest using the Theano version. Otherwise
    the Numpy version is faster. You can also use a GPU with Theano, but you
    need to set the Theano configuration accordingly.
 13 |     
 14 |     For more detail about RSOM refer to http://arxiv.org/abs/1312.4384
 15 |     
 16 |     AUTHOR:
 17 |         Eren Golge
 18 |         erengolge@gmail.com
 19 |         www.erengolge.com
 20 | '''
 21 | 
 22 | """
 23 | TO DO:
-> Try dot product distance instead of Euclidean
-> Normalize only the weight vectors updated in that epoch
-> compare code with https://github.com/JustGlowing/minisom/blob/master/minisom.py
-> print resulting objective values
-> write bookkeeping for the best objective value
-> learning rate is already decreasing, so it might be good to keep the radius constant
-> UPDATE only winners
 31 | """
 32 | 
 33 | import warnings
 34 | from random import *
 35 | from math import *
 36 | import sys
 37 | import scipy
 38 | import numpy as np
 39 | from numpy import linalg
 40 | from som_plot import *
 41 | import theano
 42 | import theano.tensor as T
 43 | from theano import function, config, shared, sandbox
 44 | from theano import ProfileMode
 45 | from collections  import Counter
 46 | #from theano import ProfileMode
 47 | 
 48 | EPS =  2.2204e-16;
 49 | 
 50 | class SOM(object):
 51 | 
 52 |     def __init__(self, DATA=None,  num_units = 10, height=None, width=None, \
 53 |      alpha_max=0.05, alpha_min=0.001, set_count_activations = True, \
 54 |      set_outlier_unit_det = True, set_inunit_outlier_det = True, outlier_unit_thresh = 0.5,\
 55 |      inunit_outlier_thresh = 95):
 56 |          
 57 |         '''
 58 |              CONSTRUCTOR PARAMETERS:
 59 | 
 60 |                 DATA                    --- data matrix with shape nxm n is number of instances and
 61 |                                          m is number of variables
 62 |                 num_units               --- number of som units. This can be changes a bit after
 63 |                                          2D lattice shape is computed by eigen heuristic, if its shape
 64 |                                          paramters are not given already.
 65 |                 height                  --- height of the 2D lattice of SOM
 66 |                 width                   --- width of the 2D lattice of SOM. height * width = num_inst
 67 |                 alpha_max               --- is the maximum learning rate that is gradually 
 68 |                                          decreasing up to alpha_min
 69 |                 alpha_min               --- is the minimum learning rate attined at the last epoch
 70 |                 set_count_activations   --- whether count the activation of each unit
 71 |                 set_outlier_unit_det    --- whether outlier units are detected. If a unit 
 72 |                                          is detected as outlier, all of the assigned items signed as outlier as well
 73 |                 set_inunit_outlier_det  --- wheter in-unit outlier instances are detected
 74 |                 outlier_unit_thresh     --- default value 0.5 works good for many cases
 75 |                 inunit_outlier_thresh   --- is the upper whisker percentage.
 76 |         '''
 77 | 
 78 |         self.X = DATA
 79 |         self.num_units = num_units
 80 |         if height == None or width == None:
 81 |             self._estimate_map_shape()
 82 |             self.num_units = self.height * self.width
 83 |         else:
 84 |             self.height = height
 85 |             self.width = width
 86 |         
 87 |         if self.height * self.width != self.num_units:
 88 |             print "Number of units is not conforming to lattice size so it is set num_units = width + heigth"
 89 |             self.num_units = self.height * self.width
 90 |             print "New number of units : ",self.num_units
 91 |             raw_input("Press Enter to continue...")
 92 |             
 93 |         self.data_dim = DATA.shape[1]
 94 | 
 95 |         # normalize data and save mean and std values
 96 |         self.data_mean = 0
 97 |         self.data_std  = 0
 98 |         #self._norm_data()
 99 | 
100 |         # optimization parameters
101 |         self.alpha_max = alpha_max
102 |         self.alpha_min = alpha_min
103 |         
104 |         self.W = np.random.random((self.num_units , self.data_dim))
105 |         self.W = np.array([v/linalg.norm(v) for v in self.W]) # normalizat   
106 |         
107 |         # book keeping
108 |         self.best_W = self.W
109 |         self.best_W_obj = 0
110 |         
111 |         # unit statistics
112 |         self.set_count_activations = set_count_activations
113 |         self.activations = np.zeros((self.num_units))
114 |         self.set_outlier_unit_det = set_outlier_unit_det
115 |         self.set_inunit_outlier_det = set_inunit_outlier_det  
116 |         self.unit_saliency_coeffs = np.zeros((self.num_units))
117 |         self.unit_saliency = np.ones((self.num_units), dtype=bool)
118 |         self.inst_saliency = np.array(())
119 |         self.outlier_unit_thresh = outlier_unit_thresh
120 |         self.inunit_outlier_thresh = inunit_outlier_thresh
121 |         self.ins_unit_assign = np.array(())
122 |         self.ins_unit_dist = np.array(())
123 |         self.unit_coher = np.array(())
124 | 
125 |     unit_x = lambda self, index, width : index % width
126 |     unit_y = lambda self, index, width : np.floor( index / width )
127 |     
128 |     def unit_cords(self, index):
129 |         return self.unit_x(index, self.width), self.unit_y(index, self.width)
130 |     
131 |     # Euclidean distance with pre-computed data square X2
132 |     def _euq_dist(self, X2, X):
133 |         return -2*np.dot(self.W, X.T) + (self.W**2).sum(1)[:, None] + X2.T
134 |         
135 |     # Print function for Numpy based optimization functions  
136 |     def _print_cost(self,X2, epoch, num_epoch):
137 |         D = self._euq_dist(X2, self.X)
138 |         print "epoch", epoch, "of", num_epoch, " cost: ", np.linalg.norm(D.min(0), ord=1) / self.X.shape[0]
139 |     
140 |     
141 |     def set_params(self, num_epoch):
142 |         
143 |         '''
144 |             Before starting to learning, all imperative parameters are set regarding
145 |             corresponding epoch. It wastes some additional memory but proposes faster 
146 |             learning speed.
147 |             
148 |             Outputs:
149 |                 U --- is a dictionary including all necessary parameter structures
150 |                     
151 |                     U['alphas'] -- learning rates for each epoch
152 |                     U['H_maps'] -- matrix array of neighboorhood masks
153 |                     U['radiuses'] -- neighboor radiuses for each epoch
154 |                     
155 |         '''
156 |         
157 |         U = {'alphas':[], 'H_maps':[], 'radiuses':[]}              
158 |         alphas = [None]*num_epoch       
159 |         H_maps = [None]*num_epoch
160 |         radiuses = [None]*num_epoch
161 | 
162 |         dist_map = np.zeros((self.num_units, self.num_units))
163 |         radius = np.ceil(1 + floor(min(self.width, self.height)-1)/2)-1
164 |         for u in range(int(self.num_units)):
165 |             #for r in range(1,int(radius)+1,1):  
166 |             dist_map[u,:] = self.find_neighbors(u,self.num_units)
167 |         
168 |         for epoch in range(0,num_epoch,1):
169 |             alpha = self.alpha_max - self.alpha_min
170 |             alpha = alpha * (num_epoch - epoch)
171 |             alpha = alpha / num_epoch + self.alpha_min
172 |             radius = np.ceil(1 + floor(min(self.width, self.height)-1)/2)-1
173 |             radius = radius * (num_epoch - epoch)
174 |             radius = ceil(radius / (num_epoch - 1))-1
175 |             if radius < 0 :
176 |                 radius = 0 
177 |             neigh_updt_map = alpha * (1 - dist_map/float((1 + radius))) 
178 |            # neigh_updt_map[dist_map == 0] = 1
179 |             neigh_updt_map[dist_map > radius] = 0 # Optimize this part
180 |             H_maps[epoch] = neigh_updt_map
181 |             alphas[epoch] = alpha
182 |             radiuses[epoch] = radius
183 | 
184 |         U['alphas'] = alphas
185 |         U['H_maps'] = H_maps
186 |         U['radiuses'] = radiuses
187 |         return U
188 | 
189 |     def train_stoch(self, num_epoch, verbose =True):
190 |         
191 |         '''
192 |             Numpy based stochastic training where each instance is take individually
193 |             and weight are updatesd in terms of winner neuron. 
194 |             
195 |             Generally faster than Theano version
196 |         '''
197 | 
198 |         if num_epoch == None:
199 |             num_epoch = 500 * self.num_units # Kohonen's suggestion
200 |             
201 |         U = self.set_params(num_epoch)
202 |         X2 = (self.X**2).sum(1)[:, None]
203 |         
204 |         for epoch in range(num_epoch):
205 |             shuffle_indices = np.random.permutation(self.X.shape[0])
206 |             
207 |             update_rate = U['H_maps'][epoch]
208 |             learn_rate = U['alphas'][epoch]
209 |             win_counts = np.zeros((self.num_units))
210 |             for i in shuffle_indices:
211 |                 instance = self.X[i,:]
212 |                 D = self._euq_dist(X2[i][None,:], instance[None,:])
213 |                 BMU_indx = np.argmin(D)
214 |                 
215 |                 win_counts[BMU_indx] += 1
216 |                 if self.set_count_activations:
217 |                     self.activations[BMU_indx] += 1
218 |                 
219 |                 self.W  = self.W + learn_rate * update_rate[...,BMU_indx,None]* (instance - self.W)
220 |                 ## Normalization is not imperative unless given input instances are normalized
221 |                 # self.W = self.W / np.linalg.norm(self.W)
222 | 
223 |             if verbose and (epoch % 1) == 0:
224 |                 self._print_cost(X2, epoch, num_epoch)
225 |             
226 |             if self.set_outlier_unit_det:
227 |                 self._update_unit_saliency(win_counts, update_rate, learn_rate)      
228 | 
229 |         # Normalize activation counts
230 |         if self.set_count_activations:
231 |             total_act = self.activations.sum()
232 |             self.activations = self.activations / total_act
233 |         
234 |         self.assing_to_units() # final unit assignments
235 | 
236 |         if self.set_outlier_unit_det:
237 |             self._find_outlier_units()
238 |         
239 |         if self.set_inunit_outlier_det:
240 |             self._find_inunit_outliers()
241 |             
242 |             
243 |                 
244 |     def train_stoch_theano(self, num_epoch = None, verbose =True):
245 |         
246 |         '''
247 |             Theano based stochastic learning
248 |         '''
249 |         
250 |         warnings.simplefilter("ignore", DeprecationWarning)
251 |         warnings.filterwarnings("ignore")
252 |         
253 |         if num_epoch == None:
254 |             num_epoch = 500 * self.X.shape[0]
255 |         
256 |         # Symmbol variables
257 |         X = T.dmatrix('X')
258 |         WIN = T.dmatrix('WIN')
259 |         H = T.dmatrix('H')
260 |         
261 |         # Init weights random
262 |         W = theano.shared(self.W, name="W")
263 |         #W = theano.shared(rng.randn(cluster_num, data.shape[1]).astype(theano.config.floatX), name="W")
264 | 
265 |         # Find winner unit
266 |         D = (W**2).sum(axis=1, keepdims=True) + (X**2).sum(axis=1, keepdims=True).T - 2*T.dot(W, X.T) 
267 |         bmu = (D).argmin(axis=0)
268 |         dist = T.dot(WIN.T, X) - WIN.sum(0)[:, None] * W
269 |         err = D.min(0).norm(1)/X.shape[0]
270 | 
271 |         update = function([X,WIN, H],outputs=err,updates=[(W, W + T.addbroadcast(H,1)*dist)])
272 |         find_bmu = function([X], bmu)
273 | 
274 |         # Update
275 |         U = self.set_params(num_epoch)
276 |         for epoch in range(num_epoch):
277 |             update_rate = U['H_maps'][epoch]
278 |             learn_rate = U['alphas'][epoch]
279 |             win_counts = np.zeros((self.num_units))
280 |             shuff_indx = np.random.permutation(self.X.shape[0])
281 |             for i in shuff_indx:
282 |                 ins = self.X[i, :][None,:]
283 |                 D = find_bmu(ins)
284 |                 S = np.zeros([ins.shape[0],self.num_units])
285 |                 #S = np.zeros([batch,cluster_num], theano.config.floatX)
286 |                 S[:,D] = 1
287 |                 win_counts[D] += 1 
288 |                 h = update_rate[D,:].sum(0)[:,None]
289 |                 cost = update(ins,S,h)
290 |                 
291 |             if verbose:
292 |                 print "Avg. centroid distance -- ", cost,"\t EPOCH : ",epoch , " of ", num_epoch
293 |         if self.set_count_activations:
294 |             self.activations += win_counts
295 |             
296 |         if self.set_outlier_unit_det:
297 |             self._update_unit_saliency(win_counts, update_rate, learn_rate)
298 | 
299 |          # get the data from shared theano variable        
300 |         self.W = W.get_value()
301 | 
302 |         # Normalize activation counts
303 |         if self.set_count_activations:
304 |             total_act = self.activations.sum()
305 |             self.activations = self.activations / total_act
306 | 
307 |         self.assing_to_units() # final unit assignments
308 | 
309 |         if self.set_outlier_unit_det:
310 |             self._find_outlier_units()
311 | 
312 |         if self.set_inunit_outlier_det:
313 |             self._find_inunit_outliers()
314 |             
315 | 
316 |     def train_batch_theano(self, num_epoch = None, batch_size = None, verbose=True):
317 |         '''
318 |             Theano based batch learning. If you don't define batch size, then all the
319 |             instances are fed for each epoch. 
320 |             
321 |             It is preferred to use batch learning initially then fine tune with 
322 |             stochastic version
323 |             
324 |             In general Theano version is faster if the data is not very small.
325 |         '''
326 |         
327 |         if num_epoch == None:
328 |             num_epoch = 500 * self.X.shape[0]
329 |          
330 |         if batch_size == None:
331 |             batch_size = self.X.shape[0]
332 |         
333 |         # Symmbol variables
334 |         X = T.dmatrix('X')
335 |         WIN = T.dmatrix('WIN')
336 |         alpha = T.dscalar('learn_rate')
337 |         H = T.dmatrix('update_rate')
338 | 
339 |         # Init weights random
340 |         W = theano.shared(self.W, name='W')
341 |         W_old = W.get_value()
342 | 
343 |         # Find winner unit
344 |         D = (W**2).sum(axis=1, keepdims=True) + (X**2).sum(axis=1, keepdims=True).T - 2*T.dot(W, X.T)
345 |         BMU = (T.eq(D,D.min(axis=0, keepdims=True))).T
346 |         dist = T.dot(BMU.T, X) - BMU.sum(0)[:, None] * W
347 |         err = D.min(0).sum().norm(1)/X.shape[0] 
348 | 
349 |         #update = function([X,WIN,alpha],outputs=err,updates=[(W, W + alpha * dist)])
350 |         
351 |         A = T.dot(BMU, H)
352 |         S = A.sum(axis=0)
353 |         update_neigh_no_verbose = function([X, H],outputs=BMU, updates=[(W,  T.where((S[:,None] > 0) ,T.dot(A.T, X), W) / T.where((S > 0), S, 1)[:,None])])
354 |         update_neigh = function([X, H],outputs=[err, BMU], updates=[(W,  T.where((S[:,None] > 0) ,T.dot(A.T, X), W) / T.where((S > 0), S, 1)[:,None])])
355 |         find_bmu = function([X], BMU)
356 | 
357 | #        if any([x.op.__class__.__name__ in ['Gemv', 'CGemv', 'Gemm', 'CGemm'] for x in
358 | #            update_neigh.maker.fgraph.toposort()]):
359 | #            print 'Used the cpu'
360 | #        elif any([x.op.__class__.__name__ in ['GpuGemm', 'GpuGemv'] for x in
361 | #            update_neigh.maker.fgraph.toposort()]):
362 | #            print 'Used the gpu'
363 | #        else:
364 | #            print 'ERROR, not able to tell if theano used the cpu or the gpu'
365 | #            print update_neigh.maker.fgraph.toposort()
366 | 
367 |         U = self.set_params(num_epoch)
368 |         for epoch in range(num_epoch):
369 |             print 'Epoch --- ', epoch
370 |             update_rate = U['H_maps'][epoch]
371 |             learn_rate = U['alphas'][epoch]
372 |             win_counts = np.zeros((self.num_units))
373 |             for i in range(0, self.X.shape[0], batch_size):
374 |                 batch_data = self.X[i:i+batch_size, :]
375 |                 #temp = find_bmu(batch_data)
376 |                 if verbose and epoch % 5 == 0:
377 |                     cost, winners = update_neigh(batch_data, update_rate)
378 |                 else:
379 |                     winners = update_neigh_no_verbose(batch_data, update_rate)
380 |                 win_counts =+ winners.sum(axis=0)
381 |                 ## Normalization is not imperative unless given input instances are normalized
382 |                 # self.W = self.W / np.linalg.norm(self.W)
383 |             
384 |                 
385 |             if verbose and epoch % 5 == 0:
386 |                 print "Avg. centroid distance -- ", cost,"\t EPOCH : ", epoch, " of ", num_epoch
387 |                 
388 |             if self.set_count_activations:
389 |                 self.activations += win_counts
390 |             
391 |             if self.set_outlier_unit_det:
392 |                 self._update_unit_saliency(win_counts, update_rate, learn_rate)
393 | 
394 |         # get the data from shared theano variable        
395 |         self.W = W.get_value()
396 | 
397 |         # Normalize activation counts
398 |         if self.set_count_activations:
399 |             total_act = self.activations.sum()
400 |             self.activations = self.activations / total_act
401 |         
402 |         self.assing_to_units() # final unit assignments
403 | 
404 |         if self.set_outlier_unit_det:
405 |             self._find_outlier_units()
406 |         
407 |         if self.set_inunit_outlier_det:
408 |             self._find_inunit_outliers()
409 | 
410 | 
411 |     def train_batch(self, num_epoch = None, batch_size = None, verbose=True):
412 |         
413 |         '''
414 |             Numpy version of batch learning
415 |         '''
416 | 
417 |         if num_epoch == None:
418 |             num_epoch = 500 * self.num_units # Kohonen's suggestion
419 |         
420 |         if batch_size ==  None:
421 |             batch_size = self.X.shape[0]
422 |         
423 |         print 'Learning ... '
424 |         U = self.set_params(num_epoch)
425 |         X2 = (self.X**2).sum(1)[:, None]
426 |         for epoch in range(num_epoch):
427 |             print 'Epoch --- ', epoch
428 |             update_rate = U['H_maps'][epoch]
429 |             learn_rate = U['alphas'][epoch]
430 |             # randomize batch order
431 |             shuffle_indices = np.random.permutation(self.X.shape[0])
432 |             win_counts = np.zeros((self.num_units))
433 |             for batch_indices in  np.array_split(shuffle_indices, self.X.shape[0]/batch_size):
434 |                 batch_data = self.X[batch_indices,:]
435 |                 D = self._euq_dist(X2[batch_indices,:], batch_data)
436 |                 BMU = (D==D.min(0)[None,:]).astype("float32").T
437 |                 
438 |                 win_counts += BMU.sum(axis=0)
439 |                 #print win_counts
440 |                 
441 |                 if self.set_count_activations:
442 |                     self.activations += win_counts
443 |                 
444 |                 # batch learning
445 |                 A = np.dot(BMU, update_rate)
446 |                 S = A.sum(0)
447 |                 non_zeros = S.nonzero()[0]
448 |                 self.W[non_zeros, ...] =  np.dot(A[:,non_zeros].T, batch_data) / S[non_zeros][..., None]
449 |                 
450 |                 # normalize weight vector
451 |                 ## Normalization is not imperative unless given input instances are normalized
452 |                 # self.W = self.W / np.linalg.norm(self.W)
453 |                 #self.W = self.W / np.linalg.norm(self.W)
454 |         
455 |             if self.set_outlier_unit_det:
456 |                 self._update_unit_saliency(win_counts, update_rate, learn_rate)
457 |                     
458 |             if verbose and ((epoch % 1) == 0):
459 |                self._print_cost(X2, epoch, num_epoch)
460 | 
461 |         # Normalize activation counts
462 |         if self.set_count_activations:
463 |             total_act = self.activations.sum()
464 |             self.activations = self.activations / total_act
465 |         
466 |         self.assing_to_units() # final unit assignments
467 | 
468 |         if self.set_outlier_unit_det:
469 |             self._find_outlier_units()
470 |         
471 |         if self.set_inunit_outlier_det:
472 |             self._find_inunit_outliers()
473 |             
474 |       
    # Neighborhood helpers. find_neighbors (defined right below) scans a square
    # window around a unit and stores the city-block distance |dy| + |dx| for
    # every unit inside that window.
    # vis_neigh reshapes one entry of a neighborhood map to (height, width).
    # NOTE(review): the lambda body reads `self`, which is not one of its
    # parameters -- confirm how this helper is expected to be called.
    vis_neigh = lambda neigh_map, indx : neigh_map[indx].reshape((self.height, self.width))
478 |     def find_neighbors(self, unit_id, radius):
479 |         neighbors = np.zeros((1,self.num_units))      
480 |         test_neig = np.zeros((self.height, self.width))
481 |         unit_x, unit_y = self.unit_cords(unit_id) 
482 |         
483 |         min_y = max(int(unit_y - radius), 0)
484 |         max_y = min(int(unit_y + radius), self.height-1)
485 |         min_x = max(int(unit_x - radius), 0)
486 |         max_x = min(int(unit_x + radius), self.width-1)
487 |         for y in range(min_y, max_y+1,1):
488 |             for x in range(min_x, max_x+1,1):
489 |                 dist = abs(y-unit_y) + abs(x-unit_x)
490 |                 neighbors[0, x + ( y * self.width )] = dist
491 |                 test_neig[y,x] = dist
492 |         return neighbors
493 |     
494 |     # find BMUs and between-distances for given set of instances
495 |     def best_match(self, X):
496 |         if len(X.shape) == 1:
497 | 
498 |             X = X.reshape((1,2))
499 |         X2 = (self.X**2).sum(1)[:, None]
500 |         D = -2*np.dot(self.W, X.T)[None,:] + (self.W**2).sum(1)[:, None] + X2.T
501 |         BMU = (D==D.min(0)[None,:]).astype("float32").T
502 |         return BMU, D
503 |     
504 |     # structure the unit weight to be shown at U map
505 |     def som_map(self):
506 |         print('Som mapping is being computed...')
507 |         sqrt_weigths = np.reshape(self.W,(self.height, self.width, self.data_dim))
508 |         um = np.zeros((sqrt_weigths.shape[0],sqrt_weigths.shape[1]))
509 |         it = np.nditer(um, flags=['multi_index'])
510 |         while not it.finished:
511 |             for ii in range(it.multi_index[0]-1,it.multi_index[0]+2):
512 |                 for jj in range(it.multi_index[1]-1,it.multi_index[1]+2):
513 |                     if ii >= 0 and ii < sqrt_weigths.shape[0] and jj >= 0 and jj < sqrt_weigths.shape[1]:
514 |                         um[it.multi_index] += np.linalg.norm(sqrt_weigths[ii,jj,:]-sqrt_weigths[it.multi_index])
515 |             it.iternext()
516 |         um = um/um.max()
517 |         print("Mapping finished...!")
518 |         return um  
519 |         
520 | 
521 |     # set the ratio of width and height of the map by the 
522 |     # ratio between largest 2 eigenvalues, computed from data
523 |     def _estimate_map_shape(self):
524 |         #num_instances = self.X.shape[0]
525 |         u,s,v = np.linalg.svd(self.X ,full_matrices = False)
526 |         s_sorted = np.sort(s)[::-1]
527 |         ratio = s_sorted[0] / s_sorted[1]
528 |         self.height = int(min(self.num_units, np.ceil(np.sqrt(self.num_units / ratio))))
529 |         self.width = int(np.ceil(self.num_units / self.height))
530 |         # self.height = int(np.round(np.sqrt(num_instances)))
531 |         # self.width = int(np.round(num_instances / self.height))
532 |         print 'Estimated map size is -> height = ', self.height, ' width = ',self.width 
533 | 
534 |     # assign instances to matching BMUs
535 |     def assing_to_units(self, X=None):
536 |         if X == None:
537 |             X2 = (self.X**2).sum(1)[:, None]
538 |             D = -2*np.dot(self.W, self.X.T) + (self.W**2).sum(1)[:, None] + X2.T
539 | 
540 |             self.ins_unit_assign = D.argmin(axis=0)
541 |             self.ins_unit_dist = D[self.ins_unit_assign, np.arange(self.X.shape[0])]
542 |         else:
543 |             X2 = (X**2).sum(1)[:, None]
544 |             D = -2*np.dot(self.W, X.T) + (self.W**2).sum(1)[:, None] + X2.T
545 |             ins_unit_assign = D.argmin(axis=0)
546 |             ins_unit_dist = D[ins_unit_assign, np.arange(X.shape[0])]
547 |             return ins_unit_assign , ins_unit_dist
548 | 
549 |         
550 |     def find_units_coherence(self):
551 |         
552 |         '''
553 |             Find individually coherence of each unit by looking to avg. distance
554 |             between unit weight and the assigned instances
555 |         '''
556 |         
557 |         self.unit_coher = np.zeros((self.num_units))
558 |         for i in np.unique(self.ins_unit_assign):
559 |             indices = np.where(self.ins_unit_assign == i)
560 |             self.unit_coher[i] = np.sum(self.ins_unit_dist[indices]) / indices[0].size
561 | 
562 |     # return BMU, BMU distance, saliency by already trained params
563 |     def process_new_data(self, X):
564 |         BMU,dist = self.assing_to_units(X)
565 | 
566 |         # find outlier instanes in outlier units
567 |         ins_saliency= np.ones((X.shape[0]), dtype=bool)
568 |         outlier_units = np.where(self.unit_saliency == False)[0]
569 |         for i in outlier_units:
570 |             ins_saliency[np.where(BMU == i)] = False
571 | 
572 |         # find salient unit outliers
573 |         for i in np.unique(BMU):
574 |             indices = np.where(BMU == i)[0]
575 |             unit_thresh = scipy.stats.scoreatpercentile(dist[indices], self.inunit_outlier_thresh)
576 |             outlier_insts = indices[dist[indices] > unit_thresh]
577 |             ins_saliency[outlier_insts] = False;
578 | 
579 |         return BMU, dist, ins_saliency
580 | 
581 | 
582 | 
583 |     def _update_unit_saliency(self, win_counts, update_rate, learn_rate):
584 |         
585 |         '''
586 |             It is called after each epoch of the learning. It compute the 
587 |             unit saliencies with the paper formula. At the end, those values
588 |             defines the outlier and salient units
589 |         '''
590 |         
591 |         excitations = (update_rate * win_counts).sum(axis=0) / learn_rate
592 |         excitations = excitations / excitations.sum()
593 |         single_excitations = win_counts * learn_rate
594 |         single_excitations = single_excitations / single_excitations.sum()
595 |         self.unit_saliency_coeffs += excitations + single_excitations
596 |         
597 |     def _find_outlier_units(self):
598 |         
599 |         '''
600 |             After we compute unit saliencies, this function detects the outlier
601 |             units by the paper heuristic
602 |         '''
603 |         
604 |         # find outlier units
605 |         self.unit_saliency_coeffs /= self.unit_saliency_coeffs.sum()
606 |         self.unit_saliency = self.unit_saliency_coeffs > self.outlier_unit_thresh/self.num_units
607 | 
608 |         # sign outlier instances
609 |         self.inst_saliency = np.ones((self.X.shape[0]), dtype=bool)
610 |         outlier_units = np.where(self.unit_saliency == False)[0]
611 |         for i in outlier_units:
612 |             self.inst_saliency[np.where(self.ins_unit_assign == i)] = False
613 |     
614 |     def _find_inunit_outliers(self):
615 | 
616 |         '''
617 |             Find the poor instances at the salient units. It uses an upper whisker
618 |             assigned to the distances of the unit weight to unit instances. given the threshold,
619 |             outside of the whisker is detedted as outlier.
620 |         '''        
621 |         
622 |         # #remove outlier units
623 | #        int_units = np.array(range(self.num_units))
624 | #        if self.unit_saliency.size > 0 and self.set_inunit_outlier_det:
625 | #            int_units = int_units[self.unit_saliency]
626 |         if self.inst_saliency.size == 0:
627 |             self.inst_saliency = np.ones((self.X.shape[0]), dtype=bool)
628 |             
629 |         for i in np.unique(self.ins_unit_assign):
630 |             indices = np.where(self.ins_unit_assign == i)[0]
631 |             unit_thresh = scipy.stats.scoreatpercentile(self.ins_unit_dist[indices], self.inunit_outlier_thresh)
632 |             outlier_insts = indices[self.ins_unit_dist[indices] > unit_thresh]
633 |             self.inst_saliency[outlier_insts] = False;
634 | 
635 |     # Returns indices of salient instances
636 |     def salient_inst_index(self):
637 |         return np.where(self.inst_saliency == True)[0]
638 | 
639 |     def salient_unit_index(self):
640 |         return np.where(self.unit_saliency == True)[0]  
641 |         
642 |     def salient_insts(self):
643 |         return self.X[np.where(self.inst_saliency == True)]
644 | 
645 |     def salient_units(self):
646 |         return self.W[np.where(self.unit_saliency == True)]
647 | 
648 |     ## Returns instance to unit mapping. First row is instances.
649 |     def inst_to_unit_mapping(self):
650 |         return np.concatenate((np.arange(self.X.shape[0])[None,:], self.ins_unit_assign[None, :]))
651 | 
652 |     def salient_inst_to_unit_mapping(self):
653 |         mapping = self.inst_to_unit_mapping()
654 |         
655 | 
656 |     def _norm_data(self, X = None):
657 |         
658 |         '''
659 |             Take the norm of the given data matrix and save std and mean 
660 |             for future purposes
661 |         '''
662 |         
663 |         if X == None:
664 |             self.data_mean =  self.X.mean(axis=0)
665 |             self.data_std  =  self.X.std(axis=0, ddof=1)
666 |             self.X = (self.X - self.data_mean) / (self.data_std  + EPS)
667 |         else:
668 |             data_mean =  X.mean(axis=0)
669 |             data_std  =  X.std(axis=0, ddof=1)
670 |             X = (X - data_mean) / data_std
671 |             return X, data_mean, data_std
672 |             
673 |    
674 | '''
675 | DEMO CODE
676 | '''
if __name__ == "__main__":
    # Demo: train a 10x10 map on the scikit-learn digits data and print the
    # resulting unit saliency flags.
    from sklearn import datasets
    import time

    data = datasets.load_digits().data

    som = SOM(DATA = data, alpha_max=0.05, num_units=100, height = 10, width = 10)
    # Other training calls left disabled by the original demo:
    #som.train_batch(100)
    #som.train_stoch_theano(10)
    #som.train_stoch(10)
    # Timing: wrap the training call with start = time.time() / stop = time.time()
    som.train_batch_theano(num_epoch=100)

    # single-argument print() behaves the same on Python 2 and Python 3
    print(som.unit_saliency)

    # Plotting calls left disabled by the original demo:
    #som_plot_scatter(som.W, som.X, som.activations)
    #som_plot_outlier_scatter(som.W, som.X, som.unit_saliency, som.inst_saliency, som.activations)
    #som_mapping = som.som_map()
    #som_plot_mapping(som_mapping)
    print("Demo finished!")
701 |    


--------------------------------------------------------------------------------