├── examples
│   ├── gallery
│   │   ├── horsey.png
│   │   ├── hyperspace.png
│   │   ├── topo-galaxy.png
│   │   └── dimensionality-canvas.png
│   ├── cat
│   │   ├── cat-reference-viz.png
│   │   ├── cat-reference.csv.png
│   │   ├── cat.py
│   │   └── cat_keplermapper_output.html
│   ├── horse
│   │   ├── horse-reference.png
│   │   ├── horse-reference.csv.png
│   │   ├── horse.py
│   │   └── horse_keplermapper_output.html
│   ├── lion
│   │   ├── lion-reference-viz.png
│   │   ├── lion-reference.csv.png
│   │   ├── lion.py
│   │   └── lion_keplermapper_output.html
│   ├── makecircles
│   │   ├── make_circles_xaxis.png
│   │   ├── make_circles_distmean.png
│   │   ├── make_circles_raw_dimensions.png
│   │   ├── make_circles_xaxis.py
│   │   ├── make_circles_distmean.py
│   │   ├── keplermapper-makecircles-xaxis.html
│   │   └── keplermapper-makecircles-distmean.html
│   ├── digits
│   │   ├── digits-tsne-custom-tooltip.png
│   │   ├── digits-tsne-custom-tooltip-mnist.png
│   │   ├── digits.py
│   │   └── keplermapper_digits_ylabel_tooltips.html
│   └── readme.md
├── .gitattributes
├── .gitignore
├── disclaimer.txt
├── license.txt
├── todo.txt
├── release.txt
├── readme.md
├── depricated
│   └── km.py
└── km.py
/examples/gallery/horsey.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lmcinnes/kepler-mapper/master/examples/gallery/horsey.png
--------------------------------------------------------------------------------
/examples/gallery/hyperspace.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lmcinnes/kepler-mapper/master/examples/gallery/hyperspace.png
--------------------------------------------------------------------------------
/examples/cat/cat-reference-viz.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lmcinnes/kepler-mapper/master/examples/cat/cat-reference-viz.png
--------------------------------------------------------------------------------
/examples/cat/cat-reference.csv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lmcinnes/kepler-mapper/master/examples/cat/cat-reference.csv.png
--------------------------------------------------------------------------------
/examples/gallery/topo-galaxy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lmcinnes/kepler-mapper/master/examples/gallery/topo-galaxy.png
--------------------------------------------------------------------------------
/examples/horse/horse-reference.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lmcinnes/kepler-mapper/master/examples/horse/horse-reference.png
--------------------------------------------------------------------------------
/examples/lion/lion-reference-viz.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lmcinnes/kepler-mapper/master/examples/lion/lion-reference-viz.png
--------------------------------------------------------------------------------
/examples/lion/lion-reference.csv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lmcinnes/kepler-mapper/master/examples/lion/lion-reference.csv.png
--------------------------------------------------------------------------------
/examples/horse/horse-reference.csv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lmcinnes/kepler-mapper/master/examples/horse/horse-reference.csv.png
--------------------------------------------------------------------------------
/examples/gallery/dimensionality-canvas.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lmcinnes/kepler-mapper/master/examples/gallery/dimensionality-canvas.png
--------------------------------------------------------------------------------
/examples/makecircles/make_circles_xaxis.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lmcinnes/kepler-mapper/master/examples/makecircles/make_circles_xaxis.png
--------------------------------------------------------------------------------
/examples/digits/digits-tsne-custom-tooltip.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lmcinnes/kepler-mapper/master/examples/digits/digits-tsne-custom-tooltip.png
--------------------------------------------------------------------------------
/examples/makecircles/make_circles_distmean.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lmcinnes/kepler-mapper/master/examples/makecircles/make_circles_distmean.png
--------------------------------------------------------------------------------
/examples/digits/digits-tsne-custom-tooltip-mnist.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lmcinnes/kepler-mapper/master/examples/digits/digits-tsne-custom-tooltip-mnist.png
--------------------------------------------------------------------------------
/examples/makecircles/make_circles_raw_dimensions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lmcinnes/kepler-mapper/master/examples/makecircles/make_circles_raw_dimensions.png
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
4 | # Custom for Visual Studio
5 | *.cs diff=csharp
6 |
7 | # Standard to msysgit
8 | *.doc diff=astextplain
9 | *.DOC diff=astextplain
10 | *.docx diff=astextplain
11 | *.DOCX diff=astextplain
12 | *.dot diff=astextplain
13 | *.DOT diff=astextplain
14 | *.pdf diff=astextplain
15 | *.PDF diff=astextplain
16 | *.rtf diff=astextplain
17 | *.RTF diff=astextplain
18 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Windows image file caches
2 | Thumbs.db
3 | ehthumbs.db
4 |
5 | # Folder config file
6 | Desktop.ini
7 |
8 | # Recycle Bin used on file shares
9 | $RECYCLE.BIN/
10 |
11 | # Windows Installer files
12 | *.cab
13 | *.msi
14 | *.msm
15 | *.msp
16 |
17 | # Windows shortcuts
18 | *.lnk
19 |
20 | # =========================
21 | # Operating System Files
22 | # =========================
23 |
24 | # OSX
25 | # =========================
26 |
27 | .DS_Store
28 | .AppleDouble
29 | .LSOverride
30 |
31 | # Thumbnails
32 | ._*
33 |
34 | # Files that might appear on external disk
35 | .Spotlight-V100
36 | .Trashes
37 |
38 | # Directories potentially created on remote AFP share
39 | .AppleDB
40 | .AppleDesktop
41 | Network Trash Folder
42 | Temporary Items
43 | .apdisk
44 |
--------------------------------------------------------------------------------
/examples/cat/cat.py:
--------------------------------------------------------------------------------
1 | import km
2 |
3 | data = km.np.genfromtxt('cat-reference.csv',delimiter=',')
4 |
5 | mapper = km.KeplerMapper(cluster_algorithm=km.cluster.DBSCAN(eps=0.1, min_samples=5), nr_cubes=10, overlap_perc=0.8, verbose=1)
6 |
7 | mapper.fit(data)
8 |
9 | complex = mapper.map(data, dimension_index=1, dimension_name="Y-axis")
10 |
11 | mapper.visualize(complex, "cat_keplermapper_output.html", "cat-reference.csv")
12 |
13 | # You may want to visualize the original point cloud data in 3D scatter too
14 | """
15 | import matplotlib.pyplot as plt
16 | from mpl_toolkits.mplot3d import Axes3D
17 |
18 | fig = plt.figure()
19 | ax = fig.add_subplot(111, projection='3d')
20 | ax.scatter(data[:,0],data[:,1],data[:,2])
21 | plt.savefig("cat-reference.csv.png")
22 | plt.show()
23 | """
--------------------------------------------------------------------------------
/examples/lion/lion.py:
--------------------------------------------------------------------------------
1 | import km
2 |
3 | data = km.np.genfromtxt('lion-reference.csv',delimiter=',')
4 |
5 | mapper = km.KeplerMapper(cluster_algorithm=km.cluster.DBSCAN(eps=0.1, min_samples=5), nr_cubes=10, overlap_perc=0.8, verbose=1)
6 |
7 | mapper.fit(data)
8 |
9 | complex = mapper.map(data, dimension_index=1, dimension_name="Y-axis")
10 |
11 | mapper.visualize(complex, "lion_keplermapper_output.html", "lion-reference.csv")
12 |
13 | # You may want to visualize the original point cloud data in 3D scatter too
14 | """
15 | import matplotlib.pyplot as plt
16 | from mpl_toolkits.mplot3d import Axes3D
17 |
18 | fig = plt.figure()
19 | ax = fig.add_subplot(111, projection='3d')
20 | ax.scatter(data[:,0],data[:,1],data[:,2])
21 | plt.savefig("lion-reference.csv.png")
22 | plt.show()
23 | """
--------------------------------------------------------------------------------
/examples/horse/horse.py:
--------------------------------------------------------------------------------
1 | import km
2 |
3 | data = km.np.genfromtxt('horse-reference.csv',delimiter=',')
4 |
5 | mapper = km.KeplerMapper(cluster_algorithm=km.cluster.DBSCAN(eps=0.3, min_samples=3), nr_cubes=25, link_local=False, overlap_perc=0.7, verbose=1)
6 |
7 | mapper.fit(data)
8 |
9 | complex = mapper.map(data, dimension_index=1, dimension_name="Y-axis")
10 |
11 | mapper.visualize(complex, "horse_keplermapper_output.html", "horse-reference.csv")
12 |
13 | # You may want to visualize the original point cloud data in 3D scatter too
14 | """
15 | import matplotlib.pyplot as plt
16 | from mpl_toolkits.mplot3d import Axes3D
17 |
18 | fig = plt.figure()
19 | ax = fig.add_subplot(111, projection='3d')
20 | ax.scatter(data[:,0],data[:,1],data[:,2])
21 | plt.savefig("horse-reference.csv.png")
22 | plt.show()
23 | """
--------------------------------------------------------------------------------
/examples/makecircles/make_circles_xaxis.py:
--------------------------------------------------------------------------------
1 | import km
2 |
3 | # Make fairly noisy circles
4 | from sklearn import datasets
5 | data, labels = datasets.make_circles(n_samples=5000, noise=0.05, factor=0.3)
6 |
7 | # Initialize
8 | mapper = km.KeplerMapper(verbose=1)
9 |
10 | # Fit to and transform the data
11 | projected_data = mapper.fit_transform(data, projection=[0])
12 |
13 | # Create dictionary called 'complex' with nodes, edges and meta-information
14 | complex = mapper.map(projected_X=projected_data, inverse_X=data,
15 | clusterer=km.cluster.DBSCAN(eps=0.1, min_samples=10),
16 | nr_cubes=20, overlap_perc=0.1)
17 |
18 | # Visualize it
19 | mapper.visualize(complex, path_html="keplermapper-makecircles-xaxis.html",
20 | title="datasets.make_circles(n_samples=5000, noise=0.05, factor=0.3)",
21 | custom_tooltips=labels, color_function="average_signal_cluster")
--------------------------------------------------------------------------------
/examples/makecircles/make_circles_distmean.py:
--------------------------------------------------------------------------------
1 | import km
2 |
3 | # Make very noisy circles
4 | from sklearn import datasets
5 | data, labels = datasets.make_circles(n_samples=5000, noise=0.3, factor=0.3)
6 |
7 | # Initialize
8 | mapper = km.KeplerMapper(verbose=1)
9 |
10 | # Fit to and transform the data
11 | projected_data = mapper.fit_transform(data, projection="dist_mean")
12 |
13 | # Create dictionary called 'complex' with nodes, edges and meta-information
14 | complex = mapper.map(projected_X=projected_data, inverse_X=data,
15 | clusterer=km.cluster.DBSCAN(eps=0.9, min_samples=3),
16 | nr_cubes=30, overlap_perc=0.7)
17 |
18 | # Visualize it
19 | mapper.visualize(complex, path_html="keplermapper-makecircles-distmean.html",
20 | title="datasets.make_circles(n_samples=5000, noise=0.3, factor=0.3)",
21 | custom_tooltips=labels, color_function="average_signal_cluster",
22 | graph_gravity=0.03, graph_link_distance=30, graph_charge=-80)
--------------------------------------------------------------------------------
/disclaimer.txt:
--------------------------------------------------------------------------------
1 | DISCLAIMER:
2 | This code was written for self-study and is not production-ready. It is very
3 | much a work in progress.
4 |
5 | This early version is released, because it may be useful to other researchers
6 | and students starting out in topological data science, and to motivate the
7 | author to keep working on it.
8 |
9 | The author makes no guarantee that this code is:
10 | - correct,
11 | - the right way to do things,
12 | - true to the original MAPPER algorithm
13 | - ready for production
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
--------------------------------------------------------------------------------
/license.txt:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2015 Triskelion - HJ van Veen - info@mlwave.com
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
--------------------------------------------------------------------------------
/todo.txt:
--------------------------------------------------------------------------------
1 | TODO:
2 | Use CSS animations for the div nodes to remove the "jittery" movement.
3 | Add a curvature to the edges
4 | Add member expand function on node click
5 | Style edge width according to the size of set of shared members
6 | Add a bounding box for the force-directed graph
7 | Implement better and more color functions. Also support for custom color functions.
8 | Implement better and more node size functions.
9 | Make better suited to model introspection (visualize out-of-fold predictions performance and feature interactions)
10 | Implement cheap dimensionality reduction for extremely high dimensional data (RandomizedPCA)
11 | Support for online mapping (Column-Oriented datastore https://users.soe.ucsc.edu/~niejiazhong/slides/chandra.pdf)
12 | Add samples to showcase fundamental data shapes: Regression, Clusters, Circular/looping, Y-flares.
13 | Create multi-view to compare graphs
14 | Add an example with a custom (scikit-learn API compatible) clusterer
15 | Write tests
16 | Better support for IDs
17 | Add novel neural gas mapping technique
18 | Turn into an API. Visualizer output to localhost can ask for new json to render graphs with.
19 | Add sliders for on-page graph settings (redraw with a new Gravity or LinkDistance)
20 | Proper accessible HTML output
21 | Better complex["meta"] information
22 | Create distribution
23 | Refactor
--------------------------------------------------------------------------------
/examples/readme.md:
--------------------------------------------------------------------------------
1 | # Examples
2 |
3 | ## Horse
4 |
5 | In the horse directory you'll find an example of creating simplicial complexes from point cloud data.
6 |
7 | `horse-reference.csv` is courtesy of Python Mapper (http://danifold.net/mapper/index.html)
8 |
9 | 
10 |
11 | ## Lion
12 |
13 | In the `lion` directory you'll find an example of creating simplicial complexes from point cloud data.
14 |
15 | `lion-reference.csv` is courtesy of Python Mapper (http://danifold.net/mapper/index.html)
16 |
17 | 
18 |
19 | ## Cat
20 |
21 | In the `cat` directory you'll find an example of creating simplicial complexes from point cloud data.
22 |
23 | `cat-reference.csv` is courtesy of Python Mapper (http://danifold.net/mapper/index.html)
24 |
25 | 
26 |
27 | ## Linking locally
28 |
29 | With `link_local=True` you can link up the local clusters (clusters in the same hypercube). Below
30 | you can see a low-resolution house cat on the left *without local linking* and *with local linking* on
31 | the right.
32 |
33 | 
34 |
35 | ## Digits
36 |
37 | In the `digits` directory you'll find an example of using the `custom_tooltips`-functionality. It also
38 | shows the usage of `reducer` with t-SNE to reduce the dimensionality to 2.
39 |
40 | 
41 |
42 | Note how sub-clusters form for digits with a different slant.
43 |
44 | ## Gallery
45 |
46 | Check the `gallery` for a variety of images.
47 |
48 | ## Soon
49 |
50 | Iris, Diabetes 100k, Customer purchase behaviour.
--------------------------------------------------------------------------------
/examples/digits/digits.py:
--------------------------------------------------------------------------------
1 | import km
2 |
3 | # Load digits data
4 | from sklearn import datasets
5 | data, labels = datasets.load_digits().data, datasets.load_digits().target
6 |
7 | # Create images for a custom tooltip array
8 | import StringIO
9 | from scipy.misc import imsave, toimage
10 | import base64
11 | tooltip_s = []
12 | for image_data in data:
13 | output = StringIO.StringIO()
14 | img = toimage(image_data.reshape((8,8))) # Data was a flat row of 64 "pixels".
15 | img.save(output, format="PNG")
16 | contents = output.getvalue()
17 | tooltip_s.append( """ """%base64.b64encode(contents).replace("\n","") )
18 | output.close()
19 |
20 | tooltip_s = km.np.array(tooltip_s) # need to make sure to feed it as a NumPy array, not a list
21 |
22 | # Initialize to use t-SNE with 2 components (reduces data to 2 dimensions). Also note high overlap_percentage.
23 | mapper = km.KeplerMapper(cluster_algorithm=km.cluster.DBSCAN(eps=0.3, min_samples=15),
24 | reducer = km.manifold.TSNE(), nr_cubes=35, overlap_perc=0.9,
25 | link_local=False, verbose=2)
26 |
27 | # Fit and transform data
28 | data = mapper.fit_transform(data)
29 |
30 | # Create the graph
31 | complex = mapper.map(data, dimension_index=[0,1], dimension_name="t-SNE(2) 2D")
32 |
33 | # Create the visualizations (increased the graph_gravity for a tighter graph-look.)
34 |
35 | # Tooltips with image data for every cluster member
36 | mapper.visualize(complex, "keplermapper_digits_custom_tooltips.html", "Digits", graph_gravity=0.25, custom_tooltips=tooltip_s)
37 | # Tooltips with the target y-labels for every cluster member
38 | mapper.visualize(complex, "keplermapper_digits_ylabel_tooltips.html", "Digits", graph_gravity=0.25, custom_tooltips=labels)
--------------------------------------------------------------------------------
/release.txt:
--------------------------------------------------------------------------------
1 | Pre-alpha
2 |
3 | v00001
4 | Wrote class
5 | Wrote documentation
6 | Added license
7 |
8 | v00002
9 | Added a multi-dimensional mode: use all dimensions.
10 | Added case study: 3D point cloud data for animals
11 | Added case study: Make circles
12 | Added advanced parameters for graph layout settings. Should probably be sliders on the .html page itself.
13 | Improved documentation
14 | Added disclaimer
15 | Added todo
16 | Added release log
17 |
18 | v00003
19 | Refactored dimension index to use a list of arbitrary dimensions
20 | Improved verbosity
21 | Added levels of verbosity
22 | Decreased number of code lines by using a single approach
23 | Added sample to explain local linkage True vs. False
24 | Added side-view for animal point-cloud data
25 | Added a gallery in the example directory
26 |
27 | v00004
28 | Added dimensionality reduction
29 | Added "digits" case study
30 | changed fit to fit_transform and return of data
31 | added tooltips
32 | added support for custom tooltips
33 |
34 | Alpha
35 |
36 | v00005
37 | Made Python 3 compatible
38 | Ability to turn off title, meta and tooltips
39 | Ability to set the window height and width of HTML output
40 | Added basic support for another color function: average signal
41 | De-emphasized link_local functionality, since its current implementation is no good.
42 |
43 | v00006
44 | Removed link_local functionality
45 | Halved the number of edges drawn (no two-way edges)
46 | Added support for clustering on the inverse image
47 | Refactored code (see updated documentation)
48 | Added code comments
49 | Added feature to use reducers/manifold learning/dimensions and stat functions
50 | Added 7 projections/lenses from statistics
51 |
--------------------------------------------------------------------------------
/examples/cat/cat_keplermapper_output.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
16 |
30 |
31 |
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # KeplerMapper
2 |
3 | > Nature uses as little as possible of anything. - Johannes Kepler
4 |
5 | This is a class containing a mapping algorithm in Python. KeplerMapper can be used for
6 | visualization of high-dimensional data and 3D point cloud data.
7 |
8 | KeplerMapper employs approaches based on the MAPPER algorithm (Singh et al.) as first
9 | described in the paper "Topological Methods for the Analysis of High Dimensional
10 | Data Sets and 3D Object Recognition".
11 |
12 | KeplerMapper can make use of Scikit-Learn API compatible cluster and scaling algorithms.
13 |
14 | ## Usage
15 |
16 | ### Python code
17 | ```python
18 | # Import the class
19 | import km
20 |
21 | # Some sample data
22 | from sklearn import datasets
23 | data, labels = datasets.make_circles(n_samples=5000, noise=0.03, factor=0.3)
24 |
25 | # Initialize
26 | mapper = km.KeplerMapper(verbose=1)
27 |
28 | # Fit to and transform the data
29 | projected_data = mapper.fit_transform(data, projection=[0,1]) # X-Y axis
30 |
31 | # Create dictionary called 'complex' with nodes, edges and meta-information
32 | complex = mapper.map(projected_data, data, nr_cubes=10)
33 |
34 | # Visualize it
35 | mapper.visualize(complex, path_html="make_circles_keplermapper_output.html",
36 | title="make_circles(n_samples=5000, noise=0.03, factor=0.3)")
37 | ```
38 |
39 | ### Console output
40 | ```
41 | ..Projecting data using: [0, 1]
42 |
43 | ..Scaling with: MinMaxScaler(copy=True, feature_range=(0, 1))
44 |
45 | Mapping on data shaped (5000L, 2L) using dimensions
46 |
47 | Creating 1000 hypercubes.
48 |
49 | created 86 edges and 57 nodes in 0:00:03.614000.
50 |
51 | Wrote d3.js graph to 'make_circles_keplermapper_output.html'
52 | ```
53 |
54 | ### Visualization output
55 |
56 | 
57 |
58 | Click here for an [interactive version](http://mlwave.github.io/tda/make_circles_keplermapper_output2.html).
59 | Click here for an older [interactive version](http://mlwave.github.io/tda/make_circles_keplermapper_output.html).
60 |
61 | ## Install
62 |
63 | The class is currently just one file. Simply dropping it in any directory which Python is able to import from should work.
64 |
65 | ## Required
66 |
67 | These libraries are required to be installed for KeplerMapper to work:
68 |
69 | * NumPy
70 | * Scikit-Learn
71 |
72 | KeplerMapper works on both Python 2.7 and Python 3+.
73 |
74 | ## External resources
75 |
76 | These resources are loaded by the visualization output.
77 |
78 | * Roboto Webfont (Google)
79 | * D3.js (Mike Bostock)
80 |
81 | ## Parameters
82 |
83 | ### Initialize
84 |
85 | ```python
86 | mapper = km.KeplerMapper(verbose=1)
87 | ```
88 |
89 | Parameter | Description
90 | --- | ---
91 | verbose | Int. Verbosity of the mapper. *Default = 0*
92 |
93 | ### Fitting and transforming
94 | Input the data set. Specify a projection/lens type. Output the projected data/lens.
95 |
96 | ```python
97 | projected_data = mapper.fit_transform(data, projection="sum",
98 | scaler=km.preprocessing.MinMaxScaler() )
99 | ```
100 |
101 | Parameter | Description
102 | --- | ---
103 | data | Numpy Array. The data to fit a projection/lens to. *Required*
104 | projection | Any of: list with dimension indices. Scikit-learn API compatible manifold learner or dimensionality reducer. A string from ["sum","mean","median","max","min","std","dist_mean"]. *Default = "sum"*
105 | scaler | Scikit-Learn API compatible scaler. Scaler of the data applied before mapping. Use `None` for no scaling. *Default = preprocessing.MinMaxScaler()*
106 |
107 | ### Mapping
108 |
109 | ```python
110 | topological_network = mapper.map(projected_X, inverse_X=None,
111 | clusterer=cluster.DBSCAN(eps=0.5,min_samples=3),
112 | nr_cubes=10, overlap_perc=0.1)
113 |
114 | print(topological_network["nodes"])
115 | print(topological_network["links"])
116 | print(topological_network["meta"])
117 | ```
118 |
119 | Parameter | Description
120 | --- | ---
121 | projected_X | Numpy array. Output from fit_transform. *Required*
122 | inverse_X | Numpy array or `None`. When `None`, cluster on the projection, else cluster on the original data (inverse image).
123 | clusterer | Scikit-Learn API compatible clustering algorithm. The clustering algorithm to use for mapping. *Default = cluster.DBSCAN(eps=0.5,min_samples=3)*
124 | nr_cubes | Int. The number of cubes/intervals to create. *Default = 10*
125 | overlap_perc | Float. How much the cubes/intervals overlap (relevant for creating the edges). *Default = 0.1*
126 |
127 | ### Visualizing
128 |
129 | ```python
130 | mapper.visualize(topological_network,
131 | path_html="mapper_visualization_output.html")
132 | ```
133 |
134 | Parameter | Description
135 | --- | ---
136 | topological_network | Dict. The `topological_network`-dictionary with nodes, edges and meta-information. *Required*
137 | path_html | File path. Path where to output the .html file *Default = mapper_visualization_output.html*
138 | title | String. Document title for use in the outputted .html. *Default = "My Data"*
139 | graph_link_distance | Int. Global length of links between nodes. Use less for larger graphs. *Default = 30*
140 | graph_charge | Int. The charge between nodes. Use less negative charge for larger graphs. *Default = -120*
141 | graph_gravity | Float. A weak geometric constraint similar to a virtual spring connecting each node to the center of the layout's size. Don't set it to negative or it's turtles all the way up. *Default = 0.1*
142 | custom_tooltips | NumPy Array. Create custom tooltips for all the node members. You could use the target labels `y` for this. Use `None` for standard tooltips. *Default = None*.
143 | show_title | Bool. Whether to show the title. *Default = True*
144 | show_meta | Bool. Whether to show meta information, like the overlap percentage and the clusterer used. *Default = True*
145 | show_tooltips | Bool. Whether to show the tooltips on hover. *Default = True*
146 | width_html | Int. Size in pixels of the graph canvas width. *Default = 0 (full screen width)*
147 | height_html | Int. Size in pixels of the graph canvas height. *Default = 0 (full screen height)*
148 |
149 | ## Examples
150 |
151 | ### 3D-point cloud
152 |
153 | Check the `examples` directory for more.
154 |
155 | 
156 |
157 | ### Very noisy datasets
158 |
159 | Check the `examples/makecircles` directory for code
160 |
161 | 
162 |
163 | ### Dimensionality reduction
164 |
165 | t-SNE on 4K images of MNIST dataset.
166 |
167 | 
168 |
169 | ## References
170 |
171 | > Mapper Algorithm
172 | > "Topological Methods for the Analysis of High Dimensional Data Sets and 3D Object Recognition"
173 | > Gurjeet Singh, Facundo Mémoli, and Gunnar Carlsson
174 |
175 | http://www.ayasdi.com/wp-content/uploads/2015/02/Topological_Methods_for_the_Analysis_of_High_Dimensional_Data_Sets_and_3D_Object_Recognition.pdf
176 |
177 | > Topological Data Analysis
178 | > Stanford Seminar. "Topological Data Analysis: How Ayasdi used TDA to Solve Complex Problems"
179 | > SF Data Mining. "Shape and Meaning."
180 | > Anthony Bak
181 |
182 | https://www.youtube.com/watch?v=x3Hl85OBuc0
183 | https://www.youtube.com/watch?v=4RNpuZydlKY
184 |
185 | > Projection vs. Inverse image & Examples
186 | > MLconf ATL. Topological Learning with Ayasdi
187 | > Allison Gilmore
188 |
189 | https://www.youtube.com/watch?v=cJ8W0ASsnp0
190 |
191 | > The shape of data
192 | > "Conference Talk. The shape of data"
193 | > Topology and Data
194 | > Gunnar Carlsson
195 |
196 | https://www.youtube.com/watch?v=kctyag2Xi8o
197 | http://www.ams.org/images/carlsson-notes.pdf
198 |
199 | > Business Value, Problems, Algorithms, Computation and User Experience of TDA
200 | > Data Driven NYC. "Making Data Work"
201 | > Gurjeet Singh
202 |
203 | https://www.youtube.com/watch?v=UZH5xJXJG2I
204 |
205 | > Implementation details and sample data
206 | > Python Mapper
207 | > Daniel Müllner and Aravindakshan Babu
208 |
209 | http://danifold.net/mapper/index.html
210 |
211 | > Applied Topology
212 | > "Elementary Applied Topology"
213 | > R. Ghrist
214 |
215 | https://www.math.upenn.edu/~ghrist/notes.html
216 |
217 | > Applied Topology
218 | > "Qualitative data analysis"
219 | > Community effort
220 |
221 | http://appliedtopology.org/
222 |
223 | > Single Linkage Clustering
224 | > "Minimum Spanning Trees and Single Linkage Cluster Analysis"
225 | > J. C. Gower, and G. J. S. Ross
226 |
227 | http://www.cs.ucsb.edu/~veronika/MAE/mstSingleLinkage_GowerRoss_1969.pdf
228 |
229 | > Clustering and Manifold Learning
230 | > Scikit-learn: Machine Learning in Python
231 | > Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P. and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.
232 |
233 | http://scikit-learn.org/stable/modules/clustering.html
234 | http://scikit-learn.org/stable/modules/manifold.html
235 |
236 | > Force-directed Graphing/Clustering
237 | > Force-directed Graphs
238 | > Mike Bostock, Tim Dwyer, Thomas Jakobsen
239 |
240 | http://bl.ocks.org/mbostock/4062045
241 |
242 | > Graphing
243 | > Grapher
244 | > Cindy Zhang, Danny Cochran, Diana Suvorova, Curtis Mitchell
245 |
246 | https://github.com/ayasdi/grapher
247 |
248 | > Color scales
249 | > "Creating A Custom Hot to Cold Temperature Color Gradient for use with RRDTool"
250 | > Dale Reagan
251 |
252 | http://web-tech.ga-usa.com/2012/05/creating-a-custom-hot-to-cold-temperature-color-gradient-for-use-with-rrdtool/
253 |
254 | > Design
255 | > Material Design
256 | > Google
257 |
258 | https://design.google.com/
259 |
260 | > Design
261 | > Ayasdi Core Product Screenshots
262 | > Ayasdi
263 |
264 | http://www.ayasdi.com/product/core/
265 |
266 | ## Disclaimer
267 |
268 | See disclaimer.txt for more. Basically this is a work in progress to familiarize myself with topological data analysis. The details of the algorithm implementations may be lacking. I'll gladly accept feedback and pull requests to make it more robust. You can contact me at info@mlwave.com or by opening an issue.
--------------------------------------------------------------------------------
/depricated/km.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | import numpy as np
3 | from collections import defaultdict
4 | import json
5 | import itertools
6 | from sklearn import cluster, preprocessing, manifold
7 | from datetime import datetime
8 | import sys
9 |
class KeplerMapper(object):
    """Deprecated Mapper implementation (superseded by /km.py)."""

    # Sentinel: distinguishes "argument not passed" from an explicit value
    # (None is meaningful for `scaler`, meaning "do not scale").
    _DEFAULT = object()

    def __init__(self, cluster_algorithm=_DEFAULT, nr_cubes=10,
                 overlap_perc=0.1, scaler=_DEFAULT, reducer=None, color_function="distance_origin",
                 link_local=False, verbose=1):
        """Configure the mapper.

        Parameters
        ----------
        cluster_algorithm : scikit-learn style clusterer exposing fit() and labels_.
            Default: DBSCAN(eps=0.5, min_samples=3), built per instance.
        nr_cubes : int, number of intervals/hypercubes per dimension.
        overlap_perc : float, overlap fraction between neighbouring cubes.
        scaler : scikit-learn style scaler, or None to disable scaling.
            Default: MinMaxScaler(), built per instance.
        reducer : optional dimensionality reducer with fit_transform().
        color_function : str, node coloring strategy used by visualize().
        link_local : bool, experimental extra edges between local clusters.
        verbose : int, 0 = silent, higher = chattier.
        """
        # Build the defaults lazily so every instance gets its OWN clusterer
        # and scaler. Evaluating them in the signature would create a single
        # shared, stateful object reused by every instance (the classic
        # mutable-default-argument pitfall).
        if cluster_algorithm is KeplerMapper._DEFAULT:
            cluster_algorithm = cluster.DBSCAN(eps=0.5, min_samples=3)
        if scaler is KeplerMapper._DEFAULT:
            scaler = preprocessing.MinMaxScaler()

        self.clf = cluster_algorithm
        self.nr_cubes = nr_cubes
        self.overlap_perc = overlap_perc
        self.scaler = scaler
        self.color_function = color_function
        self.verbose = verbose
        self.link_local = link_local
        self.reducer = reducer

        # Cover geometry -- populated by fit_transform()
        self.chunk_dist = []
        self.overlap_dist = []
        self.d = []

        if self.verbose > 0:
            print("\nnr_cubes = %s \n\noverlap_perc = %s\n\nlink_local = %s\n\nClusterer = %s\n\nScaler = %s\n\n"%(self.nr_cubes, overlap_perc, self.link_local, str(self.clf),str(self.scaler)))
29 |
30 | def fit_transform(self, X):
31 | # Dimensionality Reduction
32 | if self.reducer != None:
33 | if self.verbose > 0:
34 | try:
35 | self.reducer.set_params(**{"verbose":self.verbose})
36 | except:
37 | pass
38 | print("\n..Reducing Dimensionality using: \n\t%s\n"%str(self.reducer))
39 |
40 | reducer = self.reducer
41 | X = reducer.fit_transform(X)
42 |
43 | # Scaling
44 | if self.scaler != None:
45 | if self.verbose > 0:
46 | print("\n..Scaling\n")
47 | scaler = self.scaler
48 | X = scaler.fit_transform(X)
49 |
50 | # We chop up the min-max column ranges into 'nr_cubes' parts
51 | self.chunk_dist = (np.max(X, axis=0) - np.min(X, axis=0))/self.nr_cubes
52 |
53 | # We calculate the overlapping windows distance
54 | self.overlap_dist = self.overlap_perc * self.chunk_dist
55 |
56 | # We find our starting point
57 | self.d = np.min(X, axis=0)
58 |
59 | return X
60 |
61 | def map(self, X, dimension_index=[0], dimension_name=""):
62 | # This maps the data to a simplicial complex. Returns a dictionary with nodes and links.
63 |
64 | start = datetime.now()
65 |
66 | def cube_coordinates_all(nr_cubes, nr_dimensions):
67 | # if there are 4 cubes per dimension and 3 dimensions
68 | # return the bottom left (origin) coordinates of 64 hypercubes, in a sorted list of Numpy arrays
69 | l = []
70 | for x in range(nr_cubes):
71 | l += [x] * nr_dimensions
72 | return [np.array(list(f)) for f in sorted(set(itertools.permutations(l,nr_dimensions)))]
73 |
74 | nodes = defaultdict(list)
75 | links = defaultdict(list)
76 | complex = {}
77 |
78 | if self.verbose > 0:
79 | print("Mapping on data shaped %s using dimensions %s\n"%(str(X.shape),str(dimension_index)))
80 |
81 | # Scaling
82 | if self.scaler != None:
83 | scaler = self.scaler
84 | X = scaler.fit_transform(X)
85 |
86 | # Initialize Cluster Algorithm
87 | clf = self.clf
88 |
89 | # Prefix'ing the data with ID's
90 | ids = np.array([x for x in range(X.shape[0])])
91 | X = np.c_[ids,X]
92 |
93 | # Subdivide the data X in intervals/hypercubes with overlap
94 | if self.verbose > 0:
95 | total_cubes = len(cube_coordinates_all(self.nr_cubes,len(dimension_index)))
96 | print("Creating %s hypercubes."%total_cubes)
97 | di = np.array(dimension_index)
98 | for i, coor in enumerate(cube_coordinates_all(self.nr_cubes,di.shape[0])):
99 | # Slice the hypercube
100 | hypercube = X[ np.invert(np.any((X[:,di+1] >= self.d[di] + (coor * self.chunk_dist[di])) &
101 | (X[:,di+1] < self.d[di] + (coor * self.chunk_dist[di]) + self.chunk_dist[di] + self.overlap_dist[di]) == False, axis=1 )) ]
102 |
103 | if self.verbose > 1:
104 | print("There are %s points in cube_%s / %s with starting range %s"%
105 | (hypercube.shape[0],i,total_cubes,self.d[di] + (coor * self.chunk_dist[di])))
106 |
107 | # If at least one sample inside the hypercube
108 | if hypercube.shape[0] > 0:
109 | # Cluster the data point(s) inside the cube, skipping the id-column
110 | clf.fit(hypercube[:,1:])
111 |
112 | if self.verbose > 1:
113 | print("Found %s clusters in cube_%s\n"%(np.unique(clf.labels_[clf.labels_ > -1]).shape[0],i))
114 |
115 | #Now for every (sample id in cube, predicted cluster label)
116 | for a in np.c_[hypercube[:,0],clf.labels_]:
117 | if a[1] != -1: #if not predicted as noise
118 | cluster_id = str(coor[0])+"_"+str(i)+"_"+str(a[1])+"_"+str(coor)+"_"+str(self.d[di] + (coor * self.chunk_dist[di])) # Rudimentary cluster id
119 | nodes[cluster_id].append( int(a[0]) ) # Append the member id's as integers
120 | else:
121 | if self.verbose > 1:
122 | print("Cube_%s is empty.\n"%(i))
123 |
124 | # Create links when clusters from different hypercubes have members with the same sample id.
125 | for k in nodes:
126 | for kn in nodes:
127 | if k != kn:
128 | if len(nodes[k] + nodes[kn]) != len(set(nodes[kn] + nodes[k])): # there are non-unique id's in the union
129 | links[k].append( kn )
130 |
131 | # Create links between local hypercube clusters if setting link_local = True
132 | # This is an experimental feature deviating too much from the original mapper algo.
133 | # Creates a lot of spurious edges, and should only be used when mapping one or at most two dimensions.
134 | if self.link_local:
135 | if k.split("_")[0] == kn.split("_")[0]:
136 | links[k].append( kn )
137 |
138 | # Reporting
139 | if self.verbose > 0:
140 | nr_links = 0
141 | for k in links:
142 | nr_links += len(links[k])
143 | print("\ncreated %s edges and %s nodes in %s."%(nr_links,len(nodes),str(datetime.now()-start)))
144 |
145 | complex["nodes"] = nodes
146 | complex["links"] = links
147 | complex["meta"] = dimension_name
148 |
149 | return complex
150 |
# NOTE(review): deprecated visualize(). This section of the dump is garbled:
# the tooltip string literals are broken across lines and the embedded HTML
# template was elided (internal numbering jumps from 241 to 313), so the code
# below is preserved verbatim and only annotated -- do not treat it as
# runnable as-is. See /km.py for the maintained version.
151 | def visualize(self, complex, path_html="mapper_visualization_output.html", title="My Data", graph_link_distance=30, graph_gravity=0.1, graph_charge=-120, custom_tooltips=None, width_html=0, height_html=0, show_tooltips=True, show_title=True, show_meta=True):
152 | # Turns the dictionary 'complex' in a html file with d3.js
153 |
154 | # Format JSON
155 | json_s = {}
156 | json_s["nodes"] = []
157 | json_s["links"] = []
158 | k2e = {} # a key to incremental int dict, used for id's when linking
159 |
160 | for e, k in enumerate(complex["nodes"]):
161 | # Tooltip formatting
162 | if custom_tooltips != None:
# NOTE(review): the next string literal was split across lines by the dump;
# in the original it is a single HTML snippet ("Cluster %s" + members).
163 | tooltip_s = "
Cluster %s
"%k + " ".join([str(f) for f in custom_tooltips[complex["nodes"][k]]])
164 | if self.color_function == "average_signal_cluster":
165 | tooltip_i = int(((sum([f for f in custom_tooltips[complex["nodes"][k]]]) / len(custom_tooltips[complex["nodes"][k]])) * 30) )
166 | json_s["nodes"].append({"name": str(k), "tooltip": tooltip_s, "group": 2 * int(np.log(len(complex["nodes"][k]))), "color": str(tooltip_i)})
167 | else:
168 | json_s["nodes"].append({"name": str(k), "tooltip": tooltip_s, "group": 2 * int(np.log(len(complex["nodes"][k]))), "color": str(k.split("_")[0])})
169 | else:
# NOTE(review): same garbling as above -- a single HTML tooltip literal.
170 | tooltip_s = "
Cluster %s
Contains %s members."%(k,len(complex["nodes"][k]))
171 | json_s["nodes"].append({"name": str(k), "tooltip": tooltip_s, "group": 2 * int(np.log(len(complex["nodes"][k]))), "color": str(k.split("_")[0])})
172 | k2e[k] = e
173 | for k in complex["links"]:
174 | for link in complex["links"][k]:
175 | json_s["links"].append({"source": k2e[k], "target":k2e[link],"value":1})
176 |
177 | # Width and height of graph in HTML output
178 | if width_html == 0:
179 | width_css = "100%"
180 | width_js = 'document.getElementById("holder").offsetWidth-20'
181 | else:
182 | width_css = "%spx" % width_html
183 | width_js = "%s" % width_html
184 | if height_html == 0:
185 | height_css = "100%"
186 | height_js = 'document.getElementById("holder").offsetHeight-20'
187 | else:
188 | height_css = "%spx" % height_html
189 | height_js = "%s" % height_html
190 |
191 | # Whether to show certain UI elements or not
192 | if show_tooltips == False:
193 | tooltips_display = "display: none;"
194 | else:
195 | tooltips_display = ""
196 |
197 | if show_meta == False:
198 | meta_display = "display: none;"
199 | else:
200 | meta_display = ""
201 |
202 | if show_title == False:
203 | title_display = "display: none;"
204 | else:
205 | title_display = ""
206 |
207 | with open(path_html,"wb") as outfile:
# NOTE(review): the triple-quoted HTML/D3.js template originally spanned
# lines 208-312; most of its markup was stripped by the dump tool and only
# fragments remain below. Recover it from the repository before editing.
208 | html = """
209 |
210 |
211 | %s | KeplerMapper
212 |
213 |
227 |
228 |
229 |
%s
230 |
231 | Lens %s
232 | Cubes per dimension %s
233 | Overlap percentage %s%%
234 |
235 | Color Function %s( %s )
236 | Clusterer %s
237 | Scaler %s
238 |
239 |
240 |
241 | """%(title,width_css, height_css, title_display, meta_display, tooltips_display, title,complex["meta"],self.nr_cubes,self.overlap_perc*100,self.link_local,self.color_function,complex["meta"],str(self.clf),str(self.scaler),width_js,height_js,graph_charge,graph_link_distance,graph_gravity,json.dumps(json_s))
313 | outfile.write(html.encode("utf-8"))
314 | if self.verbose > 0:
315 | print("\nWrote d3.js graph to '%s'"%path_html)
--------------------------------------------------------------------------------
/km.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | import numpy as np
3 | from collections import defaultdict
4 | import json
5 | import itertools
6 | from sklearn import cluster, preprocessing, manifold
7 | from datetime import datetime
8 | import sys
9 |
class KeplerMapper(object):
    """Build topological networks (Mapper) from (high-dimensional) data.

    Workflow:
      1) fit_transform -- fit a projection/lens/function to a dataset and
         transform it (for instance "mean_of_row(x) for x in X").
      2) map -- cover the projection with overlapping intervals/hypercubes
         and cluster the points inside each interval (clustering is applied
         to the inverse image / original data to lessen projection loss).
         Clusters sharing members, due to the overlap, are joined by an edge.
      3) visualize -- turn the resulting complex into an HTML/D3.js page.
    """

    def __init__(self, verbose=2):
        """verbose: 0 = silent, higher values print progress information."""
        self.verbose = verbose

        # Cover geometry -- filled in by fit_transform()/map()
        self.chunk_dist = []
        self.overlap_dist = []
        self.d = []

        # Mapper parameters -- recorded by map()
        self.nr_cubes = 0
        self.overlap_perc = 0
        self.clusterer = False
37 |
38 | def fit_transform(self, X, projection="sum", scaler=preprocessing.MinMaxScaler()):
39 | # Creates the projection/lens from X.
40 | #
41 | # Input: X. Input features as a numpy array.
42 | # Output: projected_X. original data transformed to a projection (lens).
43 | #
44 | # parameters
45 | # ----------
46 | # projection: Projection parameter is either a string,
47 | # a scikit class with fit_transform, like manifold.TSNE(),
48 | # or a list of dimension indices.
49 | # scaler: if None, do no scaling, else apply scaling to the projection
50 | # Default: Min-Max scaling
51 |
52 | self.scaler = scaler
53 | self.projection = str(projection)
54 |
55 | # Detect if projection is a class (for scikit-learn)
56 | if str(type(projection))[1:6] == "class": #TODO: de-ugly-fy
57 | reducer = projection
58 | if self.verbose > 0:
59 | try:
60 | projection.set_params(**{"verbose":self.verbose})
61 | except:
62 | pass
63 | print("\n..Projecting data using: \n\t%s\n"%str(projection))
64 | X = reducer.fit_transform(X)
65 |
66 | # Detect if projection is a string (for standard functions)
67 | if isinstance(projection, str):
68 | if self.verbose > 0:
69 | print("\n..Projecting data using: %s"%(projection))
70 | # Stats lenses
71 | if projection == "sum": # sum of row
72 | X = np.sum(X, axis=1).reshape((X.shape[0],1))
73 | if projection == "mean": # mean of row
74 | X = np.mean(X, axis=1).reshape((X.shape[0],1))
75 | if projection == "median": # mean of row
76 | X = np.median(X, axis=1).reshape((X.shape[0],1))
77 | if projection == "max": # max of row
78 | X = np.max(X, axis=1).reshape((X.shape[0],1))
79 | if projection == "min": # min of row
80 | X = np.min(X, axis=1).reshape((X.shape[0],1))
81 | if projection == "std": # std of row
82 | X = np.std(X, axis=1).reshape((X.shape[0],1))
83 |
84 | if projection == "dist_mean": # Distance of x to mean of X
85 | X_mean = np.mean(X, axis=0)
86 | X = np.sum(np.sqrt((X - X_mean)**2), axis=1).reshape((X.shape[0],1))
87 |
88 | # Detect if projection is a list (with dimension indices)
89 | if isinstance(projection, list):
90 | if self.verbose > 0:
91 | print("\n..Projecting data using: %s"%(str(projection)))
92 | X = X[:,np.array(projection)]
93 |
94 | # Scaling
95 | if scaler is not None:
96 | if self.verbose > 0:
97 | print("\n..Scaling with: %s\n"%str(scaler))
98 | X = scaler.fit_transform(X)
99 |
100 | return X
101 |
102 | def map(self, projected_X, inverse_X=None, clusterer=cluster.DBSCAN(eps=0.5,min_samples=3), nr_cubes=10, overlap_perc=0.1):
103 | # This maps the data to a simplicial complex. Returns a dictionary with nodes and links.
104 | #
105 | # Input: projected_X. A Numpy array with the projection/lens.
106 | # Output: complex. A dictionary with "nodes", "links" and "meta information"
107 | #
108 | # parameters
109 | # ----------
110 | # projected_X projected_X. A Numpy array with the projection/lens. Required.
111 | # inverse_X Numpy array or None. If None then the projection itself is used for clustering.
112 | # clusterer Scikit-learn API compatible clustering algorithm. Default: DBSCAN
113 | # nr_cubes Int. The number of intervals/hypercubes to create.
114 | # overlap_perc Float. The percentage of overlap "between" the intervals/hypercubes.
115 |
116 | start = datetime.now()
117 |
118 | # Helper function
119 | def cube_coordinates_all(nr_cubes, nr_dimensions):
120 | # Helper function to get origin coordinates for our intervals/hypercubes
121 | # Useful for looping no matter the number of cubes or dimensions
122 | # Example: if there are 4 cubes per dimension and 3 dimensions
123 | # return the bottom left (origin) coordinates of 64 hypercubes,
124 | # as a sorted list of Numpy arrays
125 | # TODO: elegance-ify...
126 | l = []
127 | for x in range(nr_cubes):
128 | l += [x] * nr_dimensions
129 | return [np.array(list(f)) for f in sorted(set(itertools.permutations(l,nr_dimensions)))]
130 |
131 | nodes = defaultdict(list)
132 | links = defaultdict(list)
133 | complex = {}
134 | self.nr_cubes = nr_cubes
135 | self.clusterer = clusterer
136 | self.overlap_perc = overlap_perc
137 |
138 | if self.verbose > 0:
139 | print("Mapping on data shaped %s using dimensions\n"%(str(projected_X.shape)))
140 |
141 | # If inverse image is not provided, we use the projection as the inverse image (suffer projection loss)
142 | if inverse_X is None:
143 | inverse_X = projected_X
144 |
145 | # We chop up the min-max column ranges into 'nr_cubes' parts
146 | self.chunk_dist = (np.max(projected_X, axis=0) - np.min(projected_X, axis=0))/nr_cubes
147 |
148 | # We calculate the overlapping windows distance
149 | self.overlap_dist = self.overlap_perc * self.chunk_dist
150 |
151 | # We find our starting point
152 | self.d = np.min(projected_X, axis=0)
153 |
154 | # Use a dimension index array on the projected X
155 | # (For now this uses the entire dimensionality, but we keep for experimentation)
156 | di = np.array([x for x in range(projected_X.shape[1])])
157 |
158 | # Prefix'ing the data with ID's
159 | ids = np.array([x for x in range(projected_X.shape[0])])
160 | projected_X = np.c_[ids,projected_X]
161 | inverse_X = np.c_[ids,inverse_X]
162 |
163 | # Subdivide the projected data X in intervals/hypercubes with overlap
164 | if self.verbose > 0:
165 | total_cubes = len(cube_coordinates_all(nr_cubes,projected_X.shape[1]))
166 | print("Creating %s hypercubes."%total_cubes)
167 |
168 | for i, coor in enumerate(cube_coordinates_all(nr_cubes,di.shape[0])):
169 | # Slice the hypercube
170 | hypercube = projected_X[ np.invert(np.any((projected_X[:,di+1] >= self.d[di] + (coor * self.chunk_dist[di])) &
171 | (projected_X[:,di+1] < self.d[di] + (coor * self.chunk_dist[di]) + self.chunk_dist[di] + self.overlap_dist[di]) == False, axis=1 )) ]
172 |
173 | if self.verbose > 1:
174 | print("There are %s points in cube_%s / %s with starting range %s"%
175 | (hypercube.shape[0],i,total_cubes,self.d[di] + (coor * self.chunk_dist[di])))
176 |
177 | # If at least one sample inside the hypercube
178 | if hypercube.shape[0] > 0:
179 | # Cluster the data point(s) in the cube, skipping the id-column
180 | # Note that we apply clustering on the inverse image (original data samples) that fall inside the cube.
181 | inverse_x = inverse_X[[int(nn) for nn in hypercube[:,0]]]
182 |
183 | clusterer.fit(inverse_x[:,1:])
184 |
185 | if self.verbose > 1:
186 | print("Found %s clusters in cube_%s\n"%(np.unique(clusterer.labels_[clusterer.labels_ > -1]).shape[0],i))
187 |
188 | #Now for every (sample id in cube, predicted cluster label)
189 | for a in np.c_[hypercube[:,0],clusterer.labels_]:
190 | if a[1] != -1: #if not predicted as noise
191 | cluster_id = str(coor[0])+"_"+str(i)+"_"+str(a[1])+"_"+str(coor)+"_"+str(self.d[di] + (coor * self.chunk_dist[di])) # TODO: de-rudimentary-ify
192 | nodes[cluster_id].append( int(a[0]) ) # Append the member id's as integers
193 | else:
194 | if self.verbose > 1:
195 | print("Cube_%s is empty.\n"%(i))
196 |
197 | # Create links when clusters from different hypercubes have members with the same sample id.
198 | candidates = itertools.combinations(nodes.keys(),2)
199 | for candidate in candidates:
200 | # if there are non-unique members in the union
201 | if len(nodes[candidate[0]]+nodes[candidate[1]]) != len(set(nodes[candidate[0]]+nodes[candidate[1]])):
202 | links[candidate[0]].append( candidate[1] )
203 |
204 | # Reporting
205 | if self.verbose > 0:
206 | nr_links = 0
207 | for k in links:
208 | nr_links += len(links[k])
209 | print("\ncreated %s edges and %s nodes in %s."%(nr_links,len(nodes),str(datetime.now()-start)))
210 |
211 | complex["nodes"] = nodes
212 | complex["links"] = links
213 | complex["meta"] = self.projection
214 |
215 | return complex
216 |
217 | def visualize(self, complex, color_function="", path_html="mapper_visualization_output.html", title="My Data",
218 | graph_link_distance=30, graph_gravity=0.1, graph_charge=-120, custom_tooltips=None, width_html=0,
219 | height_html=0, show_tooltips=True, show_title=True, show_meta=True):
220 | # Turns the dictionary 'complex' in a html file with d3.js
221 | #
222 | # Input: complex. Dictionary (output from calling .map())
223 | # Output: a HTML page saved as a file in 'path_html'.
224 | #
225 | # parameters
226 | # ----------
227 | # color_function string. Not fully implemented. Default: "" (distance to origin)
228 | # path_html file path as string. Where to save the HTML page.
229 | # title string. HTML page document title and first heading.
230 | # graph_link_distance int. Edge length.
231 | # graph_gravity float. "Gravity" to center of layout.
232 | # graph_charge int. charge between nodes.
233 | # custom_tooltips None or Numpy Array. You could use "y"-label array for this.
234 | # width_html int. Width of canvas. Default: 0 (full width)
235 | # height_html int. Height of canvas. Default: 0 (full height)
236 | # show_tooltips bool. default:True
237 | # show_title bool. default:True
238 | # show_meta bool. default:True
239 |
240 | # Format JSON for D3 graph
241 | json_s = {}
242 | json_s["nodes"] = []
243 | json_s["links"] = []
244 | k2e = {} # a key to incremental int dict, used for id's when linking
245 |
246 | for e, k in enumerate(complex["nodes"]):
247 | # Tooltip and node color formatting, TODO: de-mess-ify
248 | if custom_tooltips is not None:
249 | tooltip_s = "
Cluster %s
"%k + " ".join([str(f) for f in custom_tooltips[complex["nodes"][k]]])
250 | if color_function == "average_signal_cluster":
251 | tooltip_i = int(((sum([f for f in custom_tooltips[complex["nodes"][k]]]) / len(custom_tooltips[complex["nodes"][k]])) * 30) )
252 | json_s["nodes"].append({"name": str(k), "tooltip": tooltip_s, "group": 2 * int(np.log(len(complex["nodes"][k]))), "color": str(tooltip_i)})
253 | else:
254 | json_s["nodes"].append({"name": str(k), "tooltip": tooltip_s, "group": 2 * int(np.log(len(complex["nodes"][k]))), "color": str(k.split("_")[0])})
255 | else:
256 | tooltip_s = "
Cluster %s
Contains %s members."%(k,len(complex["nodes"][k]))
257 | json_s["nodes"].append({"name": str(k), "tooltip": tooltip_s, "group": 2 * int(np.log(len(complex["nodes"][k]))), "color": str(k.split("_")[0])})
258 | k2e[k] = e
259 | for k in complex["links"]:
260 | for link in complex["links"][k]:
261 | json_s["links"].append({"source": k2e[k], "target":k2e[link],"value":1})
262 |
263 | # Width and height of graph in HTML output
264 | if width_html == 0:
265 | width_css = "100%"
266 | width_js = 'document.getElementById("holder").offsetWidth-20'
267 | else:
268 | width_css = "%spx" % width_html
269 | width_js = "%s" % width_html
270 | if height_html == 0:
271 | height_css = "100%"
272 | height_js = 'document.getElementById("holder").offsetHeight-20'
273 | else:
274 | height_css = "%spx" % height_html
275 | height_js = "%s" % height_html
276 |
277 | # Whether to show certain UI elements or not
278 | if show_tooltips == False:
279 | tooltips_display = "display: none;"
280 | else:
281 | tooltips_display = ""
282 |
283 | if show_meta == False:
284 | meta_display = "display: none;"
285 | else:
286 | meta_display = ""
287 |
288 | if show_title == False:
289 | title_display = "display: none;"
290 | else:
291 | title_display = ""
292 |
293 | with open(path_html,"wb") as outfile:
294 | html = """
295 |
296 |
297 | %s | KeplerMapper
298 |
299 |
313 |
314 |