├── examples ├── gallery │ ├── horsey.png │ ├── hyperspace.png │ ├── topo-galaxy.png │ └── dimensionality-canvas.png ├── cat │ ├── cat-reference-viz.png │ ├── cat-reference.csv.png │ ├── cat.py │ └── cat_keplermapper_output.html ├── horse │ ├── horse-reference.png │ ├── horse-reference.csv.png │ ├── horse.py │ └── horse_keplermapper_output.html ├── lion │ ├── lion-reference-viz.png │ ├── lion-reference.csv.png │ ├── lion.py │ └── lion_keplermapper_output.html ├── makecircles │ ├── make_circles_xaxis.png │ ├── make_circles_distmean.png │ ├── make_circles_raw_dimensions.png │ ├── make_circles_xaxis.py │ ├── make_circles_distmean.py │ ├── keplermapper-makecircles-xaxis.html │ └── keplermapper-makecircles-distmean.html ├── digits │ ├── digits-tsne-custom-tooltip.png │ ├── digits-tsne-custom-tooltip-mnist.png │ ├── digits.py │ └── keplermapper_digits_ylabel_tooltips.html └── readme.md ├── .gitattributes ├── .gitignore ├── disclaimer.txt ├── license.txt ├── todo.txt ├── release.txt ├── readme.md ├── depricated └── km.py └── km.py /examples/gallery/horsey.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/kepler-mapper/master/examples/gallery/horsey.png -------------------------------------------------------------------------------- /examples/gallery/hyperspace.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/kepler-mapper/master/examples/gallery/hyperspace.png -------------------------------------------------------------------------------- /examples/cat/cat-reference-viz.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/kepler-mapper/master/examples/cat/cat-reference-viz.png -------------------------------------------------------------------------------- /examples/cat/cat-reference.csv.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/kepler-mapper/master/examples/cat/cat-reference.csv.png -------------------------------------------------------------------------------- /examples/gallery/topo-galaxy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/kepler-mapper/master/examples/gallery/topo-galaxy.png -------------------------------------------------------------------------------- /examples/horse/horse-reference.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/kepler-mapper/master/examples/horse/horse-reference.png -------------------------------------------------------------------------------- /examples/lion/lion-reference-viz.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/kepler-mapper/master/examples/lion/lion-reference-viz.png -------------------------------------------------------------------------------- /examples/lion/lion-reference.csv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/kepler-mapper/master/examples/lion/lion-reference.csv.png -------------------------------------------------------------------------------- /examples/horse/horse-reference.csv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/kepler-mapper/master/examples/horse/horse-reference.csv.png -------------------------------------------------------------------------------- /examples/gallery/dimensionality-canvas.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/kepler-mapper/master/examples/gallery/dimensionality-canvas.png 
-------------------------------------------------------------------------------- /examples/makecircles/make_circles_xaxis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/kepler-mapper/master/examples/makecircles/make_circles_xaxis.png -------------------------------------------------------------------------------- /examples/digits/digits-tsne-custom-tooltip.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/kepler-mapper/master/examples/digits/digits-tsne-custom-tooltip.png -------------------------------------------------------------------------------- /examples/makecircles/make_circles_distmean.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/kepler-mapper/master/examples/makecircles/make_circles_distmean.png -------------------------------------------------------------------------------- /examples/digits/digits-tsne-custom-tooltip-mnist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/kepler-mapper/master/examples/digits/digits-tsne-custom-tooltip-mnist.png -------------------------------------------------------------------------------- /examples/makecircles/make_circles_raw_dimensions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmcinnes/kepler-mapper/master/examples/makecircles/make_circles_raw_dimensions.png -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | 
*.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | 8 | # Recycle Bin used on file shares 9 | $RECYCLE.BIN/ 10 | 11 | # Windows Installer files 12 | *.cab 13 | *.msi 14 | *.msm 15 | *.msp 16 | 17 | # Windows shortcuts 18 | *.lnk 19 | 20 | # ========================= 21 | # Operating System Files 22 | # ========================= 23 | 24 | # OSX 25 | # ========================= 26 | 27 | .DS_Store 28 | .AppleDouble 29 | .LSOverride 30 | 31 | # Thumbnails 32 | ._* 33 | 34 | # Files that might appear on external disk 35 | .Spotlight-V100 36 | .Trashes 37 | 38 | # Directories potentially created on remote AFP share 39 | .AppleDB 40 | .AppleDesktop 41 | Network Trash Folder 42 | Temporary Items 43 | .apdisk 44 | -------------------------------------------------------------------------------- /examples/cat/cat.py: -------------------------------------------------------------------------------- 1 | import km 2 | 3 | data = km.np.genfromtxt('cat-reference.csv',delimiter=',') 4 | 5 | mapper = km.KeplerMapper(cluster_algorithm=km.cluster.DBSCAN(eps=0.1, min_samples=5), nr_cubes=10, overlap_perc=0.8, verbose=1) 6 | 7 | mapper.fit(data) 8 | 9 | complex = mapper.map(data, dimension_index=1, dimension_name="Y-axis") 10 | 11 | mapper.visualize(complex, "cat_keplermapper_output.html", "cat-reference.csv") 12 | 13 | # You may want to visualize the original point cloud data in 3D scatter too 14 | """ 15 | import matplotlib.pyplot as plt 16 | from mpl_toolkits.mplot3d import Axes3D 17 | 18 | fig = 
plt.figure() 19 | ax = fig.add_subplot(111, projection='3d') 20 | ax.scatter(data[:,0],data[:,1],data[:,2]) 21 | plt.savefig("cat-reference.csv.png") 22 | plt.show() 23 | """ -------------------------------------------------------------------------------- /examples/lion/lion.py: -------------------------------------------------------------------------------- 1 | import km 2 | 3 | data = km.np.genfromtxt('lion-reference.csv',delimiter=',') 4 | 5 | mapper = km.KeplerMapper(cluster_algorithm=km.cluster.DBSCAN(eps=0.1, min_samples=5), nr_cubes=10, overlap_perc=0.8, verbose=1) 6 | 7 | mapper.fit(data) 8 | 9 | complex = mapper.map(data, dimension_index=1, dimension_name="Y-axis") 10 | 11 | mapper.visualize(complex, "lion_keplermapper_output.html", "lion-reference.csv") 12 | 13 | # You may want to visualize the original point cloud data in 3D scatter too 14 | """ 15 | import matplotlib.pyplot as plt 16 | from mpl_toolkits.mplot3d import Axes3D 17 | 18 | fig = plt.figure() 19 | ax = fig.add_subplot(111, projection='3d') 20 | ax.scatter(data[:,0],data[:,1],data[:,2]) 21 | plt.savefig("lion-reference.csv.png") 22 | plt.show() 23 | """ -------------------------------------------------------------------------------- /examples/horse/horse.py: -------------------------------------------------------------------------------- 1 | import km 2 | 3 | data = km.np.genfromtxt('horse-reference.csv',delimiter=',') 4 | 5 | mapper = km.KeplerMapper(cluster_algorithm=km.cluster.DBSCAN(eps=0.3, min_samples=3), nr_cubes=25, link_local=False, overlap_perc=0.7, verbose=1) 6 | 7 | mapper.fit(data) 8 | 9 | complex = mapper.map(data, dimension_index=1, dimension_name="Y-axis") 10 | 11 | mapper.visualize(complex, "horse_keplermapper_output.html", "horse-reference.csv") 12 | 13 | # You may want to visualize the original point cloud data in 3D scatter too 14 | """ 15 | import matplotlib.pyplot as plt 16 | from mpl_toolkits.mplot3d import Axes3D 17 | 18 | fig = plt.figure() 19 | ax = 
fig.add_subplot(111, projection='3d') 20 | ax.scatter(data[:,0],data[:,1],data[:,2]) 21 | plt.savefig("horse-reference.csv.png") 22 | plt.show() 23 | """ -------------------------------------------------------------------------------- /examples/makecircles/make_circles_xaxis.py: -------------------------------------------------------------------------------- 1 | import km 2 | 3 | # Make fairly noisy circles 4 | from sklearn import datasets 5 | data, labels = datasets.make_circles(n_samples=5000, noise=0.05, factor=0.3) 6 | 7 | # Initialize 8 | mapper = km.KeplerMapper(verbose=1) 9 | 10 | # Fit to and transform the data 11 | projected_data = mapper.fit_transform(data, projection=[0]) 12 | 13 | # Create dictionary called 'complex' with nodes, edges and meta-information 14 | complex = mapper.map(projected_X=projected_data, inverse_X=data, 15 | clusterer=km.cluster.DBSCAN(eps=0.1, min_samples=10), 16 | nr_cubes=20, overlap_perc=0.1) 17 | 18 | # Visualize it 19 | mapper.visualize(complex, path_html="keplermapper-makecircles-xaxis.html", 20 | title="datasets.make_circles(n_samples=5000, noise=0.05, factor=0.3)", 21 | custom_tooltips=labels, color_function="average_signal_cluster") -------------------------------------------------------------------------------- /examples/makecircles/make_circles_distmean.py: -------------------------------------------------------------------------------- 1 | import km 2 | 3 | # Make very noisy circles 4 | from sklearn import datasets 5 | data, labels = datasets.make_circles(n_samples=5000, noise=0.3, factor=0.3) 6 | 7 | # Initialize 8 | mapper = km.KeplerMapper(verbose=1) 9 | 10 | # Fit to and transform the data 11 | projected_data = mapper.fit_transform(data, projection="dist_mean") 12 | 13 | # Create dictionary called 'complex' with nodes, edges and meta-information 14 | complex = mapper.map(projected_X=projected_data, inverse_X=data, 15 | clusterer=km.cluster.DBSCAN(eps=0.9, min_samples=3), 16 | nr_cubes=30, overlap_perc=0.7) 17 | 18 | 
# Visualize it 19 | mapper.visualize(complex, path_html="keplermapper-makecircles-distmean.html", 20 | title="datasets.make_circles(n_samples=5000, noise=0.3, factor=0.3)", 21 | custom_tooltips=labels, color_function="average_signal_cluster", 22 | graph_gravity=0.03, graph_link_distance=30, graph_charge=-80) -------------------------------------------------------------------------------- /disclaimer.txt: -------------------------------------------------------------------------------- 1 | DISCLAIMER: 2 | This code was written for self-study and is not production-ready. It is very 3 | much a work in progress. 4 | 5 | This early version is released, because it may be useful to other researchers 6 | and students starting out in topological data science, and to motivate the 7 | author to keep working on it. 8 | 9 | The author makes no guarantee that this code is: 10 | - correct, 11 | - the right way to do things, 12 | - true to the original MAPPER algorithm 13 | - ready for production 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
-------------------------------------------------------------------------------- /license.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Triskelion - HJ van Veen - info@mlwave.com 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. -------------------------------------------------------------------------------- /todo.txt: -------------------------------------------------------------------------------- 1 | TODO: 2 | Use CSS animations for the div nodes to remove the "jittery" movement. 3 | Add a curvature to the edges 4 | Add member expand function on node click 5 | Style edge width according to the size of set of shared members 6 | Add a bounding box for the force-directed graph 7 | Implement better and more color functions. Also support for custom color functions. 8 | Implement better and more node size functions. 
9 | Make better suited to model introspection (visualize out-of-fold predictions performance and feature interactions) 10 | Implement cheap dimensionality reduction for extremely high dimensional data (RandomizedPCA) 11 | Support for online mapping (Column-Oriented datastore https://users.soe.ucsc.edu/~niejiazhong/slides/chandra.pdf) 12 | Add samples to showcase fundamental data shapes: Regression, Clusters, Circular/looping, Y-flares. 13 | Create multi-view to compare graphs 14 | Add an example with a custom (scikit-learn API compatible) clusterer 15 | Write tests 16 | Better support for IDs 17 | Add novel neural gas mapping technique 18 | Turn into an API. Visualizer output to localhost can ask for new json to render graphs with. 19 | Add sliders for on-page graph settings (redraw with a new Gravity or LinkDistance) 20 | Proper accessible HTML output 21 | Better complex["meta"] information 22 | Create distribution 23 | Refactor -------------------------------------------------------------------------------- /examples/readme.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | ## Horse 4 | 5 | In the horse directory you'll find an example of creating simplicial complexes from point cloud data. 6 | 7 | `horse-reference.csv` is courtesy of Python Mapper (http://danifold.net/mapper/index.html) 8 | 9 | ![Horse topology](http://i.imgur.com/zNwQBSK.png) 10 | 11 | ## Lion 12 | 13 | In the `lion` directory you'll find an example of creating simplicial complexes from point cloud data. 14 | 15 | `lion-reference.csv` is courtesy of Python Mapper (http://danifold.net/mapper/index.html) 16 | 17 | ![Lion topology](http://i.imgur.com/MlzMsec.png) 18 | 19 | ## Cat 20 | 21 | In the `cat` directory you'll find an example of creating simplicial complexes from point cloud data. 
22 | 23 | `cat-reference.csv` is courtesy of Python Mapper (http://danifold.net/mapper/index.html) 24 | 25 | ![Cat topology](http://i.imgur.com/spk7PC4.png) 26 | 27 | ## Linking locally 28 | 29 | With `link_local=True` you can link up the local clusters (clusters in the same hypercube). Below 30 | you can see a low-resolution house cat on the left *without local linking* and *with local linking* on 31 | the right. 32 | 33 | ![Cat topology](http://i.imgur.com/Irm8jp4.png) 34 | 35 | ## Digits 36 | 37 | In the `digits` directory you'll find an example of using the `custom_tooltips`-functionality. It also 38 | shows the usage of `reducer` with t-SNE to reduce the dimensionality to 2. 39 | 40 | ![Unsupervised Digits](http://i.imgur.com/t1fuoHX.png) 41 | 42 | Note how sub-clusters form for digits with a different slant. 43 | 44 | ## Gallery 45 | 46 | Check the `gallery` for a variety of images. 47 | 48 | ## Soon 49 | 50 | Iris, Diabetes 100k, Customer purchase behaviour. -------------------------------------------------------------------------------- /examples/digits/digits.py: -------------------------------------------------------------------------------- 1 | import km 2 | 3 | # Load digits data 4 | from sklearn import datasets 5 | data, labels = datasets.load_digits().data, datasets.load_digits().target 6 | 7 | # Create images for a custom tooltip array 8 | import StringIO 9 | from scipy.misc import imsave, toimage 10 | import base64 11 | tooltip_s = [] 12 | for image_data in data: 13 | output = StringIO.StringIO() 14 | img = toimage(image_data.reshape((8,8))) # Data was a flat row of 64 "pixels". 15 | img.save(output, format="PNG") 16 | contents = output.getvalue() 17 | tooltip_s.append( """ <img src="data:image/png;base64,%s"> """%base64.b64encode(contents).replace("\n","") ) 18 | output.close() 19 | 20 | tooltip_s = km.np.array(tooltip_s) # need to make sure to feed it as a NumPy array, not a list 21 | 22 | # Initialize to use t-SNE with 2 components (reduces data to 2 dimensions). 
Also note high overlap_percentage. 23 | mapper = km.KeplerMapper(cluster_algorithm=km.cluster.DBSCAN(eps=0.3, min_samples=15), 24 | reducer = km.manifold.TSNE(), nr_cubes=35, overlap_perc=0.9, 25 | link_local=False, verbose=2) 26 | 27 | # Fit and transform data 28 | data = mapper.fit_transform(data) 29 | 30 | # Create the graph 31 | complex = mapper.map(data, dimension_index=[0,1], dimension_name="t-SNE(2) 2D") 32 | 33 | # Create the visualizations (increased the graph_gravity for a tighter graph-look.) 34 | 35 | # Tooltips with image data for every cluster member 36 | mapper.visualize(complex, "keplermapper_digits_custom_tooltips.html", "Digits", graph_gravity=0.25, custom_tooltips=tooltip_s) 37 | # Tooltips with the target y-labels for every cluster member 38 | mapper.visualize(complex, "keplermapper_digits_ylabel_tooltips.html", "Digits", graph_gravity=0.25, custom_tooltips=labels) -------------------------------------------------------------------------------- /release.txt: -------------------------------------------------------------------------------- 1 | Pre-alpha 2 | 3 | v00001 4 | Wrote class 5 | Wrote documentation 6 | Added license 7 | 8 | v00002 9 | Added a multi-dimensional mode: use all dimensions. 10 | Added case study: 3D point cloud data for animals 11 | Added case study: Make circles 12 | Added advanced parameters for graph layout settings. Should probably be sliders on the .html page itself. 13 | Improved documentation 14 | Added disclaimer 15 | Added todo 16 | Added release log 17 | 18 | v00003 19 | Refactored dimension index to use a list of arbitrary dimensions 20 | Improved verbosity 21 | Added levels of verbosity 22 | Decreased number of code lines by using a single approach 23 | Added sample to explain local linkage True vs. 
False 24 | Added side-view for animal point-cloud data 25 | Added a gallery in the example directory 26 | 27 | v00004 28 | Added dimensionality reduction 29 | Added "digits" case study 30 | changed fit to fit_transform and return of data 31 | added tooltips 32 | added support for custom tooltips 33 | 34 | Alpha 35 | 36 | v00005 37 | Made Python 3 compatible 38 | Ability to turn off title, meta and tooltips 39 | Ability to set the window height and width of HTML output 40 | Added basic support for another color function: average signal 41 | De-emphasized link_local functionality, since its current implementation is no good. 42 | 43 | v00006 44 | Removed link_local functionality 45 | Halved the number of edges drawn (no two-way edges) 46 | Added support for clustering on the inverse image 47 | Refactored code (see updated documentation) 48 | Added code comments 49 | Added feature to use reducers/manifold learning/dimensions and stat functions 50 | Added 7 projections/lenses from statistics 51 | -------------------------------------------------------------------------------- /examples/cat/cat_keplermapper_output.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 16 | 17 |
18 |

cat-reference.csv

19 |

20 | Lens
Y-axis

21 | Number of cubes
10

22 | Overlap percentage
80.0%

23 | Linking locally
False

24 | Color Function
Distance to min(Y-axis)

25 | Clusterer
DBSCAN(algorithm='auto', eps=0.1, leaf_size=30, metric='euclidean', 26 | min_samples=5, p=None, random_state=None)

27 | Scaler
MinMaxScaler(copy=True, feature_range=(0, 1)) 28 |

29 |
30 | 31 | -------------------------------------------------------------------------------- /examples/lion/lion_keplermapper_output.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 16 | 17 |
18 |

lion-reference.csv

19 |

20 | Lens
Y-axis

21 | Number of cubes
10

22 | Overlap percentage
80.0%

23 | Linking locally
False

24 | Color Function
Distance to min(Y-axis)

25 | Clusterer
DBSCAN(algorithm='auto', eps=0.1, leaf_size=30, metric='euclidean', 26 | min_samples=5, p=None, random_state=None)

27 | Scaler
MinMaxScaler(copy=True, feature_range=(0, 1)) 28 |

29 |
30 | 31 | -------------------------------------------------------------------------------- /examples/horse/horse_keplermapper_output.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 16 | 17 |
18 |

horse-reference.csv

19 |

20 | Lens
Y-axis

21 | Number of cubes
25

22 | Overlap percentage
70.0%

23 | Linking locally
False

24 | Color Function
Distance to min(Y-axis)

25 | Clusterer
DBSCAN(algorithm='auto', eps=0.3, leaf_size=30, metric='euclidean', 26 | min_samples=3, p=None, random_state=None)

27 | Scaler
MinMaxScaler(copy=True, feature_range=(0, 1)) 28 |

29 |
30 | 31 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # KeplerMapper 2 | 3 | > Nature uses as little as possible of anything. - Johannes Kepler 4 | 5 | This is a class containing a mapping algorithm in Python. KeplerMapper can be used for 6 | visualization of high-dimensional data and 3D point cloud data. 7 | 8 | KeplerMapper employs approaches based on the MAPPER algorithm (Singh et al.) as first 9 | described in the paper "Topological Methods for the Analysis of High Dimensional 10 | Data Sets and 3D Object Recognition". 11 | 12 | KeplerMapper can make use of Scikit-Learn API compatible cluster and scaling algorithms. 13 | 14 | ## Usage 15 | 16 | ### Python code 17 | ```python 18 | # Import the class 19 | import km 20 | 21 | # Some sample data 22 | from sklearn import datasets 23 | data, labels = datasets.make_circles(n_samples=5000, noise=0.03, factor=0.3) 24 | 25 | # Initialize 26 | mapper = km.KeplerMapper(verbose=1) 27 | 28 | # Fit to and transform the data 29 | projected_data = mapper.fit_transform(data, projection=[0,1]) # X-Y axis 30 | 31 | # Create dictionary called 'complex' with nodes, edges and meta-information 32 | complex = mapper.map(projected_data, data, nr_cubes=10) 33 | 34 | # Visualize it 35 | mapper.visualize(complex, path_html="make_circles_keplermapper_output.html", 36 | title="make_circles(n_samples=5000, noise=0.03, factor=0.3)") 37 | ``` 38 | 39 | ### Console output 40 | ``` 41 | ..Projecting data using: [0, 1] 42 | 43 | ..Scaling with: MinMaxScaler(copy=True, feature_range=(0, 1)) 44 | 45 | Mapping on data shaped (5000L, 2L) using dimensions 46 | 47 | Creating 1000 hypercubes. 48 | 49 | created 86 edges and 57 nodes in 0:00:03.614000. 
50 | 51 | Wrote d3.js graph to 'make_circles_keplermapper_output.html' 52 | ``` 53 | 54 | ### Visualization output 55 | 56 | ![Visualization](http://i.imgur.com/i3cqQVr.png "Click for large") 57 | 58 | Click here for an [interactive version](http://mlwave.github.io/tda/make_circles_keplermapper_output2.html). 59 | Click here for an older [interactive version](http://mlwave.github.io/tda/make_circles_keplermapper_output.html). 60 | 61 | ## Install 62 | 63 | The class is currently just one file. Simply dropping it in any directory which Python is able to import from should work. 64 | 65 | ## Required 66 | 67 | These libraries are required to be installed for KeplerMapper to work: 68 | 69 | * NumPy 70 | * Scikit-Learn 71 | 72 | KeplerMapper works on both Python 2.7 and Python 3+. 73 | 74 | ## External resources 75 | 76 | These resources are loaded by the visualization output. 77 | 78 | * Roboto Webfont (Google) 79 | * D3.js (Mike Bostock) 80 | 81 | ## Parameters 82 | 83 | ### Initialize 84 | 85 | ```python 86 | mapper = km.KeplerMapper(verbose=1) 87 | ``` 88 | 89 | Parameter | Description 90 | --- | --- 91 | verbose | Int. Verbosity of the mapper. *Default = 0* 92 | 93 | ### Fitting and transforming 94 | Input the data set. Specify a projection/lens type. Output the projected data/lens. 95 | 96 | ```python 97 | projected_data = mapper.fit_transform(data, projection="sum", 98 | scaler=km.preprocessing.MinMaxScaler() ) 99 | ``` 100 | 101 | Parameter | Description 102 | --- | --- 103 | data | Numpy Array. The data to fit a projection/lens to. *Required* 104 | projection | Any of: list with dimension indices. Scikit-learn API compatible manifold learner or dimensionality reducer. A string from ["sum","mean","median","max","min","std","dist_mean"]. *Default = "sum"* 105 | scaler | Scikit-Learn API compatible scaler. Scaler of the data applied before mapping. Use `None` for no scaling. 
*Default = preprocessing.MinMaxScaler()* 106 | 107 | ### Mapping 108 | 109 | ```python 110 | topological_network = mapper.map(projected_X, inverse_X=None, 111 | clusterer=cluster.DBSCAN(eps=0.5,min_samples=3), 112 | nr_cubes=10, overlap_perc=0.1) 113 | 114 | print(topological_network["nodes"]) 115 | print(topological_network["links"]) 116 | print(topological_network["meta"]) 117 | ``` 118 | 119 | Parameter | Description 120 | --- | --- 121 | projected_X | Numpy array. Output from fit_transform. *Required* 122 | inverse_X | Numpy array or `None`. When `None`, cluster on the projection, else cluster on the original data (inverse image). 123 | clusterer | Scikit-Learn API compatible clustering algorithm. The clustering algorithm to use for mapping. *Default = cluster.DBSCAN(eps=0.5,min_samples=3)* 124 | nr_cubes | Int. The number of cubes/intervals to create. *Default = 10* 125 | overlap_perc | Float. How much the cubes/intervals overlap (relevant for creating the edges). *Default = 0.1* 126 | 127 | ### Visualizing 128 | 129 | ```python 130 | mapper.visualize(topological_network, 131 | path_html="mapper_visualization_output.html") 132 | ``` 133 | 134 | Parameter | Description 135 | --- | --- 136 | topological_network | Dict. The `topological_network`-dictionary with nodes, edges and meta-information. *Required* 137 | path_html | File path. Path where to output the .html file *Default = mapper_visualization_output.html* 138 | title | String. Document title for use in the outputted .html. *Default = "My Data"* 139 | graph_link_distance | Int. Global length of links between nodes. Use less for larger graphs. *Default = 30* 140 | graph_charge | Int. The charge between nodes. Use less negative charge for larger graphs. *Default = -120* 141 | graph_gravity | Float. A weak geometric constraint similar to a virtual spring connecting each node to the center of the layout's size. Don't set this to a negative value, or it's turtles all the way up. 
*Default = 0.1* 142 | custom_tooltips | NumPy Array. Create custom tooltips for all the node members. You could use the target labels `y` for this. Use `None` for standard tooltips. *Default = None*. 143 | show_title | Bool. Whether to show the title. *Default = True* 144 | show_meta | Bool. Whether to show meta information, like the overlap percentage and the clusterer used. *Default = True* 145 | show_tooltips | Bool. Whether to show the tooltips on hover. *Default = True* 146 | width_html | Int. Size in pixels of the graph canvas width. *Default = 0 (full screen width)* 147 | height_html | Int. Size in pixels of the graph canvas height. *Default = 0 (full screen height)* 148 | 149 | ## Examples 150 | 151 | ### 3D-point cloud 152 | 153 | Check the `examples` directory for more. 154 | 155 | ![Visualization](http://i.imgur.com/OQqHt9R.png "Click for large") 156 | 157 | ### Very noisy datasets 158 | 159 | Check the `examples/makecircles` directory for code. 160 | 161 | ![Visualization](http://i.imgur.com/OmETfe5.png "Click for large") 162 | 163 | ### Dimensionality reduction 164 | 165 | t-SNE on 4K images of MNIST dataset. 166 | 167 | ![Visualization](http://i.imgur.com/eRa9sMH.png "Click for large") 168 | 169 | ## References 170 | 171 | > Mapper Algorithm
172 | > "Topological Methods for the Analysis of High Dimensional Data Sets and 3D Object Recognition"
173 | > Gurjeet Singh, Facundo Mémoli, and Gunnar Carlsson 174 | 175 | http://www.ayasdi.com/wp-content/uploads/2015/02/Topological_Methods_for_the_Analysis_of_High_Dimensional_Data_Sets_and_3D_Object_Recognition.pdf 176 | 177 | > Topological Data Analysis
178 | > Stanford Seminar. "Topological Data Analysis: How Ayasdi used TDA to Solve Complex Problems"
179 | > SF Data Mining. "Shape and Meaning."
180 | > Anthony Bak 181 | 182 | https://www.youtube.com/watch?v=x3Hl85OBuc0
183 | https://www.youtube.com/watch?v=4RNpuZydlKY 184 | 185 | > Projection vs. Inverse image & Examples
186 | > MLconf ATL. Topological Learning with Ayasdi
187 | > Allison Gilmore 188 | 189 | https://www.youtube.com/watch?v=cJ8W0ASsnp0 190 | 191 | > The shape of data
192 | > "Conference Talk. The shape of data"
193 | > Topology and Data
194 | > Gunnar Carlsson 195 | 196 | https://www.youtube.com/watch?v=kctyag2Xi8o 197 | http://www.ams.org/images/carlsson-notes.pdf 198 | 199 | > Business Value, Problems, Algorithms, Computation and User Experience of TDA
200 | > Data Driven NYC. "Making Data Work"
201 | > Gurjeet Singh 202 | 203 | https://www.youtube.com/watch?v=UZH5xJXJG2I 204 | 205 | > Implementation details and sample data
206 | > Python Mapper
207 | > Daniel Müllner and Aravindakshan Babu 208 | 209 | http://danifold.net/mapper/index.html 210 | 211 | > Applied Topology
212 | > "Elementary Applied Topology"
213 | > R. Ghrist 214 | 215 | https://www.math.upenn.edu/~ghrist/notes.html 216 | 217 | > Applied Topology
218 | > "Qualitative data analysis"
219 | > Community effort 220 | 221 | http://appliedtopology.org/ 222 | 223 | > Single Linkage Clustering
224 | > "Minimum Spanning Trees and Single Linkage Cluster Analysis"
225 | > J. C. Gower, and G. J. S. Ross 226 | 227 | http://www.cs.ucsb.edu/~veronika/MAE/mstSingleLinkage_GowerRoss_1969.pdf 228 | 229 | > Clustering and Manifold Learning
230 | > Scikit-learn: Machine Learning in Python
231 | > Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P. and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E. 232 | 233 | http://scikit-learn.org/stable/modules/clustering.html
234 | http://scikit-learn.org/stable/modules/manifold.html 235 | 236 | > Force-directed Graphing/Clustering
237 | > Force-directed Graphs
238 | > Mike Bostock, Tim Dwyer, Thomas Jakobsen 239 | 240 | http://bl.ocks.org/mbostock/4062045 241 | 242 | > Graphing
243 | > Grapher
244 | > Cindy Zhang, Danny Cochran, Diana Suvorova, Curtis Mitchell 245 | 246 | https://github.com/ayasdi/grapher 247 | 248 | > Color scales
249 | > "Creating A Custom Hot to Cold Temperature Color Gradient for use with RRDTool"
250 | > Dale Reagan 251 | 252 | http://web-tech.ga-usa.com/2012/05/creating-a-custom-hot-to-cold-temperature-color-gradient-for-use-with-rrdtool/ 253 | 254 | > Design
255 | > Material Design
256 | > Google 257 | 258 | https://design.google.com/ 259 | 260 | > Design
261 | > Ayasdi Core Product Screenshots
262 | > Ayasdi 263 | 264 | http://www.ayasdi.com/product/core/ 265 | 266 | ## Disclaimer 267 | 268 | See disclaimer.txt for more. Basically this is a work in progress to familiarize myself with topological data analysis. The details of the algorithm implementations may be lacking. I'll gladly accept feedback and pull requests to make it more robust. You can contact me at info@mlwave.com or by opening an issue. -------------------------------------------------------------------------------- /depricated/km.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import numpy as np 3 | from collections import defaultdict 4 | import json 5 | import itertools 6 | from sklearn import cluster, preprocessing, manifold 7 | from datetime import datetime 8 | import sys 9 | 10 | class KeplerMapper(object): 11 | def __init__(self, cluster_algorithm=cluster.DBSCAN(eps=0.5,min_samples=3), nr_cubes=10, 12 | overlap_perc=0.1, scaler=preprocessing.MinMaxScaler(), reducer=None, color_function="distance_origin", 13 | link_local=False, verbose=1): 14 | self.clf = cluster_algorithm # NOTE: the default DBSCAN and MinMaxScaler instances are created once, at definition time, and are shared by every KeplerMapper built without explicit arguments 15 | self.nr_cubes = nr_cubes 16 | self.overlap_perc = overlap_perc 17 | self.scaler = scaler 18 | self.color_function = color_function 19 | self.verbose = verbose 20 | self.link_local = link_local 21 | self.reducer = reducer 22 | # chunk_dist, overlap_dist and d are placeholders here; fit_transform computes them 23 | self.chunk_dist = [] 24 | self.overlap_dist = [] 25 | self.d = [] 26 | 27 | if self.verbose > 0: 28 | print("\nnr_cubes = %s \n\noverlap_perc = %s\n\nlink_local = %s\n\nClusterer = %s\n\nScaler = %s\n\n"%(self.nr_cubes, overlap_perc, self.link_local, str(self.clf),str(self.scaler))) 29 | 30 | def fit_transform(self, X): 31 | # Dimensionality Reduction (skipped when no reducer was given) 32 | if self.reducer != None: 33 | if self.verbose > 0: 34 | try: 35 | self.reducer.set_params(**{"verbose":self.verbose}) 36 | except: 37 | pass # best effort: any failure to set `verbose` on the reducer is ignored 38 | print("\n..Reducing Dimensionality using: \n\t%s\n"%str(self.reducer)) 39 | 40 | reducer = self.reducer 41 | X = 
reducer.fit_transform(X) 42 | 43 | # Scaling 44 | if self.scaler != None: 45 | if self.verbose > 0: 46 | print("\n..Scaling\n") 47 | scaler = self.scaler 48 | X = scaler.fit_transform(X) 49 | 50 | # We chop up the min-max column ranges into 'nr_cubes' parts 51 | self.chunk_dist = (np.max(X, axis=0) - np.min(X, axis=0))/self.nr_cubes 52 | 53 | # We calculate the overlapping windows distance 54 | self.overlap_dist = self.overlap_perc * self.chunk_dist 55 | 56 | # We find our starting point 57 | self.d = np.min(X, axis=0) 58 | 59 | return X 60 | 61 | def map(self, X, dimension_index=[0], dimension_name=""): 62 | # This maps the data to a simplicial complex. Returns a dictionary with nodes and links. 63 | # dimension_index: list of column indices of X that are sliced into hypercubes. dimension_name: label stored under complex["meta"]. 64 | start = datetime.now() 65 | 66 | def cube_coordinates_all(nr_cubes, nr_dimensions): 67 | # if there are 4 cubes per dimension and 3 dimensions 68 | # return the bottom left (origin) coordinates of 64 hypercubes, in a sorted list of Numpy arrays 69 | l = [] 70 | for x in range(nr_cubes): 71 | l += [x] * nr_dimensions 72 | return [np.array(list(f)) for f in sorted(set(itertools.permutations(l,nr_dimensions)))] 73 | 74 | nodes = defaultdict(list) 75 | links = defaultdict(list) 76 | complex = {} 77 | 78 | if self.verbose > 0: 79 | print("Mapping on data shaped %s using dimensions %s\n"%(str(X.shape),str(dimension_index))) 80 | 81 | # Scaling (the scaler is re-fitted on X here; self.d and self.chunk_dist still come from fit_transform) 82 | if self.scaler != None: 83 | scaler = self.scaler 84 | X = scaler.fit_transform(X) 85 | 86 | # Initialize Cluster Algorithm 87 | clf = self.clf 88 | 89 | # Prefixing the data with a row-id column (column 0), so members can be traced back after slicing 90 | ids = np.array([x for x in range(X.shape[0])]) 91 | X = np.c_[ids,X] 92 | 93 | # Subdivide the data X in intervals/hypercubes with overlap 94 | if self.verbose > 0: 95 | total_cubes = len(cube_coordinates_all(self.nr_cubes,len(dimension_index))) 96 | print("Creating %s hypercubes."%total_cubes) 97 | di = np.array(dimension_index) 98 | for i, coor in enumerate(cube_coordinates_all(self.nr_cubes,di.shape[0])): 99 | # Slice the hypercube 
100 | hypercube = X[ np.invert(np.any((X[:,di+1] >= self.d[di] + (coor * self.chunk_dist[di])) & 101 | (X[:,di+1] < self.d[di] + (coor * self.chunk_dist[di]) + self.chunk_dist[di] + self.overlap_dist[di]) == False, axis=1 )) ] 102 | # (rows kept are those whose every selected dimension lies in [start, start + chunk_dist + overlap_dist)) 103 | if self.verbose > 1: 104 | print("There are %s points in cube_%s / %s with starting range %s"% 105 | (hypercube.shape[0],i,total_cubes,self.d[di] + (coor * self.chunk_dist[di]))) 106 | 107 | # If at least one sample inside the hypercube 108 | if hypercube.shape[0] > 0: 109 | # Cluster the data point(s) inside the cube, skipping the id-column 110 | clf.fit(hypercube[:,1:]) 111 | 112 | if self.verbose > 1: 113 | print("Found %s clusters in cube_%s\n"%(np.unique(clf.labels_[clf.labels_ > -1]).shape[0],i)) 114 | 115 | #Now for every (sample id in cube, predicted cluster label) 116 | for a in np.c_[hypercube[:,0],clf.labels_]: 117 | if a[1] != -1: #if not predicted as noise 118 | cluster_id = str(coor[0])+"_"+str(i)+"_"+str(a[1])+"_"+str(coor)+"_"+str(self.d[di] + (coor * self.chunk_dist[di])) # Rudimentary cluster id; its first "_"-separated field (coor[0]) is what the link_local check and the node color in visualize read back 119 | nodes[cluster_id].append( int(a[0]) ) # Append the member id's as integers 120 | else: 121 | if self.verbose > 1: 122 | print("Cube_%s is empty.\n"%(i)) 123 | 124 | # Create links when clusters from different hypercubes have members with the same sample id. Every ordered pair (k, kn) is visited, so each edge is recorded in both directions. 125 | for k in nodes: 126 | for kn in nodes: 127 | if k != kn: 128 | if len(nodes[k] + nodes[kn]) != len(set(nodes[kn] + nodes[k])): # there are non-unique id's in the union 129 | links[k].append( kn ) 130 | 131 | # Create links between local hypercube clusters if setting link_local = True 132 | # This is an experimental feature deviating too much from the original mapper algo. 133 | # Creates a lot of spurious edges, and should only be used when mapping one or at most two dimensions. 
134 | if self.link_local: 135 | if k.split("_")[0] == kn.split("_")[0]: 136 | links[k].append( kn ) 137 | 138 | # Reporting 139 | if self.verbose > 0: 140 | nr_links = 0 141 | for k in links: 142 | nr_links += len(links[k]) 143 | print("\ncreated %s edges and %s nodes in %s."%(nr_links,len(nodes),str(datetime.now()-start))) 144 | 145 | complex["nodes"] = nodes 146 | complex["links"] = links 147 | complex["meta"] = dimension_name 148 | 149 | return complex 150 | 151 | def visualize(self, complex, path_html="mapper_visualization_output.html", title="My Data", graph_link_distance=30, graph_gravity=0.1, graph_charge=-120, custom_tooltips=None, width_html=0, height_html=0, show_tooltips=True, show_title=True, show_meta=True): 152 | # Turns the dictionary 'complex' into an HTML file with d3.js 153 | 154 | # Format JSON 155 | json_s = {} 156 | json_s["nodes"] = [] 157 | json_s["links"] = [] 158 | k2e = {} # a key to incremental int dict, used for id's when linking 159 | 160 | for e, k in enumerate(complex["nodes"]): 161 | # Tooltip formatting. NOTE(review): `!= None` on a NumPy array compares element-wise; `is not None` is probably what is intended here - confirm 162 | if custom_tooltips != None: 163 | tooltip_s = "

Cluster %s

"%k + " ".join([str(f) for f in custom_tooltips[complex["nodes"][k]]]) 164 | if self.color_function == "average_signal_cluster": 165 | tooltip_i = int(((sum([f for f in custom_tooltips[complex["nodes"][k]]]) / len(custom_tooltips[complex["nodes"][k]])) * 30) ) 166 | json_s["nodes"].append({"name": str(k), "tooltip": tooltip_s, "group": 2 * int(np.log(len(complex["nodes"][k]))), "color": str(tooltip_i)}) 167 | else: 168 | json_s["nodes"].append({"name": str(k), "tooltip": tooltip_s, "group": 2 * int(np.log(len(complex["nodes"][k]))), "color": str(k.split("_")[0])}) 169 | else: 170 | tooltip_s = "

Cluster %s

Contains %s members."%(k,len(complex["nodes"][k])) 171 | json_s["nodes"].append({"name": str(k), "tooltip": tooltip_s, "group": 2 * int(np.log(len(complex["nodes"][k]))), "color": str(k.split("_")[0])}) 172 | k2e[k] = e 173 | for k in complex["links"]: 174 | for link in complex["links"][k]: 175 | json_s["links"].append({"source": k2e[k], "target":k2e[link],"value":1}) 176 | 177 | # Width and height of graph in HTML output 178 | if width_html == 0: 179 | width_css = "100%" 180 | width_js = 'document.getElementById("holder").offsetWidth-20' 181 | else: 182 | width_css = "%spx" % width_html 183 | width_js = "%s" % width_html 184 | if height_html == 0: 185 | height_css = "100%" 186 | height_js = 'document.getElementById("holder").offsetHeight-20' 187 | else: 188 | height_css = "%spx" % height_html 189 | height_js = "%s" % height_html 190 | 191 | # Whether to show certain UI elements or not 192 | if show_tooltips == False: 193 | tooltips_display = "display: none;" 194 | else: 195 | tooltips_display = "" 196 | 197 | if show_meta == False: 198 | meta_display = "display: none;" 199 | else: 200 | meta_display = "" 201 | 202 | if show_title == False: 203 | title_display = "display: none;" 204 | else: 205 | title_display = "" 206 | 207 | with open(path_html,"wb") as outfile: 208 | html = """ 209 | 210 | 211 | %s | KeplerMapper 212 | 213 | 227 | 228 |
229 |

%s

230 |

231 | Lens
%s

232 | Cubes per dimension
%s

233 | Overlap percentage
%s%%

234 | 235 | Color Function
%s( %s )

236 | Clusterer
%s

237 | Scaler
%s 238 |

239 |
240 | 241 | """%(title,width_css, height_css, title_display, meta_display, tooltips_display, title,complex["meta"],self.nr_cubes,self.overlap_perc*100,self.link_local,self.color_function,complex["meta"],str(self.clf),str(self.scaler),width_js,height_js,graph_charge,graph_link_distance,graph_gravity,json.dumps(json_s)) 313 | outfile.write(html.encode("utf-8")) 314 | if self.verbose > 0: 315 | print("\nWrote d3.js graph to '%s'"%path_html) -------------------------------------------------------------------------------- /km.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import numpy as np 3 | from collections import defaultdict 4 | import json 5 | import itertools 6 | from sklearn import cluster, preprocessing, manifold 7 | from datetime import datetime 8 | import sys 9 | 10 | class KeplerMapper(object): 11 | # With this class you can build topological networks from (high-dimensional) data. 12 | # 13 | # 1) Fit a projection/lens/function to a dataset and transform it. 14 | # For instance "mean_of_row(x) for x in X" 15 | # 2) Map this projection with overlapping intervals/hypercubes. 16 | # Cluster the points inside the interval 17 | # (Note: we cluster on the inverse image/original data to lessen projection loss). 18 | # If two clusters/nodes have the same members (due to the overlap), then: 19 | # connect these with an edge. 20 | # 3) Visualize the network using HTML and D3.js. 
21 | # 22 | # functions 23 | # --------- 24 | # fit_transform: Create a projection (lens) from a dataset 25 | # map: Apply Mapper algorithm on this projection and build a simplicial complex 26 | # visualize: Turns the complex dictionary into a HTML/D3.js visualization 27 | 28 | def __init__(self, verbose=2): 29 | self.verbose = verbose # progress messages are printed when > 0, per-cube details when > 1 30 | # The attributes below are placeholders; map() overwrites all of them 31 | self.chunk_dist = [] 32 | self.overlap_dist = [] 33 | self.d = [] 34 | self.nr_cubes = 0 35 | self.overlap_perc = 0 36 | self.clusterer = False 37 | 38 | def fit_transform(self, X, projection="sum", scaler=preprocessing.MinMaxScaler()): 39 | # Creates the projection/lens from X. 40 | # 41 | # Input: X. Input features as a numpy array. 42 | # Output: projected_X. original data transformed to a projection (lens). 43 | # 44 | # parameters 45 | # ---------- 46 | # projection: Projection parameter is either a string, 47 | # a scikit class with fit_transform, like manifold.TSNE(), 48 | # or a list of dimension indices. 49 | # scaler: if None, do no scaling, else apply scaling to the projection 50 | # Default: Min-Max scaling 51 | # NOTE: the default MinMaxScaler instance is created once, when the method is defined, and is reused by every call that omits `scaler` 52 | self.scaler = scaler 53 | self.projection = str(projection) 54 | 55 | # Detect if projection is a class (for scikit-learn) 56 | if str(type(projection))[1:6] == "class": #TODO: de-ugly-fy. NOTE(review): on Python 3, str(type(x))[1:6] is also "class" for str and list objects, so those projections appear to reach this branch as well - confirm 57 | reducer = projection 58 | if self.verbose > 0: 59 | try: 60 | projection.set_params(**{"verbose":self.verbose}) 61 | except: 62 | pass 63 | print("\n..Projecting data using: \n\t%s\n"%str(projection)) 64 | X = reducer.fit_transform(X) 65 | 66 | # Detect if projection is a string (for standard functions) 67 | if isinstance(projection, str): 68 | if self.verbose > 0: 69 | print("\n..Projecting data using: %s"%(projection)) 70 | # Stats lenses 71 | if projection == "sum": # sum of row 72 | X = np.sum(X, axis=1).reshape((X.shape[0],1)) 73 | if projection == "mean": # mean of row 74 | X = np.mean(X, axis=1).reshape((X.shape[0],1)) 75 | if projection == "median": # median of row 76 | X = np.median(X, 
axis=1).reshape((X.shape[0],1)) 77 | if projection == "max": # max of row 78 | X = np.max(X, axis=1).reshape((X.shape[0],1)) 79 | if projection == "min": # min of row 80 | X = np.min(X, axis=1).reshape((X.shape[0],1)) 81 | if projection == "std": # std of row 82 | X = np.std(X, axis=1).reshape((X.shape[0],1)) 83 | 84 | if projection == "dist_mean": # Distance of each row to the column-wise mean of X (sum of absolute per-feature differences) 85 | X_mean = np.mean(X, axis=0) 86 | X = np.sum(np.sqrt((X - X_mean)**2), axis=1).reshape((X.shape[0],1)) 87 | 88 | # Detect if projection is a list (with dimension indices) 89 | if isinstance(projection, list): 90 | if self.verbose > 0: 91 | print("\n..Projecting data using: %s"%(str(projection))) 92 | X = X[:,np.array(projection)] 93 | 94 | # Scaling 95 | if scaler is not None: 96 | if self.verbose > 0: 97 | print("\n..Scaling with: %s\n"%str(scaler)) 98 | X = scaler.fit_transform(X) 99 | 100 | return X 101 | 102 | def map(self, projected_X, inverse_X=None, clusterer=cluster.DBSCAN(eps=0.5,min_samples=3), nr_cubes=10, overlap_perc=0.1): 103 | # This maps the data to a simplicial complex. Returns a dictionary with nodes and links. 104 | # 105 | # Input: projected_X. A Numpy array with the projection/lens. 106 | # Output: complex. A dictionary with "nodes", "links" and "meta information" 107 | # 108 | # parameters 109 | # ---------- 110 | # projected_X A Numpy array with the projection/lens. Required. 111 | # inverse_X Numpy array or None. If None then the projection itself is used for clustering. 112 | # clusterer Scikit-learn API compatible clustering algorithm. Default: DBSCAN 113 | # nr_cubes Int. The number of intervals per dimension (nr_cubes ** n_dimensions hypercubes in total). 114 | # overlap_perc Float. Overlap "between" the intervals/hypercubes as a fraction of the interval length (0.1 = 10%).
115 | 116 | start = datetime.now() 117 | 118 | # Helper function 119 | def cube_coordinates_all(nr_cubes, nr_dimensions): 120 | # Helper function to get origin coordinates for our intervals/hypercubes 121 | # Useful for looping no matter the number of cubes or dimensions 122 | # Example: if there are 4 cubes per dimension and 3 dimensions 123 | # return the bottom left (origin) coordinates of 64 hypercubes, 124 | # as a sorted list of Numpy arrays 125 | # TODO: elegance-ify... (this enumerates permutations of a list of nr_cubes * nr_dimensions entries and de-duplicates them, i.e. every nr_dimensions-tuple over range(nr_cubes)) 126 | l = [] 127 | for x in range(nr_cubes): 128 | l += [x] * nr_dimensions 129 | return [np.array(list(f)) for f in sorted(set(itertools.permutations(l,nr_dimensions)))] 130 | # nodes: cluster id -> member sample ids; links: cluster id -> ids of linked clusters 131 | nodes = defaultdict(list) 132 | links = defaultdict(list) 133 | complex = {} 134 | self.nr_cubes = nr_cubes 135 | self.clusterer = clusterer 136 | self.overlap_perc = overlap_perc 137 | 138 | if self.verbose > 0: 139 | print("Mapping on data shaped %s using dimensions\n"%(str(projected_X.shape))) 140 | 141 | # If inverse image is not provided, we use the projection as the inverse image (suffer projection loss) 142 | if inverse_X is None: 143 | inverse_X = projected_X 144 | 145 | # We chop up the min-max column ranges into 'nr_cubes' parts 146 | self.chunk_dist = (np.max(projected_X, axis=0) - np.min(projected_X, axis=0))/nr_cubes 147 | 148 | # We calculate the overlapping windows distance 149 | self.overlap_dist = self.overlap_perc * self.chunk_dist 150 | 151 | # We find our starting point 152 | self.d = np.min(projected_X, axis=0) 153 | 154 | # Use a dimension index array on the projected X 155 | # (For now this uses the entire dimensionality, but we keep for experimentation) 156 | di = np.array([x for x in range(projected_X.shape[1])]) 157 | 158 | # Prefixing both arrays with a row-id column (column 0), so members can be traced back after slicing 159 | ids = np.array([x for x in range(projected_X.shape[0])]) 160 | projected_X = np.c_[ids,projected_X] 161 | inverse_X = np.c_[ids,inverse_X] 162 | 163 | # Subdivide the projected data X in intervals/hypercubes with overlap 164 | if 
self.verbose > 0: 165 | total_cubes = len(cube_coordinates_all(nr_cubes,projected_X.shape[1])) # NOTE(review): projected_X already carries the id column here, so this counts cubes for one dimension more than the loop below (which uses di.shape[0]) - confirm 166 | print("Creating %s hypercubes."%total_cubes) 167 | 168 | for i, coor in enumerate(cube_coordinates_all(nr_cubes,di.shape[0])): 169 | # Slice the hypercube: keep rows whose every lens dimension lies in [start, start + chunk_dist + overlap_dist) 170 | hypercube = projected_X[ np.invert(np.any((projected_X[:,di+1] >= self.d[di] + (coor * self.chunk_dist[di])) & 171 | (projected_X[:,di+1] < self.d[di] + (coor * self.chunk_dist[di]) + self.chunk_dist[di] + self.overlap_dist[di]) == False, axis=1 )) ] 172 | 173 | if self.verbose > 1: 174 | print("There are %s points in cube_%s / %s with starting range %s"% 175 | (hypercube.shape[0],i,total_cubes,self.d[di] + (coor * self.chunk_dist[di]))) 176 | 177 | # If at least one sample inside the hypercube 178 | if hypercube.shape[0] > 0: 179 | # Cluster the data point(s) in the cube, skipping the id-column 180 | # Note that we apply clustering on the inverse image (original data samples) that fall inside the cube. 181 | inverse_x = inverse_X[[int(nn) for nn in hypercube[:,0]]] 182 | 183 | clusterer.fit(inverse_x[:,1:]) 184 | 185 | if self.verbose > 1: 186 | print("Found %s clusters in cube_%s\n"%(np.unique(clusterer.labels_[clusterer.labels_ > -1]).shape[0],i)) 187 | 188 | #Now for every (sample id in cube, predicted cluster label) 189 | for a in np.c_[hypercube[:,0],clusterer.labels_]: 190 | if a[1] != -1: #if not predicted as noise 191 | cluster_id = str(coor[0])+"_"+str(i)+"_"+str(a[1])+"_"+str(coor)+"_"+str(self.d[di] + (coor * self.chunk_dist[di])) # TODO: de-rudimentary-ify (the first "_"-separated field, coor[0], is what visualize reads back as the node color) 192 | nodes[cluster_id].append( int(a[0]) ) # Append the member id's as integers 193 | else: 194 | if self.verbose > 1: 195 | print("Cube_%s is empty.\n"%(i)) 196 | 197 | # Create links when clusters from different hypercubes have members with the same sample id. 
198 | candidates = itertools.combinations(nodes.keys(),2) 199 | for candidate in candidates: 200 | # if the two member lists share at least one sample id (their concatenation contains duplicates); each unordered pair is visited once, so the edge is stored under candidate[0] only 201 | if len(nodes[candidate[0]]+nodes[candidate[1]]) != len(set(nodes[candidate[0]]+nodes[candidate[1]])): 202 | links[candidate[0]].append( candidate[1] ) 203 | 204 | # Reporting 205 | if self.verbose > 0: 206 | nr_links = 0 207 | for k in links: 208 | nr_links += len(links[k]) 209 | print("\ncreated %s edges and %s nodes in %s."%(nr_links,len(nodes),str(datetime.now()-start))) 210 | 211 | complex["nodes"] = nodes 212 | complex["links"] = links 213 | complex["meta"] = self.projection # NOTE(review): self.projection is only assigned in fit_transform - confirm map is never called without it 214 | 215 | return complex 216 | 217 | def visualize(self, complex, color_function="", path_html="mapper_visualization_output.html", title="My Data", 218 | graph_link_distance=30, graph_gravity=0.1, graph_charge=-120, custom_tooltips=None, width_html=0, 219 | height_html=0, show_tooltips=True, show_title=True, show_meta=True): 220 | # Turns the dictionary 'complex' into an HTML file with d3.js 221 | # 222 | # Input: complex. Dictionary (output from calling .map()) 223 | # Output: a HTML page saved as a file in 'path_html'. 224 | # 225 | # parameters 226 | # ---------- 227 | # color_function string. Not fully implemented. Default: "" (distance to origin) 228 | # path_html file path as string. Where to save the HTML page. 229 | # title string. HTML page document title and first heading. 230 | # graph_link_distance int. Edge length. 231 | # graph_gravity float. "Gravity" to center of layout. 232 | # graph_charge int. charge between nodes. 233 | # custom_tooltips None or Numpy Array. You could use "y"-label array for this. 234 | # width_html int. Width of canvas. Default: 0 (full width) 235 | # height_html int. Height of canvas. Default: 0 (full height) 236 | # show_tooltips bool. default:True 237 | # show_title bool. default:True 238 | # show_meta bool.
default:True 239 | 240 | # Format JSON for D3 graph 241 | json_s = {} 242 | json_s["nodes"] = [] 243 | json_s["links"] = [] 244 | k2e = {} # a key to incremental int dict, used for id's when linking 245 | 246 | for e, k in enumerate(complex["nodes"]): 247 | # Tooltip and node color formatting, TODO: de-mess-ify 248 | if custom_tooltips is not None: 249 | tooltip_s = "

Cluster %s

"%k + " ".join([str(f) for f in custom_tooltips[complex["nodes"][k]]]) 250 | if color_function == "average_signal_cluster": 251 | tooltip_i = int(((sum([f for f in custom_tooltips[complex["nodes"][k]]]) / len(custom_tooltips[complex["nodes"][k]])) * 30) ) 252 | json_s["nodes"].append({"name": str(k), "tooltip": tooltip_s, "group": 2 * int(np.log(len(complex["nodes"][k]))), "color": str(tooltip_i)}) 253 | else: 254 | json_s["nodes"].append({"name": str(k), "tooltip": tooltip_s, "group": 2 * int(np.log(len(complex["nodes"][k]))), "color": str(k.split("_")[0])}) 255 | else: 256 | tooltip_s = "

Cluster %s

Contains %s members."%(k,len(complex["nodes"][k])) 257 | json_s["nodes"].append({"name": str(k), "tooltip": tooltip_s, "group": 2 * int(np.log(len(complex["nodes"][k]))), "color": str(k.split("_")[0])}) 258 | k2e[k] = e 259 | for k in complex["links"]: 260 | for link in complex["links"][k]: 261 | json_s["links"].append({"source": k2e[k], "target":k2e[link],"value":1}) 262 | 263 | # Width and height of graph in HTML output 264 | if width_html == 0: 265 | width_css = "100%" 266 | width_js = 'document.getElementById("holder").offsetWidth-20' 267 | else: 268 | width_css = "%spx" % width_html 269 | width_js = "%s" % width_html 270 | if height_html == 0: 271 | height_css = "100%" 272 | height_js = 'document.getElementById("holder").offsetHeight-20' 273 | else: 274 | height_css = "%spx" % height_html 275 | height_js = "%s" % height_html 276 | 277 | # Whether to show certain UI elements or not 278 | if show_tooltips == False: 279 | tooltips_display = "display: none;" 280 | else: 281 | tooltips_display = "" 282 | 283 | if show_meta == False: 284 | meta_display = "display: none;" 285 | else: 286 | meta_display = "" 287 | 288 | if show_title == False: 289 | title_display = "display: none;" 290 | else: 291 | title_display = "" 292 | 293 | with open(path_html,"wb") as outfile: 294 | html = """ 295 | 296 | 297 | %s | KeplerMapper 298 | 299 | 313 | 314 |
315 |

%s

316 |

317 | Lens
%s

318 | Cubes per dimension
%s

319 | Overlap percentage
%s%%

320 | Color Function
%s( %s )

321 | Clusterer
%s

322 | Scaler
%s 323 |

324 |
325 | 326 | """%(title,width_css, height_css, title_display, meta_display, tooltips_display, title,complex["meta"],self.nr_cubes,self.overlap_perc*100,color_function,complex["meta"],str(self.clusterer),str(self.scaler),width_js,height_js,graph_charge,graph_link_distance,graph_gravity,json.dumps(json_s)) 390 | outfile.write(html.encode("utf-8")) 391 | if self.verbose > 0: 392 | print("\nWrote d3.js graph to '%s'"%path_html) -------------------------------------------------------------------------------- /examples/makecircles/keplermapper-makecircles-xaxis.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | datasets.make_circles(n_samples=5000, noise=0.05, factor=0.3) | KeplerMapper 5 | 6 | 20 | 21 |
22 |

datasets.make_circles(n_samples=5000, noise=0.05, factor=0.3)

23 |

24 | Lens
[0]

25 | Cubes per dimension
20

26 | Overlap percentage
10.0%

27 | Color Function
average_signal_cluster( [0] )

28 | Clusterer
DBSCAN(algorithm='auto', eps=0.1, leaf_size=30, metric='euclidean', 29 | min_samples=10, p=None, random_state=None)

30 | Scaler
MinMaxScaler(copy=True, feature_range=(0, 1)) 31 |

32 |
33 | 34 | -------------------------------------------------------------------------------- /examples/makecircles/keplermapper-makecircles-distmean.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | datasets.make_circles(n_samples=5000, noise=0.3, factor=0.3) | KeplerMapper 5 | 6 | 20 | 21 |
22 |

datasets.make_circles(n_samples=5000, noise=0.3, factor=0.3)

23 |

24 | Lens
dist_mean

25 | Cubes per dimension
30

26 | Overlap percentage
70.0%

27 | Color Function
average_signal_cluster( dist_mean )

28 | Clusterer
DBSCAN(algorithm='auto', eps=0.9, leaf_size=30, metric='euclidean', 29 | min_samples=3, p=None, random_state=None)

30 | Scaler
MinMaxScaler(copy=True, feature_range=(0, 1)) 31 |

32 |
33 | 34 | -------------------------------------------------------------------------------- /examples/digits/keplermapper_digits_ylabel_tooltips.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Digits | KeplerMapper 5 | 6 | 30 | 31 |
32 |

Digits

33 |

34 | Lens
t-SNE(2) 2D

35 | Number of cubes
35

36 | Overlap percentage
90.0%

37 | Linking locally
False

38 | Color Function
Distance to min(t-SNE(2) 2D)

39 | Clusterer
DBSCAN(algorithm='auto', eps=0.3, leaf_size=30, metric='euclidean', 40 | min_samples=15, p=None, random_state=None)

41 | Scaler
MinMaxScaler(copy=True, feature_range=(0, 1)) 42 |

43 |
44 | 45 | --------------------------------------------------------------------------------