├── sstsne
│   ├── __init__.py
│   ├── _utils.pyx
│   ├── _barnes_hut_tsne.pyx
│   └── ss_t_sne.py
├── README.md
├── LICENSE
├── .gitignore
└── setup.py

/sstsne/__init__.py:
--------------------------------------------------------------------------------
1 | from .ss_t_sne import SemiSupervisedTSNE
2 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # sstsne
2 | Semi-Supervised t-SNE using a Bayesian prior based on partial labelling
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2016, Leland McInnes
2 | All rights reserved.
3 | 
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions are met:
6 | 
7 | * Redistributions of source code must retain the above copyright notice, this
8 |   list of conditions and the following disclaimer.
9 | 
10 | * Redistributions in binary form must reproduce the above copyright notice,
11 |   this list of conditions and the following disclaimer in the documentation
12 |   and/or other materials provided with the distribution.
13 | 
14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
15 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
20 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
21 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
22 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 | 
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 | 
60 | # Scrapy stuff:
61 | .scrapy
62 | 
63 | # Sphinx documentation
64 | docs/_build/
65 | 
66 | # PyBuilder
67 | target/
68 | 
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 | 
72 | # pyenv
73 | .python-version
74 | 
75 | # celery beat schedule file
76 | celerybeat-schedule
77 | 
78 | # dotenv
79 | .env
80 | 
81 | # virtualenv
82 | venv/
83 | ENV/
84 | 
85 | # Spyder project settings
86 | .spyderproject
87 | 
88 | # Rope project settings
89 | .ropeproject
90 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | 
3 | try:
4 |     from Cython.Distutils import build_ext
5 |     from setuptools import setup, Extension
6 |     HAVE_CYTHON = True
7 | except ImportError as e:
8 |     warnings.warn(str(e))
9 |     from setuptools import setup, Extension
10 |     from setuptools.command.build_ext import build_ext
11 |     HAVE_CYTHON = False
12 | 
13 | import numpy
14 | 
15 | _utils = Extension('sstsne._utils',
16 |                    sources=['sstsne/_utils.pyx'],
17 |                    include_dirs=[numpy.get_include()])
18 | 
19 | _barnes_hut_tsne = Extension('sstsne._barnes_hut_tsne',
20 |                              sources=['sstsne/_barnes_hut_tsne.pyx'],
21 |                              include_dirs=[numpy.get_include(), '/System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/Headers/'])
22 | 
23 | def readme():
24 |     with open('README.md') as readme_file:
25 |         return readme_file.read()
26 | 
27 | configuration = {
28 |     'name' : 'sstsne',
29 |     'version' : '0.1',
30 |     'description' : 'Semi-Supervised t-SNE using a Bayesian prior based on partial labelling',
31 |     'long_description' : readme(),
32 |     'classifiers' : [
33 |         'Development Status :: 3 - Alpha',
34 |         'Intended Audience :: Science/Research',
35 |         'Intended Audience :: Developers',
36 |         'License :: OSI Approved',
37 |         'Programming Language :: C',
38 |         'Programming Language :: Python',
39 |         'Topic :: Software Development',
40 |         'Topic :: Scientific/Engineering',
41 |         'Operating System :: Microsoft :: Windows',
42 |         'Operating System :: POSIX',
43 |         'Operating System :: Unix',
44 |         'Operating System :: MacOS',
45 |         'Programming Language :: Python :: 2.7',
46 |         'Programming Language :: Python :: 3.4',
47 |     ],
48 |     'keywords' : 'tsne semi-supervised dimension reduction',
49 |     'url' : 'http://github.com/lmcinnes/sstsne',
50 |     'maintainer' : 'Leland McInnes',
51 |     'maintainer_email' : 'leland.mcinnes@gmail.com',
52 |     'license' : 'BSD',
53 |     'packages' : ['sstsne'],
54 |     'install_requires' : ['scikit-learn>=0.17.1',
55 |                           'cython >= 0.17'],
56 |     'ext_modules' : [_utils,
57 |                      _barnes_hut_tsne],
58 |     'cmdclass' : {'build_ext' : build_ext},
59 |     'test_suite' : 'nose.collector',
60 |     'tests_require' : ['nose'],
61 |     }
62 | 
63 | if not HAVE_CYTHON:
64 |     _utils.sources[0] = 'sstsne/_utils.c'
65 |     _barnes_hut_tsne.sources[0] = 'sstsne/_barnes_hut_tsne.c'
66 |     configuration['install_requires'] = ['scikit-learn>=0.17.1']
67 | 
68 | setup(**configuration)
--------------------------------------------------------------------------------
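Build note: the package compiles its Cython extensions at install time, so a local checkout is typically built with `python setup.py build_ext --inplace` or installed with `pip install .`; as handled in setup.py above, the build falls back to the pre-generated C sources when Cython is not available.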
/sstsne/_utils.pyx:
--------------------------------------------------------------------------------
1 | from libc cimport math
2 | cimport cython
3 | import numpy as np
4 | cimport numpy as np
5 | from libc.stdio cimport printf
6 | cdef extern from "numpy/npy_math.h":
7 |     float NPY_INFINITY
8 | 
9 | 
10 | cdef float EPSILON_DBL = 1e-8
11 | cdef float PERPLEXITY_TOLERANCE = 1e-5
12 | 
13 | @cython.boundscheck(False)
14 | cpdef np.ndarray[np.float32_t, ndim=2] _binary_search_perplexity(
15 |         np.ndarray[np.float32_t, ndim=2] affinities,
16 |         np.ndarray[np.int64_t, ndim=2] neighbors,
17 |         np.ndarray[np.int64_t, ndim=1] labels,
18 |         float label_importance,
19 |         int rep_samples,
20 |         float desired_perplexity,
21 |         int verbose):
22 |     """Binary search for sigmas of conditional Gaussians.
23 | 
24 |     When a neighbors array is supplied, this reduces the computational
25 |     complexity of the search from O(N^2) to O(uN).
26 | 
27 |     Parameters
28 |     ----------
29 |     affinities : array-like, shape (n_samples, n_samples)
30 |         Distances between training samples.
31 | 
32 |     neighbors : array-like, shape (n_samples, K) or None
33 |         Each row contains the indices to the K nearest neighbors. If this
34 |         array is None, then the perplexity is estimated over all data,
35 |         not just the nearest neighbors.
36 | 
37 |     labels : array-like, shape (n_samples,)
38 |         Integer labels for the samples. Unlabelled samples should have label -1.
39 | 
40 |     label_importance : float
41 |         Relative importance to place on the labelling.
42 | 
43 |     rep_samples : int
44 |         Whether the partial labels are a representative sample of the full labelling.
45 | 
46 |     desired_perplexity : float
47 |         Desired perplexity (2^entropy) of the conditional Gaussians.
48 | 
49 |     verbose : int
50 |         Verbosity level.
51 | 
52 |     Returns
53 |     -------
54 |     P : array, shape (n_samples, n_samples)
55 |         Probabilities of conditional Gaussian distributions p_i|j.
56 |     """
57 |     # Maximum number of binary search steps
58 |     cdef long n_steps = 100
59 | 
60 |     cdef long n_samples = affinities.shape[0]
61 |     # This array is later used as a 32bit array. It has multiple intermediate
62 |     # floating point additions that benefit from the extra precision
63 |     cdef np.ndarray[np.float64_t, ndim=2] P = np.zeros((n_samples, n_samples),
64 |                                                        dtype=np.float64)
65 |     # Precisions of conditional Gaussian distributions
66 |     cdef float beta
67 |     cdef float beta_min
68 |     cdef float beta_max
69 |     cdef float beta_sum = 0.0
70 |     # Now we go to log scale
71 |     cdef float desired_entropy = math.log(desired_perplexity)
72 |     cdef float entropy_diff
73 | 
74 |     cdef float entropy
75 |     cdef float sum_Pi
76 |     cdef float sum_disti_Pi
77 |     cdef float prior_prob
78 |     cdef float denominator
79 |     cdef long i, j, k, l = 0
80 |     cdef long K = n_samples
81 |     cdef int using_neighbors = neighbors is not None
82 | 
83 |     cdef np.ndarray[long, ndim=1] label_sizes = np.bincount(labels + 1)
84 |     cdef long n_same_label
85 |     cdef long n_other_label
86 |     cdef long n_unlabelled = label_sizes[0]
87 | 
88 |     if using_neighbors:
89 |         K = neighbors.shape[1]
90 | 
91 |     for i in range(n_samples):
92 |         beta_min = -NPY_INFINITY
93 |         beta_max = NPY_INFINITY
94 |         beta = 1.0
95 | 
96 |         # Binary search of precision for i-th conditional distribution
97 |         for l in range(n_steps):
98 |             # Compute current entropy and corresponding probabilities
99 |             # computed just over the nearest neighbors or over all data
100 |             # if we're not using neighbors
101 |             if using_neighbors:
102 |                 for k in range(K):
103 |                     j = neighbors[i, k]
104 |                     P[i, j] = math.exp(-affinities[i, j] * beta)
105 |             else:
106 |                 for j in range(K):
107 |                     P[i, j] = math.exp(-affinities[i, j] * beta)
108 |                 P[i, i] = 0.0
109 |             sum_Pi = 0.0
110 |             if using_neighbors:
111 |                 for k in range(K):
112 |                     j = neighbors[i, k]
113 |                     sum_Pi += P[i, j]
114 |             else:
115 |                 for j in range(K):
116 |                     sum_Pi += P[i, j]
117 |             if sum_Pi == 0.0:
118 |                 sum_Pi = EPSILON_DBL
119 |             sum_disti_Pi = 0.0
120 |             if using_neighbors:
121 |                 for k in range(K):
122 |                     j = neighbors[i, k]
123 |                     P[i, j] /= sum_Pi
124 |                     sum_disti_Pi += affinities[i, j] * P[i, j]
125 |             else:
126 |                 for j in range(K):
127 |                     P[i, j] /= sum_Pi
128 |                     sum_disti_Pi += affinities[i, j] * P[i, j]
129 |             entropy = math.log(sum_Pi) + beta * sum_disti_Pi
130 |             entropy_diff = entropy - desired_entropy
131 | 
132 |             if math.fabs(entropy_diff) <= PERPLEXITY_TOLERANCE:
133 |                 break
134 | 
135 |             if entropy_diff > 0.0:
136 |                 beta_min = beta
137 |                 if beta_max == NPY_INFINITY:
138 |                     beta *= 2.0
139 |                 else:
140 |                     beta = (beta + beta_max) / 2.0
141 |             else:
142 |                 beta_max = beta
143 |                 if beta_min == -NPY_INFINITY:
144 |                     beta /= 2.0
145 |                 else:
146 |                     beta = (beta + beta_min) / 2.0
147 | 
148 |         beta_sum += beta
149 | 
150 |         if verbose and ((i + 1) % 1000 == 0 or i + 1 == n_samples):
151 |             print("[t-SNE] Computed conditional probabilities for sample "
152 |                   "%d / %d" % (i + 1, n_samples))
153 | 
154 |     if verbose:
155 |         print("[t-SNE] Mean sigma: %f"
156 |               % np.mean(math.sqrt(n_samples / beta_sum)))
157 | 
158 |     for i in range(n_samples):
159 | 
160 |         sum_Pi = 0
161 | 
162 |         if using_neighbors:
163 | 
164 |             for k in range(K):
165 |                 j = neighbors[i, k]
166 | 
167 |                 n_same_label = label_sizes[labels[i] + 1]
168 |                 n_other_label = n_samples - n_same_label - n_unlabelled
169 | 
170 |                 if rep_samples:
171 | 
172 |                     denominator = n_same_label ** 2 + n_other_label ** 2 + n_unlabelled ** 2
173 | 
174 |                     if labels[i] == -1 or labels[j] == -1:
175 |                         prior_prob = n_unlabelled / denominator
176 |                     elif labels[j] == labels[i]:
177 |                         prior_prob = min((n_same_label / denominator) + (label_importance / n_same_label), 1.0 - EPSILON_DBL)
178 |                     else:
179 |                         prior_prob = max((n_other_label / denominator) - (label_importance / n_other_label), EPSILON_DBL)
180 | 
181 |                 else:
182 | 
183 |                     if labels[i] == -1 or labels[j] == -1:
184 |                         prior_prob = 1.0 / n_samples
185 |                     elif labels[j] == labels[i]:
186 |                         prior_prob = min((1.0 / n_samples) + (label_importance / n_same_label), 1.0 - EPSILON_DBL)
187 |                     else:
188 |                         prior_prob = max((1.0 / n_samples) - (label_importance / n_other_label), EPSILON_DBL)
189 | 
190 |                 P[i, j] *= prior_prob
191 |                 sum_Pi += P[i, j]
192 | 
193 |             for k in range(K):
194 |                 j = neighbors[i, k]
195 |                 P[i, j] /= sum_Pi
196 | 
197 |         else:
198 | 
199 |             for j in range(K):
200 | 
201 |                 n_same_label = label_sizes[labels[i] + 1]
202 |                 n_other_label = n_samples - n_same_label - n_unlabelled
203 | 
204 |                 if rep_samples:
205 | 
206 |                     denominator = n_same_label ** 2 + n_other_label ** 2 + n_unlabelled ** 2
207 | 
208 |                     if labels[i] == -1 or labels[j] == -1:
209 |                         prior_prob = n_unlabelled / denominator
210 |                     elif labels[j] == labels[i]:
211 |                         prior_prob = min((n_same_label / denominator) + (label_importance / n_same_label), 1.0 - EPSILON_DBL)
212 |                     else:
213 |                         prior_prob = max((n_other_label / denominator) - (label_importance / n_other_label), EPSILON_DBL)
214 | 
215 |                 else:
216 | 
217 |                     if labels[i] == -1 or labels[j] == -1:
218 |                         prior_prob = 1.0 / n_samples
219 |                     elif labels[j] == labels[i]:
220 |                         prior_prob = min((1.0 / n_samples) + (label_importance / n_same_label), 1.0 - EPSILON_DBL)
221 |                     else:
222 |                         prior_prob = max((1.0 / n_samples) - (label_importance / n_other_label), EPSILON_DBL)
223 | 
224 |                 P[i, j] *= prior_prob
225 |                 sum_Pi += P[i, j]
226 | 
227 |             for j in range(K):
228 |                 P[i, j] /= sum_Pi
229 | 
230 |     return P
231 | 
--------------------------------------------------------------------------------
/sstsne/_barnes_hut_tsne.pyx:
--------------------------------------------------------------------------------
1 | # cython: boundscheck=False
2 | # cython: wraparound=False
3 | # cython: cdivision=True
4 | # Author: Christopher Moody 
5 | # Author: Nick Travers 
6 | # Implementation by Chris Moody & Nick Travers
7 | # See http://homepage.tudelft.nl/19j49/t-SNE.html for reference
8 | # implementations and papers describing the technique
9 | 
10 | 
11 | from libc.stdlib cimport malloc, free
12 | from libc.stdio cimport printf
13 | from libc.math cimport sqrt, log
14 | cimport numpy as np
15 | import numpy as np
16 | 
17 | cdef char* EMPTY_STRING = ""
18 | 
19 | cdef extern from "math.h":
20 |     float fabsf(float x) nogil
21 | 
22 | # Round points differing by less than this amount
23 | # effectively ignoring differences near the 32bit
24 | # floating point precision
25 | cdef float EPSILON = 1e-6
26 | 
27 | # This is effectively an ifdef statement in Cython
28 | # It allows us to write printf debugging lines
29 | # and remove them at compile time
30 | cdef enum:
31 |     DEBUGFLAG = 0
32 | 
33 | cdef extern from "time.h":
34 |     # Declare only what is necessary from time.h.
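    # clock() reports CPU time in implementation-defined ticks; dividing by
    # CLOCKS_PER_SEC converts to seconds (the timing printfs below report
    # raw ticks)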
35 |     ctypedef long clock_t
36 |     clock_t clock() nogil
37 |     double CLOCKS_PER_SEC
38 | 
39 | 
40 | cdef extern from "cblas.h":
41 |     float snrm2 "cblas_snrm2"(int N, float *X, int incX) nogil
42 | 
43 | 
44 | cdef struct Node:
45 |     # Keep track of the center of mass
46 |     float* barycenter
47 |     # If this is a leaf, the position of the point within this leaf
48 |     float* leaf_point_position
49 |     # The number of points including all
50 |     # nodes below this one
51 |     long cumulative_size
52 |     # Number of points at this node
53 |     long size
54 |     # Index of the point at this node
55 |     # Only defined for non-empty leaf nodes
56 |     long point_index
57 |     # level = 0 is the root node
58 |     # And each subdivision adds 1 to the level
59 |     long level
60 |     # Left edge of this node
61 |     float* left_edge
62 |     # The center of this node, equal to le + w/2.0
63 |     float* center
64 |     # The width of this node -- used to calculate the opening
65 |     # angle. Equal to width = re - le
66 |     float* width
67 |     # The value of the maximum width w
68 |     float max_width
69 | 
70 |     # Does this node have children?
71 |     # Default to leaf until we add points
72 |     int is_leaf
73 |     # Array of pointers to pointers of children
74 |     Node **children
75 |     # Keep a pointer to the parent
76 |     Node *parent
77 |     # Pointer to the tree this node belongs to
78 |     Tree* tree
79 | 
80 | cdef struct Tree:
81 |     # Holds a pointer to the root node
82 |     Node* root_node
83 |     # Number of dimensions in the output
84 |     int n_dimensions
85 |     # Total number of cells
86 |     long n_cells
87 |     # Total number of points
88 |     long n_points
89 |     # Spit out diagnostic information?
90 |     int verbose
91 |     # How many cells per node? Should go as 2 ** n_dimensions
92 |     int n_cell_per_node
93 | 
94 | cdef Tree* init_tree(float[:] left_edge, float[:] width, int n_dimensions,
95 |                      int verbose) nogil:
96 |     # tree is freed by free_tree
97 |     cdef Tree* tree = malloc(sizeof(Tree))
98 |     tree.n_dimensions = n_dimensions
99 |     tree.n_cells = 0
100 |     tree.n_points = 0
101 |     tree.verbose = verbose
102 |     tree.root_node = create_root(left_edge, width, n_dimensions)
103 |     tree.root_node.tree = tree
104 |     tree.n_cells += 1
105 |     tree.n_cell_per_node = 2 ** n_dimensions
106 |     if DEBUGFLAG:
107 |         printf("[t-SNE] Tree initialised. Left_edge = (%1.9e, %1.9e, %1.9e)\n",
108 |                left_edge[0], left_edge[1], left_edge[2])
109 |         printf("[t-SNE] Tree initialised. Width = (%1.9e, %1.9e, %1.9e)\n",
110 |                width[0], width[1], width[2])
111 |     return tree
112 | 
113 | cdef Node* create_root(float[:] left_edge, float[:] width, int n_dimensions) nogil:
114 |     # Create a default root node
115 |     cdef int ax
116 |     cdef int n_cell_per_node = 2 ** n_dimensions
117 |     # root is freed by free_tree
118 |     root = <Node*> malloc(sizeof(Node))
119 |     root.is_leaf = 1
120 |     root.parent = NULL
121 |     root.level = 0
122 |     root.cumulative_size = 0
123 |     root.size = 0
124 |     root.point_index = -1
125 |     root.max_width = 0.0
126 |     root.width = malloc(sizeof(float) * n_dimensions)
127 |     root.left_edge = malloc(sizeof(float) * n_dimensions)
128 |     root.center = malloc(sizeof(float) * n_dimensions)
129 |     root.barycenter = malloc(sizeof(float) * n_dimensions)
130 |     root.leaf_point_position = malloc(sizeof(float) * n_dimensions)
131 |     root.children = NULL
132 |     for ax in range(n_dimensions):
133 |         root.width[ax] = width[ax]
134 |         root.left_edge[ax] = left_edge[ax]
135 |         root.center[ax] = 0.0
136 |         root.barycenter[ax] = 0.
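        # -1 is a sentinel meaning "no point stored here yet", matching the
        # point_index = -1 default set above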
137 |         root.leaf_point_position[ax] = -1
138 |     for ax in range(n_dimensions):
139 |         root.max_width = max(root.max_width, root.width[ax])
140 |     if DEBUGFLAG:
141 |         printf("[t-SNE] Created root node %p\n", root)
142 |     return root
143 | 
144 | cdef Node* create_child(Node *parent, int[3] offset) nogil:
145 |     # Create a new child node with default parameters
146 |     cdef int ax
147 |     # these children are freed by free_recursive
148 |     child = <Node*> malloc(sizeof(Node))
149 |     child.is_leaf = 1
150 |     child.parent = parent
151 |     child.level = parent.level + 1
152 |     child.size = 0
153 |     child.cumulative_size = 0
154 |     child.point_index = -1
155 |     child.tree = parent.tree
156 |     child.max_width = 0.0
157 |     child.width = malloc(sizeof(float) * parent.tree.n_dimensions)
158 |     child.left_edge = malloc(sizeof(float) * parent.tree.n_dimensions)
159 |     child.center = malloc(sizeof(float) * parent.tree.n_dimensions)
160 |     child.barycenter = malloc(sizeof(float) * parent.tree.n_dimensions)
161 |     child.leaf_point_position = malloc(sizeof(float) * parent.tree.n_dimensions)
162 |     child.children = NULL
163 |     for ax in range(parent.tree.n_dimensions):
164 |         child.width[ax] = parent.width[ax] / 2.0
165 |         child.left_edge[ax] = parent.left_edge[ax] + offset[ax] * parent.width[ax] / 2.0
166 |         child.center[ax] = child.left_edge[ax] + child.width[ax] / 2.0
167 |         child.barycenter[ax] = 0.
168 |         child.leaf_point_position[ax] = -1.
169 |     for ax in range(parent.tree.n_dimensions):
170 |         child.max_width = max(child.max_width, child.width[ax])
171 |     child.tree.n_cells += 1
172 |     return child
173 | 
174 | cdef Node* select_child(Node *node, float[3] pos, long index) nogil:
175 |     # Find which sub-node a position should go into
176 |     # And return the appropriate node
177 |     cdef int* offset = malloc(sizeof(int) * node.tree.n_dimensions)
178 |     cdef int ax, idx
179 |     cdef Node* child
180 |     cdef int error
181 |     for ax in range(node.tree.n_dimensions):
182 |         offset[ax] = (pos[ax] - (node.left_edge[ax] + node.width[ax] / 2.0)) > 0.
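    # offset is the binary encoding of the child index: offset[ax] is 1 when
    # the point lies in the upper half of axis ax. In two dimensions, for
    # example, offset [1, 0] selects child 2 = 1 * 2**1 + 0 * 2**0.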
183 |     idx = offset2index(offset, node.tree.n_dimensions)
184 |     child = node.children[idx]
185 |     if DEBUGFLAG:
186 |         printf("[t-SNE] Offset [%i, %i] with LE [%f, %f]\n",
187 |                offset[0], offset[1], child.left_edge[0], child.left_edge[1])
188 |     free(offset)
189 |     return child
190 | 
191 | 
192 | cdef inline void index2offset(int* offset, int index, int n_dimensions) nogil:
193 |     # Convert a 1D index into N-D index; useful for indexing
194 |     # children of a quadtree, octree, N-tree
195 |     # Quite likely there's a fancy bitshift way of doing this
196 |     # since the offset is equivalent to the binary representation
197 |     # of the integer index
198 |     # We read the offset array left-to-right
199 |     # such that the least significant bit is on the right
200 |     cdef int rem, k, shift
201 |     for k in range(n_dimensions):
202 |         shift = n_dimensions - k - 1
203 |         rem = ((index >> shift) << shift)
204 |         offset[k] = rem > 0
205 |         if DEBUGFLAG:
206 |             printf("i2o index %i k %i rem %i offset", index, k, rem)
207 |             for j in range(n_dimensions):
208 |                 printf(" %i", offset[j])
209 |             printf(" n_dimensions %i\n", n_dimensions)
210 |         index -= rem
211 | 
212 | 
213 | cdef inline int offset2index(int* offset, int n_dimensions) nogil:
214 |     # Calculate the 1:1 index for a given offset array
215 |     # We read the offset array right-to-left
216 |     # such that the least significant bit is on the right
217 |     cdef int dim
218 |     cdef int index = 0
219 |     for dim in range(n_dimensions):
220 |         index += (2 ** dim) * offset[n_dimensions - dim - 1]
221 |         if DEBUGFLAG:
222 |             printf("o2i index %i dim %i offset", index, dim)
223 |             for j in range(n_dimensions):
224 |                 printf(" %i", offset[j])
225 |             printf(" n_dimensions %i\n", n_dimensions)
226 |     return index
227 | 
228 | 
229 | cdef void subdivide(Node* node) nogil:
230 |     # This instantiates 2**n_dimensions = n_cell_per_node nodes for the current node
231 |     cdef int idx = 0
232 |     cdef int* offset = malloc(sizeof(int) * node.tree.n_dimensions)
233 |     node.is_leaf = False
234 |     node.children = malloc(sizeof(Node*) * node.tree.n_cell_per_node)
235 |     for idx in range(node.tree.n_cell_per_node):
236 |         index2offset(offset, idx, node.tree.n_dimensions)
237 |         node.children[idx] = create_child(node, offset)
238 |     free(offset)
239 | 
240 | 
241 | cdef int insert(Node *root, float pos[3], long point_index, long depth, long
242 |                 duplicate_count) nogil:
243 |     # Introduce a new point into the tree
244 |     # by recursively inserting it and subdividing as necessary
245 |     # Carefully treat the case of identical points at the same node
246 |     # by increasing the root.size and tracking duplicate_count
247 |     cdef Node *child
248 |     cdef long i
249 |     cdef int ax
250 |     cdef int not_identical = 1
251 |     cdef int n_dimensions = root.tree.n_dimensions
252 |     if DEBUGFLAG:
253 |         printf("[t-SNE] [d=%i] Inserting pos %i [%f, %f] duplicate_count=%i "
254 |                "into child %p\n", depth, point_index, pos[0], pos[1],
255 |                duplicate_count, root)
256 |     # Increment the total number points including this
257 |     # node and below it
258 |     root.cumulative_size += duplicate_count
259 |     # Evaluate the new center of mass, weighting the previous
260 |     # center of mass against the new point data
261 |     cdef double frac_seen = <double> (root.cumulative_size - 1) / (
262 |         root.cumulative_size)
263 |     cdef double frac_new = 1.0 / root.cumulative_size
264 |     # Assert that duplicate_count > 0
265 |     if duplicate_count < 1:
266 |         return -1
267 |     # Assert that the point is inside the left & right edges
268 |     for ax in range(n_dimensions):
269 |         root.barycenter[ax] *=
frac_seen 270 | if (pos[ax] > (root.left_edge[ax] + root.width[ax] + EPSILON)): 271 | printf("[t-SNE] Error: point (%1.9e) is above right edge of node " 272 | "(%1.9e)\n", pos[ax], root.left_edge[ax] + root.width[ax]) 273 | return -1 274 | if (pos[ax] < root.left_edge[ax] - EPSILON): 275 | printf("[t-SNE] Error: point (%1.9e) is below left edge of node " 276 | "(%1.9e)\n", pos[ax], root.left_edge[ax]) 277 | return -1 278 | for ax in range(n_dimensions): 279 | root.barycenter[ax] += pos[ax] * frac_new 280 | 281 | # If this node is unoccupied, fill it. 282 | # Otherwise, we need to insert recursively. 283 | # Two insertion scenarios: 284 | # 1) Insert into this node if it is a leaf and empty 285 | # 2) Subdivide this node if it is currently occupied 286 | if (root.size == 0) & root.is_leaf: 287 | # Root node is empty and a leaf 288 | if DEBUGFLAG: 289 | printf("[t-SNE] [d=%i] Inserting [%f, %f] into blank cell\n", depth, 290 | pos[0], pos[1]) 291 | for ax in range(n_dimensions): 292 | root.leaf_point_position[ax] = pos[ax] 293 | root.point_index = point_index 294 | root.size = duplicate_count 295 | return 0 296 | else: 297 | # Root node is occupied or not a leaf 298 | if DEBUGFLAG: 299 | printf("[t-SNE] [d=%i] Node %p is occupied or is a leaf.\n", depth, 300 | root) 301 | printf("[t-SNE] [d=%i] Node %p leaf = %i. Size %i\n", depth, root, 302 | root.is_leaf, root.size) 303 | if root.is_leaf & (root.size > 0): 304 | # is a leaf node and is occupied 305 | for ax in range(n_dimensions): 306 | not_identical &= (fabsf(pos[ax] - root.leaf_point_position[ax]) < EPSILON) 307 | not_identical &= (root.point_index != point_index) 308 | if not_identical == 1: 309 | root.size += duplicate_count 310 | if DEBUGFLAG: 311 | printf("[t-SNE] Warning: [d=%i] Detected identical " 312 | "points. Returning. 
Leaf now has size %i\n", 313 | depth, root.size) 314 | return 0 315 | # If necessary, subdivide this node before 316 | # descending 317 | if root.is_leaf: 318 | if DEBUGFLAG: 319 | printf("[t-SNE] [d=%i] Subdividing this leaf node %p\n", depth, 320 | root) 321 | subdivide(root) 322 | # We have two points to relocate: the one previously 323 | # at this node, and the new one we're attempting 324 | # to insert 325 | if root.size > 0: 326 | child = select_child(root, root.leaf_point_position, root.point_index) 327 | if DEBUGFLAG: 328 | printf("[t-SNE] [d=%i] Relocating old point to node %p\n", 329 | depth, child) 330 | insert(child, root.leaf_point_position, root.point_index, depth + 1, root.size) 331 | # Insert the new point 332 | if DEBUGFLAG: 333 | printf("[t-SNE] [d=%i] Selecting node for new point\n", depth) 334 | child = select_child(root, pos, point_index) 335 | if root.size > 0: 336 | # Remove the point from this node 337 | for ax in range(n_dimensions): 338 | root.leaf_point_position[ax] = -1 339 | root.size = 0 340 | root.point_index = -1 341 | return insert(child, pos, point_index, depth + 1, 1) 342 | 343 | cdef int insert_many(Tree* tree, float[:,:] pos_array) nogil: 344 | # Insert each data point into the tree one at a time 345 | cdef long nrows = pos_array.shape[0] 346 | cdef long i 347 | cdef int ax 348 | cdef float row[3] 349 | cdef long err = 0 350 | for i in range(nrows): 351 | for ax in range(tree.n_dimensions): 352 | row[ax] = pos_array[i, ax] 353 | if DEBUGFLAG: 354 | printf("[t-SNE] inserting point %i: [%f, %f]\n", i, row[0], row[1]) 355 | err = insert(tree.root_node, row, i, 0, 1) 356 | if err != 0: 357 | printf("[t-SNE] ERROR\n%s", EMPTY_STRING) 358 | return err 359 | tree.n_points += 1 360 | return err 361 | 362 | cdef int free_tree(Tree* tree) nogil: 363 | cdef int check 364 | cdef long* cnt = malloc(sizeof(long) * 3) 365 | for i in range(3): 366 | cnt[i] = 0 367 | free_recursive(tree, tree.root_node, cnt) 368 | check = cnt[0] == tree.n_cells 369 | check &= cnt[2] == tree.n_points 370 | free(tree) 371 | free(cnt) 372 | return check 373 | 374 | cdef void free_post_children(Node *node) nogil: 375 | free(node.width) 376 | free(node.left_edge) 377 | free(node.center) 378 | free(node.barycenter) 379 | free(node.leaf_point_position) 380 | free(node) 381 | 382 | cdef void free_recursive(Tree* tree, Node *root, long* counts) nogil: 383 | # Free up all of the tree nodes recursively 384 | # while counting the number of nodes visited 385 | # and total number of data points removed 386 | cdef int idx 387 | cdef Node* child 388 | if not root.is_leaf: 389 | for idx in range(tree.n_cell_per_node): 390 | child = root.children[idx] 391 | free_recursive(tree, child, counts) 392 | counts[0] += 1 393 | if child.is_leaf: 394 | counts[1] += 1 395 | if child.size > 0: 396 | counts[2] +=1 397 | else: 398 | free(child.children) 399 | 400 | free_post_children(child) 401 | 402 | if root == tree.root_node: 403 | if not root.is_leaf: 404 | free(root.children) 405 | 406 | free_post_children(root) 407 | 408 | cdef long count_points(Node* root, long count) nogil: 409 | # Walk through the whole tree and count the number 410 | # of points at the leaf nodes 411 | if DEBUGFLAG: 412 | printf("[t-SNE] Counting nodes at root node %p\n", root) 413 | cdef Node* child 414 | cdef int idx 415 | if root.is_leaf: 416 | count += root.size 417 | if DEBUGFLAG : 418 | printf("[t-SNE] %p is a leaf node, no children\n", root) 419 | printf("[t-SNE] %i points in node %p\n", count, root) 420 | return count 421 | # 
Otherwise, get the children 422 | for idx in range(root.tree.n_cell_per_node): 423 | child = root.children[idx] 424 | if DEBUGFLAG: 425 | printf("[t-SNE] Counting points for child %p\n", child) 426 | if child.is_leaf and child.size > 0: 427 | if DEBUGFLAG: 428 | printf("[t-SNE] Child has size %d\n", child.size) 429 | count += child.size 430 | elif not child.is_leaf: 431 | if DEBUGFLAG: 432 | printf("[t-SNE] Child is not a leaf. Descending\n%s", EMPTY_STRING) 433 | count = count_points(child, count) 434 | # else case is we have an empty leaf node 435 | # which happens when we create a quadtree for 436 | # one point, and then the other neighboring cells 437 | # don't get filled in 438 | if DEBUGFLAG: 439 | printf("[t-SNE] %i points in this node\n", count) 440 | return count 441 | 442 | 443 | cdef float compute_gradient(float[:,:] val_P, 444 | float[:,:] pos_reference, 445 | np.int64_t[:,:] neighbors, 446 | float[:,:] tot_force, 447 | Node* root_node, 448 | float theta, 449 | float dof, 450 | long start, 451 | long stop) nogil: 452 | # Having created the tree, calculate the gradient 453 | # in two components, the positive and negative forces 454 | cdef long i, coord 455 | cdef int ax 456 | cdef long n = pos_reference.shape[0] 457 | cdef int n_dimensions = root_node.tree.n_dimensions 458 | if root_node.tree.verbose > 11: 459 | printf("[t-SNE] Allocating %i elements in force arrays\n", 460 | n * n_dimensions * 2) 461 | cdef float* sum_Q = malloc(sizeof(float)) 462 | cdef float* neg_f = malloc(sizeof(float) * n * n_dimensions) 463 | cdef float* neg_f_fast = malloc(sizeof(float) * n * n_dimensions) 464 | cdef float* pos_f = malloc(sizeof(float) * n * n_dimensions) 465 | cdef clock_t t1, t2 466 | cdef float sQ, error 467 | 468 | sum_Q[0] = 0.0 469 | t1 = clock() 470 | compute_gradient_negative(val_P, pos_reference, neg_f, root_node, sum_Q, 471 | dof, theta, start, stop) 472 | t2 = clock() 473 | if root_node.tree.verbose > 15: 474 | printf("[t-SNE] Computing negative gradient: %e ticks\n", ((float) (t2 - t1))) 475 | sQ = sum_Q[0] 476 | t1 = clock() 477 | error = compute_gradient_positive(val_P, pos_reference, neighbors, pos_f, 478 | n_dimensions, dof, sQ, start, root_node.tree.verbose) 479 | t2 = clock() 480 | if root_node.tree.verbose > 15: 481 | printf("[t-SNE] Computing positive gradient: %e ticks\n", ((float) (t2 - t1))) 482 | for i in range(start, n): 483 | for ax in range(n_dimensions): 484 | coord = i * n_dimensions + ax 485 | tot_force[i, ax] = pos_f[coord] - (neg_f[coord] / sum_Q[0]) 486 | free(sum_Q) 487 | free(neg_f) 488 | free(neg_f_fast) 489 | free(pos_f) 490 | return sQ 491 | 492 | 493 | cdef float compute_gradient_positive(float[:,:] val_P, 494 | float[:,:] pos_reference, 495 | np.int64_t[:,:] neighbors, 496 | float* pos_f, 497 | int n_dimensions, 498 | float dof, 499 | float sum_Q, 500 | np.int64_t start, 501 | int verbose) nogil: 502 | # Sum over the following expression for i not equal to j 503 | # grad_i = p_ij (1 + ||y_i - y_j||^2)^-1 (y_i - y_j) 504 | # This is equivalent to compute_edge_forces in the authors' code 505 | # It just goes over the nearest neighbors instead of all the data points 506 | # (unlike the non-nearest neighbors version of `compute_gradient_positive') 507 | cdef: 508 | int ax 509 | long i, j, k 510 | long K = neighbors.shape[1] 511 | long n = val_P.shape[0] 512 | float[3] buff 513 | float D, Q, pij 514 | float C = 0.0 515 | float exponent = (dof + 1.0) / -2.0 516 | cdef clock_t t1, t2 517 | t1 = clock() 518 | for i in range(start, n): 519 | for ax in 
range(n_dimensions): 520 | pos_f[i * n_dimensions + ax] = 0.0 521 | for k in range(K): 522 | j = neighbors[i, k] 523 | # we don't need to exclude the i==j case since we've 524 | # already thrown it out from the list of neighbors 525 | D = 0.0 526 | Q = 0.0 527 | pij = val_P[i, j] 528 | for ax in range(n_dimensions): 529 | buff[ax] = pos_reference[i, ax] - pos_reference[j, ax] 530 | D += buff[ax] ** 2.0 531 | Q = (((1.0 + D) / dof) ** exponent) 532 | D = pij * Q 533 | Q /= sum_Q 534 | C += pij * log((pij + EPSILON) / (Q + EPSILON)) 535 | for ax in range(n_dimensions): 536 | pos_f[i * n_dimensions + ax] += D * buff[ax] 537 | t2 = clock() 538 | dt = ((float) (t2 - t1)) 539 | if verbose > 10: 540 | printf("[t-SNE] Computed error=%1.4f in %1.1e ticks\n", C, dt) 541 | return C 542 | 543 | 544 | 545 | cdef void compute_gradient_negative(float[:,:] val_P, 546 | float[:,:] pos_reference, 547 | float* neg_f, 548 | Node *root_node, 549 | float* sum_Q, 550 | float dof, 551 | float theta, 552 | long start, 553 | long stop) nogil: 554 | if stop == -1: 555 | stop = pos_reference.shape[0] 556 | cdef: 557 | int ax 558 | long i, j 559 | long n = stop - start 560 | float* force 561 | float* iQ 562 | float* pos 563 | float* dist2s 564 | long* sizes 565 | float* deltas 566 | long* l 567 | int n_dimensions = root_node.tree.n_dimensions 568 | float qijZ, mult 569 | long idx, 570 | long dta = 0 571 | long dtb = 0 572 | clock_t t1, t2, t3 573 | float* neg_force 574 | 575 | iQ = malloc(sizeof(float)) 576 | force = malloc(sizeof(float) * n_dimensions) 577 | pos = malloc(sizeof(float) * n_dimensions) 578 | dist2s = malloc(sizeof(float) * n) 579 | sizes = malloc(sizeof(long) * n) 580 | deltas = malloc(sizeof(float) * n * n_dimensions) 581 | l = malloc(sizeof(long)) 582 | neg_force= malloc(sizeof(float) * n_dimensions) 583 | 584 | for i in range(start, stop): 585 | # Clear the arrays 586 | for ax in range(n_dimensions): 587 | force[ax] = 0.0 588 | neg_force[ax] = 0.0 589 | pos[ax] = pos_reference[i, ax] 590 | iQ[0] = 0.0 591 | l[0] = 0 592 | # Find which nodes are summarizing and collect their centers of mass 593 | # deltas, and sizes, into vectorized arrays 594 | t1 = clock() 595 | compute_non_edge_forces(root_node, theta, i, pos, force, dist2s, 596 | sizes, deltas, l) 597 | t2 = clock() 598 | # Compute the t-SNE negative force 599 | # for the digits dataset, walking the tree 600 | # is about 10-15x more expensive than the 601 | # following for loop 602 | exponent = (dof + 1.0) / -2.0 603 | for j in range(l[0]): 604 | qijZ = ((1.0 + dist2s[j]) / dof) ** exponent 605 | sum_Q[0] += sizes[j] * qijZ 606 | mult = sizes[j] * qijZ * qijZ 607 | for ax in range(n_dimensions): 608 | idx = j * n_dimensions + ax 609 | neg_force[ax] += mult * deltas[idx] 610 | t3 = clock() 611 | for ax in range(n_dimensions): 612 | neg_f[i * n_dimensions + ax] = neg_force[ax] 613 | dta += t2 - t1 614 | dtb += t3 - t2 615 | if root_node.tree.verbose > 20: 616 | printf("[t-SNE] Tree: %i clock ticks | ", dta) 617 | printf("Force computation: %i clock ticks\n", dtb) 618 | free(iQ) 619 | free(force) 620 | free(pos) 621 | free(dist2s) 622 | free(sizes) 623 | free(deltas) 624 | free(l) 625 | free(neg_force) 626 | 627 | 628 | cdef void compute_non_edge_forces(Node* node, 629 | float theta, 630 | long point_index, 631 | float* pos, 632 | float* force, 633 | float* dist2s, 634 | long* sizes, 635 | float* deltas, 636 | long* l) nogil: 637 | # Compute the t-SNE force on the point in pos given by point_index 638 | cdef: 639 | Node* child 640 | int i, j 641 | 
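# l is a one-element cursor: l[0] counts the summary cells written into dist2s, sizes and deltas so far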
        int n_dimensions = node.tree.n_dimensions
642 |         long idx, idx1
643 |         float dist_check
644 | 
645 |     # There are no points below this node if cumulative_size == 0
646 |     # so do not bother to calculate any force contributions
647 |     # Also do not compute self-interactions
648 |     if node.cumulative_size > 0 and not (node.is_leaf and (node.point_index ==
649 |                                                            point_index)):
650 |         # Compute distance between node center of mass and the reference point
651 |         # I've tried rewriting this in terms of BLAS functions, but it's about
652 |         # 1.5x worse when we do so, probably because the vectors are small
653 |         idx1 = l[0] * n_dimensions
654 |         deltas[idx1] = pos[0] - node.barycenter[0]
655 |         idx = idx1
656 |         for i in range(1, n_dimensions):
657 |             idx += 1
658 |             deltas[idx] = pos[i] - node.barycenter[i]
659 |         # do np.sqrt(np.sum(deltas**2.0))
660 |         dist2s[l[0]] = snrm2(n_dimensions, &deltas[idx1], 1)
661 |         # Check whether we can use this node as a summary
662 |         # It's a summary node if the angular size as measured from the point
663 |         # is relatively small (w.r.t. theta) or if it is a leaf node.
664 |         # If it can be summarized, we use the cell center of mass
665 |         # Otherwise, we go a higher level of resolution and into the leaves.
666 |         if node.is_leaf or ((node.max_width / dist2s[l[0]]) < theta):
667 |             # Compute the t-SNE force between the reference point and the
668 |             # current node
669 |             sizes[l[0]] = node.cumulative_size
670 |             dist2s[l[0]] = dist2s[l[0]] * dist2s[l[0]]
671 |             l[0] += 1
672 |         else:
673 |             # Recursively apply Barnes-Hut to child nodes
674 |             for idx in range(node.tree.n_cell_per_node):
675 |                 child = node.children[idx]
676 |                 if child.cumulative_size == 0:
677 |                     continue
678 |                 compute_non_edge_forces(child, theta,
679 |                                         point_index, pos, force, dist2s, sizes, deltas,
680 |                                         l)
681 | 
682 | 
683 | cdef float compute_error(float[:, :] val_P,
684 |                          float[:, :] pos_reference,
685 |                          np.int64_t[:,:] neighbors,
686 |                          float sum_Q,
687 |                          int n_dimensions,
688 |                          int verbose) nogil:
689 |     cdef int i, j, ax
690 |     cdef int I = neighbors.shape[0]
691 |     cdef int K = neighbors.shape[1]
692 |     cdef float pij, Q
693 |     cdef float C = 0.0
694 |     cdef clock_t t1, t2
695 |     cdef float dt, delta
696 |     t1 = clock()
697 |     for i in range(I):
698 |         for k in range(K):
699 |             j = neighbors[i, k]
700 |             pij = val_P[i, j]
701 |             Q = 0.0
702 |             for ax in range(n_dimensions):
703 |                 delta = (pos_reference[i, ax] - pos_reference[j, ax])
704 |                 Q += delta * delta
705 |             Q = (1.0 / (sum_Q + Q * sum_Q))
706 |             C += pij * log((pij + EPSILON) / (Q + EPSILON))
707 |     t2 = clock()
708 |     dt = ((float) (t2 - t1))
709 |     if verbose > 10:
710 |         printf("[t-SNE] Computed error=%1.4f in %1.1e ticks\n", C, dt)
711 |     return C
712 | 
713 | 
714 | def calculate_edge(pos_output):
715 |     # Make the boundaries slightly outside of the data
716 |     # to avoid floating point error near the edge
717 |     left_edge = np.min(pos_output, axis=0)
718 |     right_edge = np.max(pos_output, axis=0)
719 |     center = (right_edge + left_edge) * 0.5
720 |     width = np.maximum(np.subtract(right_edge, left_edge), EPSILON)
721 |     # Exaggerate width to avoid boundary edge
722 |     width = width.astype(np.float32) * 1.001
723 |     left_edge = center - width / 2.0
724 |     right_edge = center + width / 2.0
725 |     return left_edge, right_edge, width
726 | 
727 | def gradient(float[:,:] pij_input,
728 |              float[:,:] pos_output,
729 |              np.int64_t[:,:] neighbors,
730 |              float[:,:] forces,
731 |              float theta,
732 |              int n_dimensions,
733 |              int verbose,
734 |              float dof = 1.0,
735 |              long skip_num_points=0):
736 |     # This function is designed to be called from external Python
737 |     # it passes the 'forces' array by reference and fills that array
738 |     # in-place
739 |     cdef float C
740 |     n = pos_output.shape[0]
741 |     left_edge, right_edge, width = calculate_edge(pos_output)
742 |     assert width.itemsize == 4
743 |     assert pij_input.itemsize == 4
744 |     assert pos_output.itemsize == 4
745 |     assert forces.itemsize == 4
746 |     m = "Number of neighbors must be < # of points - 1"
747 |     assert n - 1 >= neighbors.shape[1], m
748 |     m = "neighbors array and pos_output shapes are incompatible"
749 |     assert n == neighbors.shape[0], m
750 |     m = "Forces array and pos_output shapes are incompatible"
751 |     assert n == forces.shape[0], m
752 |     m = "Pij and pos_output shapes are incompatible"
753 |     assert n == pij_input.shape[0], m
754 |     m = "Pij and pos_output shapes are incompatible"
755 |     assert n == pij_input.shape[1], m
756 |     if verbose > 10:
757 |         printf("[t-SNE] Initializing tree of n_dimensions %i\n", n_dimensions)
758 |     cdef Tree* qt = init_tree(left_edge, width, n_dimensions, verbose)
759 |     if verbose > 10:
760 |         printf("[t-SNE] Inserting %i points\n", pos_output.shape[0])
761 |     err = insert_many(qt, pos_output)
762 |     assert err == 0, "[t-SNE] Insertion failed"
763 |     if verbose > 10:
764 |         # XXX: format hack to workaround lack of `const char *` type
765 |         # in the generated C code that triggers error with gcc 4.9
766 |         # and -Werror=format-security
767 |         printf("[t-SNE] Computing gradient\n%s", EMPTY_STRING)
768 |     sum_Q = compute_gradient(pij_input, pos_output, neighbors, forces,
769 |                              qt.root_node, theta, dof, skip_num_points, -1)
770 |     C = compute_error(pij_input, pos_output, neighbors, sum_Q, n_dimensions,
771 |                       verbose)
772 |     if verbose > 10:
773 |         # XXX: format hack to workaround lack of `const char *` type
774 |         # in the generated C code
775 |         # and -Werror=format-security
776 |         printf("[t-SNE] Checking tree consistency\n%s", EMPTY_STRING)
777 |     cdef long count = count_points(qt.root_node, 0)
778 |     m = ("Tree consistency failed: unexpected number of points=%i "
779 |          "at root node=%i" % (count, qt.root_node.cumulative_size))
780 |     assert count == qt.root_node.cumulative_size, m
781 |     m = "Tree consistency failed: unexpected number of points on the tree"
782 |     assert count == qt.n_points, m
783 |     free_tree(qt)
784 |     return C
785 | 
786 | 
787 | # Helper functions
788 | def check_quadtree(X, np.int64_t[:] counts):
789 |     """
790 |     Helper function to access quadtree functions for testing
791 |     """
792 | 
793 |     X = X.astype(np.float32)
794 |     left_edge, right_edge, width = calculate_edge(X)
795 |     # Initialise a tree
796 |     qt = init_tree(left_edge, width, 2, 2)
797 |     # Insert data into the tree
798 |     insert_many(qt, X)
799 | 
800 |     cdef long count = count_points(qt.root_node, 0)
801 |     counts[0] = count
802 |     counts[1] = qt.root_node.cumulative_size
803 |     counts[2] = qt.n_points
804 |     free_tree(qt)
805 |     return counts
806 | 
807 | 
808 | cdef int helper_test_index2offset(int* check, int index, int n_dimensions):
809 |     cdef int* offset = malloc(sizeof(int) * n_dimensions)
810 |     cdef int error_check = 1
811 |     for i in range(n_dimensions):
812 |         offset[i] = 0
813 |     index2offset(offset, index, n_dimensions)
814 |     for i in range(n_dimensions):
815 |         error_check &= offset[i] == check[i]
816 |     free(offset)
817 |     return error_check
818 | 
819 | 
820 | def test_index2offset():
821 |     ret = 1
822 |     ret &= helper_test_index2offset([1, 0, 1], 5, 3) == 1
823 |     ret &= helper_test_index2offset([0, 0, 0], 0, 3) == 1
824 |     ret &= helper_test_index2offset([0, 0, 1], 1, 3) == 1
825 |     ret &= helper_test_index2offset([0, 1, 0], 2, 3) == 1
826 |     ret &= helper_test_index2offset([0, 1, 1], 3, 3) == 1
827 |     ret &= helper_test_index2offset([1, 0, 0], 4, 3) == 1
828 |     return ret
829 | 
830 | 
831 | def test_index_offset():
832 |     cdef int n_dimensions, idx, tidx, k
833 |     cdef int error_check = 1
834 |     cdef int* offset
835 |     for n_dimensions in range(2, 10):
836 |         offset = malloc(sizeof(int) * n_dimensions)
837 |         for k in range(n_dimensions):
838 |             offset[k] = 0
839 |         for idx in range(2 ** n_dimensions):
840 |             index2offset(offset, idx, n_dimensions)
841 |             tidx = offset2index(offset, n_dimensions)
842 |             error_check &= tidx == idx
843 |             assert error_check == 1
844 |         free(offset)
845 |     return error_check
846 | 
--------------------------------------------------------------------------------
/sstsne/ss_t_sne.py:
--------------------------------------------------------------------------------
1 | # Author: Alexander Fabisch  -- 
2 | # Author: Christopher Moody 
3 | # Author: Nick Travers 
4 | # Author: Leland McInnes 
5 | # License: BSD 3 clause (C) 2014
6 | 
7 | # This is the exact and Barnes-Hut t-SNE implementation. There are other
8 | # modifications of the algorithm:
9 | # * Fast Optimization for t-SNE:
10 | #   http://cseweb.ucsd.edu/~lvdmaaten/workshops/nips2010/papers/vandermaaten.pdf
11 | # Includes a further addition of semi-supervision via partial labelling of the data
12 | 
13 | import numpy as np
14 | from scipy import linalg
15 | import scipy.sparse as sp
16 | from scipy.spatial.distance import pdist
17 | from scipy.spatial.distance import squareform
18 | from sklearn.neighbors import BallTree
19 | from sklearn.base import BaseEstimator
20 | from sklearn.utils import check_array
21 | from sklearn.utils import check_random_state
22 | from sklearn.utils.extmath import _ravel
23 | from sklearn.decomposition import PCA
24 | from sklearn.metrics.pairwise import pairwise_distances
25 | from . import _utils
26 | from . import _barnes_hut_tsne
27 | from sklearn.utils.fixes import astype
28 | 
29 | 
30 | MACHINE_EPSILON = np.finfo(np.double).eps
31 | 
32 | 
33 | def _joint_probabilities(distances, labels, label_importance, rep_sample,
34 |                          desired_perplexity, verbose):
35 |     """Compute joint probabilities p_ij from distances.
36 | 
37 |     Parameters
38 |     ----------
39 |     distances : array, shape (n_samples * (n_samples-1) / 2,)
40 |         Distances of samples are stored as condensed matrices, i.e.
41 |         we omit the diagonal and duplicate entries and store everything
42 |         in a one-dimensional array.
43 | 
44 |     labels : array, shape (n_samples,)
45 |         An integer labelling of each sample, with unknown samples given
46 |         the label -1.
47 | 
48 |     label_importance : float
49 |         How much to deviate from a uniform prior via the label classes.
50 | 
51 |     rep_sample : boolean
52 |         Whether the partial labelling is a representative sample of
53 |         the full (and unknown) labelling.
54 | 
55 |     desired_perplexity : float
56 |         Desired perplexity of the joint probability distributions.
57 | 
58 |     verbose : int
59 |         Verbosity level.
60 | 
61 |     Returns
62 |     -------
63 |     P : array, shape (n_samples * (n_samples-1) / 2,)
64 |         Condensed joint probability matrix.
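
    Notes
    -----
    As a worked example of the label prior (with rep_sample False): for
    n_samples = 100, a labelled class of size 10 and label_importance = 0.5,
    a same-label pair's uniform prior of 1/100 = 0.01 is raised to
    0.01 + 0.5 / 10 = 0.06, a pair with differing labels is lowered by
    0.5 / n_other_label, and any pair involving an unlabelled sample keeps
    the uniform 0.01, before each conditional distribution is renormalised.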
65 |     """
66 |     # Compute conditional probabilities such that they approximately match
67 |     # the desired perplexity
68 |     distances = astype(distances, np.float32, copy=False)
69 |     labels = astype(labels, np.int64, copy=False)
70 |     conditional_P = _utils._binary_search_perplexity(
71 |         distances, None, labels, label_importance,
72 |         rep_sample, desired_perplexity, verbose)
73 |     P = conditional_P + conditional_P.T
74 |     sum_P = np.maximum(np.sum(P), MACHINE_EPSILON)
75 |     P = np.maximum(squareform(P) / sum_P, MACHINE_EPSILON)
76 |     return P
77 | 
78 | 
79 | def _joint_probabilities_nn(distances, neighbors, labels, label_importance,
80 |                             rep_sample, desired_perplexity, verbose):
81 |     """Compute joint probabilities p_ij from distances using just nearest
82 |     neighbors.
83 | 
84 |     This method is approximately equal to _joint_probabilities. The latter
85 |     is O(N^2), but limiting the joint probability to nearest neighbors
86 |     improves this substantially to O(uN).
87 | 
88 |     Parameters
89 |     ----------
90 |     distances : array, shape (n_samples * (n_samples-1) / 2,)
91 |         Distances of samples are stored as condensed matrices, i.e.
92 |         we omit the diagonal and duplicate entries and store everything
93 |         in a one-dimensional array.
94 | 
95 |     labels : array, shape (n_samples,)
96 |         An integer labelling of each sample, with unknown samples given
97 |         the label -1.
98 | 
99 |     label_importance : float
100 |         How much to deviate from a uniform prior via the label classes.
101 | 
102 |     rep_sample : boolean
103 |         Whether the partial labelling is a representative sample of
104 |         the full (and unknown) labelling.
105 | 
106 |     desired_perplexity : float
107 |         Desired perplexity of the joint probability distributions.
108 | 
109 |     verbose : int
110 |         Verbosity level.
111 | 
112 |     Returns
113 |     -------
114 |     P : array, shape (n_samples * (n_samples-1) / 2,)
115 |         Condensed joint probability matrix.
116 |     """
117 |     # Compute conditional probabilities such that they approximately match
118 |     # the desired perplexity
119 |     distances = astype(distances, np.float32, copy=False)
120 |     labels = astype(labels, np.int64, copy=False)
121 |     neighbors = astype(neighbors, np.int64, copy=False)
122 |     conditional_P = _utils._binary_search_perplexity(
123 |         distances, neighbors, labels, label_importance,
124 |         rep_sample, desired_perplexity, verbose)
125 |     m = "All probabilities should be finite"
126 |     assert np.all(np.isfinite(conditional_P)), m
127 |     P = conditional_P + conditional_P.T
128 |     sum_P = np.maximum(np.sum(P), MACHINE_EPSILON)
129 |     P = np.maximum(squareform(P) / sum_P, MACHINE_EPSILON)
130 |     assert np.all(np.abs(P) <= 1.0)
131 |     return P
132 | 
133 | 
134 | def _kl_divergence(params, P, degrees_of_freedom, n_samples, n_components,
135 |                    skip_num_points=0):
136 |     """t-SNE objective function: gradient of the KL divergence
137 |     of p_ijs and q_ijs and the absolute error.
138 | 
139 |     Parameters
140 |     ----------
141 |     params : array, shape (n_params,)
142 |         Unraveled embedding.
143 | 
144 |     P : array, shape (n_samples * (n_samples-1) / 2,)
145 |         Condensed joint probability matrix.
146 | 
147 |     degrees_of_freedom : float
148 |         Degrees of freedom of the Student's t-distribution.
149 | 
150 |     n_samples : int
151 |         Number of samples.
152 | 
153 |     n_components : int
154 |         Dimension of the embedded space.
155 | 
156 |     skip_num_points : int (optional, default: 0)
157 |         This does not compute the gradient for points with indices below
158 |         `skip_num_points`. This is useful when computing transforms of new
159 |         data where you'd like to keep the old data fixed.
160 | 
161 |     Returns
162 |     -------
163 |     kl_divergence : float
164 |         Kullback-Leibler divergence of p_ij and q_ij.
165 | 
166 |     grad : array, shape (n_params,)
167 |         Unraveled gradient of the Kullback-Leibler divergence with respect to
168 |         the embedding.
169 |     """
170 |     X_embedded = params.reshape(n_samples, n_components)
171 | 
172 |     # Q is a heavy-tailed distribution: Student's t-distribution
173 |     n = pdist(X_embedded, "sqeuclidean")
174 |     n += 1.
175 |     n /= degrees_of_freedom
176 |     n **= (degrees_of_freedom + 1.0) / -2.0
177 |     Q = np.maximum(n / (2.0 * np.sum(n)), MACHINE_EPSILON)
178 | 
179 |     # Optimization trick below: np.dot(x, y) is faster than
180 |     # np.sum(x * y) because it calls BLAS
181 | 
182 |     # Objective: C (Kullback-Leibler divergence of P and Q)
183 |     kl_divergence = 2.0 * np.dot(P, np.log(P / Q))
184 | 
185 |     # Gradient: dC/dY
186 |     grad = np.ndarray((n_samples, n_components))
187 |     PQd = squareform((P - Q) * n)
188 |     for i in range(skip_num_points, n_samples):
189 |         np.dot(_ravel(PQd[i]), X_embedded[i] - X_embedded, out=grad[i])
190 |     grad = grad.ravel()
191 |     c = 2.0 * (degrees_of_freedom + 1.0) / degrees_of_freedom
192 |     grad *= c
193 | 
194 |     return kl_divergence, grad
195 | 
196 | 
197 | def _kl_divergence_error(params, P, neighbors, degrees_of_freedom, n_samples,
198 |                          n_components):
199 |     """t-SNE objective function: the absolute error of the
200 |     KL divergence of p_ijs and q_ijs.
201 | 
202 |     Parameters
203 |     ----------
204 |     params : array, shape (n_params,)
205 |         Unraveled embedding.
206 | 
207 |     P : array, shape (n_samples * (n_samples-1) / 2,)
208 |         Condensed joint probability matrix.
209 | 
210 |     neighbors : array (n_samples, K)
211 |         The neighbors array is not actually required to calculate the
212 |         divergence, but is here to match the signature of the
213 |         gradient function.
214 | 
215 |     degrees_of_freedom : float
216 |         Degrees of freedom of the Student's t-distribution.
217 | 
218 |     n_samples : int
219 |         Number of samples.
220 | 
221 |     n_components : int
222 |         Dimension of the embedded space.
223 | 
224 |     Returns
225 |     -------
226 |     kl_divergence : float
227 |         Kullback-Leibler divergence of p_ij and q_ij.
228 | 
229 |         Note that, unlike _kl_divergence, this function returns only the
230 |         divergence itself; no gradient is computed.
231 | 
232 |     """
233 |     X_embedded = params.reshape(n_samples, n_components)
234 | 
235 |     # Q is a heavy-tailed distribution: Student's t-distribution
236 |     n = pdist(X_embedded, "sqeuclidean")
237 |     n += 1.
238 |     n /= degrees_of_freedom
239 |     n **= (degrees_of_freedom + 1.0) / -2.0
240 |     Q = np.maximum(n / (2.0 * np.sum(n)), MACHINE_EPSILON)
241 | 
242 |     # Optimization trick below: np.dot(x, y) is faster than
243 |     # np.sum(x * y) because it calls BLAS
244 | 
245 |     # Objective: C (Kullback-Leibler divergence of P and Q)
246 |     if len(P.shape) == 2:
247 |         P = squareform(P)
248 |     kl_divergence = 2.0 * np.dot(P, np.log(P / Q))
249 | 
250 |     return kl_divergence
251 | 
252 | 
253 | def _kl_divergence_bh(params, P, neighbors, degrees_of_freedom, n_samples,
254 |                       n_components, angle=0.5, skip_num_points=0,
255 |                       verbose=False):
256 |     """t-SNE objective function: KL divergence of p_ijs and q_ijs.
257 | 
258 |     Uses Barnes-Hut tree methods to calculate the gradient that
259 |     runs in O(NlogN) instead of O(N^2)
260 | 
261 |     Parameters
262 |     ----------
263 |     params : array, shape (n_params,)
264 |         Unraveled embedding.
265 | 
266 |     P : array, shape (n_samples * (n_samples-1) / 2,)
267 |         Condensed joint probability matrix.
268 | 
269 |     neighbors : int64 array, shape (n_samples, K)
270 |         Array with element [i, j] giving the index for the jth
271 |         closest neighbor to point i.
272 | 
273 |     degrees_of_freedom : float
274 |         Degrees of freedom of the Student's t-distribution.
275 | 
276 |     n_samples : int
277 |         Number of samples.
278 | 
279 |     n_components : int
280 |         Dimension of the embedded space.
281 | 
282 |     angle : float (default: 0.5)
283 |         This is the trade-off between speed and accuracy for Barnes-Hut T-SNE.
284 |         'angle' is the angular size (referred to as theta in [3]) of a distant
285 |         node as measured from a point. If this size is below 'angle' then it is
286 |         used as a summary node of all points contained within it.
287 |         This method is not very sensitive to changes in this parameter
288 |         in the range of 0.2 - 0.8. Angle less than 0.2 has quickly increasing
289 |         computation time and angle greater than 0.8 has quickly increasing error.
290 | 
291 |     skip_num_points : int (optional, default: 0)
292 |         This does not compute the gradient for points with indices below
293 |         `skip_num_points`. This is useful when computing transforms of new
294 |         data where you'd like to keep the old data fixed.
295 | 
296 |     verbose : int
297 |         Verbosity level.
298 | 
299 |     Returns
300 |     -------
301 |     kl_divergence : float
302 |         Kullback-Leibler divergence of p_ij and q_ij.
303 | 
304 |     grad : array, shape (n_params,)
305 |         Unraveled gradient of the Kullback-Leibler divergence with respect to
306 |         the embedding.
307 |     """
308 |     params = astype(params, np.float32, copy=False)
309 |     X_embedded = params.reshape(n_samples, n_components)
310 |     neighbors = astype(neighbors, np.int64, copy=False)
311 |     if len(P.shape) == 1:
312 |         sP = squareform(P).astype(np.float32)
313 |     else:
314 |         sP = P.astype(np.float32)
315 | 
316 |     grad = np.zeros(X_embedded.shape, dtype=np.float32)
317 |     error = _barnes_hut_tsne.gradient(sP, X_embedded, neighbors,
318 |                                       grad, angle, n_components, verbose,
319 |                                       dof=degrees_of_freedom)
320 |     c = 2.0 * (degrees_of_freedom + 1.0) / degrees_of_freedom
321 |     grad = grad.ravel()
322 |     grad *= c
323 | 
324 |     return error, grad
325 | 
326 | 
327 | def _gradient_descent(objective, p0, it, n_iter, objective_error=None,
328 |                       n_iter_check=1, n_iter_without_progress=50,
329 |                       momentum=0.5, learning_rate=1000.0, min_gain=0.01,
330 |                       min_grad_norm=1e-7, min_error_diff=1e-7, verbose=0,
331 |                       args=None, kwargs=None):
332 |     """Batch gradient descent with momentum and individual gains.
333 | 
334 |     Parameters
335 |     ----------
336 |     objective : function or callable
337 |         Should return a tuple of cost and gradient for a given parameter
338 |         vector. When expensive to compute, the cost can optionally
339 |         be None and can be computed every n_iter_check steps using
340 |         the objective_error function.
341 | 
342 |     p0 : array-like, shape (n_params,)
343 |         Initial parameter vector.
344 | 
345 |     it : int
346 |         Current number of iterations (this function will be called more than
347 |         once during the optimization).
348 | 
349 |     n_iter : int
350 |         Maximum number of gradient descent iterations.
351 | 
352 |     n_iter_check : int
353 |         Number of iterations before evaluating the global error. If the error
354 |         is sufficiently low, we abort the optimization.
355 | 
356 |     objective_error : function or callable
357 |         Should return the cost for a given parameter vector; used to
358 |         re-evaluate the error when `objective` returns a cost of None.
359 | 
360 |     n_iter_without_progress : int, optional (default: 50)
361 |         Maximum number of iterations without progress before we abort the
362 |         optimization.
363 | 
364 |     momentum : float, within (0.0, 1.0), optional (default: 0.5)
365 |         The momentum generates a weight for previous gradients that decays
366 |         exponentially.
367 | 
368 |     learning_rate : float, optional (default: 1000.0)
369 |         The learning rate should be extremely high for t-SNE! Values in the
370 |         range [100.0, 1000.0] are common.
371 | 
372 |     min_gain : float, optional (default: 0.01)
373 |         Minimum individual gain for each parameter.
374 | 
375 |     min_grad_norm : float, optional (default: 1e-7)
376 |         If the gradient norm is below this threshold, the optimization will
377 |         be aborted.
378 | 
379 |     min_error_diff : float, optional (default: 1e-7)
380 |         If the absolute difference of two successive cost function values
381 |         is below this threshold, the optimization will be aborted.
382 | 
383 |     verbose : int, optional (default: 0)
384 |         Verbosity level.
385 | 
386 |     args : sequence
387 |         Arguments to pass to objective function.
388 | 
389 |     kwargs : dict
390 |         Keyword arguments to pass to objective function.
391 | 
392 |     Returns
393 |     -------
394 |     p : array, shape (n_params,)
395 |         Optimum parameters.
396 | 
397 |     error : float
398 |         Optimum value of the cost function.
399 | 
400 |     i : int
401 |         Last iteration.
402 |     """
403 |     if args is None:
404 |         args = []
405 |     if kwargs is None:
406 |         kwargs = {}
407 | 
408 |     p = p0.copy().ravel()
409 |     update = np.zeros_like(p)
410 |     gains = np.ones_like(p)
411 |     error = np.finfo(np.float64).max
412 |     best_error = np.finfo(np.float64).max
413 |     best_iter = 0
414 | 
415 |     for i in range(it, n_iter):
416 |         new_error, grad = objective(p, *args, **kwargs)
417 |         grad_norm = linalg.norm(grad)
418 | 
419 |         inc = update * grad >= 0.0
420 |         dec = np.invert(inc)
421 |         gains[inc] += 0.05
422 |         gains[dec] *= 0.95
423 |         np.clip(gains, min_gain, np.inf, out=gains)  # clip in place
424 |         grad *= gains
425 |         update = momentum * update - learning_rate * grad
426 |         p += update
427 | 
428 |         if (i + 1) % n_iter_check == 0:
429 |             if new_error is None:
430 |                 new_error = objective_error(p, *args)
431 |             error_diff = np.abs(new_error - error)
432 |             error = new_error
433 | 
434 |             if verbose >= 2:
435 |                 m = "[t-SNE] Iteration %d: error = %.7f, gradient norm = %.7f"
436 |                 print(m % (i + 1, error, grad_norm))
437 | 
438 |             if error < best_error:
439 |                 best_error = error
440 |                 best_iter = i
441 |             elif i - best_iter > n_iter_without_progress:
442 |                 if verbose >= 2:
443 |                     print("[t-SNE] Iteration %d: did not make any progress "
444 |                           "during the last %d episodes. Finished."
445 |                           % (i + 1, n_iter_without_progress))
446 |                 break
447 |             if grad_norm <= min_grad_norm:
448 |                 if verbose >= 2:
449 |                     print("[t-SNE] Iteration %d: gradient norm %f. Finished."
450 |                           % (i + 1, grad_norm))
451 |                 break
452 |             if error_diff <= min_error_diff:
453 |                 if verbose >= 2:
454 |                     m = "[t-SNE] Iteration %d: error difference %f. Finished."
455 |                     print(m % (i + 1, error_diff))
456 |                 break
457 | 
458 |         if new_error is not None:
459 |             error = new_error
460 | 
461 |     return p, error, i
462 | 
463 | 
464 | def trustworthiness(X, X_embedded, n_neighbors=5, precomputed=False):
465 |     """Expresses to what extent the local structure is retained.
466 | 
467 |     The trustworthiness is within [0, 1]. It is defined as
468 | 
469 |     .. math::
470 | 
471 |         T(k) = 1 - \frac{2}{nk (2n - 3k - 1)} \sum^n_{i=1}
472 |             \sum_{j \in U^{(k)}_i} (r(i, j) - k)
473 | 
474 |     where for each sample i, :math:`r(i, j)` is the rank of sample j
475 |     according to the pairwise distances in the original input space, and
476 |     :math:`U^{(k)}_i` is the set of points that are in the k nearest
477 |     neighbors in the embedded space but not in the original space.
478 | 
479 |     * "Neighborhood Preservation in Nonlinear Projection Methods: An
480 |       Experimental Study"
481 |       J. Venna, S. Kaski
482 |     * "Learning a Parametric Embedding by Preserving Local Structure"
483 |       L.J.P. van der Maaten
484 | 
485 |     Parameters
486 |     ----------
487 |     X : array, shape (n_samples, n_features) or (n_samples, n_samples)
488 |         If the metric is 'precomputed' X must be a square distance
489 |         matrix. Otherwise it contains a sample per row.
490 | 
491 |     X_embedded : array, shape (n_samples, n_components)
492 |         Embedding of the training data in low-dimensional space.
493 | 
494 |     n_neighbors : int, optional (default: 5)
495 |         Number of neighbors k that will be considered.
496 | 
497 |     precomputed : bool, optional (default: False)
498 |         Set this flag if X is a precomputed square distance matrix.
499 | 
500 |     Returns
501 |     -------
502 |     trustworthiness : float
503 |         Trustworthiness of the low-dimensional embedding.
504 |     """
505 |     if precomputed:
506 |         dist_X = X
507 |     else:
508 |         dist_X = pairwise_distances(X, squared=True)
509 |     dist_X_embedded = pairwise_distances(X_embedded, squared=True)
510 |     ind_X = np.argsort(dist_X, axis=1)
511 |     ind_X_embedded = np.argsort(dist_X_embedded, axis=1)[:, 1:n_neighbors + 1]
512 | 
513 |     n_samples = X.shape[0]
514 |     t = 0.0
515 |     ranks = np.zeros(n_neighbors)
516 |     for i in range(n_samples):
517 |         for j in range(n_neighbors):
518 |             # rank of the jth embedded neighbor in the original-space ordering
519 |             ranks[j] = np.where(ind_X[i] == ind_X_embedded[i, j])[0][0]
520 |         ranks -= n_neighbors
521 |         t += np.sum(ranks[ranks > 0])
522 |     t = 1.0 - t * (2.0 / (n_samples * n_neighbors *
523 |                           (2.0 * n_samples - 3.0 * n_neighbors - 1.0)))
524 |     return t
525 | 
526 | 
527 | class SemiSupervisedTSNE(BaseEstimator):
528 |     """Semi Supervised t-distributed Stochastic Neighbor Embedding.
529 | 
530 |     t-SNE [1] is a tool to visualize high-dimensional data. It converts
531 |     similarities between data points to joint probabilities and tries
532 |     to minimize the Kullback-Leibler divergence between the joint
533 |     probabilities of the low-dimensional embedding and the
534 |     high-dimensional data. t-SNE has a cost function that is not convex,
535 |     i.e. with different initializations we can get different results.
536 | 
537 |     It is highly recommended to use another dimensionality reduction
538 |     method (e.g. PCA for dense data or TruncatedSVD for sparse data)
539 |     to reduce the number of dimensions to a reasonable amount (e.g. 50)
540 |     if the number of features is very high. This will suppress some
541 |     noise and speed up the computation of pairwise distances between
542 |     samples. For more tips see Laurens van der Maaten's FAQ [2].
543 | 
544 |     This semi-supervised version of t-SNE supports an incomplete labelling
545 |     being supplied. This labelling is then used to inform the dimension
546 |     reduction such that samples with the same label are more likely to
547 |     be close, while samples with different labels are more likely to be
548 |     separated.
549 | 
550 |     Read more in the scikit-learn t-SNE User Guide.
551 | 
552 |     Parameters
553 |     ----------
554 |     n_components : int, optional (default: 2)
555 |         Dimension of the embedded space.
556 | 
557 |     label_importance : float, optional (default: 1.0)
558 |         How much to weight the importance of the labels when determining
559 |         the transformation. In practice this determines how far from
560 |         a uniform distribution to make the label-based prior.
561 | 
562 |     class_sizes_are_representative : boolean, optional (default: False)
563 |         If the labelled class sizes are representative of the full/true
564 |         labelling then the prior can be weighted by class size, which
565 |         handles significant variation in class sizes well. Unless you know
566 |         that you have a representative sample labelled it is best to leave
567 |         this False.
568 | 
569 |     perplexity : float, optional (default: 30)
570 |         The perplexity is related to the number of nearest neighbors that
571 |         is used in other manifold learning algorithms. Larger datasets
572 |         usually require a larger perplexity. Consider selecting a value
573 |         between 5 and 50. The choice is not extremely critical since t-SNE
574 |         is quite insensitive to this parameter.
575 | 
576 |     early_exaggeration : float, optional (default: 4.0)
577 |         Controls how tight natural clusters in the original space are in
578 |         the embedded space and how much space will be between them. For
579 |         larger values, the space between natural clusters will be larger
580 |         in the embedded space. Again, the choice of this parameter is not
581 |         very critical. If the cost function increases during initial
582 |         optimization, the early exaggeration factor or the learning rate
583 |         might be too high.
584 | 
585 |     learning_rate : float, optional (default: 1000)
586 |         The learning rate can be a critical parameter. It should be
587 |         between 100 and 1000. If the cost function increases during initial
588 |         optimization, the early exaggeration factor or the learning rate
589 |         might be too high. If the cost function gets stuck in a bad local
590 |         minimum, increasing the learning rate sometimes helps.
591 | 
592 |     n_iter : int, optional (default: 1000)
593 |         Maximum number of iterations for the optimization. Should be at
594 |         least 200.
595 | 
596 |     n_iter_without_progress : int, optional (default: 30)
597 |         Maximum number of iterations without progress before we abort the
598 |         optimization.
599 | 
600 |     .. versionadded:: 0.17
601 |         parameter *n_iter_without_progress* to control stopping criteria.
602 | 
603 |     min_grad_norm : float, optional (default: 1e-7)
604 |         If the gradient norm is below this threshold, the optimization will
605 |         be aborted.
606 | 
607 |     metric : string or callable, optional
608 |         The metric to use when calculating distance between instances in a
609 |         feature array. If metric is a string, it must be one of the options
610 |         allowed by scipy.spatial.distance.pdist for its metric parameter, or
611 |         a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS.
612 |         If metric is "precomputed", X is assumed to be a distance matrix.
613 |         Alternatively, if metric is a callable function, it is called on each
614 |         pair of instances (rows) and the resulting value recorded. The callable
615 |         should take two arrays from X as input and return a value indicating
616 |         the distance between them. The default is "euclidean", which is
617 |         interpreted as squared euclidean distance.
618 | 
619 |     init : string or numpy array, optional (default: "random")
620 |         Initialization of embedding. Options are 'random', 'pca', or a numpy
621 |         array of shape (n_samples, n_components). PCA initialization cannot be
622 |         used with precomputed distances and is usually more globally stable than random initialization.
623 | 
624 |     verbose : int, optional (default: 0)
625 |         Verbosity level.
626 | 
627 |     random_state : int or RandomState instance or None (default)
628 |         Pseudo Random Number generator seed control. If None, use the
629 |         numpy.random singleton. Note that different initializations
630 |         might result in different local minima of the cost function.
631 | 
632 |     method : string (default: 'barnes_hut')
633 |         By default the gradient calculation algorithm uses the Barnes-Hut
634 |         approximation, which runs in O(N log N) time. method='exact'
635 |         will run the slower, but exact, algorithm in O(N^2) time. The
636 |         exact algorithm should be used when nearest-neighbor errors need
637 |         to be better than 3%. However, the exact method cannot scale to
638 |         millions of examples.
639 | 
640 |     .. versionadded:: 0.17
641 |         Approximate optimization *method* via Barnes-Hut.
642 | 
643 |     angle : float (default: 0.5)
644 |         Only used if method='barnes_hut'.
645 |         This is the trade-off between speed and accuracy for Barnes-Hut T-SNE.
646 |         'angle' is the angular size (referred to as theta in [3]) of a distant
647 |         node as measured from a point. If this size is below 'angle' then it is
648 |         used as a summary node of all points contained within it.
649 |         This method is not very sensitive to changes in this parameter
650 |         in the range of 0.2 - 0.8. An angle less than 0.2 has quickly increasing
651 |         computation time and an angle greater than 0.8 has quickly increasing error.
652 | 
653 | 
654 |     Attributes
655 |     ----------
656 |     embedding_ : array-like, shape (n_samples, n_components)
657 |         Stores the embedding vectors.
658 | 
659 |     kl_divergence_ : float
660 |         Kullback-Leibler divergence after optimization.
661 | 
662 |     Examples
663 |     --------
664 | 
665 |     >>> import numpy as np
666 |     >>> from sstsne import SemiSupervisedTSNE
667 |     >>> X = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]])
668 |     >>> # partial labelling: -1 marks unlabelled points
669 |     >>> y = np.array([0, 0, 1, -1])
670 |     >>> model = SemiSupervisedTSNE(n_components=2, random_state=0)
671 |     >>> X_embedded = model.fit_transform(X, y)
672 |     >>> X_embedded.shape
673 |     (4, 2)
674 | 
675 |     References
676 |     ----------
677 | 
678 |     [1] van der Maaten, L.J.P.; Hinton, G.E. Visualizing High-Dimensional Data
679 |         Using t-SNE. Journal of Machine Learning Research 9:2579-2605, 2008.
680 | 
681 |     [2] van der Maaten, L.J.P. t-Distributed Stochastic Neighbor Embedding
682 |         http://homepage.tudelft.nl/19j49/t-SNE.html
683 | 
684 |     [3] L.J.P. van der Maaten. Accelerating t-SNE using Tree-Based Algorithms.
685 |         Journal of Machine Learning Research 15(Oct):3221-3245, 2014.
686 |         http://lvdmaaten.github.io/publications/papers/JMLR_2014.pdf
687 |     """
688 | 
689 |     def __init__(self, n_components=2, label_importance=1.0,
690 |                  class_sizes_are_representative=False, perplexity=30.0,
691 |                  early_exaggeration=4.0, learning_rate=1000.0, n_iter=1000,
692 |                  n_iter_without_progress=30, min_grad_norm=1e-7,
693 |                  metric="euclidean", init="random", verbose=0,
694 |                  random_state=None, method='barnes_hut', angle=0.5):
695 |         # An ndarray init is valid; only reject unsupported string values
696 |         if not (isinstance(init, np.ndarray) or init in ["pca", "random"]):
697 |             raise ValueError("'init' must be 'pca', 'random' or a NumPy array")
698 |         self.n_components = n_components
699 |         self.label_importance = label_importance
700 |         self.class_sizes_are_representative = class_sizes_are_representative
701 |         self.perplexity = perplexity
702 |         self.early_exaggeration = early_exaggeration
703 |         self.learning_rate = learning_rate
704 |         self.n_iter = n_iter
705 |         self.n_iter_without_progress = n_iter_without_progress
706 |         self.min_grad_norm = min_grad_norm
707 |         self.metric = metric
708 |         self.init = init
709 |         self.verbose = verbose
710 |         self.random_state = random_state
711 |         self.method = method
712 |         self.angle = angle
713 |         self.embedding_ = None
714 | 
715 |     def _fit(self, X, y, skip_num_points=0):
716 |         """Fit the model using X as training data, and y
717 |         as the (partial) labelling.
718 | 
719 |         Note that sparse arrays can only be handled by method='exact'.
720 |         It is recommended that you convert your sparse array to dense
721 |         (e.g. `X.toarray()`) if it fits in memory, or otherwise to use a
722 |         dimensionality reduction technique (e.g. TruncatedSVD).
723 | 
724 |         Parameters
725 |         ----------
726 |         X : array, shape (n_samples, n_features) or (n_samples, n_samples)
727 |             If the metric is 'precomputed' X must be a square distance
728 |             matrix. Otherwise it contains a sample per row. Note that
729 |             when method='barnes_hut', X cannot be a sparse array and will
730 |             if need be converted to a 32-bit float array. method='exact'
731 |             allows sparse arrays and 64-bit floating point inputs.
732 | 
733 |         y : array, shape (n_samples,)
734 |             Labels must be integers, with unlabelled points given the label -1.
735 | 
736 |         skip_num_points : int (optional, default: 0)
737 |             This does not compute the gradient for points with indices below
738 |             `skip_num_points`. This is useful when computing transforms of new
739 |             data where you'd like to keep the old data fixed.
740 |         """
741 |         if self.method not in ['barnes_hut', 'exact']:
742 |             raise ValueError("'method' must be 'barnes_hut' or 'exact'")
743 |         if self.angle < 0.0 or self.angle > 1.0:
744 |             raise ValueError("'angle' must be between 0.0 and 1.0")
745 |         if self.method == 'barnes_hut' and sp.issparse(X):
746 |             raise TypeError('A sparse matrix was passed, but dense '
747 |                             'data is required for method="barnes_hut". Use '
748 |                             'X.toarray() to convert to a dense numpy array if '
749 |                             'the array is small enough for it to fit in '
750 |                             'memory. Otherwise consider dimensionality '
751 |                             'reduction techniques (e.g. TruncatedSVD)')
752 |         else:
753 |             X = check_array(X, accept_sparse=['csr', 'csc', 'coo'],
754 |                             dtype=np.float64)
755 |         random_state = check_random_state(self.random_state)
756 | 
757 |         if self.early_exaggeration < 1.0:
758 |             raise ValueError("early_exaggeration must be at least 1, but is "
759 |                              "%f" % self.early_exaggeration)
760 | 
761 |         if self.n_iter < 200:
762 |             raise ValueError("n_iter should be at least 200")
763 | 
764 |         if self.metric == "precomputed":
765 |             if self.init == 'pca':
766 |                 raise ValueError("The parameter init=\"pca\" cannot be used "
767 |                                  "with metric=\"precomputed\".")
768 |             if X.shape[0] != X.shape[1]:
769 |                 raise ValueError("X should be a square distance matrix")
770 |             distances = X
771 |         else:
772 |             if self.verbose:
773 |                 print("[t-SNE] Computing pairwise distances...")
774 | 
775 |             if self.metric == "euclidean":
776 |                 distances = pairwise_distances(X, metric=self.metric,
777 |                                                squared=True)
778 |             else:
779 |                 distances = pairwise_distances(X, metric=self.metric)
780 | 
781 |         if not np.all(distances >= 0):
782 |             raise ValueError("All distances should be non-negative; either "
783 |                              "the metric or the precomputed distances given "
784 |                              "as X are not correct")
785 | 
786 |         # Degrees of freedom of the Student's t-distribution. The suggestion
787 |         # degrees_of_freedom = n_components - 1 comes from
788 |         # "Learning a Parametric Embedding by Preserving Local Structure"
789 |         # Laurens van der Maaten, 2009.
790 |         degrees_of_freedom = max(self.n_components - 1.0, 1)
791 |         n_samples = X.shape[0]
792 |         # the number of nearest neighbors to find
793 |         k = min(n_samples - 1, int(3. * self.perplexity + 1))
794 | 
795 |         neighbors_nn = None
796 |         if self.method == 'barnes_hut':
797 |             if self.verbose:
798 |                 print("[t-SNE] Computing %i nearest neighbors..." % k)
799 |             if self.metric == 'precomputed':
800 |                 # Use the precomputed distances to find
801 |                 # the k nearest neighbors and their distances
802 |                 neighbors_nn = np.argsort(distances, axis=1)[:, :k]
803 |             else:
804 |                 # Find the nearest neighbors for every point
805 |                 bt = BallTree(X)
806 |                 # LvdM uses 3 * perplexity as the number of neighbors,
807 |                 # and we add one to avoid counting the data point itself.
808 |                 # In the event that we have a very small number of points,
809 |                 # set the neighbors to n - 1.
810 |                 distances_nn, neighbors_nn = bt.query(X, k=k + 1)
811 |                 neighbors_nn = neighbors_nn[:, 1:]
812 |             P = _joint_probabilities_nn(distances, neighbors_nn, y,
813 |                                         self.label_importance,
814 |                                         self.class_sizes_are_representative,
815 |                                         self.perplexity, self.verbose)
816 |         else:
817 |             P = _joint_probabilities(distances, y,
818 |                                      self.label_importance,
819 |                                      self.class_sizes_are_representative,
820 |                                      self.perplexity, self.verbose)
821 |         assert np.all(np.isfinite(P)), "All probabilities should be finite"
822 |         assert np.all(P >= 0), "All probabilities should be zero or positive"
823 |         assert np.all(P <= 1), ("All probabilities should be less "
824 |                                 "than or equal to one")
825 | 
826 |         if self.init == 'pca':
827 |             pca = PCA(n_components=self.n_components, svd_solver='randomized',
828 |                       random_state=random_state)
829 |             X_embedded = pca.fit_transform(X)
830 |         elif isinstance(self.init, np.ndarray):
831 |             X_embedded = self.init
832 |         elif self.init == 'random':
833 |             X_embedded = None
834 |         else:
835 |             raise ValueError("Unsupported initialization scheme: %s"
836 |                              % self.init)
837 | 
838 |         return self._tsne(P, degrees_of_freedom, n_samples, random_state,
839 |                           X_embedded=X_embedded,
840 |                           neighbors=neighbors_nn,
841 |                           skip_num_points=skip_num_points)
842 | 
843 |     def _tsne(self, P, degrees_of_freedom, n_samples, random_state,
844 |               X_embedded=None, neighbors=None, skip_num_points=0):
845 |         """Runs t-SNE."""
846 |         # t-SNE minimizes the Kullback-Leibler divergence of the Gaussians P
847 |         # and the Student's t-distributions Q. The optimization algorithm that
848 |         # we use is batch gradient descent with three stages:
849 |         # * early exaggeration with momentum 0.5
850 |         # * early exaggeration with momentum 0.8
851 |         # * final optimization with momentum 0.8
852 |         # The embedding is initialized with iid samples from Gaussians with
853 |         # standard deviation 1e-4.
854 | 
855 |         if X_embedded is None:
856 |             # Initialize embedding randomly
857 |             X_embedded = 1e-4 * random_state.randn(n_samples,
858 |                                                    self.n_components)
859 |         params = X_embedded.ravel()
860 | 
861 |         # Arguments for the first (early exaggeration) optimization stage
862 |         opt_args = {"n_iter": 50, "momentum": 0.5, "it": 0,
863 |                     "learning_rate": self.learning_rate,
864 |                     "verbose": self.verbose, "n_iter_check": 25,
865 |                     "kwargs": dict(skip_num_points=skip_num_points)}
866 |         if self.method == 'barnes_hut':
867 |             m = "Must provide an array of neighbors to use Barnes-Hut"
868 |             assert neighbors is not None, m
869 |             obj_func = _kl_divergence_bh
870 |             objective_error = _kl_divergence_error
871 |             sP = squareform(P).astype(np.float32)
872 |             neighbors = neighbors.astype(np.int64)
873 |             args = [sP, neighbors, degrees_of_freedom, n_samples,
874 |                     self.n_components]
875 |             opt_args['args'] = args
876 |             opt_args['min_grad_norm'] = 1e-3
877 |             opt_args['n_iter_without_progress'] = 30
878 |             # Don't always calculate the cost since that calculation
879 |             # can be nearly as expensive as the gradient
880 |             opt_args['objective_error'] = objective_error
881 |             opt_args['kwargs']['angle'] = self.angle
882 |             opt_args['kwargs']['verbose'] = self.verbose
883 |         else:
884 |             obj_func = _kl_divergence
885 |             opt_args['args'] = [P, degrees_of_freedom, n_samples,
886 |                                 self.n_components]
887 |             opt_args['min_error_diff'] = 0.0
888 |             opt_args['min_grad_norm'] = 0.0
889 | 
890 |         # Early exaggeration
891 |         P *= self.early_exaggeration
892 | 
893 |         params, kl_divergence, it = _gradient_descent(obj_func, params,
894 |                                                       **opt_args)
895 |         opt_args['n_iter'] = 100
896 |         opt_args['momentum'] = 0.8
897 |         opt_args['it'] = it + 1
898 |         params, kl_divergence, it = _gradient_descent(obj_func, params,
899 |                                                       **opt_args)
900 |         if self.verbose:
901 |             print("[t-SNE] KL divergence after %d iterations with early "
902 |                   "exaggeration: %f" % (it + 1, kl_divergence))
903 |         # Final optimization: undo exaggeration and run the full schedule.
904 |         P /= self.early_exaggeration
905 |         opt_args['n_iter'] = self.n_iter
906 |         opt_args['it'] = it + 1
907 |         params, kl_divergence, it = _gradient_descent(obj_func, params,
908 |                                                       **opt_args)
909 |         # Save the final number of iterations
910 |         self.n_iter_final = it
911 | 
912 |         if self.verbose:
913 |             print("[t-SNE] Error after %d iterations: %f"
914 |                   % (it + 1, kl_divergence))
915 | 
916 |         X_embedded = params.reshape(n_samples, self.n_components)
917 |         self.kl_divergence_ = kl_divergence
918 | 
919 |         return X_embedded
920 | 
921 |     def fit_transform(self, X, y):
922 |         """Fit X into an embedded space and return that transformed
923 |         output.
924 | 
925 |         Parameters
926 |         ----------
927 |         X : array, shape (n_samples, n_features) or (n_samples, n_samples)
928 |             If the metric is 'precomputed' X must be a square distance
929 |             matrix. Otherwise it contains a sample per row.
930 | 
931 |         y : array, shape (n_samples,)
932 |             A (partial) labelling of the samples. The array should provide
933 |             a label value for each sample. Labels must be integers, with
934 |             unlabelled points given the label -1.
935 | 
936 |         Returns
937 |         -------
938 |         X_new : array, shape (n_samples, n_components)
939 |             Embedding of the training data in low-dimensional space.
940 |         """
941 |         embedding = self._fit(X, y)
942 |         self.embedding_ = embedding
943 |         return self.embedding_
944 | 
945 |     def fit(self, X, y):
946 |         """Fit X into an embedded space.
947 | 
948 |         Parameters
949 |         ----------
950 |         X : array, shape (n_samples, n_features) or (n_samples, n_samples)
951 |             If the metric is 'precomputed' X must be a square distance
952 |             matrix. Otherwise it contains a sample per row. If the method
953 |             is 'exact', X may be a sparse matrix of type 'csr', 'csc'
954 |             or 'coo'.
955 | 
956 |         y : array, shape (n_samples,)
957 |             A (partial) labelling of the samples. The array should provide
958 |             a label value for each sample. Labels must be integers, with
959 |             unlabelled points given the label -1.
960 | 
961 |         Returns
962 |         -------
963 |         self : SemiSupervisedTSNE
964 |             The fitted estimator.
965 |         """
966 |         self.fit_transform(X, y)
967 |         return self
968 | 
--------------------------------------------------------------------------------
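
As a closing illustration, here is a minimal usage sketch (not a file from the repository). It relies only on the public names documented in ss_t_sne.py above: SemiSupervisedTSNE, its fit_transform(X, y) signature with -1 marking unlabelled points, and the module-level trustworthiness helper. The synthetic data and parameter values are illustrative assumptions, not part of the project.

import numpy as np

from sstsne import SemiSupervisedTSNE
from sstsne.ss_t_sne import trustworthiness

# Two Gaussian blobs in 10 dimensions; label one point from each blob
# and mark every other point as unlabelled (-1).
rng = np.random.RandomState(42)
X = np.vstack([rng.normal(0.0, 1.0, size=(50, 10)),
               rng.normal(5.0, 1.0, size=(50, 10))])
y = np.full(100, -1, dtype=np.int64)
y[0] = 0
y[50] = 1

model = SemiSupervisedTSNE(n_components=2, perplexity=15.0,
                           label_importance=1.0, random_state=0)
X_embedded = model.fit_transform(X, y)

print(X_embedded.shape)       # (100, 2)
print(model.kl_divergence_)   # KL divergence after optimization
print(trustworthiness(X, X_embedded, n_neighbors=5))  # closer to 1 is better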