├── sstsne
│   ├── __init__.py
│   ├── _utils.pyx
│   ├── _barnes_hut_tsne.pyx
│   └── ss_t_sne.py
├── README.md
├── LICENSE
├── .gitignore
└── setup.py

/sstsne/__init__.py:
--------------------------------------------------------------------------------
1 | from .ss_t_sne import SemiSupervisedTSNE
2 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # sstsne
2 | Semi-Supervised t-SNE using a Bayesian prior based on partial labelling
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2016, Leland McInnes
2 | All rights reserved.
3 | 
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions are met:
6 | 
7 | * Redistributions of source code must retain the above copyright notice, this
8 |   list of conditions and the following disclaimer.
9 | 
10 | * Redistributions in binary form must reproduce the above copyright notice,
11 |   this list of conditions and the following disclaimer in the documentation
12 |   and/or other materials provided with the distribution.
13 | 
14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
15 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
20 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
21 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
22 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 | 
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 | 
60 | # Scrapy stuff:
61 | .scrapy
62 | 
63 | # Sphinx documentation
64 | docs/_build/
65 | 
66 | # PyBuilder
67 | target/
68 | 
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 | 
72 | # pyenv
73 | .python-version
74 | 
75 | # celery beat schedule file
76 | celerybeat-schedule
77 | 
78 | # dotenv
79 | .env
80 | 
81 | # virtualenv
82 | venv/
83 | ENV/
84 | 
85 | # Spyder project settings
86 | .spyderproject
87 | 
88 | # Rope project settings
89 | .ropeproject
90 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | 
3 | try:
4 |     from Cython.Distutils import build_ext
5 |     from setuptools import setup, Extension
6 |     HAVE_CYTHON = True
7 | except ImportError as e:
8 |     warnings.warn(str(e))
9 |     from setuptools import setup, Extension
10 |     from setuptools.command.build_ext import build_ext
11 |     HAVE_CYTHON = False
12 | 
13 | import numpy
14 | 
15 | _utils = Extension('sstsne._utils',
16 |                    sources=['sstsne/_utils.pyx'],
17 |                    include_dirs=[numpy.get_include()])
18 | 
19 | _barnes_hut_tsne = Extension('sstsne._barnes_hut_tsne',
20 |                              sources=['sstsne/_barnes_hut_tsne.pyx'],
21 |                              include_dirs=[numpy.get_include(), '/System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/Headers/'])
22 | 
23 | def readme():
24 |     with open('README.md') as readme_file:
25 |         return readme_file.read()
26 | 
27 | configuration = {
28 |     'name' : 'sstsne',
29 |     'version' : '0.1',
30 |     'description' : 'Semi-Supervised t-SNE using a Bayesian prior based on partial labelling',
31 |     'long_description' : readme(),
32 |     'classifiers' : [
33 |         'Development Status :: 3 - Alpha',
34 |         'Intended Audience :: Science/Research',
35 |         'Intended Audience :: Developers',
36 |         'License :: OSI Approved',
37 |         'Programming Language :: C',
38 |         'Programming Language :: Python',
39 |         'Topic :: Software Development',
40 |         'Topic :: Scientific/Engineering',
41 |         'Operating System :: Microsoft :: Windows',
42 |         'Operating System :: POSIX',
43 |         'Operating System :: Unix',
44 |         'Operating System :: MacOS',
45 |         'Programming Language :: Python :: 2.7',
46 |         'Programming Language :: Python :: 3.4',
47 |     ],
48 |     'keywords' : 'tsne semi-supervised dimension reduction',
49 |     'url' : 'http://github.com/lmcinnes/sstsne',
50 |     'maintainer' : 'Leland McInnes',
51 |     'maintainer_email' : 'leland.mcinnes@gmail.com',
52 |     'license' : 'BSD',
53 |     'packages' : ['sstsne'],
54 |     'install_requires' : ['scikit-learn>=0.17.1',
55 |                           'cython >= 0.17'],
56 |     'ext_modules' : [_utils,
57 |                      _barnes_hut_tsne],
58 |     'cmdclass' : {'build_ext' : build_ext},
59 |     'test_suite' : 'nose.collector',
60 |     'tests_require' : ['nose'],
61 |     }
62 | 
63 | if not HAVE_CYTHON:
64 |     _utils.sources[0] = 'sstsne/_utils.c'
65 |     _barnes_hut_tsne.sources[0] = 'sstsne/_barnes_hut_tsne.c'
66 |     configuration['install_requires'] = ['scikit-learn>=0.17.1']
67 | 
68 | setup(**configuration)
--------------------------------------------------------------------------------
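Build note: the package compiles its Cython extensions at install time, so a local checkout is typically built with `python setup.py build_ext --inplace` or installed with `pip install .`; as handled in setup.py above, the build falls back to the pre-generated C sources when Cython is not available.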
/sstsne/_utils.pyx:
--------------------------------------------------------------------------------
1 | from libc cimport math
2 | cimport cython
3 | import numpy as np
4 | cimport numpy as np
5 | from libc.stdio cimport printf
6 | cdef extern from "numpy/npy_math.h":
7 |     float NPY_INFINITY
8 | 
9 | 
10 | cdef float EPSILON_DBL = 1e-8
11 | cdef float PERPLEXITY_TOLERANCE = 1e-5
12 | 
13 | @cython.boundscheck(False)
14 | cpdef np.ndarray[np.float32_t, ndim=2] _binary_search_perplexity(
15 |         np.ndarray[np.float32_t, ndim=2] affinities,
16 |         np.ndarray[np.int64_t, ndim=2] neighbors,
17 |         np.ndarray[np.int64_t, ndim=1] labels,
18 |         float label_importance,
19 |         int rep_samples,
20 |         float desired_perplexity,
21 |         int verbose):
22 |     """Binary search for sigmas of conditional Gaussians.
23 | 
24 |     When a neighbors array is supplied, this reduces the computational
25 |     complexity of the search from O(N^2) to O(uN).
26 | 
27 |     Parameters
28 |     ----------
29 |     affinities : array-like, shape (n_samples, n_samples)
30 |         Distances between training samples.
31 | 
32 |     neighbors : array-like, shape (n_samples, K) or None
33 |         Each row contains the indices to the K nearest neighbors. If this
34 |         array is None, then the perplexity is estimated over all data,
35 |         not just the nearest neighbors.
36 | 
37 |     labels : array-like, shape (n_samples,)
38 |         Integer labels for the samples. Unlabelled samples should have label -1.
39 | 
40 |     label_importance : float
41 |         Relative importance to place on the labelling.
42 | 
43 |     rep_samples : int
44 |         Whether the partial labels are a representative sample of the full labelling.
45 | 
46 |     desired_perplexity : float
47 |         Desired perplexity (2^entropy) of the conditional Gaussians.
48 | 
49 |     verbose : int
50 |         Verbosity level.
51 | 
52 |     Returns
53 |     -------
54 |     P : array, shape (n_samples, n_samples)
55 |         Probabilities of conditional Gaussian distributions p_i|j.
56 |     """
57 |     # Maximum number of binary search steps
58 |     cdef long n_steps = 100
59 | 
60 |     cdef long n_samples = affinities.shape[0]
61 |     # This array is later used as a 32bit array. It has multiple intermediate
62 |     # floating point additions that benefit from the extra precision
63 |     cdef np.ndarray[np.float64_t, ndim=2] P = np.zeros((n_samples, n_samples),
64 |                                                        dtype=np.float64)
65 |     # Precisions of conditional Gaussian distributions
66 |     cdef float beta
67 |     cdef float beta_min
68 |     cdef float beta_max
69 |     cdef float beta_sum = 0.0
70 |     # Now we go to log scale
71 |     cdef float desired_entropy = math.log(desired_perplexity)
72 |     cdef float entropy_diff
73 | 
74 |     cdef float entropy
75 |     cdef float sum_Pi
76 |     cdef float sum_disti_Pi
77 |     cdef float prior_prob
78 |     cdef float denominator
79 |     cdef long i, j, k, l = 0
80 |     cdef long K = n_samples
81 |     cdef int using_neighbors = neighbors is not None
82 | 
83 |     cdef np.ndarray[long, ndim=1] label_sizes = np.bincount(labels + 1)
84 |     cdef long n_same_label
85 |     cdef long n_other_label
86 |     cdef long n_unlabelled = label_sizes[0]
87 | 
88 |     if using_neighbors:
89 |         K = neighbors.shape[1]
90 | 
91 |     for i in range(n_samples):
92 |         beta_min = -NPY_INFINITY
93 |         beta_max = NPY_INFINITY
94 |         beta = 1.0
95 | 
96 |         # Binary search of precision for i-th conditional distribution
97 |         for l in range(n_steps):
98 |             # Compute current entropy and corresponding probabilities
99 |             # computed just over the nearest neighbors or over all data
100 |             # if we're not using neighbors
101 |             if using_neighbors:
102 |                 for k in range(K):
103 |                     j = neighbors[i, k]
104 |                     P[i, j] = math.exp(-affinities[i, j] * beta)
105 |             else:
106 |                 for j in range(K):
107 |                     P[i, j] = math.exp(-affinities[i, j] * beta)
108 |                 P[i, i] = 0.0
109 |             sum_Pi = 0.0
110 |             if using_neighbors:
111 |                 for k in range(K):
112 |                     j = neighbors[i, k]
113 |                     sum_Pi += P[i, j]
114 |             else:
115 |                 for j in range(K):
116 |                     sum_Pi += P[i, j]
117 |             if sum_Pi == 0.0:
118 |                 sum_Pi = EPSILON_DBL
119 |             sum_disti_Pi = 0.0
120 |             if using_neighbors:
121 |                 for k in range(K):
122 |                     j = neighbors[i, k]
123 |                     P[i, j] /= sum_Pi
124 |                     sum_disti_Pi += affinities[i, j] * P[i, j]
125 |             else:
126 |                 for j in range(K):
127 |                     P[i, j] /= sum_Pi
128 |                     sum_disti_Pi += affinities[i, j] * P[i, j]
129 |             entropy = math.log(sum_Pi) + beta * sum_disti_Pi
130 |             entropy_diff = entropy - desired_entropy
131 | 
132 |             if math.fabs(entropy_diff) <= PERPLEXITY_TOLERANCE:
133 |                 break
134 | 
135 |             if entropy_diff > 0.0:
136 |                 beta_min = beta
137 |                 if beta_max == NPY_INFINITY:
138 |                     beta *= 2.0
139 |                 else:
140 |                     beta = (beta + beta_max) / 2.0
141 |             else:
142 |                 beta_max = beta
143 |                 if beta_min == -NPY_INFINITY:
144 |                     beta /= 2.0
145 |                 else:
146 |                     beta = (beta + beta_min) / 2.0
147 | 
148 |         beta_sum += beta
149 | 
150 |         if verbose and ((i + 1) % 1000 == 0 or i + 1 == n_samples):
151 |             print("[t-SNE] Computed conditional probabilities for sample "
152 |                   "%d / %d" % (i + 1, n_samples))
153 | 
154 |     if verbose:
155 |         print("[t-SNE] Mean sigma: %f"
156 |               % np.mean(math.sqrt(n_samples / beta_sum)))
157 | 
158 |     for i in range(n_samples):
159 | 
160 |         sum_Pi = 0
161 | 
162 |         if using_neighbors:
163 | 
164 |             for k in range(K):
165 |                 j = neighbors[i, k]
166 | 
167 |                 n_same_label = label_sizes[labels[i] + 1]
168 |                 n_other_label = n_samples - n_same_label - n_unlabelled
169 | 
170 |                 if rep_samples:
171 | 
172 |                     denominator = n_same_label ** 2 + n_other_label ** 2 + n_unlabelled ** 2
173 | 
174 |                     if labels[i] == -1 or labels[j] == -1:
175 |                         prior_prob = n_unlabelled / denominator
176 |                     elif labels[j] == labels[i]:
177 |                         prior_prob = min((n_same_label / denominator) + (label_importance / n_same_label), 1.0 - EPSILON_DBL)
178 |                     else:
179 |                         prior_prob = max((n_other_label / denominator) - (label_importance / n_other_label), EPSILON_DBL)
180 | 
181 |                 else:
182 | 
183 |                     if labels[i] == -1 or labels[j] == -1:
184 |                         prior_prob = 1.0 / n_samples
185 |                     elif labels[j] == labels[i]:
186 |                         prior_prob = min((1.0 / n_samples) + (label_importance / n_same_label), 1.0 - EPSILON_DBL)
187 |                     else:
188 |                         prior_prob = max((1.0 / n_samples) - (label_importance / n_other_label), EPSILON_DBL)
189 | 
190 |                 P[i, j] *= prior_prob
191 |                 sum_Pi += P[i, j]
192 | 
193 |             for k in range(K):
194 |                 j = neighbors[i, k]
195 |                 P[i, j] /= sum_Pi
196 | 
197 |         else:
198 | 
199 |             for j in range(K):
200 | 
201 |                 n_same_label = label_sizes[labels[i] + 1]
202 |                 n_other_label = n_samples - n_same_label - n_unlabelled
203 | 
204 |                 if rep_samples:
205 | 
206 |                     denominator = n_same_label ** 2 + n_other_label ** 2 + n_unlabelled ** 2
207 | 
208 |                     if labels[i] == -1 or labels[j] == -1:
209 |                         prior_prob = n_unlabelled / denominator
210 |                     elif labels[j] == labels[i]:
211 |                         prior_prob = min((n_same_label / denominator) + (label_importance / n_same_label), 1.0 - EPSILON_DBL)
212 |                     else:
213 |                         prior_prob = max((n_other_label / denominator) - (label_importance / n_other_label), EPSILON_DBL)
214 | 
215 |                 else:
216 | 
217 |                     if labels[i] == -1 or labels[j] == -1:
218 |                         prior_prob = 1.0 / n_samples
219 |                     elif labels[j] == labels[i]:
220 |                         prior_prob = min((1.0 / n_samples) + (label_importance / n_same_label), 1.0 - EPSILON_DBL)
221 |                     else:
222 |                         prior_prob = max((1.0 / n_samples) - (label_importance / n_other_label), EPSILON_DBL)
223 | 
224 |                 P[i, j] *= prior_prob
225 |                 sum_Pi += P[i, j]
226 | 
227 |             for j in range(K):
228 |                 P[i, j] /= sum_Pi
229 | 
230 |     return P
231 | 
--------------------------------------------------------------------------------
/sstsne/_barnes_hut_tsne.pyx:
--------------------------------------------------------------------------------
1 | # cython: boundscheck=False
2 | # cython: wraparound=False
3 | # cython: cdivision=True
4 | # Author: Christopher Moody 
5 | # Author: Nick Travers 
6 | # Implementation by Chris Moody & Nick Travers
7 | # See http://homepage.tudelft.nl/19j49/t-SNE.html for reference
8 | # implementations and papers describing the technique
9 | 
10 | 
11 | from libc.stdlib cimport malloc, free
12 | from libc.stdio cimport printf
13 | from libc.math cimport sqrt, log
14 | cimport numpy as np
15 | import numpy as np
16 | 
17 | cdef char* EMPTY_STRING = ""
18 | 
19 | cdef extern from "math.h":
20 |     float fabsf(float x) nogil
21 | 
22 | # Round points differing by less than this amount
23 | # effectively ignoring differences near the 32bit
24 | # floating point precision
25 | cdef float EPSILON = 1e-6
26 | 
27 | # This is effectively an ifdef statement in Cython
28 | # It allows us to write printf debugging lines
29 | # and remove them at compile time
30 | cdef enum:
31 |     DEBUGFLAG = 0
32 | 
33 | cdef extern from "time.h":
34 |     # Declare only what is necessary from time.h.
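    # clock() reports CPU time in implementation-defined ticks; dividing by
    # CLOCKS_PER_SEC converts to seconds (the timing printfs below report
    # raw ticks)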
35 |     ctypedef long clock_t
36 |     clock_t clock() nogil
37 |     double CLOCKS_PER_SEC
38 | 
39 | 
40 | cdef extern from "cblas.h":
41 |     float snrm2 "cblas_snrm2"(int N, float *X, int incX) nogil
42 | 
43 | 
44 | cdef struct Node:
45 |     # Keep track of the center of mass
46 |     float* barycenter
47 |     # If this is a leaf, the position of the point within this leaf
48 |     float* leaf_point_position
49 |     # The number of points including all
50 |     # nodes below this one
51 |     long cumulative_size
52 |     # Number of points at this node
53 |     long size
54 |     # Index of the point at this node
55 |     # Only defined for non-empty leaf nodes
56 |     long point_index
57 |     # level = 0 is the root node
58 |     # And each subdivision adds 1 to the level
59 |     long level
60 |     # Left edge of this node
61 |     float* left_edge
62 |     # The center of this node, equal to le + w/2.0
63 |     float* center
64 |     # The width of this node -- used to calculate the opening
65 |     # angle. Equal to width = re - le
66 |     float* width
67 |     # The value of the maximum width w
68 |     float max_width
69 | 
70 |     # Does this node have children?
71 |     # Default to leaf until we add points
72 |     int is_leaf
73 |     # Array of pointers to pointers of children
74 |     Node **children
75 |     # Keep a pointer to the parent
76 |     Node *parent
77 |     # Pointer to the tree this node belongs to
78 |     Tree* tree
79 | 
80 | cdef struct Tree:
81 |     # Holds a pointer to the root node
82 |     Node* root_node
83 |     # Number of dimensions in the output
84 |     int n_dimensions
85 |     # Total number of cells
86 |     long n_cells
87 |     # Total number of points
88 |     long n_points
89 |     # Spit out diagnostic information?
90 |     int verbose
91 |     # How many cells per node? Should go as 2 ** n_dimensions
92 |     int n_cell_per_node
93 | 
94 | cdef Tree* init_tree(float[:] left_edge, float[:] width, int n_dimensions,
95 |                      int verbose) nogil:
96 |     # tree is freed by free_tree
97 |     cdef Tree* tree = malloc(sizeof(Tree))
98 |     tree.n_dimensions = n_dimensions
99 |     tree.n_cells = 0
100 |     tree.n_points = 0
101 |     tree.verbose = verbose
102 |     tree.root_node = create_root(left_edge, width, n_dimensions)
103 |     tree.root_node.tree = tree
104 |     tree.n_cells += 1
105 |     tree.n_cell_per_node = 2 ** n_dimensions
106 |     if DEBUGFLAG:
107 |         printf("[t-SNE] Tree initialised. Left_edge = (%1.9e, %1.9e, %1.9e)\n",
108 |                left_edge[0], left_edge[1], left_edge[2])
109 |         printf("[t-SNE] Tree initialised. Width = (%1.9e, %1.9e, %1.9e)\n",
110 |                width[0], width[1], width[2])
111 |     return tree
112 | 
113 | cdef Node* create_root(float[:] left_edge, float[:] width, int n_dimensions) nogil:
114 |     # Create a default root node
115 |     cdef int ax
116 |     cdef int n_cell_per_node = 2 ** n_dimensions
117 |     # root is freed by free_tree
118 |     root = <Node*> malloc(sizeof(Node))
119 |     root.is_leaf = 1
120 |     root.parent = NULL
121 |     root.level = 0
122 |     root.cumulative_size = 0
123 |     root.size = 0
124 |     root.point_index = -1
125 |     root.max_width = 0.0
126 |     root.width = malloc(sizeof(float) * n_dimensions)
127 |     root.left_edge = malloc(sizeof(float) * n_dimensions)
128 |     root.center = malloc(sizeof(float) * n_dimensions)
129 |     root.barycenter = malloc(sizeof(float) * n_dimensions)
130 |     root.leaf_point_position = malloc(sizeof(float) * n_dimensions)
131 |     root.children = NULL
132 |     for ax in range(n_dimensions):
133 |         root.width[ax] = width[ax]
134 |         root.left_edge[ax] = left_edge[ax]
135 |         root.center[ax] = 0.0
136 |         root.barycenter[ax] = 0.
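        # -1 is a sentinel meaning "no point stored here yet", matching the
        # point_index = -1 default set above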
137 |         root.leaf_point_position[ax] = -1
138 |     for ax in range(n_dimensions):
139 |         root.max_width = max(root.max_width, root.width[ax])
140 |     if DEBUGFLAG:
141 |         printf("[t-SNE] Created root node %p\n", root)
142 |     return root
143 | 
144 | cdef Node* create_child(Node *parent, int[3] offset) nogil:
145 |     # Create a new child node with default parameters
146 |     cdef int ax
147 |     # these children are freed by free_recursive
148 |     child = <Node*> malloc(sizeof(Node))
149 |     child.is_leaf = 1
150 |     child.parent = parent
151 |     child.level = parent.level + 1
152 |     child.size = 0
153 |     child.cumulative_size = 0
154 |     child.point_index = -1
155 |     child.tree = parent.tree
156 |     child.max_width = 0.0
157 |     child.width = malloc(sizeof(float) * parent.tree.n_dimensions)
158 |     child.left_edge = malloc(sizeof(float) * parent.tree.n_dimensions)
159 |     child.center = malloc(sizeof(float) * parent.tree.n_dimensions)
160 |     child.barycenter = malloc(sizeof(float) * parent.tree.n_dimensions)
161 |     child.leaf_point_position = malloc(sizeof(float) * parent.tree.n_dimensions)
162 |     child.children = NULL
163 |     for ax in range(parent.tree.n_dimensions):
164 |         child.width[ax] = parent.width[ax] / 2.0
165 |         child.left_edge[ax] = parent.left_edge[ax] + offset[ax] * parent.width[ax] / 2.0
166 |         child.center[ax] = child.left_edge[ax] + child.width[ax] / 2.0
167 |         child.barycenter[ax] = 0.
168 |         child.leaf_point_position[ax] = -1.
169 |     for ax in range(parent.tree.n_dimensions):
170 |         child.max_width = max(child.max_width, child.width[ax])
171 |     child.tree.n_cells += 1
172 |     return child
173 | 
174 | cdef Node* select_child(Node *node, float[3] pos, long index) nogil:
175 |     # Find which sub-node a position should go into
176 |     # And return the appropriate node
177 |     cdef int* offset = malloc(sizeof(int) * node.tree.n_dimensions)
178 |     cdef int ax, idx
179 |     cdef Node* child
180 |     cdef int error
181 |     for ax in range(node.tree.n_dimensions):
182 |         offset[ax] = (pos[ax] - (node.left_edge[ax] + node.width[ax] / 2.0)) > 0.
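    # offset is the binary encoding of the child index: offset[ax] is 1 when
    # the point lies in the upper half of axis ax. In two dimensions, for
    # example, offset [1, 0] selects child 2 = 1 * 2**1 + 0 * 2**0.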
183 |     idx = offset2index(offset, node.tree.n_dimensions)
184 |     child = node.children[idx]
185 |     if DEBUGFLAG:
186 |         printf("[t-SNE] Offset [%i, %i] with LE [%f, %f]\n",
187 |                offset[0], offset[1], child.left_edge[0], child.left_edge[1])
188 |     free(offset)
189 |     return child
190 | 
191 | 
192 | cdef inline void index2offset(int* offset, int index, int n_dimensions) nogil:
193 |     # Convert a 1D index into N-D index; useful for indexing
194 |     # children of a quadtree, octree, N-tree
195 |     # Quite likely there's a fancy bitshift way of doing this
196 |     # since the offset is equivalent to the binary representation
197 |     # of the integer index
198 |     # We read the offset array left-to-right
199 |     # such that the least significant bit is on the right
200 |     cdef int rem, k, shift
201 |     for k in range(n_dimensions):
202 |         shift = n_dimensions - k - 1
203 |         rem = ((index >> shift) << shift)
204 |         offset[k] = rem > 0
205 |         if DEBUGFLAG:
206 |             printf("i2o index %i k %i rem %i offset", index, k, rem)
207 |             for j in range(n_dimensions):
208 |                 printf(" %i", offset[j])
209 |             printf(" n_dimensions %i\n", n_dimensions)
210 |         index -= rem
211 | 
212 | 
213 | cdef inline int offset2index(int* offset, int n_dimensions) nogil:
214 |     # Calculate the 1:1 index for a given offset array
215 |     # We read the offset array right-to-left
216 |     # such that the least significant bit is on the right
217 |     cdef int dim
218 |     cdef int index = 0
219 |     for dim in range(n_dimensions):
220 |         index += (2 ** dim) * offset[n_dimensions - dim - 1]
221 |         if DEBUGFLAG:
222 |             printf("o2i index %i dim %i offset", index, dim)
223 |             for j in range(n_dimensions):
224 |                 printf(" %i", offset[j])
225 |             printf(" n_dimensions %i\n", n_dimensions)
226 |     return index
227 | 
228 | 
229 | cdef void subdivide(Node* node) nogil:
230 |     # This instantiates 2**n_dimensions = n_cell_per_node nodes for the current node
231 |     cdef int idx = 0
232 |     cdef int* offset = malloc(sizeof(int) * node.tree.n_dimensions)
233 |     node.is_leaf = False
234 |     node.children = malloc(sizeof(Node*) * node.tree.n_cell_per_node)
235 |     for idx in range(node.tree.n_cell_per_node):
236 |         index2offset(offset, idx, node.tree.n_dimensions)
237 |         node.children[idx] = create_child(node, offset)
238 |     free(offset)
239 | 
240 | 
241 | cdef int insert(Node *root, float pos[3], long point_index, long depth, long
242 |                 duplicate_count) nogil:
243 |     # Introduce a new point into the tree
244 |     # by recursively inserting it and subdividing as necessary
245 |     # Carefully treat the case of identical points at the same node
246 |     # by increasing the root.size and tracking duplicate_count
247 |     cdef Node *child
248 |     cdef long i
249 |     cdef int ax
250 |     cdef int not_identical = 1
251 |     cdef int n_dimensions = root.tree.n_dimensions
252 |     if DEBUGFLAG:
253 |         printf("[t-SNE] [d=%i] Inserting pos %i [%f, %f] duplicate_count=%i "
254 |                "into child %p\n", depth, point_index, pos[0], pos[1],
255 |                duplicate_count, root)
256 |     # Increment the total number points including this
257 |     # node and below it
258 |     root.cumulative_size += duplicate_count
259 |     # Evaluate the new center of mass, weighting the previous
260 |     # center of mass against the new point data
261 |     cdef double frac_seen = <double> (root.cumulative_size - 1) / (
262 |         root.cumulative_size)
263 |     cdef double frac_new = 1.0 / root.cumulative_size
264 |     # Assert that duplicate_count > 0
265 |     if duplicate_count < 1:
266 |         return -1
267 |     # Assert that the point is inside the left & right edges
268 |     for ax in range(n_dimensions):
269 |         root.barycenter[ax] *=
frac_seen 270 | if (pos[ax] > (root.left_edge[ax] + root.width[ax] + EPSILON)): 271 | printf("[t-SNE] Error: point (%1.9e) is above right edge of node " 272 | "(%1.9e)\n", pos[ax], root.left_edge[ax] + root.width[ax]) 273 | return -1 274 | if (pos[ax] < root.left_edge[ax] - EPSILON): 275 | printf("[t-SNE] Error: point (%1.9e) is below left edge of node " 276 | "(%1.9e)\n", pos[ax], root.left_edge[ax]) 277 | return -1 278 | for ax in range(n_dimensions): 279 | root.barycenter[ax] += pos[ax] * frac_new 280 | 281 | # If this node is unoccupied, fill it. 282 | # Otherwise, we need to insert recursively. 283 | # Two insertion scenarios: 284 | # 1) Insert into this node if it is a leaf and empty 285 | # 2) Subdivide this node if it is currently occupied 286 | if (root.size == 0) & root.is_leaf: 287 | # Root node is empty and a leaf 288 | if DEBUGFLAG: 289 | printf("[t-SNE] [d=%i] Inserting [%f, %f] into blank cell\n", depth, 290 | pos[0], pos[1]) 291 | for ax in range(n_dimensions): 292 | root.leaf_point_position[ax] = pos[ax] 293 | root.point_index = point_index 294 | root.size = duplicate_count 295 | return 0 296 | else: 297 | # Root node is occupied or not a leaf 298 | if DEBUGFLAG: 299 | printf("[t-SNE] [d=%i] Node %p is occupied or is a leaf.\n", depth, 300 | root) 301 | printf("[t-SNE] [d=%i] Node %p leaf = %i. Size %i\n", depth, root, 302 | root.is_leaf, root.size) 303 | if root.is_leaf & (root.size > 0): 304 | # is a leaf node and is occupied 305 | for ax in range(n_dimensions): 306 | not_identical &= (fabsf(pos[ax] - root.leaf_point_position[ax]) < EPSILON) 307 | not_identical &= (root.point_index != point_index) 308 | if not_identical == 1: 309 | root.size += duplicate_count 310 | if DEBUGFLAG: 311 | printf("[t-SNE] Warning: [d=%i] Detected identical " 312 | "points. Returning. 
Leaf now has size %i\n", 313 | depth, root.size) 314 | return 0 315 | # If necessary, subdivide this node before 316 | # descending 317 | if root.is_leaf: 318 | if DEBUGFLAG: 319 | printf("[t-SNE] [d=%i] Subdividing this leaf node %p\n", depth, 320 | root) 321 | subdivide(root) 322 | # We have two points to relocate: the one previously 323 | # at this node, and the new one we're attempting 324 | # to insert 325 | if root.size > 0: 326 | child = select_child(root, root.leaf_point_position, root.point_index) 327 | if DEBUGFLAG: 328 | printf("[t-SNE] [d=%i] Relocating old point to node %p\n", 329 | depth, child) 330 | insert(child, root.leaf_point_position, root.point_index, depth + 1, root.size) 331 | # Insert the new point 332 | if DEBUGFLAG: 333 | printf("[t-SNE] [d=%i] Selecting node for new point\n", depth) 334 | child = select_child(root, pos, point_index) 335 | if root.size > 0: 336 | # Remove the point from this node 337 | for ax in range(n_dimensions): 338 | root.leaf_point_position[ax] = -1 339 | root.size = 0 340 | root.point_index = -1 341 | return insert(child, pos, point_index, depth + 1, 1) 342 | 343 | cdef int insert_many(Tree* tree, float[:,:] pos_array) nogil: 344 | # Insert each data point into the tree one at a time 345 | cdef long nrows = pos_array.shape[0] 346 | cdef long i 347 | cdef int ax 348 | cdef float row[3] 349 | cdef long err = 0 350 | for i in range(nrows): 351 | for ax in range(tree.n_dimensions): 352 | row[ax] = pos_array[i, ax] 353 | if DEBUGFLAG: 354 | printf("[t-SNE] inserting point %i: [%f, %f]\n", i, row[0], row[1]) 355 | err = insert(tree.root_node, row, i, 0, 1) 356 | if err != 0: 357 | printf("[t-SNE] ERROR\n%s", EMPTY_STRING) 358 | return err 359 | tree.n_points += 1 360 | return err 361 | 362 | cdef int free_tree(Tree* tree) nogil: 363 | cdef int check 364 | cdef long* cnt = malloc(sizeof(long) * 3) 365 | for i in range(3): 366 | cnt[i] = 0 367 | free_recursive(tree, tree.root_node, cnt) 368 | check = cnt[0] == tree.n_cells 369 | check &= cnt[2] == tree.n_points 370 | free(tree) 371 | free(cnt) 372 | return check 373 | 374 | cdef void free_post_children(Node *node) nogil: 375 | free(node.width) 376 | free(node.left_edge) 377 | free(node.center) 378 | free(node.barycenter) 379 | free(node.leaf_point_position) 380 | free(node) 381 | 382 | cdef void free_recursive(Tree* tree, Node *root, long* counts) nogil: 383 | # Free up all of the tree nodes recursively 384 | # while counting the number of nodes visited 385 | # and total number of data points removed 386 | cdef int idx 387 | cdef Node* child 388 | if not root.is_leaf: 389 | for idx in range(tree.n_cell_per_node): 390 | child = root.children[idx] 391 | free_recursive(tree, child, counts) 392 | counts[0] += 1 393 | if child.is_leaf: 394 | counts[1] += 1 395 | if child.size > 0: 396 | counts[2] +=1 397 | else: 398 | free(child.children) 399 | 400 | free_post_children(child) 401 | 402 | if root == tree.root_node: 403 | if not root.is_leaf: 404 | free(root.children) 405 | 406 | free_post_children(root) 407 | 408 | cdef long count_points(Node* root, long count) nogil: 409 | # Walk through the whole tree and count the number 410 | # of points at the leaf nodes 411 | if DEBUGFLAG: 412 | printf("[t-SNE] Counting nodes at root node %p\n", root) 413 | cdef Node* child 414 | cdef int idx 415 | if root.is_leaf: 416 | count += root.size 417 | if DEBUGFLAG : 418 | printf("[t-SNE] %p is a leaf node, no children\n", root) 419 | printf("[t-SNE] %i points in node %p\n", count, root) 420 | return count 421 | # 
Otherwise, get the children 422 | for idx in range(root.tree.n_cell_per_node): 423 | child = root.children[idx] 424 | if DEBUGFLAG: 425 | printf("[t-SNE] Counting points for child %p\n", child) 426 | if child.is_leaf and child.size > 0: 427 | if DEBUGFLAG: 428 | printf("[t-SNE] Child has size %d\n", child.size) 429 | count += child.size 430 | elif not child.is_leaf: 431 | if DEBUGFLAG: 432 | printf("[t-SNE] Child is not a leaf. Descending\n%s", EMPTY_STRING) 433 | count = count_points(child, count) 434 | # else case is we have an empty leaf node 435 | # which happens when we create a quadtree for 436 | # one point, and then the other neighboring cells 437 | # don't get filled in 438 | if DEBUGFLAG: 439 | printf("[t-SNE] %i points in this node\n", count) 440 | return count 441 | 442 | 443 | cdef float compute_gradient(float[:,:] val_P, 444 | float[:,:] pos_reference, 445 | np.int64_t[:,:] neighbors, 446 | float[:,:] tot_force, 447 | Node* root_node, 448 | float theta, 449 | float dof, 450 | long start, 451 | long stop) nogil: 452 | # Having created the tree, calculate the gradient 453 | # in two components, the positive and negative forces 454 | cdef long i, coord 455 | cdef int ax 456 | cdef long n = pos_reference.shape[0] 457 | cdef int n_dimensions = root_node.tree.n_dimensions 458 | if root_node.tree.verbose > 11: 459 | printf("[t-SNE] Allocating %i elements in force arrays\n", 460 | n * n_dimensions * 2) 461 | cdef float* sum_Q = malloc(sizeof(float)) 462 | cdef float* neg_f = malloc(sizeof(float) * n * n_dimensions) 463 | cdef float* neg_f_fast = malloc(sizeof(float) * n * n_dimensions) 464 | cdef float* pos_f = malloc(sizeof(float) * n * n_dimensions) 465 | cdef clock_t t1, t2 466 | cdef float sQ, error 467 | 468 | sum_Q[0] = 0.0 469 | t1 = clock() 470 | compute_gradient_negative(val_P, pos_reference, neg_f, root_node, sum_Q, 471 | dof, theta, start, stop) 472 | t2 = clock() 473 | if root_node.tree.verbose > 15: 474 | printf("[t-SNE] Computing negative gradient: %e ticks\n", ((float) (t2 - t1))) 475 | sQ = sum_Q[0] 476 | t1 = clock() 477 | error = compute_gradient_positive(val_P, pos_reference, neighbors, pos_f, 478 | n_dimensions, dof, sQ, start, root_node.tree.verbose) 479 | t2 = clock() 480 | if root_node.tree.verbose > 15: 481 | printf("[t-SNE] Computing positive gradient: %e ticks\n", ((float) (t2 - t1))) 482 | for i in range(start, n): 483 | for ax in range(n_dimensions): 484 | coord = i * n_dimensions + ax 485 | tot_force[i, ax] = pos_f[coord] - (neg_f[coord] / sum_Q[0]) 486 | free(sum_Q) 487 | free(neg_f) 488 | free(neg_f_fast) 489 | free(pos_f) 490 | return sQ 491 | 492 | 493 | cdef float compute_gradient_positive(float[:,:] val_P, 494 | float[:,:] pos_reference, 495 | np.int64_t[:,:] neighbors, 496 | float* pos_f, 497 | int n_dimensions, 498 | float dof, 499 | float sum_Q, 500 | np.int64_t start, 501 | int verbose) nogil: 502 | # Sum over the following expression for i not equal to j 503 | # grad_i = p_ij (1 + ||y_i - y_j||^2)^-1 (y_i - y_j) 504 | # This is equivalent to compute_edge_forces in the authors' code 505 | # It just goes over the nearest neighbors instead of all the data points 506 | # (unlike the non-nearest neighbors version of `compute_gradient_positive') 507 | cdef: 508 | int ax 509 | long i, j, k 510 | long K = neighbors.shape[1] 511 | long n = val_P.shape[0] 512 | float[3] buff 513 | float D, Q, pij 514 | float C = 0.0 515 | float exponent = (dof + 1.0) / -2.0 516 | cdef clock_t t1, t2 517 | t1 = clock() 518 | for i in range(start, n): 519 | for ax in 
range(n_dimensions): 520 | pos_f[i * n_dimensions + ax] = 0.0 521 | for k in range(K): 522 | j = neighbors[i, k] 523 | # we don't need to exclude the i==j case since we've 524 | # already thrown it out from the list of neighbors 525 | D = 0.0 526 | Q = 0.0 527 | pij = val_P[i, j] 528 | for ax in range(n_dimensions): 529 | buff[ax] = pos_reference[i, ax] - pos_reference[j, ax] 530 | D += buff[ax] ** 2.0 531 | Q = (((1.0 + D) / dof) ** exponent) 532 | D = pij * Q 533 | Q /= sum_Q 534 | C += pij * log((pij + EPSILON) / (Q + EPSILON)) 535 | for ax in range(n_dimensions): 536 | pos_f[i * n_dimensions + ax] += D * buff[ax] 537 | t2 = clock() 538 | dt = ((float) (t2 - t1)) 539 | if verbose > 10: 540 | printf("[t-SNE] Computed error=%1.4f in %1.1e ticks\n", C, dt) 541 | return C 542 | 543 | 544 | 545 | cdef void compute_gradient_negative(float[:,:] val_P, 546 | float[:,:] pos_reference, 547 | float* neg_f, 548 | Node *root_node, 549 | float* sum_Q, 550 | float dof, 551 | float theta, 552 | long start, 553 | long stop) nogil: 554 | if stop == -1: 555 | stop = pos_reference.shape[0] 556 | cdef: 557 | int ax 558 | long i, j 559 | long n = stop - start 560 | float* force 561 | float* iQ 562 | float* pos 563 | float* dist2s 564 | long* sizes 565 | float* deltas 566 | long* l 567 | int n_dimensions = root_node.tree.n_dimensions 568 | float qijZ, mult 569 | long idx, 570 | long dta = 0 571 | long dtb = 0 572 | clock_t t1, t2, t3 573 | float* neg_force 574 | 575 | iQ = malloc(sizeof(float)) 576 | force = malloc(sizeof(float) * n_dimensions) 577 | pos = malloc(sizeof(float) * n_dimensions) 578 | dist2s = malloc(sizeof(float) * n) 579 | sizes = malloc(sizeof(long) * n) 580 | deltas = malloc(sizeof(float) * n * n_dimensions) 581 | l = malloc(sizeof(long)) 582 | neg_force= malloc(sizeof(float) * n_dimensions) 583 | 584 | for i in range(start, stop): 585 | # Clear the arrays 586 | for ax in range(n_dimensions): 587 | force[ax] = 0.0 588 | neg_force[ax] = 0.0 589 | pos[ax] = pos_reference[i, ax] 590 | iQ[0] = 0.0 591 | l[0] = 0 592 | # Find which nodes are summarizing and collect their centers of mass 593 | # deltas, and sizes, into vectorized arrays 594 | t1 = clock() 595 | compute_non_edge_forces(root_node, theta, i, pos, force, dist2s, 596 | sizes, deltas, l) 597 | t2 = clock() 598 | # Compute the t-SNE negative force 599 | # for the digits dataset, walking the tree 600 | # is about 10-15x more expensive than the 601 | # following for loop 602 | exponent = (dof + 1.0) / -2.0 603 | for j in range(l[0]): 604 | qijZ = ((1.0 + dist2s[j]) / dof) ** exponent 605 | sum_Q[0] += sizes[j] * qijZ 606 | mult = sizes[j] * qijZ * qijZ 607 | for ax in range(n_dimensions): 608 | idx = j * n_dimensions + ax 609 | neg_force[ax] += mult * deltas[idx] 610 | t3 = clock() 611 | for ax in range(n_dimensions): 612 | neg_f[i * n_dimensions + ax] = neg_force[ax] 613 | dta += t2 - t1 614 | dtb += t3 - t2 615 | if root_node.tree.verbose > 20: 616 | printf("[t-SNE] Tree: %i clock ticks | ", dta) 617 | printf("Force computation: %i clock ticks\n", dtb) 618 | free(iQ) 619 | free(force) 620 | free(pos) 621 | free(dist2s) 622 | free(sizes) 623 | free(deltas) 624 | free(l) 625 | free(neg_force) 626 | 627 | 628 | cdef void compute_non_edge_forces(Node* node, 629 | float theta, 630 | long point_index, 631 | float* pos, 632 | float* force, 633 | float* dist2s, 634 | long* sizes, 635 | float* deltas, 636 | long* l) nogil: 637 | # Compute the t-SNE force on the point in pos given by point_index 638 | cdef: 639 | Node* child 640 | int i, j 641 | 
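# l is a one-element cursor: l[0] counts the summary cells written into dist2s, sizes and deltas so far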
        int n_dimensions = node.tree.n_dimensions
642 |         long idx, idx1
643 |         float dist_check
644 | 
645 |     # There are no points below this node if cumulative_size == 0
646 |     # so do not bother to calculate any force contributions
647 |     # Also do not compute self-interactions
648 |     if node.cumulative_size > 0 and not (node.is_leaf and (node.point_index ==
649 |                                                            point_index)):
650 |         # Compute distance between node center of mass and the reference point
651 |         # I've tried rewriting this in terms of BLAS functions, but it's about
652 |         # 1.5x worse when we do so, probably because the vectors are small
653 |         idx1 = l[0] * n_dimensions
654 |         deltas[idx1] = pos[0] - node.barycenter[0]
655 |         idx = idx1
656 |         for i in range(1, n_dimensions):
657 |             idx += 1
658 |             deltas[idx] = pos[i] - node.barycenter[i]
659 |         # do np.sqrt(np.sum(deltas**2.0))
660 |         dist2s[l[0]] = snrm2(n_dimensions, &deltas[idx1], 1)
661 |         # Check whether we can use this node as a summary
662 |         # It's a summary node if the angular size as measured from the point
663 |         # is relatively small (w.r.t. theta) or if it is a leaf node.
664 |         # If it can be summarized, we use the cell center of mass
665 |         # Otherwise, we go a higher level of resolution and into the leaves.
666 |         if node.is_leaf or ((node.max_width / dist2s[l[0]]) < theta):
667 |             # Compute the t-SNE force between the reference point and the
668 |             # current node
669 |             sizes[l[0]] = node.cumulative_size
670 |             dist2s[l[0]] = dist2s[l[0]] * dist2s[l[0]]
671 |             l[0] += 1
672 |         else:
673 |             # Recursively apply Barnes-Hut to child nodes
674 |             for idx in range(node.tree.n_cell_per_node):
675 |                 child = node.children[idx]
676 |                 if child.cumulative_size == 0:
677 |                     continue
678 |                 compute_non_edge_forces(child, theta,
679 |                                         point_index, pos, force, dist2s, sizes, deltas,
680 |                                         l)
681 | 
682 | 
683 | cdef float compute_error(float[:, :] val_P,
684 |                          float[:, :] pos_reference,
685 |                          np.int64_t[:,:] neighbors,
686 |                          float sum_Q,
687 |                          int n_dimensions,
688 |                          int verbose) nogil:
689 |     cdef int i, j, ax
690 |     cdef int I = neighbors.shape[0]
691 |     cdef int K = neighbors.shape[1]
692 |     cdef float pij, Q
693 |     cdef float C = 0.0
694 |     cdef clock_t t1, t2
695 |     cdef float dt, delta
696 |     t1 = clock()
697 |     for i in range(I):
698 |         for k in range(K):
699 |             j = neighbors[i, k]
700 |             pij = val_P[i, j]
701 |             Q = 0.0
702 |             for ax in range(n_dimensions):
703 |                 delta = (pos_reference[i, ax] - pos_reference[j, ax])
704 |                 Q += delta * delta
705 |             Q = (1.0 / (sum_Q + Q * sum_Q))
706 |             C += pij * log((pij + EPSILON) / (Q + EPSILON))
707 |     t2 = clock()
708 |     dt = ((float) (t2 - t1))
709 |     if verbose > 10:
710 |         printf("[t-SNE] Computed error=%1.4f in %1.1e ticks\n", C, dt)
711 |     return C
712 | 
713 | 
714 | def calculate_edge(pos_output):
715 |     # Make the boundaries slightly outside of the data
716 |     # to avoid floating point error near the edge
717 |     left_edge = np.min(pos_output, axis=0)
718 |     right_edge = np.max(pos_output, axis=0)
719 |     center = (right_edge + left_edge) * 0.5
720 |     width = np.maximum(np.subtract(right_edge, left_edge), EPSILON)
721 |     # Exaggerate width to avoid boundary edge
722 |     width = width.astype(np.float32) * 1.001
723 |     left_edge = center - width / 2.0
724 |     right_edge = center + width / 2.0
725 |     return left_edge, right_edge, width
726 | 
727 | def gradient(float[:,:] pij_input,
728 |              float[:,:] pos_output,
729 |              np.int64_t[:,:] neighbors,
730 |              float[:,:] forces,
731 |              float theta,
732 |              int n_dimensions,
733 |              int verbose,
734 |              float dof = 1.0,
735 |              long skip_num_points=0):
736 |     # This function is designed to be called from external Python
737 |     # it passes the 'forces' array by reference and fills that array
738 |     # in-place
739 |     cdef float C
740 |     n = pos_output.shape[0]
741 |     left_edge, right_edge, width = calculate_edge(pos_output)
742 |     assert width.itemsize == 4
743 |     assert pij_input.itemsize == 4
744 |     assert pos_output.itemsize == 4
745 |     assert forces.itemsize == 4
746 |     m = "Number of neighbors must be < # of points - 1"
747 |     assert n - 1 >= neighbors.shape[1], m
748 |     m = "neighbors array and pos_output shapes are incompatible"
749 |     assert n == neighbors.shape[0], m
750 |     m = "Forces array and pos_output shapes are incompatible"
751 |     assert n == forces.shape[0], m
752 |     m = "Pij and pos_output shapes are incompatible"
753 |     assert n == pij_input.shape[0], m
754 |     m = "Pij and pos_output shapes are incompatible"
755 |     assert n == pij_input.shape[1], m
756 |     if verbose > 10:
757 |         printf("[t-SNE] Initializing tree of n_dimensions %i\n", n_dimensions)
758 |     cdef Tree* qt = init_tree(left_edge, width, n_dimensions, verbose)
759 |     if verbose > 10:
760 |         printf("[t-SNE] Inserting %i points\n", pos_output.shape[0])
761 |     err = insert_many(qt, pos_output)
762 |     assert err == 0, "[t-SNE] Insertion failed"
763 |     if verbose > 10:
764 |         # XXX: format hack to workaround lack of `const char *` type
765 |         # in the generated C code that triggers error with gcc 4.9
766 |         # and -Werror=format-security
767 |         printf("[t-SNE] Computing gradient\n%s", EMPTY_STRING)
768 |     sum_Q = compute_gradient(pij_input, pos_output, neighbors, forces,
769 |                              qt.root_node, theta, dof, skip_num_points, -1)
770 |     C = compute_error(pij_input, pos_output, neighbors, sum_Q, n_dimensions,
771 |                       verbose)
772 |     if verbose > 10:
773 |         # XXX: format hack to workaround lack of `const char *` type
774 |         # in the generated C code
775 |         # and -Werror=format-security
776 |         printf("[t-SNE] Checking tree consistency\n%s", EMPTY_STRING)
777 |     cdef long count = count_points(qt.root_node, 0)
778 |     m = ("Tree consistency failed: unexpected number of points=%i "
779 |          "at root node=%i" % (count, qt.root_node.cumulative_size))
780 |     assert count == qt.root_node.cumulative_size, m
781 |     m = "Tree consistency failed: unexpected number of points on the tree"
782 |     assert count == qt.n_points, m
783 |     free_tree(qt)
784 |     return C
785 | 
786 | 
787 | # Helper functions
788 | def check_quadtree(X, np.int64_t[:] counts):
789 |     """
790 |     Helper function to access quadtree functions for testing
791 |     """
792 | 
793 |     X = X.astype(np.float32)
794 |     left_edge, right_edge, width = calculate_edge(X)
795 |     # Initialise a tree
796 |     qt = init_tree(left_edge, width, 2, 2)
797 |     # Insert data into the tree
798 |     insert_many(qt, X)
799 | 
800 |     cdef long count = count_points(qt.root_node, 0)
801 |     counts[0] = count
802 |     counts[1] = qt.root_node.cumulative_size
803 |     counts[2] = qt.n_points
804 |     free_tree(qt)
805 |     return counts
806 | 
807 | 
808 | cdef int helper_test_index2offset(int* check, int index, int n_dimensions):
809 |     cdef int* offset = malloc(sizeof(int) * n_dimensions)
810 |     cdef int error_check = 1
811 |     for i in range(n_dimensions):
812 |         offset[i] = 0
813 |     index2offset(offset, index, n_dimensions)
814 |     for i in range(n_dimensions):
815 |         error_check &= offset[i] == check[i]
816 |     free(offset)
817 |     return error_check
818 | 
819 | 
820 | def test_index2offset():
821 |     ret = 1
822 |     ret &= helper_test_index2offset([1, 0, 1], 5, 3) == 1
823 |     ret &= helper_test_index2offset([0, 0, 0], 0, 3) == 1
824 |     ret &= helper_test_index2offset([0, 0, 1], 1, 3) == 1
825 |     ret &= helper_test_index2offset([0, 1, 0], 2, 3) == 1
826 |     ret &= helper_test_index2offset([0, 1, 1], 3, 3) == 1
827 |     ret &= helper_test_index2offset([1, 0, 0], 4, 3) == 1
828 |     return ret
829 | 
830 | 
831 | def test_index_offset():
832 |     cdef int n_dimensions, idx, tidx, k
833 |     cdef int error_check = 1
834 |     cdef int* offset
835 |     for n_dimensions in range(2, 10):
836 |         offset = malloc(sizeof(int) * n_dimensions)
837 |         for k in range(n_dimensions):
838 |             offset[k] = 0
839 |         for idx in range(2 ** n_dimensions):
840 |             index2offset(offset, idx, n_dimensions)
841 |             tidx = offset2index(offset, n_dimensions)
842 |             error_check &= tidx == idx
843 |             assert error_check == 1
844 |         free(offset)
845 |     return error_check
846 | 
--------------------------------------------------------------------------------
/sstsne/ss_t_sne.py:
--------------------------------------------------------------------------------
1 | # Author: Alexander Fabisch  -- 
2 | # Author: Christopher Moody 
3 | # Author: Nick Travers 
4 | # Author: Leland McInnes 
5 | # License: BSD 3 clause (C) 2014
6 | 
7 | # This is the exact and Barnes-Hut t-SNE implementation. There are other
8 | # modifications of the algorithm:
9 | # * Fast Optimization for t-SNE:
10 | #   http://cseweb.ucsd.edu/~lvdmaaten/workshops/nips2010/papers/vandermaaten.pdf
11 | # Includes a further addition of semi-supervision via partial labelling of the data
12 | 
13 | import numpy as np
14 | from scipy import linalg
15 | import scipy.sparse as sp
16 | from scipy.spatial.distance import pdist
17 | from scipy.spatial.distance import squareform
18 | from sklearn.neighbors import BallTree
19 | from sklearn.base import BaseEstimator
20 | from sklearn.utils import check_array
21 | from sklearn.utils import check_random_state
22 | from sklearn.utils.extmath import _ravel
23 | from sklearn.decomposition import PCA
24 | from sklearn.metrics.pairwise import pairwise_distances
25 | from . import _utils
26 | from . import _barnes_hut_tsne
27 | from sklearn.utils.fixes import astype
28 | 
29 | 
30 | MACHINE_EPSILON = np.finfo(np.double).eps
31 | 
32 | 
33 | def _joint_probabilities(distances, labels, label_importance, rep_sample,
34 |                          desired_perplexity, verbose):
35 |     """Compute joint probabilities p_ij from distances.
36 | 
37 |     Parameters
38 |     ----------
39 |     distances : array, shape (n_samples * (n_samples-1) / 2,)
40 |         Distances of samples are stored as condensed matrices, i.e.
41 |         we omit the diagonal and duplicate entries and store everything
42 |         in a one-dimensional array.
43 | 
44 |     labels : array, shape (n_samples,)
45 |         An integer labelling of each sample, with unknown samples given
46 |         the label -1.
47 | 
48 |     label_importance : float
49 |         How much to deviate from a uniform prior via the label classes.
50 | 
51 |     rep_sample : boolean
52 |         Whether the partial labelling is a representative sample of
53 |         the full (and unknown) labelling.
54 | 
55 |     desired_perplexity : float
56 |         Desired perplexity of the joint probability distributions.
57 | 
58 |     verbose : int
59 |         Verbosity level.
60 | 
61 |     Returns
62 |     -------
63 |     P : array, shape (n_samples * (n_samples-1) / 2,)
64 |         Condensed joint probability matrix.
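
    Notes
    -----
    As a worked example of the label prior (with rep_sample False): for
    n_samples = 100, a labelled class of size 10 and label_importance = 0.5,
    a same-label pair's uniform prior of 1/100 = 0.01 is raised to
    0.01 + 0.5 / 10 = 0.06, a pair with differing labels is lowered by
    0.5 / n_other_label, and any pair involving an unlabelled sample keeps
    the uniform 0.01, before each conditional distribution is renormalised.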
65 |     """
66 |     # Compute conditional probabilities such that they approximately match
67 |     # the desired perplexity
68 |     distances = astype(distances, np.float32, copy=False)
69 |     labels = astype(labels, np.int64, copy=False)
70 |     conditional_P = _utils._binary_search_perplexity(
71 |         distances, None, labels, label_importance,
72 |         rep_sample, desired_perplexity, verbose)
73 |     P = conditional_P + conditional_P.T
74 |     sum_P = np.maximum(np.sum(P), MACHINE_EPSILON)
75 |     P = np.maximum(squareform(P) / sum_P, MACHINE_EPSILON)
76 |     return P
77 | 
78 | 
79 | def _joint_probabilities_nn(distances, neighbors, labels, label_importance,
80 |                             rep_sample, desired_perplexity, verbose):
81 |     """Compute joint probabilities p_ij from distances using just nearest
82 |     neighbors.
83 | 
84 |     This method is approximately equal to _joint_probabilities. The latter
85 |     is O(N^2), but limiting the joint probability to nearest neighbors
86 |     improves this substantially to O(uN).
87 | 
88 |     Parameters
89 |     ----------
90 |     distances : array, shape (n_samples * (n_samples-1) / 2,)
91 |         Distances of samples are stored as condensed matrices, i.e.
92 |         we omit the diagonal and duplicate entries and store everything
93 |         in a one-dimensional array.
94 | 
95 |     labels : array, shape (n_samples,)
96 |         An integer labelling of each sample, with unknown samples given
97 |         the label -1.
98 | 
99 |     label_importance : float
100 |         How much to deviate from a uniform prior via the label classes.
101 | 
102 |     rep_sample : boolean
103 |         Whether the partial labelling is a representative sample of
104 |         the full (and unknown) labelling.
105 | 
106 |     desired_perplexity : float
107 |         Desired perplexity of the joint probability distributions.
108 | 
109 |     verbose : int
110 |         Verbosity level.
111 | 
112 |     Returns
113 |     -------
114 |     P : array, shape (n_samples * (n_samples-1) / 2,)
115 |         Condensed joint probability matrix.
116 |     """
117 |     # Compute conditional probabilities such that they approximately match
118 |     # the desired perplexity
119 |     distances = astype(distances, np.float32, copy=False)
120 |     labels = astype(labels, np.int64, copy=False)
121 |     neighbors = astype(neighbors, np.int64, copy=False)
122 |     conditional_P = _utils._binary_search_perplexity(
123 |         distances, neighbors, labels, label_importance,
124 |         rep_sample, desired_perplexity, verbose)
125 |     m = "All probabilities should be finite"
126 |     assert np.all(np.isfinite(conditional_P)), m
127 |     P = conditional_P + conditional_P.T
128 |     sum_P = np.maximum(np.sum(P), MACHINE_EPSILON)
129 |     P = np.maximum(squareform(P) / sum_P, MACHINE_EPSILON)
130 |     assert np.all(np.abs(P) <= 1.0)
131 |     return P
132 | 
133 | 
134 | def _kl_divergence(params, P, degrees_of_freedom, n_samples, n_components,
135 |                    skip_num_points=0):
136 |     """t-SNE objective function: gradient of the KL divergence
137 |     of p_ijs and q_ijs and the absolute error.
138 | 
139 |     Parameters
140 |     ----------
141 |     params : array, shape (n_params,)
142 |         Unraveled embedding.
143 | 
144 |     P : array, shape (n_samples * (n_samples-1) / 2,)
145 |         Condensed joint probability matrix.
146 | 
147 |     degrees_of_freedom : float
148 |         Degrees of freedom of the Student's t-distribution.
149 | 
150 |     n_samples : int
151 |         Number of samples.
152 | 
153 |     n_components : int
154 |         Dimension of the embedded space.
155 | 
156 |     skip_num_points : int (optional, default: 0)
157 |         This does not compute the gradient for points with indices below
158 |         `skip_num_points`. This is useful when computing transforms of new
159 |         data where you'd like to keep the old data fixed.
160 | 
161 |     Returns
162 |     -------
163 |     kl_divergence : float
164 |         Kullback-Leibler divergence of p_ij and q_ij.
165 | 
166 |     grad : array, shape (n_params,)
167 |         Unraveled gradient of the Kullback-Leibler divergence with respect to
168 |         the embedding.
169 |     """
170 |     X_embedded = params.reshape(n_samples, n_components)
171 | 
172 |     # Q is a heavy-tailed distribution: Student's t-distribution
173 |     n = pdist(X_embedded, "sqeuclidean")
174 |     n += 1.
175 |     n /= degrees_of_freedom
176 |     n **= (degrees_of_freedom + 1.0) / -2.0
177 |     Q = np.maximum(n / (2.0 * np.sum(n)), MACHINE_EPSILON)
178 | 
179 |     # Optimization trick below: np.dot(x, y) is faster than
180 |     # np.sum(x * y) because it calls BLAS
181 | 
182 |     # Objective: C (Kullback-Leibler divergence of P and Q)
183 |     kl_divergence = 2.0 * np.dot(P, np.log(P / Q))
184 | 
185 |     # Gradient: dC/dY
186 |     grad = np.ndarray((n_samples, n_components))
187 |     PQd = squareform((P - Q) * n)
188 |     for i in range(skip_num_points, n_samples):
189 |         np.dot(_ravel(PQd[i]), X_embedded[i] - X_embedded, out=grad[i])
190 |     grad = grad.ravel()
191 |     c = 2.0 * (degrees_of_freedom + 1.0) / degrees_of_freedom
192 |     grad *= c
193 | 
194 |     return kl_divergence, grad
195 | 
196 | 
197 | def _kl_divergence_error(params, P, neighbors, degrees_of_freedom, n_samples,
198 |                          n_components):
199 |     """t-SNE objective function: the absolute error of the
200 |     KL divergence of p_ijs and q_ijs.
201 | 
202 |     Parameters
203 |     ----------
204 |     params : array, shape (n_params,)
205 |         Unraveled embedding.
206 | 
207 |     P : array, shape (n_samples * (n_samples-1) / 2,)
208 |         Condensed joint probability matrix.
209 | 
210 |     neighbors : array (n_samples, K)
211 |         The neighbors array is not actually required to calculate the
212 |         divergence, but is here to match the signature of the
213 |         gradient function.
214 | 
215 |     degrees_of_freedom : float
216 |         Degrees of freedom of the Student's t-distribution.
217 | 
218 |     n_samples : int
219 |         Number of samples.
220 | 
221 |     n_components : int
222 |         Dimension of the embedded space.
223 | 
224 |     Returns
225 |     -------
226 |     kl_divergence : float
227 |         Kullback-Leibler divergence of p_ij and q_ij.
228 | 
229 |         Note that, unlike _kl_divergence, this function returns only the
230 |         divergence itself; no gradient is computed.
231 | 
232 |     """
233 |     X_embedded = params.reshape(n_samples, n_components)
234 | 
235 |     # Q is a heavy-tailed distribution: Student's t-distribution
236 |     n = pdist(X_embedded, "sqeuclidean")
237 |     n += 1.
238 |     n /= degrees_of_freedom
239 |     n **= (degrees_of_freedom + 1.0) / -2.0
240 |     Q = np.maximum(n / (2.0 * np.sum(n)), MACHINE_EPSILON)
241 | 
242 |     # Optimization trick below: np.dot(x, y) is faster than
243 |     # np.sum(x * y) because it calls BLAS
244 | 
245 |     # Objective: C (Kullback-Leibler divergence of P and Q)
246 |     if len(P.shape) == 2:
247 |         P = squareform(P)
248 |     kl_divergence = 2.0 * np.dot(P, np.log(P / Q))
249 | 
250 |     return kl_divergence
251 | 
252 | 
253 | def _kl_divergence_bh(params, P, neighbors, degrees_of_freedom, n_samples,
254 |                       n_components, angle=0.5, skip_num_points=0,
255 |                       verbose=False):
256 |     """t-SNE objective function: KL divergence of p_ijs and q_ijs.
257 | 
258 |     Uses Barnes-Hut tree methods to calculate the gradient that
259 |     runs in O(NlogN) instead of O(N^2)
260 | 
261 |     Parameters
262 |     ----------
263 |     params : array, shape (n_params,)
264 |         Unraveled embedding.
265 | 
266 |     P : array, shape (n_samples * (n_samples-1) / 2,)
267 |         Condensed joint probability matrix.
268 | 
269 |     neighbors : int64 array, shape (n_samples, K)
270 |         Array with element [i, j] giving the index for the jth
271 |         closest neighbor to point i.
272 | 
273 |     degrees_of_freedom : float
274 |         Degrees of freedom of the Student's t-distribution.
275 | 
276 |     n_samples : int
277 |         Number of samples.
278 | 
279 |     n_components : int
280 |         Dimension of the embedded space.
281 | 
282 |     angle : float (default: 0.5)
283 |         This is the trade-off between speed and accuracy for Barnes-Hut T-SNE.
284 |         'angle' is the angular size (referred to as theta in [3]) of a distant
285 |         node as measured from a point. If this size is below 'angle' then it is
286 |         used as a summary node of all points contained within it.
287 |         This method is not very sensitive to changes in this parameter
288 |         in the range of 0.2 - 0.8. Angle less than 0.2 has quickly increasing
289 |         computation time and angle greater than 0.8 has quickly increasing error.
290 | 
291 |     skip_num_points : int (optional, default: 0)
292 |         This does not compute the gradient for points with indices below
293 |         `skip_num_points`. This is useful when computing transforms of new
294 |         data where you'd like to keep the old data fixed.
295 | 
296 |     verbose : int
297 |         Verbosity level.
298 | 
299 |     Returns
300 |     -------
301 |     kl_divergence : float
302 |         Kullback-Leibler divergence of p_ij and q_ij.
303 | 
304 |     grad : array, shape (n_params,)
305 |         Unraveled gradient of the Kullback-Leibler divergence with respect to
306 |         the embedding.
307 |     """
308 |     params = astype(params, np.float32, copy=False)
309 |     X_embedded = params.reshape(n_samples, n_components)
310 |     neighbors = astype(neighbors, np.int64, copy=False)
311 |     if len(P.shape) == 1:
312 |         sP = squareform(P).astype(np.float32)
313 |     else:
314 |         sP = P.astype(np.float32)
315 | 
316 |     grad = np.zeros(X_embedded.shape, dtype=np.float32)
317 |     error = _barnes_hut_tsne.gradient(sP, X_embedded, neighbors,
318 |                                       grad, angle, n_components, verbose,
319 |                                       dof=degrees_of_freedom)
320 |     c = 2.0 * (degrees_of_freedom + 1.0) / degrees_of_freedom
321 |     grad = grad.ravel()
322 |     grad *= c
323 | 
324 |     return error, grad
325 | 
326 | 
327 | def _gradient_descent(objective, p0, it, n_iter, objective_error=None,
328 |                       n_iter_check=1, n_iter_without_progress=50,
329 |                       momentum=0.5, learning_rate=1000.0, min_gain=0.01,
330 |                       min_grad_norm=1e-7, min_error_diff=1e-7, verbose=0,
331 |                       args=None, kwargs=None):
332 |     """Batch gradient descent with momentum and individual gains.
333 | 
334 |     Parameters
335 |     ----------
336 |     objective : function or callable
337 |         Should return a tuple of cost and gradient for a given parameter
338 |         vector. When expensive to compute, the cost can optionally
339 |         be None and can be computed every n_iter_check steps using
340 |         the objective_error function.
341 | 
342 |     p0 : array-like, shape (n_params,)
343 |         Initial parameter vector.
344 | 
345 |     it : int
346 |         Current number of iterations (this function will be called more than
347 |         once during the optimization).
348 | 
349 |     n_iter : int
350 |         Maximum number of gradient descent iterations.
351 | 
352 |     n_iter_check : int
353 |         Number of iterations before evaluating the global error. If the error
354 |         is sufficiently low, we abort the optimization.
355 | 
356 |     objective_error : function or callable
357 |         Should return the cost for a given parameter vector; used to
358 |         re-evaluate the error when `objective` returns a cost of None.
359 | 
360 |     n_iter_without_progress : int, optional (default: 50)
361 |         Maximum number of iterations without progress before we abort the
362 |         optimization.
363 | 
364 |     momentum : float, within (0.0, 1.0), optional (default: 0.5)
365 |         The momentum generates a weight for previous gradients that decays
366 |         exponentially.
367 | 
368 |     learning_rate : float, optional (default: 1000.0)
369 |         The learning rate should be extremely high for t-SNE! Values in the
370 |         range [100.0, 1000.0] are common.
371 | 
372 |     min_gain : float, optional (default: 0.01)
373 |         Minimum individual gain for each parameter.
374 | 
375 |     min_grad_norm : float, optional (default: 1e-7)
376 |         If the gradient norm is below this threshold, the optimization will
377 |         be aborted.
378 | 
379 |     min_error_diff : float, optional (default: 1e-7)
380 |         If the absolute difference of two successive cost function values
381 |         is below this threshold, the optimization will be aborted.
382 | 
383 |     verbose : int, optional (default: 0)
384 |         Verbosity level.
385 | 
386 |     args : sequence
387 |         Arguments to pass to objective function.
388 | 
389 |     kwargs : dict
390 |         Keyword arguments to pass to objective function.
391 | 
392 |     Returns
393 |     -------
394 |     p : array, shape (n_params,)
395 |         Optimum parameters.
396 | 
397 |     error : float
398 |         Optimum value of the cost function.
399 | 
400 |     i : int
401 |         Last iteration.
402 |     """
403 |     if args is None:
404 |         args = []
405 |     if kwargs is None:
406 |         kwargs = {}
407 | 
408 |     p = p0.copy().ravel()
409 |     update = np.zeros_like(p)
410 |     gains = np.ones_like(p)
411 |     error = np.finfo(np.float64).max
412 |     best_error = np.finfo(np.float64).max
413 |     best_iter = 0
414 | 
415 |     for i in range(it, n_iter):
416 |         new_error, grad = objective(p, *args, **kwargs)
417 |         grad_norm = linalg.norm(grad)
418 | 
419 |         inc = update * grad >= 0.0
420 |         dec = np.invert(inc)
421 |         gains[inc] += 0.05
422 |         gains[dec] *= 0.95
423 |         np.clip(gains, min_gain, np.inf, out=gains)  # clip in place
424 |         grad *= gains
425 |         update = momentum * update - learning_rate * grad
426 |         p += update
427 | 
428 |         if (i + 1) % n_iter_check == 0:
429 |             if new_error is None:
430 |                 new_error = objective_error(p, *args)
431 |             error_diff = np.abs(new_error - error)
432 |             error = new_error
433 | 
434 |             if verbose >= 2:
435 |                 m = "[t-SNE] Iteration %d: error = %.7f, gradient norm = %.7f"
436 |                 print(m % (i + 1, error, grad_norm))
437 | 
438 |             if error < best_error:
439 |                 best_error = error
440 |                 best_iter = i
441 |             elif i - best_iter > n_iter_without_progress:
442 |                 if verbose >= 2:
443 |                     print("[t-SNE] Iteration %d: did not make any progress "
444 |                           "during the last %d episodes. Finished."
445 |                           % (i + 1, n_iter_without_progress))
446 |                 break
447 |             if grad_norm <= min_grad_norm:
448 |                 if verbose >= 2:
449 |                     print("[t-SNE] Iteration %d: gradient norm %f. Finished."
450 |                           % (i + 1, grad_norm))
451 |                 break
452 |             if error_diff <= min_error_diff:
453 |                 if verbose >= 2:
454 |                     m = "[t-SNE] Iteration %d: error difference %f. Finished."
455 |                     print(m % (i + 1, error_diff))
456 |                 break
457 | 
458 |         if new_error is not None:
459 |             error = new_error
460 | 
461 |     return p, error, i
462 | 
463 | 
464 | def trustworthiness(X, X_embedded, n_neighbors=5, precomputed=False):
465 |     """Expresses to what extent the local structure is retained.
466 | 
467 |     The trustworthiness is within [0, 1]. It is defined as
468 | 
469 |     .. math::
470 | 
471 |         T(k) = 1 - \frac{2}{nk (2n - 3k - 1)} \sum^n_{i=1}
472 |             \sum_{j \in U^{(k)}_i} (r(i, j) - k)
473 | 
474 |     where for each sample i, :math:`r(i, j)` is the rank of sample j
475 |     according to the pairwise distances in the original input space, and
476 |     :math:`U^{(k)}_i` is the set of points that are in the k nearest
477 |     neighbors in the embedded space but not in the original space.
478 | 
479 |     * "Neighborhood Preservation in Nonlinear Projection Methods: An
480 |       Experimental Study"
481 |       J. Venna, S. Kaski
482 |     * "Learning a Parametric Embedding by Preserving Local Structure"
483 |       L.J.P. van der Maaten
484 | 
485 |     Parameters
486 |     ----------
487 |     X : array, shape (n_samples, n_features) or (n_samples, n_samples)
488 |         If the metric is 'precomputed' X must be a square distance
489 |         matrix. Otherwise it contains a sample per row.
490 | 
491 |     X_embedded : array, shape (n_samples, n_components)
492 |         Embedding of the training data in low-dimensional space.
493 | 
494 |     n_neighbors : int, optional (default: 5)
495 |         Number of neighbors k that will be considered.
496 | 
497 |     precomputed : bool, optional (default: False)
498 |         Set this flag if X is a precomputed square distance matrix.
499 | 
500 |     Returns
501 |     -------
502 |     trustworthiness : float
503 |         Trustworthiness of the low-dimensional embedding.
504 |     """
505 |     if precomputed:
506 |         dist_X = X
507 |     else:
508 |         dist_X = pairwise_distances(X, squared=True)
509 |     dist_X_embedded = pairwise_distances(X_embedded, squared=True)
510 |     ind_X = np.argsort(dist_X, axis=1)
511 |     ind_X_embedded = np.argsort(dist_X_embedded, axis=1)[:, 1:n_neighbors + 1]
512 | 
513 |     n_samples = X.shape[0]
514 |     t = 0.0
515 |     ranks = np.zeros(n_neighbors)
516 |     for i in range(n_samples):
517 |         for j in range(n_neighbors):
518 |             # rank of the jth embedded neighbor in the original-space ordering
519 |             ranks[j] = np.where(ind_X[i] == ind_X_embedded[i, j])[0][0]
520 |         ranks -= n_neighbors
521 |         t += np.sum(ranks[ranks > 0])
522 |     t = 1.0 - t * (2.0 / (n_samples * n_neighbors *
523 |                           (2.0 * n_samples - 3.0 * n_neighbors - 1.0)))
524 |     return t
525 | 
526 | 
527 | class SemiSupervisedTSNE(BaseEstimator):
528 |     """Semi Supervised t-distributed Stochastic Neighbor Embedding.
529 | 
530 |     t-SNE [1] is a tool to visualize high-dimensional data. It converts
531 |     similarities between data points to joint probabilities and tries
532 |     to minimize the Kullback-Leibler divergence between the joint
533 |     probabilities of the low-dimensional embedding and the
534 |     high-dimensional data. t-SNE has a cost function that is not convex,
535 |     i.e. with different initializations we can get different results.
536 | 
537 |     It is highly recommended to use another dimensionality reduction
538 |     method (e.g. PCA for dense data or TruncatedSVD for sparse data)
539 |     to reduce the number of dimensions to a reasonable amount (e.g. 50)
540 |     if the number of features is very high. This will suppress some
541 |     noise and speed up the computation of pairwise distances between
542 |     samples. For more tips see Laurens van der Maaten's FAQ [2].
543 | 
544 |     This semi-supervised version of t-SNE supports an incomplete labelling
545 |     being supplied. This labelling is then used to inform the dimension
546 |     reduction such that samples with the same label are more likely to
547 |     be close, while samples with different labels are more likely to be
548 |     separated.
549 | 
550 |     Read more in the scikit-learn t-SNE User Guide.
551 | 
552 |     Parameters
553 |     ----------
554 |     n_components : int, optional (default: 2)
555 |         Dimension of the embedded space.
556 | 
557 |     label_importance : float, optional (default: 1.0)
558 |         How much to weight the importance of the labels when determining
559 |         the transformation. In practice this determines how far from
560 |         a uniform distribution to make the label-based prior.
561 | 
562 |     class_sizes_are_representative : boolean, optional (default: False)
563 |         If the labelled class sizes are representative of the full/true
564 |         labelling then the prior can be weighted by class size, which
565 |         handles significant variation in class sizes well. Unless you know
566 |         that you have a representative sample labelled it is best to leave
567 |         this False.
568 | 
569 |     perplexity : float, optional (default: 30)
570 |         The perplexity is related to the number of nearest neighbors that
571 |         is used in other manifold learning algorithms. Larger datasets
572 |         usually require a larger perplexity. Consider selecting a value
573 |         between 5 and 50. The choice is not extremely critical since t-SNE
574 |         is quite insensitive to this parameter.
575 | 
576 |     early_exaggeration : float, optional (default: 4.0)
577 |         Controls how tight natural clusters in the original space are in
578 |         the embedded space and how much space will be between them. For
579 |         larger values, the space between natural clusters will be larger
580 |         in the embedded space. Again, the choice of this parameter is not
581 |         very critical. If the cost function increases during initial
582 |         optimization, the early exaggeration factor or the learning rate
583 |         might be too high.
584 | 
585 |     learning_rate : float, optional (default: 1000)
586 |         The learning rate can be a critical parameter. It should be
587 |         between 100 and 1000. If the cost function increases during initial
588 |         optimization, the early exaggeration factor or the learning rate
589 |         might be too high. If the cost function gets stuck in a bad local
590 |         minimum, increasing the learning rate sometimes helps.
591 | 
592 |     n_iter : int, optional (default: 1000)
593 |         Maximum number of iterations for the optimization. Should be at
594 |         least 200.
595 | 
596 |     n_iter_without_progress : int, optional (default: 30)
597 |         Maximum number of iterations without progress before we abort the
598 |         optimization.
599 | 
600 |     .. versionadded:: 0.17
601 |         parameter *n_iter_without_progress* to control stopping criteria.
602 | 
603 |     min_grad_norm : float, optional (default: 1e-7)
604 |         If the gradient norm is below this threshold, the optimization will
605 |         be aborted.
606 | 
607 |     metric : string or callable, optional
608 |         The metric to use when calculating distance between instances in a
609 |         feature array. If metric is a string, it must be one of the options
610 |         allowed by scipy.spatial.distance.pdist for its metric parameter, or
611 |         a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS.
612 |         If metric is "precomputed", X is assumed to be a distance matrix.
613 |         Alternatively, if metric is a callable function, it is called on each
614 |         pair of instances (rows) and the resulting value recorded. The callable
615 |         should take two arrays from X as input and return a value indicating
616 |         the distance between them. The default is "euclidean", which is
617 |         interpreted as squared euclidean distance.
618 | 
619 |     init : string or numpy array, optional (default: "random")
620 |         Initialization of embedding. Options are 'random', 'pca', or a numpy
621 |         array of shape (n_samples, n_components). PCA initialization cannot be
622 |         used with precomputed distances and is usually more globally stable than random initialization.
623 | 
624 |     verbose : int, optional (default: 0)
625 |         Verbosity level.
626 | 
627 |     random_state : int or RandomState instance or None (default)
628 |         Pseudo Random Number generator seed control. If None, use the
629 |         numpy.random singleton. Note that different initializations
630 |         might result in different local minima of the cost function.
631 | 
632 |     method : string (default: 'barnes_hut')
633 |         By default the gradient calculation algorithm uses the Barnes-Hut
634 |         approximation, which runs in O(N log N) time. method='exact'
635 |         will run the slower, but exact, algorithm in O(N^2) time. The
636 |         exact algorithm should be used when nearest-neighbor errors need
637 |         to be better than 3%. However, the exact method cannot scale to
638 |         millions of examples.
639 | 
640 |     .. versionadded:: 0.17
641 |         Approximate optimization *method* via Barnes-Hut.
642 | 
643 |     angle : float (default: 0.5)
644 |         Only used if method='barnes_hut'.
645 |         This is the trade-off between speed and accuracy for Barnes-Hut T-SNE.
646 |         'angle' is the angular size (referred to as theta in [3]) of a distant
647 |         node as measured from a point. If this size is below 'angle' then it is
648 |         used as a summary node of all points contained within it.
649 |         This method is not very sensitive to changes in this parameter
650 |         in the range of 0.2 - 0.8. An angle less than 0.2 has quickly increasing
651 |         computation time and an angle greater than 0.8 has quickly increasing error.
652 | 
653 | 
654 |     Attributes
655 |     ----------
656 |     embedding_ : array-like, shape (n_samples, n_components)
657 |         Stores the embedding vectors.
658 | 
659 |     kl_divergence_ : float
660 |         Kullback-Leibler divergence after optimization.
661 | 
662 |     Examples
663 |     --------
664 | 
665 |     >>> import numpy as np
666 |     >>> from sstsne import SemiSupervisedTSNE
667 |     >>> X = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]])
668 |     >>> # partial labelling: -1 marks unlabelled points
669 |     >>> y = np.array([0, 0, 1, -1])
670 |     >>> model = SemiSupervisedTSNE(n_components=2, random_state=0)
671 |     >>> X_embedded = model.fit_transform(X, y)
672 |     >>> X_embedded.shape
673 |     (4, 2)
674 | 
675 |     References
676 |     ----------
677 | 
678 |     [1] van der Maaten, L.J.P.; Hinton, G.E. Visualizing High-Dimensional Data
679 |         Using t-SNE. Journal of Machine Learning Research 9:2579-2605, 2008.
680 | 
681 |     [2] van der Maaten, L.J.P. t-Distributed Stochastic Neighbor Embedding
682 |         http://homepage.tudelft.nl/19j49/t-SNE.html
683 | 
684 |     [3] L.J.P. van der Maaten. Accelerating t-SNE using Tree-Based Algorithms.
685 |         Journal of Machine Learning Research 15(Oct):3221-3245, 2014.
686 |         http://lvdmaaten.github.io/publications/papers/JMLR_2014.pdf
687 |     """
688 | 
689 |     def __init__(self, n_components=2, label_importance=1.0,
690 |                  class_sizes_are_representative=False, perplexity=30.0,
691 |                  early_exaggeration=4.0, learning_rate=1000.0, n_iter=1000,
692 |                  n_iter_without_progress=30, min_grad_norm=1e-7,
693 |                  metric="euclidean", init="random", verbose=0,
694 |                  random_state=None, method='barnes_hut', angle=0.5):
695 |         # An ndarray init is valid; only reject unsupported string values
696 |         if not (isinstance(init, np.ndarray) or init in ["pca", "random"]):
697 |             raise ValueError("'init' must be 'pca', 'random' or a NumPy array")
698 |         self.n_components = n_components
699 |         self.label_importance = label_importance
700 |         self.class_sizes_are_representative = class_sizes_are_representative
701 |         self.perplexity = perplexity
702 |         self.early_exaggeration = early_exaggeration
703 |         self.learning_rate = learning_rate
704 |         self.n_iter = n_iter
705 |         self.n_iter_without_progress = n_iter_without_progress
706 |         self.min_grad_norm = min_grad_norm
707 |         self.metric = metric
708 |         self.init = init
709 |         self.verbose = verbose
710 |         self.random_state = random_state
711 |         self.method = method
712 |         self.angle = angle
713 |         self.embedding_ = None
714 | 
715 |     def _fit(self, X, y, skip_num_points=0):
716 |         """Fit the model using X as training data, and y
717 |         as the (partial) labelling.
718 | 
719 |         Note that sparse arrays can only be handled by method='exact'.
720 |         It is recommended that you convert your sparse array to dense
721 |         (e.g. `X.toarray()`) if it fits in memory, or otherwise to use a
722 |         dimensionality reduction technique (e.g. TruncatedSVD).
723 | 
724 |         Parameters
725 |         ----------
726 |         X : array, shape (n_samples, n_features) or (n_samples, n_samples)
727 |             If the metric is 'precomputed' X must be a square distance
728 |             matrix. Otherwise it contains a sample per row. Note that
729 |             when method='barnes_hut', X cannot be a sparse array and will
730 |             if need be converted to a 32-bit float array. method='exact'
731 |             allows sparse arrays and 64-bit floating point inputs.
732 | 
733 |         y : array, shape (n_samples,)
734 |             Labels must be integers, with unlabelled points given the label -1.
735 | 
736 |         skip_num_points : int (optional, default: 0)
737 |             This does not compute the gradient for points with indices below
738 |             `skip_num_points`. This is useful when computing transforms of new
739 |             data where you'd like to keep the old data fixed.
740 |         """
741 |         if self.method not in ['barnes_hut', 'exact']:
742 |             raise ValueError("'method' must be 'barnes_hut' or 'exact'")
743 |         if self.angle < 0.0 or self.angle > 1.0:
744 |             raise ValueError("'angle' must be between 0.0 and 1.0")
745 |         if self.method == 'barnes_hut' and sp.issparse(X):
746 |             raise TypeError('A sparse matrix was passed, but dense '
747 |                             'data is required for method="barnes_hut". Use '
748 |                             'X.toarray() to convert to a dense numpy array if '
749 |                             'the array is small enough for it to fit in '
750 |                             'memory. Otherwise consider dimensionality '
751 |                             'reduction techniques (e.g. TruncatedSVD)')
752 |         else:
753 |             X = check_array(X, accept_sparse=['csr', 'csc', 'coo'],
754 |                             dtype=np.float64)
755 |         random_state = check_random_state(self.random_state)
756 | 
757 |         if self.early_exaggeration < 1.0:
758 |             raise ValueError("early_exaggeration must be at least 1, but is "
759 |                              "%f" % self.early_exaggeration)
760 | 
761 |         if self.n_iter < 200:
762 |             raise ValueError("n_iter should be at least 200")
763 | 
764 |         if self.metric == "precomputed":
765 |             if self.init == 'pca':
766 |                 raise ValueError("The parameter init=\"pca\" cannot be used "
767 |                                  "with metric=\"precomputed\".")
768 |             if X.shape[0] != X.shape[1]:
769 |                 raise ValueError("X should be a square distance matrix")
770 |             distances = X
771 |         else:
772 |             if self.verbose:
773 |                 print("[t-SNE] Computing pairwise distances...")
774 | 
775 |             if self.metric == "euclidean":
776 |                 distances = pairwise_distances(X, metric=self.metric,
777 |                                                squared=True)
778 |             else:
779 |                 distances = pairwise_distances(X, metric=self.metric)
780 | 
781 |         if not np.all(distances >= 0):
782 |             raise ValueError("All distances should be non-negative; either "
783 |                              "the metric or the precomputed distances given "
784 |                              "as X are not correct")
785 | 
786 |         # Degrees of freedom of the Student's t-distribution. The suggestion
787 |         # degrees_of_freedom = n_components - 1 comes from
788 |         # "Learning a Parametric Embedding by Preserving Local Structure"
789 |         # Laurens van der Maaten, 2009.
790 |         degrees_of_freedom = max(self.n_components - 1.0, 1)
791 |         n_samples = X.shape[0]
792 |         # the number of nearest neighbors to find
793 |         k = min(n_samples - 1, int(3. * self.perplexity + 1))
794 | 
795 |         neighbors_nn = None
796 |         if self.method == 'barnes_hut':
797 |             if self.verbose:
798 |                 print("[t-SNE] Computing %i nearest neighbors..." % k)
799 |             if self.metric == 'precomputed':
800 |                 # Use the precomputed distances to find
801 |                 # the k nearest neighbors and their distances
802 |                 neighbors_nn = np.argsort(distances, axis=1)[:, :k]
803 |             else:
804 |                 # Find the nearest neighbors for every point
805 |                 bt = BallTree(X)
806 |                 # LvdM uses 3 * perplexity as the number of neighbors,
807 |                 # and we add one to avoid counting the data point itself.
808 |                 # In the event that we have a very small number of points,
809 |                 # set the neighbors to n - 1.
810 |                 distances_nn, neighbors_nn = bt.query(X, k=k + 1)
811 |                 neighbors_nn = neighbors_nn[:, 1:]
812 |             P = _joint_probabilities_nn(distances, neighbors_nn, y,
813 |                                         self.label_importance,
814 |                                         self.class_sizes_are_representative,
815 |                                         self.perplexity, self.verbose)
816 |         else:
817 |             P = _joint_probabilities(distances, y,
818 |                                      self.label_importance,
819 |                                      self.class_sizes_are_representative,
820 |                                      self.perplexity, self.verbose)
821 |         assert np.all(np.isfinite(P)), "All probabilities should be finite"
822 |         assert np.all(P >= 0), "All probabilities should be zero or positive"
823 |         assert np.all(P <= 1), ("All probabilities should be less "
824 |                                 "than or equal to one")
825 | 
826 |         if self.init == 'pca':
827 |             pca = PCA(n_components=self.n_components, svd_solver='randomized',
828 |                       random_state=random_state)
829 |             X_embedded = pca.fit_transform(X)
830 |         elif isinstance(self.init, np.ndarray):
831 |             X_embedded = self.init
832 |         elif self.init == 'random':
833 |             X_embedded = None
834 |         else:
835 |             raise ValueError("Unsupported initialization scheme: %s"
836 |                              % self.init)
837 | 
838 |         return self._tsne(P, degrees_of_freedom, n_samples, random_state,
839 |                           X_embedded=X_embedded,
840 |                           neighbors=neighbors_nn,
841 |                           skip_num_points=skip_num_points)
842 | 
843 |     def _tsne(self, P, degrees_of_freedom, n_samples, random_state,
844 |               X_embedded=None, neighbors=None, skip_num_points=0):
845 |         """Runs t-SNE."""
846 |         # t-SNE minimizes the Kullback-Leibler divergence of the Gaussians P
847 |         # and the Student's t-distributions Q. The optimization algorithm that
848 |         # we use is batch gradient descent with three stages:
849 |         # * early exaggeration with momentum 0.5
850 |         # * early exaggeration with momentum 0.8
851 |         # * final optimization with momentum 0.8
852 |         # The embedding is initialized with iid samples from Gaussians with
853 |         # standard deviation 1e-4.
854 | 
855 |         if X_embedded is None:
856 |             # Initialize embedding randomly
857 |             X_embedded = 1e-4 * random_state.randn(n_samples,
858 |                                                    self.n_components)
859 |         params = X_embedded.ravel()
860 | 
861 |         # Arguments for the first (early exaggeration) optimization stage
862 |         opt_args = {"n_iter": 50, "momentum": 0.5, "it": 0,
863 |                     "learning_rate": self.learning_rate,
864 |                     "verbose": self.verbose, "n_iter_check": 25,
865 |                     "kwargs": dict(skip_num_points=skip_num_points)}
866 |         if self.method == 'barnes_hut':
867 |             m = "Must provide an array of neighbors to use Barnes-Hut"
868 |             assert neighbors is not None, m
869 |             obj_func = _kl_divergence_bh
870 |             objective_error = _kl_divergence_error
871 |             sP = squareform(P).astype(np.float32)
872 |             neighbors = neighbors.astype(np.int64)
873 |             args = [sP, neighbors, degrees_of_freedom, n_samples,
874 |                     self.n_components]
875 |             opt_args['args'] = args
876 |             opt_args['min_grad_norm'] = 1e-3
877 |             opt_args['n_iter_without_progress'] = 30
878 |             # Don't always calculate the cost since that calculation
879 |             # can be nearly as expensive as the gradient
880 |             opt_args['objective_error'] = objective_error
881 |             opt_args['kwargs']['angle'] = self.angle
882 |             opt_args['kwargs']['verbose'] = self.verbose
883 |         else:
884 |             obj_func = _kl_divergence
885 |             opt_args['args'] = [P, degrees_of_freedom, n_samples,
886 |                                 self.n_components]
887 |             opt_args['min_error_diff'] = 0.0
888 |             opt_args['min_grad_norm'] = 0.0
889 | 
890 |         # Early exaggeration
891 |         P *= self.early_exaggeration
892 | 
893 |         params, kl_divergence, it = _gradient_descent(obj_func, params,
894 |                                                       **opt_args)
895 |         opt_args['n_iter'] = 100
896 |         opt_args['momentum'] = 0.8
897 |         opt_args['it'] = it + 1
898 |         params, kl_divergence, it = _gradient_descent(obj_func, params,
899 |                                                       **opt_args)
900 |         if self.verbose:
901 |             print("[t-SNE] KL divergence after %d iterations with early "
902 |                   "exaggeration: %f" % (it + 1, kl_divergence))
903 |         # Final optimization: undo exaggeration and run the full schedule.
904 |         P /= self.early_exaggeration
905 |         opt_args['n_iter'] = self.n_iter
906 |         opt_args['it'] = it + 1
907 |         params, kl_divergence, it = _gradient_descent(obj_func, params,
908 |                                                       **opt_args)
909 |         # Save the final number of iterations
910 |         self.n_iter_final = it
911 | 
912 |         if self.verbose:
913 |             print("[t-SNE] Error after %d iterations: %f"
914 |                   % (it + 1, kl_divergence))
915 | 
916 |         X_embedded = params.reshape(n_samples, self.n_components)
917 |         self.kl_divergence_ = kl_divergence
918 | 
919 |         return X_embedded
920 | 
921 |     def fit_transform(self, X, y):
922 |         """Fit X into an embedded space and return that transformed
923 |         output.
924 | 
925 |         Parameters
926 |         ----------
927 |         X : array, shape (n_samples, n_features) or (n_samples, n_samples)
928 |             If the metric is 'precomputed' X must be a square distance
929 |             matrix. Otherwise it contains a sample per row.
930 | 
931 |         y : array, shape (n_samples,)
932 |             A (partial) labelling of the samples. The array should provide
933 |             a label value for each sample. Labels must be integers, with
934 |             unlabelled points given the label -1.
935 | 
936 |         Returns
937 |         -------
938 |         X_new : array, shape (n_samples, n_components)
939 |             Embedding of the training data in low-dimensional space.
940 |         """
941 |         embedding = self._fit(X, y)
942 |         self.embedding_ = embedding
943 |         return self.embedding_
944 | 
945 |     def fit(self, X, y):
946 |         """Fit X into an embedded space.
947 | 
948 |         Parameters
949 |         ----------
950 |         X : array, shape (n_samples, n_features) or (n_samples, n_samples)
951 |             If the metric is 'precomputed' X must be a square distance
952 |             matrix. Otherwise it contains a sample per row. If the method
953 |             is 'exact', X may be a sparse matrix of type 'csr', 'csc'
954 |             or 'coo'.
955 | 
956 |         y : array, shape (n_samples,)
957 |             A (partial) labelling of the samples. The array should provide
958 |             a label value for each sample. Labels must be integers, with
959 |             unlabelled points given the label -1.
960 | 
961 |         Returns
962 |         -------
963 |         self : SemiSupervisedTSNE
964 |             The fitted estimator.
965 |         """
966 |         self.fit_transform(X, y)
967 |         return self
968 | 
--------------------------------------------------------------------------------
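
As a closing illustration, here is a minimal usage sketch (not a file from the repository). It relies only on the public names documented in ss_t_sne.py above: SemiSupervisedTSNE, its fit_transform(X, y) signature with -1 marking unlabelled points, and the module-level trustworthiness helper. The synthetic data and parameter values are illustrative assumptions, not part of the project.

import numpy as np

from sstsne import SemiSupervisedTSNE
from sstsne.ss_t_sne import trustworthiness

# Two Gaussian blobs in 10 dimensions; label one point from each blob
# and mark every other point as unlabelled (-1).
rng = np.random.RandomState(42)
X = np.vstack([rng.normal(0.0, 1.0, size=(50, 10)),
               rng.normal(5.0, 1.0, size=(50, 10))])
y = np.full(100, -1, dtype=np.int64)
y[0] = 0
y[50] = 1

model = SemiSupervisedTSNE(n_components=2, perplexity=15.0,
                           label_importance=1.0, random_state=0)
X_embedded = model.fit_transform(X, y)

print(X_embedded.shape)       # (100, 2)
print(model.kl_divergence_)   # KL divergence after optimization
print(trustworthiness(X, X_embedded, n_neighbors=5))  # closer to 1 is better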