├── .gitignore
├── .travis.yml
├── README.md
├── US_County_Level_Presidential_Results_08-16.csv
├── example.ipynb
├── gendata.py
├── license.txt
├── pytest.ini
├── setup.py
└── spenc
    ├── __init__.py
    ├── abstracts.py
    ├── scores.py
    ├── tests
    │   ├── __init__.py
    │   ├── data
    │   │   ├── nat_10k_nodata.ary
    │   │   ├── nat_30k_discovered.ary
    │   │   ├── nat_30k_randoms.ary
    │   │   ├── nat_infk_discovered.ary
    │   │   └── nat_infk_randoms.ary
    │   └── test_spenc.py
    └── utils.py

/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | .ipynb_checkpoints
3 | *.pyc
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | sudo: false
 3 | branches:
 4 |   only:
 5 |     - master
 6 | python:
 7 |   - "3.5"
 8 |   - "3.6"
 9 |
10 | before_install:
11 |   - wget https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh
12 |   - chmod +x miniconda.sh
13 |   - ./miniconda.sh -b -p ./miniconda
14 |   - export PATH=`pwd`/miniconda/bin:$PATH
15 |   - conda update --yes conda
16 |   - conda create -y -q -n test-env python=$TRAVIS_PYTHON_VERSION
17 |   - source activate test-env
18 |
19 | install:
20 |   - conda install --yes pip
21 |   - conda install --yes scikit-learn scipy nose
22 |   - pip install geopandas pysal pytest
23 |
24 | script:
25 |   - python setup.py sdist >/dev/null
26 |   - python -c "import numpy; print(numpy.show_config())"
27 |   - pytest spenc;
28 | notifications:
29 |   email:
30 |     recipients:
31 |       - levi.john.wolf@gmail.com
32 |     on_success: change
33 |     on_failure: always
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Spatially-Encouraged Spectral Clustering
 2 | [![Build Status](https://travis-ci.org/ljwolf/spenc.svg?branch=master)](https://travis-ci.org/ljwolf/spenc)
 3 | [![DOI](https://zenodo.org/badge/129973633.svg)](https://zenodo.org/badge/latestdoi/129973633)
 4 |
 5 |
 6 | This repository provides the code for spatially-encouraged spectral clustering and walks through how to use it. Refer to the [example notebook](https://github.com/ljwolf/spenc/blob/master/example.ipynb) for more information on usage.
 7 |
 8 | Usage requires `scikit-learn` and `scipy`. The package is released on PyPI as `spenc`, so it can be installed using:
 9 |
10 | `pip install spenc`
11 |
12 | # Citation
13 |
14 | If you would like to reference this software, please cite its Zenodo listing:
15 |
16 | Wolf, Levi John. 2018. “Ljwolf/spenc: GISRUK”. Zenodo. doi:10.5281/zenodo.1219904.
17 |
18 | And, for the paper defining the algorithm:
19 |
20 | Wolf, Levi John. *(In Review)* "Spatially-Encouraged Spectral Clustering." *International Journal of Geographic Information Science*.
21 |
22 | with a full preprint available at the [Open Science Framework](https://osf.io/yzt2p).
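For a quick start, the sketch below shows the basic API. It is a minimal illustration, not shipped example data: the toy `X` and `W` are placeholders for any `(N, P)` feature array and an aligned, binary, connected sparse adjacency matrix (here, rook adjacency on a 3x3 grid):

```python
import numpy as np
import scipy.sparse as sparse
from spenc import SPENC

# toy data: 9 observations on a 3x3 grid, 2 attributes each
np.random.seed(0)
X = np.random.normal(size=(9, 2))

# rook adjacency for the 3x3 grid, as a binary sparse matrix
W = sparse.lil_matrix((9, 9))
for i in range(9):
    r, c = divmod(i, 3)
    for dr, dc in ((-1, 0), (1, 0), (0, -1), (0, 1)):
        rr, cc = r + dr, c + dc
        if 0 <= rr < 3 and 0 <= cc < 3:
            W[i, rr * 3 + cc] = 1

labels = SPENC(n_clusters=3).fit(X, W.tocsr()).labels_
print(labels)  # one cluster label per observation
```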
23 |
--------------------------------------------------------------------------------
/gendata.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import spenc
 3 | import pysal as ps
 4 | import geopandas as gpd
 5 | import os
 6 | SEED = 1901
 7 | dir_self = os.path.dirname(__file__)
 8 | datadir = os.path.join(dir_self, 'spenc/tests/data')
 9 |
10 | if __name__ == "__main__":
11 |     nat = gpd.read_file(ps.examples.get_path("NAT.shp"))
12 |     natR = ps.weights.Rook.from_dataframe(nat)
13 |     names = nat.filter(like='90').columns.tolist() + nat.filter(like='89').columns.tolist()
14 |     X = nat[names].values
15 |     X = (X - X.mean(axis=0))/X.var(axis=0)  # NOTE: columns are scaled by their variance; the stored test fixtures assume this scaling
16 |
17 |     print('(1 of 5) doing 10k nodata')
18 |     np.random.seed(SEED)
19 |     labels = spenc.SPENC(n_clusters=10, random_state=SEED).fit(None, natR.sparse).labels_
20 |     labels.dump(os.path.join(datadir, 'nat_10k_nodata.ary'))
21 |
22 |     print('(2 of 5) doing 30k sampling')
23 |     np.random.seed(SEED)
24 |     labels = spenc.SPENC(n_clusters=30, random_state=SEED).sample(natR.sparse, n_samples=3)
25 |     labels.dump(os.path.join(datadir, 'nat_30k_randoms.ary'))
26 |
27 |     print('(3 of 5) doing 30k withdata')
28 |     np.random.seed(SEED)
29 |     labels = spenc.SPENC(n_clusters=30, gamma=.001, random_state=SEED).fit(X, natR.sparse).labels_
30 |     labels.dump(os.path.join(datadir, 'nat_30k_discovered.ary'))
31 |
32 |     print('(4 of 5) doing infk sampling')
33 |     np.random.seed(SEED)
34 |     labels = spenc.SPENC(n_clusters=np.inf, random_state=SEED).sample(natR.sparse, floor=20)
35 |     labels.dump(os.path.join(datadir, 'nat_infk_randoms.ary'))
36 |
37 |     print('(5 of 5) doing infk withdata')
38 |     np.random.seed(SEED)
39 |     labels = spenc.SPENC(n_clusters=np.inf, gamma=.001, random_state=SEED).fit(X, natR.sparse, floor=20).labels_
40 |     labels.dump(os.path.join(datadir, 'nat_infk_discovered.ary'))
41 |
42 |
43 |     print('done!')
44 |
--------------------------------------------------------------------------------
/license.txt:
--------------------------------------------------------------------------------
 1 | Copyright 2018, Levi John Wolf
 2 |
 3 | Redistribution and use in source and binary forms, with or without modification, are
 4 | permitted provided that the following conditions are met:
 5 |
 6 | 1. Redistributions of source code must retain the above copyright notice, this list
 7 |    of conditions and the following disclaimer.
 8 |
 9 | 2. Redistributions in binary form must reproduce the above copyright notice, this list
10 |    of conditions and the following disclaimer in the documentation and/or other materials
11 |    provided with the distribution.
12 |
13 | 3. Neither the name of the copyright holder nor the names of its contributors may be used
14 |    to endorse or promote products derived from this software without specific prior
15 |    written permission.
16 |
17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
18 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 | OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
20 | SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
22 | OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
24 | TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
25 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 |
--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | filterwarnings =
3 |     once
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | import os
 3 |
 4 | basepath = os.path.dirname(__file__)
 5 | init = os.path.join(basepath, 'spenc/__init__.py')
 6 |
 7 | with open(init, 'r') as initfile:
 8 |     firstline = initfile.readline()
 9 | init_version = firstline.split('=')[-1].strip().strip('"\'')  # strip the surrounding quote characters from the version string
10 |
11 | setup(name='spenc',
12 |       version=init_version,
13 |       description='Spatially-Encouraged Spectral Clustering, a method for discovering clusters in, or deriving labels for, spatially-referenced data with attributes attached',
14 |       url='https://github.com/ljwolf/spenc',
15 |       author='Levi John Wolf',
16 |       author_email='levi.john.wolf@gmail.com',
17 |       license='3-Clause BSD',
18 |       python_requires='>=3.5',
19 |       packages=['spenc'],
20 |       install_requires=['scikit-learn>=0.20.0', 'scipy'])
--------------------------------------------------------------------------------
/spenc/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = '0.3.0dev'
2 |
3 | from .abstracts import SPENC
--------------------------------------------------------------------------------
/spenc/abstracts.py:
--------------------------------------------------------------------------------
  1 | from sklearn import cluster as clust
  2 | import sklearn.metrics as skm
  3 | import sklearn.metrics.pairwise as pw
  4 | from sklearn.utils.validation import check_array
  5 | from .utils import check_weights
  6 | from sklearn.neighbors import kneighbors_graph
  7 | from sklearn.utils.extmath import _deterministic_vector_sign_flip
  8 | from sklearn.utils import check_random_state
  9 | from sklearn.cluster.spectral import discretize as _discretize
 10 | from sklearn.preprocessing import LabelEncoder
 11 | from sklearn.base import clone
 12 | import numpy as np
 13 | from .scores import boundary_fraction
 14 | import scipy.sparse as spar
 15 | from scipy.sparse import csgraph as cg, linalg as la
 16 | from warnings import warn as Warn
 17 |
 18 | class SPENC(clust.SpectralClustering):
 19 |     def __init__(self, n_clusters=8, eigen_solver=None, random_state=None,
 20 |                  n_init=10, gamma=1., affinity='rbf', n_neighbors=10,
 21 |                  eigen_tol=1e-9, assign_labels='kmeans', degree=3, coef0=1,
 22 |                  kernel_params=None, n_jobs=1):
 23 |         """
 24 |         Apply clustering to a projection of the normalized Laplacian, using
 25 |         spatial information to constrain the clustering.
 26 |
 27 |         In practice, Spectral Clustering is very useful when the structure of
 28 |         the individual clusters is highly non-convex, or more generally when
 29 |         a measure of the center and spread of the cluster is not a suitable
 30 |         description of the complete cluster, for instance when clusters are
 31 |         nested circles on the 2D plane.
 32 |
 33 |         Spatially-Encouraged Spectral Clustering (SPENC) is useful when
 34 |         there may be highly non-convex clusters or clusters with irregular
 35 |         topology in a geographic context.
 36 |
 37 |         If a binary weights matrix is provided during fit, this method can be
 38 |         used to find weighted normalized graph cuts.
 39 |
 40 |         When calling ``fit``, an affinity matrix is constructed using either a
 41 |         kernel function, such as the Gaussian (aka RBF) kernel of the euclidean
 42 |         distance ``d(X, X)``::
 43 |
 44 |             np.exp(-gamma * d(X,X) ** 2)
 45 |
 46 |         or a k-nearest neighbors connectivity matrix.
 47 |
 48 |         Alternatively, using ``precomputed``, a user-provided affinity
 49 |         matrix can be used.
 50 |
 51 |         Read more in the scikit-learn user guide on spectral clustering.
 52 |
 53 |         Parameters
 54 |         ----------
 55 |         n_clusters : integer, optional
 56 |             The number of clusters to search for.
 57 |
 58 |         eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'}
 59 |             NOTE: ignored unless fitting using the `breakme` flag, so do not use.
 60 |             The eigenvalue decomposition strategy to use. AMG requires pyamg
 61 |             to be installed. It can be faster on very large, sparse problems,
 62 |             but may also lead to instabilities.
 63 |
 64 |         random_state : int, RandomState instance or None, optional, default: None
 65 |             A pseudo random number generator used for the initialization of the
 66 |             lobpcg eigen vectors decomposition when eigen_solver == 'amg' and by
 67 |             the K-Means initialization. If int, random_state is the seed used by
 68 |             the random number generator; if RandomState instance, random_state is
 69 |             the random number generator; if None, the random number generator is
 70 |             the RandomState instance used by `np.random`.
 71 |
 72 |         n_init : int, optional, default: 10
 73 |             Number of times the k-means algorithm will be run with different
 74 |             centroid seeds. The final results will be the best output of
 75 |             n_init consecutive runs in terms of inertia.
 76 |
 77 |         gamma : float, default=1.0
 78 |             Kernel coefficient for rbf, poly, sigmoid, laplacian and chi2 kernels.
 79 |             Ignored for ``affinity='nearest_neighbors'``.
 80 |
 81 |         affinity : string, array-like or callable, default 'rbf'
 82 |             If a string, this may be one of 'nearest_neighbors', 'precomputed',
 83 |             'rbf' or one of the kernels supported by
 84 |             `sklearn.metrics.pairwise_kernels`.
 85 |
 86 |             Only kernels that produce similarity scores (non-negative values that
 87 |             increase with similarity) should be used. This property is not checked
 88 |             by the clustering algorithm.
 89 |
 90 |         n_neighbors : integer
 91 |             Number of neighbors to use when constructing the affinity matrix using
 92 |             the nearest neighbors method. Ignored for ``affinity='rbf'``.
 93 |
 94 |         eigen_tol : float, optional, default: 1e-9
 95 |             Stopping criterion for eigendecomposition of the Laplacian matrix
 96 |             when using the arpack eigen_solver.
 97 |
 98 |         assign_labels : {'kmeans', 'discretize', 'hierarchical'}, default: 'kmeans'
 99 |             The strategy to use to assign labels in the embedding
100 |             space. There are three ways to assign labels after the laplacian
101 |             embedding:
102 |             1. k-means can be applied and is a popular choice. But it can
103 |                also be sensitive to initialization.
104 |             2. Discretization is another approach which is less sensitive to
105 |                random initialization, and which usually finds better clusters.
106 |             3. Hierarchical decomposition repeatedly bi-partitions the graph,
107 |                instead of finding the decomposition all at once, as suggested in
108 |                Shi & Malik (2000).
109 |
110 |         degree : float, default=3
111 |             Degree of the polynomial affinity kernel. Ignored by other kernels.
112 |
113 |         coef0 : float, default=1
114 |             Zero coefficient for polynomial and sigmoid affinity kernels.
115 |             Ignored by other kernels.
116 |
117 |         kernel_params : dictionary of string to any, optional
118 |             Parameters (keyword arguments) and values for affinity kernel passed as
119 |             callable object. Ignored by other affinity kernels.
120 |
121 |         n_jobs : int, optional (default = 1)
122 |             The number of parallel jobs to run for the nearest-neighbors
123 |             affinity kernel, if used.
124 |             If ``-1``, then the number of jobs is set to the number of CPU cores.
125 |
126 |         Attributes
127 |         ----------
128 |         affinity_matrix_ : array-like, shape (n_samples, n_samples)
129 |             Affinity matrix used for clustering. Available only after calling
130 |             ``fit``.
131 |
132 |         labels_ : array, shape (n_samples,)
133 |             Labels of each point.
134 |
135 |         Notes
136 |         -----
137 |         If you have an affinity matrix, such as a distance matrix,
138 |         for which 0 means identical elements, and high values mean
139 |         very dissimilar elements, it can be transformed into a
140 |         similarity matrix that is well suited for the algorithm by
141 |         applying the Gaussian (RBF, heat) kernel::
142 |
143 |             np.exp(- dist_matrix ** 2 / (2. * delta ** 2))
144 |
145 |         where ``delta`` is a free parameter representing the width of the Gaussian
146 |         kernel.
147 |
148 |         Another alternative is to take a symmetric version of the k
149 |         nearest neighbors connectivity matrix of the points.
150 |
151 |         References
152 |         ----------
153 |
154 |         - Normalized cuts and image segmentation, 2000
155 |           Jianbo Shi, Jitendra Malik
156 |           http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.160.2324
157 |
158 |         - A Tutorial on Spectral Clustering, 2007
159 |           Ulrike von Luxburg
160 |           http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.165.9323
161 |
162 |         - Multiclass spectral clustering, 2003
163 |           Stella X. Yu, Jianbo Shi
164 |           http://www1.icsi.berkeley.edu/~stellayu/publication/doc/2003kwayICCV.pdf
165 |         """
166 |         self.n_clusters = n_clusters
167 |         self.eigen_solver = eigen_solver
168 |         self.random_state = random_state
169 |         self.n_init = n_init
170 |         self.gamma = gamma
171 |         self.affinity = affinity
172 |         self.n_neighbors = n_neighbors
173 |         self.eigen_tol = eigen_tol
174 |         self.assign_labels = assign_labels
175 |         self.degree = degree
176 |         self.coef0 = coef0
177 |         self.kernel_params = kernel_params
178 |         self.n_jobs = n_jobs
179 |
180 |     def fit(self, X, W=None, y=None, shift_invert=True, check_W=True,
181 |             grid_resolution=100, floor=0, floor_weights=None, cut_method='gridsearch'):
182 |         """Creates an affinity matrix for X using the selected affinity,
183 |         applies W to the affinity elementwise, and then applies spectral clustering
184 |         to the affinity matrix.
185 |
186 |         Arguments
187 |         ---------
188 |         X : sparse or dense array
189 |             matrix containing P features for N observations.
190 |         W : sparse or dense array, default None
191 |             matrix expressing the pairwise spatial relationships
192 |             between N observations.
193 |         y : sparse or dense array, default None
194 |             ignored; present for scikit-learn interface regularity purposes.
195 |         shift_invert : bool, default True
196 |             boolean governing whether or not to use the shift-invert
197 |             trick for finding sparse eigenvectors.
198 |
199 |         breakme : bool, default False (not exposed in the signature above; see NOTE below)
200 |             Whether or not to simply pipe down to the sklearn spectral
201 |             clustering class. Will likely break the formal guarantees
202 |             about contiguity/connectedness of solutions, due to the
203 |             standardizations/shortcuts taken in sklearn.cluster.SpectralClustering.
204 |         check_W : bool, default True
205 |             Whether or not to check that the spatial weights matrix
206 |             is correctly formatted and aligns with the X matrix.
207 |         grid_resolution : int, default 100
208 |             how many subdivisions to use when doing the gridsearch
209 |             for the cutpoint on the second eigenvector of subgraphs.
210 |         floor : float/int, default 0
211 |             value which governs the lower limit on the size of partitions.
212 |             If 0, there is no limit.
213 |             If floor_weights are provided, floor should be a limit on
214 |             the sum of floor weights for each region.
215 |         floor_weights : np.ndarray of shape (n,), default np.ones((n,))
216 |             array containing weights for each observation used to determine
217 |             the region floor.
218 |         cut_method : str, default 'gridsearch'
219 |             option governing what method to use to partition regions:
220 |             1. "gridsearch" (default): the hierarchical grid search
221 |                suggested by Shi & Malik (2000); search the second
222 |                eigenvector for the "best" partition in terms of cut weight.
223 |             2. "zero": cut the eigenvector at zero. Usually a passable solution,
224 |                since the second eigenvector is usually centered around zero.
225 |             3. "median": cut the eigenvector through its median. This means the
226 |                regions will always be divided into two halves with equal numbers
227 |                of elemental units.
228 |             "gridsearch" may be slow when grid_resolution is large.
229 |             "zero" is the best method for large data.
230 |
231 |         NOTE:
232 |
233 |         breakme sends the affinity matrix down to scikit's spectral clustering class;
234 |         it is called breakme because of scikit-learn bug 8129.
235 |         I don't see a significant difference here when switching between the two;
236 |         most assignments in the problems I've examined are the same.
237 |         I think, since the bug is in the scaling of the eigenvectors, it's not super important.
238 |
239 |         But, in the future, it may make sense to investigate whether the bug in sklearn
240 |         is fully fixed, which would mean that any spectral clustering for
241 |         a weights matrix in sklearn would always be contiguous.
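        Example
        -------
        A minimal illustrative sketch, mirroring the usage in gendata.py
        (``X`` and ``W`` are placeholders for an (N, P) attribute matrix and an
        aligned binary scipy.sparse adjacency matrix, not data shipped with
        this package)::

            model = SPENC(n_clusters=30, gamma=.001).fit(X, W)
            model.labels_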
242 |
243 |         """
244 |         if np.isinf(self.n_clusters):
245 |             self.assign_labels = 'hierarchical'
246 |
247 |         if W is None:
248 |             W = spar.csc_matrix(np.ones((X.shape[0], X.shape[0])))  # default to a fully-connected spatial graph
249 |
250 |         if X is not None:
251 |             X = check_array(X, accept_sparse=['csr', 'coo', 'csc'],
252 |                             dtype=np.float64, ensure_min_samples=2)
253 |             if check_W:
254 |                 W = check_weights(W, X)
255 |
256 |             if self.affinity == 'nearest_neighbors':
257 |                 connectivity = kneighbors_graph(X, n_neighbors=self.n_neighbors,
258 |                                                 include_self=True, n_jobs=self.n_jobs)
259 |                 attribute_affinity_ = .5 * (connectivity + connectivity.T)
260 |             elif self.affinity == 'precomputed':
261 |                 attribute_affinity_ = W.multiply(X)
262 |             else:
263 |                 params = self.kernel_params
264 |                 if params is None:
265 |                     params = {}
266 |                 if not callable(self.affinity):
267 |                     params['gamma'] = self.gamma
268 |                     params['degree'] = self.degree
269 |                     params['coef0'] = self.coef0
270 |                 attribute_affinity_ = pw.pairwise_kernels(X, metric=self.affinity,
271 |                                                           filter_params=True,
272 |                                                           **params)
273 |             spatial_affinity_ = W
274 |             affinity_matrix_ = W.multiply(attribute_affinity_)
275 |         else:
276 |             affinity_matrix_ = W
277 |         affinity_old = self.affinity
278 |         self.affinity = 'precomputed'
279 |         super().fit(affinity_matrix_)
280 |         self.affinity = affinity_old  # restore the user-specified affinity after fitting
281 |         return self
282 |
283 |     def _embed(self, affinity, shift_invert=True):
284 |         """
285 |         Compute the eigenspace embedding of a given affinity matrix.
286 |
287 |         Arguments
288 |         ---------
289 |         affinity : sparse or dense matrix
290 |             affinity matrix to compute the spectral embedding of
291 |         shift_invert : bool
292 |             whether or not to use the shift-invert eigenvector search
293 |             trick, useful for finding sparse eigenvectors.
294 |         """
295 |         laplacian, orig_d = cg.laplacian(affinity,
296 |                                          normed=True, return_diag=True)
297 |         laplacian *= -1
298 |         random_state = check_random_state(self.random_state)
299 |         v0 = random_state.uniform(-1, 1, laplacian.shape[0])
300 |
301 |         if not shift_invert:
302 |             ev, spectrum = la.eigsh(laplacian, which='LA', k=self.n_clusters, v0=v0,
303 |                                     tol=self.eigen_tol)
304 |         else:
305 |             ev, spectrum = la.eigsh(laplacian, which='LM', sigma=1, k=self.n_clusters, v0=v0,
306 |                                     tol=self.eigen_tol)
307 |
308 |         embedding = spectrum.T[self.n_clusters::-1]  # sklearn/issues/8129
309 |         embedding = embedding / orig_d
310 |         embedding = _deterministic_vector_sign_flip(embedding)
311 |         return embedding
312 |
313 |     def _spectral_bipartition(self, affinity_matrix_,
314 |                               grid_resolution=100,
315 |                               shift_invert=True, floor=0,
316 |                               floor_weights=None,
317 |                               cut_method='gridsearch'):
318 |         """
319 |         Implements the recursive spectral bipartitioning of Shi and Malik (2000).
320 |         If n_clusters = np.inf and floor > 0, this will find
321 |         all possible cuts containing more than `floor` units.
322 |
323 |         Arguments
324 |         ---------
325 |         grid_resolution : int
326 |             how many subdivisions to use when doing the gridsearch
327 |             for the cutpoint on the second eigenvector of subgraphs.
328 |             (Default: 100)
329 |         shift_invert : bool
330 |             boolean governing whether or not to use the shift-invert
331 |             trick for finding sparse eigenvectors.
332 |             (Default: True)
333 |         floor : float/int
334 |             value which governs the lower limit on the size of partitions.
335 |             If 0, there is no limit.
336 |             If floor_weights are provided, floor should be a limit on
337 |             the sum of floor weights for each region.
338 |             (Default: 0)
339 |         floor_weights : np.ndarray of shape (n,)
340 |             array containing weights for each observation used to determine
341 |             the region floor.
342 |             (Default: np.ones((n,)))
343 |         cut_method : str
344 |             option governing what method to use to partition regions:
345 |             1. "gridsearch" (default): the hierarchical grid search
346 |                suggested by Shi & Malik (2000); search the second
347 |                eigenvector for the "best" partition in terms of cut weight.
348 |             2. "zero": cut the eigenvector at zero. Usually a passable solution,
349 |                since the second eigenvector is usually centered around zero.
350 |             3. "median": cut the eigenvector through its median. This means the
351 |                regions will always be divided into two halves with equal numbers
352 |                of elemental units.
353 |             "gridsearch" may be slow when grid_resolution is large.
354 |             "zero" is the best method for large data.
355 |         """
356 |         if floor_weights is None:
357 |             floor_weights = np.ones((affinity_matrix_.shape[0],))
358 |         if spar.issparse(affinity_matrix_):
359 |             affinity_matrix_ = affinity_matrix_.tocsr()
360 |         threshold = self.n_clusters
361 |         self.n_clusters = 2
362 |         discovered = 1
363 |         this_cut = np.ones((affinity_matrix_.shape[0],)).astype(bool)
364 |         cuts = []
365 |         accepted_cuts = []
366 |         while discovered < threshold:
367 |             if this_cut.sum() > 2:  # don't try to cut regions with fewer than 3 units
368 |                 current_affinity = affinity_matrix_[this_cut, :][:, this_cut]
369 |                 embedding = self._embed(current_affinity, shift_invert=shift_invert)
370 |                 second_eigenvector = embedding[1]
371 |                 new_cut, score_of_cut = self._make_hierarchical_cut(second_eigenvector,
372 |                                                                     current_affinity,
373 |                                                                     grid_resolution,
374 |                                                                     cut_method=cut_method,
375 |                                                                     floor=floor)
376 |                 left_cut = this_cut.copy()
377 |                 left_cut[left_cut] *= new_cut
378 |                 right_cut = this_cut.copy()
379 |                 right_cut[right_cut] *= ~new_cut
380 |                 assert len(this_cut) == len(left_cut) == len(right_cut), "Indexing Error in cutting!"
381 |                 if (((left_cut*floor_weights).sum() > floor)
382 |                     & ((right_cut*floor_weights).sum() > floor)):
383 |                     if ((tuple(left_cut) not in accepted_cuts)
384 |                         & (tuple(right_cut) not in accepted_cuts)):
385 |                         cuts.append(left_cut)
386 |                         accepted_cuts.append(tuple(left_cut))
387 |                         cuts.append(right_cut)
388 |                         accepted_cuts.append(tuple(right_cut))
389 |                         discovered += 1
390 |             try:
391 |                 this_cut = cuts.pop(0)
392 |             except IndexError:
393 |                 break
394 |         accepted_cuts = np.vstack(accepted_cuts)
395 |         labels = np.ones((accepted_cuts[0].shape[0],)) * -1.0
396 |         for i, k in enumerate(np.flipud(accepted_cuts)):
397 |             unassigned = labels == -1
398 |             should_assign = (unassigned & k)
399 |             labels[should_assign] = i
400 |         return LabelEncoder().fit_transform(labels)
401 |
402 |     def _make_hierarchical_cut(self, second_eigenvector,
403 |                                affinity_matrix,
404 |                                grid_resolution,
405 |                                cut_method='median',
406 |                                floor=0):
407 |         """Compute a single hierarchical cut using one of the methods described in
408 |         Shi and Malik (2000).
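        The "gridsearch" option scores each candidate cutpoint with the
        normalized-cut objective of Shi and Malik (2000),

            Ncut(A, B) = cut(A, B)/assoc(A, V) + cut(A, B)/assoc(B, V),

        where cut(A, B) is the total affinity crossing the partition and
        assoc(A, V) is the total affinity from A to the whole graph. This is
        exactly the quantity ``cutAB/assocA + cutAB/assocB`` computed by the
        closure built in ``mkobjective`` below.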
409 |         """
410 |         def mkobjective(second_eigenvector):
411 |             """This makes a closure around the objective function given an eigenvector"""
412 |             def objective(cutpoint):
413 |                 cut = second_eigenvector <= cutpoint
414 |                 assocA = affinity_matrix[cut].sum(axis=1).sum()
415 |                 assocB = affinity_matrix[~cut].sum(axis=1).sum()
416 |                 cutAB = affinity_matrix[cut, :][:, ~cut].sum(axis=1).sum() * 2
417 |                 score = cutAB/assocA + cutAB/assocB
418 |                 if np.isnan(score):
419 |                     score = np.inf
420 |                 return score
421 |             return objective
422 |
423 |         objective = mkobjective(second_eigenvector)
424 |
425 |         if cut_method == 'gridsearch':
426 |             support = np.linspace(*np.percentile(second_eigenvector, q=(2, 98)),
427 |                                   num=grid_resolution)
428 |
429 |
430 |             objective_surface = [objective(cutpoint) for cutpoint in support]
431 |             cutpoint = support[np.argmin(objective_surface)]
432 |             cut = second_eigenvector <= cutpoint
433 |             return cut, np.min(objective_surface)
434 |         elif cut_method == 'median':
435 |             median = np.median(second_eigenvector)
436 |             score = objective(median)
437 |             return second_eigenvector < median, score
438 |         else:
439 |             score = objective(0)
440 |             return second_eigenvector < 0, score
441 |
442 |
443 |     def score(self, X, W, labels=None, delta=.5,
444 |               attribute_score=skm.calinski_harabasz_score,
445 |               spatial_score=boundary_fraction,
446 |               attribute_kw=dict(),
447 |               spatial_kw=dict()):
448 |         """
449 |         Computes the score of the given label vector on data in X, using the convex
450 |         combination weight in delta.
451 |
452 |         Arguments
453 |         ---------
454 |         X : numpy array (N,P)
455 |             array of data classified into `labels` to score.
456 |         W : sparse array or numpy array (N,N)
457 |             array representation of spatial relationships
458 |         labels : numpy array (N,)
459 |             vector of labels aligned with X and W
460 |         delta : float
461 |             weight to apply to the attribute score.
462 |             The spatial score is given weight 1 - delta,
463 |             and the attribute score weight delta.
464 |             Default: .5
465 |         attribute_score : callable
466 |             function to use to evaluate attribute homogeneity.
467 |             Must have signature attribute_score(X, labels, **params)
468 |             Default: sklearn.metrics.calinski_harabasz_score
469 |             (within/between deviation ratio)
470 |         spatial_score : callable
471 |             function to use to evaluate spatial regularity/contiguity.
472 |             Must have signature spatial_score(W, labels, **params)
473 |             Default: boundary_fraction(W, labels, X=X, **spatial_kw)
474 |         """
475 |         if labels is None:
476 |             if not hasattr(self, 'labels_'):
477 |                 raise Exception('Object must be fit in order to avoid passing labels.')
478 |             labels = self.labels_
479 |         labels = np.asarray(labels).flatten()
480 |         attribute_value = attribute_score(X, labels, **attribute_kw)
481 |         spatial_value = spatial_score(W, labels, X=X, **spatial_kw)
482 |         return delta * attribute_value + (1 - delta) * spatial_value
483 |
484 |     def _sample_gen(self, W, n_samples=1,
485 |                     affinity='rbf',
486 |                     distribution=None, **fit_kw):
487 |         """
488 |         NOTE: this is the lazy generator version of sample.
489 |         Compute random clusters using random eigenvector decomposition.
490 |         This uses random weights in the spectral decomposition to generate
491 |         approximately-evenly populated random subgraphs from W.
492 |
493 |         Arguments
494 |         ---------
495 |         W : np.ndarray or scipy.sparse matrix
496 |             matrix encoding the spatial relationships between observations in the frame.
497 |             Must be strictly binary & connected for the resulting label graphs to be
498 |             connected; the mathematical properties of the random regions are undefined if not.
499 |         n_samples : int, default 1
500 |             integer describing how many samples to construct
501 |         affinity : string or callable, default 'rbf'
502 |             passed down to the underlying SPENC class when spectral spatial clusters are found.
503 |         distribution : callable, default np.random.normal(0, 1, size=(N, 1))
504 |             function that, when called with no arguments, draws the random weights used to
505 |             generate the random regions. Must align with W.
506 |         fit_kw : keyword arguments
507 |             extra arguments passed down to the SPENC class for further customization.
508 |         """
509 |         if distribution is None:
510 |             distribution = lambda: np.random.normal(0, 1, size=(W.shape[0], 1))
511 |         else:
512 |             assert callable(distribution), 'distribution is not callable!'
513 |         for _ in range(n_samples):
514 |             randomweights = distribution()
515 |             fitted = clone(self).fit(randomweights, W, **fit_kw)
516 |             yield fitted.labels_
517 |
518 |     def sample(self, W, n_samples=1,
519 |                distribution=None, **fit_kw):
520 |         """
521 |         Compute random clusters using random eigenvector decomposition.
522 |         This uses random weights in the spectral decomposition to generate
523 |         approximately-evenly populated random subgraphs from W.
524 |
525 |         Arguments
526 |         ---------
527 |         W : np.ndarray or scipy.sparse matrix
528 |             matrix encoding the spatial relationships between observations in the frame.
529 |             Must be strictly binary & connected for the resulting label graphs to be
530 |             connected; the mathematical properties of the random regions are undefined if not.
531 |         n_samples : int, default 1
532 |             integer describing how many samples to construct
533 |         distribution : callable, default np.random.normal(0, 1, size=(N, 1))
534 |             function that, when called with no arguments, draws the random weights used to
535 |             generate the random regions. Must align with W.
536 |         fit_kw : keyword arguments
537 |             extra arguments passed down to the SPENC class for further customization.
538 |
539 |         Returns
540 |         -------
541 |         labels corresponding to the input W that are generated at random.
542 |         """
543 |         result = np.vstack([labels for labels in
544 |                             self._sample_gen(W, n_samples=n_samples,
545 |                                              distribution=distribution, **fit_kw)])
546 |         if n_samples == 1:
547 |             result = result.flatten()
548 |         return result
549 |
550 | class AgglomerativeClustering(clust.AgglomerativeClustering):
551 |
552 |     def _sample_gen(self, n_samples=25, distribution=None):
553 |         """
554 |         Sample random clusters with agglomerative clustering using random weights.
555 |         """
556 |         if distribution is None:
557 |             distribution = lambda: np.random.normal(0, 1, size=(self.connectivity.shape[0], 1))
558 |         else:
559 |             assert callable(distribution), 'distribution is not callable!'
560 |         for _ in range(n_samples):
561 |             randomweights = distribution()
562 |             fitted = clone(self).fit(randomweights)
563 |             yield fitted.labels_
564 |
565 |     def sample(self, n_samples=1,
566 |                distribution=None):
567 |         """
568 |         Compute random clusters using randomly-weighted agglomerative clustering.
569 |         This uses random weights in the agglomerative clustering to generate
570 |         random subgraphs from the estimator's connectivity matrix.
571 |
572 |         Arguments
573 |         ---------
574 |         n_samples : int
575 |             integer describing how many samples to construct
576 |         distribution : callable (default: np.random.normal(0, 1))
577 |             a function that, when called with no arguments, returns the weights
578 |             used as fake data to randomize the graph. Must align with the
579 |             estimator's connectivity matrix (self.connectivity), which must be
580 |             strictly binary & connected; the mathematical properties of the
581 |             random regions are undefined if not.
582 |
583 |         Returns
584 |         -------
585 |         labels corresponding to the estimator's connectivity matrix, generated at random.
586 |         """
587 |         return np.vstack([labels for labels in
588 |                           self._sample_gen(n_samples=n_samples,
589 |                                            distribution=distribution)])
--------------------------------------------------------------------------------
/spenc/scores.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 |
 3 | def boundary_fraction(W, labels, X=None):
 4 |     """
 5 |     Compute the fraction of observations that lie on a boundary, i.e. that
 6 |     have at least one neighbor (according to W) with a different label.
 7 |     X is ignored; it is accepted only for interface regularity.
 8 |     """
 9 |     boundary = 0
10 |     for row, own_label in zip(W, labels):
11 |         neighbor_labels = labels[row.nonzero()[-1]]
12 |         boundary += (neighbor_labels != own_label).any().astype(int)
13 |     return boundary / W.shape[0]
14 |
15 | def boundary_score(W, labels, X=None):
16 |     """
17 |     Returns a version of boundary_fraction unbounded on the negative end, using
18 |     the log of the fraction:
19 |
20 |     np.log(boundary_fraction(W, labels))
21 |
22 |     This is solely for testing purposes.
23 |     """
24 |     return np.log(boundary_fraction(W, labels, X=None))
--------------------------------------------------------------------------------
/spenc/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ljwolf/spenc/d58f235606d6b567a59a405525d98792fc3b3dd7/spenc/tests/__init__.py
--------------------------------------------------------------------------------
/spenc/tests/data/nat_10k_nodata.ary:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ljwolf/spenc/d58f235606d6b567a59a405525d98792fc3b3dd7/spenc/tests/data/nat_10k_nodata.ary
--------------------------------------------------------------------------------
/spenc/tests/data/nat_30k_discovered.ary:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ljwolf/spenc/d58f235606d6b567a59a405525d98792fc3b3dd7/spenc/tests/data/nat_30k_discovered.ary
--------------------------------------------------------------------------------
/spenc/tests/data/nat_30k_randoms.ary:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ljwolf/spenc/d58f235606d6b567a59a405525d98792fc3b3dd7/spenc/tests/data/nat_30k_randoms.ary
--------------------------------------------------------------------------------
/spenc/tests/data/nat_infk_discovered.ary:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ljwolf/spenc/d58f235606d6b567a59a405525d98792fc3b3dd7/spenc/tests/data/nat_infk_discovered.ary
--------------------------------------------------------------------------------
/spenc/tests/data/nat_infk_randoms.ary:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ljwolf/spenc/d58f235606d6b567a59a405525d98792fc3b3dd7/spenc/tests/data/nat_infk_randoms.ary
--------------------------------------------------------------------------------
/spenc/tests/test_spenc.py:
--------------------------------------------------------------------------------
 1 | from unittest import TestCase
 2 | import pysal as ps
 3 | import numpy as np
 4 | import os
 5 | import geopandas as gpd
 6 | import scipy.sparse.csgraph as csg
 7 | from sklearn.metrics import accuracy_score
 8 |
 9 | import spenc
10 |
11 | filepath = os.path.dirname(__file__)
12 | SEED = 1901
13 |
14 |
15 | class SPENCTest(TestCase):
16 |     def setUp(self):
17 |         self.nat = gpd.read_file(ps.lib.examples.get_path('NAT.shp'))
18 |         self.natR = ps.lib.weights.Rook.from_dataframe(self.nat)
19 |         # the .ary fixtures are pickled numpy dumps, so allow_pickle is required on recent numpy
20 |         self.nat_10k_nodata = np.load(os.path.join(filepath, 'data/nat_10k_nodata.ary'), allow_pickle=True)
21 |         self.nat_30k_randoms = np.load(os.path.join(filepath, 'data/nat_30k_randoms.ary'), allow_pickle=True)
22 |         self.nat_30k_discovered = np.load(os.path.join(filepath, 'data/nat_30k_discovered.ary'), allow_pickle=True)
23 |         self.nat_infk_randoms = np.load(os.path.join(filepath, 'data/nat_infk_randoms.ary'), allow_pickle=True)
24 |         self.nat_infk_discovered = np.load(os.path.join(filepath, 'data/nat_infk_discovered.ary'), allow_pickle=True)
25 |         self.nat_names = self.nat.filter(like='90').columns.tolist() \
26 |                          + self.nat.filter(like='89').columns.tolist()
27 |         self.natX = self.nat[self.nat_names].values
28 |         self.natX = (self.natX - self.natX.mean(axis=0)) / self.natX.var(axis=0)
29 |
30 |     def test_NAT_nodata(self):
31 |         np.random.seed(SEED)  # shouldn't matter substantively; only affects label numbering
32 |         t1 = spenc.SPENC(n_clusters=10, random_state=SEED).fit(None, self.natR.sparse).labels_
33 |         for label in range(t1.max() + 1):  # check every label, including the largest
34 |             mask = t1 == label
35 |             subgraph = self.natR.sparse[mask, :][:, mask]
36 |             subgraph.eliminate_zeros()
37 |             n_components, labels = csg.connected_components(subgraph)
38 |             self.assertEqual(n_components, 1,
39 |                              'Disconnected component ({}) in NAT clusters!'.format(label))
40 |         np.testing.assert_allclose(accuracy_score(t1, self.nat_10k_nodata), 1, atol=.05)
41 |
42 |     def test_NAT_randoms(self):
43 |         np.random.seed(SEED)
44 |         randoms = spenc.SPENC(n_clusters=30, random_state=SEED).sample(self.natR.sparse, n_samples=3)
45 |         self.assertEqual(randoms.shape, (3, len(self.nat)), 'sample shapes are incorrect!')
46 |         for i, random in enumerate(randoms):
47 |             for label in range(random.max() + 1):
48 |                 mask = random == label
49 |                 subgraph = self.natR.sparse[mask, :][:, mask]
50 |                 subgraph.eliminate_zeros()
51 |                 n_components, labels = csg.connected_components(subgraph)
52 |                 self.assertEqual(n_components, 1,
53 |                                  'Disconnected component ({}) in NAT '
54 |                                  'random cluster set {}!'.format(label, i))
55 |             np.testing.assert_allclose(accuracy_score(random, self.nat_30k_randoms[i]), 1.0, atol=.05)
56 |         np.random.seed(SEED)
57 |         randoms = spenc.SPENC(n_clusters=np.inf, random_state=SEED).sample(self.natR.sparse, floor=20)
58 |         self.assertEqual(randoms.shape, (len(self.nat),), 'sample shapes are incorrect!')
59 |         for label in range(randoms.max() + 1):
60 |             mask = randoms == label
61 |             subgraph = self.natR.sparse[mask, :][:, mask]
62 |             subgraph.eliminate_zeros()
63 |             n_components, labels = csg.connected_components(subgraph)
64 |             self.assertEqual(n_components, 1,
65 |                              'Disconnected component ({}) in NAT '
66 |                              'infk random clusters!'.format(label))
67 |         # remember, this is only one draw
68 |         np.testing.assert_allclose(accuracy_score(randoms, self.nat_infk_randoms), 1, atol=.05)
69 |
70 |     def test_NAT_data(self):
71 |         np.random.seed(SEED)
72 |         k30 = spenc.SPENC(n_clusters=30, gamma=.001, random_state=SEED).fit(self.natX, self.natR.sparse)
73 |         for label in range(k30.labels_.max() + 1):
74 |             mask = k30.labels_ == label
75 |             subgraph = self.natR.sparse[mask, :][:, mask]
76 |             subgraph.eliminate_zeros()
77 |             n_components, labels = csg.connected_components(subgraph)
78 |             self.assertEqual(n_components, 1,
79 |                              'Disconnected component ({}) in NAT clusters!'.format(label))
80 |         # self.assertEqual(accuracy_score(k30.labels_, self.nat_30k_discovered), 1.0) breaks on travis for some reason
81 |         np.random.seed(SEED)
82 |         kinf = spenc.SPENC(n_clusters=np.inf, gamma=.001, random_state=SEED)\
83 |                     .fit(self.natX, self.natR.sparse, floor=20)
84 |         for label in range(kinf.labels_.max() + 1):
85 |             mask = kinf.labels_ == label
86 |             subgraph = self.natR.sparse[mask, :][:, mask]
87 |             subgraph.eliminate_zeros()
88 |             n_components, labels = csg.connected_components(subgraph)
89 |             self.assertEqual(n_components, 1,
90 |                              'Disconnected component ({}) in NAT clusters!'.format(label))
91 |         np.testing.assert_allclose(accuracy_score(kinf.labels_, self.nat_infk_discovered), 1, atol=.05)
--------------------------------------------------------------------------------
/spenc/utils.py:
--------------------------------------------------------------------------------
 1 | import scipy.sparse.csgraph as csg
 2 | import scipy.sparse as sp
 3 | from warnings import warn as Warn
 4 | import numpy as np
 5 |
 6 | def check_weights(W, X=None, transform=None):
 7 |     """
 8 |     Check that the provided weights matrix and the X matrix are conformal.
 9 |     Further, check that the spatial weights are fully connected.
10 |     """
11 |     if X is not None:
12 |         assert W.shape[0] == X.shape[0], "W does not have the same number of samples as X"
13 |     graph = sp.csc_matrix(W)
14 |     graph.eliminate_zeros()
15 |     components, labels = csg.connected_components(graph)
16 |     if components > 1:
17 |         Warn('Spatial affinity matrix is disconnected, and has {} subcomponents. '
18 |              'This will certainly affect the solution output.'.format(components))
19 |     return W
20 |
21 | def lattice(x, y):
22 |     """
23 |     Construct a lattice of unit squares of dimension (x,y)
24 |     """
25 |     from shapely.geometry import Polygon
26 |     import geopandas as gpd
27 |     x = np.arange(x) * 1.0
28 |     y = np.arange(y) * 1.0
29 |     pgons = []
30 |     for i in x:
31 |         for j in y:
32 |             ll, lr, ur, ul = (i, j), (i+1, j), \
33 |                              (i+1, j+1), (i, j+1)
34 |             pgons.append(Polygon([ll, lr, ur, ul]))
35 |     return gpd.GeoDataFrame({'geometry': pgons})
36 |
37 | def p_connected(replications):
38 |     """
39 |     Compute the probability that any two observations are clustered
40 |     together through a set of labellings.
41 |
42 |     Uses outer product broadcasting in numpy, so this only iterates over
43 |     n_replications, rather than n_replications x n_replications pairs.
44 |     """
45 |     n_replications, n_observations = replications.shape
46 |     out = np.zeros((n_observations, n_observations))
47 |     for replication in replications:
48 |         # the outer comparison marks every pair sharing a label in this replication
49 |         out += replication[:, None] == replication[None, :]
50 |     return out / len(replications)
--------------------------------------------------------------------------------
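As a quick sanity check of `p_connected`'s behavior, here is a minimal sketch with a hand-made set of labellings (the toy array below is illustrative, not data from this repository):

```python
import numpy as np
from spenc.utils import p_connected

# three labellings (rows) of four observations (columns)
replications = np.array([[0, 0, 1, 1],
                         [0, 1, 1, 0],
                         [0, 0, 0, 1]])
probs = p_connected(replications)
# observations 0 and 1 share a label in 2 of the 3 labellings
print(probs[0, 1])  # -> 0.666...
```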