├── .gitignore
├── .travis.yml
├── README.md
├── US_County_Level_Presidential_Results_08-16.csv
├── example.ipynb
├── gendata.py
├── license.txt
├── pytest.ini
├── setup.py
└── spenc
    ├── __init__.py
    ├── abstracts.py
    ├── scores.py
    ├── tests
    │   ├── __init__.py
    │   ├── data
    │   │   ├── nat_10k_nodata.ary
    │   │   ├── nat_30k_discovered.ary
    │   │   ├── nat_30k_randoms.ary
    │   │   ├── nat_infk_discovered.ary
    │   │   └── nat_infk_randoms.ary
    │   └── test_spenc.py
    └── utils.py

/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | .ipynb_checkpoints
3 | *.pyc
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | sudo: false
 3 | branches:
 4 |   only:
 5 |     - master
 6 | python:
 7 |   - "3.5"
 8 |   - "3.6"
 9 |
10 | before_install:
11 |   - wget https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh
12 |   - chmod +x miniconda.sh
13 |   - ./miniconda.sh -b -p ./miniconda
14 |   - export PATH=`pwd`/miniconda/bin:$PATH
15 |   - conda update --yes conda
16 |   - conda create -y -q -n test-env python=$TRAVIS_PYTHON_VERSION
17 |   - source activate test-env
18 |
19 | install:
20 |   - conda install --yes pip
21 |   - conda install --yes scikit-learn scipy nose
22 |   - pip install geopandas pysal pytest
23 |
24 | script:
25 |   - python setup.py sdist >/dev/null
26 |   - python -c "import numpy; print(numpy.show_config())"
27 |   - pytest spenc;
28 | notifications:
29 |   email:
30 |     recipients:
31 |       - levi.john.wolf@gmail.com
32 |     on_success: change
33 |     on_failure: always
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Spatially-Encouraged Spectral Clustering
 2 | [![Build Status](https://travis-ci.org/ljwolf/spenc.svg?branch=master)](https://travis-ci.org/ljwolf/spenc)
 3 | [![DOI](https://zenodo.org/badge/129973633.svg)](https://zenodo.org/badge/latestdoi/129973633)
 4 |
 5 |
 6 | This repository provides the code for spatially-encouraged spectral clustering and walks through how to use it. Refer to the [example notebook](https://github.com/ljwolf/spenc/blob/master/example.ipynb) for more information on usage.
 7 |
 8 | Usage requires `scikit-learn` and `scipy`. The package is released on PyPI as `spenc`, so it can be installed using:
 9 |
10 | `pip install spenc`
11 |
12 | # Citation
13 |
14 | If you would like to reference this software, please cite its Zenodo listing:
15 |
16 | Wolf, Levi John. 2018. “Ljwolf/spenc: GISRUK”. Zenodo. doi:10.5281/zenodo.1219904.
17 |
18 | And, for the paper defining the algorithm:
19 |
20 | Wolf, Levi John. *(In Review)* "Spatially-Encouraged Spectral Clustering." *International Journal of Geographic Information Science*.
21 |
22 | with a full preprint available at the [Open Science Framework](https://osf.io/yzt2p).
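For a quick start, the sketch below shows the basic API. It is a minimal illustration, not shipped example data: the toy `X` and `W` are placeholders for any `(N, P)` feature array and an aligned, binary, connected sparse adjacency matrix (here, rook adjacency on a 3x3 grid):

```python
import numpy as np
import scipy.sparse as sparse
from spenc import SPENC

# toy data: 9 observations on a 3x3 grid, 2 attributes each
np.random.seed(0)
X = np.random.normal(size=(9, 2))

# rook adjacency for the 3x3 grid, as a binary sparse matrix
W = sparse.lil_matrix((9, 9))
for i in range(9):
    r, c = divmod(i, 3)
    for dr, dc in ((-1, 0), (1, 0), (0, -1), (0, 1)):
        rr, cc = r + dr, c + dc
        if 0 <= rr < 3 and 0 <= cc < 3:
            W[i, rr * 3 + cc] = 1

labels = SPENC(n_clusters=3).fit(X, W.tocsr()).labels_
print(labels)  # one cluster label per observation
```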
23 |
--------------------------------------------------------------------------------
/gendata.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import spenc
 3 | import pysal as ps
 4 | import geopandas as gpd
 5 | import os
 6 | SEED = 1901
 7 | dir_self = os.path.dirname(__file__)
 8 | datadir = os.path.join(dir_self, 'spenc/tests/data')
 9 |
10 | if __name__ == "__main__":
11 |     nat = gpd.read_file(ps.examples.get_path("NAT.shp"))
12 |     natR = ps.weights.Rook.from_dataframe(nat)
13 |     names = nat.filter(like='90').columns.tolist() + nat.filter(like='89').columns.tolist()
14 |     X = nat[names].values
15 |     X = (X - X.mean(axis=0))/X.var(axis=0)  # NOTE: columns are scaled by their variance; the stored test fixtures assume this scaling
16 |
17 |     print('(1 of 5) doing 10k nodata')
18 |     np.random.seed(SEED)
19 |     labels = spenc.SPENC(n_clusters=10, random_state=SEED).fit(None, natR.sparse).labels_
20 |     labels.dump(os.path.join(datadir, 'nat_10k_nodata.ary'))
21 |
22 |     print('(2 of 5) doing 30k sampling')
23 |     np.random.seed(SEED)
24 |     labels = spenc.SPENC(n_clusters=30, random_state=SEED).sample(natR.sparse, n_samples=3)
25 |     labels.dump(os.path.join(datadir, 'nat_30k_randoms.ary'))
26 |
27 |     print('(3 of 5) doing 30k withdata')
28 |     np.random.seed(SEED)
29 |     labels = spenc.SPENC(n_clusters=30, gamma=.001, random_state=SEED).fit(X, natR.sparse).labels_
30 |     labels.dump(os.path.join(datadir, 'nat_30k_discovered.ary'))
31 |
32 |     print('(4 of 5) doing infk sampling')
33 |     np.random.seed(SEED)
34 |     labels = spenc.SPENC(n_clusters=np.inf, random_state=SEED).sample(natR.sparse, floor=20)
35 |     labels.dump(os.path.join(datadir, 'nat_infk_randoms.ary'))
36 |
37 |     print('(5 of 5) doing infk withdata')
38 |     np.random.seed(SEED)
39 |     labels = spenc.SPENC(n_clusters=np.inf, gamma=.001, random_state=SEED).fit(X, natR.sparse, floor=20).labels_
40 |     labels.dump(os.path.join(datadir, 'nat_infk_discovered.ary'))
41 |
42 |
43 |     print('done!')
44 |
--------------------------------------------------------------------------------
/license.txt:
--------------------------------------------------------------------------------
 1 | Copyright 2018, Levi John Wolf
 2 |
 3 | Redistribution and use in source and binary forms, with or without modification, are
 4 | permitted provided that the following conditions are met:
 5 |
 6 | 1. Redistributions of source code must retain the above copyright notice, this list
 7 |    of conditions and the following disclaimer.
 8 |
 9 | 2. Redistributions in binary form must reproduce the above copyright notice, this list
10 |    of conditions and the following disclaimer in the documentation and/or other materials
11 |    provided with the distribution.
12 |
13 | 3. Neither the name of the copyright holder nor the names of its contributors may be used
14 |    to endorse or promote products derived from this software without specific prior
15 |    written permission.
16 |
17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
18 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 | OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
20 | SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
22 | OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
24 | TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
25 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 |
--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | filterwarnings =
3 |     once
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | import os
 3 |
 4 | basepath = os.path.dirname(__file__)
 5 | init = os.path.join(basepath, 'spenc/__init__.py')
 6 |
 7 | with open(init, 'r') as initfile:
 8 |     firstline = initfile.readline()
 9 | init_version = firstline.split('=')[-1].strip().strip('"\'')  # strip the surrounding quote characters from the version string
10 |
11 | setup(name='spenc',
12 |       version=init_version,
13 |       description='Spatially-Encouraged Spectral Clustering, a method for discovering clusters in, or deriving labels for, spatially-referenced data with attributes attached',
14 |       url='https://github.com/ljwolf/spenc',
15 |       author='Levi John Wolf',
16 |       author_email='levi.john.wolf@gmail.com',
17 |       license='3-Clause BSD',
18 |       python_requires='>=3.5',
19 |       packages=['spenc'],
20 |       install_requires=['scikit-learn>=0.20.0', 'scipy'])
--------------------------------------------------------------------------------
/spenc/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = '0.3.0dev'
2 |
3 | from .abstracts import SPENC
--------------------------------------------------------------------------------
/spenc/abstracts.py:
--------------------------------------------------------------------------------
  1 | from sklearn import cluster as clust
  2 | import sklearn.metrics as skm
  3 | import sklearn.metrics.pairwise as pw
  4 | from sklearn.utils.validation import check_array
  5 | from .utils import check_weights
  6 | from sklearn.neighbors import kneighbors_graph
  7 | from sklearn.utils.extmath import _deterministic_vector_sign_flip
  8 | from sklearn.utils import check_random_state
  9 | from sklearn.cluster.spectral import discretize as _discretize
 10 | from sklearn.preprocessing import LabelEncoder
 11 | from sklearn.base import clone
 12 | import numpy as np
 13 | from .scores import boundary_fraction
 14 | import scipy.sparse as spar
 15 | from scipy.sparse import csgraph as cg, linalg as la
 16 | from warnings import warn as Warn
 17 |
 18 | class SPENC(clust.SpectralClustering):
 19 |     def __init__(self, n_clusters=8, eigen_solver=None, random_state=None,
 20 |                  n_init=10, gamma=1., affinity='rbf', n_neighbors=10,
 21 |                  eigen_tol=1e-9, assign_labels='kmeans', degree=3, coef0=1,
 22 |                  kernel_params=None, n_jobs=1):
 23 |         """
 24 |         Apply clustering to a projection of the normalized Laplacian, using
 25 |         spatial information to constrain the clustering.
 26 |
 27 |         In practice, Spectral Clustering is very useful when the structure of
 28 |         the individual clusters is highly non-convex, or more generally when
 29 |         a measure of the center and spread of the cluster is not a suitable
 30 |         description of the complete cluster, for instance when clusters are
 31 |         nested circles on the 2D plane.
 32 |
 33 |         Spatially-Encouraged Spectral Clustering (SPENC) is useful when
 34 |         there may be highly non-convex clusters or clusters with irregular
 35 |         topology in a geographic context.
 36 |
 37 |         If a binary weights matrix is provided during fit, this method can be
 38 |         used to find weighted normalized graph cuts.
 39 |
 40 |         When calling ``fit``, an affinity matrix is constructed using either a
 41 |         kernel function, such as the Gaussian (aka RBF) kernel of the euclidean
 42 |         distance ``d(X, X)``::
 43 |
 44 |             np.exp(-gamma * d(X,X) ** 2)
 45 |
 46 |         or a k-nearest neighbors connectivity matrix.
 47 |
 48 |         Alternatively, using ``precomputed``, a user-provided affinity
 49 |         matrix can be used.
 50 |
 51 |         Read more in the scikit-learn user guide on spectral clustering.
 52 |
 53 |         Parameters
 54 |         ----------
 55 |         n_clusters : integer, optional
 56 |             The number of clusters to search for.
 57 |
 58 |         eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'}
 59 |             NOTE: ignored unless fitting using the `breakme` flag, so do not use.
 60 |             The eigenvalue decomposition strategy to use. AMG requires pyamg
 61 |             to be installed. It can be faster on very large, sparse problems,
 62 |             but may also lead to instabilities.
 63 |
 64 |         random_state : int, RandomState instance or None, optional, default: None
 65 |             A pseudo random number generator used for the initialization of the
 66 |             lobpcg eigen vectors decomposition when eigen_solver == 'amg' and by
 67 |             the K-Means initialization. If int, random_state is the seed used by
 68 |             the random number generator; if RandomState instance, random_state is
 69 |             the random number generator; if None, the random number generator is
 70 |             the RandomState instance used by `np.random`.
 71 |
 72 |         n_init : int, optional, default: 10
 73 |             Number of times the k-means algorithm will be run with different
 74 |             centroid seeds. The final results will be the best output of
 75 |             n_init consecutive runs in terms of inertia.
 76 |
 77 |         gamma : float, default=1.0
 78 |             Kernel coefficient for rbf, poly, sigmoid, laplacian and chi2 kernels.
 79 |             Ignored for ``affinity='nearest_neighbors'``.
 80 |
 81 |         affinity : string, array-like or callable, default 'rbf'
 82 |             If a string, this may be one of 'nearest_neighbors', 'precomputed',
 83 |             'rbf' or one of the kernels supported by
 84 |             `sklearn.metrics.pairwise_kernels`.
 85 |
 86 |             Only kernels that produce similarity scores (non-negative values that
 87 |             increase with similarity) should be used. This property is not checked
 88 |             by the clustering algorithm.
 89 |
 90 |         n_neighbors : integer
 91 |             Number of neighbors to use when constructing the affinity matrix using
 92 |             the nearest neighbors method. Ignored for ``affinity='rbf'``.
 93 |
 94 |         eigen_tol : float, optional, default: 1e-9
 95 |             Stopping criterion for eigendecomposition of the Laplacian matrix
 96 |             when using the arpack eigen_solver.
 97 |
 98 |         assign_labels : {'kmeans', 'discretize', 'hierarchical'}, default: 'kmeans'
 99 |             The strategy to use to assign labels in the embedding
100 |             space. There are three ways to assign labels after the laplacian
101 |             embedding:
102 |             1. k-means can be applied and is a popular choice. But it can
103 |                also be sensitive to initialization.
104 |             2. Discretization is another approach which is less sensitive to
105 |                random initialization, and which usually finds better clusters.
106 |             3. Hierarchical decomposition repeatedly bi-partitions the graph,
107 |                instead of finding the decomposition all at once, as suggested in
108 |                Shi & Malik (2000).
109 |
110 |         degree : float, default=3
111 |             Degree of the polynomial affinity kernel. Ignored by other kernels.
112 |
113 |         coef0 : float, default=1
114 |             Zero coefficient for polynomial and sigmoid affinity kernels.
115 |             Ignored by other kernels.
116 |
117 |         kernel_params : dictionary of string to any, optional
118 |             Parameters (keyword arguments) and values for affinity kernel passed as
119 |             callable object. Ignored by other affinity kernels.
120 |
121 |         n_jobs : int, optional (default = 1)
122 |             The number of parallel jobs to run for the nearest-neighbors
123 |             affinity kernel, if used.
124 |             If ``-1``, then the number of jobs is set to the number of CPU cores.
125 |
126 |         Attributes
127 |         ----------
128 |         affinity_matrix_ : array-like, shape (n_samples, n_samples)
129 |             Affinity matrix used for clustering. Available only after calling
130 |             ``fit``.
131 |
132 |         labels_ : array, shape (n_samples,)
133 |             Labels of each point.
134 |
135 |         Notes
136 |         -----
137 |         If you have an affinity matrix, such as a distance matrix,
138 |         for which 0 means identical elements, and high values mean
139 |         very dissimilar elements, it can be transformed into a
140 |         similarity matrix that is well suited for the algorithm by
141 |         applying the Gaussian (RBF, heat) kernel::
142 |
143 |             np.exp(- dist_matrix ** 2 / (2. * delta ** 2))
144 |
145 |         where ``delta`` is a free parameter representing the width of the Gaussian
146 |         kernel.
147 |
148 |         Another alternative is to take a symmetric version of the k
149 |         nearest neighbors connectivity matrix of the points.
150 |
151 |         References
152 |         ----------
153 |
154 |         - Normalized cuts and image segmentation, 2000
155 |           Jianbo Shi, Jitendra Malik
156 |           http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.160.2324
157 |
158 |         - A Tutorial on Spectral Clustering, 2007
159 |           Ulrike von Luxburg
160 |           http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.165.9323
161 |
162 |         - Multiclass spectral clustering, 2003
163 |           Stella X. Yu, Jianbo Shi
164 |           http://www1.icsi.berkeley.edu/~stellayu/publication/doc/2003kwayICCV.pdf
165 |         """
166 |         self.n_clusters = n_clusters
167 |         self.eigen_solver = eigen_solver
168 |         self.random_state = random_state
169 |         self.n_init = n_init
170 |         self.gamma = gamma
171 |         self.affinity = affinity
172 |         self.n_neighbors = n_neighbors
173 |         self.eigen_tol = eigen_tol
174 |         self.assign_labels = assign_labels
175 |         self.degree = degree
176 |         self.coef0 = coef0
177 |         self.kernel_params = kernel_params
178 |         self.n_jobs = n_jobs
179 |
180 |     def fit(self, X, W=None, y=None, shift_invert=True, check_W=True,
181 |             grid_resolution=100, floor=0, floor_weights=None, cut_method='gridsearch'):
182 |         """Creates an affinity matrix for X using the selected affinity,
183 |         applies W to the affinity elementwise, and then applies spectral clustering
184 |         to the affinity matrix.
185 |
186 |         Arguments
187 |         ---------
188 |         X : sparse or dense array
189 |             matrix containing P features for N observations.
190 |         W : sparse or dense array, default None
191 |             matrix expressing the pairwise spatial relationships
192 |             between N observations.
193 |         y : sparse or dense array, default None
194 |             ignored; present for scikit-learn interface regularity purposes.
195 |         shift_invert : bool, default True
196 |             boolean governing whether or not to use the shift-invert
197 |             trick for finding sparse eigenvectors.
198 |
199 |         breakme : bool, default False (not exposed in the signature above; see NOTE below)
200 |             Whether or not to simply pipe down to the sklearn spectral
201 |             clustering class. Will likely break the formal guarantees
202 |             about contiguity/connectedness of solutions, due to the
203 |             standardizations/shortcuts taken in sklearn.cluster.SpectralClustering.
204 |         check_W : bool, default True
205 |             Whether or not to check that the spatial weights matrix
206 |             is correctly formatted and aligns with the X matrix.
207 |         grid_resolution : int, default 100
208 |             how many subdivisions to use when doing the gridsearch
209 |             for the cutpoint on the second eigenvector of subgraphs.
210 |         floor : float/int, default 0
211 |             value which governs the lower limit on the size of partitions.
212 |             If 0, there is no limit.
213 |             If floor_weights are provided, floor should be a limit on
214 |             the sum of floor weights for each region.
215 |         floor_weights : np.ndarray of shape (n,), default np.ones((n,))
216 |             array containing weights for each observation used to determine
217 |             the region floor.
218 |         cut_method : str, default 'gridsearch'
219 |             option governing what method to use to partition regions:
220 |             1. "gridsearch" (default): the hierarchical grid search
221 |                suggested by Shi & Malik (2000); search the second
222 |                eigenvector for the "best" partition in terms of cut weight.
223 |             2. "zero": cut the eigenvector at zero. Usually a passable solution,
224 |                since the second eigenvector is usually centered around zero.
225 |             3. "median": cut the eigenvector through its median. This means the
226 |                regions will always be divided into two halves with equal numbers
227 |                of elemental units.
228 |             "gridsearch" may be slow when grid_resolution is large.
229 |             "zero" is the best method for large data.
230 |
231 |         NOTE:
232 |
233 |         breakme sends the affinity matrix down to scikit's spectral clustering class;
234 |         it is called breakme because of scikit-learn bug 8129.
235 |         I don't see a significant difference here when switching between the two;
236 |         most assignments in the problems I've examined are the same.
237 |         I think, since the bug is in the scaling of the eigenvectors, it's not super important.
238 |
239 |         But, in the future, it may make sense to investigate whether the bug in sklearn
240 |         is fully fixed, which would mean that any spectral clustering for
241 |         a weights matrix in sklearn would always be contiguous.
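        Example
        -------
        A minimal illustrative sketch, mirroring the usage in gendata.py
        (``X`` and ``W`` are placeholders for an (N, P) attribute matrix and an
        aligned binary scipy.sparse adjacency matrix, not data shipped with
        this package)::

            model = SPENC(n_clusters=30, gamma=.001).fit(X, W)
            model.labels_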
242 |
243 |         """
244 |         if np.isinf(self.n_clusters):
245 |             self.assign_labels = 'hierarchical'
246 |
247 |         if W is None:
248 |             W = spar.csc_matrix(np.ones((X.shape[0], X.shape[0])))  # default to a fully-connected spatial graph
249 |
250 |         if X is not None:
251 |             X = check_array(X, accept_sparse=['csr', 'coo', 'csc'],
252 |                             dtype=np.float64, ensure_min_samples=2)
253 |             if check_W:
254 |                 W = check_weights(W, X)
255 |
256 |             if self.affinity == 'nearest_neighbors':
257 |                 connectivity = kneighbors_graph(X, n_neighbors=self.n_neighbors,
258 |                                                 include_self=True, n_jobs=self.n_jobs)
259 |                 attribute_affinity_ = .5 * (connectivity + connectivity.T)
260 |             elif self.affinity == 'precomputed':
261 |                 attribute_affinity_ = W.multiply(X)
262 |             else:
263 |                 params = self.kernel_params
264 |                 if params is None:
265 |                     params = {}
266 |                 if not callable(self.affinity):
267 |                     params['gamma'] = self.gamma
268 |                     params['degree'] = self.degree
269 |                     params['coef0'] = self.coef0
270 |                 attribute_affinity_ = pw.pairwise_kernels(X, metric=self.affinity,
271 |                                                           filter_params=True,
272 |                                                           **params)
273 |             spatial_affinity_ = W
274 |             affinity_matrix_ = W.multiply(attribute_affinity_)
275 |         else:
276 |             affinity_matrix_ = W
277 |         affinity_old = self.affinity
278 |         self.affinity = 'precomputed'
279 |         super().fit(affinity_matrix_)
280 |         self.affinity = affinity_old  # restore the user-specified affinity after fitting
281 |         return self
282 |
283 |     def _embed(self, affinity, shift_invert=True):
284 |         """
285 |         Compute the eigenspace embedding of a given affinity matrix.
286 |
287 |         Arguments
288 |         ---------
289 |         affinity : sparse or dense matrix
290 |             affinity matrix to compute the spectral embedding of
291 |         shift_invert : bool
292 |             whether or not to use the shift-invert eigenvector search
293 |             trick, useful for finding sparse eigenvectors.
294 |         """
295 |         laplacian, orig_d = cg.laplacian(affinity,
296 |                                          normed=True, return_diag=True)
297 |         laplacian *= -1
298 |         random_state = check_random_state(self.random_state)
299 |         v0 = random_state.uniform(-1, 1, laplacian.shape[0])
300 |
301 |         if not shift_invert:
302 |             ev, spectrum = la.eigsh(laplacian, which='LA', k=self.n_clusters, v0=v0,
303 |                                     tol=self.eigen_tol)
304 |         else:
305 |             ev, spectrum = la.eigsh(laplacian, which='LM', sigma=1, k=self.n_clusters, v0=v0,
306 |                                     tol=self.eigen_tol)
307 |
308 |         embedding = spectrum.T[self.n_clusters::-1]  # sklearn/issues/8129
309 |         embedding = embedding / orig_d
310 |         embedding = _deterministic_vector_sign_flip(embedding)
311 |         return embedding
312 |
313 |     def _spectral_bipartition(self, affinity_matrix_,
314 |                               grid_resolution=100,
315 |                               shift_invert=True, floor=0,
316 |                               floor_weights=None,
317 |                               cut_method='gridsearch'):
318 |         """
319 |         Implements the recursive spectral bipartitioning of Shi and Malik (2000).
320 |         If n_clusters = np.inf and floor > 0, this will find
321 |         all possible cuts containing more than `floor` units.
322 |
323 |         Arguments
324 |         ---------
325 |         grid_resolution : int
326 |             how many subdivisions to use when doing the gridsearch
327 |             for the cutpoint on the second eigenvector of subgraphs.
328 |             (Default: 100)
329 |         shift_invert : bool
330 |             boolean governing whether or not to use the shift-invert
331 |             trick for finding sparse eigenvectors.
332 |             (Default: True)
333 |         floor : float/int
334 |             value which governs the lower limit on the size of partitions.
335 |             If 0, there is no limit.
336 |             If floor_weights are provided, floor should be a limit on
337 |             the sum of floor weights for each region.
338 |             (Default: 0)
339 |         floor_weights : np.ndarray of shape (n,)
340 |             array containing weights for each observation used to determine
341 |             the region floor.
342 |             (Default: np.ones((n,)))
343 |         cut_method : str
344 |             option governing what method to use to partition regions:
345 |             1. "gridsearch" (default): the hierarchical grid search
346 |                suggested by Shi & Malik (2000); search the second
347 |                eigenvector for the "best" partition in terms of cut weight.
348 |             2. "zero": cut the eigenvector at zero. Usually a passable solution,
349 |                since the second eigenvector is usually centered around zero.
350 |             3. "median": cut the eigenvector through its median. This means the
351 |                regions will always be divided into two halves with equal numbers
352 |                of elemental units.
353 |             "gridsearch" may be slow when grid_resolution is large.
354 |             "zero" is the best method for large data.
355 |         """
356 |         if floor_weights is None:
357 |             floor_weights = np.ones((affinity_matrix_.shape[0],))
358 |         if spar.issparse(affinity_matrix_):
359 |             affinity_matrix_ = affinity_matrix_.tocsr()
360 |         threshold = self.n_clusters
361 |         self.n_clusters = 2
362 |         discovered = 1
363 |         this_cut = np.ones((affinity_matrix_.shape[0],)).astype(bool)
364 |         cuts = []
365 |         accepted_cuts = []
366 |         while discovered < threshold:
367 |             if this_cut.sum() > 2:  # don't try to cut regions with fewer than 3 units
368 |                 current_affinity = affinity_matrix_[this_cut, :][:, this_cut]
369 |                 embedding = self._embed(current_affinity, shift_invert=shift_invert)
370 |                 second_eigenvector = embedding[1]
371 |                 new_cut, score_of_cut = self._make_hierarchical_cut(second_eigenvector,
372 |                                                                     current_affinity,
373 |                                                                     grid_resolution,
374 |                                                                     cut_method=cut_method,
375 |                                                                     floor=floor)
376 |                 left_cut = this_cut.copy()
377 |                 left_cut[left_cut] *= new_cut
378 |                 right_cut = this_cut.copy()
379 |                 right_cut[right_cut] *= ~new_cut
380 |                 assert len(this_cut) == len(left_cut) == len(right_cut), "Indexing Error in cutting!"
381 |                 if (((left_cut*floor_weights).sum() > floor)
382 |                     & ((right_cut*floor_weights).sum() > floor)):
383 |                     if ((tuple(left_cut) not in accepted_cuts)
384 |                         & (tuple(right_cut) not in accepted_cuts)):
385 |                         cuts.append(left_cut)
386 |                         accepted_cuts.append(tuple(left_cut))
387 |                         cuts.append(right_cut)
388 |                         accepted_cuts.append(tuple(right_cut))
389 |                         discovered += 1
390 |             try:
391 |                 this_cut = cuts.pop(0)
392 |             except IndexError:
393 |                 break
394 |         accepted_cuts = np.vstack(accepted_cuts)
395 |         labels = np.ones((accepted_cuts[0].shape[0],)) * -1.0
396 |         for i, k in enumerate(np.flipud(accepted_cuts)):
397 |             unassigned = labels == -1
398 |             should_assign = (unassigned & k)
399 |             labels[should_assign] = i
400 |         return LabelEncoder().fit_transform(labels)
401 |
402 |     def _make_hierarchical_cut(self, second_eigenvector,
403 |                                affinity_matrix,
404 |                                grid_resolution,
405 |                                cut_method='median',
406 |                                floor=0):
407 |         """Compute a single hierarchical cut using one of the methods described in
408 |         Shi and Malik (2000).
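        The "gridsearch" option scores each candidate cutpoint with the
        normalized-cut objective of Shi and Malik (2000),

            Ncut(A, B) = cut(A, B)/assoc(A, V) + cut(A, B)/assoc(B, V),

        where cut(A, B) is the total affinity crossing the partition and
        assoc(A, V) is the total affinity from A to the whole graph. This is
        exactly the quantity ``cutAB/assocA + cutAB/assocB`` computed by the
        closure built in ``mkobjective`` below.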
409 |         """
410 |         def mkobjective(second_eigenvector):
411 |             """This makes a closure around the objective function given an eigenvector"""
412 |             def objective(cutpoint):
413 |                 cut = second_eigenvector <= cutpoint
414 |                 assocA = affinity_matrix[cut].sum(axis=1).sum()
415 |                 assocB = affinity_matrix[~cut].sum(axis=1).sum()
416 |                 cutAB = affinity_matrix[cut, :][:, ~cut].sum(axis=1).sum() * 2
417 |                 score = cutAB/assocA + cutAB/assocB
418 |                 if np.isnan(score):
419 |                     score = np.inf
420 |                 return score
421 |             return objective
422 |
423 |         objective = mkobjective(second_eigenvector)
424 |
425 |         if cut_method == 'gridsearch':
426 |             support = np.linspace(*np.percentile(second_eigenvector, q=(2, 98)),
427 |                                   num=grid_resolution)
428 |
429 |
430 |             objective_surface = [objective(cutpoint) for cutpoint in support]
431 |             cutpoint = support[np.argmin(objective_surface)]
432 |             cut = second_eigenvector <= cutpoint
433 |             return cut, np.min(objective_surface)
434 |         elif cut_method == 'median':
435 |             median = np.median(second_eigenvector)
436 |             score = objective(median)
437 |             return second_eigenvector < median, score
438 |         else:
439 |             score = objective(0)
440 |             return second_eigenvector < 0, score
441 |
442 |
443 |     def score(self, X, W, labels=None, delta=.5,
444 |               attribute_score=skm.calinski_harabasz_score,
445 |               spatial_score=boundary_fraction,
446 |               attribute_kw=dict(),
447 |               spatial_kw=dict()):
448 |         """
449 |         Computes the score of the given label vector on data in X, using the convex
450 |         combination weight in delta.
451 |
452 |         Arguments
453 |         ---------
454 |         X : numpy array (N,P)
455 |             array of data classified into `labels` to score.
456 |         W : sparse array or numpy array (N,N)
457 |             array representation of spatial relationships
458 |         labels : numpy array (N,)
459 |             vector of labels aligned with X and W
460 |         delta : float
461 |             weight to apply to the attribute score.
462 |             The spatial score is given weight 1 - delta,
463 |             and the attribute score weight delta.
464 |             Default: .5
465 |         attribute_score : callable
466 |             function to use to evaluate attribute homogeneity.
467 |             Must have signature attribute_score(X, labels, **params)
468 |             Default: sklearn.metrics.calinski_harabasz_score
469 |             (within/between deviation ratio)
470 |         spatial_score : callable
471 |             function to use to evaluate spatial regularity/contiguity.
472 |             Must have signature spatial_score(W, labels, **params)
473 |             Default: boundary_fraction(W, labels, X=X, **spatial_kw)
474 |         """
475 |         if labels is None:
476 |             if not hasattr(self, 'labels_'):
477 |                 raise Exception('Object must be fit in order to avoid passing labels.')
478 |             labels = self.labels_
479 |         labels = np.asarray(labels).flatten()
480 |         attribute_value = attribute_score(X, labels, **attribute_kw)
481 |         spatial_value = spatial_score(W, labels, X=X, **spatial_kw)
482 |         return delta * attribute_value + (1 - delta) * spatial_value
483 |
484 |     def _sample_gen(self, W, n_samples=1,
485 |                     affinity='rbf',
486 |                     distribution=None, **fit_kw):
487 |         """
488 |         NOTE: this is the lazy generator version of sample.
489 |         Compute random clusters using random eigenvector decomposition.
490 |         This uses random weights in the spectral decomposition to generate
491 |         approximately-evenly populated random subgraphs from W.
492 |
493 |         Arguments
494 |         ---------
495 |         W : np.ndarray or scipy.sparse matrix
496 |             matrix encoding the spatial relationships between observations in the frame.
497 |             Must be strictly binary & connected for the resulting label graphs to be
498 |             connected; the mathematical properties of the random regions are undefined if not.
499 |         n_samples : int, default 1
500 |             integer describing how many samples to construct
501 |         affinity : string or callable, default 'rbf'
502 |             passed down to the underlying SPENC class when spectral spatial clusters are found.
503 |         distribution : callable, default np.random.normal(0, 1, size=(N, 1))
504 |             function that, when called with no arguments, draws the random weights used to
505 |             generate the random regions. Must align with W.
506 |         fit_kw : keyword arguments
507 |             extra arguments passed down to the SPENC class for further customization.
508 |         """
509 |         if distribution is None:
510 |             distribution = lambda: np.random.normal(0, 1, size=(W.shape[0], 1))
511 |         else:
512 |             assert callable(distribution), 'distribution is not callable!'
513 |         for _ in range(n_samples):
514 |             randomweights = distribution()
515 |             fitted = clone(self).fit(randomweights, W, **fit_kw)
516 |             yield fitted.labels_
517 |
518 |     def sample(self, W, n_samples=1,
519 |                distribution=None, **fit_kw):
520 |         """
521 |         Compute random clusters using random eigenvector decomposition.
522 |         This uses random weights in the spectral decomposition to generate
523 |         approximately-evenly populated random subgraphs from W.
524 |
525 |         Arguments
526 |         ---------
527 |         W : np.ndarray or scipy.sparse matrix
528 |             matrix encoding the spatial relationships between observations in the frame.
529 |             Must be strictly binary & connected for the resulting label graphs to be
530 |             connected; the mathematical properties of the random regions are undefined if not.
531 |         n_samples : int, default 1
532 |             integer describing how many samples to construct
533 |         distribution : callable, default np.random.normal(0, 1, size=(N, 1))
534 |             function that, when called with no arguments, draws the random weights used to
535 |             generate the random regions. Must align with W.
536 |         fit_kw : keyword arguments
537 |             extra arguments passed down to the SPENC class for further customization.
538 |
539 |         Returns
540 |         -------
541 |         labels corresponding to the input W that are generated at random.
542 |         """
543 |         result = np.vstack([labels for labels in
544 |                             self._sample_gen(W, n_samples=n_samples,
545 |                                              distribution=distribution, **fit_kw)])
546 |         if n_samples == 1:
547 |             result = result.flatten()
548 |         return result
549 |
550 | class AgglomerativeClustering(clust.AgglomerativeClustering):
551 |
552 |     def _sample_gen(self, n_samples=25, distribution=None):
553 |         """
554 |         Sample random clusters with agglomerative clustering using random weights.
555 |         """
556 |         if distribution is None:
557 |             distribution = lambda: np.random.normal(0, 1, size=(self.connectivity.shape[0], 1))
558 |         else:
559 |             assert callable(distribution), 'distribution is not callable!'
560 |         for _ in range(n_samples):
561 |             randomweights = distribution()
562 |             fitted = clone(self).fit(randomweights)
563 |             yield fitted.labels_
564 |
565 |     def sample(self, n_samples=1,
566 |                distribution=None):
567 |         """
568 |         Compute random clusters using randomly-weighted agglomerative clustering.
569 |         This uses random weights in the agglomerative clustering to generate
570 |         random subgraphs from the estimator's connectivity matrix.
571 |
572 |         Arguments
573 |         ---------
574 |         n_samples : int
575 |             integer describing how many samples to construct
576 |         distribution : callable (default: np.random.normal(0, 1))
577 |             a function that, when called with no arguments, returns the weights
578 |             used as fake data to randomize the graph. Must align with the
579 |             estimator's connectivity matrix (self.connectivity), which must be
580 |             strictly binary & connected; the mathematical properties of the
581 |             random regions are undefined if not.
582 |
583 |         Returns
584 |         -------
585 |         labels corresponding to the estimator's connectivity matrix, generated at random.
586 |         """
587 |         return np.vstack([labels for labels in
588 |                           self._sample_gen(n_samples=n_samples,
589 |                                            distribution=distribution)])
--------------------------------------------------------------------------------
/spenc/scores.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 |
 3 | def boundary_fraction(W, labels, X=None):
 4 |     """
 5 |     Compute the fraction of observations that lie on a boundary, i.e. that
 6 |     have at least one neighbor (according to W) with a different label.
 7 |     X is ignored; it is accepted only for interface regularity.
 8 |     """
 9 |     boundary = 0
10 |     for row, own_label in zip(W, labels):
11 |         neighbor_labels = labels[row.nonzero()[-1]]
12 |         boundary += (neighbor_labels != own_label).any().astype(int)
13 |     return boundary / W.shape[0]
14 |
15 | def boundary_score(W, labels, X=None):
16 |     """
17 |     Returns a version of boundary_fraction unbounded on the negative end, using
18 |     the log of the fraction:
19 |
20 |     np.log(boundary_fraction(W, labels))
21 |
22 |     This is solely for testing purposes.
23 |     """
24 |     return np.log(boundary_fraction(W, labels, X=None))
--------------------------------------------------------------------------------
/spenc/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ljwolf/spenc/d58f235606d6b567a59a405525d98792fc3b3dd7/spenc/tests/__init__.py
--------------------------------------------------------------------------------
/spenc/tests/data/nat_10k_nodata.ary:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ljwolf/spenc/d58f235606d6b567a59a405525d98792fc3b3dd7/spenc/tests/data/nat_10k_nodata.ary
--------------------------------------------------------------------------------
/spenc/tests/data/nat_30k_discovered.ary:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ljwolf/spenc/d58f235606d6b567a59a405525d98792fc3b3dd7/spenc/tests/data/nat_30k_discovered.ary
--------------------------------------------------------------------------------
/spenc/tests/data/nat_30k_randoms.ary:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ljwolf/spenc/d58f235606d6b567a59a405525d98792fc3b3dd7/spenc/tests/data/nat_30k_randoms.ary
--------------------------------------------------------------------------------
/spenc/tests/data/nat_infk_discovered.ary:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ljwolf/spenc/d58f235606d6b567a59a405525d98792fc3b3dd7/spenc/tests/data/nat_infk_discovered.ary
--------------------------------------------------------------------------------
/spenc/tests/data/nat_infk_randoms.ary:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ljwolf/spenc/d58f235606d6b567a59a405525d98792fc3b3dd7/spenc/tests/data/nat_infk_randoms.ary
--------------------------------------------------------------------------------
/spenc/tests/test_spenc.py:
--------------------------------------------------------------------------------
 1 | from unittest import TestCase
 2 | import pysal as ps
 3 | import numpy as np
 4 | import os
 5 | import geopandas as gpd
 6 | import scipy.sparse.csgraph as csg
 7 | from sklearn.metrics import accuracy_score
 8 |
 9 | import spenc
10 |
11 | filepath = os.path.dirname(__file__)
12 | SEED = 1901
13 |
14 |
15 | class SPENCTest(TestCase):
16 |     def setUp(self):
17 |         self.nat = gpd.read_file(ps.lib.examples.get_path('NAT.shp'))
18 |         self.natR = ps.lib.weights.Rook.from_dataframe(self.nat)
19 |         # the .ary fixtures are pickled numpy dumps, so allow_pickle is required on recent numpy
20 |         self.nat_10k_nodata = np.load(os.path.join(filepath, 'data/nat_10k_nodata.ary'), allow_pickle=True)
21 |         self.nat_30k_randoms = np.load(os.path.join(filepath, 'data/nat_30k_randoms.ary'), allow_pickle=True)
22 |         self.nat_30k_discovered = np.load(os.path.join(filepath, 'data/nat_30k_discovered.ary'), allow_pickle=True)
23 |         self.nat_infk_randoms = np.load(os.path.join(filepath, 'data/nat_infk_randoms.ary'), allow_pickle=True)
24 |         self.nat_infk_discovered = np.load(os.path.join(filepath, 'data/nat_infk_discovered.ary'), allow_pickle=True)
25 |         self.nat_names = self.nat.filter(like='90').columns.tolist() \
26 |                          + self.nat.filter(like='89').columns.tolist()
27 |         self.natX = self.nat[self.nat_names].values
28 |         self.natX = (self.natX - self.natX.mean(axis=0)) / self.natX.var(axis=0)
29 |
30 |     def test_NAT_nodata(self):
31 |         np.random.seed(SEED)  # shouldn't matter substantively; only affects label numbering
32 |         t1 = spenc.SPENC(n_clusters=10, random_state=SEED).fit(None, self.natR.sparse).labels_
33 |         for label in range(t1.max() + 1):  # check every label, including the largest
34 |             mask = t1 == label
35 |             subgraph = self.natR.sparse[mask, :][:, mask]
36 |             subgraph.eliminate_zeros()
37 |             n_components, labels = csg.connected_components(subgraph)
38 |             self.assertEqual(n_components, 1,
39 |                              'Disconnected component ({}) in NAT clusters!'.format(label))
40 |         np.testing.assert_allclose(accuracy_score(t1, self.nat_10k_nodata), 1, atol=.05)
41 |
42 |     def test_NAT_randoms(self):
43 |         np.random.seed(SEED)
44 |         randoms = spenc.SPENC(n_clusters=30, random_state=SEED).sample(self.natR.sparse, n_samples=3)
45 |         self.assertEqual(randoms.shape, (3, len(self.nat)), 'sample shapes are incorrect!')
46 |         for i, random in enumerate(randoms):
47 |             for label in range(random.max() + 1):
48 |                 mask = random == label
49 |                 subgraph = self.natR.sparse[mask, :][:, mask]
50 |                 subgraph.eliminate_zeros()
51 |                 n_components, labels = csg.connected_components(subgraph)
52 |                 self.assertEqual(n_components, 1,
53 |                                  'Disconnected component ({}) in NAT '
54 |                                  'random cluster set {}!'.format(label, i))
55 |             np.testing.assert_allclose(accuracy_score(random, self.nat_30k_randoms[i]), 1.0, atol=.05)
56 |         np.random.seed(SEED)
57 |         randoms = spenc.SPENC(n_clusters=np.inf, random_state=SEED).sample(self.natR.sparse, floor=20)
58 |         self.assertEqual(randoms.shape, (len(self.nat),), 'sample shapes are incorrect!')
59 |         for label in range(randoms.max() + 1):
60 |             mask = randoms == label
61 |             subgraph = self.natR.sparse[mask, :][:, mask]
62 |             subgraph.eliminate_zeros()
63 |             n_components, labels = csg.connected_components(subgraph)
64 |             self.assertEqual(n_components, 1,
65 |                              'Disconnected component ({}) in NAT '
66 |                              'infk random clusters!'.format(label))
67 |         # remember, this is only one draw
68 |         np.testing.assert_allclose(accuracy_score(randoms, self.nat_infk_randoms), 1, atol=.05)
69 |
70 |     def test_NAT_data(self):
71 |         np.random.seed(SEED)
72 |         k30 = spenc.SPENC(n_clusters=30, gamma=.001, random_state=SEED).fit(self.natX, self.natR.sparse)
73 |         for label in range(k30.labels_.max() + 1):
74 |             mask = k30.labels_ == label
75 |             subgraph = self.natR.sparse[mask, :][:, mask]
76 |             subgraph.eliminate_zeros()
77 |             n_components, labels = csg.connected_components(subgraph)
78 |             self.assertEqual(n_components, 1,
79 |                              'Disconnected component ({}) in NAT clusters!'.format(label))
80 |         # self.assertEqual(accuracy_score(k30.labels_, self.nat_30k_discovered), 1.0) breaks on travis for some reason
81 |         np.random.seed(SEED)
82 |         kinf = spenc.SPENC(n_clusters=np.inf, gamma=.001, random_state=SEED)\
83 |                     .fit(self.natX, self.natR.sparse, floor=20)
84 |         for label in range(kinf.labels_.max() + 1):
85 |             mask = kinf.labels_ == label
86 |             subgraph = self.natR.sparse[mask, :][:, mask]
87 |             subgraph.eliminate_zeros()
88 |             n_components, labels = csg.connected_components(subgraph)
89 |             self.assertEqual(n_components, 1,
90 |                              'Disconnected component ({}) in NAT clusters!'.format(label))
91 |         np.testing.assert_allclose(accuracy_score(kinf.labels_, self.nat_infk_discovered), 1, atol=.05)
--------------------------------------------------------------------------------
/spenc/utils.py:
--------------------------------------------------------------------------------
 1 | import scipy.sparse.csgraph as csg
 2 | import scipy.sparse as sp
 3 | from warnings import warn as Warn
 4 | import numpy as np
 5 |
 6 | def check_weights(W, X=None, transform=None):
 7 |     """
 8 |     Check that the provided weights matrix and the X matrix are conformal.
 9 |     Further, check that the spatial weights are fully connected.
10 |     """
11 |     if X is not None:
12 |         assert W.shape[0] == X.shape[0], "W does not have the same number of samples as X"
13 |     graph = sp.csc_matrix(W)
14 |     graph.eliminate_zeros()
15 |     components, labels = csg.connected_components(graph)
16 |     if components > 1:
17 |         Warn('Spatial affinity matrix is disconnected, and has {} subcomponents. '
18 |              'This will certainly affect the solution output.'.format(components))
19 |     return W
20 |
21 | def lattice(x, y):
22 |     """
23 |     Construct a lattice of unit squares of dimension (x,y)
24 |     """
25 |     from shapely.geometry import Polygon
26 |     import geopandas as gpd
27 |     x = np.arange(x) * 1.0
28 |     y = np.arange(y) * 1.0
29 |     pgons = []
30 |     for i in x:
31 |         for j in y:
32 |             ll, lr, ur, ul = (i, j), (i+1, j), \
33 |                              (i+1, j+1), (i, j+1)
34 |             pgons.append(Polygon([ll, lr, ur, ul]))
35 |     return gpd.GeoDataFrame({'geometry': pgons})
36 |
37 | def p_connected(replications):
38 |     """
39 |     Compute the probability that any two observations are clustered
40 |     together through a set of labellings.
41 |
42 |     Uses outer product broadcasting in numpy, so this only iterates over
43 |     n_replications, rather than n_replications x n_replications pairs.
44 |     """
45 |     n_replications, n_observations = replications.shape
46 |     out = np.zeros((n_observations, n_observations))
47 |     for replication in replications:
48 |         # the outer comparison marks every pair sharing a label in this replication
49 |         out += replication[:, None] == replication[None, :]
50 |     return out / len(replications)
--------------------------------------------------------------------------------
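As a quick sanity check of `p_connected`'s behavior, here is a minimal sketch with a hand-made set of labellings (the toy array below is illustrative, not data from this repository):

```python
import numpy as np
from spenc.utils import p_connected

# three labellings (rows) of four observations (columns)
replications = np.array([[0, 0, 1, 1],
                         [0, 1, 1, 0],
                         [0, 0, 0, 1]])
probs = p_connected(replications)
# observations 0 and 1 share a label in 2 of the 3 labellings
print(probs[0, 1])  # -> 0.666...
```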