├── LICENSE
├── NYC_checkin.ipynb
├── NYC_checkin3.ipynb
├── README.md
├── STICC_main.py
├── STICC_solver.py
├── data
├── nyc_checkin.cpg
├── nyc_checkin.dbf
├── nyc_checkin.shp
├── nyc_checkin.shx
├── nyc_checkin.zip
├── nyc_checkin_sticc.cpg
├── nyc_checkin_sticc.dbf
├── nyc_checkin_sticc.shp
├── nyc_checkin_sticc.shx
├── nyc_checkin_sticc3.cpg
├── nyc_checkin_sticc3.dbf
├── nyc_checkin_sticc3.shp
├── nyc_checkin_sticc3.shx
├── sticc_points.dbf
├── sticc_points.shp
├── sticc_points.shx
├── sticc_points_spatial_multivariate.cpg
├── sticc_points_spatial_multivariate.dbf
├── sticc_points_spatial_multivariate.prj
├── sticc_points_spatial_multivariate.sbn
├── sticc_points_spatial_multivariate.shp
├── sticc_points_spatial_multivariate.shp.xml
└── sticc_points_spatial_multivariate.shx
├── images
├── GeoDSLogo.jpg
├── STICC.jpeg
└── clustering.jpg
├── requirements.txt
├── src
├── STICC_helper.py
├── __init__.py
└── admm_solver.py
└── synthetic.ipynb
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 2-Clause License
2 |
3 | Copyright (c) 2017-2018, David Hallac, Sagar Vare, Saachi Jain, and Others
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | * Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 |
--------------------------------------------------------------------------------
/NYC_checkin.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import random\n",
10 | "import pandas as pd\n",
11 | "import geopandas as gpd\n",
12 | "import matplotlib.pyplot as plt\n",
13 | "import esda\n",
14 | "import libpysal.weights as weights\n",
15 | "from esda.moran import Moran\n",
16 | "from shapely.geometry import Point, MultiPoint, LineString, Polygon, shape\n",
17 | "import json\n",
18 | "import pylab\n",
19 | "import libpysal\n",
20 | "import numpy as np\n",
21 | "from sklearn.metrics.cluster import adjusted_rand_score\n",
22 | "from sklearn.metrics import f1_score\n",
23 | "from pyclustering.cluster.cure import cure\n",
24 | "from pyclustering.cluster.kmeans import kmeans\n",
25 | "from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer\n",
26 | "from sklearn import preprocessing"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 2,
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 | "def permutation(lst):\n",
36 | " if len(lst) == 0:\n",
37 | " return []\n",
38 | "\n",
39 | " if len(lst) == 1:\n",
40 | " return [lst]\n",
41 | "\n",
42 | " l = []\n",
43 | " for i in range(len(lst)):\n",
44 | " m = lst[i]\n",
45 | " remLst = lst[:i] + lst[i+1:]\n",
46 | " for p in permutation(remLst):\n",
47 | " l.append([m] + p) \n",
48 | " return l"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 3,
54 | "metadata": {},
55 | "outputs": [],
56 | "source": [
57 | "def get_f1_score(df, permut):\n",
58 | " def match_clus(x, permut):\n",
59 | " if x == 0:\n",
60 | " return int(permut[0])\n",
61 | " elif x == 1:\n",
62 | " return int(permut[1])\n",
63 | " else:\n",
64 | " return x\n",
65 | "\n",
66 | " df[\"group_match\"] = df[\"group\"].apply(lambda x: match_clus(x, permut))\n",
67 | " return df, f1_score(df.group_match.values, df.clus_group_gt.values, average='macro')"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": 4,
73 | "metadata": {},
74 | "outputs": [],
75 | "source": [
76 | "def get_max_f1_score(df):\n",
77 | " max_f1 = 0\n",
78 | " max_p = []\n",
79 | " for p in permutation([3,4]):\n",
80 | " df, f1 = get_f1_score(df, p)\n",
81 | " if max_f1 < f1:\n",
82 | " max_f1 = f1\n",
83 | " max_p = p\n",
84 | " print(\"f1_score \", max_f1, max_p)"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 5,
90 | "metadata": {},
91 | "outputs": [],
92 | "source": [
93 | "def cal_joint_statistic(nyc_data, w_voronoi):\n",
94 | " matched_connects = 0\n",
95 | " all_neighbors_connects = 0\n",
96 | " for obj_id, neighbors in w_voronoi.neighbors.items():\n",
97 | " obj_clus = nyc_data.iat[obj_id, -1]\n",
98 | " for nei in neighbors:\n",
99 | " nei_clus = nyc_data.iat[nei, -1]\n",
100 | " all_neighbors_connects += 1\n",
101 | " if obj_clus == nei_clus:\n",
102 | " matched_connects += 1\n",
103 | " return matched_connects / all_neighbors_connects"
104 | ]
105 | },
106 | {
107 | "cell_type": "markdown",
108 | "metadata": {},
109 | "source": [
110 | "# Processing NYC Check-in Data"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": 6,
116 | "metadata": {},
117 | "outputs": [
118 | {
119 | "data": {
120 | "text/html": [
121 | "
\n",
122 | "\n",
135 | "
\n",
136 | " \n",
137 | " \n",
138 | " | \n",
139 | " venueId | \n",
140 | " userId | \n",
141 | " gender | \n",
142 | " friend_num | \n",
143 | " follow_num | \n",
144 | " latitude | \n",
145 | " longitude | \n",
146 | " venueCateg | \n",
147 | " week | \n",
148 | " hour | \n",
149 | " geometry | \n",
150 | "
\n",
151 | " \n",
152 | " \n",
153 | " \n",
154 | " 0 | \n",
155 | " 3fd66200f964a52000e71ee3 | \n",
156 | " 445 | \n",
157 | " male | \n",
158 | " 4.0 | \n",
159 | " 13.0 | \n",
160 | " 40.73385 | \n",
161 | " -74.002998 | \n",
162 | " Jazz Club | \n",
163 | " Sat | \n",
164 | " 8 | \n",
165 | " POINT (-74.00300 40.73385) | \n",
166 | "
\n",
167 | " \n",
168 | "
\n",
169 | "
"
170 | ],
171 | "text/plain": [
172 | " venueId userId gender friend_num follow_num latitude \\\n",
173 | "0 3fd66200f964a52000e71ee3 445 male 4.0 13.0 40.73385 \n",
174 | "\n",
175 | " longitude venueCateg week hour geometry \n",
176 | "0 -74.002998 Jazz Club Sat 8 POINT (-74.00300 40.73385) "
177 | ]
178 | },
179 | "execution_count": 6,
180 | "metadata": {},
181 | "output_type": "execute_result"
182 | }
183 | ],
184 | "source": [
185 | "nyc_check_in = gpd.read_file('data/nyc_checkin.shp')\n",
186 | "nyc_check_in.head(1)"
187 | ]
188 | },
189 | {
190 | "cell_type": "code",
191 | "execution_count": 7,
192 | "metadata": {},
193 | "outputs": [
194 | {
195 | "data": {
196 | "text/html": [
197 | "\n",
198 | "\n",
211 | "
\n",
212 | " \n",
213 | " \n",
214 | " | \n",
215 | " venueId | \n",
216 | " userId | \n",
217 | " gender | \n",
218 | " friend_num | \n",
219 | " follow_num | \n",
220 | " latitude | \n",
221 | " longitude | \n",
222 | " week | \n",
223 | " hour | \n",
224 | " geometry | \n",
225 | "
\n",
226 | " \n",
227 | " venueCateg | \n",
228 | " | \n",
229 | " | \n",
230 | " | \n",
231 | " | \n",
232 | " | \n",
233 | " | \n",
234 | " | \n",
235 | " | \n",
236 | " | \n",
237 | " | \n",
238 | "
\n",
239 | " \n",
240 | " \n",
241 | " \n",
242 | " Subway | \n",
243 | " 10042 | \n",
244 | " 10042 | \n",
245 | " 10042 | \n",
246 | " 10042 | \n",
247 | " 10042 | \n",
248 | " 10042 | \n",
249 | " 10042 | \n",
250 | " 10042 | \n",
251 | " 10042 | \n",
252 | " 10042 | \n",
253 | "
\n",
254 | " \n",
255 | "
\n",
256 | "
"
257 | ],
258 | "text/plain": [
259 | " venueId userId gender friend_num follow_num latitude \\\n",
260 | "venueCateg \n",
261 | "Subway 10042 10042 10042 10042 10042 10042 \n",
262 | "\n",
263 | " longitude week hour geometry \n",
264 | "venueCateg \n",
265 | "Subway 10042 10042 10042 10042 "
266 | ]
267 | },
268 | "execution_count": 7,
269 | "metadata": {},
270 | "output_type": "execute_result"
271 | }
272 | ],
273 | "source": [
274 | "nyc_check_in.groupby(\"venueCateg\").count().sort_values(\"venueId\").tail(1)"
275 | ]
276 | },
277 | {
278 | "cell_type": "code",
279 | "execution_count": 8,
280 | "metadata": {},
281 | "outputs": [
282 | {
283 | "name": "stdout",
284 | "output_type": "stream",
285 | "text": [
286 | "(7228, 11)\n"
287 | ]
288 | },
289 | {
290 | "data": {
291 | "text/html": [
292 | "\n",
293 | "\n",
306 | "
\n",
307 | " \n",
308 | " \n",
309 | " | \n",
310 | " venueId | \n",
311 | " userId | \n",
312 | " gender | \n",
313 | " friend_num | \n",
314 | " follow_num | \n",
315 | " latitude | \n",
316 | " longitude | \n",
317 | " venueCateg | \n",
318 | " week | \n",
319 | " hour | \n",
320 | " geometry | \n",
321 | "
\n",
322 | " \n",
323 | " \n",
324 | " \n",
325 | " 7421 | \n",
326 | " 42829c80f964a5202f221fe3 | \n",
327 | " 1409 | \n",
328 | " female | \n",
329 | " 487.0 | \n",
330 | " 98.0 | \n",
331 | " 40.754239 | \n",
332 | " -73.985473 | \n",
333 | " Office | \n",
334 | " Tue | \n",
335 | " 8 | \n",
336 | " POINT (-73.98547 40.75424) | \n",
337 | "
\n",
338 | " \n",
339 | "
\n",
340 | "
"
341 | ],
342 | "text/plain": [
343 | " venueId userId gender friend_num follow_num \\\n",
344 | "7421 42829c80f964a5202f221fe3 1409 female 487.0 98.0 \n",
345 | "\n",
346 | " latitude longitude venueCateg week hour geometry \n",
347 | "7421 40.754239 -73.985473 Office Tue 8 POINT (-73.98547 40.75424) "
348 | ]
349 | },
350 | "execution_count": 8,
351 | "metadata": {},
352 | "output_type": "execute_result"
353 | }
354 | ],
355 | "source": [
356 | "venueCateg_list = [\"Office\", \"Home (private)\"]\n",
357 | "venueId_list = pd.DataFrame(nyc_check_in.venueId.unique()).sample(frac=0.5).values.squeeze()\n",
358 | "nyc_check_sticc = nyc_check_in[(nyc_check_in.venueCateg.isin(venueCateg_list))&(nyc_check_in.venueId.isin(venueId_list))]\n",
359 | "print(nyc_check_sticc.shape)\n",
360 | "nyc_check_sticc.head(1)"
361 | ]
362 | },
363 | {
364 | "cell_type": "code",
365 | "execution_count": 9,
366 | "metadata": {},
367 | "outputs": [
368 | {
369 | "name": "stderr",
370 | "output_type": "stream",
371 | "text": [
372 | "/home/kangyuhao/anaconda3/lib/python3.8/site-packages/geopandas/geodataframe.py:853: SettingWithCopyWarning: \n",
373 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
374 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
375 | "\n",
376 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
377 | " super(GeoDataFrame, self).__setitem__(key, value)\n",
378 | "/home/kangyuhao/anaconda3/lib/python3.8/site-packages/geopandas/geodataframe.py:853: SettingWithCopyWarning: \n",
379 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
380 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
381 | "\n",
382 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
383 | " super(GeoDataFrame, self).__setitem__(key, value)\n"
384 | ]
385 | }
386 | ],
387 | "source": [
388 | "def return_week(x):\n",
389 | " if x == \"Mon\":\n",
390 | " return 1\n",
391 | " elif x == \"Tue\":\n",
392 | " return 2\n",
393 | " elif x == \"Wed\":\n",
394 | " return 3\n",
395 | " elif x == \"Thu\":\n",
396 | " return 4\n",
397 | " elif x == \"Fri\":\n",
398 | " return 5\n",
399 | " elif x == \"Sat\":\n",
400 | " return 6\n",
401 | " elif x == \"Sun\":\n",
402 | " return 7\n",
403 | " \n",
404 | "def return_category(x):\n",
405 | " if x == \"Gym\":\n",
406 | " return 1\n",
407 | " elif x == \"Coffee Shop\":\n",
408 | " return 2\n",
409 | " elif x == \"Office\":\n",
410 | " return 3\n",
411 | " elif x == \"Home (private)\":\n",
412 | " return 4\n",
413 | " elif x == \"Subway\":\n",
414 | " return 5\n",
415 | "\n",
416 | "nyc_check_sticc[\"week_attr\"] = nyc_check_sticc[\"week\"].apply(lambda x: return_week(x))\n",
417 | "nyc_check_sticc[\"category\"] = nyc_check_sticc[\"venueCateg\"].apply(lambda x: return_category(x))\n",
418 | "nyc_check_sticc = nyc_check_sticc.reset_index().drop(\"index\", axis=1)"
419 | ]
420 | },
421 | {
422 | "cell_type": "code",
423 | "execution_count": 10,
424 | "metadata": {},
425 | "outputs": [
426 | {
427 | "data": {
428 | "text/html": [
429 | "\n",
430 | "\n",
443 | "
\n",
444 | " \n",
445 | " \n",
446 | " | \n",
447 | " venueId | \n",
448 | " userId | \n",
449 | " gender | \n",
450 | " friend_num | \n",
451 | " follow_num | \n",
452 | " latitude | \n",
453 | " longitude | \n",
454 | " venueCateg | \n",
455 | " week | \n",
456 | " hour | \n",
457 | " geometry | \n",
458 | " week_attr | \n",
459 | " category | \n",
460 | "
\n",
461 | " \n",
462 | " \n",
463 | " \n",
464 | " 0 | \n",
465 | " 42829c80f964a5202f221fe3 | \n",
466 | " 1409 | \n",
467 | " female | \n",
468 | " 487.0 | \n",
469 | " 98.0 | \n",
470 | " 40.754239 | \n",
471 | " -73.985473 | \n",
472 | " Office | \n",
473 | " Tue | \n",
474 | " 8 | \n",
475 | " POINT (-73.98547 40.75424) | \n",
476 | " 2 | \n",
477 | " 3 | \n",
478 | "
\n",
479 | " \n",
480 | "
\n",
481 | "
"
482 | ],
483 | "text/plain": [
484 | " venueId userId gender friend_num follow_num \\\n",
485 | "0 42829c80f964a5202f221fe3 1409 female 487.0 98.0 \n",
486 | "\n",
487 | " latitude longitude venueCateg week hour geometry \\\n",
488 | "0 40.754239 -73.985473 Office Tue 8 POINT (-73.98547 40.75424) \n",
489 | "\n",
490 | " week_attr category \n",
491 | "0 2 3 "
492 | ]
493 | },
494 | "execution_count": 10,
495 | "metadata": {},
496 | "output_type": "execute_result"
497 | }
498 | ],
499 | "source": [
500 | "nyc_check_sticc.head(1)"
501 | ]
502 | },
503 | {
504 | "cell_type": "code",
505 | "execution_count": 11,
506 | "metadata": {},
507 | "outputs": [
508 | {
509 | "name": "stderr",
510 | "output_type": "stream",
511 | "text": [
512 | "/home/kangyuhao/anaconda3/lib/python3.8/site-packages/libpysal/weights/weights.py:172: UserWarning: The weights matrix is not fully connected: \n",
513 | " There are 156 disconnected components.\n",
514 | " warnings.warn(message)\n"
515 | ]
516 | }
517 | ],
518 | "source": [
519 | "kd = libpysal.cg.KDTree(np.array(nyc_check_sticc[[\"latitude\", \"longitude\"]].values))\n",
520 | "wnn = libpysal.weights.KNN(kd, 3)"
521 | ]
522 | },
523 | {
524 | "cell_type": "code",
525 | "execution_count": 12,
526 | "metadata": {},
527 | "outputs": [
528 | {
529 | "data": {
530 | "text/html": [
531 | "\n",
532 | "\n",
545 | "
\n",
546 | " \n",
547 | " \n",
548 | " | \n",
549 | " n_pt_0 | \n",
550 | " n_pt_1 | \n",
551 | " n_pt_2 | \n",
552 | "
\n",
553 | " \n",
554 | " \n",
555 | " \n",
556 | " 0 | \n",
557 | " 6322 | \n",
558 | " 5330 | \n",
559 | " 6317 | \n",
560 | "
\n",
561 | " \n",
562 | "
\n",
563 | "
"
564 | ],
565 | "text/plain": [
566 | " n_pt_0 n_pt_1 n_pt_2\n",
567 | "0 6322 5330 6317"
568 | ]
569 | },
570 | "execution_count": 12,
571 | "metadata": {},
572 | "output_type": "execute_result"
573 | }
574 | ],
575 | "source": [
576 | "nearest_pt = pd.DataFrame().from_dict(wnn.neighbors, orient=\"index\")\n",
577 | "for i in range(nearest_pt.shape[1]):\n",
578 | " nearest_pt = nearest_pt.rename({i:f\"n_pt_{i}\"}, axis=1)\n",
579 | "nearest_pt.head(1)"
580 | ]
581 | },
582 | {
583 | "cell_type": "code",
584 | "execution_count": 13,
585 | "metadata": {},
586 | "outputs": [
587 | {
588 | "data": {
589 | "text/html": [
590 | "\n",
591 | "\n",
604 | "
\n",
605 | " \n",
606 | " \n",
607 | " | \n",
608 | " venueId | \n",
609 | " userId | \n",
610 | " gender | \n",
611 | " friend_num | \n",
612 | " follow_num | \n",
613 | " latitude | \n",
614 | " longitude | \n",
615 | " venueCateg | \n",
616 | " week | \n",
617 | " hour | \n",
618 | " geometry | \n",
619 | " week_attr | \n",
620 | " category | \n",
621 | " n_pt_0 | \n",
622 | " n_pt_1 | \n",
623 | " n_pt_2 | \n",
624 | "
\n",
625 | " \n",
626 | " \n",
627 | " \n",
628 | " 0 | \n",
629 | " 42829c80f964a5202f221fe3 | \n",
630 | " 1409 | \n",
631 | " female | \n",
632 | " 487.0 | \n",
633 | " 98.0 | \n",
634 | " 40.754239 | \n",
635 | " -73.985473 | \n",
636 | " Office | \n",
637 | " Tue | \n",
638 | " 8 | \n",
639 | " POINT (-73.98547 40.75424) | \n",
640 | " 2 | \n",
641 | " 3 | \n",
642 | " 6322 | \n",
643 | " 5330 | \n",
644 | " 6317 | \n",
645 | "
\n",
646 | " \n",
647 | "
\n",
648 | "
"
649 | ],
650 | "text/plain": [
651 | " venueId userId gender friend_num follow_num \\\n",
652 | "0 42829c80f964a5202f221fe3 1409 female 487.0 98.0 \n",
653 | "\n",
654 | " latitude longitude venueCateg week hour geometry \\\n",
655 | "0 40.754239 -73.985473 Office Tue 8 POINT (-73.98547 40.75424) \n",
656 | "\n",
657 | " week_attr category n_pt_0 n_pt_1 n_pt_2 \n",
658 | "0 2 3 6322 5330 6317 "
659 | ]
660 | },
661 | "execution_count": 13,
662 | "metadata": {},
663 | "output_type": "execute_result"
664 | }
665 | ],
666 | "source": [
667 | "nyc_check_sticc = nyc_check_sticc.join(nearest_pt)\n",
668 | "nyc_check_sticc.head(1)"
669 | ]
670 | },
671 | {
672 | "cell_type": "code",
673 | "execution_count": 14,
674 | "metadata": {},
675 | "outputs": [],
676 | "source": [
677 | "nyc_check_sticc[[\"week_attr\", \"hour\", \"n_pt_0\", \n",
678 | " \"n_pt_1\", \"n_pt_2\"]].to_csv(r'nyc_checkin.txt', header=None, index=True, sep=',')"
679 | ]
680 | },
681 | {
682 | "cell_type": "code",
683 | "execution_count": 15,
684 | "metadata": {},
685 | "outputs": [],
686 | "source": [
687 | "w_voronoi = weights.Voronoi.from_dataframe(nyc_check_sticc)"
688 | ]
689 | },
690 | {
691 | "cell_type": "markdown",
692 | "metadata": {},
693 | "source": [
694 | "# STICC"
695 | ]
696 | },
697 | {
698 | "cell_type": "code",
699 | "execution_count": 16,
700 | "metadata": {
701 | "collapsed": true,
702 | "jupyter": {
703 | "outputs_hidden": true
704 | }
705 | },
706 | "outputs": [
707 | {
708 | "name": "stdout",
709 | "output_type": "stream",
710 | "text": [
711 | "lam_sparse 0.1\n",
712 | "switch_penalty 5.0\n",
713 | "num_cluster 2\n",
714 | "num stacked 4\n",
715 | "completed getting the data\n",
716 | "2 (7228, 2) (7228, 3)\n",
717 | "\n",
718 | "\n",
719 | "\n",
720 | "ITERATION ### 0\n",
721 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
722 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
723 | "length of the cluster 0 ------> 2425\n",
724 | "length of the cluster 1 ------> 4803\n",
725 | "UPDATED THE OLD COVARIANCE\n",
726 | "beginning the smoothening ALGORITHM\n",
727 | "length of cluster # 0 --------> 2595\n",
728 | "length of cluster # 1 --------> 4633\n",
729 | "Done writing the figure\n",
730 | "\n",
731 | "\n",
732 | "\n",
733 | "\n",
734 | "\n",
735 | "\n",
736 | "\n",
737 | "ITERATION ### 1\n",
738 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
739 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
740 | "length of the cluster 0 ------> 2595\n",
741 | "length of the cluster 1 ------> 4633\n",
742 | "UPDATED THE OLD COVARIANCE\n",
743 | "beginning the smoothening ALGORITHM\n",
744 | "length of cluster # 0 --------> 3342\n",
745 | "length of cluster # 1 --------> 3886\n",
746 | "Done writing the figure\n",
747 | "\n",
748 | "\n",
749 | "\n",
750 | "\n",
751 | "\n",
752 | "\n",
753 | "\n",
754 | "ITERATION ### 2\n",
755 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
756 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
757 | "length of the cluster 0 ------> 3342\n",
758 | "length of the cluster 1 ------> 3886\n",
759 | "UPDATED THE OLD COVARIANCE\n",
760 | "beginning the smoothening ALGORITHM\n",
761 | "length of cluster # 0 --------> 3701\n",
762 | "length of cluster # 1 --------> 3527\n",
763 | "Done writing the figure\n",
764 | "\n",
765 | "\n",
766 | "\n",
767 | "\n",
768 | "\n",
769 | "\n",
770 | "\n",
771 | "ITERATION ### 3\n",
772 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
773 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
774 | "length of the cluster 0 ------> 3701\n",
775 | "length of the cluster 1 ------> 3527\n",
776 | "UPDATED THE OLD COVARIANCE\n",
777 | "beginning the smoothening ALGORITHM\n",
778 | "length of cluster # 0 --------> 3865\n",
779 | "length of cluster # 1 --------> 3363\n",
780 | "Done writing the figure\n",
781 | "\n",
782 | "\n",
783 | "\n",
784 | "\n",
785 | "\n",
786 | "\n",
787 | "\n",
788 | "ITERATION ### 4\n",
789 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
790 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
791 | "length of the cluster 0 ------> 3865\n",
792 | "length of the cluster 1 ------> 3363\n",
793 | "UPDATED THE OLD COVARIANCE\n",
794 | "beginning the smoothening ALGORITHM\n",
795 | "length of cluster # 0 --------> 3928\n",
796 | "length of cluster # 1 --------> 3300\n",
797 | "Done writing the figure\n",
798 | "\n",
799 | "\n",
800 | "\n",
801 | "\n",
802 | "\n",
803 | "\n",
804 | "\n",
805 | "ITERATION ### 5\n",
806 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
807 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
808 | "length of the cluster 0 ------> 3928\n",
809 | "length of the cluster 1 ------> 3300\n",
810 | "UPDATED THE OLD COVARIANCE\n",
811 | "beginning the smoothening ALGORITHM\n",
812 | "length of cluster # 0 --------> 3955\n",
813 | "length of cluster # 1 --------> 3273\n",
814 | "Done writing the figure\n",
815 | "\n",
816 | "\n",
817 | "\n",
818 | "\n",
819 | "\n",
820 | "\n",
821 | "\n",
822 | "ITERATION ### 6\n",
823 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
824 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
825 | "length of the cluster 0 ------> 3955\n",
826 | "length of the cluster 1 ------> 3273\n",
827 | "UPDATED THE OLD COVARIANCE\n",
828 | "beginning the smoothening ALGORITHM\n",
829 | "length of cluster # 0 --------> 3972\n",
830 | "length of cluster # 1 --------> 3256\n",
831 | "Done writing the figure\n",
832 | "\n",
833 | "\n",
834 | "\n",
835 | "\n",
836 | "\n",
837 | "\n",
838 | "\n",
839 | "ITERATION ### 7\n",
840 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
841 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
842 | "length of the cluster 0 ------> 3972\n",
843 | "length of the cluster 1 ------> 3256\n",
844 | "UPDATED THE OLD COVARIANCE\n",
845 | "beginning the smoothening ALGORITHM\n",
846 | "length of cluster # 0 --------> 3974\n",
847 | "length of cluster # 1 --------> 3254\n",
848 | "Done writing the figure\n",
849 | "\n",
850 | "\n",
851 | "\n",
852 | "\n",
853 | "\n",
854 | "\n",
855 | "\n",
856 | "ITERATION ### 8\n",
857 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
858 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
859 | "length of the cluster 0 ------> 3974\n",
860 | "length of the cluster 1 ------> 3254\n",
861 | "UPDATED THE OLD COVARIANCE\n",
862 | "beginning the smoothening ALGORITHM\n",
863 | "length of cluster # 0 --------> 3976\n",
864 | "length of cluster # 1 --------> 3252\n",
865 | "Done writing the figure\n",
866 | "\n",
867 | "\n",
868 | "\n",
869 | "\n",
870 | "\n",
871 | "\n",
872 | "\n",
873 | "ITERATION ### 9\n",
874 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
875 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
876 | "length of the cluster 0 ------> 3976\n",
877 | "length of the cluster 1 ------> 3252\n",
878 | "UPDATED THE OLD COVARIANCE\n",
879 | "beginning the smoothening ALGORITHM\n",
880 | "length of cluster # 0 --------> 3976\n",
881 | "length of cluster # 1 --------> 3252\n",
882 | "Done writing the figure\n",
883 | "\n",
884 | "\n",
885 | "\n",
886 | "\n",
887 | "\n",
888 | "\n",
889 | "\n",
890 | "\n",
891 | "CONVERGED!!! BREAKING EARLY!!!\n",
892 | "\n",
893 | "\n",
894 | "\n",
895 | "TRAINING F1 score: -1 -1 -1\n",
896 | "[1.0000 1.0000 1.0000 ... 0.0000 0.0000 1.0000]\n"
897 | ]
898 | }
899 | ],
900 | "source": [
901 | "!python STICC_main.py --fname=nyc_checkin.txt --oname=result_nyc_checkin.txt --attr_idx_start=1 \\\n",
902 | "--attr_idx_end=2 --spatial_idx_start=3 --spatial_idx_end=5 \\\n",
903 | "--spatial_radius 4 --number_of_clusters 2 --lambda_parameter 10e-2 --beta 5 --maxIters 20"
904 | ]
905 | },
906 | {
907 | "cell_type": "code",
908 | "execution_count": 17,
909 | "metadata": {},
910 | "outputs": [
911 | {
912 | "name": "stdout",
913 | "output_type": "stream",
914 | "text": [
915 | "Adjusted rand score 0.5125812312665118\n",
916 | "Spatial contiguity: 0.8781538173477508\n",
917 | "f1_score 0.8552023059764304 [4, 3]\n"
918 | ]
919 | }
920 | ],
921 | "source": [
922 | "group = pd.read_table('result_nyc_checkin.txt', names=[\"group\"])\n",
923 | "result_nyc_check_sticc = nyc_check_sticc.join(group)\n",
924 | "result_nyc_check_sticc = result_nyc_check_sticc.rename({\"category\": \"clus_group_gt\"}, axis=1)\n",
925 | "print(\"Adjusted rand score\", adjusted_rand_score(result_nyc_check_sticc[\"group\"].values, \n",
926 | " result_nyc_check_sticc.clus_group_gt.values))\n",
927 | "sp_contiguity = cal_joint_statistic(result_nyc_check_sticc, w_voronoi)\n",
928 | "print(\"Spatial contiguity: \", sp_contiguity)\n",
929 | "get_max_f1_score(result_nyc_check_sticc)"
930 | ]
931 | },
932 | {
933 | "cell_type": "markdown",
934 | "metadata": {},
935 | "source": [
936 | "# Other methods"
937 | ]
938 | },
939 | {
940 | "cell_type": "code",
941 | "execution_count": 18,
942 | "metadata": {},
943 | "outputs": [],
944 | "source": [
945 | "def get_pycluster_result(ground_truth, cluster_method):\n",
946 | "    data = ground_truth[[\"week_attr\", \"hour\"]].values # For K-Means -- NOTE: overridden by the next line as written\n",
947 | "    data = ground_truth[[\"week_attr\", \"hour\", \"latitude\", \"longitude\"]].values # For Sp K-Means -- always in effect; comment out to run plain K-Means\n",
948 | " \n",
949 | " if cluster_method == kmeans:\n",
950 | " initial_centers = kmeans_plusplus_initializer(data.tolist(), 2).initialize()\n",
951 | " instance = cluster_method(data.tolist(), initial_centers)\n",
952 | " elif cluster_method == cure:\n",
953 | " print(\"cure\")\n",
954 | " instance = cure(data, 2)\n",
955 | " else:\n",
956 | " instance = cluster_method(data.tolist(), 2)\n",
957 | "\n",
958 | " instance.process()\n",
959 | " clusters = instance.get_clusters()\n",
960 | " \n",
961 | " clusters_result = []\n",
962 | " for i, clus in enumerate(clusters):\n",
963 | " for data in clus:\n",
964 | " clusters_result.append([data, i])\n",
965 | " clusters_result_df = pd.DataFrame(clusters_result, columns=[\"pt\", \"group\"]).sort_values(\"pt\").set_index(\"pt\")\n",
966 | " return clusters_result_df"
967 | ]
968 | },
969 | {
970 | "cell_type": "markdown",
971 | "metadata": {},
972 | "source": [
973 | "# K-Means"
974 | ]
975 | },
976 | {
977 | "cell_type": "code",
978 | "execution_count": 19,
979 | "metadata": {},
980 | "outputs": [
981 | {
982 | "name": "stdout",
983 | "output_type": "stream",
984 | "text": [
985 | "Adjusted rand score 0.04405713957270548\n",
986 | "Spatial contiguity: 0.6662453775218836\n",
987 | "f1_score 0.6069746428638629 [4, 3]\n"
988 | ]
989 | }
990 | ],
991 | "source": [
992 | "group = get_pycluster_result(nyc_check_sticc, kmeans)\n",
993 | "result_nyc_check_sticc = nyc_check_sticc.join(group)\n",
994 | "result_nyc_check_sticc = result_nyc_check_sticc.rename({\"category\": \"clus_group_gt\"}, axis=1)\n",
995 | "print(\"Adjusted rand score\", adjusted_rand_score(result_nyc_check_sticc[\"group\"].values, \n",
996 | " result_nyc_check_sticc.clus_group_gt.values))\n",
997 | "sp_contiguity = cal_joint_statistic(result_nyc_check_sticc, w_voronoi)\n",
998 | "print(\"Spatial contiguity: \", sp_contiguity)\n",
999 | "get_max_f1_score(result_nyc_check_sticc)"
1000 | ]
1001 | },
1002 | {
1003 | "cell_type": "markdown",
1004 | "metadata": {},
1005 | "source": [
1006 | "# Sp K-Means"
1007 | ]
1008 | },
1009 | {
1010 | "cell_type": "code",
1011 | "execution_count": 20,
1012 | "metadata": {},
1013 | "outputs": [
1014 | {
1015 | "name": "stdout",
1016 | "output_type": "stream",
1017 | "text": [
1018 | "Adjusted rand score 0.014733457093020055\n",
1019 | "Spatial contiguity: 0.6291100394563788\n",
1020 | "f1_score 0.5679695938194762 [3, 4]\n"
1021 | ]
1022 | }
1023 | ],
1024 | "source": [
1025 | "group = get_pycluster_result(nyc_check_sticc, kmeans)\n",
1026 | "result_nyc_check_sticc = nyc_check_sticc.join(group)\n",
1027 | "result_nyc_check_sticc = result_nyc_check_sticc.rename({\"category\": \"clus_group_gt\"}, axis=1)\n",
1028 | "print(\"Adjusted rand score\", adjusted_rand_score(result_nyc_check_sticc[\"group\"].values, \n",
1029 | " result_nyc_check_sticc.clus_group_gt.values))\n",
1030 | "sp_contiguity = cal_joint_statistic(result_nyc_check_sticc, w_voronoi)\n",
1031 | "print(\"Spatial contiguity: \", sp_contiguity)\n",
1032 | "get_max_f1_score(result_nyc_check_sticc)"
1033 | ]
1034 | },
1035 | {
1036 | "cell_type": "code",
1037 | "execution_count": 21,
1038 | "metadata": {},
1039 | "outputs": [
1040 | {
1041 | "data": {
1042 | "text/html": [
1043 | "\n",
1044 | "\n",
1057 | "
\n",
1058 | " \n",
1059 | " \n",
1060 | " | \n",
1061 | " venueId | \n",
1062 | " userId | \n",
1063 | " gender | \n",
1064 | " friend_num | \n",
1065 | " follow_num | \n",
1066 | " latitude | \n",
1067 | " longitude | \n",
1068 | " venueCateg | \n",
1069 | " week | \n",
1070 | " hour | \n",
1071 | " geometry | \n",
1072 | " week_attr | \n",
1073 | " category | \n",
1074 | " n_pt_0 | \n",
1075 | " n_pt_1 | \n",
1076 | " n_pt_2 | \n",
1077 | "
\n",
1078 | " \n",
1079 | " \n",
1080 | " \n",
1081 | " 0 | \n",
1082 | " 46ce971cf964a520414a1fe3 | \n",
1083 | " 2636 | \n",
1084 | " male | \n",
1085 | " 84.0 | \n",
1086 | " 84.0 | \n",
1087 | " 40.760867 | \n",
1088 | " -73.980347 | \n",
1089 | " Office | \n",
1090 | " Wed | \n",
1091 | " 23 | \n",
1092 | " POINT (-73.98035 40.76087) | \n",
1093 | " 3 | \n",
1094 | " 3 | \n",
1095 | " 322 | \n",
1096 | " 315 | \n",
1097 | " 288 | \n",
1098 | "
\n",
1099 | " \n",
1100 | "
\n",
1101 | "
"
1102 | ],
1103 | "text/plain": [
1104 | " venueId userId gender friend_num follow_num latitude \\\n",
1105 | "0 46ce971cf964a520414a1fe3 2636 male 84.0 84.0 40.760867 \n",
1106 | "\n",
1107 | " longitude venueCateg week hour geometry week_attr \\\n",
1108 | "0 -73.980347 Office Wed 23 POINT (-73.98035 40.76087) 3 \n",
1109 | "\n",
1110 | " category n_pt_0 n_pt_1 n_pt_2 \n",
1111 | "0 3 322 315 288 "
1112 | ]
1113 | },
1114 | "execution_count": 21,
1115 | "metadata": {},
1116 | "output_type": "execute_result"
1117 | }
1118 | ],
1119 | "source": [
1120 | "nyc_check_sticc.head(1)"
1121 | ]
1122 | },
1123 | {
1124 | "cell_type": "markdown",
1125 | "metadata": {},
1126 | "source": [
1127 | "# CURE"
1128 | ]
1129 | },
1130 | {
1131 | "cell_type": "code",
1132 | "execution_count": 22,
1133 | "metadata": {},
1134 | "outputs": [
1135 | {
1136 | "name": "stdout",
1137 | "output_type": "stream",
1138 | "text": [
1139 | "cure\n",
1140 | "Adjusted rand score 0.0011630086117161073\n",
1141 | "Spatial contiguity: 0.6589215256466462\n",
1142 | "f1_score 0.5566708449149055 [3, 4]\n"
1143 | ]
1144 | }
1145 | ],
1146 | "source": [
1147 | "group = get_pycluster_result(nyc_check_sticc, cure)\n",
1148 | "result_nyc_check_sticc = nyc_check_sticc.join(group)\n",
1149 | "result_nyc_check_sticc = result_nyc_check_sticc.rename({\"category\": \"clus_group_gt\"}, axis=1)\n",
1150 | "print(\"Adjusted rand score\", adjusted_rand_score(result_nyc_check_sticc[\"group\"].values, \n",
1151 | " result_nyc_check_sticc.clus_group_gt.values))\n",
1152 | "sp_contiguity = cal_joint_statistic(result_nyc_check_sticc, w_voronoi)\n",
1153 | "print(\"Spatial contiguity: \", sp_contiguity)\n",
1154 | "get_max_f1_score(result_nyc_check_sticc)"
1155 | ]
1156 | },
1157 | {
1158 | "cell_type": "markdown",
1159 | "metadata": {},
1160 | "source": [
1161 | "# GMM"
1162 | ]
1163 | },
1164 | {
1165 | "cell_type": "code",
1166 | "execution_count": 23,
1167 | "metadata": {},
1168 | "outputs": [],
1169 | "source": [
1170 | "from sklearn.mixture import GaussianMixture"
1171 | ]
1172 | },
1173 | {
1174 | "cell_type": "code",
1175 | "execution_count": 24,
1176 | "metadata": {},
1177 | "outputs": [
1178 | {
1179 | "data": {
1180 | "text/html": [
1181 | "\n",
1182 | "\n",
1195 | "
\n",
1196 | " \n",
1197 | " \n",
1198 | " | \n",
1199 | " venueId | \n",
1200 | " userId | \n",
1201 | " gender | \n",
1202 | " friend_num | \n",
1203 | " follow_num | \n",
1204 | " latitude | \n",
1205 | " longitude | \n",
1206 | " venueCateg | \n",
1207 | " week | \n",
1208 | " hour | \n",
1209 | " geometry | \n",
1210 | " week_attr | \n",
1211 | " category | \n",
1212 | " n_pt_0 | \n",
1213 | " n_pt_1 | \n",
1214 | " n_pt_2 | \n",
1215 | "
\n",
1216 | " \n",
1217 | " \n",
1218 | " \n",
1219 | " 0 | \n",
1220 | " 46ce971cf964a520414a1fe3 | \n",
1221 | " 2636 | \n",
1222 | " male | \n",
1223 | " 84.0 | \n",
1224 | " 84.0 | \n",
1225 | " 40.760867 | \n",
1226 | " -73.980347 | \n",
1227 | " Office | \n",
1228 | " Wed | \n",
1229 | " 23 | \n",
1230 | " POINT (-73.98035 40.76087) | \n",
1231 | " 3 | \n",
1232 | " 3 | \n",
1233 | " 322 | \n",
1234 | " 315 | \n",
1235 | " 288 | \n",
1236 | "
\n",
1237 | " \n",
1238 | "
\n",
1239 | "
"
1240 | ],
1241 | "text/plain": [
1242 | " venueId userId gender friend_num follow_num latitude \\\n",
1243 | "0 46ce971cf964a520414a1fe3 2636 male 84.0 84.0 40.760867 \n",
1244 | "\n",
1245 | " longitude venueCateg week hour geometry week_attr \\\n",
1246 | "0 -73.980347 Office Wed 23 POINT (-73.98035 40.76087) 3 \n",
1247 | "\n",
1248 | " category n_pt_0 n_pt_1 n_pt_2 \n",
1249 | "0 3 322 315 288 "
1250 | ]
1251 | },
1252 | "execution_count": 24,
1253 | "metadata": {},
1254 | "output_type": "execute_result"
1255 | }
1256 | ],
1257 | "source": [
1258 | "gmm_data = nyc_check_sticc.copy()\n",
1259 | "gmm_data.head(1)"
1260 | ]
1261 | },
1262 | {
1263 | "cell_type": "code",
1264 | "execution_count": 25,
1265 | "metadata": {},
1266 | "outputs": [],
1267 | "source": [
1268 | "X = gmm_data[['hour', 'week_attr']].values"
1269 | ]
1270 | },
1271 | {
1272 | "cell_type": "code",
1273 | "execution_count": 26,
1274 | "metadata": {},
1275 | "outputs": [
1276 | {
1277 | "data": {
1278 | "text/html": [
1279 | "\n",
1280 | "\n",
1293 | "
\n",
1294 | " \n",
1295 | " \n",
1296 | " | \n",
1297 | " group | \n",
1298 | "
\n",
1299 | " \n",
1300 | " \n",
1301 | " \n",
1302 | " 0 | \n",
1303 | " 1 | \n",
1304 | "
\n",
1305 | " \n",
1306 | "
\n",
1307 | "
"
1308 | ],
1309 | "text/plain": [
1310 | " group\n",
1311 | "0 1"
1312 | ]
1313 | },
1314 | "execution_count": 26,
1315 | "metadata": {},
1316 | "output_type": "execute_result"
1317 | }
1318 | ],
1319 | "source": [
1320 | "gm = GaussianMixture(n_components=2).fit(X)\n",
1321 | "gmm = pd.DataFrame(gm.predict(X), columns=[\"group\"])\n",
1322 | "gmm.head(1)"
1323 | ]
1324 | },
1325 | {
1326 | "cell_type": "code",
1327 | "execution_count": 27,
1328 | "metadata": {},
1329 | "outputs": [
1330 | {
1331 | "name": "stdout",
1332 | "output_type": "stream",
1333 | "text": [
1334 | "Adjusted rand score 0.008571217314584517\n",
1335 | "Spatial contiguity: 0.6590530469092504\n",
1336 | "f1_score 0.568958207722862 [3, 4]\n"
1337 | ]
1338 | }
1339 | ],
1340 | "source": [
1341 | "result_nyc_check_sticc = nyc_check_sticc.join(gmm)\n",
1342 | "result_nyc_check_sticc = result_nyc_check_sticc.rename({\"category\": \"clus_group_gt\"}, axis=1)\n",
1343 | "print(\"Adjusted rand score\", adjusted_rand_score(result_nyc_check_sticc[\"group\"].values, \n",
1344 | " result_nyc_check_sticc.clus_group_gt.values))\n",
1345 | "sp_contiguity = cal_joint_statistic(result_nyc_check_sticc, w_voronoi)\n",
1346 | "print(\"Spatial contiguity: \", sp_contiguity)\n",
1347 | "get_max_f1_score(result_nyc_check_sticc)"
1348 | ]
1349 | }
1350 | ],
1351 | "metadata": {
1352 | "kernelspec": {
1353 | "display_name": "Python 3",
1354 | "language": "python",
1355 | "name": "python3"
1356 | },
1357 | "language_info": {
1358 | "codemirror_mode": {
1359 | "name": "ipython",
1360 | "version": 3
1361 | },
1362 | "file_extension": ".py",
1363 | "mimetype": "text/x-python",
1364 | "name": "python",
1365 | "nbconvert_exporter": "python",
1366 | "pygments_lexer": "ipython3",
1367 | "version": "3.8.3"
1368 | }
1369 | },
1370 | "nbformat": 4,
1371 | "nbformat_minor": 4
1372 | }
1373 |
--------------------------------------------------------------------------------
/NYC_checkin3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import random\n",
10 | "import pandas as pd\n",
11 | "import geopandas as gpd\n",
12 | "import matplotlib.pyplot as plt\n",
13 | "import esda\n",
14 | "import libpysal.weights as weights\n",
15 | "from esda.moran import Moran\n",
16 | "from shapely.geometry import Point, MultiPoint, LineString, Polygon, shape\n",
17 | "import json\n",
18 | "import pylab\n",
19 | "import libpysal\n",
20 | "import numpy as np\n",
21 | "from sklearn.metrics.cluster import adjusted_rand_score\n",
22 | "from sklearn.metrics import f1_score\n",
23 | "from pyclustering.cluster.cure import cure\n",
24 | "from pyclustering.cluster.kmeans import kmeans\n",
25 | "from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer\n",
26 | "from sklearn import preprocessing"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 2,
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 | "def permutation(lst):\n",
36 | " if len(lst) == 0:\n",
37 | " return []\n",
38 | "\n",
39 | " if len(lst) == 1:\n",
40 | " return [lst]\n",
41 | "\n",
42 | " l = []\n",
43 | " for i in range(len(lst)):\n",
44 | " m = lst[i]\n",
45 | " remLst = lst[:i] + lst[i+1:]\n",
46 | " for p in permutation(remLst):\n",
47 | " l.append([m] + p) \n",
48 | " return l"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 3,
54 | "metadata": {},
55 | "outputs": [],
56 | "source": [
57 | "def get_f1_score(df, permut):\n",
58 | " def match_clus(x, permut):\n",
59 | " if x == 0:\n",
60 | " return int(permut[0])\n",
61 | " elif x == 1:\n",
62 | " return int(permut[1])\n",
63 | " elif x == 2:\n",
64 | " return int(permut[1])\n",
65 | " else:\n",
66 | " return x\n",
67 | "\n",
68 | " df[\"group_match\"] = df[\"group\"].apply(lambda x: match_clus(x, permut))\n",
69 | " return df, f1_score(df.group_match.values, df.clus_group_gt.values, average='macro')"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": 4,
75 | "metadata": {},
76 | "outputs": [],
77 | "source": [
78 | "def get_max_f1_score(df):\n",
79 | " max_f1 = 0\n",
80 | " max_p = []\n",
81 | " for p in permutation([1,3,4]):\n",
82 | " df, f1 = get_f1_score(df, p)\n",
83 | " if max_f1 < f1:\n",
84 | " max_f1 = f1\n",
85 | " max_p = p\n",
86 | " print(\"f1_score \", max_f1, max_p)"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": 5,
92 | "metadata": {},
93 | "outputs": [],
94 | "source": [
95 | "def cal_joint_statistic(nyc_data, w_voronoi):\n",
96 | " matched_connects = 0\n",
97 | " all_neighbors_connects = 0\n",
98 | " for obj_id, neighbors in w_voronoi.neighbors.items():\n",
99 | " obj_clus = nyc_data.iat[obj_id, -1]\n",
100 | " for nei in neighbors:\n",
101 | " nei_clus = nyc_data.iat[nei, -1]\n",
102 | " all_neighbors_connects += 1\n",
103 | " if obj_clus == nei_clus:\n",
104 | " matched_connects += 1\n",
105 | " return matched_connects / all_neighbors_connects"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": 6,
111 | "metadata": {},
112 | "outputs": [
113 | {
114 | "data": {
115 | "text/html": [
116 | "\n",
117 | "\n",
130 | "
\n",
131 | " \n",
132 | " \n",
133 | " | \n",
134 | " venueId | \n",
135 | " userId | \n",
136 | " gender | \n",
137 | " friend_num | \n",
138 | " follow_num | \n",
139 | " latitude | \n",
140 | " longitude | \n",
141 | " venueCateg | \n",
142 | " week | \n",
143 | " hour | \n",
144 | " geometry | \n",
145 | "
\n",
146 | " \n",
147 | " \n",
148 | " \n",
149 | " 0 | \n",
150 | " 3fd66200f964a52000e71ee3 | \n",
151 | " 445 | \n",
152 | " male | \n",
153 | " 4.0 | \n",
154 | " 13.0 | \n",
155 | " 40.73385 | \n",
156 | " -74.002998 | \n",
157 | " Jazz Club | \n",
158 | " Sat | \n",
159 | " 8 | \n",
160 | " POINT (-74.00300 40.73385) | \n",
161 | "
\n",
162 | " \n",
163 | "
\n",
164 | "
"
165 | ],
166 | "text/plain": [
167 | " venueId userId gender friend_num follow_num latitude \\\n",
168 | "0 3fd66200f964a52000e71ee3 445 male 4.0 13.0 40.73385 \n",
169 | "\n",
170 | " longitude venueCateg week hour geometry \n",
171 | "0 -74.002998 Jazz Club Sat 8 POINT (-74.00300 40.73385) "
172 | ]
173 | },
174 | "execution_count": 6,
175 | "metadata": {},
176 | "output_type": "execute_result"
177 | }
178 | ],
179 | "source": [
180 | "nyc_check_in = gpd.read_file('data/nyc_checkin.shp')\n",
181 | "nyc_check_in.head(1)"
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": 7,
187 | "metadata": {},
188 | "outputs": [
189 | {
190 | "data": {
191 | "text/html": [
192 | "\n",
193 | "\n",
206 | "
\n",
207 | " \n",
208 | " \n",
209 | " | \n",
210 | " venueId | \n",
211 | " userId | \n",
212 | " gender | \n",
213 | " friend_num | \n",
214 | " follow_num | \n",
215 | " latitude | \n",
216 | " longitude | \n",
217 | " week | \n",
218 | " hour | \n",
219 | " geometry | \n",
220 | "
\n",
221 | " \n",
222 | " venueCateg | \n",
223 | " | \n",
224 | " | \n",
225 | " | \n",
226 | " | \n",
227 | " | \n",
228 | " | \n",
229 | " | \n",
230 | " | \n",
231 | " | \n",
232 | " | \n",
233 | "
\n",
234 | " \n",
235 | " \n",
236 | " \n",
237 | " Subway | \n",
238 | " 10042 | \n",
239 | " 10042 | \n",
240 | " 10042 | \n",
241 | " 10042 | \n",
242 | " 10042 | \n",
243 | " 10042 | \n",
244 | " 10042 | \n",
245 | " 10042 | \n",
246 | " 10042 | \n",
247 | " 10042 | \n",
248 | "
\n",
249 | " \n",
250 | "
\n",
251 | "
"
252 | ],
253 | "text/plain": [
254 | " venueId userId gender friend_num follow_num latitude \\\n",
255 | "venueCateg \n",
256 | "Subway 10042 10042 10042 10042 10042 10042 \n",
257 | "\n",
258 | " longitude week hour geometry \n",
259 | "venueCateg \n",
260 | "Subway 10042 10042 10042 10042 "
261 | ]
262 | },
263 | "execution_count": 7,
264 | "metadata": {},
265 | "output_type": "execute_result"
266 | }
267 | ],
268 | "source": [
269 | "nyc_check_in.groupby(\"venueCateg\").count().sort_values(\"venueId\").tail(1)"
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": 8,
275 | "metadata": {},
276 | "outputs": [
277 | {
278 | "name": "stdout",
279 | "output_type": "stream",
280 | "text": [
281 | "(5909, 11)\n"
282 | ]
283 | },
284 | {
285 | "data": {
286 | "text/html": [
287 | "\n",
288 | "\n",
301 | "
\n",
302 | " \n",
303 | " \n",
304 | " | \n",
305 | " venueId | \n",
306 | " userId | \n",
307 | " gender | \n",
308 | " friend_num | \n",
309 | " follow_num | \n",
310 | " latitude | \n",
311 | " longitude | \n",
312 | " venueCateg | \n",
313 | " week | \n",
314 | " hour | \n",
315 | " geometry | \n",
316 | "
\n",
317 | " \n",
318 | " \n",
319 | " \n",
320 | " 1828 | \n",
321 | " 3fd66200f964a5206fe71ee3 | \n",
322 | " 654 | \n",
323 | " male | \n",
324 | " 103.0 | \n",
325 | " 46.0 | \n",
326 | " 40.752901 | \n",
327 | " -73.974176 | \n",
328 | " Gym | \n",
329 | " Mon | \n",
330 | " 17 | \n",
331 | " POINT (-73.97418 40.75290) | \n",
332 | "
\n",
333 | " \n",
334 | "
\n",
335 | "
"
336 | ],
337 | "text/plain": [
338 | " venueId userId gender friend_num follow_num \\\n",
339 | "1828 3fd66200f964a5206fe71ee3 654 male 103.0 46.0 \n",
340 | "\n",
341 | " latitude longitude venueCateg week hour geometry \n",
342 | "1828 40.752901 -73.974176 Gym Mon 17 POINT (-73.97418 40.75290) "
343 | ]
344 | },
345 | "execution_count": 8,
346 | "metadata": {},
347 | "output_type": "execute_result"
348 | }
349 | ],
350 | "source": [
351 | "venueCateg_list = [\"Gym\", \"Office\", \"Home (private)\"]\n",
352 | "venueId_list = pd.DataFrame(nyc_check_in.venueId.unique()).sample(frac=0.3).values.squeeze()\n",
353 | "nyc_check_sticc = nyc_check_in[(nyc_check_in.venueCateg.isin(venueCateg_list))&(nyc_check_in.venueId.isin(venueId_list))]\n",
354 | "print(nyc_check_sticc.shape)\n",
355 | "nyc_check_sticc.head(1)"
356 | ]
357 | },
358 | {
359 | "cell_type": "code",
360 | "execution_count": 9,
361 | "metadata": {},
362 | "outputs": [
363 | {
364 | "name": "stderr",
365 | "output_type": "stream",
366 | "text": [
367 | "/home/kangyuhao/anaconda3/lib/python3.8/site-packages/geopandas/geodataframe.py:853: SettingWithCopyWarning: \n",
368 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
369 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
370 | "\n",
371 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
372 | " super(GeoDataFrame, self).__setitem__(key, value)\n",
373 | "/home/kangyuhao/anaconda3/lib/python3.8/site-packages/geopandas/geodataframe.py:853: SettingWithCopyWarning: \n",
374 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
375 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
376 | "\n",
377 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
378 | " super(GeoDataFrame, self).__setitem__(key, value)\n"
379 | ]
380 | }
381 | ],
382 | "source": [
383 | "def return_week(x):\n",
384 | " if x == \"Mon\":\n",
385 | " return 1\n",
386 | " elif x == \"Tue\":\n",
387 | " return 2\n",
388 | " elif x == \"Wed\":\n",
389 | " return 3\n",
390 | " elif x == \"Thu\":\n",
391 | " return 4\n",
392 | " elif x == \"Fri\":\n",
393 | " return 5\n",
394 | " elif x == \"Sat\":\n",
395 | " return 6\n",
396 | " elif x == \"Sun\":\n",
397 | " return 7\n",
398 | " \n",
399 | "def return_category(x):\n",
400 | " if x == \"Gym\":\n",
401 | " return 1\n",
402 | " elif x == \"Coffee Shop\":\n",
403 | " return 2\n",
404 | " elif x == \"Office\":\n",
405 | " return 3\n",
406 | " elif x == \"Home (private)\":\n",
407 | " return 4\n",
408 | " elif x == \"Subway\":\n",
409 | " return 5\n",
410 | "\n",
411 | "nyc_check_sticc[\"week_attr\"] = nyc_check_sticc[\"week\"].apply(lambda x: return_week(x))\n",
412 | "nyc_check_sticc[\"category\"] = nyc_check_sticc[\"venueCateg\"].apply(lambda x: return_category(x))\n",
413 | "nyc_check_sticc = nyc_check_sticc.reset_index().drop(\"index\", axis=1)"
414 | ]
415 | },
416 | {
417 | "cell_type": "code",
418 | "execution_count": 10,
419 | "metadata": {},
420 | "outputs": [
421 | {
422 | "data": {
423 | "text/html": [
424 | "\n",
425 | "\n",
438 | "
\n",
439 | " \n",
440 | " \n",
441 | " | \n",
442 | " venueId | \n",
443 | " userId | \n",
444 | " gender | \n",
445 | " friend_num | \n",
446 | " follow_num | \n",
447 | " latitude | \n",
448 | " longitude | \n",
449 | " venueCateg | \n",
450 | " week | \n",
451 | " hour | \n",
452 | " geometry | \n",
453 | " week_attr | \n",
454 | " category | \n",
455 | "
\n",
456 | " \n",
457 | " \n",
458 | " \n",
459 | " 0 | \n",
460 | " 3fd66200f964a5206fe71ee3 | \n",
461 | " 654 | \n",
462 | " male | \n",
463 | " 103.0 | \n",
464 | " 46.0 | \n",
465 | " 40.752901 | \n",
466 | " -73.974176 | \n",
467 | " Gym | \n",
468 | " Mon | \n",
469 | " 17 | \n",
470 | " POINT (-73.97418 40.75290) | \n",
471 | " 1 | \n",
472 | " 1 | \n",
473 | "
\n",
474 | " \n",
475 | "
\n",
476 | "
"
477 | ],
478 | "text/plain": [
479 | " venueId userId gender friend_num follow_num latitude \\\n",
480 | "0 3fd66200f964a5206fe71ee3 654 male 103.0 46.0 40.752901 \n",
481 | "\n",
482 | " longitude venueCateg week hour geometry week_attr \\\n",
483 | "0 -73.974176 Gym Mon 17 POINT (-73.97418 40.75290) 1 \n",
484 | "\n",
485 | " category \n",
486 | "0 1 "
487 | ]
488 | },
489 | "execution_count": 10,
490 | "metadata": {},
491 | "output_type": "execute_result"
492 | }
493 | ],
494 | "source": [
495 | "nyc_check_sticc.head(1)"
496 | ]
497 | },
498 | {
499 | "cell_type": "code",
500 | "execution_count": 11,
501 | "metadata": {},
502 | "outputs": [
503 | {
504 | "name": "stderr",
505 | "output_type": "stream",
506 | "text": [
507 | "/home/kangyuhao/anaconda3/lib/python3.8/site-packages/libpysal/weights/weights.py:172: UserWarning: The weights matrix is not fully connected: \n",
508 | " There are 140 disconnected components.\n",
509 | " warnings.warn(message)\n"
510 | ]
511 | }
512 | ],
513 | "source": [
514 | "kd = libpysal.cg.KDTree(np.array(nyc_check_sticc[[\"latitude\", \"longitude\"]].values))\n",
515 | "wnn = libpysal.weights.KNN(kd, 3)"
516 | ]
517 | },
518 | {
519 | "cell_type": "code",
520 | "execution_count": 12,
521 | "metadata": {},
522 | "outputs": [
523 | {
524 | "data": {
525 | "text/html": [
526 | "\n",
527 | "\n",
540 | "
\n",
541 | " \n",
542 | " \n",
543 | " | \n",
544 | " n_pt_0 | \n",
545 | " n_pt_1 | \n",
546 | " n_pt_2 | \n",
547 | "
\n",
548 | " \n",
549 | " \n",
550 | " \n",
551 | " 0 | \n",
552 | " 3556 | \n",
553 | " 9 | \n",
554 | " 22 | \n",
555 | "
\n",
556 | " \n",
557 | "
\n",
558 | "
"
559 | ],
560 | "text/plain": [
561 | " n_pt_0 n_pt_1 n_pt_2\n",
562 | "0 3556 9 22"
563 | ]
564 | },
565 | "execution_count": 12,
566 | "metadata": {},
567 | "output_type": "execute_result"
568 | }
569 | ],
570 | "source": [
571 | "nearest_pt = pd.DataFrame().from_dict(wnn.neighbors, orient=\"index\")\n",
572 | "for i in range(nearest_pt.shape[1]):\n",
573 | " nearest_pt = nearest_pt.rename({i:f\"n_pt_{i}\"}, axis=1)\n",
574 | "nearest_pt.head(1)"
575 | ]
576 | },
577 | {
578 | "cell_type": "code",
579 | "execution_count": 13,
580 | "metadata": {},
581 | "outputs": [
582 | {
583 | "data": {
584 | "text/html": [
585 | "\n",
586 | "\n",
599 | "
\n",
600 | " \n",
601 | " \n",
602 | " | \n",
603 | " venueId | \n",
604 | " userId | \n",
605 | " gender | \n",
606 | " friend_num | \n",
607 | " follow_num | \n",
608 | " latitude | \n",
609 | " longitude | \n",
610 | " venueCateg | \n",
611 | " week | \n",
612 | " hour | \n",
613 | " geometry | \n",
614 | " week_attr | \n",
615 | " category | \n",
616 | " n_pt_0 | \n",
617 | " n_pt_1 | \n",
618 | " n_pt_2 | \n",
619 | "
\n",
620 | " \n",
621 | " \n",
622 | " \n",
623 | " 0 | \n",
624 | " 3fd66200f964a5206fe71ee3 | \n",
625 | " 654 | \n",
626 | " male | \n",
627 | " 103.0 | \n",
628 | " 46.0 | \n",
629 | " 40.752901 | \n",
630 | " -73.974176 | \n",
631 | " Gym | \n",
632 | " Mon | \n",
633 | " 17 | \n",
634 | " POINT (-73.97418 40.75290) | \n",
635 | " 1 | \n",
636 | " 1 | \n",
637 | " 3556 | \n",
638 | " 9 | \n",
639 | " 22 | \n",
640 | "
\n",
641 | " \n",
642 | "
\n",
643 | "
"
644 | ],
645 | "text/plain": [
646 | " venueId userId gender friend_num follow_num latitude \\\n",
647 | "0 3fd66200f964a5206fe71ee3 654 male 103.0 46.0 40.752901 \n",
648 | "\n",
649 | " longitude venueCateg week hour geometry week_attr \\\n",
650 | "0 -73.974176 Gym Mon 17 POINT (-73.97418 40.75290) 1 \n",
651 | "\n",
652 | " category n_pt_0 n_pt_1 n_pt_2 \n",
653 | "0 1 3556 9 22 "
654 | ]
655 | },
656 | "execution_count": 13,
657 | "metadata": {},
658 | "output_type": "execute_result"
659 | }
660 | ],
661 | "source": [
662 | "nyc_check_sticc = nyc_check_sticc.join(nearest_pt)\n",
663 | "nyc_check_sticc.head(1)"
664 | ]
665 | },
666 | {
667 | "cell_type": "code",
668 | "execution_count": 14,
669 | "metadata": {},
670 | "outputs": [],
671 | "source": [
672 | "nyc_check_sticc[[\"week_attr\", \"hour\", \"n_pt_0\", \"n_pt_1\", \n",
673 | " \"n_pt_2\"]].to_csv(r'nyc_checkin3.txt', header=None, index=True, sep=',')"
674 | ]
675 | },
676 | {
677 | "cell_type": "code",
678 | "execution_count": 15,
679 | "metadata": {},
680 | "outputs": [],
681 | "source": [
682 | "w_voronoi = weights.Voronoi.from_dataframe(nyc_check_sticc)"
683 | ]
684 | },
685 | {
686 | "cell_type": "markdown",
687 | "metadata": {},
688 | "source": [
689 | "# STICC"
690 | ]
691 | },
692 | {
693 | "cell_type": "code",
694 | "execution_count": 16,
695 | "metadata": {
696 | "collapsed": true,
697 | "jupyter": {
698 | "outputs_hidden": true
699 | }
700 | },
701 | "outputs": [
702 | {
703 | "name": "stdout",
704 | "output_type": "stream",
705 | "text": [
706 | "lam_sparse 0.1\n",
707 | "switch_penalty 5.0\n",
708 | "num_cluster 3\n",
709 | "num stacked 4\n",
710 | "completed getting the data\n",
711 | "2 (5909, 2) (5909, 3)\n",
712 | "\n",
713 | "\n",
714 | "\n",
715 | "ITERATION ### 0\n",
716 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
717 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
718 | "OPTIMIZATION for Cluster # 2 DONE!!!\n",
719 | "length of the cluster 0 ------> 3087\n",
720 | "length of the cluster 1 ------> 1475\n",
721 | "length of the cluster 2 ------> 1347\n",
722 | "UPDATED THE OLD COVARIANCE\n",
723 | "beginning the smoothening ALGORITHM\n",
724 | "length of cluster # 0 --------> 3196\n",
725 | "length of cluster # 1 --------> 1611\n",
726 | "length of cluster # 2 --------> 1102\n",
727 | "Done writing the figure\n",
728 | "\n",
729 | "\n",
730 | "\n",
731 | "\n",
732 | "\n",
733 | "\n",
734 | "\n",
735 | "ITERATION ### 1\n",
736 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
737 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
738 | "OPTIMIZATION for Cluster # 2 DONE!!!\n",
739 | "length of the cluster 0 ------> 3196\n",
740 | "length of the cluster 1 ------> 1611\n",
741 | "length of the cluster 2 ------> 1102\n",
742 | "UPDATED THE OLD COVARIANCE\n",
743 | "beginning the smoothening ALGORITHM\n",
744 | "length of cluster # 0 --------> 3012\n",
745 | "length of cluster # 1 --------> 1478\n",
746 | "length of cluster # 2 --------> 1419\n",
747 | "Done writing the figure\n",
748 | "\n",
749 | "\n",
750 | "\n",
751 | "\n",
752 | "\n",
753 | "\n",
754 | "\n",
755 | "ITERATION ### 2\n",
756 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
757 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
758 | "OPTIMIZATION for Cluster # 2 DONE!!!\n",
759 | "length of the cluster 0 ------> 3012\n",
760 | "length of the cluster 1 ------> 1478\n",
761 | "length of the cluster 2 ------> 1419\n",
762 | "UPDATED THE OLD COVARIANCE\n",
763 | "beginning the smoothening ALGORITHM\n",
764 | "length of cluster # 0 --------> 2800\n",
765 | "length of cluster # 1 --------> 1490\n",
766 | "length of cluster # 2 --------> 1619\n",
767 | "Done writing the figure\n",
768 | "\n",
769 | "\n",
770 | "\n",
771 | "\n",
772 | "\n",
773 | "\n",
774 | "\n",
775 | "ITERATION ### 3\n",
776 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
777 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
778 | "OPTIMIZATION for Cluster # 2 DONE!!!\n",
779 | "length of the cluster 0 ------> 2800\n",
780 | "length of the cluster 1 ------> 1490\n",
781 | "length of the cluster 2 ------> 1619\n",
782 | "UPDATED THE OLD COVARIANCE\n",
783 | "beginning the smoothening ALGORITHM\n",
784 | "length of cluster # 0 --------> 2696\n",
785 | "length of cluster # 1 --------> 1554\n",
786 | "length of cluster # 2 --------> 1659\n",
787 | "Done writing the figure\n",
788 | "\n",
789 | "\n",
790 | "\n",
791 | "\n",
792 | "\n",
793 | "\n",
794 | "\n",
795 | "ITERATION ### 4\n",
796 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
797 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
798 | "OPTIMIZATION for Cluster # 2 DONE!!!\n",
799 | "length of the cluster 0 ------> 2696\n",
800 | "length of the cluster 1 ------> 1554\n",
801 | "length of the cluster 2 ------> 1659\n",
802 | "UPDATED THE OLD COVARIANCE\n",
803 | "beginning the smoothening ALGORITHM\n",
804 | "length of cluster # 0 --------> 2644\n",
805 | "length of cluster # 1 --------> 1606\n",
806 | "length of cluster # 2 --------> 1659\n",
807 | "Done writing the figure\n",
808 | "\n",
809 | "\n",
810 | "\n",
811 | "\n",
812 | "\n",
813 | "\n",
814 | "\n",
815 | "ITERATION ### 5\n",
816 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
817 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
818 | "OPTIMIZATION for Cluster # 2 DONE!!!\n",
819 | "length of the cluster 0 ------> 2644\n",
820 | "length of the cluster 1 ------> 1606\n",
821 | "length of the cluster 2 ------> 1659\n",
822 | "UPDATED THE OLD COVARIANCE\n",
823 | "beginning the smoothening ALGORITHM\n",
824 | "length of cluster # 0 --------> 2633\n",
825 | "length of cluster # 1 --------> 1614\n",
826 | "length of cluster # 2 --------> 1662\n",
827 | "Done writing the figure\n",
828 | "\n",
829 | "\n",
830 | "\n",
831 | "\n",
832 | "\n",
833 | "\n",
834 | "\n",
835 | "ITERATION ### 6\n",
836 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
837 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
838 | "OPTIMIZATION for Cluster # 2 DONE!!!\n",
839 | "length of the cluster 0 ------> 2633\n",
840 | "length of the cluster 1 ------> 1614\n",
841 | "length of the cluster 2 ------> 1662\n",
842 | "UPDATED THE OLD COVARIANCE\n",
843 | "beginning the smoothening ALGORITHM\n",
844 | "length of cluster # 0 --------> 2625\n",
845 | "length of cluster # 1 --------> 1636\n",
846 | "length of cluster # 2 --------> 1648\n",
847 | "Done writing the figure\n",
848 | "\n",
849 | "\n",
850 | "\n",
851 | "\n",
852 | "\n",
853 | "\n",
854 | "\n",
855 | "ITERATION ### 7\n",
856 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
857 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
858 | "OPTIMIZATION for Cluster # 2 DONE!!!\n",
859 | "length of the cluster 0 ------> 2625\n",
860 | "length of the cluster 1 ------> 1636\n",
861 | "length of the cluster 2 ------> 1648\n",
862 | "UPDATED THE OLD COVARIANCE\n",
863 | "beginning the smoothening ALGORITHM\n",
864 | "length of cluster # 0 --------> 2623\n",
865 | "length of cluster # 1 --------> 1640\n",
866 | "length of cluster # 2 --------> 1646\n",
867 | "Done writing the figure\n",
868 | "\n",
869 | "\n",
870 | "\n",
871 | "\n",
872 | "\n",
873 | "\n",
874 | "\n",
875 | "ITERATION ### 8\n",
876 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
877 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
878 | "OPTIMIZATION for Cluster # 2 DONE!!!\n",
879 | "length of the cluster 0 ------> 2623\n",
880 | "length of the cluster 1 ------> 1640\n",
881 | "length of the cluster 2 ------> 1646\n",
882 | "UPDATED THE OLD COVARIANCE\n",
883 | "beginning the smoothening ALGORITHM\n",
884 | "length of cluster # 0 --------> 2623\n",
885 | "length of cluster # 1 --------> 1640\n",
886 | "length of cluster # 2 --------> 1646\n",
887 | "Done writing the figure\n",
888 | "\n",
889 | "\n",
890 | "\n",
891 | "\n",
892 | "\n",
893 | "\n",
894 | "\n",
895 | "ITERATION ### 9\n",
896 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
897 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
898 | "OPTIMIZATION for Cluster # 2 DONE!!!\n",
899 | "length of the cluster 0 ------> 2623\n",
900 | "length of the cluster 1 ------> 1640\n",
901 | "length of the cluster 2 ------> 1646\n",
902 | "UPDATED THE OLD COVARIANCE\n",
903 | "beginning the smoothening ALGORITHM\n",
904 | "length of cluster # 0 --------> 2623\n",
905 | "length of cluster # 1 --------> 1638\n",
906 | "length of cluster # 2 --------> 1648\n",
907 | "Done writing the figure\n",
908 | "\n",
909 | "\n",
910 | "\n",
911 | "\n",
912 | "\n",
913 | "\n",
914 | "\n",
915 | "ITERATION ### 10\n",
916 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
917 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
918 | "OPTIMIZATION for Cluster # 2 DONE!!!\n",
919 | "length of the cluster 0 ------> 2623\n",
920 | "length of the cluster 1 ------> 1638\n",
921 | "length of the cluster 2 ------> 1648\n",
922 | "UPDATED THE OLD COVARIANCE\n",
923 | "beginning the smoothening ALGORITHM\n",
924 | "length of cluster # 0 --------> 2623\n",
925 | "length of cluster # 1 --------> 1647\n",
926 | "length of cluster # 2 --------> 1639\n",
927 | "Done writing the figure\n",
928 | "\n",
929 | "\n",
930 | "\n",
931 | "\n",
932 | "\n",
933 | "\n",
934 | "\n",
935 | "ITERATION ### 11\n",
936 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
937 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
938 | "OPTIMIZATION for Cluster # 2 DONE!!!\n",
939 | "length of the cluster 0 ------> 2623\n",
940 | "length of the cluster 1 ------> 1647\n",
941 | "length of the cluster 2 ------> 1639\n",
942 | "UPDATED THE OLD COVARIANCE\n",
943 | "beginning the smoothening ALGORITHM\n",
944 | "length of cluster # 0 --------> 2623\n",
945 | "length of cluster # 1 --------> 1646\n",
946 | "length of cluster # 2 --------> 1640\n",
947 | "Done writing the figure\n",
948 | "\n",
949 | "\n",
950 | "\n",
951 | "\n",
952 | "\n",
953 | "\n",
954 | "\n",
955 | "ITERATION ### 12\n",
956 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
957 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
958 | "OPTIMIZATION for Cluster # 2 DONE!!!\n",
959 | "length of the cluster 0 ------> 2623\n",
960 | "length of the cluster 1 ------> 1646\n",
961 | "length of the cluster 2 ------> 1640\n",
962 | "UPDATED THE OLD COVARIANCE\n",
963 | "beginning the smoothening ALGORITHM\n",
964 | "length of cluster # 0 --------> 2623\n",
965 | "length of cluster # 1 --------> 1644\n",
966 | "length of cluster # 2 --------> 1642\n",
967 | "Done writing the figure\n",
968 | "\n",
969 | "\n",
970 | "\n",
971 | "\n",
972 | "\n",
973 | "\n",
974 | "\n",
975 | "ITERATION ### 13\n",
976 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
977 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
978 | "OPTIMIZATION for Cluster # 2 DONE!!!\n",
979 | "length of the cluster 0 ------> 2623\n",
980 | "length of the cluster 1 ------> 1644\n",
981 | "length of the cluster 2 ------> 1642\n",
982 | "UPDATED THE OLD COVARIANCE\n",
983 | "beginning the smoothening ALGORITHM\n",
984 | "length of cluster # 0 --------> 2623\n",
985 | "length of cluster # 1 --------> 1637\n",
986 | "length of cluster # 2 --------> 1649\n",
987 | "Done writing the figure\n",
988 | "\n",
989 | "\n",
990 | "\n",
991 | "\n",
992 | "\n",
993 | "\n",
994 | "\n",
995 | "ITERATION ### 14\n",
996 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
997 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
998 | "OPTIMIZATION for Cluster # 2 DONE!!!\n",
999 | "length of the cluster 0 ------> 2623\n",
1000 | "length of the cluster 1 ------> 1637\n",
1001 | "length of the cluster 2 ------> 1649\n",
1002 | "UPDATED THE OLD COVARIANCE\n",
1003 | "beginning the smoothening ALGORITHM\n",
1004 | "length of cluster # 0 --------> 2623\n",
1005 | "length of cluster # 1 --------> 1641\n",
1006 | "length of cluster # 2 --------> 1645\n",
1007 | "Done writing the figure\n",
1008 | "\n",
1009 | "\n",
1010 | "\n",
1011 | "\n",
1012 | "\n",
1013 | "\n",
1014 | "\n",
1015 | "ITERATION ### 15\n",
1016 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
1017 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
1018 | "OPTIMIZATION for Cluster # 2 DONE!!!\n",
1019 | "length of the cluster 0 ------> 2623\n",
1020 | "length of the cluster 1 ------> 1641\n",
1021 | "length of the cluster 2 ------> 1645\n",
1022 | "UPDATED THE OLD COVARIANCE\n",
1023 | "beginning the smoothening ALGORITHM\n",
1024 | "length of cluster # 0 --------> 2623\n",
1025 | "length of cluster # 1 --------> 1644\n",
1026 | "length of cluster # 2 --------> 1642\n",
1027 | "Done writing the figure\n",
1028 | "\n",
1029 | "\n",
1030 | "\n",
1031 | "\n",
1032 | "\n",
1033 | "\n",
1034 | "\n",
1035 | "ITERATION ### 16\n",
1036 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
1037 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
1038 | "OPTIMIZATION for Cluster # 2 DONE!!!\n",
1039 | "length of the cluster 0 ------> 2623\n",
1040 | "length of the cluster 1 ------> 1644\n",
1041 | "length of the cluster 2 ------> 1642\n",
1042 | "UPDATED THE OLD COVARIANCE\n",
1043 | "beginning the smoothening ALGORITHM\n",
1044 | "length of cluster # 0 --------> 2623\n",
1045 | "length of cluster # 1 --------> 1643\n",
1046 | "length of cluster # 2 --------> 1643\n",
1047 | "Done writing the figure\n",
1048 | "\n",
1049 | "\n",
1050 | "\n",
1051 | "\n",
1052 | "\n",
1053 | "\n",
1054 | "\n",
1055 | "ITERATION ### 17\n",
1056 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
1057 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
1058 | "OPTIMIZATION for Cluster # 2 DONE!!!\n",
1059 | "length of the cluster 0 ------> 2623\n",
1060 | "length of the cluster 1 ------> 1643\n",
1061 | "length of the cluster 2 ------> 1643\n",
1062 | "UPDATED THE OLD COVARIANCE\n",
1063 | "beginning the smoothening ALGORITHM\n",
1064 | "length of cluster # 0 --------> 2623\n",
1065 | "length of cluster # 1 --------> 1646\n",
1066 | "length of cluster # 2 --------> 1640\n",
1067 | "Done writing the figure\n",
1068 | "\n",
1069 | "\n",
1070 | "\n",
1071 | "\n",
1072 | "\n",
1073 | "\n",
1074 | "\n",
1075 | "ITERATION ### 18\n",
1076 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
1077 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
1078 | "OPTIMIZATION for Cluster # 2 DONE!!!\n",
1079 | "length of the cluster 0 ------> 2623\n",
1080 | "length of the cluster 1 ------> 1646\n",
1081 | "length of the cluster 2 ------> 1640\n",
1082 | "UPDATED THE OLD COVARIANCE\n",
1083 | "beginning the smoothening ALGORITHM\n",
1084 | "length of cluster # 0 --------> 2622\n",
1085 | "length of cluster # 1 --------> 1639\n",
1086 | "length of cluster # 2 --------> 1648\n",
1087 | "Done writing the figure\n",
1088 | "\n",
1089 | "\n",
1090 | "\n",
1091 | "\n",
1092 | "\n",
1093 | "\n",
1094 | "\n",
1095 | "ITERATION ### 19\n",
1096 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
1097 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
1098 | "OPTIMIZATION for Cluster # 2 DONE!!!\n",
1099 | "length of the cluster 0 ------> 2622\n",
1100 | "length of the cluster 1 ------> 1639\n",
1101 | "length of the cluster 2 ------> 1648\n",
1102 | "UPDATED THE OLD COVARIANCE\n",
1103 | "beginning the smoothening ALGORITHM\n",
1104 | "length of cluster # 0 --------> 2622\n",
1105 | "length of cluster # 1 --------> 1643\n",
1106 | "length of cluster # 2 --------> 1644\n",
1107 | "Done writing the figure\n",
1108 | "\n",
1109 | "\n",
1110 | "\n",
1111 | "\n",
1112 | "\n",
1113 | "\n",
1114 | "\n",
1115 | "TRAINING F1 score: -1 -1 -1\n",
1116 | "[1.0000 1.0000 1.0000 ... 0.0000 0.0000 0.0000]\n"
1117 | ]
1118 | }
1119 | ],
1120 | "source": [
1121 | "!python STICC_main.py --fname=nyc_checkin3.txt --oname=result_nyc_checkin3.txt --attr_idx_start=1 \\\n",
1122 | "--attr_idx_end=2 --spatial_idx_start=3 --spatial_idx_end=5 \\\n",
1123 | "--spatial_radius 4 --number_of_clusters 3 --lambda_parameter 10e-2 --beta 5 --maxIters 20"
1124 | ]
1125 | },
1126 | {
1127 | "cell_type": "code",
1128 | "execution_count": 17,
1129 | "metadata": {},
1130 | "outputs": [
1131 | {
1132 | "name": "stdout",
1133 | "output_type": "stream",
1134 | "text": [
1135 | "Adjusted rand score 0.299048156707623\n",
1136 | "Spatial contiguity: 0.7245619074978454\n",
1137 | "f1_score 0.5239363875462164 [3, 4, 1]\n"
1138 | ]
1139 | }
1140 | ],
1141 | "source": [
1142 | "group = pd.read_table('result_nyc_checkin3.txt', names=[\"group\"])\n",
1143 | "result_nyc_check_sticc = nyc_check_sticc.join(group)\n",
1144 | "result_nyc_check_sticc = result_nyc_check_sticc.rename({\"category\": \"clus_group_gt\"}, axis=1)\n",
1145 | "print(\"Adjusted rand score\", adjusted_rand_score(result_nyc_check_sticc[\"group\"].values, \n",
1146 | " result_nyc_check_sticc.clus_group_gt.values))\n",
1147 | "sp_contiguity = cal_joint_statistic(result_nyc_check_sticc, w_voronoi)\n",
1148 | "print(\"Spatial contiguity: \", sp_contiguity)\n",
1149 | "get_max_f1_score(result_nyc_check_sticc)"
1150 | ]
1151 | },
1152 | {
1153 | "cell_type": "markdown",
1154 | "metadata": {},
1155 | "source": [
1156 | "# Other methods"
1157 | ]
1158 | },
1159 | {
1160 | "cell_type": "code",
1161 | "execution_count": 18,
1162 | "metadata": {},
1163 | "outputs": [],
1164 | "source": [
1165 | "def get_pycluster_result(ground_truth, cluster_method):\n",
1166 | "# data = ground_truth[[\"week_attr\", \"hour\"]].values # For K-Means\n",
1167 | " data = ground_truth[[\"week_attr\", \"hour\", \"latitude\", \"longitude\"]].values # For Sp K-Means\n",
1168 | "\n",
1169 | " if cluster_method == kmeans:\n",
1170 | " initial_centers = kmeans_plusplus_initializer(data.tolist(), 2).initialize()\n",
1171 | " instance = cluster_method(data.tolist(), initial_centers)\n",
1172 | " elif cluster_method == cure:\n",
1173 | " print(\"cure\")\n",
1174 | " instance = cure(data, 3)\n",
1175 | " else:\n",
1176 | " instance = cluster_method(data.tolist(), 2)\n",
1177 | "\n",
1178 | " instance.process()\n",
1179 | " clusters = instance.get_clusters()\n",
1180 | " \n",
1181 | " clusters_result = []\n",
1182 | " for i, clus in enumerate(clusters):\n",
1183 | " for data in clus:\n",
1184 | " clusters_result.append([data, i])\n",
1185 | " clusters_result_df = pd.DataFrame(clusters_result, columns=[\"pt\", \"group\"]).sort_values(\"pt\").set_index(\"pt\")\n",
1186 | " return clusters_result_df"
1187 | ]
1188 | },
1189 | {
1190 | "cell_type": "markdown",
1191 | "metadata": {},
1192 | "source": [
1193 | "# K-Means"
1194 | ]
1195 | },
1196 | {
1197 | "cell_type": "code",
1198 | "execution_count": 19,
1199 | "metadata": {},
1200 | "outputs": [
1201 | {
1202 | "name": "stdout",
1203 | "output_type": "stream",
1204 | "text": [
1205 | "Adjusted rand score 0.06540493878619441\n",
1206 | "Spatial contiguity: 0.6700948003447286\n",
1207 | "f1_score 0.38086125317189695 [3, 4, 1]\n"
1208 | ]
1209 | }
1210 | ],
1211 | "source": [
1212 | "group = get_pycluster_result(nyc_check_sticc, kmeans)\n",
1213 | "result_nyc_check_sticc = nyc_check_sticc.join(group)\n",
1214 | "result_nyc_check_sticc = result_nyc_check_sticc.rename({\"category\": \"clus_group_gt\"}, axis=1)\n",
1215 | "print(\"Adjusted rand score\", adjusted_rand_score(result_nyc_check_sticc[\"group\"].values, \n",
1216 | " result_nyc_check_sticc.clus_group_gt.values))\n",
1217 | "sp_contiguity = cal_joint_statistic(result_nyc_check_sticc, w_voronoi)\n",
1218 | "print(\"Spatial contiguity: \", sp_contiguity)\n",
1219 | "get_max_f1_score(result_nyc_check_sticc)"
1220 | ]
1221 | },
1222 | {
1223 | "cell_type": "markdown",
1224 | "metadata": {},
1225 | "source": [
1226 | "# Sp K-Means"
1227 | ]
1228 | },
1229 | {
1230 | "cell_type": "code",
1231 | "execution_count": 20,
1232 | "metadata": {},
1233 | "outputs": [
1234 | {
1235 | "name": "stdout",
1236 | "output_type": "stream",
1237 | "text": [
1238 | "Adjusted rand score 0.06540493878619441\n",
1239 | "Spatial contiguity: 0.6700948003447286\n",
1240 | "f1_score 0.38086125317189695 [3, 4, 1]\n"
1241 | ]
1242 | }
1243 | ],
1244 | "source": [
1245 | "group = get_pycluster_result(nyc_check_sticc, kmeans)\n",
1246 | "result_nyc_check_sticc = nyc_check_sticc.join(group)\n",
1247 | "result_nyc_check_sticc = result_nyc_check_sticc.rename({\"category\": \"clus_group_gt\"}, axis=1)\n",
1248 | "print(\"Adjusted rand score\", adjusted_rand_score(result_nyc_check_sticc[\"group\"].values, \n",
1249 | " result_nyc_check_sticc.clus_group_gt.values))\n",
1250 | "sp_contiguity = cal_joint_statistic(result_nyc_check_sticc, w_voronoi)\n",
1251 | "print(\"Spatial contiguity: \", sp_contiguity)\n",
1252 | "get_max_f1_score(result_nyc_check_sticc)"
1253 | ]
1254 | },
1255 | {
1256 | "cell_type": "markdown",
1257 | "metadata": {},
1258 | "source": [
1259 | "# CURE"
1260 | ]
1261 | },
1262 | {
1263 | "cell_type": "code",
1264 | "execution_count": 21,
1265 | "metadata": {},
1266 | "outputs": [
1267 | {
1268 | "name": "stdout",
1269 | "output_type": "stream",
1270 | "text": [
1271 | "cure\n",
1272 | "Adjusted rand score 0.0729293684699148\n",
1273 | "Spatial contiguity: 0.6272335535765584\n",
1274 | "f1_score 0.4208030109481018 [3, 4, 1]\n"
1275 | ]
1276 | }
1277 | ],
1278 | "source": [
1279 | "group = get_pycluster_result(nyc_check_sticc, cure)\n",
1280 | "result_nyc_check_sticc = nyc_check_sticc.join(group)\n",
1281 | "result_nyc_check_sticc = result_nyc_check_sticc.rename({\"category\": \"clus_group_gt\"}, axis=1)\n",
1282 | "print(\"Adjusted rand score\", adjusted_rand_score(result_nyc_check_sticc[\"group\"].values, \n",
1283 | " result_nyc_check_sticc.clus_group_gt.values))\n",
1284 | "sp_contiguity = cal_joint_statistic(result_nyc_check_sticc, w_voronoi)\n",
1285 | "print(\"Spatial contiguity: \", sp_contiguity)\n",
1286 | "get_max_f1_score(result_nyc_check_sticc)"
1287 | ]
1288 | },
1289 | {
1290 | "cell_type": "markdown",
1291 | "metadata": {},
1292 | "source": [
1293 | "# GMM"
1294 | ]
1295 | },
1296 | {
1297 | "cell_type": "code",
1298 | "execution_count": 22,
1299 | "metadata": {},
1300 | "outputs": [],
1301 | "source": [
1302 | "from sklearn.mixture import GaussianMixture"
1303 | ]
1304 | },
1305 | {
1306 | "cell_type": "code",
1307 | "execution_count": 23,
1308 | "metadata": {},
1309 | "outputs": [
1310 | {
1311 | "data": {
1312 | "text/html": [
1313 | "\n",
1314 | "\n",
1327 | "
\n",
1328 | " \n",
1329 | " \n",
1330 | " | \n",
1331 | " venueId | \n",
1332 | " userId | \n",
1333 | " gender | \n",
1334 | " friend_num | \n",
1335 | " follow_num | \n",
1336 | " latitude | \n",
1337 | " longitude | \n",
1338 | " venueCateg | \n",
1339 | " week | \n",
1340 | " hour | \n",
1341 | " geometry | \n",
1342 | " week_attr | \n",
1343 | " category | \n",
1344 | " n_pt_0 | \n",
1345 | " n_pt_1 | \n",
1346 | " n_pt_2 | \n",
1347 | "
\n",
1348 | " \n",
1349 | " \n",
1350 | " \n",
1351 | " 0 | \n",
1352 | " 3fd66200f964a5206fe71ee3 | \n",
1353 | " 654 | \n",
1354 | " male | \n",
1355 | " 103.0 | \n",
1356 | " 46.0 | \n",
1357 | " 40.752901 | \n",
1358 | " -73.974176 | \n",
1359 | " Gym | \n",
1360 | " Mon | \n",
1361 | " 17 | \n",
1362 | " POINT (-73.97418 40.75290) | \n",
1363 | " 1 | \n",
1364 | " 1 | \n",
1365 | " 3556 | \n",
1366 | " 9 | \n",
1367 | " 22 | \n",
1368 | "
\n",
1369 | " \n",
1370 | "
\n",
1371 | "
"
1372 | ],
1373 | "text/plain": [
1374 | " venueId userId gender friend_num follow_num latitude \\\n",
1375 | "0 3fd66200f964a5206fe71ee3 654 male 103.0 46.0 40.752901 \n",
1376 | "\n",
1377 | " longitude venueCateg week hour geometry week_attr \\\n",
1378 | "0 -73.974176 Gym Mon 17 POINT (-73.97418 40.75290) 1 \n",
1379 | "\n",
1380 | " category n_pt_0 n_pt_1 n_pt_2 \n",
1381 | "0 1 3556 9 22 "
1382 | ]
1383 | },
1384 | "execution_count": 23,
1385 | "metadata": {},
1386 | "output_type": "execute_result"
1387 | }
1388 | ],
1389 | "source": [
1390 | "gmm_data = nyc_check_sticc.copy()\n",
1391 | "gmm_data.head(1)"
1392 | ]
1393 | },
1394 | {
1395 | "cell_type": "code",
1396 | "execution_count": 24,
1397 | "metadata": {},
1398 | "outputs": [],
1399 | "source": [
1400 | "X = gmm_data[['hour', 'week_attr']].values"
1401 | ]
1402 | },
1403 | {
1404 | "cell_type": "code",
1405 | "execution_count": 25,
1406 | "metadata": {},
1407 | "outputs": [
1408 | {
1409 | "data": {
1410 | "text/html": [
1411 | "\n",
1412 | "\n",
1425 | "
\n",
1426 | " \n",
1427 | " \n",
1428 | " | \n",
1429 | " group | \n",
1430 | "
\n",
1431 | " \n",
1432 | " \n",
1433 | " \n",
1434 | " 0 | \n",
1435 | " 1 | \n",
1436 | "
\n",
1437 | " \n",
1438 | "
\n",
1439 | "
"
1440 | ],
1441 | "text/plain": [
1442 | " group\n",
1443 | "0 1"
1444 | ]
1445 | },
1446 | "execution_count": 25,
1447 | "metadata": {},
1448 | "output_type": "execute_result"
1449 | }
1450 | ],
1451 | "source": [
1452 | "gm = GaussianMixture(n_components=3).fit(X)\n",
1453 | "gmm = pd.DataFrame(gm.predict(X), columns=[\"group\"])\n",
1454 | "gmm.head(1)"
1455 | ]
1456 | },
1457 | {
1458 | "cell_type": "code",
1459 | "execution_count": 26,
1460 | "metadata": {},
1461 | "outputs": [
1462 | {
1463 | "name": "stdout",
1464 | "output_type": "stream",
1465 | "text": [
1466 | "Adjusted rand score 0.09072443404391904\n",
1467 | "Spatial contiguity: 0.6405630565929331\n",
1468 | "f1_score 0.4349576813996136 [3, 4, 1]\n"
1469 | ]
1470 | }
1471 | ],
1472 | "source": [
1473 | "result_nyc_check_sticc = nyc_check_sticc.join(gmm)\n",
1474 | "result_nyc_check_sticc = result_nyc_check_sticc.rename({\"category\": \"clus_group_gt\"}, axis=1)\n",
1475 | "print(\"Adjusted rand score\", adjusted_rand_score(result_nyc_check_sticc[\"group\"].values, \n",
1476 | " result_nyc_check_sticc.clus_group_gt.values))\n",
1477 | "sp_contiguity = cal_joint_statistic(result_nyc_check_sticc, w_voronoi)\n",
1478 | "print(\"Spatial contiguity: \", sp_contiguity)\n",
1479 | "get_max_f1_score(result_nyc_check_sticc)"
1480 | ]
1481 | }
1482 | ],
1483 | "metadata": {
1484 | "kernelspec": {
1485 | "display_name": "Python 3",
1486 | "language": "python",
1487 | "name": "python3"
1488 | },
1489 | "language_info": {
1490 | "codemirror_mode": {
1491 | "name": "ipython",
1492 | "version": 3
1493 | },
1494 | "file_extension": ".py",
1495 | "mimetype": "text/x-python",
1496 | "name": "python",
1497 | "nbconvert_exporter": "python",
1498 | "pygments_lexer": "ipython3",
1499 | "version": "3.8.3"
1500 | }
1501 | },
1502 | "nbformat": 4,
1503 | "nbformat_minor": 4
1504 | }
1505 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [](https://opensource.org/licenses/BSD-2-Clause)
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 | STICC: A multivariate spatial clustering method for repeated geographic pattern discovery with consideration of spatial contiguity
10 |
11 |
12 | GeoDS Lab, Department of Geography, University of Wisconsin-Madison.
13 |
14 |
15 |
16 |
17 |
18 | ## Table of Contents
19 |
20 | * [Citation](#citation)
21 | * [About the Project](#about-the-project)
22 | * [Code Usage](#code-usage)
23 | * [Folder Structure](#folder-structure)
24 | * [License](#license)
25 | * [Contact](#contact)
26 | * [Acknowledgements](#acknowledgements)
27 |
28 |
29 | ## Citation
30 | If you use this algorithm in your research or applications, please cite this source:
31 |
32 | Kang, Y., Wu, K., Gao, S., Ng, I., Rao, J., Ye, S., Zhang, F. and Fei, T. [STICC: A multivariate spatial clustering method for repeated geographic pattern discovery with consideration of spatial contiguity](https://doi.org/10.1080/13658816.2022.2053980). *International Journal of Geographical Information Science* (2022). DOI:10.1080/13658816.2022.2053980.
33 |
34 |
35 | ```
36 | @article{kang2022sticc,
37 | title = {STICC: A multivariate spatial clustering method for repeated geographic pattern discovery with consideration of spatial contiguity},
38 | author = {Kang, Yuhao and Wu, Kunlin and Gao, Song and Ng, Ignavier and Rao, Jinmeng and Ye, Shan and Zhang, Fan and Fei, Teng},
39 | journal = {International Journal of Geographical Information Science},
40 | doi = {10.1080/13658816.2022.2053980},
41 | year = {2022}
42 | }
43 | ```
44 |
45 |
46 | ## About The Project
47 | Spatial clustering has been widely used for spatial data mining and knowledge discovery. An ideal multivariate spatial clustering should consider both spatial contiguity and aspatial attributes. Existing spatial clustering approaches may face challenges for discovering repeated geographic patterns with spatial contiguity maintained. In this paper, we propose a Spatial Toeplitz Inverse Covariance-Based Clustering (STICC) method that considers both attributes and spatial relationships of geographic objects for multivariate spatial clustering. A subregion is created for each geographic object serving as the basic unit when performing clustering. A Markov random field (MRF) is then constructed to characterize the attribute dependencies of subregions. Using a spatial consistency strategy, nearby objects are encouraged to belong to the same cluster. To test the performance of the proposed STICC algorithm, we apply it in two use cases. The comparison results with several baseline methods show that the STICC outperforms others significantly in terms of adjusted rand index and macro-F1. Joint count statistics is also calculated and shows that the spatial contiguity is well preserved by STICC. Such a spatial clustering method may benefit various applications in the fields of geography, remote sensing, transportation, and urban planning, etc.
48 |
49 | The expected outcome of using STICC for spatial clustering is shown as follows:
50 |
51 |
52 |
53 |
54 | The general idea of the STICC algorithm is illustrated as follows:
55 |
56 |
57 |
58 |
59 |
60 | The STICC algorithm is developed based on the TICC algorithm:
61 |
62 | D. Hallac, S. Vare, S. Boyd, and J. Leskovec. Toeplitz Inverse Covariance-Based Clustering of Multivariate Time Series Data. *Proceedings of the 23rd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining* 215--223 (2017)
63 |
64 | GitHub: [TICC](https://github.com/davidhallac/TICC)
65 |
66 | ## Code Usage
67 |
68 | Environment: Python 3.7 or newer
69 | See requirements.txt
70 |
71 | #### Experiment Reproduction
72 | To reproduce the experiments in the paper, please check the three Jupyter notebooks: synthetic.ipynb, NYC_checkin.ipynb, NYC_checkin3.ipynb. All datasets have been uploaded in the folder data/
73 |
74 | #### Input Data Structure
75 | The input data should be a .txt file with a .csv structure. The first column (column 0) indicates the unique identifier of the geographic object. The following columns indicate the attributes of the geographic object. The last several columns indicate the nearest neighbors of the geographic object.
76 |
77 | For instance, given the following two objects:
78 | ```
79 | 0,4.471435163732493,2.158530256342078,96.54097808132826,1016.5109582462767,997.3221602361555,41,78,45
80 | 1,2.8090243052935353,2.1454885080772383,68.55061966023295,1701.5536144719163,1001.8594793592364,11,80,35
81 | ```
82 |
83 | Column 0 indicates the id of the object, columns 1-5 show attributes of geographic objects, columns 6-8 are nearest neighbors. For the object 0, its nearest neighbor is object 41, its second nearest neighbor is object 78, and its third nearest neighbor is object 45.
84 |
85 |
86 | #### Execute Python Code
87 | To perform STICC on your own dataset, please run the following code in python.
88 |
89 | Usage:
90 |
91 | ```
92 | python STICC_main.py --fname=[input_data] --oname=[output_data] \
93 | --attr_idx_start=[attr_idx_start] --attr_idx_end=[attr_idx_end] \
94 | --spatial_idx_start=[spatial_idx_start] --spatial_idx_end=[spatial_idx_end] --spatial_radius=[spatial_radius] \
95 | --number_of_clusters=[number_of_clusters] --lambda_parameter=[lambda_parameter] --beta=[beta] --maxIters=[maxIters]
96 | ```
97 |
98 |
99 | ```
100 | --fname, input data name
101 | --oname, output file name
102 | --attr_idx_start, attribute start index
103 | --attr_idx_end, attribute end index
104 | --spatial_idx_start, neighboring object start index
105 | --spatial_idx_end, neighboring object end index
106 | --spatial_radius, radius of subregion
107 | --number_of_clusters, number of clusters
108 | --lambda_parameter, lambda
109 | --beta, beta
110 | --maxIters, maximum iterations
111 | ```
112 |
113 |
114 | Example:
115 | Perform STICC on the synthetic_data.txt with spatial radius=3 and beta=3.
116 | ```
117 | python STICC_main.py --fname=synthetic_data.txt --oname=result_synthetic_data.txt \
118 | --attr_idx_start=1 --attr_idx_end=5 --spatial_idx_start=6 --spatial_idx_end=8 \
119 | --spatial_radius=3 --number_of_clusters 7 --lambda_parameter 0.01 --beta 3 --maxIters 20
120 | ```
121 |
122 |
123 | If you meet the following error:
124 | ```
125 | numpy.linalg.LinAlgError: Eigenvalues did not converge
126 | ```
127 |
128 | A potential solution is to standardize your dataset.
129 |
130 | ## Folder Structure
131 | The folders and files are organized as follows.
132 | ```
133 | project
134 | |-- data
135 | |-- images
136 | |-- src
137 | | |-- __init__.py
138 | | |-- admm_solver.py
139 | | `-- STICC_helper.py
140 | |-- STICC_main.py
141 | |-- STICC_solver.py
142 | |-- synthetic.ipynb
143 | |-- NYC_checkin.ipynb
144 | `-- NYC_checkin3.ipynb
145 | ```
146 |
147 |
148 | ## License
149 |
150 | Distributed under the BSD License. See `LICENSE` for more information.
151 |
152 |
153 | ## Contact
154 |
155 | Yuhao Kang - [@YuhaoKang](https://twitter.com/YuhaoKang) - yuhao.kang at wisc.edu
156 | Song Gao - [@gissong](https://twitter.com/gissong) - song.gao at wisc.edu
157 |
158 | Project Link: [https://github.com/GeoDS/STICC](https://github.com/GeoDS/STICC)
159 |
160 |
161 | ## Acknowledgements
162 |
163 | Code inherits from [TICC](https://github.com/davidhallac/TICC).
164 |
165 | Yuhao Kang acknowledges the support by the Trewartha Research Award, Department of the Geography, University of Wisconsin-Madison. Song Gao and Jinmeng Rao acknowledge the support by the American Family Insurance Data Science Institute at the University of Wisconsin-Madison and the National Science Foundation funded AI institute (Grant No.2112606) for Intelligent Cyberinfrastructure with Computational Learning in the Environment (ICICLE). Fan Zhang would like to thank the support by the National Natural Science Foundation of China under Grant 41901321. Any opinions, findings, and conclusions or recommendations expressed in this material are those of the author(s) and do not necessarily reflect the views of the funders.
166 |
167 |
168 | [license-shield]: https://img.shields.io/github/license/othneildrew/Best-README-Template.svg?style=flat-square
169 | [license-url]: https://github.com/GeoDS/COVID19USFlows/blob/master/LICENSE.txt
170 |
--------------------------------------------------------------------------------
/STICC_main.py:
--------------------------------------------------------------------------------
"""Command-line entry point: parse options, run the STICC solver, persist results."""
from STICC_solver import STICC
import numpy as np
import argparse

# (flag, type, default, help) for every STICC command-line option.
_ARG_SPECS = [
    ('--fname', str, "synthetic_data.txt", 'Input data name'),
    ('--oname', str, "result_synthetic_data.txt", 'Output file name'),
    ('--attr_idx_start', int, 1, 'Attribute start index'),
    ('--attr_idx_end', int, 5, 'Attribute end index'),
    ('--spatial_idx_start', int, 6, 'Neighbouring object start index'),
    ('--spatial_idx_end', int, 8, 'Neighbouring object end index'),
    ('--spatial_radius', int, 3, 'Radius of the subregion'),
    ('--number_of_clusters', int, 5, 'Number of clusters'),
    ('--lambda_parameter', float, 0.1, 'Lambda'),
    ('--beta', float, 5, 'Beta'),
    ('--maxIters', int, 20, 'Max Iterations'),
]

parser = argparse.ArgumentParser(description='Parameters of the STICC')
for flag, arg_type, default, help_text in _ARG_SPECS:
    parser.add_argument(flag, type=arg_type, default=default, help=help_text)
args = parser.parse_args()

# Run the solver with the parsed configuration.
sticc = STICC(spatial_radius=args.spatial_radius, number_of_clusters=args.number_of_clusters,
              lambda_parameter=args.lambda_parameter, beta=args.beta, maxIters=args.maxIters,
              threshold=2e-5, write_out_file=False, prefix_string="output_folder/", num_proc=1,
              attr_idx_start=args.attr_idx_start, attr_idx_end=args.attr_idx_end,
              spatial_idx_start=args.spatial_idx_start, spatial_idx_end=args.spatial_idx_end)
cluster_assignment, cluster_MRFs = sticc.fit(input_file=args.fname)

# Persist the per-object cluster labels as a one-column text file.
print(cluster_assignment)
np.savetxt(args.oname, cluster_assignment, fmt='%d', delimiter=',')

# Persist each cluster's MRF (inverse covariance) as a .npy file.
for key, value in cluster_MRFs.items():
    with open(f'output_folder/MRF_{args.fname.split(".")[0]}_{key}.npy', 'wb') as f:
        np.save(f, np.array(value))
44 |
--------------------------------------------------------------------------------
/STICC_solver.py:
--------------------------------------------------------------------------------
1 | from src.admm_solver import ADMMSolver
2 | from src.STICC_helper import *
3 | from multiprocessing import Pool
4 | import pandas as pd
5 | from sklearn.cluster import KMeans
6 | from sklearn import mixture
7 | import matplotlib.pyplot as plt
8 | import numpy as np
9 | import math
10 | import time
11 | import collections
12 | import os
13 | import errno
14 | import sys
15 | import code
16 | import random
17 | import matplotlib
18 | matplotlib.use('Agg')
19 |
20 |
21 | class STICC:
22 | def __init__(self, spatial_radius=1, number_of_clusters=5, lambda_parameter=11e-2,
23 | beta=400, maxIters=1000, threshold=2e-5, write_out_file=False,
24 | prefix_string="", num_proc=1, cluster_reassignment=20, biased=False,
25 | attr_idx_start=0, attr_idx_end=0, spatial_idx_start=0, spatial_idx_end=0):
26 | """
27 | Parameters:
28 | - spatial_radius: size of the subregion
29 | - number_of_clusters: number of clusters
30 | - lambda_parameter: sparsity parameter
31 | - switch_penalty: temporal consistency parameter
32 | - maxIters: number of iterations
33 | - threshold: convergence threshold
34 | - write_out_file: (bool) if true, prefix_string is output file dir
35 | - prefix_string: output directory if necessary
36 | - cluster_reassignment: number of points to reassign to a 0 cluster
37 | - biased: Using the biased or the unbiased covariance
38 | """
39 | self.spatial_radius = spatial_radius
40 | self.number_of_clusters = number_of_clusters
41 | self.lambda_parameter = lambda_parameter
42 | self.switch_penalty = beta
43 | self.maxIters = maxIters
44 | self.threshold = threshold
45 | self.write_out_file = write_out_file
46 | self.prefix_string = prefix_string
47 | self.num_proc = num_proc
48 | self.cluster_reassignment = cluster_reassignment
49 | self.num_blocks = self.spatial_radius + 1
50 | self.biased = biased
51 | self.attr_idx_start = attr_idx_start
52 | self.attr_idx_end = attr_idx_end
53 | self.spatial_idx_start = spatial_idx_start
54 | self.spatial_idx_end = spatial_idx_end
55 | self.spatial_series_index = []
56 | self.spatial_series_close = []
57 | self.spatial_series_closest = []
58 | pd.set_option('display.max_columns', 500)
59 | np.set_printoptions(
60 | formatter={'float': lambda x: "{0:0.4f}".format(x)})
61 | np.random.seed(102)
62 |
    def fit(self, input_file):
        """
        Run the full STICC training loop on the data in ``input_file``.

        Parameters:
            - input_file: location of the data file (csv-structured .txt;
              column 0 is the object id, attribute and neighbour columns
              as configured via attr_idx_* / spatial_idx_*)

        Returns:
            (clustered_points, train_cluster_inverse): per-object cluster
            assignment and the per-cluster inverse covariance (MRF) dict.
        """
        assert self.maxIters > 0  # must have at least one iteration
        self.log_parameters()

        # Get data into proper format: split the raw array into attribute
        # columns and neighbour-index columns.

        total_arr, total_rows_size, total_cols_size = self.load_data(
            input_file)
        spatial_series_arr = total_arr[:,
                                       self.attr_idx_start:self.attr_idx_end+1]
        spatial_series_rows_size = total_rows_size
        spatial_series_col_size = self.attr_idx_end - self.attr_idx_start + 1
        spatial_series_index = total_arr[:, 0]
        spatial_series_close = total_arr[:,
                                         self.spatial_idx_start:self.spatial_idx_end+1]
        print(spatial_series_col_size, spatial_series_arr.shape,
              spatial_series_close.shape)
        self.spatial_series_closest = spatial_series_close[:, 0]
        self.spatial_series_index = spatial_series_index
        self.spatial_series_close = spatial_series_close

        ############
        # The basic folder to be created
        str_NULL = self.prepare_out_directory()

        # Train test split: every point is used for training.
        training_indices = spatial_series_index
        num_train_points = len(training_indices)

        # Stack the training data so each row holds a point's full subregion.
        # NOTE(review): spatial_series_col_size is passed for two different
        # parameters here — confirm against stack_training_data's signature.
        complete_D_train = self.stack_training_data(total_arr, spatial_series_col_size, num_train_points,
                                                    training_indices, spatial_series_col_size)

        # Initialization of the cluster assignment.
        # Gaussian Mixture
        gmm = mixture.GaussianMixture(
            n_components=self.number_of_clusters, covariance_type="full")
        gmm.fit(complete_D_train)
        clustered_points = gmm.predict(complete_D_train)
        gmm_clustered_pts = clustered_points + 0  # copy kept for the final confusion matrix
        # K-means labels overwrite the GMM labels as the actual starting point
        kmeans = KMeans(n_clusters=self.number_of_clusters,
                        n_init=300, random_state=0).fit(complete_D_train)
        clustered_points = kmeans.labels_
        # NOTE(review): the next two names hold the same label array; both are
        # kept only for the confusion-matrix bookkeeping below.
        clustered_points_kmeans = kmeans.labels_
        kmeans_clustered_pts = kmeans.labels_

        train_cluster_inverse = {}
        log_det_values = {}  # log dets of the thetas
        computed_covariance = {}
        cluster_mean_info = {}
        cluster_mean_stacked_info = {}
        old_clustered_points = None  # points from last iteration

        empirical_covariances = {}

        # PERFORM TRAINING ITERATIONS
        pool = Pool(processes=self.num_proc)  # multi-threading
        for iters in range(self.maxIters):
            print("\n\n\nITERATION ###", iters)
            # Group point indices by their current cluster label.
            train_clusters_arr = collections.defaultdict(
                list)  # {cluster: [point indices]}
            for point, cluster_num in enumerate(clustered_points):
                train_clusters_arr[cluster_num].append(point)

            len_train_clusters = {
                k: len(train_clusters_arr[k]) for k in range(self.number_of_clusters)}

            # train_clusters holds the indices in complete_D_train
            # for each of the clusters
            opt_res = self.train_clusters(cluster_mean_info, cluster_mean_stacked_info, complete_D_train,
                                          empirical_covariances, len_train_clusters, spatial_series_col_size, pool,
                                          train_clusters_arr)

            self.optimize_clusters(computed_covariance, len_train_clusters, log_det_values, opt_res,
                                   train_cluster_inverse)

            # update old computed covariance
            # NOTE(review): this binds a second name to the SAME dict — not a
            # copy — so later writes to computed_covariance are also visible
            # through old_computed_covariance. Confirm this is intentional.
            old_computed_covariance = computed_covariance

            print("UPDATED THE OLD COVARIANCE")

            self.trained_model = {'cluster_mean_info': cluster_mean_info,
                                  'computed_covariance': computed_covariance,
                                  'cluster_mean_stacked_info': cluster_mean_stacked_info,
                                  'complete_D_train': complete_D_train,
                                  'spatial_series_col_size': spatial_series_col_size}
            clustered_points = self.predict_clusters()

            # recalculate lengths after reassignment by predict_clusters
            new_train_clusters = collections.defaultdict(
                list)  # {cluster: [point indices]}
            for point, cluster in enumerate(clustered_points):
                new_train_clusters[cluster].append(point)

            len_new_train_clusters = {
                k: len(new_train_clusters[k]) for k in range(self.number_of_clusters)}

            before_empty_cluster_assign = clustered_points.copy()

            if iters != 0:
                cluster_norms = [(np.linalg.norm(old_computed_covariance[self.number_of_clusters, i]), i) for i in
                                 range(self.number_of_clusters)]
                norms_sorted = sorted(cluster_norms, reverse=True)
                # clusters that are not 0 as sorted by norm
                valid_clusters = [
                    cp[1] for cp in norms_sorted if len_new_train_clusters[cp[1]] != 0]

                # Add a point to the empty clusters
                # assuming more non empty clusters than empty ones
                counter = 0
                for cluster_num in range(self.number_of_clusters):
                    if len_new_train_clusters[cluster_num] == 0:
                        # borrow points from a non-empty cluster (largest norm first)
                        cluster_selected = valid_clusters[counter]
                        counter = (counter + 1) % len(valid_clusters)
                        print("cluster that is zero is:", cluster_num,
                              "selected cluster instead is:", cluster_selected)
                        start_point = np.random.choice(
                            new_train_clusters[cluster_selected])  # random point number from that cluster
                        for i in range(0, self.cluster_reassignment):
                            # put cluster_reassignment points from point_num in this cluster
                            point_to_move = start_point + i
                            if point_to_move >= len(clustered_points):
                                break
                            clustered_points[point_to_move] = cluster_num
                            computed_covariance[self.number_of_clusters, cluster_num] = old_computed_covariance[
                                self.number_of_clusters, cluster_selected]
                            cluster_mean_stacked_info[self.number_of_clusters, cluster_num] = complete_D_train[
                                point_to_move, :]
                            cluster_mean_info[self.number_of_clusters, cluster_num] \
                                = complete_D_train[point_to_move, :][
                                (self.spatial_radius - 1) * spatial_series_col_size:self.spatial_radius * spatial_series_col_size]

            for cluster_num in range(self.number_of_clusters):
                print("length of cluster #", cluster_num, "-------->",
                      sum([x == cluster_num for x in clustered_points]))

            # TEST SETS STUFF
            # LLE + switching_penalty
            # Segment length
            # Confusion matrices of the current, GMM and k-means assignments.
            train_confusion_matrix_EM = compute_confusion_matrix(self.number_of_clusters, clustered_points,
                                                                 training_indices)
            train_confusion_matrix_GMM = compute_confusion_matrix(self.number_of_clusters, gmm_clustered_pts,
                                                                  training_indices)
            train_confusion_matrix_kmeans = compute_confusion_matrix(self.number_of_clusters, kmeans_clustered_pts,
                                                                     training_indices)
            # compute the matchings
            matching_EM, matching_GMM, matching_Kmeans = self.compute_matches(train_confusion_matrix_EM,
                                                                              train_confusion_matrix_GMM,
                                                                              train_confusion_matrix_kmeans)

            print("\n\n\n")

            # Convergence: stop once the assignment no longer changes.
            if np.array_equal(old_clustered_points, clustered_points):
                print("\n\n\n\nCONVERGED!!! BREAKING EARLY!!!")
                break
            old_clustered_points = before_empty_cluster_assign
        # end of training
        if pool is not None:
            pool.close()
            pool.join()
        train_confusion_matrix_EM = compute_confusion_matrix(self.number_of_clusters, clustered_points,
                                                             training_indices)
        train_confusion_matrix_GMM = compute_confusion_matrix(self.number_of_clusters, gmm_clustered_pts,
                                                              training_indices)
        train_confusion_matrix_kmeans = compute_confusion_matrix(self.number_of_clusters, clustered_points_kmeans,
                                                                 training_indices)

        return clustered_points, train_cluster_inverse
241 |
242 | def compute_matches(self, train_confusion_matrix_EM, train_confusion_matrix_GMM, train_confusion_matrix_kmeans):
243 | matching_Kmeans = find_matching(train_confusion_matrix_kmeans)
244 | matching_GMM = find_matching(train_confusion_matrix_GMM)
245 | matching_EM = find_matching(train_confusion_matrix_EM)
246 | correct_e_m = 0
247 | correct_g_m_m = 0
248 | correct_k_means = 0
249 | for cluster in range(self.number_of_clusters):
250 | matched_cluster_e_m = matching_EM[cluster]
251 | matched_cluster_g_m_m = matching_GMM[cluster]
252 | matched_cluster_k_means = matching_Kmeans[cluster]
253 |
254 | correct_e_m += train_confusion_matrix_EM[cluster,
255 | matched_cluster_e_m]
256 | correct_g_m_m += train_confusion_matrix_GMM[cluster,
257 | matched_cluster_g_m_m]
258 | correct_k_means += train_confusion_matrix_kmeans[cluster,
259 | matched_cluster_k_means]
260 | return matching_EM, matching_GMM, matching_Kmeans
261 |
    def smoothen_clusters(self, cluster_mean_info, computed_covariance,
                          cluster_mean_stacked_info, complete_D_train, n):
        """Compute, for every point, the negative log-likelihood-style score
        against each cluster's Gaussian model.

        Args:
            cluster_mean_info: dict keyed by (number_of_clusters, cluster) ->
                per-cluster mean of the last attribute block (unused below
                apart from a lookup; kept for interface parity).
            computed_covariance: dict keyed by (number_of_clusters, cluster) ->
                covariance matrix of the stacked observations.
            cluster_mean_stacked_info: dict keyed by (number_of_clusters,
                cluster) -> mean of the full stacked observation vector.
            complete_D_train: stacked data, one row per point.
            n: size of a single observation vector.

        Returns:
            Array of shape (len(complete_D_train), number_of_clusters) with
            the LLE score of each point under each cluster; rows near the end
            whose stacked window would run past the data are left at zero.
        """
        clustered_points_len = len(complete_D_train)
        inv_cov_dict = {}  # cluster to inv_cov
        log_det_dict = {}  # cluster to log_det
        # Pre-compute each cluster's inverse covariance and log-determinant
        # once, so the per-point loop below only does matrix-vector products.
        for cluster in range(self.number_of_clusters):
            # NOTE(review): the slice bound (2*(num_blocks-1)-1)*n is assumed
            # to equal spatial_radius*n (the reshape below requires it) —
            # confirm against how num_blocks/spatial_radius are set.
            cov_matrix = computed_covariance[self.number_of_clusters, cluster][0:(2 * (self.num_blocks - 1)-1) * n,
                                                                              0:(2 * (self.num_blocks - 1)-1) * n]
            inv_cov_matrix = np.linalg.inv(cov_matrix)
            log_det_cov = np.log(np.linalg.det(cov_matrix)
                                 )  # log(det(sigma2|1))
            inv_cov_dict[cluster] = inv_cov_matrix
            log_det_dict[cluster] = log_det_cov
        # For each point compute the LLE
        print("beginning the smoothening ALGORITHM")
        LLE_all_points_clusters = np.zeros(
            [clustered_points_len, self.number_of_clusters])
        for point in range(clustered_points_len):
            # Skip points whose stacked window would extend past the data.
            if point + self.spatial_radius - 1 < complete_D_train.shape[0]:
                for cluster in range(self.number_of_clusters):
                    cluster_mean = cluster_mean_info[self.number_of_clusters, cluster]
                    cluster_mean_stacked = cluster_mean_stacked_info[self.number_of_clusters, cluster]
                    # Center the stacked observation on the cluster mean.
                    x = complete_D_train[point, :] - \
                        cluster_mean_stacked[0:(
                            2 * (self.num_blocks - 1)-1) * n]
                    inv_cov_matrix = inv_cov_dict[cluster]
                    log_det_cov = log_det_dict[cluster]
                    # Mahalanobis-style quadratic form x^T * Sigma^-1 * x plus
                    # the log-determinant term.
                    lle = np.dot(x.reshape([1, (self.spatial_radius) * n]),
                                 np.dot(inv_cov_matrix, x.reshape([n * (self.spatial_radius), 1]))) + log_det_cov
                    LLE_all_points_clusters[point, cluster] = lle

        return LLE_all_points_clusters
294 |
295 | def optimize_clusters(self, computed_covariance, len_train_clusters, log_det_values, optRes, train_cluster_inverse):
296 | for cluster in range(self.number_of_clusters):
297 | if optRes[cluster] == None:
298 | continue
299 | val = optRes[cluster].get()
300 | print("OPTIMIZATION for Cluster #", cluster, "DONE!!!")
301 | # THIS IS THE SOLUTION
302 | S_est = upperToFull(val, 0)
303 | X2 = S_est
304 | u, _ = np.linalg.eig(S_est)
305 | cov_out = np.linalg.inv(X2)
306 |
307 | # Store the log-det, covariance, inverse-covariance, cluster means, stacked means
308 | log_det_values[self.number_of_clusters,
309 | cluster] = np.log(np.linalg.det(cov_out))
310 | computed_covariance[self.number_of_clusters, cluster] = cov_out
311 | train_cluster_inverse[cluster] = X2
312 | for cluster in range(self.number_of_clusters):
313 | print("length of the cluster ", cluster,
314 | "------>", len_train_clusters[cluster])
315 |
316 | def train_clusters(self, cluster_mean_info, cluster_mean_stacked_info, complete_D_train, empirical_covariances,
317 | len_train_clusters, n, pool, train_clusters_arr):
318 | optRes = [None for i in range(self.number_of_clusters)]
319 | for cluster in range(self.number_of_clusters):
320 | cluster_length = len_train_clusters[cluster]
321 | if cluster_length != 0:
322 | size_blocks = n
323 | indices = train_clusters_arr[cluster]
324 | D_train = np.zeros([cluster_length, (self.spatial_radius) * n])
325 | for i in range(cluster_length):
326 | point = indices[i]
327 | D_train[i, :] = complete_D_train[point, :]
328 |
329 | cluster_mean_info[self.number_of_clusters, cluster] = np.mean(D_train, axis=0)[
330 | (
331 | self.spatial_radius - 1) * n:self.spatial_radius * n].reshape(
332 | [1, n])
333 | cluster_mean_stacked_info[self.number_of_clusters, cluster] = np.mean(
334 | D_train, axis=0)
335 | # Fit a model - OPTIMIZATION
336 | probSize = (self.spatial_radius) * size_blocks
337 | lamb = np.zeros((probSize, probSize)) + self.lambda_parameter
338 | S = np.cov(np.transpose(D_train), bias=self.biased)
339 | empirical_covariances[cluster] = S
340 |
341 | rho = 1
342 | solver = ADMMSolver(
343 | lamb, (self.spatial_radius), size_blocks, 1, S)
344 | # apply to process pool
345 | optRes[cluster] = pool.apply_async(
346 | solver, (1000, 1e-6, 1e-6, False,))
347 | return optRes
348 |
349 | def stack_training_data(self, Data, n, num_train_points, training_indices, spatial_cols_size):
350 | complete_D_train = np.zeros(
351 | [num_train_points, self.spatial_radius * n])
352 | # STICC data stack
353 | for i in range(num_train_points):
354 | for k in range(self.spatial_radius):
355 | if k == 0:
356 | complete_D_train[i][k * n:(k + 1) * n] = Data[i][1:(n + 1)]
357 | else:
358 | complete_D_train[i][k * n:(k + 1) *
359 | n] = Data[int(Data[i][n + k])][1:(n + 1)]
360 | return complete_D_train
361 |
362 | def prepare_out_directory(self):
363 | str_NULL = self.prefix_string
364 | if not os.path.exists(os.path.dirname(str_NULL)):
365 | try:
366 | os.makedirs(os.path.dirname(str_NULL))
367 | except OSError as exc: # Guard against race condition of path already existing
368 | if exc.errno != errno.EEXIST:
369 | raise
370 |
371 | return str_NULL
372 |
373 | def load_data(self, input_file):
374 | Data = np.loadtxt(input_file, delimiter=",")
375 | (m, n) = Data.shape # m: num of observations, n: size of observation vector
376 | print("completed getting the data")
377 | return Data, m, n
378 |
379 | def log_parameters(self):
380 | print("lam_sparse", self.lambda_parameter)
381 | print("switch_penalty", self.switch_penalty)
382 | print("num_cluster", self.number_of_clusters)
383 | print("num stacked", self.spatial_radius)
384 |
385 | def predict_clusters(self, test_data=None):
386 | '''
387 | Given the current trained model, predict clusters. If the cluster segmentation has not been optimized yet,
388 | than this will be part of the interative process.
389 |
390 | Args:
391 | numpy array of data for which to predict clusters. Columns are dimensions of the data, each row is
392 | a different timestamp
393 |
394 | Returns:
395 | vector of predicted cluster for the points
396 | '''
397 | if test_data is not None:
398 | if not isinstance(test_data, np.ndarray):
399 | raise TypeError("input must be a numpy array!")
400 | else:
401 | test_data = self.trained_model['complete_D_train']
402 |
403 | # SMOOTHENING
404 | lle_all_points_clusters = self.smoothen_clusters(self.trained_model['cluster_mean_info'],
405 | self.trained_model['computed_covariance'],
406 | self.trained_model['cluster_mean_stacked_info'],
407 | test_data,
408 | self.trained_model['spatial_series_col_size'])
409 |
410 | # Update cluster points - using NEW smoothening
411 | clustered_points = updateClusters(lle_all_points_clusters, switch_penalty=self.switch_penalty, spatial_series_index=self.spatial_series_index,
412 | spatial_series_closest=self.spatial_series_closest, spatial_radius=self.spatial_radius)
413 |
414 | return(clustered_points)
415 |
--------------------------------------------------------------------------------
/data/nyc_checkin.cpg:
--------------------------------------------------------------------------------
1 | ISO-8859-1
--------------------------------------------------------------------------------
/data/nyc_checkin.dbf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeoDS/STICC/6848a613198e55317c744fa3df423b68513b7fab/data/nyc_checkin.dbf
--------------------------------------------------------------------------------
/data/nyc_checkin.shp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeoDS/STICC/6848a613198e55317c744fa3df423b68513b7fab/data/nyc_checkin.shp
--------------------------------------------------------------------------------
/data/nyc_checkin.shx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeoDS/STICC/6848a613198e55317c744fa3df423b68513b7fab/data/nyc_checkin.shx
--------------------------------------------------------------------------------
/data/nyc_checkin.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeoDS/STICC/6848a613198e55317c744fa3df423b68513b7fab/data/nyc_checkin.zip
--------------------------------------------------------------------------------
/data/nyc_checkin_sticc.cpg:
--------------------------------------------------------------------------------
1 | ISO-8859-1
--------------------------------------------------------------------------------
/data/nyc_checkin_sticc.shp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeoDS/STICC/6848a613198e55317c744fa3df423b68513b7fab/data/nyc_checkin_sticc.shp
--------------------------------------------------------------------------------
/data/nyc_checkin_sticc.shx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeoDS/STICC/6848a613198e55317c744fa3df423b68513b7fab/data/nyc_checkin_sticc.shx
--------------------------------------------------------------------------------
/data/nyc_checkin_sticc3.cpg:
--------------------------------------------------------------------------------
1 | ISO-8859-1
--------------------------------------------------------------------------------
/data/nyc_checkin_sticc3.shp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeoDS/STICC/6848a613198e55317c744fa3df423b68513b7fab/data/nyc_checkin_sticc3.shp
--------------------------------------------------------------------------------
/data/nyc_checkin_sticc3.shx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeoDS/STICC/6848a613198e55317c744fa3df423b68513b7fab/data/nyc_checkin_sticc3.shx
--------------------------------------------------------------------------------
/data/sticc_points.dbf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeoDS/STICC/6848a613198e55317c744fa3df423b68513b7fab/data/sticc_points.dbf
--------------------------------------------------------------------------------
/data/sticc_points.shp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeoDS/STICC/6848a613198e55317c744fa3df423b68513b7fab/data/sticc_points.shp
--------------------------------------------------------------------------------
/data/sticc_points.shx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeoDS/STICC/6848a613198e55317c744fa3df423b68513b7fab/data/sticc_points.shx
--------------------------------------------------------------------------------
/data/sticc_points_spatial_multivariate.cpg:
--------------------------------------------------------------------------------
1 | UTF-8
--------------------------------------------------------------------------------
/data/sticc_points_spatial_multivariate.dbf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeoDS/STICC/6848a613198e55317c744fa3df423b68513b7fab/data/sticc_points_spatial_multivariate.dbf
--------------------------------------------------------------------------------
/data/sticc_points_spatial_multivariate.prj:
--------------------------------------------------------------------------------
1 | PROJCS["WGS_1984_Web_Mercator_Auxiliary_Sphere",GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137.0,298.257223563]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]],PROJECTION["Mercator_Auxiliary_Sphere"],PARAMETER["False_Easting",0.0],PARAMETER["False_Northing",0.0],PARAMETER["Central_Meridian",0.0],PARAMETER["Standard_Parallel_1",0.0],PARAMETER["Auxiliary_Sphere_Type",0.0],UNIT["Meter",1.0]]
--------------------------------------------------------------------------------
/data/sticc_points_spatial_multivariate.sbn:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeoDS/STICC/6848a613198e55317c744fa3df423b68513b7fab/data/sticc_points_spatial_multivariate.sbn
--------------------------------------------------------------------------------
/data/sticc_points_spatial_multivariate.shp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeoDS/STICC/6848a613198e55317c744fa3df423b68513b7fab/data/sticc_points_spatial_multivariate.shp
--------------------------------------------------------------------------------
/data/sticc_points_spatial_multivariate.shp.xml:
--------------------------------------------------------------------------------
1 |
2 | 20210624162713001.0TRUEFeatureClassToFeatureClass sticc_points "G:\My Drive\Documents\ArcGIS\Projects\STICC\STICC" sticc_points_spatial_multivariate.shp # "SOURCE_ID "SOURCE_ID" false false true 4 Long 0 0,First,#,G:\My Drive\Documents\ArcGIS\Projects\STICC\STICC\sticc_points.shp,sticc_points.FID,-1,-1;synthetic_ "Field2" true true false 8 Double 0 0,First,#,G:\My Drive\Documents\ArcGIS\Projects\STICC\STICC\sticc_points.shp,synthetic_data.txt.Field2,-1,-1;syntheti_1 "Field3" true true false 8 Double 0 0,First,#,G:\My Drive\Documents\ArcGIS\Projects\STICC\STICC\sticc_points.shp,synthetic_data.txt.Field3,-1,-1;syntheti_2 "Field4" true true false 8 Double 0 0,First,#,G:\My Drive\Documents\ArcGIS\Projects\STICC\STICC\sticc_points.shp,synthetic_data.txt.Field4,-1,-1;syntheti_3 "Field5" true true false 8 Double 0 0,First,#,G:\My Drive\Documents\ArcGIS\Projects\STICC\STICC\sticc_points.shp,synthetic_data.txt.Field5,-1,-1;syntheti_4 "Field6" true true false 8 Double 0 0,First,#,G:\My Drive\Documents\ArcGIS\Projects\STICC\STICC\sticc_points.shp,synthetic_data.txt.Field6,-1,-1" #SpatiallyConstrainedMultivariateClustering sticc_points "G:\My Drive\Documents\ArcGIS\Projects\STICC\STICC\sticc_points_spatial_multivariate.shp" synthetic_data.txt.Field2;synthetic_data.txt.Field3;synthetic_data.txt.Field4;synthetic_data.txt.Field5;synthetic_data.txt.Field6 None # # # 7 "Trimmed Delaunay triangulation" # 100 
#{"type":"CIMLayerDocument","version":"2.5.0","build":22081,"layers":["CIMPATH=map2/sticc_points_spatial_multivariate.xml"],"layerDefinitions":[{"type":"CIMFeatureLayer","name":"sticc_points_spatial_multivariate","uRI":"CIMPATH=map2/sticc_points_spatial_multivariate.xml","sourceModifiedTime":{"type":"TimeInstant"},"useSourceMetadata":true,"description":"sticc_points_spatial_multivariate","layerElevation":{"type":"CIMLayerElevationSurface","mapElevationID":"{46382D1F-EE9C-43F7-BD65-E563D4E9A7D8}"},"expanded":true,"layerType":"Operational","showLegends":true,"visibility":true,"displayCacheType":"Permanent","maxDisplayCacheAge":5,"showPopups":true,"serviceLayerID":-1,"charts":[{"type":"CIMChart","name":"Spatially Constrained Multivariate Clustering Box-Plots","series":[{"type":"CIMChartBoxPlotSeries","uniqueName":"Series0","fields":["","synthetic_","syntheti_1","syntheti_2","syntheti_3","syntheti_4"],"orderFields":[""],"verticalAxis":1,"colorType":"SingleColor","orderFieldsSortTypes":[0],"visible":true,"fillSymbolProperties":{"type":"CIMChartFillSymbolProperties","color":{"type":"CIMRGBColor","values":[178,178,178,100]}},"verticalOrientation":true,"standardizeValues":true},{"type":"CIMChartLineSeries","name":"1","uniqueName":"Series1","fields":["","synthetic_","syntheti_1","syntheti_2","syntheti_3","syntheti_4"],"orderFields":[""],"whereClause":"CLUSTER_ID = 
1","verticalAxis":1,"colorType":"SingleColor","fieldAggregation":["","MEAN"],"orderFieldsSortTypes":[0],"visible":true,"lineSymbolProperties":{"type":"CIMChartLineSymbolProperties","visible":true,"width":2,"style":"Solid","color":{"type":"CIMRGBColor","values":[31,120,180,100]}},"markerSymbolProperties":{"type":"CIMChartMarkerSymbolProperties","visible":true,"width":2,"height":2,"style":"Circle","color":{"type":"CIMRGBColor","values":[31,120,180,100]}},"timeAggregationType":"EqualIntervalsFromStartTime","timeIntervalUnits":"esriTimeUnitsMonths","timeIntervalSize":-1,"calculateAutomaticTimeInterval":true,"trimIncompleteTimeInterval":true,"nullPolicy":"Null","verticalOrientation":true},{"type":"CIMChartLineSeries","name":"2","uniqueName":"Series2","fields":["","synthetic_","syntheti_1","syntheti_2","syntheti_3","syntheti_4"],"orderFields":[""],"whereClause":"CLUSTER_ID = 2","verticalAxis":1,"colorType":"SingleColor","fieldAggregation":["","MEAN"],"orderFieldsSortTypes":[0],"visible":true,"lineSymbolProperties":{"type":"CIMChartLineSymbolProperties","visible":true,"width":2,"style":"Solid","color":{"type":"CIMRGBColor","values":[178,223,138,100]}},"markerSymbolProperties":{"type":"CIMChartMarkerSymbolProperties","visible":true,"width":2,"height":2,"style":"Circle","color":{"type":"CIMRGBColor","values":[178,223,138,100]}},"timeAggregationType":"EqualIntervalsFromStartTime","timeIntervalUnits":"esriTimeUnitsMonths","timeIntervalSize":-1,"calculateAutomaticTimeInterval":true,"trimIncompleteTimeInterval":true,"nullPolicy":"Null","verticalOrientation":true},{"type":"CIMChartLineSeries","name":"3","uniqueName":"Series3","fields":["","synthetic_","syntheti_1","syntheti_2","syntheti_3","syntheti_4"],"orderFields":[""],"whereClause":"CLUSTER_ID = 
3","verticalAxis":1,"colorType":"SingleColor","fieldAggregation":["","MEAN"],"orderFieldsSortTypes":[0],"visible":true,"lineSymbolProperties":{"type":"CIMChartLineSymbolProperties","visible":true,"width":2,"style":"Solid","color":{"type":"CIMRGBColor","values":[51,160,44,100]}},"markerSymbolProperties":{"type":"CIMChartMarkerSymbolProperties","visible":true,"width":2,"height":2,"style":"Circle","color":{"type":"CIMRGBColor","values":[51,160,44,100]}},"timeAggregationType":"EqualIntervalsFromStartTime","timeIntervalUnits":"esriTimeUnitsMonths","timeIntervalSize":-1,"calculateAutomaticTimeInterval":true,"trimIncompleteTimeInterval":true,"nullPolicy":"Null","verticalOrientation":true},{"type":"CIMChartLineSeries","name":"4","uniqueName":"Series4","fields":["","synthetic_","syntheti_1","syntheti_2","syntheti_3","syntheti_4"],"orderFields":[""],"whereClause":"CLUSTER_ID = 4","verticalAxis":1,"colorType":"SingleColor","fieldAggregation":["","MEAN"],"orderFieldsSortTypes":[0],"visible":true,"lineSymbolProperties":{"type":"CIMChartLineSymbolProperties","visible":true,"width":2,"style":"Solid","color":{"type":"CIMRGBColor","values":[251,154,153,100]}},"markerSymbolProperties":{"type":"CIMChartMarkerSymbolProperties","visible":true,"width":2,"height":2,"style":"Circle","color":{"type":"CIMRGBColor","values":[251,154,153,100]}},"timeAggregationType":"EqualIntervalsFromStartTime","timeIntervalUnits":"esriTimeUnitsMonths","timeIntervalSize":-1,"calculateAutomaticTimeInterval":true,"trimIncompleteTimeInterval":true,"nullPolicy":"Null","verticalOrientation":true},{"type":"CIMChartLineSeries","name":"5","uniqueName":"Series5","fields":["","synthetic_","syntheti_1","syntheti_2","syntheti_3","syntheti_4"],"orderFields":[""],"whereClause":"CLUSTER_ID = 
5","verticalAxis":1,"colorType":"SingleColor","fieldAggregation":["","MEAN"],"orderFieldsSortTypes":[0],"visible":true,"lineSymbolProperties":{"type":"CIMChartLineSymbolProperties","visible":true,"width":2,"style":"Solid","color":{"type":"CIMRGBColor","values":[227,26,28,100]}},"markerSymbolProperties":{"type":"CIMChartMarkerSymbolProperties","visible":true,"width":2,"height":2,"style":"Circle","color":{"type":"CIMRGBColor","values":[227,26,28,100]}},"timeAggregationType":"EqualIntervalsFromStartTime","timeIntervalUnits":"esriTimeUnitsMonths","timeIntervalSize":-1,"calculateAutomaticTimeInterval":true,"trimIncompleteTimeInterval":true,"nullPolicy":"Null","verticalOrientation":true},{"type":"CIMChartLineSeries","name":"6","uniqueName":"Series6","fields":["","synthetic_","syntheti_1","syntheti_2","syntheti_3","syntheti_4"],"orderFields":[""],"whereClause":"CLUSTER_ID = 6","verticalAxis":1,"colorType":"SingleColor","fieldAggregation":["","MEAN"],"orderFieldsSortTypes":[0],"visible":true,"lineSymbolProperties":{"type":"CIMChartLineSymbolProperties","visible":true,"width":2,"style":"Solid","color":{"type":"CIMRGBColor","values":[253,191,111,100]}},"markerSymbolProperties":{"type":"CIMChartMarkerSymbolProperties","visible":true,"width":2,"height":2,"style":"Circle","color":{"type":"CIMRGBColor","values":[253,191,111,100]}},"timeAggregationType":"EqualIntervalsFromStartTime","timeIntervalUnits":"esriTimeUnitsMonths","timeIntervalSize":-1,"calculateAutomaticTimeInterval":true,"trimIncompleteTimeInterval":true,"nullPolicy":"Null","verticalOrientation":true},{"type":"CIMChartLineSeries","name":"7","uniqueName":"Series7","fields":["","synthetic_","syntheti_1","syntheti_2","syntheti_3","syntheti_4"],"orderFields":[""],"whereClause":"CLUSTER_ID = 
7","verticalAxis":1,"colorType":"SingleColor","fieldAggregation":["","MEAN"],"orderFieldsSortTypes":[0],"visible":true,"lineSymbolProperties":{"type":"CIMChartLineSymbolProperties","visible":true,"width":2,"style":"Solid","color":{"type":"CIMRGBColor","values":[255,127,0,100]}},"markerSymbolProperties":{"type":"CIMChartMarkerSymbolProperties","visible":true,"width":2,"height":2,"style":"Circle","color":{"type":"CIMRGBColor","values":[255,127,0,100]}},"timeAggregationType":"EqualIntervalsFromStartTime","timeIntervalUnits":"esriTimeUnitsMonths","timeIntervalSize":-1,"calculateAutomaticTimeInterval":true,"trimIncompleteTimeInterval":true,"nullPolicy":"Null","verticalOrientation":true}],"generalProperties":{"type":"CIMChartGeneralProperties","title":"Spatially Constrained Multivariate Clustering Box-Plots","showTitle":true,"useAutomaticTitle":false,"showSubTitle":true,"showFooter":true,"titleText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontSize":16,"fontWeight":"Normal","textCase":"Normal"},"subTitleText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontSize":12,"fontWeight":"Normal","textCase":"Normal"},"footerText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontSize":12,"fontWeight":"Normal","textCase":"Normal"},"backgroundSymbolProperties":{"type":"CIMChartFillSymbolProperties","color":{"type":"CIMRGBColor","values":[255,255,255,100]}},"gridLineSymbolProperties":{"type":"CIMChartLineSymbolProperties","visible":true,"width":1,"style":"Solid","color":{"type":"CIMRGBColor","values":[119,119,119,100]}}},"legend":{"type":"CIMChartLegend","visible":true,"showTitle":true,"alignment":"Right","legendText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontSize
":10.8000000000000007,"fontWeight":"Normal","textCase":"Normal"},"legendTitle":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontSize":10.8000000000000007,"fontWeight":"Normal","textCase":"Normal"}},"axes":[{"type":"CIMChartAxis","visible":true,"title":"Analysis Fields","showTitle":true,"useAutomaticTitle":false,"valueFormat":"N2","dateTimeFormat":"M/d/yyyy","calculateAutomaticMinimum":true,"calculateAutomaticMaximum":true,"minimum":null,"maximum":null,"titleText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontItalic":true,"fontSize":12,"fontWeight":"Normal","textCase":"Normal"},"labelText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontSize":10.8000000000000007,"fontWeight":"Normal","textCase":"Normal"},"labelCharacterLimit":11,"navigationScaleFactor":1},{"type":"CIMChartAxis","visible":true,"title":"Standardized Values","showTitle":true,"useAutomaticTitle":false,"valueFormat":"N2","dateTimeFormat":"M/d/yyyy","calculateAutomaticMinimum":true,"calculateAutomaticMaximum":true,"minimum":null,"maximum":null,"titleText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontItalic":true,"fontSize":12,"fontWeight":"Normal","textCase":"Normal"},"labelText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontSize":10.8000000000000007,"fontWeight":"Normal","textCase":"Normal"},"labelCharacterLimit":11,"navigationScaleFactor":1}],"mapSelectionHandling":"Highlight"},{"type":"CIMChart","name":"Features Per Cluster 
Chart","series":[{"type":"CIMChartBarSeries","uniqueName":"Series0","fields":["CLUSTER_ID",""],"orderFields":["CLUSTER_ID"],"groupFields":["CLUSTER_ID"],"verticalAxis":1,"colorType":"ColorMatch","fieldAggregation":["","COUNT"],"orderFieldsSortTypes":[0],"visible":true,"multipleBarType":"SideBySide","barSize":90,"fillSymbolProperties":{"type":"CIMChartFillSymbolProperties","color":{"type":"CIMRGBColor","values":[166,206,227,100]}},"verticalOrientation":true}],"generalProperties":{"type":"CIMChartGeneralProperties","title":"Features Per Cluster Chart","showTitle":true,"useAutomaticTitle":false,"showSubTitle":true,"showFooter":true,"titleText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontSize":16,"fontWeight":"Normal","textCase":"Normal"},"subTitleText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontSize":12,"fontWeight":"Normal","textCase":"Normal"},"footerText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontSize":12,"fontWeight":"Normal","textCase":"Normal"},"backgroundSymbolProperties":{"type":"CIMChartFillSymbolProperties","color":{"type":"CIMRGBColor","values":[255,255,255,100]}},"gridLineSymbolProperties":{"type":"CIMChartLineSymbolProperties","visible":true,"width":1,"style":"Solid","color":{"type":"CIMRGBColor","values":[119,119,119,100]}}},"legend":{"type":"CIMChartLegend","visible":true,"showTitle":true,"alignment":"Right","legendText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontSize":10.8000000000000007,"fontWeight":"Normal","textCase":"Normal"},"legendTitle":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontSize":10.8000000000000007,"fontWeight":"Normal","te
xtCase":"Normal"}},"axes":[{"type":"CIMChartAxis","visible":true,"title":"Cluster","showTitle":true,"useAutomaticTitle":false,"valueFormat":"N2","dateTimeFormat":"M/d/yyyy","calculateAutomaticMinimum":true,"calculateAutomaticMaximum":true,"minimum":null,"maximum":null,"titleText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontItalic":true,"fontSize":12,"fontWeight":"Normal","textCase":"Normal"},"labelText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontSize":10.8000000000000007,"fontWeight":"Normal","textCase":"Normal"},"labelCharacterLimit":11,"navigationScaleFactor":1},{"type":"CIMChartAxis","visible":true,"title":"Count","showTitle":true,"useAutomaticTitle":false,"valueFormat":"N2","dateTimeFormat":"M/d/yyyy","calculateAutomaticMinimum":true,"calculateAutomaticMaximum":true,"minimum":null,"maximum":null,"titleText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontItalic":true,"fontSize":12,"fontWeight":"Normal","textCase":"Normal"},"labelText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontSize":10.8000000000000007,"fontWeight":"Normal","textCase":"Normal"},"labelCharacterLimit":11,"navigationScaleFactor":1}],"mapSelectionHandling":"Highlight"},{"type":"CIMChart","name":"Distribution of Membership 
Probability","series":[{"type":"CIMChartHistogramSeries","name":"Series0","uniqueName":"Series0","fields":["MEM_PROB"],"verticalAxis":1,"colorType":"SingleColor","visible":true,"binCount":0,"fillSymbolProperties":{"type":"CIMChartFillSymbolProperties","color":{"type":"CIMRGBColor","values":[166,206,227,100]}},"meanLineSymbolProperties":{"type":"CIMChartLineSymbolProperties","visible":true,"width":2,"style":"Solid","color":{"type":"CIMRGBColor","values":[227,36,0,100]}},"medianLineSymbolProperties":{"type":"CIMChartLineSymbolProperties","visible":true,"width":2,"style":"Solid","color":{"type":"CIMRGBColor","values":[144,66,159,100]}},"standardDeviationLineSymbolProperties":{"type":"CIMChartLineSymbolProperties","visible":true,"width":2,"style":"Solid","color":{"type":"CIMRGBColor","values":[192,154,98,100]}},"dataTransformationType":"None","distributionLineSymbolProperties":{"type":"CIMChartLineSymbolProperties","visible":true,"width":2,"style":"Solid","color":{"type":"CIMRGBColor","values":[101,158,199,100]}}}],"generalProperties":{"type":"CIMChartGeneralProperties","title":"Distribution of Membership 
Probability","showTitle":true,"useAutomaticTitle":false,"showSubTitle":true,"showFooter":true,"titleText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontSize":16,"fontWeight":"Normal","textCase":"Normal"},"subTitleText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontSize":12,"fontWeight":"Normal","textCase":"Normal"},"footerText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontSize":12,"fontWeight":"Normal","textCase":"Normal"},"backgroundSymbolProperties":{"type":"CIMChartFillSymbolProperties","color":{"type":"CIMRGBColor","values":[255,255,255,100]}},"gridLineSymbolProperties":{"type":"CIMChartLineSymbolProperties","visible":true,"width":1,"style":"Solid","color":{"type":"CIMRGBColor","values":[119,119,119,100]}}},"legend":{"type":"CIMChartLegend","visible":true,"showTitle":true,"alignment":"Right","legendText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontSize":10.8000000000000007,"fontWeight":"Normal","textCase":"Normal"},"legendTitle":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontSize":10.8000000000000007,"fontWeight":"Normal","textCase":"Normal"}},"axes":[{"type":"CIMChartAxis","visible":true,"title":"Membership 
Probability","showTitle":true,"useAutomaticTitle":false,"valueFormat":"N2","dateTimeFormat":"M/d/yyyy","calculateAutomaticMinimum":true,"calculateAutomaticMaximum":true,"minimum":null,"maximum":null,"titleText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontItalic":true,"fontSize":12,"fontWeight":"Normal","textCase":"Normal"},"labelText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontSize":10.8000000000000007,"fontWeight":"Normal","textCase":"Normal"},"labelCharacterLimit":11,"navigationScaleFactor":1},{"type":"CIMChartAxis","visible":true,"showTitle":true,"useAutomaticTitle":true,"valueFormat":"N2","dateTimeFormat":"M/d/yyyy","calculateAutomaticMinimum":true,"calculateAutomaticMaximum":true,"minimum":null,"maximum":null,"titleText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontItalic":true,"fontSize":12,"fontWeight":"Normal","textCase":"Normal"},"labelText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontSize":10.8000000000000007,"fontWeight":"Normal","textCase":"Normal"},"labelCharacterLimit":11,"navigationScaleFactor":1}],"mapSelectionHandling":"Highlight"}],"refreshRate":-1,"refreshRateUnit":"esriTimeUnitsSeconds","autoGenerateFeatureTemplates":true,"featureElevationExpression":"0","featureTable":{"type":"CIMFeatureTable","displayField":"SOURCE_ID","editable":true,"dataConnection":{"type":"CIMStandardDataConnection","workspaceConnectionString":"DATABASE=G:\\My 
Drive\\Documents\\ArcGIS\\Projects\\STICC\\STICC","workspaceFactory":"Shapefile","dataset":"sticc_points_spatial_multivariate","datasetType":"esriDTFeatureClass"},"studyAreaSpatialRel":"esriSpatialRelUndefined","searchOrder":"esriSearchOrderSpatial"},"htmlPopupEnabled":true,"htmlPopupFormat":{"type":"CIMHtmlPopupFormat","htmlUseCodedDomainValues":true,"htmlPresentationStyle":"TwoColumnTable"},"isFlattened":true,"selectable":true,"featureCacheType":"Session","labelClasses":[{"type":"CIMLabelClass","expression":"[SOURCE_ID]","expressionEngine":"VBScript","featuresToLabel":"AllVisibleFeatures","maplexLabelPlacementProperties":{"type":"CIMMaplexLabelPlacementProperties","featureType":"Line","avoidPolygonHoles":true,"canOverrunFeature":true,"canPlaceLabelOutsidePolygon":true,"canRemoveOverlappingLabel":true,"canStackLabel":true,"connectionType":"Unambiguous","constrainOffset":"NoConstraint","contourAlignmentType":"Page","contourLadderType":"Straight","contourMaximumAngle":90,"enableConnection":true,"featureWeight":100,"fontHeightReductionLimit":4,"fontHeightReductionStep":0.5,"fontWidthReductionLimit":90,"fontWidthReductionStep":5,"graticuleAlignmentType":"Straight","labelBuffer":15,"labelLargestPolygon":true,"labelPriority":-1,"labelStackingProperties":{"type":"CIMMaplexLabelStackingProperties","stackAlignment":"ChooseBest","maximumNumberOfLines":3,"minimumNumberOfCharsPerLine":3,"maximumNumberOfCharsPerLine":24},"lineFeatureType":"General","linePlacementMethod":"OffsetCurvedFromLine","maximumLabelOverrun":36,"maximumLabelOverrunUnit":"Point","minimumFeatureSizeUnit":"Map","multiPartOption":"OneLabelPerPart","offsetAlongLineProperties":{"type":"CIMMaplexOffsetAlongLineProperties","placementMethod":"BestPositionAlongLine","labelAnchorPoint":"CenterOfLabel","distanceUnit":"Percentage","useLineDirection":true},"pointExternalZonePriorities":{"type":"CIMMaplexExternalZonePriorities","aboveLeft":4,"aboveCenter":2,"aboveRight":1,"centerRight":3,"belowRight":5,"belowCenter":7,"
belowLeft":8,"centerLeft":6},"pointPlacementMethod":"AroundPoint","polygonAnchorPointType":"GeometricCenter","polygonBoundaryWeight":200,"polygonExternalZones":{"type":"CIMMaplexExternalZonePriorities","aboveLeft":4,"aboveCenter":2,"aboveRight":1,"centerRight":3,"belowRight":5,"belowCenter":7,"belowLeft":8,"centerLeft":6},"polygonFeatureType":"General","polygonInternalZones":{"type":"CIMMaplexInternalZonePriorities","center":1},"polygonPlacementMethod":"CurvedInPolygon","primaryOffset":1,"primaryOffsetUnit":"Point","removeExtraWhiteSpace":true,"repetitionIntervalUnit":"Map","rotationProperties":{"type":"CIMMaplexRotationProperties","rotationType":"Arithmetic","alignmentType":"Straight"},"secondaryOffset":100,"strategyPriorities":{"type":"CIMMaplexStrategyPriorities","stacking":1,"overrun":2,"fontCompression":3,"fontReduction":4,"abbreviation":5},"thinningDistanceUnit":"Map","truncationMarkerCharacter":".","truncationMinimumLength":1,"truncationPreferredCharacters":"aeiou"},"name":"Default","priority":2,"standardLabelPlacementProperties":{"type":"CIMStandardLabelPlacementProperties","featureType":"Line","featureWeight":"None","labelWeight":"High","numLabelsOption":"OneLabelPerName","lineLabelPosition":{"type":"CIMStandardLineLabelPosition","above":true,"inLine":true,"parallel":true},"lineLabelPriorities":{"type":"CIMStandardLineLabelPriorities","aboveStart":3,"aboveAlong":3,"aboveEnd":3,"centerStart":3,"centerAlong":3,"centerEnd":3,"belowStart":3,"belowAlong":3,"belowEnd":3},"pointPlacementMethod":"AroundPoint","pointPlacementPriorities":{"type":"CIMStandardPointPlacementPriorities","aboveLeft":2,"aboveCenter":2,"aboveRight":1,"centerLeft":3,"centerRight":2,"belowLeft":3,"belowCenter":3,"belowRight":2},"rotationType":"Arithmetic","polygonPlacementMethod":"AlwaysHorizontal"},"textSymbol":{"type":"CIMSymbolReference","symbol":{"type":"CIMTextSymbol","blockProgression":"TTB","compatibilityMode":true,"depth3D":1,"drawSoftHyphen":true,"extrapolateBaselines":true,"flipAngl
e":90,"fontEffects":"Normal","fontEncoding":"Unicode","fontFamilyName":"Arial","fontStyleName":"Regular","fontType":"Unspecified","haloSize":1,"height":8,"hinting":"Default","horizontalAlignment":"Center","kerning":true,"letterWidth":100,"ligatures":true,"lineGapType":"ExtraLeading","shadowColor":{"type":"CIMRGBColor","values":[0,0,0,100]},"symbol":{"type":"CIMPolygonSymbol","symbolLayers":[{"type":"CIMSolidFill","enable":true,"color":{"type":"CIMRGBColor","values":[0,0,0,100]}}]},"textCase":"Normal","textDirection":"LTR","verticalAlignment":"Bottom","verticalGlyphOrientation":"Right","wordSpacing":100,"billboardMode3D":"FaceNearPlane"}},"useCodedValue":true,"visibility":true,"iD":-1}],"renderer":{"type":"CIMUniqueValueRenderer","colorRamp":{"type":"CIMRandomHSVColorRamp","colorSpace":{"type":"CIMICCColorSpace","url":"Default RGB"},"maxH":360,"minS":33,"maxS":66,"minV":50,"maxV":99,"minAlpha":100,"maxAlpha":100},"defaultLabel":"\u003call other values\u003e","defaultSymbol":{"type":"CIMSymbolReference","symbol":{"type":"CIMPointSymbol","symbolLayers":[{"type":"CIMCharacterMarker","enable":true,"colorLocked":true,"anchorPointUnits":"Relative","dominantSizeAxis3D":"Y","size":7,"billboardMode3D":"FaceNearPlane","characterIndex":40,"fontFamilyName":"Arial","fontStyleName":"Regular","fontType":"Unspecified","scaleX":1,"symbol":{"type":"CIMPolygonSymbol","symbolLayers":[{"type":"CIMSolidFill","enable":true,"color":{"type":"CIMRGBColor","values":[0,0,0,100]}}]},"scaleSymbolsProportionally":true,"respectFrame":true},{"type":"CIMCharacterMarker","enable":true,"anchorPointUnits":"Relative","dominantSizeAxis3D":"Y","size":7,"billboardMode3D":"FaceNearPlane","characterIndex":33,"fontFamilyName":"Arial","fontStyleName":"Regular","fontType":"Unspecified","scaleX":1,"symbol":{"type":"CIMPolygonSymbol","symbolLayers":[{"type":"CIMSolidFill","enable":true,"color":{"type":"CIMRGBColor","values":[224,223,227,0]}}]},"scaleSymbolsProportionally":true,"respectFrame":true}],"haloSize":1,"s
caleX":1,"angleAlignment":"Map"},"symbolName":"Level_1"},"defaultSymbolPatch":"Default","fields":["CLUSTER_ID"],"groups":[{"type":"CIMUniqueValueGroup","classes":[{"type":"CIMUniqueValueClass","label":"1","patch":"Default","symbol":{"type":"CIMSymbolReference","symbol":{"type":"CIMPointSymbol","symbolLayers":[{"type":"CIMVectorMarker","enable":true,"anchorPoint":{"x":0,"y":0,"z":0},"anchorPointUnits":"Relative","dominantSizeAxis3D":"Y","size":5,"billboardMode3D":"FaceNearPlane","frame":{"xmin":0,"ymin":0,"xmax":17,"ymax":17},"markerGraphics":[{"type":"CIMMarkerGraphic","geometry":{"curveRings":[[[17,8.5],{"b":[[8.5,0],[17,3.8100000000000001],[13.19,0]]},{"b":[[0,8.5],[3.8100000000000001,0],[0,3.8100000000000001]]},{"b":[[8.5,17],[0,13.19],[3.8100000000000001,17]]},{"b":[[17,8.5],[13.19,17],[17,13.19]]}]]},"symbol":{"type":"CIMPolygonSymbol","symbolLayers":[{"type":"CIMSolidStroke","enable":true,"capStyle":"Round","joinStyle":"Round","lineStyle3D":"Strip","miterLimit":10,"width":1,"color":{"type":"CIMRGBColor","values":[0,0,0,100]}},{"type":"CIMSolidFill","enable":true,"color":{"type":"CIMRGBColor","values":[120,170,255,100]}}]}}],"scaleSymbolsProportionally":true,"respectFrame":true}],"haloSize":1,"scaleX":1,"angleAlignment":"Display"}},"values":[{"type":"CIMUniqueValue","fieldValues":["1"]}],"visible":true},{"type":"CIMUniqueValueClass","label":"2","patch":"Default","symbol":{"type":"CIMSymbolReference","symbol":{"type":"CIMPointSymbol","symbolLayers":[{"type":"CIMVectorMarker","enable":true,"anchorPoint":{"x":0,"y":0,"z":0},"anchorPointUnits":"Relative","dominantSizeAxis3D":"Y","size":5,"billboardMode3D":"FaceNearPlane","frame":{"xmin":0,"ymin":0,"xmax":17,"ymax":17},"markerGraphics":[{"type":"CIMMarkerGraphic","geometry":{"curveRings":[[[17,8.5],{"b":[[8.5,0],[17,3.8100000000000001],[13.19,0]]},{"b":[[0,8.5],[3.8100000000000001,0],[0,3.8100000000000001]]},{"b":[[8.5,17],[0,13.19],[3.8100000000000001,17]]},{"b":[[17,8.5],[13.19,17],[17,13.19]]}]]},"symbol":{"type"
:"CIMPolygonSymbol","symbolLayers":[{"type":"CIMSolidStroke","enable":true,"capStyle":"Round","joinStyle":"Round","lineStyle3D":"Strip","miterLimit":10,"width":1,"color":{"type":"CIMRGBColor","values":[0,0,0,100]}},{"type":"CIMSolidFill","enable":true,"color":{"type":"CIMRGBColor","values":[255,100,85,100]}}]}}],"scaleSymbolsProportionally":true,"respectFrame":true}],"haloSize":1,"scaleX":1,"angleAlignment":"Display"}},"values":[{"type":"CIMUniqueValue","fieldValues":["2"]}],"visible":true},{"type":"CIMUniqueValueClass","label":"3","patch":"Default","symbol":{"type":"CIMSymbolReference","symbol":{"type":"CIMPointSymbol","symbolLayers":[{"type":"CIMVectorMarker","enable":true,"anchorPoint":{"x":0,"y":0,"z":0},"anchorPointUnits":"Relative","dominantSizeAxis3D":"Y","size":5,"billboardMode3D":"FaceNearPlane","frame":{"xmin":0,"ymin":0,"xmax":17,"ymax":17},"markerGraphics":[{"type":"CIMMarkerGraphic","geometry":{"curveRings":[[[17,8.5],{"b":[[8.5,0],[17,3.8100000000000001],[13.19,0]]},{"b":[[0,8.5],[3.8100000000000001,0],[0,3.8100000000000001]]},{"b":[[8.5,17],[0,13.19],[3.8100000000000001,17]]},{"b":[[17,8.5],[13.19,17],[17,13.19]]}]]},"symbol":{"type":"CIMPolygonSymbol","symbolLayers":[{"type":"CIMSolidStroke","enable":true,"capStyle":"Round","joinStyle":"Round","lineStyle3D":"Strip","miterLimit":10,"width":1,"color":{"type":"CIMRGBColor","values":[0,0,0,100]}},{"type":"CIMSolidFill","enable":true,"color":{"type":"CIMRGBColor","values":[125,220,85,100]}}]}}],"scaleSymbolsProportionally":true,"respectFrame":true}],"haloSize":1,"scaleX":1,"angleAlignment":"Display"}},"values":[{"type":"CIMUniqueValue","fieldValues":["3"]}],"visible":true},{"type":"CIMUniqueValueClass","label":"4","patch":"Default","symbol":{"type":"CIMSymbolReference","symbol":{"type":"CIMPointSymbol","symbolLayers":[{"type":"CIMVectorMarker","enable":true,"anchorPoint":{"x":0,"y":0,"z":0},"anchorPointUnits":"Relative","dominantSizeAxis3D":"Y","size":5,"billboardMode3D":"FaceNearPlane","frame":{"xmin":0,
"ymin":0,"xmax":17,"ymax":17},"markerGraphics":[{"type":"CIMMarkerGraphic","geometry":{"curveRings":[[[17,8.5],{"b":[[8.5,0],[17,3.8100000000000001],[13.19,0]]},{"b":[[0,8.5],[3.8100000000000001,0],[0,3.8100000000000001]]},{"b":[[8.5,17],[0,13.19],[3.8100000000000001,17]]},{"b":[[17,8.5],[13.19,17],[17,13.19]]}]]},"symbol":{"type":"CIMPolygonSymbol","symbolLayers":[{"type":"CIMSolidStroke","enable":true,"capStyle":"Round","joinStyle":"Round","lineStyle3D":"Strip","miterLimit":10,"width":1,"color":{"type":"CIMRGBColor","values":[0,0,0,100]}},{"type":"CIMSolidFill","enable":true,"color":{"type":"CIMRGBColor","values":[255,180,0,100]}}]}}],"scaleSymbolsProportionally":true,"respectFrame":true}],"haloSize":1,"scaleX":1,"angleAlignment":"Display"}},"values":[{"type":"CIMUniqueValue","fieldValues":["4"]}],"visible":true},{"type":"CIMUniqueValueClass","label":"5","patch":"Default","symbol":{"type":"CIMSymbolReference","symbol":{"type":"CIMPointSymbol","symbolLayers":[{"type":"CIMVectorMarker","enable":true,"anchorPoint":{"x":0,"y":0,"z":0},"anchorPointUnits":"Relative","dominantSizeAxis3D":"Y","size":5,"billboardMode3D":"FaceNearPlane","frame":{"xmin":0,"ymin":0,"xmax":17,"ymax":17},"markerGraphics":[{"type":"CIMMarkerGraphic","geometry":{"curveRings":[[[17,8.5],{"b":[[8.5,0],[17,3.8100000000000001],[13.19,0]]},{"b":[[0,8.5],[3.8100000000000001,0],[0,3.8100000000000001]]},{"b":[[8.5,17],[0,13.19],[3.8100000000000001,17]]},{"b":[[17,8.5],[13.19,17],[17,13.19]]}]]},"symbol":{"type":"CIMPolygonSymbol","symbolLayers":[{"type":"CIMSolidStroke","enable":true,"capStyle":"Round","joinStyle":"Round","lineStyle3D":"Strip","miterLimit":10,"width":1,"color":{"type":"CIMRGBColor","values":[0,0,0,100]}},{"type":"CIMSolidFill","enable":true,"color":{"type":"CIMRGBColor","values":[200,100,225,100]}}]}}],"scaleSymbolsProportionally":true,"respectFrame":true}],"haloSize":1,"scaleX":1,"angleAlignment":"Display"}},"values":[{"type":"CIMUniqueValue","fieldValues":["5"]}],"visible":true},{"type
":"CIMUniqueValueClass","label":"6","patch":"Default","symbol":{"type":"CIMSymbolReference","symbol":{"type":"CIMPointSymbol","symbolLayers":[{"type":"CIMVectorMarker","enable":true,"anchorPoint":{"x":0,"y":0,"z":0},"anchorPointUnits":"Relative","dominantSizeAxis3D":"Y","size":5,"billboardMode3D":"FaceNearPlane","frame":{"xmin":0,"ymin":0,"xmax":17,"ymax":17},"markerGraphics":[{"type":"CIMMarkerGraphic","geometry":{"curveRings":[[[17,8.5],{"b":[[8.5,0],[17,3.8100000000000001],[13.19,0]]},{"b":[[0,8.5],[3.8100000000000001,0],[0,3.8100000000000001]]},{"b":[[8.5,17],[0,13.19],[3.8100000000000001,17]]},{"b":[[17,8.5],[13.19,17],[17,13.19]]}]]},"symbol":{"type":"CIMPolygonSymbol","symbolLayers":[{"type":"CIMSolidStroke","enable":true,"capStyle":"Round","joinStyle":"Round","lineStyle3D":"Strip","miterLimit":10,"width":1,"color":{"type":"CIMRGBColor","values":[0,0,0,100]}},{"type":"CIMSolidFill","enable":true,"color":{"type":"CIMRGBColor","values":[190,160,100,100]}}]}}],"scaleSymbolsProportionally":true,"respectFrame":true}],"haloSize":1,"scaleX":1,"angleAlignment":"Display"}},"values":[{"type":"CIMUniqueValue","fieldValues":["6"]}],"visible":true},{"type":"CIMUniqueValueClass","label":"7","patch":"Default","symbol":{"type":"CIMSymbolReference","symbol":{"type":"CIMPointSymbol","symbolLayers":[{"type":"CIMVectorMarker","enable":true,"anchorPoint":{"x":0,"y":0,"z":0},"anchorPointUnits":"Relative","dominantSizeAxis3D":"Y","size":5,"billboardMode3D":"FaceNearPlane","frame":{"xmin":0,"ymin":0,"xmax":17,"ymax":17},"markerGraphics":[{"type":"CIMMarkerGraphic","geometry":{"curveRings":[[[17,8.5],{"b":[[8.5,0],[17,3.8100000000000001],[13.19,0]]},{"b":[[0,8.5],[3.8100000000000001,0],[0,3.8100000000000001]]},{"b":[[8.5,17],[0,13.19],[3.8100000000000001,17]]},{"b":[[17,8.5],[13.19,17],[17,13.19]]}]]},"symbol":{"type":"CIMPolygonSymbol","symbolLayers":[{"type":"CIMSolidStroke","enable":true,"capStyle":"Round","joinStyle":"Round","lineStyle3D":"Strip","miterLimit":10,"width":1,"color"
:{"type":"CIMRGBColor","values":[0,0,0,100]}},{"type":"CIMSolidFill","enable":true,"color":{"type":"CIMRGBColor","values":[250,190,200,100]}}]}}],"scaleSymbolsProportionally":true,"respectFrame":true}],"haloSize":1,"scaleX":1,"angleAlignment":"Display"}},"values":[{"type":"CIMUniqueValue","fieldValues":["7"]}],"visible":true}]}],"polygonSymbolColorTarget":"Fill"},"scaleSymbols":true,"snappable":true}]}
3 |
--------------------------------------------------------------------------------
/data/sticc_points_spatial_multivariate.shx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeoDS/STICC/6848a613198e55317c744fa3df423b68513b7fab/data/sticc_points_spatial_multivariate.shx
--------------------------------------------------------------------------------
/images/GeoDSLogo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeoDS/STICC/6848a613198e55317c744fa3df423b68513b7fab/images/GeoDSLogo.jpg
--------------------------------------------------------------------------------
/images/STICC.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeoDS/STICC/6848a613198e55317c744fa3df423b68513b7fab/images/STICC.jpeg
--------------------------------------------------------------------------------
/images/clustering.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeoDS/STICC/6848a613198e55317c744fa3df423b68513b7fab/images/clustering.jpg
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | esda==2.3.1
2 | geopandas==0.8.1
3 | libpysal==4.3.0
4 | matplotlib==3.4.1
5 | networkx==2.5.1
6 | numpy==1.22.0
7 | pandas==1.4.1
8 | pyclustering==0.10.1.2
9 | scikit_learn~=1.5.0
10 | Shapely==1.7.0
11 |
--------------------------------------------------------------------------------
/src/STICC_helper.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
def getTrainTestSplit(m, num_blocks, num_stacked):
    '''
    - m: number of observations
    - num_blocks: spatial_radius + 1
    - num_stacked: spatial_radius
    Returns:
    - sorted list of training indices
    '''
    # All points are used for training (no held-out test split).
    training_percent = 1
    # Draw training indices without replacement.
    chosen = np.random.choice(
        m - num_blocks + 1,
        size=int((m - num_stacked) * training_percent),
        replace=False)
    chosen = list(chosen)
    # Guarantee the first and the last valid points are in the training set.
    for boundary in (0, m - num_stacked):
        if boundary not in chosen:
            chosen.append(boundary)
    return sorted(np.array(chosen))
27 |
28 |
def upperToFull(a, eps=0):
    """Expand a flattened upper-triangular vector into the full symmetric matrix.

    Entries of `a` with magnitude strictly below `eps` are zeroed in place first.
    """
    mask = (a < eps) & (a > -eps)
    a[mask] = 0
    # Recover n from len(a) == n*(n+1)/2.
    n = int((-1 + np.sqrt(1 + 8 * a.shape[0])) / 2)
    full = np.zeros([n, n])
    full[np.triu_indices(n)] = a
    diag = full.diagonal()
    # Mirror the upper triangle; subtract the diagonal once so it isn't doubled.
    return np.asarray(full + full.T - np.diag(diag))
38 |
39 |
def hex_to_rgb(value):
    """Return (red, green, blue) for the color given as #rrggbb.

    The leading '#' is optional. The digits are split into three equal
    groups, parsed as hex, and scaled by 1/256.0 (this module's original
    convention, kept for backward compatibility — note 0xff maps to
    255/256, not 1.0).
    """
    # BUG FIX: the docstring promises '#rrggbb', but a leading '#' made the
    # chunking misalign and int('#r', 16) raise ValueError. Strip it first.
    value = value.lstrip('#')
    lv = len(value)
    out = tuple(int(value[i:i + lv // 3], 16) for i in range(0, lv, lv // 3))
    return tuple(x / 256.0 for x in out)
46 |
47 |
def updateClusters(LLE_node_vals, switch_penalty=1, spatial_series_index=None,
                   spatial_series_closest=None, spatial_radius=1):
    """
    Takes in LLE_node_vals matrix and computes the path that minimizes
    the total cost over the path.
    Note the LLE's are negative of the true LLE's actually!!!!!

    Note: switch penalty > 0

    Parameters
    ----------
    LLE_node_vals : (T, num_clusters) array of per-point, per-cluster costs.
    switch_penalty : cost added when the assignment differs from the
        neighbor's cluster.
    spatial_series_index : index list of the spatial series (defaults to []).
    spatial_series_closest : for each point i, the index of its closest
        neighbor (defaults to []).
    spatial_radius : bounds which neighbor indices are considered valid.

    Returns
    -------
    path : (T,) float array of cluster assignments.
    """
    # FIX: avoid mutable default arguments ([] is shared across calls).
    if spatial_series_index is None:
        spatial_series_index = []
    if spatial_series_closest is None:
        spatial_series_closest = []

    (T, num_clusters) = LLE_node_vals.shape
    future_cost_vals = np.zeros(LLE_node_vals.shape)
    max_valid = len(spatial_series_index) - spatial_radius

    # Backward pass: compute future costs.
    for i in range(T - 2, -1, -1):
        j = int(spatial_series_closest[i])  # closest neighbor of point i
        if j <= max_valid:
            # Hoisted: this sum is invariant across the cluster loop.
            base = future_cost_vals[j, :] + LLE_node_vals[j, :] + switch_penalty
            for cluster in range(num_clusters):
                total_vals = base.copy()
                # No penalty for staying in the same cluster.
                total_vals[cluster] -= switch_penalty
                future_cost_vals[i, cluster] = np.min(total_vals)

    # Forward pass: compute the best path.
    path = np.zeros(T)

    # The first location.
    path[0] = np.argmin(future_cost_vals[0, :] + LLE_node_vals[0, :])

    for i in range(T - 1):
        j = int(spatial_series_closest[i])
        if j <= max_valid:
            total_vals = future_cost_vals[j, :] + LLE_node_vals[j, :] + switch_penalty
            total_vals[int(path[i])] -= switch_penalty
            path[i + 1] = np.argmin(total_vals)

    # Return the computed path.
    return path
93 |
94 |
def find_matching(confusion_matrix):
    """
    returns the perfect matching
    """
    _, n = confusion_matrix.shape
    matching = []
    for row in range(n):
        best_val = -1e10
        best_col = -1
        for col in range(n):
            # Each column may be used at most once.
            if col in matching:
                continue
            candidate = confusion_matrix[row, col]
            if candidate > best_val:
                best_val = candidate
                best_col = col
        matching.append(best_col)
    return matching
114 |
115 |
def compute_confusion_matrix(num_clusters, clustered_points_algo, sorted_indices_algo, seg_len=400):
    """
    computes a confusion matrix and returns it

    The ground-truth label of a point is derived from its original index:
    int(index / seg_len) % num_clusters, i.e. the data is assumed to be
    laid out in consecutive ground-truth segments of length seg_len.

    Parameters
    ----------
    num_clusters : number of clusters.
    clustered_points_algo : cluster label assigned to each point.
    sorted_indices_algo : original index of each point.
    seg_len : ground-truth segment length (default 400, the value that
        was previously hard-coded).
    """
    true_confusion_matrix = np.zeros([num_clusters, num_clusters])
    for point, cluster in enumerate(clustered_points_algo):
        # Ground-truth cluster implied by the point's original position.
        true_label = int(sorted_indices_algo[point] / seg_len) % num_clusters
        true_confusion_matrix[int(true_label), int(cluster)] += 1
    return true_confusion_matrix
127 |
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeoDS/STICC/6848a613198e55317c744fa3df423b68513b7fab/src/__init__.py
--------------------------------------------------------------------------------
/src/admm_solver.py:
--------------------------------------------------------------------------------
1 | import numpy
2 | import math
class ADMMSolver:
    """ADMM solver for a sparse inverse-covariance (graphical-lasso-like) problem.

    The variable is a (num_stacked*size_blocks)-square symmetric matrix X,
    stored as its flattened upper triangle. The x-step is the proximal
    operator of -logdet(X) + tr(S X); the z-step applies block-wise soft
    thresholding with penalty weights `lamb`, tying together entries that
    belong to the same sub-diagonal block.
    """

    def __init__(self, lamb, num_stacked, size_blocks, rho, S, rho_update_func=None):
        # lamb: matrix of per-entry L1 penalty weights.
        # num_stacked, size_blocks: block structure of the variable.
        # rho: ADMM penalty parameter.
        # S: empirical covariance matrix.
        # rho_update_func: optional callable
        #   (rho, res_pri, e_pri, res_dual, e_dual) -> new rho.
        self.lamb = lamb
        self.numBlocks = num_stacked
        self.sizeBlocks = size_blocks
        probSize = num_stacked * size_blocks
        # Number of entries in the flattened upper triangle.
        self.length = int(probSize * (probSize + 1) / 2)
        self.x = numpy.zeros(self.length)
        self.z = numpy.zeros(self.length)
        self.u = numpy.zeros(self.length)
        self.rho = float(rho)
        self.S = S
        self.status = 'initialized'
        self.rho_update_func = rho_update_func

    def ij2symmetric(self, i, j, size):
        """Map matrix coordinates (i, j), i <= j, to the index in the
        flattened upper triangle (returned as a float; callers cast)."""
        return (size * (size + 1))/2 - (size-i)*((size - i + 1))/2 + j - i

    def upper2Full(self, a):
        """Expand a flattened upper-triangular vector into the full symmetric matrix."""
        n = int((-1 + numpy.sqrt(1 + 8*a.shape[0]))/2)
        A = numpy.zeros([n, n])
        A[numpy.triu_indices(n)] = a
        temp = A.diagonal()
        A = (A + A.T) - numpy.diag(temp)
        return A

    def Prox_logdet(self, S, A, eta):
        """Proximal operator of the -logdet term, evaluated via the
        eigendecomposition of eta*A - S; returns the flattened upper
        triangle as a column matrix."""
        d, q = numpy.linalg.eigh(eta*A - S)
        q = numpy.matrix(q)
        X_var = (1/(2*float(eta)))*q*(numpy.diag(d + numpy.sqrt(numpy.square(d) + (4*eta)*numpy.ones(d.shape))))*q.T
        x_var = X_var[numpy.triu_indices(S.shape[1])]  # extract upper triangular part as update variable
        return numpy.matrix(x_var).T

    def ADMM_x(self):
        """x-update: proximal step on -logdet(X) + tr(S X) at z - u."""
        a = self.z - self.u
        A = self.upper2Full(a)
        eta = self.rho
        x_update = self.Prox_logdet(self.S, A, eta)
        self.x = numpy.array(x_update).T.reshape(-1)

    def ADMM_z(self, index_penalty=1):
        """z-update: block-wise soft thresholding of x + u.

        Entries in the same sub-diagonal block (offset i) are tied: their
        penalties are summed and the shrunken value is shared by the group.
        """
        a = self.x + self.u
        probSize = self.numBlocks * self.sizeBlocks
        z_update = numpy.zeros(self.length)

        # TODO: can we parallelize these?
        for i in range(self.numBlocks):
            elems = self.numBlocks if i == 0 else (2*self.numBlocks - 2*i)/2  # i=0 is diagonal
            for j in range(self.sizeBlocks):
                startPoint = j if i == 0 else 0
                for k in range(startPoint, self.sizeBlocks):
                    # All (row, col) positions tied to this block entry.
                    locList = [((l+i)*self.sizeBlocks + j, l*self.sizeBlocks + k) for l in range(int(elems))]
                    if i == 0:
                        lamSum = sum(self.lamb[loc1, loc2] for (loc1, loc2) in locList)
                        indices = [self.ij2symmetric(loc1, loc2, probSize) for (loc1, loc2) in locList]
                    else:
                        # Off-diagonal blocks index the transposed position.
                        lamSum = sum(self.lamb[loc2, loc1] for (loc1, loc2) in locList)
                        indices = [self.ij2symmetric(loc2, loc1, probSize) for (loc1, loc2) in locList]
                    pointSum = sum(a[int(index)] for index in indices)
                    rhoPointSum = self.rho * pointSum

                    # Calculate soft threshold (shared across the group).
                    ans = 0
                    # If answer is positive
                    if rhoPointSum > lamSum:
                        ans = max((rhoPointSum - lamSum)/(self.rho*elems), 0)
                    elif rhoPointSum < -1*lamSum:
                        ans = min((rhoPointSum + lamSum)/(self.rho*elems), 0)

                    for index in indices:
                        z_update[int(index)] = ans
        self.z = z_update

    def ADMM_u(self):
        """Dual update: u <- u + x - z."""
        self.u = self.u + self.x - self.z

    # Returns True if convergence criteria have been satisfied
    # eps_abs = eps_rel = 0.01
    # r = x - z
    # s = rho * (z - z_old)
    # e_pri = sqrt(length) * e_abs + e_rel * max(||x||, ||z||)
    # e_dual = sqrt(length) * e_abs + e_rel * ||rho * u||
    # Should stop if (||r|| <= e_pri) and (||s|| <= e_dual)
    # Returns (boolean shouldStop, primal residual value, primal threshold,
    # dual residual value, dual threshold)
    def CheckConvergence(self, z_old, e_abs, e_rel, verbose):
        norm = numpy.linalg.norm
        r = self.x - self.z
        s = self.rho * (self.z - z_old)
        # Primal and dual thresholds. Add .0001 to prevent the case of 0.
        e_pri = math.sqrt(self.length) * e_abs + e_rel * max(norm(self.x), norm(self.z)) + .0001
        e_dual = math.sqrt(self.length) * e_abs + e_rel * norm(self.rho * self.u) + .0001
        # Primal and dual residuals
        res_pri = norm(r)
        res_dual = norm(s)
        if verbose:
            # Debugging information to print(convergence criteria values)
            print('  r:', res_pri)
            print('  e_pri:', e_pri)
            print('  s:', res_dual)
            print('  e_dual:', e_dual)
        stop = (res_pri <= e_pri) and (res_dual <= e_dual)
        return (stop, res_pri, e_pri, res_dual, e_dual)

    # solve
    def __call__(self, maxIters, eps_abs, eps_rel, verbose):
        """Run ADMM for up to maxIters iterations.

        Returns the flattened upper-triangular solution vector x; sets
        self.status to 'Optimal' on convergence.
        """
        self.status = 'Incomplete: max iterations reached'
        for i in range(maxIters):
            z_old = numpy.copy(self.z)
            self.ADMM_x()
            self.ADMM_z()
            self.ADMM_u()
            if i != 0:
                stop, res_pri, e_pri, res_dual, e_dual = self.CheckConvergence(z_old, eps_abs, eps_rel, verbose)
                if stop:
                    self.status = 'Optimal'
                    break
                new_rho = self.rho
                if self.rho_update_func:
                    # BUG FIX: was `rho_update_func(...)` — a NameError
                    # whenever a callback was actually supplied.
                    new_rho = self.rho_update_func(self.rho, res_pri, e_pri, res_dual, e_dual)
                scale = self.rho / new_rho
                # BUG FIX: was `rho = new_rho`, which bound a local and
                # never updated the solver's penalty parameter.
                self.rho = new_rho
                self.u = scale * self.u
            if verbose:
                # Debugging information prints current iteration #
                print('Iteration %d' % i)
        return self.x
--------------------------------------------------------------------------------