├── LICENSE
├── NYC_checkin.ipynb
├── NYC_checkin3.ipynb
├── README.md
├── STICC_main.py
├── STICC_solver.py
├── data
├── nyc_checkin.cpg
├── nyc_checkin.dbf
├── nyc_checkin.shp
├── nyc_checkin.shx
├── nyc_checkin.zip
├── nyc_checkin_sticc.cpg
├── nyc_checkin_sticc.dbf
├── nyc_checkin_sticc.shp
├── nyc_checkin_sticc.shx
├── nyc_checkin_sticc3.cpg
├── nyc_checkin_sticc3.dbf
├── nyc_checkin_sticc3.shp
├── nyc_checkin_sticc3.shx
├── sticc_points.dbf
├── sticc_points.shp
├── sticc_points.shx
├── sticc_points_spatial_multivariate.cpg
├── sticc_points_spatial_multivariate.dbf
├── sticc_points_spatial_multivariate.prj
├── sticc_points_spatial_multivariate.sbn
├── sticc_points_spatial_multivariate.shp
├── sticc_points_spatial_multivariate.shp.xml
└── sticc_points_spatial_multivariate.shx
├── images
├── GeoDSLogo.jpg
├── STICC.jpeg
└── clustering.jpg
├── requirements.txt
├── src
├── STICC_helper.py
├── __init__.py
└── admm_solver.py
└── synthetic.ipynb
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 2-Clause License
2 |
3 | Copyright (c) 2017-2018, David Hallac, Sagar Vare, Saachi Jain, and Others
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | * Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 |
--------------------------------------------------------------------------------
/NYC_checkin.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import random\n",
10 | "import pandas as pd\n",
11 | "import geopandas as gpd\n",
12 | "import matplotlib.pyplot as plt\n",
13 | "import esda\n",
14 | "import libpysal.weights as weights\n",
15 | "from esda.moran import Moran\n",
16 | "from shapely.geometry import Point, MultiPoint, LineString, Polygon, shape\n",
17 | "import json\n",
18 | "import pylab\n",
19 | "import libpysal\n",
20 | "import numpy as np\n",
21 | "from sklearn.metrics.cluster import adjusted_rand_score\n",
22 | "from sklearn.metrics import f1_score\n",
23 | "from pyclustering.cluster.cure import cure\n",
24 | "from pyclustering.cluster.kmeans import kmeans\n",
25 | "from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer\n",
26 | "from sklearn import preprocessing"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 2,
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 | "def permutation(lst):\n",
36 | " if len(lst) == 0:\n",
37 | " return []\n",
38 | "\n",
39 | " if len(lst) == 1:\n",
40 | " return [lst]\n",
41 | "\n",
42 | " l = []\n",
43 | " for i in range(len(lst)):\n",
44 | " m = lst[i]\n",
45 | " remLst = lst[:i] + lst[i+1:]\n",
46 | " for p in permutation(remLst):\n",
47 | " l.append([m] + p) \n",
48 | " return l"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 3,
54 | "metadata": {},
55 | "outputs": [],
56 | "source": [
57 | "def get_f1_score(df, permut):\n",
58 | " def match_clus(x, permut):\n",
59 | " if x == 0:\n",
60 | " return int(permut[0])\n",
61 | " elif x == 1:\n",
62 | " return int(permut[1])\n",
63 | " else:\n",
64 | " return x\n",
65 | "\n",
66 | " df[\"group_match\"] = df[\"group\"].apply(lambda x: match_clus(x, permut))\n",
67 | " return df, f1_score(df.group_match.values, df.clus_group_gt.values, average='macro')"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": 4,
73 | "metadata": {},
74 | "outputs": [],
75 | "source": [
76 | "def get_max_f1_score(df):\n",
77 | " max_f1 = 0\n",
78 | " max_p = []\n",
79 | " for p in permutation([3,4]):\n",
80 | " df, f1 = get_f1_score(df, p)\n",
81 | " if max_f1 < f1:\n",
82 | " max_f1 = f1\n",
83 | " max_p = p\n",
84 | " print(\"f1_score \", max_f1, max_p)"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 5,
90 | "metadata": {},
91 | "outputs": [],
92 | "source": [
93 | "def cal_joint_statistic(nyc_data, w_voronoi):\n",
94 | " matched_connects = 0\n",
95 | " all_neighbors_connects = 0\n",
96 | " for obj_id, neighbors in w_voronoi.neighbors.items():\n",
97 | " obj_clus = nyc_data.iat[obj_id, -1]\n",
98 | " for nei in neighbors:\n",
99 | " nei_clus = nyc_data.iat[nei, -1]\n",
100 | " all_neighbors_connects += 1\n",
101 | " if obj_clus == nei_clus:\n",
102 | " matched_connects += 1\n",
103 | " return matched_connects / all_neighbors_connects"
104 | ]
105 | },
106 | {
107 | "cell_type": "markdown",
108 | "metadata": {},
109 | "source": [
110 | "# Processing NYC Check-in Data"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": 6,
116 | "metadata": {},
117 | "outputs": [
118 | {
119 | "data": {
120 | "text/html": [
121 | "
\n",
122 | "\n",
135 | "
\n",
136 | " \n",
137 | " \n",
138 | " | \n",
139 | " venueId | \n",
140 | " userId | \n",
141 | " gender | \n",
142 | " friend_num | \n",
143 | " follow_num | \n",
144 | " latitude | \n",
145 | " longitude | \n",
146 | " venueCateg | \n",
147 | " week | \n",
148 | " hour | \n",
149 | " geometry | \n",
150 | "
\n",
151 | " \n",
152 | " \n",
153 | " \n",
154 | " 0 | \n",
155 | " 3fd66200f964a52000e71ee3 | \n",
156 | " 445 | \n",
157 | " male | \n",
158 | " 4.0 | \n",
159 | " 13.0 | \n",
160 | " 40.73385 | \n",
161 | " -74.002998 | \n",
162 | " Jazz Club | \n",
163 | " Sat | \n",
164 | " 8 | \n",
165 | " POINT (-74.00300 40.73385) | \n",
166 | "
\n",
167 | " \n",
168 | "
\n",
169 | "
"
170 | ],
171 | "text/plain": [
172 | " venueId userId gender friend_num follow_num latitude \\\n",
173 | "0 3fd66200f964a52000e71ee3 445 male 4.0 13.0 40.73385 \n",
174 | "\n",
175 | " longitude venueCateg week hour geometry \n",
176 | "0 -74.002998 Jazz Club Sat 8 POINT (-74.00300 40.73385) "
177 | ]
178 | },
179 | "execution_count": 6,
180 | "metadata": {},
181 | "output_type": "execute_result"
182 | }
183 | ],
184 | "source": [
185 | "nyc_check_in = gpd.read_file('data/nyc_checkin.shp')\n",
186 | "nyc_check_in.head(1)"
187 | ]
188 | },
189 | {
190 | "cell_type": "code",
191 | "execution_count": 7,
192 | "metadata": {},
193 | "outputs": [
194 | {
195 | "data": {
196 | "text/html": [
197 | "\n",
198 | "\n",
211 | "
\n",
212 | " \n",
213 | " \n",
214 | " | \n",
215 | " venueId | \n",
216 | " userId | \n",
217 | " gender | \n",
218 | " friend_num | \n",
219 | " follow_num | \n",
220 | " latitude | \n",
221 | " longitude | \n",
222 | " week | \n",
223 | " hour | \n",
224 | " geometry | \n",
225 | "
\n",
226 | " \n",
227 | " venueCateg | \n",
228 | " | \n",
229 | " | \n",
230 | " | \n",
231 | " | \n",
232 | " | \n",
233 | " | \n",
234 | " | \n",
235 | " | \n",
236 | " | \n",
237 | " | \n",
238 | "
\n",
239 | " \n",
240 | " \n",
241 | " \n",
242 | " Subway | \n",
243 | " 10042 | \n",
244 | " 10042 | \n",
245 | " 10042 | \n",
246 | " 10042 | \n",
247 | " 10042 | \n",
248 | " 10042 | \n",
249 | " 10042 | \n",
250 | " 10042 | \n",
251 | " 10042 | \n",
252 | " 10042 | \n",
253 | "
\n",
254 | " \n",
255 | "
\n",
256 | "
"
257 | ],
258 | "text/plain": [
259 | " venueId userId gender friend_num follow_num latitude \\\n",
260 | "venueCateg \n",
261 | "Subway 10042 10042 10042 10042 10042 10042 \n",
262 | "\n",
263 | " longitude week hour geometry \n",
264 | "venueCateg \n",
265 | "Subway 10042 10042 10042 10042 "
266 | ]
267 | },
268 | "execution_count": 7,
269 | "metadata": {},
270 | "output_type": "execute_result"
271 | }
272 | ],
273 | "source": [
274 | "nyc_check_in.groupby(\"venueCateg\").count().sort_values(\"venueId\").tail(1)"
275 | ]
276 | },
277 | {
278 | "cell_type": "code",
279 | "execution_count": 8,
280 | "metadata": {},
281 | "outputs": [
282 | {
283 | "name": "stdout",
284 | "output_type": "stream",
285 | "text": [
286 | "(7228, 11)\n"
287 | ]
288 | },
289 | {
290 | "data": {
291 | "text/html": [
292 | "\n",
293 | "\n",
306 | "
\n",
307 | " \n",
308 | " \n",
309 | " | \n",
310 | " venueId | \n",
311 | " userId | \n",
312 | " gender | \n",
313 | " friend_num | \n",
314 | " follow_num | \n",
315 | " latitude | \n",
316 | " longitude | \n",
317 | " venueCateg | \n",
318 | " week | \n",
319 | " hour | \n",
320 | " geometry | \n",
321 | "
\n",
322 | " \n",
323 | " \n",
324 | " \n",
325 | " 7421 | \n",
326 | " 42829c80f964a5202f221fe3 | \n",
327 | " 1409 | \n",
328 | " female | \n",
329 | " 487.0 | \n",
330 | " 98.0 | \n",
331 | " 40.754239 | \n",
332 | " -73.985473 | \n",
333 | " Office | \n",
334 | " Tue | \n",
335 | " 8 | \n",
336 | " POINT (-73.98547 40.75424) | \n",
337 | "
\n",
338 | " \n",
339 | "
\n",
340 | "
"
341 | ],
342 | "text/plain": [
343 | " venueId userId gender friend_num follow_num \\\n",
344 | "7421 42829c80f964a5202f221fe3 1409 female 487.0 98.0 \n",
345 | "\n",
346 | " latitude longitude venueCateg week hour geometry \n",
347 | "7421 40.754239 -73.985473 Office Tue 8 POINT (-73.98547 40.75424) "
348 | ]
349 | },
350 | "execution_count": 8,
351 | "metadata": {},
352 | "output_type": "execute_result"
353 | }
354 | ],
355 | "source": [
356 | "venueCateg_list = [\"Office\", \"Home (private)\"]\n",
357 | "venueId_list = pd.DataFrame(nyc_check_in.venueId.unique()).sample(frac=0.5).values.squeeze()\n",
358 | "nyc_check_sticc = nyc_check_in[(nyc_check_in.venueCateg.isin(venueCateg_list))&(nyc_check_in.venueId.isin(venueId_list))]\n",
359 | "print(nyc_check_sticc.shape)\n",
360 | "nyc_check_sticc.head(1)"
361 | ]
362 | },
363 | {
364 | "cell_type": "code",
365 | "execution_count": 9,
366 | "metadata": {},
367 | "outputs": [
368 | {
369 | "name": "stderr",
370 | "output_type": "stream",
371 | "text": [
372 | "/home/kangyuhao/anaconda3/lib/python3.8/site-packages/geopandas/geodataframe.py:853: SettingWithCopyWarning: \n",
373 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
374 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
375 | "\n",
376 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
377 | " super(GeoDataFrame, self).__setitem__(key, value)\n",
378 | "/home/kangyuhao/anaconda3/lib/python3.8/site-packages/geopandas/geodataframe.py:853: SettingWithCopyWarning: \n",
379 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
380 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
381 | "\n",
382 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
383 | " super(GeoDataFrame, self).__setitem__(key, value)\n"
384 | ]
385 | }
386 | ],
387 | "source": [
388 | "def return_week(x):\n",
389 | " if x == \"Mon\":\n",
390 | " return 1\n",
391 | " elif x == \"Tue\":\n",
392 | " return 2\n",
393 | " elif x == \"Wed\":\n",
394 | " return 3\n",
395 | " elif x == \"Thu\":\n",
396 | " return 4\n",
397 | " elif x == \"Fri\":\n",
398 | " return 5\n",
399 | " elif x == \"Sat\":\n",
400 | " return 6\n",
401 | " elif x == \"Sun\":\n",
402 | " return 7\n",
403 | " \n",
404 | "def return_category(x):\n",
405 | " if x == \"Gym\":\n",
406 | " return 1\n",
407 | " elif x == \"Coffee Shop\":\n",
408 | " return 2\n",
409 | " elif x == \"Office\":\n",
410 | " return 3\n",
411 | " elif x == \"Home (private)\":\n",
412 | " return 4\n",
413 | " elif x == \"Subway\":\n",
414 | " return 5\n",
415 | "\n",
416 | "nyc_check_sticc[\"week_attr\"] = nyc_check_sticc[\"week\"].apply(lambda x: return_week(x))\n",
417 | "nyc_check_sticc[\"category\"] = nyc_check_sticc[\"venueCateg\"].apply(lambda x: return_category(x))\n",
418 | "nyc_check_sticc = nyc_check_sticc.reset_index().drop(\"index\", axis=1)"
419 | ]
420 | },
421 | {
422 | "cell_type": "code",
423 | "execution_count": 10,
424 | "metadata": {},
425 | "outputs": [
426 | {
427 | "data": {
428 | "text/html": [
429 | "\n",
430 | "\n",
443 | "
\n",
444 | " \n",
445 | " \n",
446 | " | \n",
447 | " venueId | \n",
448 | " userId | \n",
449 | " gender | \n",
450 | " friend_num | \n",
451 | " follow_num | \n",
452 | " latitude | \n",
453 | " longitude | \n",
454 | " venueCateg | \n",
455 | " week | \n",
456 | " hour | \n",
457 | " geometry | \n",
458 | " week_attr | \n",
459 | " category | \n",
460 | "
\n",
461 | " \n",
462 | " \n",
463 | " \n",
464 | " 0 | \n",
465 | " 42829c80f964a5202f221fe3 | \n",
466 | " 1409 | \n",
467 | " female | \n",
468 | " 487.0 | \n",
469 | " 98.0 | \n",
470 | " 40.754239 | \n",
471 | " -73.985473 | \n",
472 | " Office | \n",
473 | " Tue | \n",
474 | " 8 | \n",
475 | " POINT (-73.98547 40.75424) | \n",
476 | " 2 | \n",
477 | " 3 | \n",
478 | "
\n",
479 | " \n",
480 | "
\n",
481 | "
"
482 | ],
483 | "text/plain": [
484 | " venueId userId gender friend_num follow_num \\\n",
485 | "0 42829c80f964a5202f221fe3 1409 female 487.0 98.0 \n",
486 | "\n",
487 | " latitude longitude venueCateg week hour geometry \\\n",
488 | "0 40.754239 -73.985473 Office Tue 8 POINT (-73.98547 40.75424) \n",
489 | "\n",
490 | " week_attr category \n",
491 | "0 2 3 "
492 | ]
493 | },
494 | "execution_count": 10,
495 | "metadata": {},
496 | "output_type": "execute_result"
497 | }
498 | ],
499 | "source": [
500 | "nyc_check_sticc.head(1)"
501 | ]
502 | },
503 | {
504 | "cell_type": "code",
505 | "execution_count": 11,
506 | "metadata": {},
507 | "outputs": [
508 | {
509 | "name": "stderr",
510 | "output_type": "stream",
511 | "text": [
512 | "/home/kangyuhao/anaconda3/lib/python3.8/site-packages/libpysal/weights/weights.py:172: UserWarning: The weights matrix is not fully connected: \n",
513 | " There are 156 disconnected components.\n",
514 | " warnings.warn(message)\n"
515 | ]
516 | }
517 | ],
518 | "source": [
519 | "kd = libpysal.cg.KDTree(np.array(nyc_check_sticc[[\"latitude\", \"longitude\"]].values))\n",
520 | "wnn = libpysal.weights.KNN(kd, 3)"
521 | ]
522 | },
523 | {
524 | "cell_type": "code",
525 | "execution_count": 12,
526 | "metadata": {},
527 | "outputs": [
528 | {
529 | "data": {
530 | "text/html": [
531 | "\n",
532 | "\n",
545 | "
\n",
546 | " \n",
547 | " \n",
548 | " | \n",
549 | " n_pt_0 | \n",
550 | " n_pt_1 | \n",
551 | " n_pt_2 | \n",
552 | "
\n",
553 | " \n",
554 | " \n",
555 | " \n",
556 | " 0 | \n",
557 | " 6322 | \n",
558 | " 5330 | \n",
559 | " 6317 | \n",
560 | "
\n",
561 | " \n",
562 | "
\n",
563 | "
"
564 | ],
565 | "text/plain": [
566 | " n_pt_0 n_pt_1 n_pt_2\n",
567 | "0 6322 5330 6317"
568 | ]
569 | },
570 | "execution_count": 12,
571 | "metadata": {},
572 | "output_type": "execute_result"
573 | }
574 | ],
575 | "source": [
576 | "nearest_pt = pd.DataFrame().from_dict(wnn.neighbors, orient=\"index\")\n",
577 | "for i in range(nearest_pt.shape[1]):\n",
578 | " nearest_pt = nearest_pt.rename({i:f\"n_pt_{i}\"}, axis=1)\n",
579 | "nearest_pt.head(1)"
580 | ]
581 | },
582 | {
583 | "cell_type": "code",
584 | "execution_count": 13,
585 | "metadata": {},
586 | "outputs": [
587 | {
588 | "data": {
589 | "text/html": [
590 | "\n",
591 | "\n",
604 | "
\n",
605 | " \n",
606 | " \n",
607 | " | \n",
608 | " venueId | \n",
609 | " userId | \n",
610 | " gender | \n",
611 | " friend_num | \n",
612 | " follow_num | \n",
613 | " latitude | \n",
614 | " longitude | \n",
615 | " venueCateg | \n",
616 | " week | \n",
617 | " hour | \n",
618 | " geometry | \n",
619 | " week_attr | \n",
620 | " category | \n",
621 | " n_pt_0 | \n",
622 | " n_pt_1 | \n",
623 | " n_pt_2 | \n",
624 | "
\n",
625 | " \n",
626 | " \n",
627 | " \n",
628 | " 0 | \n",
629 | " 42829c80f964a5202f221fe3 | \n",
630 | " 1409 | \n",
631 | " female | \n",
632 | " 487.0 | \n",
633 | " 98.0 | \n",
634 | " 40.754239 | \n",
635 | " -73.985473 | \n",
636 | " Office | \n",
637 | " Tue | \n",
638 | " 8 | \n",
639 | " POINT (-73.98547 40.75424) | \n",
640 | " 2 | \n",
641 | " 3 | \n",
642 | " 6322 | \n",
643 | " 5330 | \n",
644 | " 6317 | \n",
645 | "
\n",
646 | " \n",
647 | "
\n",
648 | "
"
649 | ],
650 | "text/plain": [
651 | " venueId userId gender friend_num follow_num \\\n",
652 | "0 42829c80f964a5202f221fe3 1409 female 487.0 98.0 \n",
653 | "\n",
654 | " latitude longitude venueCateg week hour geometry \\\n",
655 | "0 40.754239 -73.985473 Office Tue 8 POINT (-73.98547 40.75424) \n",
656 | "\n",
657 | " week_attr category n_pt_0 n_pt_1 n_pt_2 \n",
658 | "0 2 3 6322 5330 6317 "
659 | ]
660 | },
661 | "execution_count": 13,
662 | "metadata": {},
663 | "output_type": "execute_result"
664 | }
665 | ],
666 | "source": [
667 | "nyc_check_sticc = nyc_check_sticc.join(nearest_pt)\n",
668 | "nyc_check_sticc.head(1)"
669 | ]
670 | },
671 | {
672 | "cell_type": "code",
673 | "execution_count": 14,
674 | "metadata": {},
675 | "outputs": [],
676 | "source": [
677 | "nyc_check_sticc[[\"week_attr\", \"hour\", \"n_pt_0\", \n",
678 | " \"n_pt_1\", \"n_pt_2\"]].to_csv(r'nyc_checkin.txt', header=None, index=True, sep=',')"
679 | ]
680 | },
681 | {
682 | "cell_type": "code",
683 | "execution_count": 15,
684 | "metadata": {},
685 | "outputs": [],
686 | "source": [
687 | "w_voronoi = weights.Voronoi.from_dataframe(nyc_check_sticc)"
688 | ]
689 | },
690 | {
691 | "cell_type": "markdown",
692 | "metadata": {},
693 | "source": [
694 | "# STICC"
695 | ]
696 | },
697 | {
698 | "cell_type": "code",
699 | "execution_count": 16,
700 | "metadata": {
701 | "collapsed": true,
702 | "jupyter": {
703 | "outputs_hidden": true
704 | }
705 | },
706 | "outputs": [
707 | {
708 | "name": "stdout",
709 | "output_type": "stream",
710 | "text": [
711 | "lam_sparse 0.1\n",
712 | "switch_penalty 5.0\n",
713 | "num_cluster 2\n",
714 | "num stacked 4\n",
715 | "completed getting the data\n",
716 | "2 (7228, 2) (7228, 3)\n",
717 | "\n",
718 | "\n",
719 | "\n",
720 | "ITERATION ### 0\n",
721 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
722 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
723 | "length of the cluster 0 ------> 2425\n",
724 | "length of the cluster 1 ------> 4803\n",
725 | "UPDATED THE OLD COVARIANCE\n",
726 | "beginning the smoothening ALGORITHM\n",
727 | "length of cluster # 0 --------> 2595\n",
728 | "length of cluster # 1 --------> 4633\n",
729 | "Done writing the figure\n",
730 | "\n",
731 | "\n",
732 | "\n",
733 | "\n",
734 | "\n",
735 | "\n",
736 | "\n",
737 | "ITERATION ### 1\n",
738 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
739 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
740 | "length of the cluster 0 ------> 2595\n",
741 | "length of the cluster 1 ------> 4633\n",
742 | "UPDATED THE OLD COVARIANCE\n",
743 | "beginning the smoothening ALGORITHM\n",
744 | "length of cluster # 0 --------> 3342\n",
745 | "length of cluster # 1 --------> 3886\n",
746 | "Done writing the figure\n",
747 | "\n",
748 | "\n",
749 | "\n",
750 | "\n",
751 | "\n",
752 | "\n",
753 | "\n",
754 | "ITERATION ### 2\n",
755 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
756 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
757 | "length of the cluster 0 ------> 3342\n",
758 | "length of the cluster 1 ------> 3886\n",
759 | "UPDATED THE OLD COVARIANCE\n",
760 | "beginning the smoothening ALGORITHM\n",
761 | "length of cluster # 0 --------> 3701\n",
762 | "length of cluster # 1 --------> 3527\n",
763 | "Done writing the figure\n",
764 | "\n",
765 | "\n",
766 | "\n",
767 | "\n",
768 | "\n",
769 | "\n",
770 | "\n",
771 | "ITERATION ### 3\n",
772 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
773 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
774 | "length of the cluster 0 ------> 3701\n",
775 | "length of the cluster 1 ------> 3527\n",
776 | "UPDATED THE OLD COVARIANCE\n",
777 | "beginning the smoothening ALGORITHM\n",
778 | "length of cluster # 0 --------> 3865\n",
779 | "length of cluster # 1 --------> 3363\n",
780 | "Done writing the figure\n",
781 | "\n",
782 | "\n",
783 | "\n",
784 | "\n",
785 | "\n",
786 | "\n",
787 | "\n",
788 | "ITERATION ### 4\n",
789 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
790 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
791 | "length of the cluster 0 ------> 3865\n",
792 | "length of the cluster 1 ------> 3363\n",
793 | "UPDATED THE OLD COVARIANCE\n",
794 | "beginning the smoothening ALGORITHM\n",
795 | "length of cluster # 0 --------> 3928\n",
796 | "length of cluster # 1 --------> 3300\n",
797 | "Done writing the figure\n",
798 | "\n",
799 | "\n",
800 | "\n",
801 | "\n",
802 | "\n",
803 | "\n",
804 | "\n",
805 | "ITERATION ### 5\n",
806 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
807 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
808 | "length of the cluster 0 ------> 3928\n",
809 | "length of the cluster 1 ------> 3300\n",
810 | "UPDATED THE OLD COVARIANCE\n",
811 | "beginning the smoothening ALGORITHM\n",
812 | "length of cluster # 0 --------> 3955\n",
813 | "length of cluster # 1 --------> 3273\n",
814 | "Done writing the figure\n",
815 | "\n",
816 | "\n",
817 | "\n",
818 | "\n",
819 | "\n",
820 | "\n",
821 | "\n",
822 | "ITERATION ### 6\n",
823 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
824 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
825 | "length of the cluster 0 ------> 3955\n",
826 | "length of the cluster 1 ------> 3273\n",
827 | "UPDATED THE OLD COVARIANCE\n",
828 | "beginning the smoothening ALGORITHM\n",
829 | "length of cluster # 0 --------> 3972\n",
830 | "length of cluster # 1 --------> 3256\n",
831 | "Done writing the figure\n",
832 | "\n",
833 | "\n",
834 | "\n",
835 | "\n",
836 | "\n",
837 | "\n",
838 | "\n",
839 | "ITERATION ### 7\n",
840 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
841 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
842 | "length of the cluster 0 ------> 3972\n",
843 | "length of the cluster 1 ------> 3256\n",
844 | "UPDATED THE OLD COVARIANCE\n",
845 | "beginning the smoothening ALGORITHM\n",
846 | "length of cluster # 0 --------> 3974\n",
847 | "length of cluster # 1 --------> 3254\n",
848 | "Done writing the figure\n",
849 | "\n",
850 | "\n",
851 | "\n",
852 | "\n",
853 | "\n",
854 | "\n",
855 | "\n",
856 | "ITERATION ### 8\n",
857 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
858 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
859 | "length of the cluster 0 ------> 3974\n",
860 | "length of the cluster 1 ------> 3254\n",
861 | "UPDATED THE OLD COVARIANCE\n",
862 | "beginning the smoothening ALGORITHM\n",
863 | "length of cluster # 0 --------> 3976\n",
864 | "length of cluster # 1 --------> 3252\n",
865 | "Done writing the figure\n",
866 | "\n",
867 | "\n",
868 | "\n",
869 | "\n",
870 | "\n",
871 | "\n",
872 | "\n",
873 | "ITERATION ### 9\n",
874 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
875 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
876 | "length of the cluster 0 ------> 3976\n",
877 | "length of the cluster 1 ------> 3252\n",
878 | "UPDATED THE OLD COVARIANCE\n",
879 | "beginning the smoothening ALGORITHM\n",
880 | "length of cluster # 0 --------> 3976\n",
881 | "length of cluster # 1 --------> 3252\n",
882 | "Done writing the figure\n",
883 | "\n",
884 | "\n",
885 | "\n",
886 | "\n",
887 | "\n",
888 | "\n",
889 | "\n",
890 | "\n",
891 | "CONVERGED!!! BREAKING EARLY!!!\n",
892 | "\n",
893 | "\n",
894 | "\n",
895 | "TRAINING F1 score: -1 -1 -1\n",
896 | "[1.0000 1.0000 1.0000 ... 0.0000 0.0000 1.0000]\n"
897 | ]
898 | }
899 | ],
900 | "source": [
901 | "!python STICC_main.py --fname=nyc_checkin.txt --oname=result_nyc_checkin.txt --attr_idx_start=1 \\\n",
902 | "--attr_idx_end=2 --spatial_idx_start=3 --spatial_idx_end=5 \\\n",
903 | "--spatial_radius 4 --number_of_clusters 2 --lambda_parameter 10e-2 --beta 5 --maxIters 20"
904 | ]
905 | },
906 | {
907 | "cell_type": "code",
908 | "execution_count": 17,
909 | "metadata": {},
910 | "outputs": [
911 | {
912 | "name": "stdout",
913 | "output_type": "stream",
914 | "text": [
915 | "Adjusted rand score 0.5125812312665118\n",
916 | "Spatial contiguity: 0.8781538173477508\n",
917 | "f1_score 0.8552023059764304 [4, 3]\n"
918 | ]
919 | }
920 | ],
921 | "source": [
922 | "group = pd.read_table('result_nyc_checkin.txt', names=[\"group\"])\n",
923 | "result_nyc_check_sticc = nyc_check_sticc.join(group)\n",
924 | "result_nyc_check_sticc = result_nyc_check_sticc.rename({\"category\": \"clus_group_gt\"}, axis=1)\n",
925 | "print(\"Adjusted rand score\", adjusted_rand_score(result_nyc_check_sticc[\"group\"].values, \n",
926 | " result_nyc_check_sticc.clus_group_gt.values))\n",
927 | "sp_contiguity = cal_joint_statistic(result_nyc_check_sticc, w_voronoi)\n",
928 | "print(\"Spatial contiguity: \", sp_contiguity)\n",
929 | "get_max_f1_score(result_nyc_check_sticc)"
930 | ]
931 | },
932 | {
933 | "cell_type": "markdown",
934 | "metadata": {},
935 | "source": [
936 | "# Other methods"
937 | ]
938 | },
939 | {
940 | "cell_type": "code",
941 | "execution_count": 18,
942 | "metadata": {},
943 | "outputs": [],
944 | "source": [
945 | "def get_pycluster_result(ground_truth, cluster_method):\n",
946 | "    data = ground_truth[[\"week_attr\", \"hour\"]].values # For K-Means -- NOTE: overridden by the next line as written\n",
947 | "    data = ground_truth[[\"week_attr\", \"hour\", \"latitude\", \"longitude\"]].values # For Sp K-Means -- always in effect; comment out to run plain K-Means\n",
948 | " \n",
949 | " if cluster_method == kmeans:\n",
950 | " initial_centers = kmeans_plusplus_initializer(data.tolist(), 2).initialize()\n",
951 | " instance = cluster_method(data.tolist(), initial_centers)\n",
952 | " elif cluster_method == cure:\n",
953 | " print(\"cure\")\n",
954 | " instance = cure(data, 2)\n",
955 | " else:\n",
956 | " instance = cluster_method(data.tolist(), 2)\n",
957 | "\n",
958 | " instance.process()\n",
959 | " clusters = instance.get_clusters()\n",
960 | " \n",
961 | " clusters_result = []\n",
962 | " for i, clus in enumerate(clusters):\n",
963 | " for data in clus:\n",
964 | " clusters_result.append([data, i])\n",
965 | " clusters_result_df = pd.DataFrame(clusters_result, columns=[\"pt\", \"group\"]).sort_values(\"pt\").set_index(\"pt\")\n",
966 | " return clusters_result_df"
967 | ]
968 | },
969 | {
970 | "cell_type": "markdown",
971 | "metadata": {},
972 | "source": [
973 | "# K-Means"
974 | ]
975 | },
976 | {
977 | "cell_type": "code",
978 | "execution_count": 19,
979 | "metadata": {},
980 | "outputs": [
981 | {
982 | "name": "stdout",
983 | "output_type": "stream",
984 | "text": [
985 | "Adjusted rand score 0.04405713957270548\n",
986 | "Spatial contiguity: 0.6662453775218836\n",
987 | "f1_score 0.6069746428638629 [4, 3]\n"
988 | ]
989 | }
990 | ],
991 | "source": [
992 | "group = get_pycluster_result(nyc_check_sticc, kmeans)\n",
993 | "result_nyc_check_sticc = nyc_check_sticc.join(group)\n",
994 | "result_nyc_check_sticc = result_nyc_check_sticc.rename({\"category\": \"clus_group_gt\"}, axis=1)\n",
995 | "print(\"Adjusted rand score\", adjusted_rand_score(result_nyc_check_sticc[\"group\"].values, \n",
996 | " result_nyc_check_sticc.clus_group_gt.values))\n",
997 | "sp_contiguity = cal_joint_statistic(result_nyc_check_sticc, w_voronoi)\n",
998 | "print(\"Spatial contiguity: \", sp_contiguity)\n",
999 | "get_max_f1_score(result_nyc_check_sticc)"
1000 | ]
1001 | },
1002 | {
1003 | "cell_type": "markdown",
1004 | "metadata": {},
1005 | "source": [
1006 | "# Sp K-Means"
1007 | ]
1008 | },
1009 | {
1010 | "cell_type": "code",
1011 | "execution_count": 20,
1012 | "metadata": {},
1013 | "outputs": [
1014 | {
1015 | "name": "stdout",
1016 | "output_type": "stream",
1017 | "text": [
1018 | "Adjusted rand score 0.014733457093020055\n",
1019 | "Spatial contiguity: 0.6291100394563788\n",
1020 | "f1_score 0.5679695938194762 [3, 4]\n"
1021 | ]
1022 | }
1023 | ],
1024 | "source": [
1025 | "group = get_pycluster_result(nyc_check_sticc, kmeans)\n",
1026 | "result_nyc_check_sticc = nyc_check_sticc.join(group)\n",
1027 | "result_nyc_check_sticc = result_nyc_check_sticc.rename({\"category\": \"clus_group_gt\"}, axis=1)\n",
1028 | "print(\"Adjusted rand score\", adjusted_rand_score(result_nyc_check_sticc[\"group\"].values, \n",
1029 | " result_nyc_check_sticc.clus_group_gt.values))\n",
1030 | "sp_contiguity = cal_joint_statistic(result_nyc_check_sticc, w_voronoi)\n",
1031 | "print(\"Spatial contiguity: \", sp_contiguity)\n",
1032 | "get_max_f1_score(result_nyc_check_sticc)"
1033 | ]
1034 | },
1035 | {
1036 | "cell_type": "code",
1037 | "execution_count": 21,
1038 | "metadata": {},
1039 | "outputs": [
1040 | {
1041 | "data": {
1042 | "text/html": [
1043 | "\n",
1044 | "\n",
1057 | "
\n",
1058 | " \n",
1059 | " \n",
1060 | " | \n",
1061 | " venueId | \n",
1062 | " userId | \n",
1063 | " gender | \n",
1064 | " friend_num | \n",
1065 | " follow_num | \n",
1066 | " latitude | \n",
1067 | " longitude | \n",
1068 | " venueCateg | \n",
1069 | " week | \n",
1070 | " hour | \n",
1071 | " geometry | \n",
1072 | " week_attr | \n",
1073 | " category | \n",
1074 | " n_pt_0 | \n",
1075 | " n_pt_1 | \n",
1076 | " n_pt_2 | \n",
1077 | "
\n",
1078 | " \n",
1079 | " \n",
1080 | " \n",
1081 | " 0 | \n",
1082 | " 46ce971cf964a520414a1fe3 | \n",
1083 | " 2636 | \n",
1084 | " male | \n",
1085 | " 84.0 | \n",
1086 | " 84.0 | \n",
1087 | " 40.760867 | \n",
1088 | " -73.980347 | \n",
1089 | " Office | \n",
1090 | " Wed | \n",
1091 | " 23 | \n",
1092 | " POINT (-73.98035 40.76087) | \n",
1093 | " 3 | \n",
1094 | " 3 | \n",
1095 | " 322 | \n",
1096 | " 315 | \n",
1097 | " 288 | \n",
1098 | "
\n",
1099 | " \n",
1100 | "
\n",
1101 | "
"
1102 | ],
1103 | "text/plain": [
1104 | " venueId userId gender friend_num follow_num latitude \\\n",
1105 | "0 46ce971cf964a520414a1fe3 2636 male 84.0 84.0 40.760867 \n",
1106 | "\n",
1107 | " longitude venueCateg week hour geometry week_attr \\\n",
1108 | "0 -73.980347 Office Wed 23 POINT (-73.98035 40.76087) 3 \n",
1109 | "\n",
1110 | " category n_pt_0 n_pt_1 n_pt_2 \n",
1111 | "0 3 322 315 288 "
1112 | ]
1113 | },
1114 | "execution_count": 21,
1115 | "metadata": {},
1116 | "output_type": "execute_result"
1117 | }
1118 | ],
1119 | "source": [
1120 | "nyc_check_sticc.head(1)"
1121 | ]
1122 | },
1123 | {
1124 | "cell_type": "markdown",
1125 | "metadata": {},
1126 | "source": [
1127 | "# CURE"
1128 | ]
1129 | },
1130 | {
1131 | "cell_type": "code",
1132 | "execution_count": 22,
1133 | "metadata": {},
1134 | "outputs": [
1135 | {
1136 | "name": "stdout",
1137 | "output_type": "stream",
1138 | "text": [
1139 | "cure\n",
1140 | "Adjusted rand score 0.0011630086117161073\n",
1141 | "Spatial contiguity: 0.6589215256466462\n",
1142 | "f1_score 0.5566708449149055 [3, 4]\n"
1143 | ]
1144 | }
1145 | ],
1146 | "source": [
1147 | "group = get_pycluster_result(nyc_check_sticc, cure)\n",
1148 | "result_nyc_check_sticc = nyc_check_sticc.join(group)\n",
1149 | "result_nyc_check_sticc = result_nyc_check_sticc.rename({\"category\": \"clus_group_gt\"}, axis=1)\n",
1150 | "print(\"Adjusted rand score\", adjusted_rand_score(result_nyc_check_sticc[\"group\"].values, \n",
1151 | " result_nyc_check_sticc.clus_group_gt.values))\n",
1152 | "sp_contiguity = cal_joint_statistic(result_nyc_check_sticc, w_voronoi)\n",
1153 | "print(\"Spatial contiguity: \", sp_contiguity)\n",
1154 | "get_max_f1_score(result_nyc_check_sticc)"
1155 | ]
1156 | },
1157 | {
1158 | "cell_type": "markdown",
1159 | "metadata": {},
1160 | "source": [
1161 | "# GMM"
1162 | ]
1163 | },
1164 | {
1165 | "cell_type": "code",
1166 | "execution_count": 23,
1167 | "metadata": {},
1168 | "outputs": [],
1169 | "source": [
1170 | "from sklearn.mixture import GaussianMixture"
1171 | ]
1172 | },
1173 | {
1174 | "cell_type": "code",
1175 | "execution_count": 24,
1176 | "metadata": {},
1177 | "outputs": [
1178 | {
1179 | "data": {
1180 | "text/html": [
1181 | "\n",
1182 | "\n",
1195 | "
\n",
1196 | " \n",
1197 | " \n",
1198 | " | \n",
1199 | " venueId | \n",
1200 | " userId | \n",
1201 | " gender | \n",
1202 | " friend_num | \n",
1203 | " follow_num | \n",
1204 | " latitude | \n",
1205 | " longitude | \n",
1206 | " venueCateg | \n",
1207 | " week | \n",
1208 | " hour | \n",
1209 | " geometry | \n",
1210 | " week_attr | \n",
1211 | " category | \n",
1212 | " n_pt_0 | \n",
1213 | " n_pt_1 | \n",
1214 | " n_pt_2 | \n",
1215 | "
\n",
1216 | " \n",
1217 | " \n",
1218 | " \n",
1219 | " 0 | \n",
1220 | " 46ce971cf964a520414a1fe3 | \n",
1221 | " 2636 | \n",
1222 | " male | \n",
1223 | " 84.0 | \n",
1224 | " 84.0 | \n",
1225 | " 40.760867 | \n",
1226 | " -73.980347 | \n",
1227 | " Office | \n",
1228 | " Wed | \n",
1229 | " 23 | \n",
1230 | " POINT (-73.98035 40.76087) | \n",
1231 | " 3 | \n",
1232 | " 3 | \n",
1233 | " 322 | \n",
1234 | " 315 | \n",
1235 | " 288 | \n",
1236 | "
\n",
1237 | " \n",
1238 | "
\n",
1239 | "
"
1240 | ],
1241 | "text/plain": [
1242 | " venueId userId gender friend_num follow_num latitude \\\n",
1243 | "0 46ce971cf964a520414a1fe3 2636 male 84.0 84.0 40.760867 \n",
1244 | "\n",
1245 | " longitude venueCateg week hour geometry week_attr \\\n",
1246 | "0 -73.980347 Office Wed 23 POINT (-73.98035 40.76087) 3 \n",
1247 | "\n",
1248 | " category n_pt_0 n_pt_1 n_pt_2 \n",
1249 | "0 3 322 315 288 "
1250 | ]
1251 | },
1252 | "execution_count": 24,
1253 | "metadata": {},
1254 | "output_type": "execute_result"
1255 | }
1256 | ],
1257 | "source": [
1258 | "gmm_data = nyc_check_sticc.copy()\n",
1259 | "gmm_data.head(1)"
1260 | ]
1261 | },
1262 | {
1263 | "cell_type": "code",
1264 | "execution_count": 25,
1265 | "metadata": {},
1266 | "outputs": [],
1267 | "source": [
1268 | "X = gmm_data[['hour', 'week_attr']].values"
1269 | ]
1270 | },
1271 | {
1272 | "cell_type": "code",
1273 | "execution_count": 26,
1274 | "metadata": {},
1275 | "outputs": [
1276 | {
1277 | "data": {
1278 | "text/html": [
1279 | "\n",
1280 | "\n",
1293 | "
\n",
1294 | " \n",
1295 | " \n",
1296 | " | \n",
1297 | " group | \n",
1298 | "
\n",
1299 | " \n",
1300 | " \n",
1301 | " \n",
1302 | " 0 | \n",
1303 | " 1 | \n",
1304 | "
\n",
1305 | " \n",
1306 | "
\n",
1307 | "
"
1308 | ],
1309 | "text/plain": [
1310 | " group\n",
1311 | "0 1"
1312 | ]
1313 | },
1314 | "execution_count": 26,
1315 | "metadata": {},
1316 | "output_type": "execute_result"
1317 | }
1318 | ],
1319 | "source": [
1320 | "gm = GaussianMixture(n_components=2).fit(X)\n",
1321 | "gmm = pd.DataFrame(gm.predict(X), columns=[\"group\"])\n",
1322 | "gmm.head(1)"
1323 | ]
1324 | },
1325 | {
1326 | "cell_type": "code",
1327 | "execution_count": 27,
1328 | "metadata": {},
1329 | "outputs": [
1330 | {
1331 | "name": "stdout",
1332 | "output_type": "stream",
1333 | "text": [
1334 | "Adjusted rand score 0.008571217314584517\n",
1335 | "Spatial contiguity: 0.6590530469092504\n",
1336 | "f1_score 0.568958207722862 [3, 4]\n"
1337 | ]
1338 | }
1339 | ],
1340 | "source": [
1341 | "result_nyc_check_sticc = nyc_check_sticc.join(gmm)\n",
1342 | "result_nyc_check_sticc = result_nyc_check_sticc.rename({\"category\": \"clus_group_gt\"}, axis=1)\n",
1343 | "print(\"Adjusted rand score\", adjusted_rand_score(result_nyc_check_sticc[\"group\"].values, \n",
1344 | " result_nyc_check_sticc.clus_group_gt.values))\n",
1345 | "sp_contiguity = cal_joint_statistic(result_nyc_check_sticc, w_voronoi)\n",
1346 | "print(\"Spatial contiguity: \", sp_contiguity)\n",
1347 | "get_max_f1_score(result_nyc_check_sticc)"
1348 | ]
1349 | }
1350 | ],
1351 | "metadata": {
1352 | "kernelspec": {
1353 | "display_name": "Python 3",
1354 | "language": "python",
1355 | "name": "python3"
1356 | },
1357 | "language_info": {
1358 | "codemirror_mode": {
1359 | "name": "ipython",
1360 | "version": 3
1361 | },
1362 | "file_extension": ".py",
1363 | "mimetype": "text/x-python",
1364 | "name": "python",
1365 | "nbconvert_exporter": "python",
1366 | "pygments_lexer": "ipython3",
1367 | "version": "3.8.3"
1368 | }
1369 | },
1370 | "nbformat": 4,
1371 | "nbformat_minor": 4
1372 | }
1373 |
--------------------------------------------------------------------------------
/NYC_checkin3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import random\n",
10 | "import pandas as pd\n",
11 | "import geopandas as gpd\n",
12 | "import matplotlib.pyplot as plt\n",
13 | "import esda\n",
14 | "import libpysal.weights as weights\n",
15 | "from esda.moran import Moran\n",
16 | "from shapely.geometry import Point, MultiPoint, LineString, Polygon, shape\n",
17 | "import json\n",
18 | "import pylab\n",
19 | "import libpysal\n",
20 | "import numpy as np\n",
21 | "from sklearn.metrics.cluster import adjusted_rand_score\n",
22 | "from sklearn.metrics import f1_score\n",
23 | "from pyclustering.cluster.cure import cure\n",
24 | "from pyclustering.cluster.kmeans import kmeans\n",
25 | "from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer\n",
26 | "from sklearn import preprocessing"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 2,
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 | "def permutation(lst):\n",
36 | " if len(lst) == 0:\n",
37 | " return []\n",
38 | "\n",
39 | " if len(lst) == 1:\n",
40 | " return [lst]\n",
41 | "\n",
42 | " l = []\n",
43 | " for i in range(len(lst)):\n",
44 | " m = lst[i]\n",
45 | " remLst = lst[:i] + lst[i+1:]\n",
46 | " for p in permutation(remLst):\n",
47 | " l.append([m] + p) \n",
48 | " return l"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 3,
54 | "metadata": {},
55 | "outputs": [],
56 | "source": [
57 | "def get_f1_score(df, permut):\n",
58 | " def match_clus(x, permut):\n",
59 | " if x == 0:\n",
60 | " return int(permut[0])\n",
61 | " elif x == 1:\n",
62 | " return int(permut[1])\n",
63 | " elif x == 2:\n",
64 | " return int(permut[1])\n",
65 | " else:\n",
66 | " return x\n",
67 | "\n",
68 | " df[\"group_match\"] = df[\"group\"].apply(lambda x: match_clus(x, permut))\n",
69 | " return df, f1_score(df.group_match.values, df.clus_group_gt.values, average='macro')"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": 4,
75 | "metadata": {},
76 | "outputs": [],
77 | "source": [
78 | "def get_max_f1_score(df):\n",
79 | " max_f1 = 0\n",
80 | " max_p = []\n",
81 | " for p in permutation([1,3,4]):\n",
82 | " df, f1 = get_f1_score(df, p)\n",
83 | " if max_f1 < f1:\n",
84 | " max_f1 = f1\n",
85 | " max_p = p\n",
86 | " print(\"f1_score \", max_f1, max_p)"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": 5,
92 | "metadata": {},
93 | "outputs": [],
94 | "source": [
95 | "def cal_joint_statistic(nyc_data, w_voronoi):\n",
96 | " matched_connects = 0\n",
97 | " all_neighbors_connects = 0\n",
98 | " for obj_id, neighbors in w_voronoi.neighbors.items():\n",
99 | " obj_clus = nyc_data.iat[obj_id, -1]\n",
100 | " for nei in neighbors:\n",
101 | " nei_clus = nyc_data.iat[nei, -1]\n",
102 | " all_neighbors_connects += 1\n",
103 | " if obj_clus == nei_clus:\n",
104 | " matched_connects += 1\n",
105 | " return matched_connects / all_neighbors_connects"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": 6,
111 | "metadata": {},
112 | "outputs": [
113 | {
114 | "data": {
115 | "text/html": [
116 | "\n",
117 | "\n",
130 | "
\n",
131 | " \n",
132 | " \n",
133 | " | \n",
134 | " venueId | \n",
135 | " userId | \n",
136 | " gender | \n",
137 | " friend_num | \n",
138 | " follow_num | \n",
139 | " latitude | \n",
140 | " longitude | \n",
141 | " venueCateg | \n",
142 | " week | \n",
143 | " hour | \n",
144 | " geometry | \n",
145 | "
\n",
146 | " \n",
147 | " \n",
148 | " \n",
149 | " 0 | \n",
150 | " 3fd66200f964a52000e71ee3 | \n",
151 | " 445 | \n",
152 | " male | \n",
153 | " 4.0 | \n",
154 | " 13.0 | \n",
155 | " 40.73385 | \n",
156 | " -74.002998 | \n",
157 | " Jazz Club | \n",
158 | " Sat | \n",
159 | " 8 | \n",
160 | " POINT (-74.00300 40.73385) | \n",
161 | "
\n",
162 | " \n",
163 | "
\n",
164 | "
"
165 | ],
166 | "text/plain": [
167 | " venueId userId gender friend_num follow_num latitude \\\n",
168 | "0 3fd66200f964a52000e71ee3 445 male 4.0 13.0 40.73385 \n",
169 | "\n",
170 | " longitude venueCateg week hour geometry \n",
171 | "0 -74.002998 Jazz Club Sat 8 POINT (-74.00300 40.73385) "
172 | ]
173 | },
174 | "execution_count": 6,
175 | "metadata": {},
176 | "output_type": "execute_result"
177 | }
178 | ],
179 | "source": [
180 | "nyc_check_in = gpd.read_file('data/nyc_checkin.shp')\n",
181 | "nyc_check_in.head(1)"
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": 7,
187 | "metadata": {},
188 | "outputs": [
189 | {
190 | "data": {
191 | "text/html": [
192 | "\n",
193 | "\n",
206 | "
\n",
207 | " \n",
208 | " \n",
209 | " | \n",
210 | " venueId | \n",
211 | " userId | \n",
212 | " gender | \n",
213 | " friend_num | \n",
214 | " follow_num | \n",
215 | " latitude | \n",
216 | " longitude | \n",
217 | " week | \n",
218 | " hour | \n",
219 | " geometry | \n",
220 | "
\n",
221 | " \n",
222 | " venueCateg | \n",
223 | " | \n",
224 | " | \n",
225 | " | \n",
226 | " | \n",
227 | " | \n",
228 | " | \n",
229 | " | \n",
230 | " | \n",
231 | " | \n",
232 | " | \n",
233 | "
\n",
234 | " \n",
235 | " \n",
236 | " \n",
237 | " Subway | \n",
238 | " 10042 | \n",
239 | " 10042 | \n",
240 | " 10042 | \n",
241 | " 10042 | \n",
242 | " 10042 | \n",
243 | " 10042 | \n",
244 | " 10042 | \n",
245 | " 10042 | \n",
246 | " 10042 | \n",
247 | " 10042 | \n",
248 | "
\n",
249 | " \n",
250 | "
\n",
251 | "
"
252 | ],
253 | "text/plain": [
254 | " venueId userId gender friend_num follow_num latitude \\\n",
255 | "venueCateg \n",
256 | "Subway 10042 10042 10042 10042 10042 10042 \n",
257 | "\n",
258 | " longitude week hour geometry \n",
259 | "venueCateg \n",
260 | "Subway 10042 10042 10042 10042 "
261 | ]
262 | },
263 | "execution_count": 7,
264 | "metadata": {},
265 | "output_type": "execute_result"
266 | }
267 | ],
268 | "source": [
269 | "nyc_check_in.groupby(\"venueCateg\").count().sort_values(\"venueId\").tail(1)"
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": 8,
275 | "metadata": {},
276 | "outputs": [
277 | {
278 | "name": "stdout",
279 | "output_type": "stream",
280 | "text": [
281 | "(5909, 11)\n"
282 | ]
283 | },
284 | {
285 | "data": {
286 | "text/html": [
287 | "\n",
288 | "\n",
301 | "
\n",
302 | " \n",
303 | " \n",
304 | " | \n",
305 | " venueId | \n",
306 | " userId | \n",
307 | " gender | \n",
308 | " friend_num | \n",
309 | " follow_num | \n",
310 | " latitude | \n",
311 | " longitude | \n",
312 | " venueCateg | \n",
313 | " week | \n",
314 | " hour | \n",
315 | " geometry | \n",
316 | "
\n",
317 | " \n",
318 | " \n",
319 | " \n",
320 | " 1828 | \n",
321 | " 3fd66200f964a5206fe71ee3 | \n",
322 | " 654 | \n",
323 | " male | \n",
324 | " 103.0 | \n",
325 | " 46.0 | \n",
326 | " 40.752901 | \n",
327 | " -73.974176 | \n",
328 | " Gym | \n",
329 | " Mon | \n",
330 | " 17 | \n",
331 | " POINT (-73.97418 40.75290) | \n",
332 | "
\n",
333 | " \n",
334 | "
\n",
335 | "
"
336 | ],
337 | "text/plain": [
338 | " venueId userId gender friend_num follow_num \\\n",
339 | "1828 3fd66200f964a5206fe71ee3 654 male 103.0 46.0 \n",
340 | "\n",
341 | " latitude longitude venueCateg week hour geometry \n",
342 | "1828 40.752901 -73.974176 Gym Mon 17 POINT (-73.97418 40.75290) "
343 | ]
344 | },
345 | "execution_count": 8,
346 | "metadata": {},
347 | "output_type": "execute_result"
348 | }
349 | ],
350 | "source": [
351 | "venueCateg_list = [\"Gym\", \"Office\", \"Home (private)\"]\n",
352 | "venueId_list = pd.DataFrame(nyc_check_in.venueId.unique()).sample(frac=0.3).values.squeeze()\n",
353 | "nyc_check_sticc = nyc_check_in[(nyc_check_in.venueCateg.isin(venueCateg_list))&(nyc_check_in.venueId.isin(venueId_list))]\n",
354 | "print(nyc_check_sticc.shape)\n",
355 | "nyc_check_sticc.head(1)"
356 | ]
357 | },
358 | {
359 | "cell_type": "code",
360 | "execution_count": 9,
361 | "metadata": {},
362 | "outputs": [
363 | {
364 | "name": "stderr",
365 | "output_type": "stream",
366 | "text": [
367 | "/home/kangyuhao/anaconda3/lib/python3.8/site-packages/geopandas/geodataframe.py:853: SettingWithCopyWarning: \n",
368 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
369 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
370 | "\n",
371 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
372 | " super(GeoDataFrame, self).__setitem__(key, value)\n",
373 | "/home/kangyuhao/anaconda3/lib/python3.8/site-packages/geopandas/geodataframe.py:853: SettingWithCopyWarning: \n",
374 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
375 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
376 | "\n",
377 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
378 | " super(GeoDataFrame, self).__setitem__(key, value)\n"
379 | ]
380 | }
381 | ],
382 | "source": [
383 | "def return_week(x):\n",
384 | " if x == \"Mon\":\n",
385 | " return 1\n",
386 | " elif x == \"Tue\":\n",
387 | " return 2\n",
388 | " elif x == \"Wed\":\n",
389 | " return 3\n",
390 | " elif x == \"Thu\":\n",
391 | " return 4\n",
392 | " elif x == \"Fri\":\n",
393 | " return 5\n",
394 | " elif x == \"Sat\":\n",
395 | " return 6\n",
396 | " elif x == \"Sun\":\n",
397 | " return 7\n",
398 | " \n",
399 | "def return_category(x):\n",
400 | " if x == \"Gym\":\n",
401 | " return 1\n",
402 | " elif x == \"Coffee Shop\":\n",
403 | " return 2\n",
404 | " elif x == \"Office\":\n",
405 | " return 3\n",
406 | " elif x == \"Home (private)\":\n",
407 | " return 4\n",
408 | " elif x == \"Subway\":\n",
409 | " return 5\n",
410 | "\n",
411 | "nyc_check_sticc[\"week_attr\"] = nyc_check_sticc[\"week\"].apply(lambda x: return_week(x))\n",
412 | "nyc_check_sticc[\"category\"] = nyc_check_sticc[\"venueCateg\"].apply(lambda x: return_category(x))\n",
413 | "nyc_check_sticc = nyc_check_sticc.reset_index().drop(\"index\", axis=1)"
414 | ]
415 | },
416 | {
417 | "cell_type": "code",
418 | "execution_count": 10,
419 | "metadata": {},
420 | "outputs": [
421 | {
422 | "data": {
423 | "text/html": [
424 | "\n",
425 | "\n",
438 | "
\n",
439 | " \n",
440 | " \n",
441 | " | \n",
442 | " venueId | \n",
443 | " userId | \n",
444 | " gender | \n",
445 | " friend_num | \n",
446 | " follow_num | \n",
447 | " latitude | \n",
448 | " longitude | \n",
449 | " venueCateg | \n",
450 | " week | \n",
451 | " hour | \n",
452 | " geometry | \n",
453 | " week_attr | \n",
454 | " category | \n",
455 | "
\n",
456 | " \n",
457 | " \n",
458 | " \n",
459 | " 0 | \n",
460 | " 3fd66200f964a5206fe71ee3 | \n",
461 | " 654 | \n",
462 | " male | \n",
463 | " 103.0 | \n",
464 | " 46.0 | \n",
465 | " 40.752901 | \n",
466 | " -73.974176 | \n",
467 | " Gym | \n",
468 | " Mon | \n",
469 | " 17 | \n",
470 | " POINT (-73.97418 40.75290) | \n",
471 | " 1 | \n",
472 | " 1 | \n",
473 | "
\n",
474 | " \n",
475 | "
\n",
476 | "
"
477 | ],
478 | "text/plain": [
479 | " venueId userId gender friend_num follow_num latitude \\\n",
480 | "0 3fd66200f964a5206fe71ee3 654 male 103.0 46.0 40.752901 \n",
481 | "\n",
482 | " longitude venueCateg week hour geometry week_attr \\\n",
483 | "0 -73.974176 Gym Mon 17 POINT (-73.97418 40.75290) 1 \n",
484 | "\n",
485 | " category \n",
486 | "0 1 "
487 | ]
488 | },
489 | "execution_count": 10,
490 | "metadata": {},
491 | "output_type": "execute_result"
492 | }
493 | ],
494 | "source": [
495 | "nyc_check_sticc.head(1)"
496 | ]
497 | },
498 | {
499 | "cell_type": "code",
500 | "execution_count": 11,
501 | "metadata": {},
502 | "outputs": [
503 | {
504 | "name": "stderr",
505 | "output_type": "stream",
506 | "text": [
507 | "/home/kangyuhao/anaconda3/lib/python3.8/site-packages/libpysal/weights/weights.py:172: UserWarning: The weights matrix is not fully connected: \n",
508 | " There are 140 disconnected components.\n",
509 | " warnings.warn(message)\n"
510 | ]
511 | }
512 | ],
513 | "source": [
514 | "kd = libpysal.cg.KDTree(np.array(nyc_check_sticc[[\"latitude\", \"longitude\"]].values))\n",
515 | "wnn = libpysal.weights.KNN(kd, 3)"
516 | ]
517 | },
518 | {
519 | "cell_type": "code",
520 | "execution_count": 12,
521 | "metadata": {},
522 | "outputs": [
523 | {
524 | "data": {
525 | "text/html": [
526 | "\n",
527 | "\n",
540 | "
\n",
541 | " \n",
542 | " \n",
543 | " | \n",
544 | " n_pt_0 | \n",
545 | " n_pt_1 | \n",
546 | " n_pt_2 | \n",
547 | "
\n",
548 | " \n",
549 | " \n",
550 | " \n",
551 | " 0 | \n",
552 | " 3556 | \n",
553 | " 9 | \n",
554 | " 22 | \n",
555 | "
\n",
556 | " \n",
557 | "
\n",
558 | "
"
559 | ],
560 | "text/plain": [
561 | " n_pt_0 n_pt_1 n_pt_2\n",
562 | "0 3556 9 22"
563 | ]
564 | },
565 | "execution_count": 12,
566 | "metadata": {},
567 | "output_type": "execute_result"
568 | }
569 | ],
570 | "source": [
571 | "nearest_pt = pd.DataFrame().from_dict(wnn.neighbors, orient=\"index\")\n",
572 | "for i in range(nearest_pt.shape[1]):\n",
573 | " nearest_pt = nearest_pt.rename({i:f\"n_pt_{i}\"}, axis=1)\n",
574 | "nearest_pt.head(1)"
575 | ]
576 | },
577 | {
578 | "cell_type": "code",
579 | "execution_count": 13,
580 | "metadata": {},
581 | "outputs": [
582 | {
583 | "data": {
584 | "text/html": [
585 | "\n",
586 | "\n",
599 | "
\n",
600 | " \n",
601 | " \n",
602 | " | \n",
603 | " venueId | \n",
604 | " userId | \n",
605 | " gender | \n",
606 | " friend_num | \n",
607 | " follow_num | \n",
608 | " latitude | \n",
609 | " longitude | \n",
610 | " venueCateg | \n",
611 | " week | \n",
612 | " hour | \n",
613 | " geometry | \n",
614 | " week_attr | \n",
615 | " category | \n",
616 | " n_pt_0 | \n",
617 | " n_pt_1 | \n",
618 | " n_pt_2 | \n",
619 | "
\n",
620 | " \n",
621 | " \n",
622 | " \n",
623 | " 0 | \n",
624 | " 3fd66200f964a5206fe71ee3 | \n",
625 | " 654 | \n",
626 | " male | \n",
627 | " 103.0 | \n",
628 | " 46.0 | \n",
629 | " 40.752901 | \n",
630 | " -73.974176 | \n",
631 | " Gym | \n",
632 | " Mon | \n",
633 | " 17 | \n",
634 | " POINT (-73.97418 40.75290) | \n",
635 | " 1 | \n",
636 | " 1 | \n",
637 | " 3556 | \n",
638 | " 9 | \n",
639 | " 22 | \n",
640 | "
\n",
641 | " \n",
642 | "
\n",
643 | "
"
644 | ],
645 | "text/plain": [
646 | " venueId userId gender friend_num follow_num latitude \\\n",
647 | "0 3fd66200f964a5206fe71ee3 654 male 103.0 46.0 40.752901 \n",
648 | "\n",
649 | " longitude venueCateg week hour geometry week_attr \\\n",
650 | "0 -73.974176 Gym Mon 17 POINT (-73.97418 40.75290) 1 \n",
651 | "\n",
652 | " category n_pt_0 n_pt_1 n_pt_2 \n",
653 | "0 1 3556 9 22 "
654 | ]
655 | },
656 | "execution_count": 13,
657 | "metadata": {},
658 | "output_type": "execute_result"
659 | }
660 | ],
661 | "source": [
662 | "nyc_check_sticc = nyc_check_sticc.join(nearest_pt)\n",
663 | "nyc_check_sticc.head(1)"
664 | ]
665 | },
666 | {
667 | "cell_type": "code",
668 | "execution_count": 14,
669 | "metadata": {},
670 | "outputs": [],
671 | "source": [
672 | "nyc_check_sticc[[\"week_attr\", \"hour\", \"n_pt_0\", \"n_pt_1\", \n",
673 | " \"n_pt_2\"]].to_csv(r'nyc_checkin3.txt', header=None, index=True, sep=',')"
674 | ]
675 | },
676 | {
677 | "cell_type": "code",
678 | "execution_count": 15,
679 | "metadata": {},
680 | "outputs": [],
681 | "source": [
682 | "w_voronoi = weights.Voronoi.from_dataframe(nyc_check_sticc)"
683 | ]
684 | },
685 | {
686 | "cell_type": "markdown",
687 | "metadata": {},
688 | "source": [
689 | "# STICC"
690 | ]
691 | },
692 | {
693 | "cell_type": "code",
694 | "execution_count": 16,
695 | "metadata": {
696 | "collapsed": true,
697 | "jupyter": {
698 | "outputs_hidden": true
699 | }
700 | },
701 | "outputs": [
702 | {
703 | "name": "stdout",
704 | "output_type": "stream",
705 | "text": [
706 | "lam_sparse 0.1\n",
707 | "switch_penalty 5.0\n",
708 | "num_cluster 3\n",
709 | "num stacked 4\n",
710 | "completed getting the data\n",
711 | "2 (5909, 2) (5909, 3)\n",
712 | "\n",
713 | "\n",
714 | "\n",
715 | "ITERATION ### 0\n",
716 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
717 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
718 | "OPTIMIZATION for Cluster # 2 DONE!!!\n",
719 | "length of the cluster 0 ------> 3087\n",
720 | "length of the cluster 1 ------> 1475\n",
721 | "length of the cluster 2 ------> 1347\n",
722 | "UPDATED THE OLD COVARIANCE\n",
723 | "beginning the smoothening ALGORITHM\n",
724 | "length of cluster # 0 --------> 3196\n",
725 | "length of cluster # 1 --------> 1611\n",
726 | "length of cluster # 2 --------> 1102\n",
727 | "Done writing the figure\n",
728 | "\n",
729 | "\n",
730 | "\n",
731 | "\n",
732 | "\n",
733 | "\n",
734 | "\n",
735 | "ITERATION ### 1\n",
736 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
737 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
738 | "OPTIMIZATION for Cluster # 2 DONE!!!\n",
739 | "length of the cluster 0 ------> 3196\n",
740 | "length of the cluster 1 ------> 1611\n",
741 | "length of the cluster 2 ------> 1102\n",
742 | "UPDATED THE OLD COVARIANCE\n",
743 | "beginning the smoothening ALGORITHM\n",
744 | "length of cluster # 0 --------> 3012\n",
745 | "length of cluster # 1 --------> 1478\n",
746 | "length of cluster # 2 --------> 1419\n",
747 | "Done writing the figure\n",
748 | "\n",
749 | "\n",
750 | "\n",
751 | "\n",
752 | "\n",
753 | "\n",
754 | "\n",
755 | "ITERATION ### 2\n",
756 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
757 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
758 | "OPTIMIZATION for Cluster # 2 DONE!!!\n",
759 | "length of the cluster 0 ------> 3012\n",
760 | "length of the cluster 1 ------> 1478\n",
761 | "length of the cluster 2 ------> 1419\n",
762 | "UPDATED THE OLD COVARIANCE\n",
763 | "beginning the smoothening ALGORITHM\n",
764 | "length of cluster # 0 --------> 2800\n",
765 | "length of cluster # 1 --------> 1490\n",
766 | "length of cluster # 2 --------> 1619\n",
767 | "Done writing the figure\n",
768 | "\n",
769 | "\n",
770 | "\n",
771 | "\n",
772 | "\n",
773 | "\n",
774 | "\n",
775 | "ITERATION ### 3\n",
776 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
777 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
778 | "OPTIMIZATION for Cluster # 2 DONE!!!\n",
779 | "length of the cluster 0 ------> 2800\n",
780 | "length of the cluster 1 ------> 1490\n",
781 | "length of the cluster 2 ------> 1619\n",
782 | "UPDATED THE OLD COVARIANCE\n",
783 | "beginning the smoothening ALGORITHM\n",
784 | "length of cluster # 0 --------> 2696\n",
785 | "length of cluster # 1 --------> 1554\n",
786 | "length of cluster # 2 --------> 1659\n",
787 | "Done writing the figure\n",
788 | "\n",
789 | "\n",
790 | "\n",
791 | "\n",
792 | "\n",
793 | "\n",
794 | "\n",
795 | "ITERATION ### 4\n",
796 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
797 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
798 | "OPTIMIZATION for Cluster # 2 DONE!!!\n",
799 | "length of the cluster 0 ------> 2696\n",
800 | "length of the cluster 1 ------> 1554\n",
801 | "length of the cluster 2 ------> 1659\n",
802 | "UPDATED THE OLD COVARIANCE\n",
803 | "beginning the smoothening ALGORITHM\n",
804 | "length of cluster # 0 --------> 2644\n",
805 | "length of cluster # 1 --------> 1606\n",
806 | "length of cluster # 2 --------> 1659\n",
807 | "Done writing the figure\n",
808 | "\n",
809 | "\n",
810 | "\n",
811 | "\n",
812 | "\n",
813 | "\n",
814 | "\n",
815 | "ITERATION ### 5\n",
816 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
817 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
818 | "OPTIMIZATION for Cluster # 2 DONE!!!\n",
819 | "length of the cluster 0 ------> 2644\n",
820 | "length of the cluster 1 ------> 1606\n",
821 | "length of the cluster 2 ------> 1659\n",
822 | "UPDATED THE OLD COVARIANCE\n",
823 | "beginning the smoothening ALGORITHM\n",
824 | "length of cluster # 0 --------> 2633\n",
825 | "length of cluster # 1 --------> 1614\n",
826 | "length of cluster # 2 --------> 1662\n",
827 | "Done writing the figure\n",
828 | "\n",
829 | "\n",
830 | "\n",
831 | "\n",
832 | "\n",
833 | "\n",
834 | "\n",
835 | "ITERATION ### 6\n",
836 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
837 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
838 | "OPTIMIZATION for Cluster # 2 DONE!!!\n",
839 | "length of the cluster 0 ------> 2633\n",
840 | "length of the cluster 1 ------> 1614\n",
841 | "length of the cluster 2 ------> 1662\n",
842 | "UPDATED THE OLD COVARIANCE\n",
843 | "beginning the smoothening ALGORITHM\n",
844 | "length of cluster # 0 --------> 2625\n",
845 | "length of cluster # 1 --------> 1636\n",
846 | "length of cluster # 2 --------> 1648\n",
847 | "Done writing the figure\n",
848 | "\n",
849 | "\n",
850 | "\n",
851 | "\n",
852 | "\n",
853 | "\n",
854 | "\n",
855 | "ITERATION ### 7\n",
856 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
857 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
858 | "OPTIMIZATION for Cluster # 2 DONE!!!\n",
859 | "length of the cluster 0 ------> 2625\n",
860 | "length of the cluster 1 ------> 1636\n",
861 | "length of the cluster 2 ------> 1648\n",
862 | "UPDATED THE OLD COVARIANCE\n",
863 | "beginning the smoothening ALGORITHM\n",
864 | "length of cluster # 0 --------> 2623\n",
865 | "length of cluster # 1 --------> 1640\n",
866 | "length of cluster # 2 --------> 1646\n",
867 | "Done writing the figure\n",
868 | "\n",
869 | "\n",
870 | "\n",
871 | "\n",
872 | "\n",
873 | "\n",
874 | "\n",
875 | "ITERATION ### 8\n",
876 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
877 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
878 | "OPTIMIZATION for Cluster # 2 DONE!!!\n",
879 | "length of the cluster 0 ------> 2623\n",
880 | "length of the cluster 1 ------> 1640\n",
881 | "length of the cluster 2 ------> 1646\n",
882 | "UPDATED THE OLD COVARIANCE\n",
883 | "beginning the smoothening ALGORITHM\n",
884 | "length of cluster # 0 --------> 2623\n",
885 | "length of cluster # 1 --------> 1640\n",
886 | "length of cluster # 2 --------> 1646\n",
887 | "Done writing the figure\n",
888 | "\n",
889 | "\n",
890 | "\n",
891 | "\n",
892 | "\n",
893 | "\n",
894 | "\n",
895 | "ITERATION ### 9\n",
896 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
897 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
898 | "OPTIMIZATION for Cluster # 2 DONE!!!\n",
899 | "length of the cluster 0 ------> 2623\n",
900 | "length of the cluster 1 ------> 1640\n",
901 | "length of the cluster 2 ------> 1646\n",
902 | "UPDATED THE OLD COVARIANCE\n",
903 | "beginning the smoothening ALGORITHM\n",
904 | "length of cluster # 0 --------> 2623\n",
905 | "length of cluster # 1 --------> 1638\n",
906 | "length of cluster # 2 --------> 1648\n",
907 | "Done writing the figure\n",
908 | "\n",
909 | "\n",
910 | "\n",
911 | "\n",
912 | "\n",
913 | "\n",
914 | "\n",
915 | "ITERATION ### 10\n",
916 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
917 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
918 | "OPTIMIZATION for Cluster # 2 DONE!!!\n",
919 | "length of the cluster 0 ------> 2623\n",
920 | "length of the cluster 1 ------> 1638\n",
921 | "length of the cluster 2 ------> 1648\n",
922 | "UPDATED THE OLD COVARIANCE\n",
923 | "beginning the smoothening ALGORITHM\n",
924 | "length of cluster # 0 --------> 2623\n",
925 | "length of cluster # 1 --------> 1647\n",
926 | "length of cluster # 2 --------> 1639\n",
927 | "Done writing the figure\n",
928 | "\n",
929 | "\n",
930 | "\n",
931 | "\n",
932 | "\n",
933 | "\n",
934 | "\n",
935 | "ITERATION ### 11\n",
936 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
937 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
938 | "OPTIMIZATION for Cluster # 2 DONE!!!\n",
939 | "length of the cluster 0 ------> 2623\n",
940 | "length of the cluster 1 ------> 1647\n",
941 | "length of the cluster 2 ------> 1639\n",
942 | "UPDATED THE OLD COVARIANCE\n",
943 | "beginning the smoothening ALGORITHM\n",
944 | "length of cluster # 0 --------> 2623\n",
945 | "length of cluster # 1 --------> 1646\n",
946 | "length of cluster # 2 --------> 1640\n",
947 | "Done writing the figure\n",
948 | "\n",
949 | "\n",
950 | "\n",
951 | "\n",
952 | "\n",
953 | "\n",
954 | "\n",
955 | "ITERATION ### 12\n",
956 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
957 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
958 | "OPTIMIZATION for Cluster # 2 DONE!!!\n",
959 | "length of the cluster 0 ------> 2623\n",
960 | "length of the cluster 1 ------> 1646\n",
961 | "length of the cluster 2 ------> 1640\n",
962 | "UPDATED THE OLD COVARIANCE\n",
963 | "beginning the smoothening ALGORITHM\n",
964 | "length of cluster # 0 --------> 2623\n",
965 | "length of cluster # 1 --------> 1644\n",
966 | "length of cluster # 2 --------> 1642\n",
967 | "Done writing the figure\n",
968 | "\n",
969 | "\n",
970 | "\n",
971 | "\n",
972 | "\n",
973 | "\n",
974 | "\n",
975 | "ITERATION ### 13\n",
976 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
977 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
978 | "OPTIMIZATION for Cluster # 2 DONE!!!\n",
979 | "length of the cluster 0 ------> 2623\n",
980 | "length of the cluster 1 ------> 1644\n",
981 | "length of the cluster 2 ------> 1642\n",
982 | "UPDATED THE OLD COVARIANCE\n",
983 | "beginning the smoothening ALGORITHM\n",
984 | "length of cluster # 0 --------> 2623\n",
985 | "length of cluster # 1 --------> 1637\n",
986 | "length of cluster # 2 --------> 1649\n",
987 | "Done writing the figure\n",
988 | "\n",
989 | "\n",
990 | "\n",
991 | "\n",
992 | "\n",
993 | "\n",
994 | "\n",
995 | "ITERATION ### 14\n",
996 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
997 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
998 | "OPTIMIZATION for Cluster # 2 DONE!!!\n",
999 | "length of the cluster 0 ------> 2623\n",
1000 | "length of the cluster 1 ------> 1637\n",
1001 | "length of the cluster 2 ------> 1649\n",
1002 | "UPDATED THE OLD COVARIANCE\n",
1003 | "beginning the smoothening ALGORITHM\n",
1004 | "length of cluster # 0 --------> 2623\n",
1005 | "length of cluster # 1 --------> 1641\n",
1006 | "length of cluster # 2 --------> 1645\n",
1007 | "Done writing the figure\n",
1008 | "\n",
1009 | "\n",
1010 | "\n",
1011 | "\n",
1012 | "\n",
1013 | "\n",
1014 | "\n",
1015 | "ITERATION ### 15\n",
1016 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
1017 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
1018 | "OPTIMIZATION for Cluster # 2 DONE!!!\n",
1019 | "length of the cluster 0 ------> 2623\n",
1020 | "length of the cluster 1 ------> 1641\n",
1021 | "length of the cluster 2 ------> 1645\n",
1022 | "UPDATED THE OLD COVARIANCE\n",
1023 | "beginning the smoothening ALGORITHM\n",
1024 | "length of cluster # 0 --------> 2623\n",
1025 | "length of cluster # 1 --------> 1644\n",
1026 | "length of cluster # 2 --------> 1642\n",
1027 | "Done writing the figure\n",
1028 | "\n",
1029 | "\n",
1030 | "\n",
1031 | "\n",
1032 | "\n",
1033 | "\n",
1034 | "\n",
1035 | "ITERATION ### 16\n",
1036 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
1037 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
1038 | "OPTIMIZATION for Cluster # 2 DONE!!!\n",
1039 | "length of the cluster 0 ------> 2623\n",
1040 | "length of the cluster 1 ------> 1644\n",
1041 | "length of the cluster 2 ------> 1642\n",
1042 | "UPDATED THE OLD COVARIANCE\n",
1043 | "beginning the smoothening ALGORITHM\n",
1044 | "length of cluster # 0 --------> 2623\n",
1045 | "length of cluster # 1 --------> 1643\n",
1046 | "length of cluster # 2 --------> 1643\n",
1047 | "Done writing the figure\n",
1048 | "\n",
1049 | "\n",
1050 | "\n",
1051 | "\n",
1052 | "\n",
1053 | "\n",
1054 | "\n",
1055 | "ITERATION ### 17\n",
1056 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
1057 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
1058 | "OPTIMIZATION for Cluster # 2 DONE!!!\n",
1059 | "length of the cluster 0 ------> 2623\n",
1060 | "length of the cluster 1 ------> 1643\n",
1061 | "length of the cluster 2 ------> 1643\n",
1062 | "UPDATED THE OLD COVARIANCE\n",
1063 | "beginning the smoothening ALGORITHM\n",
1064 | "length of cluster # 0 --------> 2623\n",
1065 | "length of cluster # 1 --------> 1646\n",
1066 | "length of cluster # 2 --------> 1640\n",
1067 | "Done writing the figure\n",
1068 | "\n",
1069 | "\n",
1070 | "\n",
1071 | "\n",
1072 | "\n",
1073 | "\n",
1074 | "\n",
1075 | "ITERATION ### 18\n",
1076 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
1077 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
1078 | "OPTIMIZATION for Cluster # 2 DONE!!!\n",
1079 | "length of the cluster 0 ------> 2623\n",
1080 | "length of the cluster 1 ------> 1646\n",
1081 | "length of the cluster 2 ------> 1640\n",
1082 | "UPDATED THE OLD COVARIANCE\n",
1083 | "beginning the smoothening ALGORITHM\n",
1084 | "length of cluster # 0 --------> 2622\n",
1085 | "length of cluster # 1 --------> 1639\n",
1086 | "length of cluster # 2 --------> 1648\n",
1087 | "Done writing the figure\n",
1088 | "\n",
1089 | "\n",
1090 | "\n",
1091 | "\n",
1092 | "\n",
1093 | "\n",
1094 | "\n",
1095 | "ITERATION ### 19\n",
1096 | "OPTIMIZATION for Cluster # 0 DONE!!!\n",
1097 | "OPTIMIZATION for Cluster # 1 DONE!!!\n",
1098 | "OPTIMIZATION for Cluster # 2 DONE!!!\n",
1099 | "length of the cluster 0 ------> 2622\n",
1100 | "length of the cluster 1 ------> 1639\n",
1101 | "length of the cluster 2 ------> 1648\n",
1102 | "UPDATED THE OLD COVARIANCE\n",
1103 | "beginning the smoothening ALGORITHM\n",
1104 | "length of cluster # 0 --------> 2622\n",
1105 | "length of cluster # 1 --------> 1643\n",
1106 | "length of cluster # 2 --------> 1644\n",
1107 | "Done writing the figure\n",
1108 | "\n",
1109 | "\n",
1110 | "\n",
1111 | "\n",
1112 | "\n",
1113 | "\n",
1114 | "\n",
1115 | "TRAINING F1 score: -1 -1 -1\n",
1116 | "[1.0000 1.0000 1.0000 ... 0.0000 0.0000 0.0000]\n"
1117 | ]
1118 | }
1119 | ],
1120 | "source": [
1121 | "!python STICC_main.py --fname=nyc_checkin3.txt --oname=result_nyc_checkin3.txt --attr_idx_start=1 \\\n",
1122 | "--attr_idx_end=2 --spatial_idx_start=3 --spatial_idx_end=5 \\\n",
1123 | "--spatial_radius 4 --number_of_clusters 3 --lambda_parameter 10e-2 --beta 5 --maxIters 20"
1124 | ]
1125 | },
1126 | {
1127 | "cell_type": "code",
1128 | "execution_count": 17,
1129 | "metadata": {},
1130 | "outputs": [
1131 | {
1132 | "name": "stdout",
1133 | "output_type": "stream",
1134 | "text": [
1135 | "Adjusted rand score 0.299048156707623\n",
1136 | "Spatial contiguity: 0.7245619074978454\n",
1137 | "f1_score 0.5239363875462164 [3, 4, 1]\n"
1138 | ]
1139 | }
1140 | ],
1141 | "source": [
1142 | "group = pd.read_table('result_nyc_checkin3.txt', names=[\"group\"])\n",
1143 | "result_nyc_check_sticc = nyc_check_sticc.join(group)\n",
1144 | "result_nyc_check_sticc = result_nyc_check_sticc.rename({\"category\": \"clus_group_gt\"}, axis=1)\n",
1145 | "print(\"Adjusted rand score\", adjusted_rand_score(result_nyc_check_sticc[\"group\"].values, \n",
1146 | " result_nyc_check_sticc.clus_group_gt.values))\n",
1147 | "sp_contiguity = cal_joint_statistic(result_nyc_check_sticc, w_voronoi)\n",
1148 | "print(\"Spatial contiguity: \", sp_contiguity)\n",
1149 | "get_max_f1_score(result_nyc_check_sticc)"
1150 | ]
1151 | },
1152 | {
1153 | "cell_type": "markdown",
1154 | "metadata": {},
1155 | "source": [
1156 | "# Other methods"
1157 | ]
1158 | },
1159 | {
1160 | "cell_type": "code",
1161 | "execution_count": 18,
1162 | "metadata": {},
1163 | "outputs": [],
1164 | "source": [
1165 | "def get_pycluster_result(ground_truth, cluster_method):\n",
1166 | "# data = ground_truth[[\"week_attr\", \"hour\"]].values # For K-Means\n",
1167 | " data = ground_truth[[\"week_attr\", \"hour\", \"latitude\", \"longitude\"]].values # For Sp K-Means\n",
1168 | "\n",
1169 | " if cluster_method == kmeans:\n",
1170 | " initial_centers = kmeans_plusplus_initializer(data.tolist(), 2).initialize()\n",
1171 | " instance = cluster_method(data.tolist(), initial_centers)\n",
1172 | " elif cluster_method == cure:\n",
1173 | " print(\"cure\")\n",
1174 | " instance = cure(data, 3)\n",
1175 | " else:\n",
1176 | " instance = cluster_method(data.tolist(), 2)\n",
1177 | "\n",
1178 | " instance.process()\n",
1179 | " clusters = instance.get_clusters()\n",
1180 | " \n",
1181 | " clusters_result = []\n",
1182 | " for i, clus in enumerate(clusters):\n",
1183 | " for data in clus:\n",
1184 | " clusters_result.append([data, i])\n",
1185 | " clusters_result_df = pd.DataFrame(clusters_result, columns=[\"pt\", \"group\"]).sort_values(\"pt\").set_index(\"pt\")\n",
1186 | " return clusters_result_df"
1187 | ]
1188 | },
1189 | {
1190 | "cell_type": "markdown",
1191 | "metadata": {},
1192 | "source": [
1193 | "# K-Means"
1194 | ]
1195 | },
1196 | {
1197 | "cell_type": "code",
1198 | "execution_count": 19,
1199 | "metadata": {},
1200 | "outputs": [
1201 | {
1202 | "name": "stdout",
1203 | "output_type": "stream",
1204 | "text": [
1205 | "Adjusted rand score 0.06540493878619441\n",
1206 | "Spatial contiguity: 0.6700948003447286\n",
1207 | "f1_score 0.38086125317189695 [3, 4, 1]\n"
1208 | ]
1209 | }
1210 | ],
1211 | "source": [
1212 | "group = get_pycluster_result(nyc_check_sticc, kmeans)\n",
1213 | "result_nyc_check_sticc = nyc_check_sticc.join(group)\n",
1214 | "result_nyc_check_sticc = result_nyc_check_sticc.rename({\"category\": \"clus_group_gt\"}, axis=1)\n",
1215 | "print(\"Adjusted rand score\", adjusted_rand_score(result_nyc_check_sticc[\"group\"].values, \n",
1216 | " result_nyc_check_sticc.clus_group_gt.values))\n",
1217 | "sp_contiguity = cal_joint_statistic(result_nyc_check_sticc, w_voronoi)\n",
1218 | "print(\"Spatial contiguity: \", sp_contiguity)\n",
1219 | "get_max_f1_score(result_nyc_check_sticc)"
1220 | ]
1221 | },
1222 | {
1223 | "cell_type": "markdown",
1224 | "metadata": {},
1225 | "source": [
1226 | "# Sp K-Means"
1227 | ]
1228 | },
1229 | {
1230 | "cell_type": "code",
1231 | "execution_count": 20,
1232 | "metadata": {},
1233 | "outputs": [
1234 | {
1235 | "name": "stdout",
1236 | "output_type": "stream",
1237 | "text": [
1238 | "Adjusted rand score 0.06540493878619441\n",
1239 | "Spatial contiguity: 0.6700948003447286\n",
1240 | "f1_score 0.38086125317189695 [3, 4, 1]\n"
1241 | ]
1242 | }
1243 | ],
1244 | "source": [
1245 | "group = get_pycluster_result(nyc_check_sticc, kmeans)\n",
1246 | "result_nyc_check_sticc = nyc_check_sticc.join(group)\n",
1247 | "result_nyc_check_sticc = result_nyc_check_sticc.rename({\"category\": \"clus_group_gt\"}, axis=1)\n",
1248 | "print(\"Adjusted rand score\", adjusted_rand_score(result_nyc_check_sticc[\"group\"].values, \n",
1249 | " result_nyc_check_sticc.clus_group_gt.values))\n",
1250 | "sp_contiguity = cal_joint_statistic(result_nyc_check_sticc, w_voronoi)\n",
1251 | "print(\"Spatial contiguity: \", sp_contiguity)\n",
1252 | "get_max_f1_score(result_nyc_check_sticc)"
1253 | ]
1254 | },
1255 | {
1256 | "cell_type": "markdown",
1257 | "metadata": {},
1258 | "source": [
1259 | "# CURE"
1260 | ]
1261 | },
1262 | {
1263 | "cell_type": "code",
1264 | "execution_count": 21,
1265 | "metadata": {},
1266 | "outputs": [
1267 | {
1268 | "name": "stdout",
1269 | "output_type": "stream",
1270 | "text": [
1271 | "cure\n",
1272 | "Adjusted rand score 0.0729293684699148\n",
1273 | "Spatial contiguity: 0.6272335535765584\n",
1274 | "f1_score 0.4208030109481018 [3, 4, 1]\n"
1275 | ]
1276 | }
1277 | ],
1278 | "source": [
1279 | "group = get_pycluster_result(nyc_check_sticc, cure)\n",
1280 | "result_nyc_check_sticc = nyc_check_sticc.join(group)\n",
1281 | "result_nyc_check_sticc = result_nyc_check_sticc.rename({\"category\": \"clus_group_gt\"}, axis=1)\n",
1282 | "print(\"Adjusted rand score\", adjusted_rand_score(result_nyc_check_sticc[\"group\"].values, \n",
1283 | " result_nyc_check_sticc.clus_group_gt.values))\n",
1284 | "sp_contiguity = cal_joint_statistic(result_nyc_check_sticc, w_voronoi)\n",
1285 | "print(\"Spatial contiguity: \", sp_contiguity)\n",
1286 | "get_max_f1_score(result_nyc_check_sticc)"
1287 | ]
1288 | },
1289 | {
1290 | "cell_type": "markdown",
1291 | "metadata": {},
1292 | "source": [
1293 | "# GMM"
1294 | ]
1295 | },
1296 | {
1297 | "cell_type": "code",
1298 | "execution_count": 22,
1299 | "metadata": {},
1300 | "outputs": [],
1301 | "source": [
1302 | "from sklearn.mixture import GaussianMixture"
1303 | ]
1304 | },
1305 | {
1306 | "cell_type": "code",
1307 | "execution_count": 23,
1308 | "metadata": {},
1309 | "outputs": [
1310 | {
1311 | "data": {
1312 | "text/html": [
1313 | "\n",
1314 | "\n",
1327 | "
\n",
1328 | " \n",
1329 | " \n",
1330 | " | \n",
1331 | " venueId | \n",
1332 | " userId | \n",
1333 | " gender | \n",
1334 | " friend_num | \n",
1335 | " follow_num | \n",
1336 | " latitude | \n",
1337 | " longitude | \n",
1338 | " venueCateg | \n",
1339 | " week | \n",
1340 | " hour | \n",
1341 | " geometry | \n",
1342 | " week_attr | \n",
1343 | " category | \n",
1344 | " n_pt_0 | \n",
1345 | " n_pt_1 | \n",
1346 | " n_pt_2 | \n",
1347 | "
\n",
1348 | " \n",
1349 | " \n",
1350 | " \n",
1351 | " 0 | \n",
1352 | " 3fd66200f964a5206fe71ee3 | \n",
1353 | " 654 | \n",
1354 | " male | \n",
1355 | " 103.0 | \n",
1356 | " 46.0 | \n",
1357 | " 40.752901 | \n",
1358 | " -73.974176 | \n",
1359 | " Gym | \n",
1360 | " Mon | \n",
1361 | " 17 | \n",
1362 | " POINT (-73.97418 40.75290) | \n",
1363 | " 1 | \n",
1364 | " 1 | \n",
1365 | " 3556 | \n",
1366 | " 9 | \n",
1367 | " 22 | \n",
1368 | "
\n",
1369 | " \n",
1370 | "
\n",
1371 | "
"
1372 | ],
1373 | "text/plain": [
1374 | " venueId userId gender friend_num follow_num latitude \\\n",
1375 | "0 3fd66200f964a5206fe71ee3 654 male 103.0 46.0 40.752901 \n",
1376 | "\n",
1377 | " longitude venueCateg week hour geometry week_attr \\\n",
1378 | "0 -73.974176 Gym Mon 17 POINT (-73.97418 40.75290) 1 \n",
1379 | "\n",
1380 | " category n_pt_0 n_pt_1 n_pt_2 \n",
1381 | "0 1 3556 9 22 "
1382 | ]
1383 | },
1384 | "execution_count": 23,
1385 | "metadata": {},
1386 | "output_type": "execute_result"
1387 | }
1388 | ],
1389 | "source": [
1390 | "gmm_data = nyc_check_sticc.copy()\n",
1391 | "gmm_data.head(1)"
1392 | ]
1393 | },
1394 | {
1395 | "cell_type": "code",
1396 | "execution_count": 24,
1397 | "metadata": {},
1398 | "outputs": [],
1399 | "source": [
1400 | "X = gmm_data[['hour', 'week_attr']].values"
1401 | ]
1402 | },
1403 | {
1404 | "cell_type": "code",
1405 | "execution_count": 25,
1406 | "metadata": {},
1407 | "outputs": [
1408 | {
1409 | "data": {
1410 | "text/html": [
1411 | "\n",
1412 | "\n",
1425 | "
\n",
1426 | " \n",
1427 | " \n",
1428 | " | \n",
1429 | " group | \n",
1430 | "
\n",
1431 | " \n",
1432 | " \n",
1433 | " \n",
1434 | " 0 | \n",
1435 | " 1 | \n",
1436 | "
\n",
1437 | " \n",
1438 | "
\n",
1439 | "
"
1440 | ],
1441 | "text/plain": [
1442 | " group\n",
1443 | "0 1"
1444 | ]
1445 | },
1446 | "execution_count": 25,
1447 | "metadata": {},
1448 | "output_type": "execute_result"
1449 | }
1450 | ],
1451 | "source": [
1452 | "gm = GaussianMixture(n_components=3).fit(X)\n",
1453 | "gmm = pd.DataFrame(gm.predict(X), columns=[\"group\"])\n",
1454 | "gmm.head(1)"
1455 | ]
1456 | },
1457 | {
1458 | "cell_type": "code",
1459 | "execution_count": 26,
1460 | "metadata": {},
1461 | "outputs": [
1462 | {
1463 | "name": "stdout",
1464 | "output_type": "stream",
1465 | "text": [
1466 | "Adjusted rand score 0.09072443404391904\n",
1467 | "Spatial contiguity: 0.6405630565929331\n",
1468 | "f1_score 0.4349576813996136 [3, 4, 1]\n"
1469 | ]
1470 | }
1471 | ],
1472 | "source": [
1473 | "result_nyc_check_sticc = nyc_check_sticc.join(gmm)\n",
1474 | "result_nyc_check_sticc = result_nyc_check_sticc.rename({\"category\": \"clus_group_gt\"}, axis=1)\n",
1475 | "print(\"Adjusted rand score\", adjusted_rand_score(result_nyc_check_sticc[\"group\"].values, \n",
1476 | " result_nyc_check_sticc.clus_group_gt.values))\n",
1477 | "sp_contiguity = cal_joint_statistic(result_nyc_check_sticc, w_voronoi)\n",
1478 | "print(\"Spatial contiguity: \", sp_contiguity)\n",
1479 | "get_max_f1_score(result_nyc_check_sticc)"
1480 | ]
1481 | }
1482 | ],
1483 | "metadata": {
1484 | "kernelspec": {
1485 | "display_name": "Python 3",
1486 | "language": "python",
1487 | "name": "python3"
1488 | },
1489 | "language_info": {
1490 | "codemirror_mode": {
1491 | "name": "ipython",
1492 | "version": 3
1493 | },
1494 | "file_extension": ".py",
1495 | "mimetype": "text/x-python",
1496 | "name": "python",
1497 | "nbconvert_exporter": "python",
1498 | "pygments_lexer": "ipython3",
1499 | "version": "3.8.3"
1500 | }
1501 | },
1502 | "nbformat": 4,
1503 | "nbformat_minor": 4
1504 | }
1505 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [](https://opensource.org/licenses/BSD-2-Clause)
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 | STICC: A multivariate spatial clustering method for repeated geographic pattern discovery with consideration of spatial contiguity
10 |
11 |
12 | GeoDS Lab, Department of Geography, University of Wisconsin-Madison.
13 |
14 |
15 |
16 |
17 |
18 | ## Table of Contents
19 |
20 | * [Citation](#citation)
21 | * [About the Project](#about-the-project)
22 | * [Code Usage](#code-usage)
23 | * [Folder Structure](#folder-structure)
24 | * [License](#license)
25 | * [Contact](#contact)
26 | * [Acknowledgements](#acknowledgements)
27 |
28 |
29 | ## Citation
30 | If you use this algorithm in your research or applications, please cite this source:
31 |
32 | Kang, Y., Wu, K., Gao, S., Ng, I., Rao, J., Ye, S., Zhang, F. and Fei, T. [STICC: A multivariate spatial clustering method for repeated geographic pattern discovery with consideration of spatial contiguity](https://doi.org/10.1080/13658816.2022.2053980). *International Journal of Geographical Information Science* (2022). DOI:10.1080/13658816.2022.2053980.
33 |
34 |
35 | ```
36 | @article{kang2022sticc,
37 | title = {STICC: A multivariate spatial clustering method for repeated geographic pattern discovery with consideration of spatial contiguity},
38 | author = {Kang, Yuhao and Wu, Kunlin and Gao, Song and Ng, Ignavier and Rao, Jinmeng and Ye, Shan and Zhang, Fan and Fei, Teng},
39 | journal = {International Journal of Geographical Information Science},
40 | doi = {10.1080/13658816.2022.2053980},
41 | year = {2022}
42 | }
43 | ```
44 |
45 |
46 | ## About The Project
47 | Spatial clustering has been widely used for spatial data mining and knowledge discovery. An ideal multivariate spatial clustering should consider both spatial contiguity and aspatial attributes. Existing spatial clustering approaches may face challenges for discovering repeated geographic patterns with spatial contiguity maintained. In this paper, we propose a Spatial Toeplitz Inverse Covariance-Based Clustering (STICC) method that considers both attributes and spatial relationships of geographic objects for multivariate spatial clustering. A subregion is created for each geographic object serving as the basic unit when performing clustering. A Markov random field (MRF) is then constructed to characterize the attribute dependencies of subregions. Using a spatial consistency strategy, nearby objects are encouraged to belong to the same cluster. To test the performance of the proposed STICC algorithm, we apply it in two use cases. The comparison results with several baseline methods show that the STICC outperforms others significantly in terms of adjusted rand index and macro-F1. Joint count statistics is also calculated and shows that the spatial contiguity is well preserved by STICC. Such a spatial clustering method may benefit various applications in the fields of geography, remote sensing, transportation, and urban planning, etc.
48 |
49 | The expected outcome of using STICC for spatial clustering is shown as follows:
50 |
51 |
52 |
53 |
54 | The general idea of the STICC algorithm is illustrated as follows:
55 |
56 |
57 |
58 |
59 |
60 | The STICC algorithm is developed based on the TICC algorithm:
61 |
62 | D. Hallac, S. Vare, S. Boyd, and J. Leskovec. Toeplitz Inverse Covariance-Based Clustering of Multivariate Time Series Data. *Proceedings of the 23rd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining* 215--223 (2017)
63 |
64 | GitHub: [TICC](https://github.com/davidhallac/TICC)
65 |
66 | ## Code Usage
67 |
68 | Environment: Python 3.7 or newer
69 | See requirements.txt
70 |
71 | #### Experiment Reproduction
72 | To reproduce the experiments in the paper, please check the three Jupyter notebooks: synthetic.ipynb, NYC_checkin.ipynb, NYC_checkin3.ipynb. All datasets have been uploaded in the folder data/
73 |
74 | #### Input Data Structure
75 | The input data should be a .txt file with a .csv structure. The first column (column 0) indicates the unique identifier of the geographic object. The following columns indicate the attributes of the geographic object. The last several columns indicate the nearest neighbors of the geographic object.
76 |
77 | For instance, given the following two objects:
78 | ```
79 | 0,4.471435163732493,2.158530256342078,96.54097808132826,1016.5109582462767,997.3221602361555,41,78,45
80 | 1,2.8090243052935353,2.1454885080772383,68.55061966023295,1701.5536144719163,1001.8594793592364,11,80,35
81 | ```
82 |
83 | Column 0 indicates the id of the object, columns 1-5 show attributes of geographic objects, columns 6-8 are nearest neighbors. For the object 0, its nearest neighbor is object 41, its second nearest neighbor is object 78, and its third nearest neighbor is object 45.
84 |
85 |
86 | #### Execute Python Code
87 | To perform STICC on your own dataset, please run the following code in python.
88 |
89 | Usage:
90 |
91 | ```
92 | python STICC_main.py --fname=[input_data] --oname=[output_data] \
93 | --attr_idx_start=[attr_idx_start] --attr_idx_end=[attr_idx_end] \
94 | --spatial_idx_start=[spatial_idx_start] --spatial_idx_end=[spatial_idx_end] --spatial_radius=[spatial_radius] \
95 | --number_of_clusters=[number_of_clusters] --lambda_parameter=[lambda_parameter] --beta=[beta] --maxIters=[maxIters]
96 | ```
97 |
98 |
99 | ```
100 | --fname, input data name
101 | --oname, output file name
102 | --attr_idx_start, attribute start index
103 | --attr_idx_end, attribute end index
104 | --spatial_idx_start, neighboring object start index
105 | --spatial_idx_end, neighboring object end index
106 | --spatial_radius, radius of subregion
107 | --number_of_clusters, number of clusters
108 | --lambda_parameter, lambda
109 | --beta, beta
110 | --maxIters, maximum iterations
111 | ```
112 |
113 |
114 | Example:
115 | Perform STICC on the synthetic_data.txt with spatial radius=3 and beta=3.
116 | ```
117 | python STICC_main.py --fname=synthetic_data.txt --oname=result_synthetic_data.txt \
118 | --attr_idx_start=1 --attr_idx_end=5 --spatial_idx_start=6 --spatial_idx_end=8 \
119 | --spatial_radius=3 --number_of_clusters 7 --lambda_parameter 0.01 --beta 3 --maxIters 20
120 | ```
121 |
122 |
123 | If you meet the following error:
124 | ```
125 | numpy.linalg.LinAlgError: Eigenvalues did not converge
126 | ```
127 |
128 | A potential solution is to standardize your dataset.
129 |
130 | ## Folder Structure
131 | The folders and files are organized as follows.
132 | ```
133 | project
134 | |-- data
135 | |-- images
136 | |-- src
137 | | |-- __init__.py
138 | | |-- admm_solver.py
139 | | `-- STICC_helper.py
140 | |-- STICC_main.py
141 | |-- STICC_solver.py
142 | |-- synthetic.ipynb
143 | |-- NYC_checkin.ipynb
144 | `-- NYC_checkin3.ipynb
145 | ```
146 |
147 |
148 | ## License
149 |
150 | Distributed under the BSD License. See `LICENSE` for more information.
151 |
152 |
153 | ## Contact
154 |
155 | Yuhao Kang - [@YuhaoKang](https://twitter.com/YuhaoKang) - yuhao.kang at wisc.edu
156 | Song Gao - [@gissong](https://twitter.com/gissong) - song.gao at wisc.edu
157 |
158 | Project Link: [https://github.com/GeoDS/STICC](https://github.com/GeoDS/STICC)
159 |
160 |
161 | ## Acknowledgements
162 |
163 | Code inherits from [TICC](https://github.com/davidhallac/TICC).
164 |
165 | Yuhao Kang acknowledges the support by the Trewartha Research Award, Department of the Geography, University of Wisconsin-Madison. Song Gao and Jinmeng Rao acknowledge the support by the American Family Insurance Data Science Institute at the University of Wisconsin-Madison and the National Science Foundation funded AI institute (Grant No.2112606) for Intelligent Cyberinfrastructure with Computational Learning in the Environment (ICICLE). Fan Zhang would like to thank the support by the National Natural Science Foundation of China under Grant 41901321. Any opinions, findings, and conclusions or recommendations expressed in this material are those of the author(s) and do not necessarily reflect the views of the funders.
166 |
167 |
168 | [license-shield]: https://img.shields.io/github/license/othneildrew/Best-README-Template.svg?style=flat-square
169 | [license-url]: https://github.com/GeoDS/COVID19USFlows/blob/master/LICENSE.txt
170 |
--------------------------------------------------------------------------------
/STICC_main.py:
--------------------------------------------------------------------------------
"""Command-line entry point: parse options, run the STICC solver, persist results."""
from STICC_solver import STICC
import numpy as np
import argparse

# (flag, type, default, help) for every STICC command-line option.
_ARG_SPECS = [
    ('--fname', str, "synthetic_data.txt", 'Input data name'),
    ('--oname', str, "result_synthetic_data.txt", 'Output file name'),
    ('--attr_idx_start', int, 1, 'Attribute start index'),
    ('--attr_idx_end', int, 5, 'Attribute end index'),
    ('--spatial_idx_start', int, 6, 'Neighbouring object start index'),
    ('--spatial_idx_end', int, 8, 'Neighbouring object end index'),
    ('--spatial_radius', int, 3, 'Radius of the subregion'),
    ('--number_of_clusters', int, 5, 'Number of clusters'),
    ('--lambda_parameter', float, 0.1, 'Lambda'),
    ('--beta', float, 5, 'Beta'),
    ('--maxIters', int, 20, 'Max Iterations'),
]

parser = argparse.ArgumentParser(description='Parameters of the STICC')
for flag, arg_type, default, help_text in _ARG_SPECS:
    parser.add_argument(flag, type=arg_type, default=default, help=help_text)
args = parser.parse_args()

# Run the solver with the parsed configuration.
sticc = STICC(spatial_radius=args.spatial_radius, number_of_clusters=args.number_of_clusters,
              lambda_parameter=args.lambda_parameter, beta=args.beta, maxIters=args.maxIters,
              threshold=2e-5, write_out_file=False, prefix_string="output_folder/", num_proc=1,
              attr_idx_start=args.attr_idx_start, attr_idx_end=args.attr_idx_end,
              spatial_idx_start=args.spatial_idx_start, spatial_idx_end=args.spatial_idx_end)
cluster_assignment, cluster_MRFs = sticc.fit(input_file=args.fname)

# Persist the per-object cluster labels as a one-column text file.
print(cluster_assignment)
np.savetxt(args.oname, cluster_assignment, fmt='%d', delimiter=',')

# Persist each cluster's MRF (inverse covariance) as a .npy file.
for key, value in cluster_MRFs.items():
    with open(f'output_folder/MRF_{args.fname.split(".")[0]}_{key}.npy', 'wb') as f:
        np.save(f, np.array(value))
44 |
--------------------------------------------------------------------------------
/STICC_solver.py:
--------------------------------------------------------------------------------
1 | from src.admm_solver import ADMMSolver
2 | from src.STICC_helper import *
3 | from multiprocessing import Pool
4 | import pandas as pd
5 | from sklearn.cluster import KMeans
6 | from sklearn import mixture
7 | import matplotlib.pyplot as plt
8 | import numpy as np
9 | import math
10 | import time
11 | import collections
12 | import os
13 | import errno
14 | import sys
15 | import code
16 | import random
17 | import matplotlib
18 | matplotlib.use('Agg')
19 |
20 |
21 | class STICC:
22 | def __init__(self, spatial_radius=1, number_of_clusters=5, lambda_parameter=11e-2,
23 | beta=400, maxIters=1000, threshold=2e-5, write_out_file=False,
24 | prefix_string="", num_proc=1, cluster_reassignment=20, biased=False,
25 | attr_idx_start=0, attr_idx_end=0, spatial_idx_start=0, spatial_idx_end=0):
26 | """
27 | Parameters:
28 | - spatial_radius: size of the subregion
29 | - number_of_clusters: number of clusters
30 | - lambda_parameter: sparsity parameter
31 | - switch_penalty: temporal consistency parameter
32 | - maxIters: number of iterations
33 | - threshold: convergence threshold
34 | - write_out_file: (bool) if true, prefix_string is output file dir
35 | - prefix_string: output directory if necessary
36 | - cluster_reassignment: number of points to reassign to a 0 cluster
37 | - biased: Using the biased or the unbiased covariance
38 | """
39 | self.spatial_radius = spatial_radius
40 | self.number_of_clusters = number_of_clusters
41 | self.lambda_parameter = lambda_parameter
42 | self.switch_penalty = beta
43 | self.maxIters = maxIters
44 | self.threshold = threshold
45 | self.write_out_file = write_out_file
46 | self.prefix_string = prefix_string
47 | self.num_proc = num_proc
48 | self.cluster_reassignment = cluster_reassignment
49 | self.num_blocks = self.spatial_radius + 1
50 | self.biased = biased
51 | self.attr_idx_start = attr_idx_start
52 | self.attr_idx_end = attr_idx_end
53 | self.spatial_idx_start = spatial_idx_start
54 | self.spatial_idx_end = spatial_idx_end
55 | self.spatial_series_index = []
56 | self.spatial_series_close = []
57 | self.spatial_series_closest = []
58 | pd.set_option('display.max_columns', 500)
59 | np.set_printoptions(
60 | formatter={'float': lambda x: "{0:0.4f}".format(x)})
61 | np.random.seed(102)
62 |
    def fit(self, input_file):
        """
        Run the full STICC training loop on the data in ``input_file``.

        Parameters:
            - input_file: location of the data file (csv-structured .txt;
              column 0 is the object id, attribute and neighbour columns
              as configured via attr_idx_* / spatial_idx_*)

        Returns:
            (clustered_points, train_cluster_inverse): per-object cluster
            assignment and the per-cluster inverse covariance (MRF) dict.
        """
        assert self.maxIters > 0  # must have at least one iteration
        self.log_parameters()

        # Get data into proper format: split the raw array into attribute
        # columns and neighbour-index columns.

        total_arr, total_rows_size, total_cols_size = self.load_data(
            input_file)
        spatial_series_arr = total_arr[:,
                                       self.attr_idx_start:self.attr_idx_end+1]
        spatial_series_rows_size = total_rows_size
        spatial_series_col_size = self.attr_idx_end - self.attr_idx_start + 1
        spatial_series_index = total_arr[:, 0]
        spatial_series_close = total_arr[:,
                                         self.spatial_idx_start:self.spatial_idx_end+1]
        print(spatial_series_col_size, spatial_series_arr.shape,
              spatial_series_close.shape)
        self.spatial_series_closest = spatial_series_close[:, 0]
        self.spatial_series_index = spatial_series_index
        self.spatial_series_close = spatial_series_close

        ############
        # The basic folder to be created
        str_NULL = self.prepare_out_directory()

        # Train test split: every point is used for training.
        training_indices = spatial_series_index
        num_train_points = len(training_indices)

        # Stack the training data so each row holds a point's full subregion.
        # NOTE(review): spatial_series_col_size is passed for two different
        # parameters here — confirm against stack_training_data's signature.
        complete_D_train = self.stack_training_data(total_arr, spatial_series_col_size, num_train_points,
                                                    training_indices, spatial_series_col_size)

        # Initialization of the cluster assignment.
        # Gaussian Mixture
        gmm = mixture.GaussianMixture(
            n_components=self.number_of_clusters, covariance_type="full")
        gmm.fit(complete_D_train)
        clustered_points = gmm.predict(complete_D_train)
        gmm_clustered_pts = clustered_points + 0  # copy kept for the final confusion matrix
        # K-means labels overwrite the GMM labels as the actual starting point
        kmeans = KMeans(n_clusters=self.number_of_clusters,
                        n_init=300, random_state=0).fit(complete_D_train)
        clustered_points = kmeans.labels_
        # NOTE(review): the next two names hold the same label array; both are
        # kept only for the confusion-matrix bookkeeping below.
        clustered_points_kmeans = kmeans.labels_
        kmeans_clustered_pts = kmeans.labels_

        train_cluster_inverse = {}
        log_det_values = {}  # log dets of the thetas
        computed_covariance = {}
        cluster_mean_info = {}
        cluster_mean_stacked_info = {}
        old_clustered_points = None  # points from last iteration

        empirical_covariances = {}

        # PERFORM TRAINING ITERATIONS
        pool = Pool(processes=self.num_proc)  # multi-threading
        for iters in range(self.maxIters):
            print("\n\n\nITERATION ###", iters)
            # Group point indices by their current cluster label.
            train_clusters_arr = collections.defaultdict(
                list)  # {cluster: [point indices]}
            for point, cluster_num in enumerate(clustered_points):
                train_clusters_arr[cluster_num].append(point)

            len_train_clusters = {
                k: len(train_clusters_arr[k]) for k in range(self.number_of_clusters)}

            # train_clusters holds the indices in complete_D_train
            # for each of the clusters
            opt_res = self.train_clusters(cluster_mean_info, cluster_mean_stacked_info, complete_D_train,
                                          empirical_covariances, len_train_clusters, spatial_series_col_size, pool,
                                          train_clusters_arr)

            self.optimize_clusters(computed_covariance, len_train_clusters, log_det_values, opt_res,
                                   train_cluster_inverse)

            # update old computed covariance
            # NOTE(review): this binds a second name to the SAME dict — not a
            # copy — so later writes to computed_covariance are also visible
            # through old_computed_covariance. Confirm this is intentional.
            old_computed_covariance = computed_covariance

            print("UPDATED THE OLD COVARIANCE")

            self.trained_model = {'cluster_mean_info': cluster_mean_info,
                                  'computed_covariance': computed_covariance,
                                  'cluster_mean_stacked_info': cluster_mean_stacked_info,
                                  'complete_D_train': complete_D_train,
                                  'spatial_series_col_size': spatial_series_col_size}
            clustered_points = self.predict_clusters()

            # recalculate lengths after reassignment by predict_clusters
            new_train_clusters = collections.defaultdict(
                list)  # {cluster: [point indices]}
            for point, cluster in enumerate(clustered_points):
                new_train_clusters[cluster].append(point)

            len_new_train_clusters = {
                k: len(new_train_clusters[k]) for k in range(self.number_of_clusters)}

            before_empty_cluster_assign = clustered_points.copy()

            if iters != 0:
                cluster_norms = [(np.linalg.norm(old_computed_covariance[self.number_of_clusters, i]), i) for i in
                                 range(self.number_of_clusters)]
                norms_sorted = sorted(cluster_norms, reverse=True)
                # clusters that are not 0 as sorted by norm
                valid_clusters = [
                    cp[1] for cp in norms_sorted if len_new_train_clusters[cp[1]] != 0]

                # Add a point to the empty clusters
                # assuming more non empty clusters than empty ones
                counter = 0
                for cluster_num in range(self.number_of_clusters):
                    if len_new_train_clusters[cluster_num] == 0:
                        # borrow points from a non-empty cluster (largest norm first)
                        cluster_selected = valid_clusters[counter]
                        counter = (counter + 1) % len(valid_clusters)
                        print("cluster that is zero is:", cluster_num,
                              "selected cluster instead is:", cluster_selected)
                        start_point = np.random.choice(
                            new_train_clusters[cluster_selected])  # random point number from that cluster
                        for i in range(0, self.cluster_reassignment):
                            # put cluster_reassignment points from point_num in this cluster
                            point_to_move = start_point + i
                            if point_to_move >= len(clustered_points):
                                break
                            clustered_points[point_to_move] = cluster_num
                            computed_covariance[self.number_of_clusters, cluster_num] = old_computed_covariance[
                                self.number_of_clusters, cluster_selected]
                            cluster_mean_stacked_info[self.number_of_clusters, cluster_num] = complete_D_train[
                                point_to_move, :]
                            cluster_mean_info[self.number_of_clusters, cluster_num] \
                                = complete_D_train[point_to_move, :][
                                (self.spatial_radius - 1) * spatial_series_col_size:self.spatial_radius * spatial_series_col_size]

            for cluster_num in range(self.number_of_clusters):
                print("length of cluster #", cluster_num, "-------->",
                      sum([x == cluster_num for x in clustered_points]))

            # TEST SETS STUFF
            # LLE + switching_penalty
            # Segment length
            # Confusion matrices of the current, GMM and k-means assignments.
            train_confusion_matrix_EM = compute_confusion_matrix(self.number_of_clusters, clustered_points,
                                                                 training_indices)
            train_confusion_matrix_GMM = compute_confusion_matrix(self.number_of_clusters, gmm_clustered_pts,
                                                                  training_indices)
            train_confusion_matrix_kmeans = compute_confusion_matrix(self.number_of_clusters, kmeans_clustered_pts,
                                                                     training_indices)
            # compute the matchings
            matching_EM, matching_GMM, matching_Kmeans = self.compute_matches(train_confusion_matrix_EM,
                                                                              train_confusion_matrix_GMM,
                                                                              train_confusion_matrix_kmeans)

            print("\n\n\n")

            # Convergence: stop once the assignment no longer changes.
            if np.array_equal(old_clustered_points, clustered_points):
                print("\n\n\n\nCONVERGED!!! BREAKING EARLY!!!")
                break
            old_clustered_points = before_empty_cluster_assign
        # end of training
        if pool is not None:
            pool.close()
            pool.join()
        train_confusion_matrix_EM = compute_confusion_matrix(self.number_of_clusters, clustered_points,
                                                             training_indices)
        train_confusion_matrix_GMM = compute_confusion_matrix(self.number_of_clusters, gmm_clustered_pts,
                                                              training_indices)
        train_confusion_matrix_kmeans = compute_confusion_matrix(self.number_of_clusters, clustered_points_kmeans,
                                                                 training_indices)

        return clustered_points, train_cluster_inverse
241 |
242 | def compute_matches(self, train_confusion_matrix_EM, train_confusion_matrix_GMM, train_confusion_matrix_kmeans):
243 | matching_Kmeans = find_matching(train_confusion_matrix_kmeans)
244 | matching_GMM = find_matching(train_confusion_matrix_GMM)
245 | matching_EM = find_matching(train_confusion_matrix_EM)
246 | correct_e_m = 0
247 | correct_g_m_m = 0
248 | correct_k_means = 0
249 | for cluster in range(self.number_of_clusters):
250 | matched_cluster_e_m = matching_EM[cluster]
251 | matched_cluster_g_m_m = matching_GMM[cluster]
252 | matched_cluster_k_means = matching_Kmeans[cluster]
253 |
254 | correct_e_m += train_confusion_matrix_EM[cluster,
255 | matched_cluster_e_m]
256 | correct_g_m_m += train_confusion_matrix_GMM[cluster,
257 | matched_cluster_g_m_m]
258 | correct_k_means += train_confusion_matrix_kmeans[cluster,
259 | matched_cluster_k_means]
260 | return matching_EM, matching_GMM, matching_Kmeans
261 |
    def smoothen_clusters(self, cluster_mean_info, computed_covariance,
                          cluster_mean_stacked_info, complete_D_train, n):
        """Compute, for every point, the negative log-likelihood-style score
        against each cluster's Gaussian model.

        Args:
            cluster_mean_info: dict keyed by (number_of_clusters, cluster) ->
                per-cluster mean of the last attribute block (unused below
                apart from a lookup; kept for interface parity).
            computed_covariance: dict keyed by (number_of_clusters, cluster) ->
                covariance matrix of the stacked observations.
            cluster_mean_stacked_info: dict keyed by (number_of_clusters,
                cluster) -> mean of the full stacked observation vector.
            complete_D_train: stacked data, one row per point.
            n: size of a single observation vector.

        Returns:
            Array of shape (len(complete_D_train), number_of_clusters) with
            the LLE score of each point under each cluster; rows near the end
            whose stacked window would run past the data are left at zero.
        """
        clustered_points_len = len(complete_D_train)
        inv_cov_dict = {}  # cluster to inv_cov
        log_det_dict = {}  # cluster to log_det
        # Pre-compute each cluster's inverse covariance and log-determinant
        # once, so the per-point loop below only does matrix-vector products.
        for cluster in range(self.number_of_clusters):
            # NOTE(review): the slice bound (2*(num_blocks-1)-1)*n is assumed
            # to equal spatial_radius*n (the reshape below requires it) —
            # confirm against how num_blocks/spatial_radius are set.
            cov_matrix = computed_covariance[self.number_of_clusters, cluster][0:(2 * (self.num_blocks - 1)-1) * n,
                                                                              0:(2 * (self.num_blocks - 1)-1) * n]
            inv_cov_matrix = np.linalg.inv(cov_matrix)
            log_det_cov = np.log(np.linalg.det(cov_matrix)
                                 )  # log(det(sigma2|1))
            inv_cov_dict[cluster] = inv_cov_matrix
            log_det_dict[cluster] = log_det_cov
        # For each point compute the LLE
        print("beginning the smoothening ALGORITHM")
        LLE_all_points_clusters = np.zeros(
            [clustered_points_len, self.number_of_clusters])
        for point in range(clustered_points_len):
            # Skip points whose stacked window would extend past the data.
            if point + self.spatial_radius - 1 < complete_D_train.shape[0]:
                for cluster in range(self.number_of_clusters):
                    cluster_mean = cluster_mean_info[self.number_of_clusters, cluster]
                    cluster_mean_stacked = cluster_mean_stacked_info[self.number_of_clusters, cluster]
                    # Center the stacked observation on the cluster mean.
                    x = complete_D_train[point, :] - \
                        cluster_mean_stacked[0:(
                            2 * (self.num_blocks - 1)-1) * n]
                    inv_cov_matrix = inv_cov_dict[cluster]
                    log_det_cov = log_det_dict[cluster]
                    # Mahalanobis-style quadratic form x^T * Sigma^-1 * x plus
                    # the log-determinant term.
                    lle = np.dot(x.reshape([1, (self.spatial_radius) * n]),
                                 np.dot(inv_cov_matrix, x.reshape([n * (self.spatial_radius), 1]))) + log_det_cov
                    LLE_all_points_clusters[point, cluster] = lle

        return LLE_all_points_clusters
294 |
295 | def optimize_clusters(self, computed_covariance, len_train_clusters, log_det_values, optRes, train_cluster_inverse):
296 | for cluster in range(self.number_of_clusters):
297 | if optRes[cluster] == None:
298 | continue
299 | val = optRes[cluster].get()
300 | print("OPTIMIZATION for Cluster #", cluster, "DONE!!!")
301 | # THIS IS THE SOLUTION
302 | S_est = upperToFull(val, 0)
303 | X2 = S_est
304 | u, _ = np.linalg.eig(S_est)
305 | cov_out = np.linalg.inv(X2)
306 |
307 | # Store the log-det, covariance, inverse-covariance, cluster means, stacked means
308 | log_det_values[self.number_of_clusters,
309 | cluster] = np.log(np.linalg.det(cov_out))
310 | computed_covariance[self.number_of_clusters, cluster] = cov_out
311 | train_cluster_inverse[cluster] = X2
312 | for cluster in range(self.number_of_clusters):
313 | print("length of the cluster ", cluster,
314 | "------>", len_train_clusters[cluster])
315 |
316 | def train_clusters(self, cluster_mean_info, cluster_mean_stacked_info, complete_D_train, empirical_covariances,
317 | len_train_clusters, n, pool, train_clusters_arr):
318 | optRes = [None for i in range(self.number_of_clusters)]
319 | for cluster in range(self.number_of_clusters):
320 | cluster_length = len_train_clusters[cluster]
321 | if cluster_length != 0:
322 | size_blocks = n
323 | indices = train_clusters_arr[cluster]
324 | D_train = np.zeros([cluster_length, (self.spatial_radius) * n])
325 | for i in range(cluster_length):
326 | point = indices[i]
327 | D_train[i, :] = complete_D_train[point, :]
328 |
329 | cluster_mean_info[self.number_of_clusters, cluster] = np.mean(D_train, axis=0)[
330 | (
331 | self.spatial_radius - 1) * n:self.spatial_radius * n].reshape(
332 | [1, n])
333 | cluster_mean_stacked_info[self.number_of_clusters, cluster] = np.mean(
334 | D_train, axis=0)
335 | # Fit a model - OPTIMIZATION
336 | probSize = (self.spatial_radius) * size_blocks
337 | lamb = np.zeros((probSize, probSize)) + self.lambda_parameter
338 | S = np.cov(np.transpose(D_train), bias=self.biased)
339 | empirical_covariances[cluster] = S
340 |
341 | rho = 1
342 | solver = ADMMSolver(
343 | lamb, (self.spatial_radius), size_blocks, 1, S)
344 | # apply to process pool
345 | optRes[cluster] = pool.apply_async(
346 | solver, (1000, 1e-6, 1e-6, False,))
347 | return optRes
348 |
349 | def stack_training_data(self, Data, n, num_train_points, training_indices, spatial_cols_size):
350 | complete_D_train = np.zeros(
351 | [num_train_points, self.spatial_radius * n])
352 | # STICC data stack
353 | for i in range(num_train_points):
354 | for k in range(self.spatial_radius):
355 | if k == 0:
356 | complete_D_train[i][k * n:(k + 1) * n] = Data[i][1:(n + 1)]
357 | else:
358 | complete_D_train[i][k * n:(k + 1) *
359 | n] = Data[int(Data[i][n + k])][1:(n + 1)]
360 | return complete_D_train
361 |
362 | def prepare_out_directory(self):
363 | str_NULL = self.prefix_string
364 | if not os.path.exists(os.path.dirname(str_NULL)):
365 | try:
366 | os.makedirs(os.path.dirname(str_NULL))
367 | except OSError as exc: # Guard against race condition of path already existing
368 | if exc.errno != errno.EEXIST:
369 | raise
370 |
371 | return str_NULL
372 |
373 | def load_data(self, input_file):
374 | Data = np.loadtxt(input_file, delimiter=",")
375 | (m, n) = Data.shape # m: num of observations, n: size of observation vector
376 | print("completed getting the data")
377 | return Data, m, n
378 |
379 | def log_parameters(self):
380 | print("lam_sparse", self.lambda_parameter)
381 | print("switch_penalty", self.switch_penalty)
382 | print("num_cluster", self.number_of_clusters)
383 | print("num stacked", self.spatial_radius)
384 |
385 | def predict_clusters(self, test_data=None):
386 | '''
387 | Given the current trained model, predict clusters. If the cluster segmentation has not been optimized yet,
388 | than this will be part of the interative process.
389 |
390 | Args:
391 | numpy array of data for which to predict clusters. Columns are dimensions of the data, each row is
392 | a different timestamp
393 |
394 | Returns:
395 | vector of predicted cluster for the points
396 | '''
397 | if test_data is not None:
398 | if not isinstance(test_data, np.ndarray):
399 | raise TypeError("input must be a numpy array!")
400 | else:
401 | test_data = self.trained_model['complete_D_train']
402 |
403 | # SMOOTHENING
404 | lle_all_points_clusters = self.smoothen_clusters(self.trained_model['cluster_mean_info'],
405 | self.trained_model['computed_covariance'],
406 | self.trained_model['cluster_mean_stacked_info'],
407 | test_data,
408 | self.trained_model['spatial_series_col_size'])
409 |
410 | # Update cluster points - using NEW smoothening
411 | clustered_points = updateClusters(lle_all_points_clusters, switch_penalty=self.switch_penalty, spatial_series_index=self.spatial_series_index,
412 | spatial_series_closest=self.spatial_series_closest, spatial_radius=self.spatial_radius)
413 |
414 | return(clustered_points)
415 |
--------------------------------------------------------------------------------
/data/nyc_checkin.cpg:
--------------------------------------------------------------------------------
1 | ISO-8859-1
--------------------------------------------------------------------------------
/data/nyc_checkin.dbf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeoDS/STICC/6848a613198e55317c744fa3df423b68513b7fab/data/nyc_checkin.dbf
--------------------------------------------------------------------------------
/data/nyc_checkin.shp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeoDS/STICC/6848a613198e55317c744fa3df423b68513b7fab/data/nyc_checkin.shp
--------------------------------------------------------------------------------
/data/nyc_checkin.shx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeoDS/STICC/6848a613198e55317c744fa3df423b68513b7fab/data/nyc_checkin.shx
--------------------------------------------------------------------------------
/data/nyc_checkin.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeoDS/STICC/6848a613198e55317c744fa3df423b68513b7fab/data/nyc_checkin.zip
--------------------------------------------------------------------------------
/data/nyc_checkin_sticc.cpg:
--------------------------------------------------------------------------------
1 | ISO-8859-1
--------------------------------------------------------------------------------
/data/nyc_checkin_sticc.shp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeoDS/STICC/6848a613198e55317c744fa3df423b68513b7fab/data/nyc_checkin_sticc.shp
--------------------------------------------------------------------------------
/data/nyc_checkin_sticc.shx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeoDS/STICC/6848a613198e55317c744fa3df423b68513b7fab/data/nyc_checkin_sticc.shx
--------------------------------------------------------------------------------
/data/nyc_checkin_sticc3.cpg:
--------------------------------------------------------------------------------
1 | ISO-8859-1
--------------------------------------------------------------------------------
/data/nyc_checkin_sticc3.shp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeoDS/STICC/6848a613198e55317c744fa3df423b68513b7fab/data/nyc_checkin_sticc3.shp
--------------------------------------------------------------------------------
/data/nyc_checkin_sticc3.shx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeoDS/STICC/6848a613198e55317c744fa3df423b68513b7fab/data/nyc_checkin_sticc3.shx
--------------------------------------------------------------------------------
/data/sticc_points.dbf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeoDS/STICC/6848a613198e55317c744fa3df423b68513b7fab/data/sticc_points.dbf
--------------------------------------------------------------------------------
/data/sticc_points.shp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeoDS/STICC/6848a613198e55317c744fa3df423b68513b7fab/data/sticc_points.shp
--------------------------------------------------------------------------------
/data/sticc_points.shx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeoDS/STICC/6848a613198e55317c744fa3df423b68513b7fab/data/sticc_points.shx
--------------------------------------------------------------------------------
/data/sticc_points_spatial_multivariate.cpg:
--------------------------------------------------------------------------------
1 | UTF-8
--------------------------------------------------------------------------------
/data/sticc_points_spatial_multivariate.dbf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeoDS/STICC/6848a613198e55317c744fa3df423b68513b7fab/data/sticc_points_spatial_multivariate.dbf
--------------------------------------------------------------------------------
/data/sticc_points_spatial_multivariate.prj:
--------------------------------------------------------------------------------
1 | PROJCS["WGS_1984_Web_Mercator_Auxiliary_Sphere",GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137.0,298.257223563]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]],PROJECTION["Mercator_Auxiliary_Sphere"],PARAMETER["False_Easting",0.0],PARAMETER["False_Northing",0.0],PARAMETER["Central_Meridian",0.0],PARAMETER["Standard_Parallel_1",0.0],PARAMETER["Auxiliary_Sphere_Type",0.0],UNIT["Meter",1.0]]
--------------------------------------------------------------------------------
/data/sticc_points_spatial_multivariate.sbn:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeoDS/STICC/6848a613198e55317c744fa3df423b68513b7fab/data/sticc_points_spatial_multivariate.sbn
--------------------------------------------------------------------------------
/data/sticc_points_spatial_multivariate.shp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeoDS/STICC/6848a613198e55317c744fa3df423b68513b7fab/data/sticc_points_spatial_multivariate.shp
--------------------------------------------------------------------------------
/data/sticc_points_spatial_multivariate.shp.xml:
--------------------------------------------------------------------------------
1 |
2 | 20210624162713001.0TRUEFeatureClassToFeatureClass sticc_points "G:\My Drive\Documents\ArcGIS\Projects\STICC\STICC" sticc_points_spatial_multivariate.shp # "SOURCE_ID "SOURCE_ID" false false true 4 Long 0 0,First,#,G:\My Drive\Documents\ArcGIS\Projects\STICC\STICC\sticc_points.shp,sticc_points.FID,-1,-1;synthetic_ "Field2" true true false 8 Double 0 0,First,#,G:\My Drive\Documents\ArcGIS\Projects\STICC\STICC\sticc_points.shp,synthetic_data.txt.Field2,-1,-1;syntheti_1 "Field3" true true false 8 Double 0 0,First,#,G:\My Drive\Documents\ArcGIS\Projects\STICC\STICC\sticc_points.shp,synthetic_data.txt.Field3,-1,-1;syntheti_2 "Field4" true true false 8 Double 0 0,First,#,G:\My Drive\Documents\ArcGIS\Projects\STICC\STICC\sticc_points.shp,synthetic_data.txt.Field4,-1,-1;syntheti_3 "Field5" true true false 8 Double 0 0,First,#,G:\My Drive\Documents\ArcGIS\Projects\STICC\STICC\sticc_points.shp,synthetic_data.txt.Field5,-1,-1;syntheti_4 "Field6" true true false 8 Double 0 0,First,#,G:\My Drive\Documents\ArcGIS\Projects\STICC\STICC\sticc_points.shp,synthetic_data.txt.Field6,-1,-1" #SpatiallyConstrainedMultivariateClustering sticc_points "G:\My Drive\Documents\ArcGIS\Projects\STICC\STICC\sticc_points_spatial_multivariate.shp" synthetic_data.txt.Field2;synthetic_data.txt.Field3;synthetic_data.txt.Field4;synthetic_data.txt.Field5;synthetic_data.txt.Field6 None # # # 7 "Trimmed Delaunay triangulation" # 100 
#{"type":"CIMLayerDocument","version":"2.5.0","build":22081,"layers":["CIMPATH=map2/sticc_points_spatial_multivariate.xml"],"layerDefinitions":[{"type":"CIMFeatureLayer","name":"sticc_points_spatial_multivariate","uRI":"CIMPATH=map2/sticc_points_spatial_multivariate.xml","sourceModifiedTime":{"type":"TimeInstant"},"useSourceMetadata":true,"description":"sticc_points_spatial_multivariate","layerElevation":{"type":"CIMLayerElevationSurface","mapElevationID":"{46382D1F-EE9C-43F7-BD65-E563D4E9A7D8}"},"expanded":true,"layerType":"Operational","showLegends":true,"visibility":true,"displayCacheType":"Permanent","maxDisplayCacheAge":5,"showPopups":true,"serviceLayerID":-1,"charts":[{"type":"CIMChart","name":"Spatially Constrained Multivariate Clustering Box-Plots","series":[{"type":"CIMChartBoxPlotSeries","uniqueName":"Series0","fields":["","synthetic_","syntheti_1","syntheti_2","syntheti_3","syntheti_4"],"orderFields":[""],"verticalAxis":1,"colorType":"SingleColor","orderFieldsSortTypes":[0],"visible":true,"fillSymbolProperties":{"type":"CIMChartFillSymbolProperties","color":{"type":"CIMRGBColor","values":[178,178,178,100]}},"verticalOrientation":true,"standardizeValues":true},{"type":"CIMChartLineSeries","name":"1","uniqueName":"Series1","fields":["","synthetic_","syntheti_1","syntheti_2","syntheti_3","syntheti_4"],"orderFields":[""],"whereClause":"CLUSTER_ID = 
1","verticalAxis":1,"colorType":"SingleColor","fieldAggregation":["","MEAN"],"orderFieldsSortTypes":[0],"visible":true,"lineSymbolProperties":{"type":"CIMChartLineSymbolProperties","visible":true,"width":2,"style":"Solid","color":{"type":"CIMRGBColor","values":[31,120,180,100]}},"markerSymbolProperties":{"type":"CIMChartMarkerSymbolProperties","visible":true,"width":2,"height":2,"style":"Circle","color":{"type":"CIMRGBColor","values":[31,120,180,100]}},"timeAggregationType":"EqualIntervalsFromStartTime","timeIntervalUnits":"esriTimeUnitsMonths","timeIntervalSize":-1,"calculateAutomaticTimeInterval":true,"trimIncompleteTimeInterval":true,"nullPolicy":"Null","verticalOrientation":true},{"type":"CIMChartLineSeries","name":"2","uniqueName":"Series2","fields":["","synthetic_","syntheti_1","syntheti_2","syntheti_3","syntheti_4"],"orderFields":[""],"whereClause":"CLUSTER_ID = 2","verticalAxis":1,"colorType":"SingleColor","fieldAggregation":["","MEAN"],"orderFieldsSortTypes":[0],"visible":true,"lineSymbolProperties":{"type":"CIMChartLineSymbolProperties","visible":true,"width":2,"style":"Solid","color":{"type":"CIMRGBColor","values":[178,223,138,100]}},"markerSymbolProperties":{"type":"CIMChartMarkerSymbolProperties","visible":true,"width":2,"height":2,"style":"Circle","color":{"type":"CIMRGBColor","values":[178,223,138,100]}},"timeAggregationType":"EqualIntervalsFromStartTime","timeIntervalUnits":"esriTimeUnitsMonths","timeIntervalSize":-1,"calculateAutomaticTimeInterval":true,"trimIncompleteTimeInterval":true,"nullPolicy":"Null","verticalOrientation":true},{"type":"CIMChartLineSeries","name":"3","uniqueName":"Series3","fields":["","synthetic_","syntheti_1","syntheti_2","syntheti_3","syntheti_4"],"orderFields":[""],"whereClause":"CLUSTER_ID = 
3","verticalAxis":1,"colorType":"SingleColor","fieldAggregation":["","MEAN"],"orderFieldsSortTypes":[0],"visible":true,"lineSymbolProperties":{"type":"CIMChartLineSymbolProperties","visible":true,"width":2,"style":"Solid","color":{"type":"CIMRGBColor","values":[51,160,44,100]}},"markerSymbolProperties":{"type":"CIMChartMarkerSymbolProperties","visible":true,"width":2,"height":2,"style":"Circle","color":{"type":"CIMRGBColor","values":[51,160,44,100]}},"timeAggregationType":"EqualIntervalsFromStartTime","timeIntervalUnits":"esriTimeUnitsMonths","timeIntervalSize":-1,"calculateAutomaticTimeInterval":true,"trimIncompleteTimeInterval":true,"nullPolicy":"Null","verticalOrientation":true},{"type":"CIMChartLineSeries","name":"4","uniqueName":"Series4","fields":["","synthetic_","syntheti_1","syntheti_2","syntheti_3","syntheti_4"],"orderFields":[""],"whereClause":"CLUSTER_ID = 4","verticalAxis":1,"colorType":"SingleColor","fieldAggregation":["","MEAN"],"orderFieldsSortTypes":[0],"visible":true,"lineSymbolProperties":{"type":"CIMChartLineSymbolProperties","visible":true,"width":2,"style":"Solid","color":{"type":"CIMRGBColor","values":[251,154,153,100]}},"markerSymbolProperties":{"type":"CIMChartMarkerSymbolProperties","visible":true,"width":2,"height":2,"style":"Circle","color":{"type":"CIMRGBColor","values":[251,154,153,100]}},"timeAggregationType":"EqualIntervalsFromStartTime","timeIntervalUnits":"esriTimeUnitsMonths","timeIntervalSize":-1,"calculateAutomaticTimeInterval":true,"trimIncompleteTimeInterval":true,"nullPolicy":"Null","verticalOrientation":true},{"type":"CIMChartLineSeries","name":"5","uniqueName":"Series5","fields":["","synthetic_","syntheti_1","syntheti_2","syntheti_3","syntheti_4"],"orderFields":[""],"whereClause":"CLUSTER_ID = 
5","verticalAxis":1,"colorType":"SingleColor","fieldAggregation":["","MEAN"],"orderFieldsSortTypes":[0],"visible":true,"lineSymbolProperties":{"type":"CIMChartLineSymbolProperties","visible":true,"width":2,"style":"Solid","color":{"type":"CIMRGBColor","values":[227,26,28,100]}},"markerSymbolProperties":{"type":"CIMChartMarkerSymbolProperties","visible":true,"width":2,"height":2,"style":"Circle","color":{"type":"CIMRGBColor","values":[227,26,28,100]}},"timeAggregationType":"EqualIntervalsFromStartTime","timeIntervalUnits":"esriTimeUnitsMonths","timeIntervalSize":-1,"calculateAutomaticTimeInterval":true,"trimIncompleteTimeInterval":true,"nullPolicy":"Null","verticalOrientation":true},{"type":"CIMChartLineSeries","name":"6","uniqueName":"Series6","fields":["","synthetic_","syntheti_1","syntheti_2","syntheti_3","syntheti_4"],"orderFields":[""],"whereClause":"CLUSTER_ID = 6","verticalAxis":1,"colorType":"SingleColor","fieldAggregation":["","MEAN"],"orderFieldsSortTypes":[0],"visible":true,"lineSymbolProperties":{"type":"CIMChartLineSymbolProperties","visible":true,"width":2,"style":"Solid","color":{"type":"CIMRGBColor","values":[253,191,111,100]}},"markerSymbolProperties":{"type":"CIMChartMarkerSymbolProperties","visible":true,"width":2,"height":2,"style":"Circle","color":{"type":"CIMRGBColor","values":[253,191,111,100]}},"timeAggregationType":"EqualIntervalsFromStartTime","timeIntervalUnits":"esriTimeUnitsMonths","timeIntervalSize":-1,"calculateAutomaticTimeInterval":true,"trimIncompleteTimeInterval":true,"nullPolicy":"Null","verticalOrientation":true},{"type":"CIMChartLineSeries","name":"7","uniqueName":"Series7","fields":["","synthetic_","syntheti_1","syntheti_2","syntheti_3","syntheti_4"],"orderFields":[""],"whereClause":"CLUSTER_ID = 
7","verticalAxis":1,"colorType":"SingleColor","fieldAggregation":["","MEAN"],"orderFieldsSortTypes":[0],"visible":true,"lineSymbolProperties":{"type":"CIMChartLineSymbolProperties","visible":true,"width":2,"style":"Solid","color":{"type":"CIMRGBColor","values":[255,127,0,100]}},"markerSymbolProperties":{"type":"CIMChartMarkerSymbolProperties","visible":true,"width":2,"height":2,"style":"Circle","color":{"type":"CIMRGBColor","values":[255,127,0,100]}},"timeAggregationType":"EqualIntervalsFromStartTime","timeIntervalUnits":"esriTimeUnitsMonths","timeIntervalSize":-1,"calculateAutomaticTimeInterval":true,"trimIncompleteTimeInterval":true,"nullPolicy":"Null","verticalOrientation":true}],"generalProperties":{"type":"CIMChartGeneralProperties","title":"Spatially Constrained Multivariate Clustering Box-Plots","showTitle":true,"useAutomaticTitle":false,"showSubTitle":true,"showFooter":true,"titleText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontSize":16,"fontWeight":"Normal","textCase":"Normal"},"subTitleText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontSize":12,"fontWeight":"Normal","textCase":"Normal"},"footerText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontSize":12,"fontWeight":"Normal","textCase":"Normal"},"backgroundSymbolProperties":{"type":"CIMChartFillSymbolProperties","color":{"type":"CIMRGBColor","values":[255,255,255,100]}},"gridLineSymbolProperties":{"type":"CIMChartLineSymbolProperties","visible":true,"width":1,"style":"Solid","color":{"type":"CIMRGBColor","values":[119,119,119,100]}}},"legend":{"type":"CIMChartLegend","visible":true,"showTitle":true,"alignment":"Right","legendText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontSize
":10.8000000000000007,"fontWeight":"Normal","textCase":"Normal"},"legendTitle":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontSize":10.8000000000000007,"fontWeight":"Normal","textCase":"Normal"}},"axes":[{"type":"CIMChartAxis","visible":true,"title":"Analysis Fields","showTitle":true,"useAutomaticTitle":false,"valueFormat":"N2","dateTimeFormat":"M/d/yyyy","calculateAutomaticMinimum":true,"calculateAutomaticMaximum":true,"minimum":null,"maximum":null,"titleText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontItalic":true,"fontSize":12,"fontWeight":"Normal","textCase":"Normal"},"labelText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontSize":10.8000000000000007,"fontWeight":"Normal","textCase":"Normal"},"labelCharacterLimit":11,"navigationScaleFactor":1},{"type":"CIMChartAxis","visible":true,"title":"Standardized Values","showTitle":true,"useAutomaticTitle":false,"valueFormat":"N2","dateTimeFormat":"M/d/yyyy","calculateAutomaticMinimum":true,"calculateAutomaticMaximum":true,"minimum":null,"maximum":null,"titleText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontItalic":true,"fontSize":12,"fontWeight":"Normal","textCase":"Normal"},"labelText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontSize":10.8000000000000007,"fontWeight":"Normal","textCase":"Normal"},"labelCharacterLimit":11,"navigationScaleFactor":1}],"mapSelectionHandling":"Highlight"},{"type":"CIMChart","name":"Features Per Cluster 
Chart","series":[{"type":"CIMChartBarSeries","uniqueName":"Series0","fields":["CLUSTER_ID",""],"orderFields":["CLUSTER_ID"],"groupFields":["CLUSTER_ID"],"verticalAxis":1,"colorType":"ColorMatch","fieldAggregation":["","COUNT"],"orderFieldsSortTypes":[0],"visible":true,"multipleBarType":"SideBySide","barSize":90,"fillSymbolProperties":{"type":"CIMChartFillSymbolProperties","color":{"type":"CIMRGBColor","values":[166,206,227,100]}},"verticalOrientation":true}],"generalProperties":{"type":"CIMChartGeneralProperties","title":"Features Per Cluster Chart","showTitle":true,"useAutomaticTitle":false,"showSubTitle":true,"showFooter":true,"titleText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontSize":16,"fontWeight":"Normal","textCase":"Normal"},"subTitleText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontSize":12,"fontWeight":"Normal","textCase":"Normal"},"footerText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontSize":12,"fontWeight":"Normal","textCase":"Normal"},"backgroundSymbolProperties":{"type":"CIMChartFillSymbolProperties","color":{"type":"CIMRGBColor","values":[255,255,255,100]}},"gridLineSymbolProperties":{"type":"CIMChartLineSymbolProperties","visible":true,"width":1,"style":"Solid","color":{"type":"CIMRGBColor","values":[119,119,119,100]}}},"legend":{"type":"CIMChartLegend","visible":true,"showTitle":true,"alignment":"Right","legendText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontSize":10.8000000000000007,"fontWeight":"Normal","textCase":"Normal"},"legendTitle":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontSize":10.8000000000000007,"fontWeight":"Normal","te
xtCase":"Normal"}},"axes":[{"type":"CIMChartAxis","visible":true,"title":"Cluster","showTitle":true,"useAutomaticTitle":false,"valueFormat":"N2","dateTimeFormat":"M/d/yyyy","calculateAutomaticMinimum":true,"calculateAutomaticMaximum":true,"minimum":null,"maximum":null,"titleText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontItalic":true,"fontSize":12,"fontWeight":"Normal","textCase":"Normal"},"labelText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontSize":10.8000000000000007,"fontWeight":"Normal","textCase":"Normal"},"labelCharacterLimit":11,"navigationScaleFactor":1},{"type":"CIMChartAxis","visible":true,"title":"Count","showTitle":true,"useAutomaticTitle":false,"valueFormat":"N2","dateTimeFormat":"M/d/yyyy","calculateAutomaticMinimum":true,"calculateAutomaticMaximum":true,"minimum":null,"maximum":null,"titleText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontItalic":true,"fontSize":12,"fontWeight":"Normal","textCase":"Normal"},"labelText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontSize":10.8000000000000007,"fontWeight":"Normal","textCase":"Normal"},"labelCharacterLimit":11,"navigationScaleFactor":1}],"mapSelectionHandling":"Highlight"},{"type":"CIMChart","name":"Distribution of Membership 
Probability","series":[{"type":"CIMChartHistogramSeries","name":"Series0","uniqueName":"Series0","fields":["MEM_PROB"],"verticalAxis":1,"colorType":"SingleColor","visible":true,"binCount":0,"fillSymbolProperties":{"type":"CIMChartFillSymbolProperties","color":{"type":"CIMRGBColor","values":[166,206,227,100]}},"meanLineSymbolProperties":{"type":"CIMChartLineSymbolProperties","visible":true,"width":2,"style":"Solid","color":{"type":"CIMRGBColor","values":[227,36,0,100]}},"medianLineSymbolProperties":{"type":"CIMChartLineSymbolProperties","visible":true,"width":2,"style":"Solid","color":{"type":"CIMRGBColor","values":[144,66,159,100]}},"standardDeviationLineSymbolProperties":{"type":"CIMChartLineSymbolProperties","visible":true,"width":2,"style":"Solid","color":{"type":"CIMRGBColor","values":[192,154,98,100]}},"dataTransformationType":"None","distributionLineSymbolProperties":{"type":"CIMChartLineSymbolProperties","visible":true,"width":2,"style":"Solid","color":{"type":"CIMRGBColor","values":[101,158,199,100]}}}],"generalProperties":{"type":"CIMChartGeneralProperties","title":"Distribution of Membership 
Probability","showTitle":true,"useAutomaticTitle":false,"showSubTitle":true,"showFooter":true,"titleText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontSize":16,"fontWeight":"Normal","textCase":"Normal"},"subTitleText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontSize":12,"fontWeight":"Normal","textCase":"Normal"},"footerText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontSize":12,"fontWeight":"Normal","textCase":"Normal"},"backgroundSymbolProperties":{"type":"CIMChartFillSymbolProperties","color":{"type":"CIMRGBColor","values":[255,255,255,100]}},"gridLineSymbolProperties":{"type":"CIMChartLineSymbolProperties","visible":true,"width":1,"style":"Solid","color":{"type":"CIMRGBColor","values":[119,119,119,100]}}},"legend":{"type":"CIMChartLegend","visible":true,"showTitle":true,"alignment":"Right","legendText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontSize":10.8000000000000007,"fontWeight":"Normal","textCase":"Normal"},"legendTitle":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontSize":10.8000000000000007,"fontWeight":"Normal","textCase":"Normal"}},"axes":[{"type":"CIMChartAxis","visible":true,"title":"Membership 
Probability","showTitle":true,"useAutomaticTitle":false,"valueFormat":"N2","dateTimeFormat":"M/d/yyyy","calculateAutomaticMinimum":true,"calculateAutomaticMaximum":true,"minimum":null,"maximum":null,"titleText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontItalic":true,"fontSize":12,"fontWeight":"Normal","textCase":"Normal"},"labelText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontSize":10.8000000000000007,"fontWeight":"Normal","textCase":"Normal"},"labelCharacterLimit":11,"navigationScaleFactor":1},{"type":"CIMChartAxis","visible":true,"showTitle":true,"useAutomaticTitle":true,"valueFormat":"N2","dateTimeFormat":"M/d/yyyy","calculateAutomaticMinimum":true,"calculateAutomaticMaximum":true,"minimum":null,"maximum":null,"titleText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontItalic":true,"fontSize":12,"fontWeight":"Normal","textCase":"Normal"},"labelText":{"type":"CIMChartTextProperties","fontFillColor":{"type":"CIMRGBColor","values":[68,68,68,100]},"fontFamilyName":"Calibri","fontSize":10.8000000000000007,"fontWeight":"Normal","textCase":"Normal"},"labelCharacterLimit":11,"navigationScaleFactor":1}],"mapSelectionHandling":"Highlight"}],"refreshRate":-1,"refreshRateUnit":"esriTimeUnitsSeconds","autoGenerateFeatureTemplates":true,"featureElevationExpression":"0","featureTable":{"type":"CIMFeatureTable","displayField":"SOURCE_ID","editable":true,"dataConnection":{"type":"CIMStandardDataConnection","workspaceConnectionString":"DATABASE=G:\\My 
Drive\\Documents\\ArcGIS\\Projects\\STICC\\STICC","workspaceFactory":"Shapefile","dataset":"sticc_points_spatial_multivariate","datasetType":"esriDTFeatureClass"},"studyAreaSpatialRel":"esriSpatialRelUndefined","searchOrder":"esriSearchOrderSpatial"},"htmlPopupEnabled":true,"htmlPopupFormat":{"type":"CIMHtmlPopupFormat","htmlUseCodedDomainValues":true,"htmlPresentationStyle":"TwoColumnTable"},"isFlattened":true,"selectable":true,"featureCacheType":"Session","labelClasses":[{"type":"CIMLabelClass","expression":"[SOURCE_ID]","expressionEngine":"VBScript","featuresToLabel":"AllVisibleFeatures","maplexLabelPlacementProperties":{"type":"CIMMaplexLabelPlacementProperties","featureType":"Line","avoidPolygonHoles":true,"canOverrunFeature":true,"canPlaceLabelOutsidePolygon":true,"canRemoveOverlappingLabel":true,"canStackLabel":true,"connectionType":"Unambiguous","constrainOffset":"NoConstraint","contourAlignmentType":"Page","contourLadderType":"Straight","contourMaximumAngle":90,"enableConnection":true,"featureWeight":100,"fontHeightReductionLimit":4,"fontHeightReductionStep":0.5,"fontWidthReductionLimit":90,"fontWidthReductionStep":5,"graticuleAlignmentType":"Straight","labelBuffer":15,"labelLargestPolygon":true,"labelPriority":-1,"labelStackingProperties":{"type":"CIMMaplexLabelStackingProperties","stackAlignment":"ChooseBest","maximumNumberOfLines":3,"minimumNumberOfCharsPerLine":3,"maximumNumberOfCharsPerLine":24},"lineFeatureType":"General","linePlacementMethod":"OffsetCurvedFromLine","maximumLabelOverrun":36,"maximumLabelOverrunUnit":"Point","minimumFeatureSizeUnit":"Map","multiPartOption":"OneLabelPerPart","offsetAlongLineProperties":{"type":"CIMMaplexOffsetAlongLineProperties","placementMethod":"BestPositionAlongLine","labelAnchorPoint":"CenterOfLabel","distanceUnit":"Percentage","useLineDirection":true},"pointExternalZonePriorities":{"type":"CIMMaplexExternalZonePriorities","aboveLeft":4,"aboveCenter":2,"aboveRight":1,"centerRight":3,"belowRight":5,"belowCenter":7,"
belowLeft":8,"centerLeft":6},"pointPlacementMethod":"AroundPoint","polygonAnchorPointType":"GeometricCenter","polygonBoundaryWeight":200,"polygonExternalZones":{"type":"CIMMaplexExternalZonePriorities","aboveLeft":4,"aboveCenter":2,"aboveRight":1,"centerRight":3,"belowRight":5,"belowCenter":7,"belowLeft":8,"centerLeft":6},"polygonFeatureType":"General","polygonInternalZones":{"type":"CIMMaplexInternalZonePriorities","center":1},"polygonPlacementMethod":"CurvedInPolygon","primaryOffset":1,"primaryOffsetUnit":"Point","removeExtraWhiteSpace":true,"repetitionIntervalUnit":"Map","rotationProperties":{"type":"CIMMaplexRotationProperties","rotationType":"Arithmetic","alignmentType":"Straight"},"secondaryOffset":100,"strategyPriorities":{"type":"CIMMaplexStrategyPriorities","stacking":1,"overrun":2,"fontCompression":3,"fontReduction":4,"abbreviation":5},"thinningDistanceUnit":"Map","truncationMarkerCharacter":".","truncationMinimumLength":1,"truncationPreferredCharacters":"aeiou"},"name":"Default","priority":2,"standardLabelPlacementProperties":{"type":"CIMStandardLabelPlacementProperties","featureType":"Line","featureWeight":"None","labelWeight":"High","numLabelsOption":"OneLabelPerName","lineLabelPosition":{"type":"CIMStandardLineLabelPosition","above":true,"inLine":true,"parallel":true},"lineLabelPriorities":{"type":"CIMStandardLineLabelPriorities","aboveStart":3,"aboveAlong":3,"aboveEnd":3,"centerStart":3,"centerAlong":3,"centerEnd":3,"belowStart":3,"belowAlong":3,"belowEnd":3},"pointPlacementMethod":"AroundPoint","pointPlacementPriorities":{"type":"CIMStandardPointPlacementPriorities","aboveLeft":2,"aboveCenter":2,"aboveRight":1,"centerLeft":3,"centerRight":2,"belowLeft":3,"belowCenter":3,"belowRight":2},"rotationType":"Arithmetic","polygonPlacementMethod":"AlwaysHorizontal"},"textSymbol":{"type":"CIMSymbolReference","symbol":{"type":"CIMTextSymbol","blockProgression":"TTB","compatibilityMode":true,"depth3D":1,"drawSoftHyphen":true,"extrapolateBaselines":true,"flipAngl
e":90,"fontEffects":"Normal","fontEncoding":"Unicode","fontFamilyName":"Arial","fontStyleName":"Regular","fontType":"Unspecified","haloSize":1,"height":8,"hinting":"Default","horizontalAlignment":"Center","kerning":true,"letterWidth":100,"ligatures":true,"lineGapType":"ExtraLeading","shadowColor":{"type":"CIMRGBColor","values":[0,0,0,100]},"symbol":{"type":"CIMPolygonSymbol","symbolLayers":[{"type":"CIMSolidFill","enable":true,"color":{"type":"CIMRGBColor","values":[0,0,0,100]}}]},"textCase":"Normal","textDirection":"LTR","verticalAlignment":"Bottom","verticalGlyphOrientation":"Right","wordSpacing":100,"billboardMode3D":"FaceNearPlane"}},"useCodedValue":true,"visibility":true,"iD":-1}],"renderer":{"type":"CIMUniqueValueRenderer","colorRamp":{"type":"CIMRandomHSVColorRamp","colorSpace":{"type":"CIMICCColorSpace","url":"Default RGB"},"maxH":360,"minS":33,"maxS":66,"minV":50,"maxV":99,"minAlpha":100,"maxAlpha":100},"defaultLabel":"\u003call other values\u003e","defaultSymbol":{"type":"CIMSymbolReference","symbol":{"type":"CIMPointSymbol","symbolLayers":[{"type":"CIMCharacterMarker","enable":true,"colorLocked":true,"anchorPointUnits":"Relative","dominantSizeAxis3D":"Y","size":7,"billboardMode3D":"FaceNearPlane","characterIndex":40,"fontFamilyName":"Arial","fontStyleName":"Regular","fontType":"Unspecified","scaleX":1,"symbol":{"type":"CIMPolygonSymbol","symbolLayers":[{"type":"CIMSolidFill","enable":true,"color":{"type":"CIMRGBColor","values":[0,0,0,100]}}]},"scaleSymbolsProportionally":true,"respectFrame":true},{"type":"CIMCharacterMarker","enable":true,"anchorPointUnits":"Relative","dominantSizeAxis3D":"Y","size":7,"billboardMode3D":"FaceNearPlane","characterIndex":33,"fontFamilyName":"Arial","fontStyleName":"Regular","fontType":"Unspecified","scaleX":1,"symbol":{"type":"CIMPolygonSymbol","symbolLayers":[{"type":"CIMSolidFill","enable":true,"color":{"type":"CIMRGBColor","values":[224,223,227,0]}}]},"scaleSymbolsProportionally":true,"respectFrame":true}],"haloSize":1,"s
caleX":1,"angleAlignment":"Map"},"symbolName":"Level_1"},"defaultSymbolPatch":"Default","fields":["CLUSTER_ID"],"groups":[{"type":"CIMUniqueValueGroup","classes":[{"type":"CIMUniqueValueClass","label":"1","patch":"Default","symbol":{"type":"CIMSymbolReference","symbol":{"type":"CIMPointSymbol","symbolLayers":[{"type":"CIMVectorMarker","enable":true,"anchorPoint":{"x":0,"y":0,"z":0},"anchorPointUnits":"Relative","dominantSizeAxis3D":"Y","size":5,"billboardMode3D":"FaceNearPlane","frame":{"xmin":0,"ymin":0,"xmax":17,"ymax":17},"markerGraphics":[{"type":"CIMMarkerGraphic","geometry":{"curveRings":[[[17,8.5],{"b":[[8.5,0],[17,3.8100000000000001],[13.19,0]]},{"b":[[0,8.5],[3.8100000000000001,0],[0,3.8100000000000001]]},{"b":[[8.5,17],[0,13.19],[3.8100000000000001,17]]},{"b":[[17,8.5],[13.19,17],[17,13.19]]}]]},"symbol":{"type":"CIMPolygonSymbol","symbolLayers":[{"type":"CIMSolidStroke","enable":true,"capStyle":"Round","joinStyle":"Round","lineStyle3D":"Strip","miterLimit":10,"width":1,"color":{"type":"CIMRGBColor","values":[0,0,0,100]}},{"type":"CIMSolidFill","enable":true,"color":{"type":"CIMRGBColor","values":[120,170,255,100]}}]}}],"scaleSymbolsProportionally":true,"respectFrame":true}],"haloSize":1,"scaleX":1,"angleAlignment":"Display"}},"values":[{"type":"CIMUniqueValue","fieldValues":["1"]}],"visible":true},{"type":"CIMUniqueValueClass","label":"2","patch":"Default","symbol":{"type":"CIMSymbolReference","symbol":{"type":"CIMPointSymbol","symbolLayers":[{"type":"CIMVectorMarker","enable":true,"anchorPoint":{"x":0,"y":0,"z":0},"anchorPointUnits":"Relative","dominantSizeAxis3D":"Y","size":5,"billboardMode3D":"FaceNearPlane","frame":{"xmin":0,"ymin":0,"xmax":17,"ymax":17},"markerGraphics":[{"type":"CIMMarkerGraphic","geometry":{"curveRings":[[[17,8.5],{"b":[[8.5,0],[17,3.8100000000000001],[13.19,0]]},{"b":[[0,8.5],[3.8100000000000001,0],[0,3.8100000000000001]]},{"b":[[8.5,17],[0,13.19],[3.8100000000000001,17]]},{"b":[[17,8.5],[13.19,17],[17,13.19]]}]]},"symbol":{"type"
:"CIMPolygonSymbol","symbolLayers":[{"type":"CIMSolidStroke","enable":true,"capStyle":"Round","joinStyle":"Round","lineStyle3D":"Strip","miterLimit":10,"width":1,"color":{"type":"CIMRGBColor","values":[0,0,0,100]}},{"type":"CIMSolidFill","enable":true,"color":{"type":"CIMRGBColor","values":[255,100,85,100]}}]}}],"scaleSymbolsProportionally":true,"respectFrame":true}],"haloSize":1,"scaleX":1,"angleAlignment":"Display"}},"values":[{"type":"CIMUniqueValue","fieldValues":["2"]}],"visible":true},{"type":"CIMUniqueValueClass","label":"3","patch":"Default","symbol":{"type":"CIMSymbolReference","symbol":{"type":"CIMPointSymbol","symbolLayers":[{"type":"CIMVectorMarker","enable":true,"anchorPoint":{"x":0,"y":0,"z":0},"anchorPointUnits":"Relative","dominantSizeAxis3D":"Y","size":5,"billboardMode3D":"FaceNearPlane","frame":{"xmin":0,"ymin":0,"xmax":17,"ymax":17},"markerGraphics":[{"type":"CIMMarkerGraphic","geometry":{"curveRings":[[[17,8.5],{"b":[[8.5,0],[17,3.8100000000000001],[13.19,0]]},{"b":[[0,8.5],[3.8100000000000001,0],[0,3.8100000000000001]]},{"b":[[8.5,17],[0,13.19],[3.8100000000000001,17]]},{"b":[[17,8.5],[13.19,17],[17,13.19]]}]]},"symbol":{"type":"CIMPolygonSymbol","symbolLayers":[{"type":"CIMSolidStroke","enable":true,"capStyle":"Round","joinStyle":"Round","lineStyle3D":"Strip","miterLimit":10,"width":1,"color":{"type":"CIMRGBColor","values":[0,0,0,100]}},{"type":"CIMSolidFill","enable":true,"color":{"type":"CIMRGBColor","values":[125,220,85,100]}}]}}],"scaleSymbolsProportionally":true,"respectFrame":true}],"haloSize":1,"scaleX":1,"angleAlignment":"Display"}},"values":[{"type":"CIMUniqueValue","fieldValues":["3"]}],"visible":true},{"type":"CIMUniqueValueClass","label":"4","patch":"Default","symbol":{"type":"CIMSymbolReference","symbol":{"type":"CIMPointSymbol","symbolLayers":[{"type":"CIMVectorMarker","enable":true,"anchorPoint":{"x":0,"y":0,"z":0},"anchorPointUnits":"Relative","dominantSizeAxis3D":"Y","size":5,"billboardMode3D":"FaceNearPlane","frame":{"xmin":0,
"ymin":0,"xmax":17,"ymax":17},"markerGraphics":[{"type":"CIMMarkerGraphic","geometry":{"curveRings":[[[17,8.5],{"b":[[8.5,0],[17,3.8100000000000001],[13.19,0]]},{"b":[[0,8.5],[3.8100000000000001,0],[0,3.8100000000000001]]},{"b":[[8.5,17],[0,13.19],[3.8100000000000001,17]]},{"b":[[17,8.5],[13.19,17],[17,13.19]]}]]},"symbol":{"type":"CIMPolygonSymbol","symbolLayers":[{"type":"CIMSolidStroke","enable":true,"capStyle":"Round","joinStyle":"Round","lineStyle3D":"Strip","miterLimit":10,"width":1,"color":{"type":"CIMRGBColor","values":[0,0,0,100]}},{"type":"CIMSolidFill","enable":true,"color":{"type":"CIMRGBColor","values":[255,180,0,100]}}]}}],"scaleSymbolsProportionally":true,"respectFrame":true}],"haloSize":1,"scaleX":1,"angleAlignment":"Display"}},"values":[{"type":"CIMUniqueValue","fieldValues":["4"]}],"visible":true},{"type":"CIMUniqueValueClass","label":"5","patch":"Default","symbol":{"type":"CIMSymbolReference","symbol":{"type":"CIMPointSymbol","symbolLayers":[{"type":"CIMVectorMarker","enable":true,"anchorPoint":{"x":0,"y":0,"z":0},"anchorPointUnits":"Relative","dominantSizeAxis3D":"Y","size":5,"billboardMode3D":"FaceNearPlane","frame":{"xmin":0,"ymin":0,"xmax":17,"ymax":17},"markerGraphics":[{"type":"CIMMarkerGraphic","geometry":{"curveRings":[[[17,8.5],{"b":[[8.5,0],[17,3.8100000000000001],[13.19,0]]},{"b":[[0,8.5],[3.8100000000000001,0],[0,3.8100000000000001]]},{"b":[[8.5,17],[0,13.19],[3.8100000000000001,17]]},{"b":[[17,8.5],[13.19,17],[17,13.19]]}]]},"symbol":{"type":"CIMPolygonSymbol","symbolLayers":[{"type":"CIMSolidStroke","enable":true,"capStyle":"Round","joinStyle":"Round","lineStyle3D":"Strip","miterLimit":10,"width":1,"color":{"type":"CIMRGBColor","values":[0,0,0,100]}},{"type":"CIMSolidFill","enable":true,"color":{"type":"CIMRGBColor","values":[200,100,225,100]}}]}}],"scaleSymbolsProportionally":true,"respectFrame":true}],"haloSize":1,"scaleX":1,"angleAlignment":"Display"}},"values":[{"type":"CIMUniqueValue","fieldValues":["5"]}],"visible":true},{"type
":"CIMUniqueValueClass","label":"6","patch":"Default","symbol":{"type":"CIMSymbolReference","symbol":{"type":"CIMPointSymbol","symbolLayers":[{"type":"CIMVectorMarker","enable":true,"anchorPoint":{"x":0,"y":0,"z":0},"anchorPointUnits":"Relative","dominantSizeAxis3D":"Y","size":5,"billboardMode3D":"FaceNearPlane","frame":{"xmin":0,"ymin":0,"xmax":17,"ymax":17},"markerGraphics":[{"type":"CIMMarkerGraphic","geometry":{"curveRings":[[[17,8.5],{"b":[[8.5,0],[17,3.8100000000000001],[13.19,0]]},{"b":[[0,8.5],[3.8100000000000001,0],[0,3.8100000000000001]]},{"b":[[8.5,17],[0,13.19],[3.8100000000000001,17]]},{"b":[[17,8.5],[13.19,17],[17,13.19]]}]]},"symbol":{"type":"CIMPolygonSymbol","symbolLayers":[{"type":"CIMSolidStroke","enable":true,"capStyle":"Round","joinStyle":"Round","lineStyle3D":"Strip","miterLimit":10,"width":1,"color":{"type":"CIMRGBColor","values":[0,0,0,100]}},{"type":"CIMSolidFill","enable":true,"color":{"type":"CIMRGBColor","values":[190,160,100,100]}}]}}],"scaleSymbolsProportionally":true,"respectFrame":true}],"haloSize":1,"scaleX":1,"angleAlignment":"Display"}},"values":[{"type":"CIMUniqueValue","fieldValues":["6"]}],"visible":true},{"type":"CIMUniqueValueClass","label":"7","patch":"Default","symbol":{"type":"CIMSymbolReference","symbol":{"type":"CIMPointSymbol","symbolLayers":[{"type":"CIMVectorMarker","enable":true,"anchorPoint":{"x":0,"y":0,"z":0},"anchorPointUnits":"Relative","dominantSizeAxis3D":"Y","size":5,"billboardMode3D":"FaceNearPlane","frame":{"xmin":0,"ymin":0,"xmax":17,"ymax":17},"markerGraphics":[{"type":"CIMMarkerGraphic","geometry":{"curveRings":[[[17,8.5],{"b":[[8.5,0],[17,3.8100000000000001],[13.19,0]]},{"b":[[0,8.5],[3.8100000000000001,0],[0,3.8100000000000001]]},{"b":[[8.5,17],[0,13.19],[3.8100000000000001,17]]},{"b":[[17,8.5],[13.19,17],[17,13.19]]}]]},"symbol":{"type":"CIMPolygonSymbol","symbolLayers":[{"type":"CIMSolidStroke","enable":true,"capStyle":"Round","joinStyle":"Round","lineStyle3D":"Strip","miterLimit":10,"width":1,"color"
:{"type":"CIMRGBColor","values":[0,0,0,100]}},{"type":"CIMSolidFill","enable":true,"color":{"type":"CIMRGBColor","values":[250,190,200,100]}}]}}],"scaleSymbolsProportionally":true,"respectFrame":true}],"haloSize":1,"scaleX":1,"angleAlignment":"Display"}},"values":[{"type":"CIMUniqueValue","fieldValues":["7"]}],"visible":true}]}],"polygonSymbolColorTarget":"Fill"},"scaleSymbols":true,"snappable":true}]}
3 |
--------------------------------------------------------------------------------
/data/sticc_points_spatial_multivariate.shx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeoDS/STICC/6848a613198e55317c744fa3df423b68513b7fab/data/sticc_points_spatial_multivariate.shx
--------------------------------------------------------------------------------
/images/GeoDSLogo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeoDS/STICC/6848a613198e55317c744fa3df423b68513b7fab/images/GeoDSLogo.jpg
--------------------------------------------------------------------------------
/images/STICC.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeoDS/STICC/6848a613198e55317c744fa3df423b68513b7fab/images/STICC.jpeg
--------------------------------------------------------------------------------
/images/clustering.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeoDS/STICC/6848a613198e55317c744fa3df423b68513b7fab/images/clustering.jpg
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | esda==2.3.1
2 | geopandas==0.8.1
3 | libpysal==4.3.0
4 | matplotlib==3.4.1
5 | networkx==2.5.1
6 | numpy==1.22.0
7 | pandas==1.4.1
8 | pyclustering==0.10.1.2
9 | scikit_learn~=1.5.0
10 | Shapely==1.7.0
11 |
--------------------------------------------------------------------------------
/src/STICC_helper.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
def getTrainTestSplit(m, num_blocks, num_stacked):
    '''
    - m: number of observations
    - num_blocks: spatial_radius + 1
    - num_stacked: spatial_radius
    Returns:
    - sorted list of training indices
    '''
    # All points are used for training (no held-out test split).
    training_percent = 1
    # Draw training indices without replacement.
    chosen = np.random.choice(
        m - num_blocks + 1,
        size=int((m - num_stacked) * training_percent),
        replace=False)
    chosen = list(chosen)
    # Guarantee the first and the last valid points are in the training set.
    for boundary in (0, m - num_stacked):
        if boundary not in chosen:
            chosen.append(boundary)
    return sorted(np.array(chosen))
27 |
28 |
def upperToFull(a, eps=0):
    """Expand a flattened upper-triangular vector into the full symmetric matrix.

    Entries of `a` with magnitude strictly below `eps` are zeroed in place first.
    """
    mask = (a < eps) & (a > -eps)
    a[mask] = 0
    # Recover n from len(a) == n*(n+1)/2.
    n = int((-1 + np.sqrt(1 + 8 * a.shape[0])) / 2)
    full = np.zeros([n, n])
    full[np.triu_indices(n)] = a
    diag = full.diagonal()
    # Mirror the upper triangle; subtract the diagonal once so it isn't doubled.
    return np.asarray(full + full.T - np.diag(diag))
38 |
39 |
def hex_to_rgb(value):
    """Return (red, green, blue) for the color given as #rrggbb.

    The leading '#' is optional. The digits are split into three equal
    groups, parsed as hex, and scaled by 1/256.0 (this module's original
    convention, kept for backward compatibility — note 0xff maps to
    255/256, not 1.0).
    """
    # BUG FIX: the docstring promises '#rrggbb', but a leading '#' made the
    # chunking misalign and int('#r', 16) raise ValueError. Strip it first.
    value = value.lstrip('#')
    lv = len(value)
    out = tuple(int(value[i:i + lv // 3], 16) for i in range(0, lv, lv // 3))
    return tuple(x / 256.0 for x in out)
46 |
47 |
def updateClusters(LLE_node_vals, switch_penalty=1, spatial_series_index=None,
                   spatial_series_closest=None, spatial_radius=1):
    """
    Takes in LLE_node_vals matrix and computes the path that minimizes
    the total cost over the path.
    Note the LLE's are negative of the true LLE's actually!!!!!

    Note: switch penalty > 0

    Parameters
    ----------
    LLE_node_vals : (T, num_clusters) array of per-point, per-cluster costs.
    switch_penalty : cost added when the assignment differs from the
        neighbor's cluster.
    spatial_series_index : index list of the spatial series (defaults to []).
    spatial_series_closest : for each point i, the index of its closest
        neighbor (defaults to []).
    spatial_radius : bounds which neighbor indices are considered valid.

    Returns
    -------
    path : (T,) float array of cluster assignments.
    """
    # FIX: avoid mutable default arguments ([] is shared across calls).
    if spatial_series_index is None:
        spatial_series_index = []
    if spatial_series_closest is None:
        spatial_series_closest = []

    (T, num_clusters) = LLE_node_vals.shape
    future_cost_vals = np.zeros(LLE_node_vals.shape)
    max_valid = len(spatial_series_index) - spatial_radius

    # Backward pass: compute future costs.
    for i in range(T - 2, -1, -1):
        j = int(spatial_series_closest[i])  # closest neighbor of point i
        if j <= max_valid:
            # Hoisted: this sum is invariant across the cluster loop.
            base = future_cost_vals[j, :] + LLE_node_vals[j, :] + switch_penalty
            for cluster in range(num_clusters):
                total_vals = base.copy()
                # No penalty for staying in the same cluster.
                total_vals[cluster] -= switch_penalty
                future_cost_vals[i, cluster] = np.min(total_vals)

    # Forward pass: compute the best path.
    path = np.zeros(T)

    # The first location.
    path[0] = np.argmin(future_cost_vals[0, :] + LLE_node_vals[0, :])

    for i in range(T - 1):
        j = int(spatial_series_closest[i])
        if j <= max_valid:
            total_vals = future_cost_vals[j, :] + LLE_node_vals[j, :] + switch_penalty
            total_vals[int(path[i])] -= switch_penalty
            path[i + 1] = np.argmin(total_vals)

    # Return the computed path.
    return path
93 |
94 |
def find_matching(confusion_matrix):
    """
    returns the perfect matching
    """
    _, n = confusion_matrix.shape
    matching = []
    for row in range(n):
        best_val = -1e10
        best_col = -1
        for col in range(n):
            # Each column may be used at most once.
            if col in matching:
                continue
            candidate = confusion_matrix[row, col]
            if candidate > best_val:
                best_val = candidate
                best_col = col
        matching.append(best_col)
    return matching
114 |
115 |
def compute_confusion_matrix(num_clusters, clustered_points_algo, sorted_indices_algo, seg_len=400):
    """
    computes a confusion matrix and returns it

    The ground-truth label of a point is derived from its original index:
    int(index / seg_len) % num_clusters, i.e. the data is assumed to be
    laid out in consecutive ground-truth segments of length seg_len.

    Parameters
    ----------
    num_clusters : number of clusters.
    clustered_points_algo : cluster label assigned to each point.
    sorted_indices_algo : original index of each point.
    seg_len : ground-truth segment length (default 400, the value that
        was previously hard-coded).
    """
    true_confusion_matrix = np.zeros([num_clusters, num_clusters])
    for point, cluster in enumerate(clustered_points_algo):
        # Ground-truth cluster implied by the point's original position.
        true_label = int(sorted_indices_algo[point] / seg_len) % num_clusters
        true_confusion_matrix[int(true_label), int(cluster)] += 1
    return true_confusion_matrix
127 |
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeoDS/STICC/6848a613198e55317c744fa3df423b68513b7fab/src/__init__.py
--------------------------------------------------------------------------------
/src/admm_solver.py:
--------------------------------------------------------------------------------
1 | import numpy
2 | import math
class ADMMSolver:
    """ADMM solver for a sparse inverse-covariance (graphical-lasso-like) problem.

    The variable is a (num_stacked*size_blocks)-square symmetric matrix X,
    stored as its flattened upper triangle. The x-step is the proximal
    operator of -logdet(X) + tr(S X); the z-step applies block-wise soft
    thresholding with penalty weights `lamb`, tying together entries that
    belong to the same sub-diagonal block.
    """

    def __init__(self, lamb, num_stacked, size_blocks, rho, S, rho_update_func=None):
        # lamb: matrix of per-entry L1 penalty weights.
        # num_stacked, size_blocks: block structure of the variable.
        # rho: ADMM penalty parameter.
        # S: empirical covariance matrix.
        # rho_update_func: optional callable
        #   (rho, res_pri, e_pri, res_dual, e_dual) -> new rho.
        self.lamb = lamb
        self.numBlocks = num_stacked
        self.sizeBlocks = size_blocks
        probSize = num_stacked * size_blocks
        # Number of entries in the flattened upper triangle.
        self.length = int(probSize * (probSize + 1) / 2)
        self.x = numpy.zeros(self.length)
        self.z = numpy.zeros(self.length)
        self.u = numpy.zeros(self.length)
        self.rho = float(rho)
        self.S = S
        self.status = 'initialized'
        self.rho_update_func = rho_update_func

    def ij2symmetric(self, i, j, size):
        """Map matrix coordinates (i, j), i <= j, to the index in the
        flattened upper triangle (returned as a float; callers cast)."""
        return (size * (size + 1))/2 - (size-i)*((size - i + 1))/2 + j - i

    def upper2Full(self, a):
        """Expand a flattened upper-triangular vector into the full symmetric matrix."""
        n = int((-1 + numpy.sqrt(1 + 8*a.shape[0]))/2)
        A = numpy.zeros([n, n])
        A[numpy.triu_indices(n)] = a
        temp = A.diagonal()
        A = (A + A.T) - numpy.diag(temp)
        return A

    def Prox_logdet(self, S, A, eta):
        """Proximal operator of the -logdet term, evaluated via the
        eigendecomposition of eta*A - S; returns the flattened upper
        triangle as a column matrix."""
        d, q = numpy.linalg.eigh(eta*A - S)
        q = numpy.matrix(q)
        X_var = (1/(2*float(eta)))*q*(numpy.diag(d + numpy.sqrt(numpy.square(d) + (4*eta)*numpy.ones(d.shape))))*q.T
        x_var = X_var[numpy.triu_indices(S.shape[1])]  # extract upper triangular part as update variable
        return numpy.matrix(x_var).T

    def ADMM_x(self):
        """x-update: proximal step on -logdet(X) + tr(S X) at z - u."""
        a = self.z - self.u
        A = self.upper2Full(a)
        eta = self.rho
        x_update = self.Prox_logdet(self.S, A, eta)
        self.x = numpy.array(x_update).T.reshape(-1)

    def ADMM_z(self, index_penalty=1):
        """z-update: block-wise soft thresholding of x + u.

        Entries in the same sub-diagonal block (offset i) are tied: their
        penalties are summed and the shrunken value is shared by the group.
        """
        a = self.x + self.u
        probSize = self.numBlocks * self.sizeBlocks
        z_update = numpy.zeros(self.length)

        # TODO: can we parallelize these?
        for i in range(self.numBlocks):
            elems = self.numBlocks if i == 0 else (2*self.numBlocks - 2*i)/2  # i=0 is diagonal
            for j in range(self.sizeBlocks):
                startPoint = j if i == 0 else 0
                for k in range(startPoint, self.sizeBlocks):
                    # All (row, col) positions tied to this block entry.
                    locList = [((l+i)*self.sizeBlocks + j, l*self.sizeBlocks + k) for l in range(int(elems))]
                    if i == 0:
                        lamSum = sum(self.lamb[loc1, loc2] for (loc1, loc2) in locList)
                        indices = [self.ij2symmetric(loc1, loc2, probSize) for (loc1, loc2) in locList]
                    else:
                        # Off-diagonal blocks index the transposed position.
                        lamSum = sum(self.lamb[loc2, loc1] for (loc1, loc2) in locList)
                        indices = [self.ij2symmetric(loc2, loc1, probSize) for (loc1, loc2) in locList]
                    pointSum = sum(a[int(index)] for index in indices)
                    rhoPointSum = self.rho * pointSum

                    # Calculate soft threshold (shared across the group).
                    ans = 0
                    # If answer is positive
                    if rhoPointSum > lamSum:
                        ans = max((rhoPointSum - lamSum)/(self.rho*elems), 0)
                    elif rhoPointSum < -1*lamSum:
                        ans = min((rhoPointSum + lamSum)/(self.rho*elems), 0)

                    for index in indices:
                        z_update[int(index)] = ans
        self.z = z_update

    def ADMM_u(self):
        """Dual update: u <- u + x - z."""
        self.u = self.u + self.x - self.z

    # Returns True if convergence criteria have been satisfied
    # eps_abs = eps_rel = 0.01
    # r = x - z
    # s = rho * (z - z_old)
    # e_pri = sqrt(length) * e_abs + e_rel * max(||x||, ||z||)
    # e_dual = sqrt(length) * e_abs + e_rel * ||rho * u||
    # Should stop if (||r|| <= e_pri) and (||s|| <= e_dual)
    # Returns (boolean shouldStop, primal residual value, primal threshold,
    # dual residual value, dual threshold)
    def CheckConvergence(self, z_old, e_abs, e_rel, verbose):
        norm = numpy.linalg.norm
        r = self.x - self.z
        s = self.rho * (self.z - z_old)
        # Primal and dual thresholds. Add .0001 to prevent the case of 0.
        e_pri = math.sqrt(self.length) * e_abs + e_rel * max(norm(self.x), norm(self.z)) + .0001
        e_dual = math.sqrt(self.length) * e_abs + e_rel * norm(self.rho * self.u) + .0001
        # Primal and dual residuals
        res_pri = norm(r)
        res_dual = norm(s)
        if verbose:
            # Debugging information to print(convergence criteria values)
            print('  r:', res_pri)
            print('  e_pri:', e_pri)
            print('  s:', res_dual)
            print('  e_dual:', e_dual)
        stop = (res_pri <= e_pri) and (res_dual <= e_dual)
        return (stop, res_pri, e_pri, res_dual, e_dual)

    # solve
    def __call__(self, maxIters, eps_abs, eps_rel, verbose):
        """Run ADMM for up to maxIters iterations.

        Returns the flattened upper-triangular solution vector x; sets
        self.status to 'Optimal' on convergence.
        """
        self.status = 'Incomplete: max iterations reached'
        for i in range(maxIters):
            z_old = numpy.copy(self.z)
            self.ADMM_x()
            self.ADMM_z()
            self.ADMM_u()
            if i != 0:
                stop, res_pri, e_pri, res_dual, e_dual = self.CheckConvergence(z_old, eps_abs, eps_rel, verbose)
                if stop:
                    self.status = 'Optimal'
                    break
                new_rho = self.rho
                if self.rho_update_func:
                    # BUG FIX: was `rho_update_func(...)` — a NameError
                    # whenever a callback was actually supplied.
                    new_rho = self.rho_update_func(self.rho, res_pri, e_pri, res_dual, e_dual)
                scale = self.rho / new_rho
                # BUG FIX: was `rho = new_rho`, which bound a local and
                # never updated the solver's penalty parameter.
                self.rho = new_rho
                self.u = scale * self.u
            if verbose:
                # Debugging information prints current iteration #
                print('Iteration %d' % i)
        return self.x
--------------------------------------------------------------------------------