├── .DS_Store ├── transformation.pdf ├── README.md ├── LICENSE ├── Transforming Data to Unlock Its Latent Value.ipynb └── .ipynb_checkpoints └── Transforming Data to Unlock Its Latent Value-checkpoint.ipynb /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ojedatony1616/exploratory_transformation/HEAD/.DS_Store -------------------------------------------------------------------------------- /transformation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ojedatony1616/exploratory_transformation/HEAD/transformation.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # exploratory_transformation 2 | Repository for exploratory data transformation & visualization talk 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Transforming Data to Unlock Its Latent Value.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Transforming Data to Unlock Its Latent Value" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import os\n", 19 | "import zipfile\n", 20 | "import requests\n", 21 | "import pandas as pd\n", 22 | "import numpy as np\n", 23 | "\n", 24 | "import warnings\n", 25 | "warnings.filterwarnings('ignore')\n", 26 | "\n", 27 | "import matplotlib.pyplot as plt\n", 28 | "import seaborn as sns\n", 29 | "\n", 30 | "path = 'data'\n", 31 | "\n", 32 | "%matplotlib inline" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "## Download the Data" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": { 46 | "collapsed": true 47 | }, 48 
| "outputs": [], 49 | "source": [ 50 | "def download_data(url, name, path='data'):\n", 51 | " if not os.path.exists(path):\n", 52 | " os.mkdir(path)\n", 53 | "\n", 54 | " response = requests.get(url)\n", 55 | " with open(os.path.join(path, name), 'wb') as f:\n", 56 | " f.write(response.content)\n", 57 | " \n", 58 | " z = zipfile.ZipFile(os.path.join(path, 'vehicles.zip'))\n", 59 | " z.extractall(path)\n", 60 | "\n", 61 | "VEHICLES = 'http://www.fueleconomy.gov/feg/epadata/vehicles.csv.zip'\n", 62 | "\n", 63 | "download_data(VEHICLES, 'vehicles.zip')" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": { 70 | "collapsed": true 71 | }, 72 | "outputs": [], 73 | "source": [ 74 | "vehicles = pd.read_csv(os.path.join(path, 'vehicles.csv'))" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "## Clean and Reorganize the Data" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": { 88 | "collapsed": false 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "select_columns = ['make', 'model', 'year', 'displ', 'cylinders', 'trany', 'drive', 'VClass','fuelType', \n", 93 | " 'barrels08', 'city08', 'highway08', 'comb08', 'co2TailpipeGpm', 'fuelCost08']\n", 94 | "\n", 95 | "vehicles = vehicles[select_columns][vehicles.year <= 2016].drop_duplicates().dropna()\n", 96 | "vehicles = vehicles.sort_values(['make', 'model', 'year'])\n", 97 | "vehicles.head()" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "## Create Category Aggregations\n", 105 | "\n", 106 | "Hint: Look for object fields that have many categories. 
" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": { 113 | "collapsed": false 114 | }, 115 | "outputs": [], 116 | "source": [ 117 | "def unique_col_values(df):\n", 118 | " for column in df:\n", 119 | " print(str(df[column].name) + \" | \" + str(len(df[column].unique())) + \" | \" + str(df[column].dtype))\n", 120 | "\n", 121 | "unique_col_values(vehicles)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": { 128 | "collapsed": true 129 | }, 130 | "outputs": [], 131 | "source": [ 132 | "#Create new trantype field that specifies whether the vehicle is Automatic or Manual. \n", 133 | "vehicles.loc[vehicles.trany.str[0] == 'A', 'trantype'] = 'Automatic'\n", 134 | "vehicles.loc[vehicles.trany.str[0] == 'M', 'trantype'] = 'Manual'\n", 135 | "\n", 136 | "#Create new model_type field that parses the model type from the model field. \n", 137 | "vehicles['model_type'] = vehicles.make + \" \" + vehicles.model.str.split().str.get(0)\n", 138 | "\n", 139 | "#Create new category field that rolls up VClass into more general categories. 
\n", 140 | "small = ['Compact Cars','Subcompact Cars','Two Seaters','Minicompact Cars']\n", 141 | "midsize = ['Midsize Cars']\n", 142 | "large = ['Large Cars']\n", 143 | "\n", 144 | "vehicles.loc[vehicles.VClass.isin(small), 'category'] = 'Small Cars'\n", 145 | "vehicles.loc[vehicles.VClass.isin(midsize), 'category'] = 'Midsize Cars'\n", 146 | "vehicles.loc[vehicles.VClass.isin(large), 'category'] = 'Large Cars'\n", 147 | "vehicles.loc[vehicles.VClass.str.contains('Station'), 'category'] = 'Station Wagons'\n", 148 | "vehicles.loc[vehicles.VClass.str.contains('Pickup'), 'category'] = 'Pickup Trucks'\n", 149 | "vehicles.loc[vehicles.VClass.str.contains('Special Purpose'), 'category'] = 'Special Purpose'\n", 150 | "vehicles.loc[vehicles.VClass.str.contains('Sport Utility'), 'category'] = 'Sport Utility'\n", 151 | "vehicles.loc[(vehicles.VClass.str.contains('van')) | (vehicles.VClass.str.contains('van')),\n", 152 | " 'category'] = 'Vans & Minivans'\n", 153 | "\n", 154 | "#Create new fuel_category field that rolls up fuelType into more general categories. 
\n", 155 | "vehicles['fuel_category'] = ''\n", 156 | "gas = ['Regular', 'Premium', 'Midgrade']\n", 157 | "vehicles.loc[vehicles.fuelType.isin(gas), 'fuel_category'] = 'Gasoline'\n", 158 | "vehicles.loc[vehicles.fuelType == 'Diesel', 'fuel_category'] = 'Diesel'\n", 159 | "vehicles.loc[vehicles.fuel_category == '', 'fuel_category'] = 'Alternative/Hybrid'" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "## Create Categorical Fields from Continuous" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": { 173 | "collapsed": true 174 | }, 175 | "outputs": [], 176 | "source": [ 177 | "engine_categories = ['Very Small Engine', 'Small Engine','Moderate Engine', \n", 178 | " 'Large Engine', 'Very Large Engine']\n", 179 | "vehicles['engine_size'] = pd.qcut(vehicles.displ, 5, engine_categories)\n", 180 | "\n", 181 | "efficiency_categories = ['Very Low Efficiency', 'Low Efficiency', 'Moderate Efficiency',\n", 182 | " 'High Efficiency', 'Very High Efficiency']\n", 183 | "vehicles['fuel_efficiency'] = pd.qcut(vehicles.comb08, 5, efficiency_categories)\n", 184 | "\n", 185 | "emmission_categories = ['Very Low Emmissions', 'Low Emmissions', 'Moderate Emmissions',\n", 186 | " 'High Emmissions', 'Very High Emmissions']\n", 187 | "vehicles['emmission'] = pd.qcut(vehicles.co2TailpipeGpm, 5, emmission_categories)\n", 188 | "\n", 189 | "fuelcost_categories = ['Very Low Fuel Cost', 'Low Fuel Cost', 'Moderate Fuel Cost',\n", 190 | " 'High Fuel Cost', 'Very High Fuel Cost']\n", 191 | "vehicles['fuel_cost'] = pd.qcut(vehicles.fuelCost08, 5, fuelcost_categories)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "## Cluster to Create Additional Categories" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": { 205 | "collapsed": true 206 | }, 207 | "outputs": [], 208 | "source": [ 209 | 
"vehicles_numeric = vehicles._get_numeric_data()\n", 210 | "del vehicles_numeric['year']\n", 211 | "\n", 212 | "vehicles_numeric_norm = vehicles_numeric.apply(lambda x: (x / x.max()))" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": { 219 | "collapsed": false 220 | }, 221 | "outputs": [], 222 | "source": [ 223 | "from sklearn.cluster import KMeans\n", 224 | "\n", 225 | "model = KMeans(n_clusters=8)\n", 226 | "clusters = model.fit_predict(vehicles_numeric_norm)\n", 227 | "vehicles_numeric_norm['cluster'] = clusters\n", 228 | "\n", 229 | "cluster_means = vehicles_numeric_norm.groupby(['cluster'], as_index=False).mean()\n", 230 | "cluster_columns = ['displ','cylinders','barrels08','city08','highway08','comb08','co2TailpipeGpm','fuelCost08']\n", 231 | "\n", 232 | "fig, ax = plt.subplots(figsize=(20,10))\n", 233 | "sns.heatmap(cluster_means[cluster_columns], annot=True)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": { 240 | "collapsed": false 241 | }, 242 | "outputs": [], 243 | "source": [ 244 | "model = KMeans(n_clusters=4)\n", 245 | "clusters = model.fit_predict(vehicles_numeric_norm)\n", 246 | "vehicles_numeric_norm['cluster'] = clusters\n", 247 | "\n", 248 | "cluster_means = vehicles_numeric_norm.groupby(['cluster'], as_index=False).mean()\n", 249 | "cluster_columns = ['displ','cylinders','barrels08','city08','highway08','comb08','co2TailpipeGpm','fuelCost08']\n", 250 | "\n", 251 | "fig, ax = plt.subplots(figsize=(20,10))\n", 252 | "sns.heatmap(cluster_means[cluster_columns], annot=True)" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": { 259 | "collapsed": true 260 | }, 261 | "outputs": [], 262 | "source": [ 263 | "vehicles['cluster'] = clusters\n", 264 | "vehicles['cluster'][vehicles['cluster']==0] = 'Small Very Efficient'\n", 265 | "vehicles['cluster'][vehicles['cluster']==1] = 'Large Inefficient'\n", 266 
| "vehicles['cluster'][vehicles['cluster']==2] = 'Midsized Balanced'\n", 267 | "vehicles['cluster'][vehicles['cluster']==3] = 'Small Moderately Efficient'" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "## Aggregate and Filter" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "metadata": { 281 | "collapsed": true 282 | }, 283 | "outputs": [], 284 | "source": [ 285 | "def barchart(df, group_field, calc_field, calc, length, width):\n", 286 | " grouped = pd.DataFrame(zip_agg.groupby(group_field).agg({calc_field: {calc_field: calc}}).to_records())\n", 287 | " grouped.columns = [group_field, calc_field]\n", 288 | " grouped = grouped.sort_values(calc_field, ascending=False)\n", 289 | "\n", 290 | " fig = plt.subplots(figsize=(width,length))\n", 291 | " ax = sns.barplot(x=calc_field, y=group_field, data=grouped)\n", 292 | " ax.set(xlabel=calc + '(' + calc_field + ')', ylabel=group_field )\n", 293 | " plt.show()" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": { 300 | "collapsed": false 301 | }, 302 | "outputs": [], 303 | "source": [ 304 | "barchart(vehicles, 2016, 'category','count', 6,8)" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": { 311 | "collapsed": false 312 | }, 313 | "outputs": [], 314 | "source": [ 315 | "count_barchart(vehicles, 1985, 'category', 'count', 6,8)" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": { 322 | "collapsed": false 323 | }, 324 | "outputs": [], 325 | "source": [ 326 | "count_barchart(vehicles, 2016, 'engine_size', 'count', 6, 8)" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": null, 332 | "metadata": { 333 | "collapsed": false 334 | }, 335 | "outputs": [], 336 | "source": [ 337 | "count_barchart(vehicles, 2016, 'fuel_efficiency', 'count',6, 8)" 338 | ] 339 | }, 
count_barchart(vehicles, 2016, 'cluster', 'count',6,8)

count_barchart(vehicles, 2016, 'make', 'count',12, 12)

# ## More Details with Pivoting

def pivot_heatmap(df, year, rows, columns, values, width, length):
    """Annotated heatmap of record counts for `rows` x `columns`.

    Filters `df` to a single model year, pivots it with a size (count)
    aggregation over `values`, drops all-empty rows, and renders the
    result as a seaborn heatmap.
    """
    single_year = df[df.year == year]
    counts = single_year.pivot_table(values=values, index=rows, columns=columns,
                                     aggfunc=np.size)
    counts = counts.dropna(axis=0, how='all')

    fig = plt.subplots(figsize=(width, length))
    ax = sns.heatmap(counts, annot=True, fmt='g')
    plt.show()

pivot_heatmap(vehicles, 2016, 'fuel_efficiency','engine_size','comb08',15, 8)

pivot_heatmap(vehicles, 1985, 'fuel_efficiency','engine_size','comb08',15, 8)

pivot_heatmap(vehicles, 2016, 'cluster','category', 'comb08', 15, 10)
"cell_type": "code", 422 | "execution_count": null, 423 | "metadata": { 424 | "collapsed": false 425 | }, 426 | "outputs": [], 427 | "source": [ 428 | "pivot_heatmap(vehicles, 2016, ['engine_size', 'fuel_efficiency'],'category', 'comb08', 15, 15)" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": null, 434 | "metadata": { 435 | "collapsed": false 436 | }, 437 | "outputs": [], 438 | "source": [ 439 | "pivot_heatmap(vehicles, 2016, 'make','category', 'comb08', 10, 10)" 440 | ] 441 | }, 442 | { 443 | "cell_type": "markdown", 444 | "metadata": {}, 445 | "source": [ 446 | "## Exploring Aggregations Over Time" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": null, 452 | "metadata": { 453 | "collapsed": true 454 | }, 455 | "outputs": [], 456 | "source": [ 457 | "def multi_line(df, x, y):\n", 458 | " ax = df.groupby([x, y]).size().unstack(y).plot(figsize=(15,8), cmap=\"Set2\")" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": null, 464 | "metadata": { 465 | "collapsed": false 466 | }, 467 | "outputs": [], 468 | "source": [ 469 | "multi_line(vehicles, 'year', 'category')" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": null, 475 | "metadata": { 476 | "collapsed": false 477 | }, 478 | "outputs": [], 479 | "source": [ 480 | "bmw = vehicles[vehicles.make == 'BMW']\n", 481 | "multi_line(bmw, 'year', 'category')" 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": null, 487 | "metadata": { 488 | "collapsed": false 489 | }, 490 | "outputs": [], 491 | "source": [ 492 | "toyota = vehicles[vehicles.make == 'Toyota']\n", 493 | "multi_line(toyota, 'year', 'category')" 494 | ] 495 | }, 496 | { 497 | "cell_type": "markdown", 498 | "metadata": {}, 499 | "source": [ 500 | "## Exploring Field Relationships" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": null, 506 | "metadata": { 507 | "collapsed": false 508 | }, 509 | 
"outputs": [], 510 | "source": [ 511 | "def scatter_matrix(df, labels=None):\n", 512 | " ax = sns.pairplot(df, hue=labels, diag_kind='kde', size=2)\n", 513 | " plt.show()\n", 514 | "\n", 515 | "scatter_matrix(vehicles_numeric_norm)" 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": null, 521 | "metadata": { 522 | "collapsed": false 523 | }, 524 | "outputs": [], 525 | "source": [ 526 | "scatter_matrix(vehicles_numeric_norm, labels=\"cluster\")" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": null, 532 | "metadata": { 533 | "collapsed": false 534 | }, 535 | "outputs": [], 536 | "source": [ 537 | "vehicles_numeric_norm['Cluster'] = vehicles['cluster']\n", 538 | "sns.lmplot('displ', 'comb08', data=vehicles_numeric_norm, hue='Cluster', size=8, fit_reg=False)" 539 | ] 540 | }, 541 | { 542 | "cell_type": "code", 543 | "execution_count": null, 544 | "metadata": { 545 | "collapsed": false 546 | }, 547 | "outputs": [], 548 | "source": [ 549 | "sns.lmplot('displ', 'fuelCost08', data=vehicles_numeric_norm, hue='Cluster', size=8, fit_reg=False)" 550 | ] 551 | }, 552 | { 553 | "cell_type": "markdown", 554 | "metadata": {}, 555 | "source": [ 556 | "## Exploring Entity Relationships (Graph Analysis)" 557 | ] 558 | }, 559 | { 560 | "cell_type": "code", 561 | "execution_count": null, 562 | "metadata": { 563 | "collapsed": true 564 | }, 565 | "outputs": [], 566 | "source": [ 567 | "entity = 'make'\n", 568 | "year = 2016\n", 569 | "\n", 570 | "vehicles_year = vehicles[vehicles.year==year]\n", 571 | "\n", 572 | "graph_year = pd.DataFrame(vehicles_year.groupby([entity,'cylinders','displ','trantype','drive',\n", 573 | " 'comb08','VClass', 'cluster'], \n", 574 | " as_index=False).size()).reset_index()\n", 575 | "\n", 576 | "graph_year = graph_year.rename(columns={0: 'count'})\n", 577 | "graph_year['edge'] = (graph_year['cylinders'].map(str)\n", 578 | " + graph_year['displ'].map(str)\n", 579 | " + graph_year['trantype']\n", 580 | " + 
graph_year['drive']\n", 581 | " + graph_year['comb08'].map(str)\n", 582 | " + graph_year['VClass']\n", 583 | " + graph_year['cluster']\n", 584 | " )\n", 585 | "\n", 586 | "graph_year = graph_year[[entity, 'edge', 'count']]" 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": null, 592 | "metadata": { 593 | "collapsed": true 594 | }, 595 | "outputs": [], 596 | "source": [ 597 | "def df_to_graph(df, entity, edge):\n", 598 | " df2 = df.copy()\n", 599 | " graph_df = pd.merge(df, df2, how='inner', on=edge)\n", 600 | " graph_df = graph_df.groupby([entity + '_x', entity + '_y']).count().reset_index()\n", 601 | " graph_df = graph_df[graph_df[entity + '_x'] != graph_df[entity + '_y']]\n", 602 | " graph_df = graph_df[[entity + '_x', entity + '_y', edge]]\n", 603 | " return graph_df" 604 | ] 605 | }, 606 | { 607 | "cell_type": "code", 608 | "execution_count": null, 609 | "metadata": { 610 | "collapsed": false 611 | }, 612 | "outputs": [], 613 | "source": [ 614 | "vehicle_make_graph = df_to_graph(graph_year, entity, 'edge')\n", 615 | "vehicle_make_graph.head(10)" 616 | ] 617 | }, 618 | { 619 | "cell_type": "code", 620 | "execution_count": null, 621 | "metadata": { 622 | "collapsed": false 623 | }, 624 | "outputs": [], 625 | "source": [ 626 | "import networkx as nx\n", 627 | "import graph_tool.all as gt\n", 628 | "import graph_tool as gt\n", 629 | "from graph_tool import *\n", 630 | "\n", 631 | "G = nx.from_pandas_dataframe(vehicle_make_graph, entity + '_x', entity + '_y', 'edge')" 632 | ] 633 | }, 634 | { 635 | "cell_type": "markdown", 636 | "metadata": {}, 637 | "source": [ 638 | "[Converting NetworkX to Graph-Tool](http://bbengfort.github.io/snippets/2016/06/23/graph-tool-from-networkx.html) by Benjamin Bengfort (converts NetworkX graphs to much prettier Graph-Tool graphs). 
" 639 | ] 640 | }, 641 | { 642 | "cell_type": "code", 643 | "execution_count": null, 644 | "metadata": { 645 | "collapsed": true 646 | }, 647 | "outputs": [], 648 | "source": [ 649 | "def get_prop_type(value, key=None):\n", 650 | " \"\"\"\n", 651 | " Performs typing and value conversion for the graph_tool PropertyMap class.\n", 652 | " If a key is provided, it also ensures the key is in a format that can be\n", 653 | " used with the PropertyMap. Returns a tuple, (type name, value, key)\n", 654 | " \"\"\"\n", 655 | " if isinstance(key, unicode):\n", 656 | " # Encode the key as ASCII\n", 657 | " key = key.encode('ascii', errors='replace')\n", 658 | "\n", 659 | " # Deal with the value\n", 660 | " if isinstance(value, bool):\n", 661 | " tname = 'bool'\n", 662 | "\n", 663 | " elif isinstance(value, int):\n", 664 | " tname = 'float'\n", 665 | " value = float(value)\n", 666 | "\n", 667 | " elif isinstance(value, float):\n", 668 | " tname = 'float'\n", 669 | "\n", 670 | " elif isinstance(value, unicode):\n", 671 | " tname = 'string'\n", 672 | " value = value.encode('ascii', errors='replace')\n", 673 | "\n", 674 | " elif isinstance(value, dict):\n", 675 | " tname = 'object'\n", 676 | "\n", 677 | " else:\n", 678 | " tname = 'string'\n", 679 | " value = str(value)\n", 680 | "\n", 681 | " return tname, value, key\n", 682 | "\n", 683 | "\n", 684 | "def nx2gt(nxG):\n", 685 | " \"\"\"\n", 686 | " Converts a networkx graph to a graph-tool graph.\n", 687 | " \"\"\"\n", 688 | " # Phase 0: Create a directed or undirected graph-tool Graph\n", 689 | " gtG = gt.Graph(directed=nxG.is_directed())\n", 690 | "\n", 691 | " # Add the Graph properties as \"internal properties\"\n", 692 | " for key, value in nxG.graph.items():\n", 693 | " # Convert the value and key into a type for graph-tool\n", 694 | " tname, value, key = get_prop_type(value, key)\n", 695 | "\n", 696 | " prop = gtG.new_graph_property(tname) # Create the PropertyMap\n", 697 | " gtG.graph_properties[key] = prop # Set the 
PropertyMap\n", 698 | " gtG.graph_properties[key] = value # Set the actual value\n", 699 | "\n", 700 | " # Phase 1: Add the vertex and edge property maps\n", 701 | " # Go through all nodes and edges and add seen properties\n", 702 | " # Add the node properties first\n", 703 | " nprops = set() # cache keys to only add properties once\n", 704 | " for node, data in nxG.nodes_iter(data=True):\n", 705 | "\n", 706 | " # Go through all the properties if not seen and add them.\n", 707 | " for key, val in data.items():\n", 708 | " if key in nprops: continue # Skip properties already added\n", 709 | "\n", 710 | " # Convert the value and key into a type for graph-tool\n", 711 | " tname, _, key = get_prop_type(val, key)\n", 712 | "\n", 713 | " prop = gtG.new_vertex_property(tname) # Create the PropertyMap\n", 714 | " gtG.vertex_properties[key] = prop # Set the PropertyMap\n", 715 | "\n", 716 | " # Add the key to the already seen properties\n", 717 | " nprops.add(key)\n", 718 | "\n", 719 | " # Also add the node id: in NetworkX a node can be any hashable type, but\n", 720 | " # in graph-tool node are defined as indices. 
So we capture any strings\n", 721 | " # in a special PropertyMap called 'id' -- modify as needed!\n", 722 | " gtG.vertex_properties['id'] = gtG.new_vertex_property('string')\n", 723 | "\n", 724 | " # Add the edge properties second\n", 725 | " eprops = set() # cache keys to only add properties once\n", 726 | " for src, dst, data in nxG.edges_iter(data=True):\n", 727 | "\n", 728 | " # Go through all the edge properties if not seen and add them.\n", 729 | " for key, val in data.items():\n", 730 | " if key in eprops: continue # Skip properties already added\n", 731 | "\n", 732 | " # Convert the value and key into a type for graph-tool\n", 733 | " tname, _, key = get_prop_type(val, key)\n", 734 | "\n", 735 | " prop = gtG.new_edge_property(tname) # Create the PropertyMap\n", 736 | " gtG.edge_properties[key] = prop # Set the PropertyMap\n", 737 | "\n", 738 | " # Add the key to the already seen properties\n", 739 | " eprops.add(key)\n", 740 | "\n", 741 | " # Phase 2: Actually add all the nodes and vertices with their properties\n", 742 | " # Add the nodes\n", 743 | " vertices = {} # vertex mapping for tracking edges later\n", 744 | " for node, data in nxG.nodes_iter(data=True):\n", 745 | "\n", 746 | " # Create the vertex and annotate for our edges later\n", 747 | " v = gtG.add_vertex()\n", 748 | " vertices[node] = v\n", 749 | "\n", 750 | " # Set the vertex properties, not forgetting the id property\n", 751 | " data['id'] = str(node)\n", 752 | " for key, value in data.items():\n", 753 | " gtG.vp[key][v] = value # vp is short for vertex_properties\n", 754 | "\n", 755 | " # Add the edges\n", 756 | " for src, dst, data in nxG.edges_iter(data=True):\n", 757 | "\n", 758 | " # Look up the vertex structs from our vertices mapping and add edge.\n", 759 | " e = gtG.add_edge(vertices[src], vertices[dst])\n", 760 | "\n", 761 | " # Add the edge properties\n", 762 | " for key, value in data.items():\n", 763 | " gtG.ep[key][e] = value # ep is short for edge_properties\n", 764 | "\n", 765 
| " # Done, finally!\n", 766 | " return gtG\n", 767 | "\n", 768 | "\n", 769 | "if __name__ == '__main__':\n", 770 | "\n", 771 | " # Create the networkx graph\n", 772 | " nxG = nx.Graph(name=\"Undirected Graph\")\n", 773 | " nxG.add_node(\"v1\", name=\"alpha\", color=\"red\")\n", 774 | " nxG.add_node(\"v2\", name=\"bravo\", color=\"blue\")\n", 775 | " nxG.add_node(\"v3\", name=\"charlie\", color=\"blue\")\n", 776 | " nxG.add_node(\"v4\", name=\"hub\", color=\"purple\")\n", 777 | " nxG.add_node(\"v5\", name=\"delta\", color=\"red\")\n", 778 | " nxG.add_node(\"v6\", name=\"echo\", color=\"red\")\n", 779 | "\n", 780 | " nxG.add_edge(\"v1\", \"v2\", weight=0.5, label=\"follows\")\n", 781 | " nxG.add_edge(\"v1\", \"v3\", weight=0.25, label=\"follows\")\n", 782 | " nxG.add_edge(\"v2\", \"v4\", weight=0.05, label=\"follows\")\n", 783 | " nxG.add_edge(\"v3\", \"v4\", weight=0.35, label=\"follows\")\n", 784 | " nxG.add_edge(\"v5\", \"v4\", weight=0.65, label=\"follows\")\n", 785 | " nxG.add_edge(\"v6\", \"v4\", weight=0.53, label=\"follows\")\n", 786 | " nxG.add_edge(\"v5\", \"v6\", weight=0.21, label=\"follows\")\n", 787 | "\n", 788 | " for item in nxG.edges_iter(data=True):\n", 789 | " print(item)\n", 790 | "\n", 791 | " # Convert to graph-tool graph\n", 792 | " gtG = nx2gt(nxG)\n", 793 | " gtG.list_properties()" 794 | ] 795 | }, 796 | { 797 | "cell_type": "code", 798 | "execution_count": null, 799 | "metadata": { 800 | "collapsed": true 801 | }, 802 | "outputs": [], 803 | "source": [ 804 | "def plot_graph(graph, width, length):\n", 805 | " g = nx2gt(graph)\n", 806 | " vlabel = g.vp['id']\n", 807 | " gt.graph_draw(g, output_size=(width,length), vertex_text=vlabel, vertex_font_weight=0.2, \n", 808 | " vertex_size=5, vertex_fill_color='cyan')\n", 809 | "\n", 810 | "plot_graph(G, 1200, 800)" 811 | ] 812 | }, 813 | { 814 | "cell_type": "code", 815 | "execution_count": null, 816 | "metadata": { 817 | "collapsed": true 818 | }, 819 | "outputs": [], 820 | "source": [ 821 | "ego = 
nx.ego_graph(G, 'Nissan', 1)\n", 822 | "plot_graph(ego, 500, 500)" 823 | ] 824 | }, 825 | { 826 | "cell_type": "code", 827 | "execution_count": null, 828 | "metadata": { 829 | "collapsed": true 830 | }, 831 | "outputs": [], 832 | "source": [ 833 | "import community\n", 834 | "\n", 835 | "def detect_communities(graph):\n", 836 | " partition = community.best_partition(graph)\n", 837 | " nx.set_node_attributes(graph, 'partition', partition)\n", 838 | " return graph, partition\n", 839 | "\n", 840 | "make_communities = pd.DataFrame(detect_communities(G)[1].items(), \n", 841 | " columns=['make', 'community']).sort_values('community', ascending=True)\n", 842 | "\n", 843 | "make_communities.head()" 844 | ] 845 | }, 846 | { 847 | "cell_type": "code", 848 | "execution_count": null, 849 | "metadata": { 850 | "collapsed": true 851 | }, 852 | "outputs": [], 853 | "source": [ 854 | "import random\n", 855 | "from copy import copy\n", 856 | "\n", 857 | "##########################################################################\n", 858 | "## Color Palettes\n", 859 | "##########################################################################\n", 860 | "\n", 861 | "FLATUI = [\"#9b59b6\", \"#3498db\", \"#95a5a6\", \"#e74c3c\", \"#34495e\", \"#2ecc71\"]\n", 862 | "PAIRED = [\n", 863 | " \"#a6cee3\", \"#1f78b4\", \"#b2df8a\", \"#33a02c\", \"#fb9a99\", \"#e31a1c\",\n", 864 | " \"#fdbf6f\", \"#ff7f00\", \"#cab2d6\", \"#6a3d9a\", \"#ffff99\", \"#b15928\",\n", 865 | "]\n", 866 | "SET1 = [\n", 867 | " \"#e41a1c\", \"#377eb8\", \"#4daf4a\",\n", 868 | " \"#984ea3\", \"#ff7f00\", \"#ffff33\",\n", 869 | " \"#a65628\", \"#f781bf\", \"#999999\"\n", 870 | "]\n", 871 | "\n", 872 | "PALETTES = {\n", 873 | " 'flatui': FLATUI,\n", 874 | " 'paired': PAIRED,\n", 875 | " 'set1': SET1,\n", 876 | "}\n", 877 | "\n", 878 | "##########################################################################\n", 879 | "## Color Utilities\n", 880 | 
"##########################################################################\n", 881 | "\n", 882 | "class ColorMap(object):\n", 883 | " \"\"\"\n", 884 | " A helper for mapping categorical values to colors on demand.\n", 885 | " \"\"\"\n", 886 | "\n", 887 | " def __init__(self, colors='flatui', shuffle=False):\n", 888 | " \"\"\"\n", 889 | " Specify either a list of colors or one of the color names. If shuffle\n", 890 | " is True then the colors will be shuffled randomly.\n", 891 | " \"\"\"\n", 892 | " self.mapping = {}\n", 893 | " self.colors = colors\n", 894 | "\n", 895 | " if shuffle:\n", 896 | " random.shuffle(self._colors)\n", 897 | "\n", 898 | " @property\n", 899 | " def colors(self):\n", 900 | " return self._colors\n", 901 | "\n", 902 | " @colors.setter\n", 903 | " def colors(self, value):\n", 904 | " \"\"\"\n", 905 | " Converts color strings into a color listing.\n", 906 | " \"\"\"\n", 907 | " if isinstance(value, basestring):\n", 908 | " if value not in PALETTES:\n", 909 | " raise ValueError(\"'{}' is not a registered color palette\")\n", 910 | " self._colors = copy(PALETTES[value])\n", 911 | " elif isinstance(value, list):\n", 912 | " self._colors = value\n", 913 | " else:\n", 914 | " self._colors = list(value)\n", 915 | "\n", 916 | " def __call__(self, category):\n", 917 | " if category not in self.mapping:\n", 918 | " if self.colors:\n", 919 | " self.mapping[category] = self.colors.pop()\n", 920 | " else:\n", 921 | " raise ValueError(\n", 922 | " \"Not enough colors for this many categories!\"\n", 923 | " )\n", 924 | "\n", 925 | " return self.mapping[category]" 926 | ] 927 | }, 928 | { 929 | "cell_type": "code", 930 | "execution_count": null, 931 | "metadata": { 932 | "collapsed": true 933 | }, 934 | "outputs": [], 935 | "source": [ 936 | "def plot_community_graph(graph, community_df, width, length):\n", 937 | " g = nx2gt(G)\n", 938 | " vlabel = g.vp['id']\n", 939 | " vcolor = g.new_vertex_property('string') \n", 940 | " vcmap = ColorMap('flatui', 
shuffle=False)\n", 941 | " for vertex in g.vertices():\n", 942 | " vcolor[vertex] = vcmap(community_df.community[vertex])\n", 943 | " gt.graph_draw(g, output_size=(width,length), vertex_text=vlabel, vertex_font_weight=0.2, \n", 944 | " vertex_size=5, vertex_fill_color=vcolor)\n", 945 | "\n", 946 | "plot_community_graph(G, make_communities, 1200, 800)" 947 | ] 948 | }, 949 | { 950 | "cell_type": "markdown", 951 | "metadata": {}, 952 | "source": [ 953 | "## Exploring Connections Over Time" 954 | ] 955 | }, 956 | { 957 | "cell_type": "code", 958 | "execution_count": null, 959 | "metadata": { 960 | "collapsed": true 961 | }, 962 | "outputs": [], 963 | "source": [ 964 | "columns = ['make_x','make_y', 'edge','year']\n", 965 | "graph_all_years = pd.DataFrame(columns=columns)" 966 | ] 967 | }, 968 | { 969 | "cell_type": "code", 970 | "execution_count": null, 971 | "metadata": { 972 | "collapsed": true 973 | }, 974 | "outputs": [], 975 | "source": [ 976 | "for i in vehicles['year'].unique():\n", 977 | " vehicles_year = vehicles[vehicles.year==i]\n", 978 | "\n", 979 | " graph_year = pd.DataFrame(vehicles_year.groupby([entity,'cylinders','displ','trantype','drive',\n", 980 | " 'comb08','VClass', 'cluster'], \n", 981 | " as_index=False).size()).reset_index()\n", 982 | "\n", 983 | " graph_year = graph_year.rename(columns={0: 'count'})\n", 984 | " graph_year['edge'] = (graph_year['cylinders'].map(str)\n", 985 | " + graph_year['displ'].map(str)\n", 986 | " + graph_year['trantype']\n", 987 | " + graph_year['drive']\n", 988 | " + graph_year['comb08'].map(str)\n", 989 | " + graph_year['VClass']\n", 990 | " + graph_year['cluster']\n", 991 | " )\n", 992 | "\n", 993 | " graph_year = graph_year[[entity, 'edge', 'count']]\n", 994 | " vehicle_make_graph = df_to_graph(graph_year, entity, 'edge')\n", 995 | " vehicle_make_graph['year'] = i\n", 996 | " graph_all_years = graph_all_years.append(vehicle_make_graph)" 997 | ] 998 | }, 999 | { 1000 | "cell_type": "code", 1001 | "execution_count": 
null, 1002 | "metadata": { 1003 | "collapsed": false 1004 | }, 1005 | "outputs": [], 1006 | "source": [ 1007 | "graph_summary = graph_all_years.groupby(['make_x', 'year'], \n", 1008 | " as_index=False).sum()\n", 1009 | "\n", 1010 | "graph_summary.head()" 1011 | ] 1012 | }, 1013 | { 1014 | "cell_type": "code", 1015 | "execution_count": null, 1016 | "metadata": { 1017 | "collapsed": false 1018 | }, 1019 | "outputs": [], 1020 | "source": [ 1021 | "def graph_multi_line(df, x, y):\n", 1022 | " ax = df.groupby([x, y]).sum().unstack(y).plot(figsize=(15,8), cmap=\"jet\")\n", 1023 | " ax.legend(loc='center', bbox_to_anchor=(0.5, -0.35),\n", 1024 | " ncol=5, fancybox=True, shadow=True, labels=df[y].unique())\n", 1025 | "\n", 1026 | "graph_multi_line(graph_summary, 'year', 'make_x')" 1027 | ] 1028 | }, 1029 | { 1030 | "cell_type": "code", 1031 | "execution_count": null, 1032 | "metadata": { 1033 | "collapsed": false 1034 | }, 1035 | "outputs": [], 1036 | "source": [ 1037 | "makes = ['Chevrolet', 'Ford', 'Toyota', 'Honda', 'Nissan']\n", 1038 | "\n", 1039 | "def graph_multi_line_makes(df, x, y):\n", 1040 | " ax = df.groupby([x, y]).sum().unstack(y).plot(figsize=(15,8), cmap=\"jet\")\n", 1041 | " ax.legend(loc='center', bbox_to_anchor=(0.5, -0.15),\n", 1042 | " ncol=5, fancybox=True, shadow=True, labels=df[y].unique())\n", 1043 | "\n", 1044 | "graph_summary_makes = graph_summary[graph_summary.make_x.isin(makes)]\n", 1045 | "graph_multi_line_makes(graph_summary_makes, 'year', 'make_x')" 1046 | ] 1047 | } 1048 | ], 1049 | "metadata": { 1050 | "anaconda-cloud": {}, 1051 | "kernelspec": { 1052 | "display_name": "Python [py27]", 1053 | "language": "python", 1054 | "name": "Python [py27]" 1055 | }, 1056 | "language_info": { 1057 | "codemirror_mode": { 1058 | "name": "ipython", 1059 | "version": 2 1060 | }, 1061 | "file_extension": ".py", 1062 | "mimetype": "text/x-python", 1063 | "name": "python", 1064 | "nbconvert_exporter": "python", 1065 | "pygments_lexer": "ipython2", 1066 | 
"version": "2.7.12" 1067 | } 1068 | }, 1069 | "nbformat": 4, 1070 | "nbformat_minor": 0 1071 | } 1072 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/Transforming Data to Unlock Its Latent Value-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Transforming Data to Unlock Its Latent Value" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import os\n", 19 | "import zipfile\n", 20 | "import requests\n", 21 | "import pandas as pd\n", 22 | "import numpy as np\n", 23 | "\n", 24 | "import warnings\n", 25 | "warnings.filterwarnings('ignore')\n", 26 | "\n", 27 | "import matplotlib.pyplot as plt\n", 28 | "import seaborn as sns\n", 29 | "\n", 30 | "path = 'data'\n", 31 | "\n", 32 | "%matplotlib inline" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "## Download the Data" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": { 46 | "collapsed": true 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "def download_data(url, name, path='data'):\n", 51 | " if not os.path.exists(path):\n", 52 | " os.mkdir(path)\n", 53 | "\n", 54 | " response = requests.get(url)\n", 55 | " with open(os.path.join(path, name), 'wb') as f:\n", 56 | " f.write(response.content)\n", 57 | " \n", 58 | " z = zipfile.ZipFile(os.path.join(path, 'vehicles.zip'))\n", 59 | " z.extractall(path)\n", 60 | "\n", 61 | "VEHICLES = 'http://www.fueleconomy.gov/feg/epadata/vehicles.csv.zip'\n", 62 | "\n", 63 | "download_data(VEHICLES, 'vehicles.zip')" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": { 70 | "collapsed": true 71 | }, 72 | "outputs": [], 73 | "source": [ 74 
| "vehicles = pd.read_csv(os.path.join(path, 'vehicles.csv'))" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "## Clean and Reorganize the Data" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": { 88 | "collapsed": false 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "select_columns = ['make', 'model', 'year', 'displ', 'cylinders', 'trany', 'drive', 'VClass','fuelType', \n", 93 | " 'barrels08', 'city08', 'highway08', 'comb08', 'co2TailpipeGpm', 'fuelCost08']\n", 94 | "\n", 95 | "vehicles = vehicles[select_columns][vehicles.year <= 2016].drop_duplicates().dropna()\n", 96 | "vehicles = vehicles.sort_values(['make', 'model', 'year'])\n", 97 | "vehicles.head()" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "## Create Category Aggregations\n", 105 | "\n", 106 | "Hint: Look for object fields that have many categories. " 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": { 113 | "collapsed": false 114 | }, 115 | "outputs": [], 116 | "source": [ 117 | "def unique_col_values(df):\n", 118 | " for column in df:\n", 119 | " print(str(df[column].name) + \" | \" + str(len(df[column].unique())) + \" | \" + str(df[column].dtype))\n", 120 | "\n", 121 | "unique_col_values(vehicles)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": { 128 | "collapsed": true 129 | }, 130 | "outputs": [], 131 | "source": [ 132 | "#Create new trantype field that specifies whether the vehicle is Automatic or Manual. \n", 133 | "vehicles.loc[vehicles.trany.str[0] == 'A', 'trantype'] = 'Automatic'\n", 134 | "vehicles.loc[vehicles.trany.str[0] == 'M', 'trantype'] = 'Manual'\n", 135 | "\n", 136 | "#Create new model_type field that parses the model type from the model field. 
\n", 137 | "vehicles['model_type'] = vehicles.make + \" \" + vehicles.model.str.split().str.get(0)\n", 138 | "\n", 139 | "#Create new category field that rolls up VClass into more general categories. \n", 140 | "small = ['Compact Cars','Subcompact Cars','Two Seaters','Minicompact Cars']\n", 141 | "midsize = ['Midsize Cars']\n", 142 | "large = ['Large Cars']\n", 143 | "\n", 144 | "vehicles.loc[vehicles.VClass.isin(small), 'category'] = 'Small Cars'\n", 145 | "vehicles.loc[vehicles.VClass.isin(midsize), 'category'] = 'Midsize Cars'\n", 146 | "vehicles.loc[vehicles.VClass.isin(large), 'category'] = 'Large Cars'\n", 147 | "vehicles.loc[vehicles.VClass.str.contains('Station'), 'category'] = 'Station Wagons'\n", 148 | "vehicles.loc[vehicles.VClass.str.contains('Pickup'), 'category'] = 'Pickup Trucks'\n", 149 | "vehicles.loc[vehicles.VClass.str.contains('Special Purpose'), 'category'] = 'Special Purpose'\n", 150 | "vehicles.loc[vehicles.VClass.str.contains('Sport Utility'), 'category'] = 'Sport Utility'\n", 151 | "vehicles.loc[(vehicles.VClass.str.contains('van')) | (vehicles.VClass.str.contains('van')),\n", 152 | " 'category'] = 'Vans & Minivans'\n", 153 | "\n", 154 | "#Create new fuel_category field that rolls up fuelType into more general categories. 
\n", 155 | "vehicles['fuel_category'] = ''\n", 156 | "gas = ['Regular', 'Premium', 'Midgrade']\n", 157 | "vehicles.loc[vehicles.fuelType.isin(gas), 'fuel_category'] = 'Gasoline'\n", 158 | "vehicles.loc[vehicles.fuelType == 'Diesel', 'fuel_category'] = 'Diesel'\n", 159 | "vehicles.loc[vehicles.fuel_category == '', 'fuel_category'] = 'Alternative/Hybrid'" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "## Create Categorical Fields from Continuous" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": { 173 | "collapsed": true 174 | }, 175 | "outputs": [], 176 | "source": [ 177 | "engine_categories = ['Very Small Engine', 'Small Engine','Moderate Engine', \n", 178 | " 'Large Engine', 'Very Large Engine']\n", 179 | "vehicles['engine_size'] = pd.qcut(vehicles.displ, 5, engine_categories)\n", 180 | "\n", 181 | "efficiency_categories = ['Very Low Efficiency', 'Low Efficiency', 'Moderate Efficiency',\n", 182 | " 'High Efficiency', 'Very High Efficiency']\n", 183 | "vehicles['fuel_efficiency'] = pd.qcut(vehicles.comb08, 5, efficiency_categories)\n", 184 | "\n", 185 | "emmission_categories = ['Very Low Emmissions', 'Low Emmissions', 'Moderate Emmissions',\n", 186 | " 'High Emmissions', 'Very High Emmissions']\n", 187 | "vehicles['emmission'] = pd.qcut(vehicles.co2TailpipeGpm, 5, emmission_categories)\n", 188 | "\n", 189 | "fuelcost_categories = ['Very Low Fuel Cost', 'Low Fuel Cost', 'Moderate Fuel Cost',\n", 190 | " 'High Fuel Cost', 'Very High Fuel Cost']\n", 191 | "vehicles['fuel_cost'] = pd.qcut(vehicles.fuelCost08, 5, fuelcost_categories)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "## Cluster to Create Additional Categories" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": { 205 | "collapsed": true 206 | }, 207 | "outputs": [], 208 | "source": [ 209 | 
"vehicles_numeric = vehicles._get_numeric_data()\n", 210 | "del vehicles_numeric['year']\n", 211 | "\n", 212 | "vehicles_numeric_norm = vehicles_numeric.apply(lambda x: (x / x.max()))" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": { 219 | "collapsed": false 220 | }, 221 | "outputs": [], 222 | "source": [ 223 | "from sklearn.cluster import KMeans\n", 224 | "\n", 225 | "model = KMeans(n_clusters=8)\n", 226 | "clusters = model.fit_predict(vehicles_numeric_norm)\n", 227 | "vehicles_numeric_norm['cluster'] = clusters\n", 228 | "\n", 229 | "cluster_means = vehicles_numeric_norm.groupby(['cluster'], as_index=False).mean()\n", 230 | "cluster_columns = ['displ','cylinders','barrels08','city08','highway08','comb08','co2TailpipeGpm','fuelCost08']\n", 231 | "\n", 232 | "fig, ax = plt.subplots(figsize=(20,10))\n", 233 | "sns.heatmap(cluster_means[cluster_columns], annot=True)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": { 240 | "collapsed": false 241 | }, 242 | "outputs": [], 243 | "source": [ 244 | "model = KMeans(n_clusters=4)\n", 245 | "clusters = model.fit_predict(vehicles_numeric_norm)\n", 246 | "vehicles_numeric_norm['cluster'] = clusters\n", 247 | "\n", 248 | "cluster_means = vehicles_numeric_norm.groupby(['cluster'], as_index=False).mean()\n", 249 | "cluster_columns = ['displ','cylinders','barrels08','city08','highway08','comb08','co2TailpipeGpm','fuelCost08']\n", 250 | "\n", 251 | "fig, ax = plt.subplots(figsize=(20,10))\n", 252 | "sns.heatmap(cluster_means[cluster_columns], annot=True)" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": { 259 | "collapsed": true 260 | }, 261 | "outputs": [], 262 | "source": [ 263 | "vehicles['cluster'] = clusters\n", 264 | "vehicles['cluster'][vehicles['cluster']==0] = 'Small Very Efficient'\n", 265 | "vehicles['cluster'][vehicles['cluster']==1] = 'Large Inefficient'\n", 266 
| "vehicles['cluster'][vehicles['cluster']==2] = 'Midsized Balanced'\n", 267 | "vehicles['cluster'][vehicles['cluster']==3] = 'Small Moderately Efficient'" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "## Aggregate and Filter" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "metadata": { 281 | "collapsed": true 282 | }, 283 | "outputs": [], 284 | "source": [ 285 | "def barchart(df, group_field, calc_field, calc, length, width):\n", 286 | " grouped = pd.DataFrame(zip_agg.groupby(group_field).agg({calc_field: {calc_field: calc}}).to_records())\n", 287 | " grouped.columns = [group_field, calc_field]\n", 288 | " grouped = grouped.sort_values(calc_field, ascending=False)\n", 289 | "\n", 290 | " fig = plt.subplots(figsize=(width,length))\n", 291 | " ax = sns.barplot(x=calc_field, y=group_field, data=grouped)\n", 292 | " ax.set(xlabel=calc + '(' + calc_field + ')', ylabel=group_field )\n", 293 | " plt.show()" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": { 300 | "collapsed": false 301 | }, 302 | "outputs": [], 303 | "source": [ 304 | "barchart(vehicles, 2016, 'category','count', 6,8)" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": { 311 | "collapsed": false 312 | }, 313 | "outputs": [], 314 | "source": [ 315 | "count_barchart(vehicles, 1985, 'category', 'count', 6,8)" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": { 322 | "collapsed": false 323 | }, 324 | "outputs": [], 325 | "source": [ 326 | "count_barchart(vehicles, 2016, 'engine_size', 'count', 6, 8)" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": null, 332 | "metadata": { 333 | "collapsed": false 334 | }, 335 | "outputs": [], 336 | "source": [ 337 | "count_barchart(vehicles, 2016, 'fuel_efficiency', 'count',6, 8)" 338 | ] 339 | }, 
340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "metadata": { 344 | "collapsed": false 345 | }, 346 | "outputs": [], 347 | "source": [ 348 | "count_barchart(vehicles, 2016, 'cluster', 'count',6,8)" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": null, 354 | "metadata": { 355 | "collapsed": false 356 | }, 357 | "outputs": [], 358 | "source": [ 359 | "count_barchart(vehicles, 2016, 'make', 'count',12, 12)" 360 | ] 361 | }, 362 | { 363 | "cell_type": "markdown", 364 | "metadata": {}, 365 | "source": [ 366 | "## More Details with Pivoting" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": null, 372 | "metadata": { 373 | "collapsed": true 374 | }, 375 | "outputs": [], 376 | "source": [ 377 | "def pivot_heatmap(df, year, rows, columns, values, width, length):\n", 378 | " df_year = df[df.year == year]\n", 379 | " df_pivot = df_year.pivot_table(values=values, index=rows, columns=columns, \n", 380 | " aggfunc=np.size).dropna(axis=0, how='all')\n", 381 | " \n", 382 | " fig = plt.subplots(figsize=(width,length))\n", 383 | " ax = sns.heatmap(df_pivot, annot=True, fmt='g')\n", 384 | " plt.show()" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": null, 390 | "metadata": { 391 | "collapsed": false 392 | }, 393 | "outputs": [], 394 | "source": [ 395 | "pivot_heatmap(vehicles, 2016, 'fuel_efficiency','engine_size','comb08',15, 8)" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": null, 401 | "metadata": { 402 | "collapsed": false 403 | }, 404 | "outputs": [], 405 | "source": [ 406 | "pivot_heatmap(vehicles, 1985, 'fuel_efficiency','engine_size','comb08',15, 8)" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": null, 412 | "metadata": { 413 | "collapsed": false 414 | }, 415 | "outputs": [], 416 | "source": [ 417 | "pivot_heatmap(vehicles, 2016, 'cluster','category', 'comb08', 15, 10)" 418 | ] 419 | }, 420 | { 421 | 
"cell_type": "code", 422 | "execution_count": null, 423 | "metadata": { 424 | "collapsed": false 425 | }, 426 | "outputs": [], 427 | "source": [ 428 | "pivot_heatmap(vehicles, 2016, ['engine_size', 'fuel_efficiency'],'category', 'comb08', 15, 15)" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": null, 434 | "metadata": { 435 | "collapsed": false 436 | }, 437 | "outputs": [], 438 | "source": [ 439 | "pivot_heatmap(vehicles, 2016, 'make','category', 'comb08', 10, 10)" 440 | ] 441 | }, 442 | { 443 | "cell_type": "markdown", 444 | "metadata": {}, 445 | "source": [ 446 | "## Exploring Aggregations Over Time" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": null, 452 | "metadata": { 453 | "collapsed": true 454 | }, 455 | "outputs": [], 456 | "source": [ 457 | "def multi_line(df, x, y):\n", 458 | " ax = df.groupby([x, y]).size().unstack(y).plot(figsize=(15,8), cmap=\"Set2\")" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": null, 464 | "metadata": { 465 | "collapsed": false 466 | }, 467 | "outputs": [], 468 | "source": [ 469 | "multi_line(vehicles, 'year', 'category')" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": null, 475 | "metadata": { 476 | "collapsed": false 477 | }, 478 | "outputs": [], 479 | "source": [ 480 | "bmw = vehicles[vehicles.make == 'BMW']\n", 481 | "multi_line(bmw, 'year', 'category')" 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": null, 487 | "metadata": { 488 | "collapsed": false 489 | }, 490 | "outputs": [], 491 | "source": [ 492 | "toyota = vehicles[vehicles.make == 'Toyota']\n", 493 | "multi_line(toyota, 'year', 'category')" 494 | ] 495 | }, 496 | { 497 | "cell_type": "markdown", 498 | "metadata": {}, 499 | "source": [ 500 | "## Exploring Field Relationships" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": null, 506 | "metadata": { 507 | "collapsed": false 508 | }, 509 | 
"outputs": [], 510 | "source": [ 511 | "def scatter_matrix(df, labels=None):\n", 512 | " ax = sns.pairplot(df, hue=labels, diag_kind='kde', size=2)\n", 513 | " plt.show()\n", 514 | "\n", 515 | "scatter_matrix(vehicles_numeric_norm)" 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": null, 521 | "metadata": { 522 | "collapsed": false 523 | }, 524 | "outputs": [], 525 | "source": [ 526 | "scatter_matrix(vehicles_numeric_norm, labels=\"cluster\")" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": null, 532 | "metadata": { 533 | "collapsed": false 534 | }, 535 | "outputs": [], 536 | "source": [ 537 | "vehicles_numeric_norm['Cluster'] = vehicles['cluster']\n", 538 | "sns.lmplot('displ', 'comb08', data=vehicles_numeric_norm, hue='Cluster', size=8, fit_reg=False)" 539 | ] 540 | }, 541 | { 542 | "cell_type": "code", 543 | "execution_count": null, 544 | "metadata": { 545 | "collapsed": false 546 | }, 547 | "outputs": [], 548 | "source": [ 549 | "sns.lmplot('displ', 'fuelCost08', data=vehicles_numeric_norm, hue='Cluster', size=8, fit_reg=False)" 550 | ] 551 | }, 552 | { 553 | "cell_type": "markdown", 554 | "metadata": {}, 555 | "source": [ 556 | "## Exploring Entity Relationships (Graph Analysis)" 557 | ] 558 | }, 559 | { 560 | "cell_type": "code", 561 | "execution_count": null, 562 | "metadata": { 563 | "collapsed": true 564 | }, 565 | "outputs": [], 566 | "source": [ 567 | "entity = 'make'\n", 568 | "year = 2016\n", 569 | "\n", 570 | "vehicles_year = vehicles[vehicles.year==year]\n", 571 | "\n", 572 | "graph_year = pd.DataFrame(vehicles_year.groupby([entity,'cylinders','displ','trantype','drive',\n", 573 | " 'comb08','VClass', 'cluster'], \n", 574 | " as_index=False).size()).reset_index()\n", 575 | "\n", 576 | "graph_year = graph_year.rename(columns={0: 'count'})\n", 577 | "graph_year['edge'] = (graph_year['cylinders'].map(str)\n", 578 | " + graph_year['displ'].map(str)\n", 579 | " + graph_year['trantype']\n", 580 | " + 
graph_year['drive']\n", 581 | " + graph_year['comb08'].map(str)\n", 582 | " + graph_year['VClass']\n", 583 | " + graph_year['cluster']\n", 584 | " )\n", 585 | "\n", 586 | "graph_year = graph_year[[entity, 'edge', 'count']]" 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": null, 592 | "metadata": { 593 | "collapsed": true 594 | }, 595 | "outputs": [], 596 | "source": [ 597 | "def df_to_graph(df, entity, edge):\n", 598 | " df2 = df.copy()\n", 599 | " graph_df = pd.merge(df, df2, how='inner', on=edge)\n", 600 | " graph_df = graph_df.groupby([entity + '_x', entity + '_y']).count().reset_index()\n", 601 | " graph_df = graph_df[graph_df[entity + '_x'] != graph_df[entity + '_y']]\n", 602 | " graph_df = graph_df[[entity + '_x', entity + '_y', edge]]\n", 603 | " return graph_df" 604 | ] 605 | }, 606 | { 607 | "cell_type": "code", 608 | "execution_count": null, 609 | "metadata": { 610 | "collapsed": false 611 | }, 612 | "outputs": [], 613 | "source": [ 614 | "vehicle_make_graph = df_to_graph(graph_year, entity, 'edge')\n", 615 | "vehicle_make_graph.head(10)" 616 | ] 617 | }, 618 | { 619 | "cell_type": "code", 620 | "execution_count": null, 621 | "metadata": { 622 | "collapsed": false 623 | }, 624 | "outputs": [], 625 | "source": [ 626 | "import networkx as nx\n", 627 | "import graph_tool.all as gt\n", 628 | "import graph_tool as gt\n", 629 | "from graph_tool import *\n", 630 | "\n", 631 | "G = nx.from_pandas_dataframe(vehicle_make_graph, entity + '_x', entity + '_y', 'edge')" 632 | ] 633 | }, 634 | { 635 | "cell_type": "markdown", 636 | "metadata": {}, 637 | "source": [ 638 | "[Converting NetworkX to Graph-Tool](http://bbengfort.github.io/snippets/2016/06/23/graph-tool-from-networkx.html) by Benjamin Bengfort (converts NetworkX graphs to much prettier Graph-Tool graphs). 
" 639 | ] 640 | }, 641 | { 642 | "cell_type": "code", 643 | "execution_count": null, 644 | "metadata": { 645 | "collapsed": true 646 | }, 647 | "outputs": [], 648 | "source": [ 649 | "def get_prop_type(value, key=None):\n", 650 | " \"\"\"\n", 651 | " Performs typing and value conversion for the graph_tool PropertyMap class.\n", 652 | " If a key is provided, it also ensures the key is in a format that can be\n", 653 | " used with the PropertyMap. Returns a tuple, (type name, value, key)\n", 654 | " \"\"\"\n", 655 | " if isinstance(key, unicode):\n", 656 | " # Encode the key as ASCII\n", 657 | " key = key.encode('ascii', errors='replace')\n", 658 | "\n", 659 | " # Deal with the value\n", 660 | " if isinstance(value, bool):\n", 661 | " tname = 'bool'\n", 662 | "\n", 663 | " elif isinstance(value, int):\n", 664 | " tname = 'float'\n", 665 | " value = float(value)\n", 666 | "\n", 667 | " elif isinstance(value, float):\n", 668 | " tname = 'float'\n", 669 | "\n", 670 | " elif isinstance(value, unicode):\n", 671 | " tname = 'string'\n", 672 | " value = value.encode('ascii', errors='replace')\n", 673 | "\n", 674 | " elif isinstance(value, dict):\n", 675 | " tname = 'object'\n", 676 | "\n", 677 | " else:\n", 678 | " tname = 'string'\n", 679 | " value = str(value)\n", 680 | "\n", 681 | " return tname, value, key\n", 682 | "\n", 683 | "\n", 684 | "def nx2gt(nxG):\n", 685 | " \"\"\"\n", 686 | " Converts a networkx graph to a graph-tool graph.\n", 687 | " \"\"\"\n", 688 | " # Phase 0: Create a directed or undirected graph-tool Graph\n", 689 | " gtG = gt.Graph(directed=nxG.is_directed())\n", 690 | "\n", 691 | " # Add the Graph properties as \"internal properties\"\n", 692 | " for key, value in nxG.graph.items():\n", 693 | " # Convert the value and key into a type for graph-tool\n", 694 | " tname, value, key = get_prop_type(value, key)\n", 695 | "\n", 696 | " prop = gtG.new_graph_property(tname) # Create the PropertyMap\n", 697 | " gtG.graph_properties[key] = prop # Set the 
PropertyMap\n", 698 | " gtG.graph_properties[key] = value # Set the actual value\n", 699 | "\n", 700 | " # Phase 1: Add the vertex and edge property maps\n", 701 | " # Go through all nodes and edges and add seen properties\n", 702 | " # Add the node properties first\n", 703 | " nprops = set() # cache keys to only add properties once\n", 704 | " for node, data in nxG.nodes_iter(data=True):\n", 705 | "\n", 706 | " # Go through all the properties if not seen and add them.\n", 707 | " for key, val in data.items():\n", 708 | " if key in nprops: continue # Skip properties already added\n", 709 | "\n", 710 | " # Convert the value and key into a type for graph-tool\n", 711 | " tname, _, key = get_prop_type(val, key)\n", 712 | "\n", 713 | " prop = gtG.new_vertex_property(tname) # Create the PropertyMap\n", 714 | " gtG.vertex_properties[key] = prop # Set the PropertyMap\n", 715 | "\n", 716 | " # Add the key to the already seen properties\n", 717 | " nprops.add(key)\n", 718 | "\n", 719 | " # Also add the node id: in NetworkX a node can be any hashable type, but\n", 720 | " # in graph-tool node are defined as indices. 
So we capture any strings\n", 721 | " # in a special PropertyMap called 'id' -- modify as needed!\n", 722 | " gtG.vertex_properties['id'] = gtG.new_vertex_property('string')\n", 723 | "\n", 724 | " # Add the edge properties second\n", 725 | " eprops = set() # cache keys to only add properties once\n", 726 | " for src, dst, data in nxG.edges_iter(data=True):\n", 727 | "\n", 728 | " # Go through all the edge properties if not seen and add them.\n", 729 | " for key, val in data.items():\n", 730 | " if key in eprops: continue # Skip properties already added\n", 731 | "\n", 732 | " # Convert the value and key into a type for graph-tool\n", 733 | " tname, _, key = get_prop_type(val, key)\n", 734 | "\n", 735 | " prop = gtG.new_edge_property(tname) # Create the PropertyMap\n", 736 | " gtG.edge_properties[key] = prop # Set the PropertyMap\n", 737 | "\n", 738 | " # Add the key to the already seen properties\n", 739 | " eprops.add(key)\n", 740 | "\n", 741 | " # Phase 2: Actually add all the nodes and vertices with their properties\n", 742 | " # Add the nodes\n", 743 | " vertices = {} # vertex mapping for tracking edges later\n", 744 | " for node, data in nxG.nodes_iter(data=True):\n", 745 | "\n", 746 | " # Create the vertex and annotate for our edges later\n", 747 | " v = gtG.add_vertex()\n", 748 | " vertices[node] = v\n", 749 | "\n", 750 | " # Set the vertex properties, not forgetting the id property\n", 751 | " data['id'] = str(node)\n", 752 | " for key, value in data.items():\n", 753 | " gtG.vp[key][v] = value # vp is short for vertex_properties\n", 754 | "\n", 755 | " # Add the edges\n", 756 | " for src, dst, data in nxG.edges_iter(data=True):\n", 757 | "\n", 758 | " # Look up the vertex structs from our vertices mapping and add edge.\n", 759 | " e = gtG.add_edge(vertices[src], vertices[dst])\n", 760 | "\n", 761 | " # Add the edge properties\n", 762 | " for key, value in data.items():\n", 763 | " gtG.ep[key][e] = value # ep is short for edge_properties\n", 764 | "\n", 765 
| " # Done, finally!\n", 766 | " return gtG\n", 767 | "\n", 768 | "\n", 769 | "if __name__ == '__main__':\n", 770 | "\n", 771 | " # Create the networkx graph\n", 772 | " nxG = nx.Graph(name=\"Undirected Graph\")\n", 773 | " nxG.add_node(\"v1\", name=\"alpha\", color=\"red\")\n", 774 | " nxG.add_node(\"v2\", name=\"bravo\", color=\"blue\")\n", 775 | " nxG.add_node(\"v3\", name=\"charlie\", color=\"blue\")\n", 776 | " nxG.add_node(\"v4\", name=\"hub\", color=\"purple\")\n", 777 | " nxG.add_node(\"v5\", name=\"delta\", color=\"red\")\n", 778 | " nxG.add_node(\"v6\", name=\"echo\", color=\"red\")\n", 779 | "\n", 780 | " nxG.add_edge(\"v1\", \"v2\", weight=0.5, label=\"follows\")\n", 781 | " nxG.add_edge(\"v1\", \"v3\", weight=0.25, label=\"follows\")\n", 782 | " nxG.add_edge(\"v2\", \"v4\", weight=0.05, label=\"follows\")\n", 783 | " nxG.add_edge(\"v3\", \"v4\", weight=0.35, label=\"follows\")\n", 784 | " nxG.add_edge(\"v5\", \"v4\", weight=0.65, label=\"follows\")\n", 785 | " nxG.add_edge(\"v6\", \"v4\", weight=0.53, label=\"follows\")\n", 786 | " nxG.add_edge(\"v5\", \"v6\", weight=0.21, label=\"follows\")\n", 787 | "\n", 788 | " for item in nxG.edges_iter(data=True):\n", 789 | " print(item)\n", 790 | "\n", 791 | " # Convert to graph-tool graph\n", 792 | " gtG = nx2gt(nxG)\n", 793 | " gtG.list_properties()" 794 | ] 795 | }, 796 | { 797 | "cell_type": "code", 798 | "execution_count": null, 799 | "metadata": { 800 | "collapsed": true 801 | }, 802 | "outputs": [], 803 | "source": [ 804 | "def plot_graph(graph, width, length):\n", 805 | " g = nx2gt(graph)\n", 806 | " vlabel = g.vp['id']\n", 807 | " gt.graph_draw(g, output_size=(width,length), vertex_text=vlabel, vertex_font_weight=0.2, \n", 808 | " vertex_size=5, vertex_fill_color='cyan')\n", 809 | "\n", 810 | "plot_graph(G, 1200, 800)" 811 | ] 812 | }, 813 | { 814 | "cell_type": "code", 815 | "execution_count": null, 816 | "metadata": { 817 | "collapsed": true 818 | }, 819 | "outputs": [], 820 | "source": [ 821 | "ego = 
nx.ego_graph(G, 'Nissan', 1)\n", 822 | "plot_graph(ego, 500, 500)" 823 | ] 824 | }, 825 | { 826 | "cell_type": "code", 827 | "execution_count": null, 828 | "metadata": { 829 | "collapsed": true 830 | }, 831 | "outputs": [], 832 | "source": [ 833 | "import community\n", 834 | "\n", 835 | "def detect_communities(graph):\n", 836 | " partition = community.best_partition(graph)\n", 837 | " nx.set_node_attributes(graph, 'partition', partition)\n", 838 | " return graph, partition\n", 839 | "\n", 840 | "make_communities = pd.DataFrame(detect_communities(G)[1].items(), \n", 841 | " columns=['make', 'community']).sort_values('community', ascending=True)\n", 842 | "\n", 843 | "make_communities.head()" 844 | ] 845 | }, 846 | { 847 | "cell_type": "code", 848 | "execution_count": null, 849 | "metadata": { 850 | "collapsed": true 851 | }, 852 | "outputs": [], 853 | "source": [ 854 | "import random\n", 855 | "from copy import copy\n", 856 | "\n", 857 | "##########################################################################\n", 858 | "## Color Palettes\n", 859 | "##########################################################################\n", 860 | "\n", 861 | "FLATUI = [\"#9b59b6\", \"#3498db\", \"#95a5a6\", \"#e74c3c\", \"#34495e\", \"#2ecc71\"]\n", 862 | "PAIRED = [\n", 863 | " \"#a6cee3\", \"#1f78b4\", \"#b2df8a\", \"#33a02c\", \"#fb9a99\", \"#e31a1c\",\n", 864 | " \"#fdbf6f\", \"#ff7f00\", \"#cab2d6\", \"#6a3d9a\", \"#ffff99\", \"#b15928\",\n", 865 | "]\n", 866 | "SET1 = [\n", 867 | " \"#e41a1c\", \"#377eb8\", \"#4daf4a\",\n", 868 | " \"#984ea3\", \"#ff7f00\", \"#ffff33\",\n", 869 | " \"#a65628\", \"#f781bf\", \"#999999\"\n", 870 | "]\n", 871 | "\n", 872 | "PALETTES = {\n", 873 | " 'flatui': FLATUI,\n", 874 | " 'paired': PAIRED,\n", 875 | " 'set1': SET1,\n", 876 | "}\n", 877 | "\n", 878 | "##########################################################################\n", 879 | "## Color Utilities\n", 880 | 
"##########################################################################\n", 881 | "\n", 882 | "class ColorMap(object):\n", 883 | " \"\"\"\n", 884 | " A helper for mapping categorical values to colors on demand.\n", 885 | " \"\"\"\n", 886 | "\n", 887 | " def __init__(self, colors='flatui', shuffle=False):\n", 888 | " \"\"\"\n", 889 | " Specify either a list of colors or one of the color names. If shuffle\n", 890 | " is True then the colors will be shuffled randomly.\n", 891 | " \"\"\"\n", 892 | " self.mapping = {}\n", 893 | " self.colors = colors\n", 894 | "\n", 895 | " if shuffle:\n", 896 | " random.shuffle(self._colors)\n", 897 | "\n", 898 | " @property\n", 899 | " def colors(self):\n", 900 | " return self._colors\n", 901 | "\n", 902 | " @colors.setter\n", 903 | " def colors(self, value):\n", 904 | " \"\"\"\n", 905 | " Converts color strings into a color listing.\n", 906 | " \"\"\"\n", 907 | " if isinstance(value, basestring):\n", 908 | " if value not in PALETTES:\n", 909 | " raise ValueError(\"'{}' is not a registered color palette\")\n", 910 | " self._colors = copy(PALETTES[value])\n", 911 | " elif isinstance(value, list):\n", 912 | " self._colors = value\n", 913 | " else:\n", 914 | " self._colors = list(value)\n", 915 | "\n", 916 | " def __call__(self, category):\n", 917 | " if category not in self.mapping:\n", 918 | " if self.colors:\n", 919 | " self.mapping[category] = self.colors.pop()\n", 920 | " else:\n", 921 | " raise ValueError(\n", 922 | " \"Not enough colors for this many categories!\"\n", 923 | " )\n", 924 | "\n", 925 | " return self.mapping[category]" 926 | ] 927 | }, 928 | { 929 | "cell_type": "code", 930 | "execution_count": null, 931 | "metadata": { 932 | "collapsed": true 933 | }, 934 | "outputs": [], 935 | "source": [ 936 | "def plot_community_graph(graph, community_df, width, length):\n", 937 | " g = nx2gt(G)\n", 938 | " vlabel = g.vp['id']\n", 939 | " vcolor = g.new_vertex_property('string') \n", 940 | " vcmap = ColorMap('flatui', 
shuffle=False)\n", 941 | " for vertex in g.vertices():\n", 942 | " vcolor[vertex] = vcmap(community_df.community[vertex])\n", 943 | " gt.graph_draw(g, output_size=(width,length), vertex_text=vlabel, vertex_font_weight=0.2, \n", 944 | " vertex_size=5, vertex_fill_color=vcolor)\n", 945 | "\n", 946 | "plot_community_graph(G, make_communities, 1200, 800)" 947 | ] 948 | }, 949 | { 950 | "cell_type": "markdown", 951 | "metadata": {}, 952 | "source": [ 953 | "## Exploring Connections Over Time" 954 | ] 955 | }, 956 | { 957 | "cell_type": "code", 958 | "execution_count": null, 959 | "metadata": { 960 | "collapsed": true 961 | }, 962 | "outputs": [], 963 | "source": [ 964 | "columns = ['make_x','make_y', 'edge','year']\n", 965 | "graph_all_years = pd.DataFrame(columns=columns)" 966 | ] 967 | }, 968 | { 969 | "cell_type": "code", 970 | "execution_count": null, 971 | "metadata": { 972 | "collapsed": true 973 | }, 974 | "outputs": [], 975 | "source": [ 976 | "for i in vehicles['year'].unique():\n", 977 | " vehicles_year = vehicles[vehicles.year==i]\n", 978 | "\n", 979 | " graph_year = pd.DataFrame(vehicles_year.groupby([entity,'cylinders','displ','trantype','drive',\n", 980 | " 'comb08','VClass', 'cluster'], \n", 981 | " as_index=False).size()).reset_index()\n", 982 | "\n", 983 | " graph_year = graph_year.rename(columns={0: 'count'})\n", 984 | " graph_year['edge'] = (graph_year['cylinders'].map(str)\n", 985 | " + graph_year['displ'].map(str)\n", 986 | " + graph_year['trantype']\n", 987 | " + graph_year['drive']\n", 988 | " + graph_year['comb08'].map(str)\n", 989 | " + graph_year['VClass']\n", 990 | " + graph_year['cluster']\n", 991 | " )\n", 992 | "\n", 993 | " graph_year = graph_year[[entity, 'edge', 'count']]\n", 994 | " vehicle_make_graph = df_to_graph(graph_year, entity, 'edge')\n", 995 | " vehicle_make_graph['year'] = i\n", 996 | " graph_all_years = graph_all_years.append(vehicle_make_graph)" 997 | ] 998 | }, 999 | { 1000 | "cell_type": "code", 1001 | "execution_count": 
null, 1002 | "metadata": { 1003 | "collapsed": false 1004 | }, 1005 | "outputs": [], 1006 | "source": [ 1007 | "graph_summary = graph_all_years.groupby(['make_x', 'year'], \n", 1008 | " as_index=False).sum()\n", 1009 | "\n", 1010 | "graph_summary.head()" 1011 | ] 1012 | }, 1013 | { 1014 | "cell_type": "code", 1015 | "execution_count": null, 1016 | "metadata": { 1017 | "collapsed": false 1018 | }, 1019 | "outputs": [], 1020 | "source": [ 1021 | "def graph_multi_line(df, x, y):\n", 1022 | " ax = df.groupby([x, y]).sum().unstack(y).plot(figsize=(15,8), cmap=\"jet\")\n", 1023 | " ax.legend(loc='center', bbox_to_anchor=(0.5, -0.35),\n", 1024 | " ncol=5, fancybox=True, shadow=True, labels=df[y].unique())\n", 1025 | "\n", 1026 | "graph_multi_line(graph_summary, 'year', 'make_x')" 1027 | ] 1028 | }, 1029 | { 1030 | "cell_type": "code", 1031 | "execution_count": null, 1032 | "metadata": { 1033 | "collapsed": false 1034 | }, 1035 | "outputs": [], 1036 | "source": [ 1037 | "makes = ['Chevrolet', 'Ford', 'Toyota', 'Honda', 'Nissan']\n", 1038 | "\n", 1039 | "def graph_multi_line_makes(df, x, y):\n", 1040 | " ax = df.groupby([x, y]).sum().unstack(y).plot(figsize=(15,8), cmap=\"jet\")\n", 1041 | " ax.legend(loc='center', bbox_to_anchor=(0.5, -0.15),\n", 1042 | " ncol=5, fancybox=True, shadow=True, labels=df[y].unique())\n", 1043 | "\n", 1044 | "graph_summary_makes = graph_summary[graph_summary.make_x.isin(makes)]\n", 1045 | "graph_multi_line_makes(graph_summary_makes, 'year', 'make_x')" 1046 | ] 1047 | } 1048 | ], 1049 | "metadata": { 1050 | "anaconda-cloud": {}, 1051 | "kernelspec": { 1052 | "display_name": "Python [py27]", 1053 | "language": "python", 1054 | "name": "Python [py27]" 1055 | }, 1056 | "language_info": { 1057 | "codemirror_mode": { 1058 | "name": "ipython", 1059 | "version": 2 1060 | }, 1061 | "file_extension": ".py", 1062 | "mimetype": "text/x-python", 1063 | "name": "python", 1064 | "nbconvert_exporter": "python", 1065 | "pygments_lexer": "ipython2", 1066 | 
"version": "2.7.12" 1067 | } 1068 | }, 1069 | "nbformat": 4, 1070 | "nbformat_minor": 0 1071 | } 1072 | --------------------------------------------------------------------------------