├── .DS_Store ├── transformation.pdf ├── README.md ├── LICENSE ├── Transforming Data to Unlock Its Latent Value.ipynb └── .ipynb_checkpoints └── Transforming Data to Unlock Its Latent Value-checkpoint.ipynb /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ojedatony1616/exploratory_transformation/HEAD/.DS_Store -------------------------------------------------------------------------------- /transformation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ojedatony1616/exploratory_transformation/HEAD/transformation.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # exploratory_transformation 2 | Repository for exploratory data transformation & visualization talk 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Transforming Data to Unlock Its Latent Value.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Transforming Data to Unlock Its Latent Value" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import os\n", 19 | "import zipfile\n", 20 | "import requests\n", 21 | "import pandas as pd\n", 22 | "import numpy as np\n", 23 | "\n", 24 | "import warnings\n", 25 | "warnings.filterwarnings('ignore')\n", 26 | "\n", 27 | "import matplotlib.pyplot as plt\n", 28 | "import seaborn as sns\n", 29 | "\n", 30 | "path = 'data'\n", 31 | "\n", 32 | "%matplotlib inline" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "## Download the Data" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": { 46 | "collapsed": true 47 | }, 48 
| "outputs": [], 49 | "source": [ 50 | "def download_data(url, name, path='data'):\n", 51 | " if not os.path.exists(path):\n", 52 | " os.mkdir(path)\n", 53 | "\n", 54 | " response = requests.get(url)\n", 55 | " with open(os.path.join(path, name), 'wb') as f:\n", 56 | " f.write(response.content)\n", 57 | " \n", 58 | " z = zipfile.ZipFile(os.path.join(path, 'vehicles.zip'))\n", 59 | " z.extractall(path)\n", 60 | "\n", 61 | "VEHICLES = 'http://www.fueleconomy.gov/feg/epadata/vehicles.csv.zip'\n", 62 | "\n", 63 | "download_data(VEHICLES, 'vehicles.zip')" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": { 70 | "collapsed": true 71 | }, 72 | "outputs": [], 73 | "source": [ 74 | "vehicles = pd.read_csv(os.path.join(path, 'vehicles.csv'))" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "## Clean and Reorganize the Data" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": { 88 | "collapsed": false 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "select_columns = ['make', 'model', 'year', 'displ', 'cylinders', 'trany', 'drive', 'VClass','fuelType', \n", 93 | " 'barrels08', 'city08', 'highway08', 'comb08', 'co2TailpipeGpm', 'fuelCost08']\n", 94 | "\n", 95 | "vehicles = vehicles[select_columns][vehicles.year <= 2016].drop_duplicates().dropna()\n", 96 | "vehicles = vehicles.sort_values(['make', 'model', 'year'])\n", 97 | "vehicles.head()" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "## Create Category Aggregations\n", 105 | "\n", 106 | "Hint: Look for object fields that have many categories. 
" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": { 113 | "collapsed": false 114 | }, 115 | "outputs": [], 116 | "source": [ 117 | "def unique_col_values(df):\n", 118 | " for column in df:\n", 119 | " print(str(df[column].name) + \" | \" + str(len(df[column].unique())) + \" | \" + str(df[column].dtype))\n", 120 | "\n", 121 | "unique_col_values(vehicles)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": { 128 | "collapsed": true 129 | }, 130 | "outputs": [], 131 | "source": [ 132 | "#Create new trantype field that specifies whether the vehicle is Automatic or Manual. \n", 133 | "vehicles.loc[vehicles.trany.str[0] == 'A', 'trantype'] = 'Automatic'\n", 134 | "vehicles.loc[vehicles.trany.str[0] == 'M', 'trantype'] = 'Manual'\n", 135 | "\n", 136 | "#Create new model_type field that parses the model type from the model field. \n", 137 | "vehicles['model_type'] = vehicles.make + \" \" + vehicles.model.str.split().str.get(0)\n", 138 | "\n", 139 | "#Create new category field that rolls up VClass into more general categories. 
\n", 140 | "small = ['Compact Cars','Subcompact Cars','Two Seaters','Minicompact Cars']\n", 141 | "midsize = ['Midsize Cars']\n", 142 | "large = ['Large Cars']\n", 143 | "\n", 144 | "vehicles.loc[vehicles.VClass.isin(small), 'category'] = 'Small Cars'\n", 145 | "vehicles.loc[vehicles.VClass.isin(midsize), 'category'] = 'Midsize Cars'\n", 146 | "vehicles.loc[vehicles.VClass.isin(large), 'category'] = 'Large Cars'\n", 147 | "vehicles.loc[vehicles.VClass.str.contains('Station'), 'category'] = 'Station Wagons'\n", 148 | "vehicles.loc[vehicles.VClass.str.contains('Pickup'), 'category'] = 'Pickup Trucks'\n", 149 | "vehicles.loc[vehicles.VClass.str.contains('Special Purpose'), 'category'] = 'Special Purpose'\n", 150 | "vehicles.loc[vehicles.VClass.str.contains('Sport Utility'), 'category'] = 'Sport Utility'\n", 151 | "vehicles.loc[(vehicles.VClass.str.contains('van')) | (vehicles.VClass.str.contains('van')),\n", 152 | " 'category'] = 'Vans & Minivans'\n", 153 | "\n", 154 | "#Create new fuel_category field that rolls up fuelType into more general categories. 
\n", 155 | "vehicles['fuel_category'] = ''\n", 156 | "gas = ['Regular', 'Premium', 'Midgrade']\n", 157 | "vehicles.loc[vehicles.fuelType.isin(gas), 'fuel_category'] = 'Gasoline'\n", 158 | "vehicles.loc[vehicles.fuelType == 'Diesel', 'fuel_category'] = 'Diesel'\n", 159 | "vehicles.loc[vehicles.fuel_category == '', 'fuel_category'] = 'Alternative/Hybrid'" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "## Create Categorical Fields from Continuous" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": { 173 | "collapsed": true 174 | }, 175 | "outputs": [], 176 | "source": [ 177 | "engine_categories = ['Very Small Engine', 'Small Engine','Moderate Engine', \n", 178 | " 'Large Engine', 'Very Large Engine']\n", 179 | "vehicles['engine_size'] = pd.qcut(vehicles.displ, 5, engine_categories)\n", 180 | "\n", 181 | "efficiency_categories = ['Very Low Efficiency', 'Low Efficiency', 'Moderate Efficiency',\n", 182 | " 'High Efficiency', 'Very High Efficiency']\n", 183 | "vehicles['fuel_efficiency'] = pd.qcut(vehicles.comb08, 5, efficiency_categories)\n", 184 | "\n", 185 | "emmission_categories = ['Very Low Emmissions', 'Low Emmissions', 'Moderate Emmissions',\n", 186 | " 'High Emmissions', 'Very High Emmissions']\n", 187 | "vehicles['emmission'] = pd.qcut(vehicles.co2TailpipeGpm, 5, emmission_categories)\n", 188 | "\n", 189 | "fuelcost_categories = ['Very Low Fuel Cost', 'Low Fuel Cost', 'Moderate Fuel Cost',\n", 190 | " 'High Fuel Cost', 'Very High Fuel Cost']\n", 191 | "vehicles['fuel_cost'] = pd.qcut(vehicles.fuelCost08, 5, fuelcost_categories)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "## Cluster to Create Additional Categories" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": { 205 | "collapsed": true 206 | }, 207 | "outputs": [], 208 | "source": [ 209 | 
"vehicles_numeric = vehicles._get_numeric_data()\n", 210 | "del vehicles_numeric['year']\n", 211 | "\n", 212 | "vehicles_numeric_norm = vehicles_numeric.apply(lambda x: (x / x.max()))" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": { 219 | "collapsed": false 220 | }, 221 | "outputs": [], 222 | "source": [ 223 | "from sklearn.cluster import KMeans\n", 224 | "\n", 225 | "model = KMeans(n_clusters=8)\n", 226 | "clusters = model.fit_predict(vehicles_numeric_norm)\n", 227 | "vehicles_numeric_norm['cluster'] = clusters\n", 228 | "\n", 229 | "cluster_means = vehicles_numeric_norm.groupby(['cluster'], as_index=False).mean()\n", 230 | "cluster_columns = ['displ','cylinders','barrels08','city08','highway08','comb08','co2TailpipeGpm','fuelCost08']\n", 231 | "\n", 232 | "fig, ax = plt.subplots(figsize=(20,10))\n", 233 | "sns.heatmap(cluster_means[cluster_columns], annot=True)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": { 240 | "collapsed": false 241 | }, 242 | "outputs": [], 243 | "source": [ 244 | "model = KMeans(n_clusters=4)\n", 245 | "clusters = model.fit_predict(vehicles_numeric_norm)\n", 246 | "vehicles_numeric_norm['cluster'] = clusters\n", 247 | "\n", 248 | "cluster_means = vehicles_numeric_norm.groupby(['cluster'], as_index=False).mean()\n", 249 | "cluster_columns = ['displ','cylinders','barrels08','city08','highway08','comb08','co2TailpipeGpm','fuelCost08']\n", 250 | "\n", 251 | "fig, ax = plt.subplots(figsize=(20,10))\n", 252 | "sns.heatmap(cluster_means[cluster_columns], annot=True)" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": { 259 | "collapsed": true 260 | }, 261 | "outputs": [], 262 | "source": [ 263 | "vehicles['cluster'] = clusters\n", 264 | "vehicles['cluster'][vehicles['cluster']==0] = 'Small Very Efficient'\n", 265 | "vehicles['cluster'][vehicles['cluster']==1] = 'Large Inefficient'\n", 266 
| "vehicles['cluster'][vehicles['cluster']==2] = 'Midsized Balanced'\n", 267 | "vehicles['cluster'][vehicles['cluster']==3] = 'Small Moderately Efficient'" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "## Aggregate and Filter" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "metadata": { 281 | "collapsed": true 282 | }, 283 | "outputs": [], 284 | "source": [ 285 | "def barchart(df, group_field, calc_field, calc, length, width):\n", 286 | " grouped = pd.DataFrame(zip_agg.groupby(group_field).agg({calc_field: {calc_field: calc}}).to_records())\n", 287 | " grouped.columns = [group_field, calc_field]\n", 288 | " grouped = grouped.sort_values(calc_field, ascending=False)\n", 289 | "\n", 290 | " fig = plt.subplots(figsize=(width,length))\n", 291 | " ax = sns.barplot(x=calc_field, y=group_field, data=grouped)\n", 292 | " ax.set(xlabel=calc + '(' + calc_field + ')', ylabel=group_field )\n", 293 | " plt.show()" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": { 300 | "collapsed": false 301 | }, 302 | "outputs": [], 303 | "source": [ 304 | "barchart(vehicles, 2016, 'category','count', 6,8)" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": { 311 | "collapsed": false 312 | }, 313 | "outputs": [], 314 | "source": [ 315 | "count_barchart(vehicles, 1985, 'category', 'count', 6,8)" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": { 322 | "collapsed": false 323 | }, 324 | "outputs": [], 325 | "source": [ 326 | "count_barchart(vehicles, 2016, 'engine_size', 'count', 6, 8)" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": null, 332 | "metadata": { 333 | "collapsed": false 334 | }, 335 | "outputs": [], 336 | "source": [ 337 | "count_barchart(vehicles, 2016, 'fuel_efficiency', 'count',6, 8)" 338 | ] 339 | }, 
count_barchart(vehicles, 2016, 'cluster', 'count',6,8)

count_barchart(vehicles, 2016, 'make', 'count',12, 12)

# ## More Details with Pivoting

def pivot_heatmap(df, year, rows, columns, values, width, length):
    """Annotated heatmap of record counts for `rows` x `columns`.

    Filters `df` to a single model year, pivots it with a size (count)
    aggregation over `values`, drops all-empty rows, and renders the
    result as a seaborn heatmap.
    """
    single_year = df[df.year == year]
    counts = single_year.pivot_table(values=values, index=rows, columns=columns,
                                     aggfunc=np.size)
    counts = counts.dropna(axis=0, how='all')

    fig = plt.subplots(figsize=(width, length))
    ax = sns.heatmap(counts, annot=True, fmt='g')
    plt.show()

pivot_heatmap(vehicles, 2016, 'fuel_efficiency','engine_size','comb08',15, 8)

pivot_heatmap(vehicles, 1985, 'fuel_efficiency','engine_size','comb08',15, 8)

pivot_heatmap(vehicles, 2016, 'cluster','category', 'comb08', 15, 10)
"cell_type": "code", 422 | "execution_count": null, 423 | "metadata": { 424 | "collapsed": false 425 | }, 426 | "outputs": [], 427 | "source": [ 428 | "pivot_heatmap(vehicles, 2016, ['engine_size', 'fuel_efficiency'],'category', 'comb08', 15, 15)" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": null, 434 | "metadata": { 435 | "collapsed": false 436 | }, 437 | "outputs": [], 438 | "source": [ 439 | "pivot_heatmap(vehicles, 2016, 'make','category', 'comb08', 10, 10)" 440 | ] 441 | }, 442 | { 443 | "cell_type": "markdown", 444 | "metadata": {}, 445 | "source": [ 446 | "## Exploring Aggregations Over Time" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": null, 452 | "metadata": { 453 | "collapsed": true 454 | }, 455 | "outputs": [], 456 | "source": [ 457 | "def multi_line(df, x, y):\n", 458 | " ax = df.groupby([x, y]).size().unstack(y).plot(figsize=(15,8), cmap=\"Set2\")" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": null, 464 | "metadata": { 465 | "collapsed": false 466 | }, 467 | "outputs": [], 468 | "source": [ 469 | "multi_line(vehicles, 'year', 'category')" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": null, 475 | "metadata": { 476 | "collapsed": false 477 | }, 478 | "outputs": [], 479 | "source": [ 480 | "bmw = vehicles[vehicles.make == 'BMW']\n", 481 | "multi_line(bmw, 'year', 'category')" 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": null, 487 | "metadata": { 488 | "collapsed": false 489 | }, 490 | "outputs": [], 491 | "source": [ 492 | "toyota = vehicles[vehicles.make == 'Toyota']\n", 493 | "multi_line(toyota, 'year', 'category')" 494 | ] 495 | }, 496 | { 497 | "cell_type": "markdown", 498 | "metadata": {}, 499 | "source": [ 500 | "## Exploring Field Relationships" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": null, 506 | "metadata": { 507 | "collapsed": false 508 | }, 509 | 
"outputs": [], 510 | "source": [ 511 | "def scatter_matrix(df, labels=None):\n", 512 | " ax = sns.pairplot(df, hue=labels, diag_kind='kde', size=2)\n", 513 | " plt.show()\n", 514 | "\n", 515 | "scatter_matrix(vehicles_numeric_norm)" 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": null, 521 | "metadata": { 522 | "collapsed": false 523 | }, 524 | "outputs": [], 525 | "source": [ 526 | "scatter_matrix(vehicles_numeric_norm, labels=\"cluster\")" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": null, 532 | "metadata": { 533 | "collapsed": false 534 | }, 535 | "outputs": [], 536 | "source": [ 537 | "vehicles_numeric_norm['Cluster'] = vehicles['cluster']\n", 538 | "sns.lmplot('displ', 'comb08', data=vehicles_numeric_norm, hue='Cluster', size=8, fit_reg=False)" 539 | ] 540 | }, 541 | { 542 | "cell_type": "code", 543 | "execution_count": null, 544 | "metadata": { 545 | "collapsed": false 546 | }, 547 | "outputs": [], 548 | "source": [ 549 | "sns.lmplot('displ', 'fuelCost08', data=vehicles_numeric_norm, hue='Cluster', size=8, fit_reg=False)" 550 | ] 551 | }, 552 | { 553 | "cell_type": "markdown", 554 | "metadata": {}, 555 | "source": [ 556 | "## Exploring Entity Relationships (Graph Analysis)" 557 | ] 558 | }, 559 | { 560 | "cell_type": "code", 561 | "execution_count": null, 562 | "metadata": { 563 | "collapsed": true 564 | }, 565 | "outputs": [], 566 | "source": [ 567 | "entity = 'make'\n", 568 | "year = 2016\n", 569 | "\n", 570 | "vehicles_year = vehicles[vehicles.year==year]\n", 571 | "\n", 572 | "graph_year = pd.DataFrame(vehicles_year.groupby([entity,'cylinders','displ','trantype','drive',\n", 573 | " 'comb08','VClass', 'cluster'], \n", 574 | " as_index=False).size()).reset_index()\n", 575 | "\n", 576 | "graph_year = graph_year.rename(columns={0: 'count'})\n", 577 | "graph_year['edge'] = (graph_year['cylinders'].map(str)\n", 578 | " + graph_year['displ'].map(str)\n", 579 | " + graph_year['trantype']\n", 580 | " + 
graph_year['drive']\n", 581 | " + graph_year['comb08'].map(str)\n", 582 | " + graph_year['VClass']\n", 583 | " + graph_year['cluster']\n", 584 | " )\n", 585 | "\n", 586 | "graph_year = graph_year[[entity, 'edge', 'count']]" 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": null, 592 | "metadata": { 593 | "collapsed": true 594 | }, 595 | "outputs": [], 596 | "source": [ 597 | "def df_to_graph(df, entity, edge):\n", 598 | " df2 = df.copy()\n", 599 | " graph_df = pd.merge(df, df2, how='inner', on=edge)\n", 600 | " graph_df = graph_df.groupby([entity + '_x', entity + '_y']).count().reset_index()\n", 601 | " graph_df = graph_df[graph_df[entity + '_x'] != graph_df[entity + '_y']]\n", 602 | " graph_df = graph_df[[entity + '_x', entity + '_y', edge]]\n", 603 | " return graph_df" 604 | ] 605 | }, 606 | { 607 | "cell_type": "code", 608 | "execution_count": null, 609 | "metadata": { 610 | "collapsed": false 611 | }, 612 | "outputs": [], 613 | "source": [ 614 | "vehicle_make_graph = df_to_graph(graph_year, entity, 'edge')\n", 615 | "vehicle_make_graph.head(10)" 616 | ] 617 | }, 618 | { 619 | "cell_type": "code", 620 | "execution_count": null, 621 | "metadata": { 622 | "collapsed": false 623 | }, 624 | "outputs": [], 625 | "source": [ 626 | "import networkx as nx\n", 627 | "import graph_tool.all as gt\n", 628 | "import graph_tool as gt\n", 629 | "from graph_tool import *\n", 630 | "\n", 631 | "G = nx.from_pandas_dataframe(vehicle_make_graph, entity + '_x', entity + '_y', 'edge')" 632 | ] 633 | }, 634 | { 635 | "cell_type": "markdown", 636 | "metadata": {}, 637 | "source": [ 638 | "[Converting NetworkX to Graph-Tool](http://bbengfort.github.io/snippets/2016/06/23/graph-tool-from-networkx.html) by Benjamin Bengfort (converts NetworkX graphs to much prettier Graph-Tool graphs). 
" 639 | ] 640 | }, 641 | { 642 | "cell_type": "code", 643 | "execution_count": null, 644 | "metadata": { 645 | "collapsed": true 646 | }, 647 | "outputs": [], 648 | "source": [ 649 | "def get_prop_type(value, key=None):\n", 650 | " \"\"\"\n", 651 | " Performs typing and value conversion for the graph_tool PropertyMap class.\n", 652 | " If a key is provided, it also ensures the key is in a format that can be\n", 653 | " used with the PropertyMap. Returns a tuple, (type name, value, key)\n", 654 | " \"\"\"\n", 655 | " if isinstance(key, unicode):\n", 656 | " # Encode the key as ASCII\n", 657 | " key = key.encode('ascii', errors='replace')\n", 658 | "\n", 659 | " # Deal with the value\n", 660 | " if isinstance(value, bool):\n", 661 | " tname = 'bool'\n", 662 | "\n", 663 | " elif isinstance(value, int):\n", 664 | " tname = 'float'\n", 665 | " value = float(value)\n", 666 | "\n", 667 | " elif isinstance(value, float):\n", 668 | " tname = 'float'\n", 669 | "\n", 670 | " elif isinstance(value, unicode):\n", 671 | " tname = 'string'\n", 672 | " value = value.encode('ascii', errors='replace')\n", 673 | "\n", 674 | " elif isinstance(value, dict):\n", 675 | " tname = 'object'\n", 676 | "\n", 677 | " else:\n", 678 | " tname = 'string'\n", 679 | " value = str(value)\n", 680 | "\n", 681 | " return tname, value, key\n", 682 | "\n", 683 | "\n", 684 | "def nx2gt(nxG):\n", 685 | " \"\"\"\n", 686 | " Converts a networkx graph to a graph-tool graph.\n", 687 | " \"\"\"\n", 688 | " # Phase 0: Create a directed or undirected graph-tool Graph\n", 689 | " gtG = gt.Graph(directed=nxG.is_directed())\n", 690 | "\n", 691 | " # Add the Graph properties as \"internal properties\"\n", 692 | " for key, value in nxG.graph.items():\n", 693 | " # Convert the value and key into a type for graph-tool\n", 694 | " tname, value, key = get_prop_type(value, key)\n", 695 | "\n", 696 | " prop = gtG.new_graph_property(tname) # Create the PropertyMap\n", 697 | " gtG.graph_properties[key] = prop # Set the 
PropertyMap\n", 698 | " gtG.graph_properties[key] = value # Set the actual value\n", 699 | "\n", 700 | " # Phase 1: Add the vertex and edge property maps\n", 701 | " # Go through all nodes and edges and add seen properties\n", 702 | " # Add the node properties first\n", 703 | " nprops = set() # cache keys to only add properties once\n", 704 | " for node, data in nxG.nodes_iter(data=True):\n", 705 | "\n", 706 | " # Go through all the properties if not seen and add them.\n", 707 | " for key, val in data.items():\n", 708 | " if key in nprops: continue # Skip properties already added\n", 709 | "\n", 710 | " # Convert the value and key into a type for graph-tool\n", 711 | " tname, _, key = get_prop_type(val, key)\n", 712 | "\n", 713 | " prop = gtG.new_vertex_property(tname) # Create the PropertyMap\n", 714 | " gtG.vertex_properties[key] = prop # Set the PropertyMap\n", 715 | "\n", 716 | " # Add the key to the already seen properties\n", 717 | " nprops.add(key)\n", 718 | "\n", 719 | " # Also add the node id: in NetworkX a node can be any hashable type, but\n", 720 | " # in graph-tool node are defined as indices. 
So we capture any strings\n", 721 | " # in a special PropertyMap called 'id' -- modify as needed!\n", 722 | " gtG.vertex_properties['id'] = gtG.new_vertex_property('string')\n", 723 | "\n", 724 | " # Add the edge properties second\n", 725 | " eprops = set() # cache keys to only add properties once\n", 726 | " for src, dst, data in nxG.edges_iter(data=True):\n", 727 | "\n", 728 | " # Go through all the edge properties if not seen and add them.\n", 729 | " for key, val in data.items():\n", 730 | " if key in eprops: continue # Skip properties already added\n", 731 | "\n", 732 | " # Convert the value and key into a type for graph-tool\n", 733 | " tname, _, key = get_prop_type(val, key)\n", 734 | "\n", 735 | " prop = gtG.new_edge_property(tname) # Create the PropertyMap\n", 736 | " gtG.edge_properties[key] = prop # Set the PropertyMap\n", 737 | "\n", 738 | " # Add the key to the already seen properties\n", 739 | " eprops.add(key)\n", 740 | "\n", 741 | " # Phase 2: Actually add all the nodes and vertices with their properties\n", 742 | " # Add the nodes\n", 743 | " vertices = {} # vertex mapping for tracking edges later\n", 744 | " for node, data in nxG.nodes_iter(data=True):\n", 745 | "\n", 746 | " # Create the vertex and annotate for our edges later\n", 747 | " v = gtG.add_vertex()\n", 748 | " vertices[node] = v\n", 749 | "\n", 750 | " # Set the vertex properties, not forgetting the id property\n", 751 | " data['id'] = str(node)\n", 752 | " for key, value in data.items():\n", 753 | " gtG.vp[key][v] = value # vp is short for vertex_properties\n", 754 | "\n", 755 | " # Add the edges\n", 756 | " for src, dst, data in nxG.edges_iter(data=True):\n", 757 | "\n", 758 | " # Look up the vertex structs from our vertices mapping and add edge.\n", 759 | " e = gtG.add_edge(vertices[src], vertices[dst])\n", 760 | "\n", 761 | " # Add the edge properties\n", 762 | " for key, value in data.items():\n", 763 | " gtG.ep[key][e] = value # ep is short for edge_properties\n", 764 | "\n", 765 
| " # Done, finally!\n", 766 | " return gtG\n", 767 | "\n", 768 | "\n", 769 | "if __name__ == '__main__':\n", 770 | "\n", 771 | " # Create the networkx graph\n", 772 | " nxG = nx.Graph(name=\"Undirected Graph\")\n", 773 | " nxG.add_node(\"v1\", name=\"alpha\", color=\"red\")\n", 774 | " nxG.add_node(\"v2\", name=\"bravo\", color=\"blue\")\n", 775 | " nxG.add_node(\"v3\", name=\"charlie\", color=\"blue\")\n", 776 | " nxG.add_node(\"v4\", name=\"hub\", color=\"purple\")\n", 777 | " nxG.add_node(\"v5\", name=\"delta\", color=\"red\")\n", 778 | " nxG.add_node(\"v6\", name=\"echo\", color=\"red\")\n", 779 | "\n", 780 | " nxG.add_edge(\"v1\", \"v2\", weight=0.5, label=\"follows\")\n", 781 | " nxG.add_edge(\"v1\", \"v3\", weight=0.25, label=\"follows\")\n", 782 | " nxG.add_edge(\"v2\", \"v4\", weight=0.05, label=\"follows\")\n", 783 | " nxG.add_edge(\"v3\", \"v4\", weight=0.35, label=\"follows\")\n", 784 | " nxG.add_edge(\"v5\", \"v4\", weight=0.65, label=\"follows\")\n", 785 | " nxG.add_edge(\"v6\", \"v4\", weight=0.53, label=\"follows\")\n", 786 | " nxG.add_edge(\"v5\", \"v6\", weight=0.21, label=\"follows\")\n", 787 | "\n", 788 | " for item in nxG.edges_iter(data=True):\n", 789 | " print(item)\n", 790 | "\n", 791 | " # Convert to graph-tool graph\n", 792 | " gtG = nx2gt(nxG)\n", 793 | " gtG.list_properties()" 794 | ] 795 | }, 796 | { 797 | "cell_type": "code", 798 | "execution_count": null, 799 | "metadata": { 800 | "collapsed": true 801 | }, 802 | "outputs": [], 803 | "source": [ 804 | "def plot_graph(graph, width, length):\n", 805 | " g = nx2gt(graph)\n", 806 | " vlabel = g.vp['id']\n", 807 | " gt.graph_draw(g, output_size=(width,length), vertex_text=vlabel, vertex_font_weight=0.2, \n", 808 | " vertex_size=5, vertex_fill_color='cyan')\n", 809 | "\n", 810 | "plot_graph(G, 1200, 800)" 811 | ] 812 | }, 813 | { 814 | "cell_type": "code", 815 | "execution_count": null, 816 | "metadata": { 817 | "collapsed": true 818 | }, 819 | "outputs": [], 820 | "source": [ 821 | "ego = 
nx.ego_graph(G, 'Nissan', 1)\n", 822 | "plot_graph(ego, 500, 500)" 823 | ] 824 | }, 825 | { 826 | "cell_type": "code", 827 | "execution_count": null, 828 | "metadata": { 829 | "collapsed": true 830 | }, 831 | "outputs": [], 832 | "source": [ 833 | "import community\n", 834 | "\n", 835 | "def detect_communities(graph):\n", 836 | " partition = community.best_partition(graph)\n", 837 | " nx.set_node_attributes(graph, 'partition', partition)\n", 838 | " return graph, partition\n", 839 | "\n", 840 | "make_communities = pd.DataFrame(detect_communities(G)[1].items(), \n", 841 | " columns=['make', 'community']).sort_values('community', ascending=True)\n", 842 | "\n", 843 | "make_communities.head()" 844 | ] 845 | }, 846 | { 847 | "cell_type": "code", 848 | "execution_count": null, 849 | "metadata": { 850 | "collapsed": true 851 | }, 852 | "outputs": [], 853 | "source": [ 854 | "import random\n", 855 | "from copy import copy\n", 856 | "\n", 857 | "##########################################################################\n", 858 | "## Color Palettes\n", 859 | "##########################################################################\n", 860 | "\n", 861 | "FLATUI = [\"#9b59b6\", \"#3498db\", \"#95a5a6\", \"#e74c3c\", \"#34495e\", \"#2ecc71\"]\n", 862 | "PAIRED = [\n", 863 | " \"#a6cee3\", \"#1f78b4\", \"#b2df8a\", \"#33a02c\", \"#fb9a99\", \"#e31a1c\",\n", 864 | " \"#fdbf6f\", \"#ff7f00\", \"#cab2d6\", \"#6a3d9a\", \"#ffff99\", \"#b15928\",\n", 865 | "]\n", 866 | "SET1 = [\n", 867 | " \"#e41a1c\", \"#377eb8\", \"#4daf4a\",\n", 868 | " \"#984ea3\", \"#ff7f00\", \"#ffff33\",\n", 869 | " \"#a65628\", \"#f781bf\", \"#999999\"\n", 870 | "]\n", 871 | "\n", 872 | "PALETTES = {\n", 873 | " 'flatui': FLATUI,\n", 874 | " 'paired': PAIRED,\n", 875 | " 'set1': SET1,\n", 876 | "}\n", 877 | "\n", 878 | "##########################################################################\n", 879 | "## Color Utilities\n", 880 | 
"##########################################################################\n", 881 | "\n", 882 | "class ColorMap(object):\n", 883 | " \"\"\"\n", 884 | " A helper for mapping categorical values to colors on demand.\n", 885 | " \"\"\"\n", 886 | "\n", 887 | " def __init__(self, colors='flatui', shuffle=False):\n", 888 | " \"\"\"\n", 889 | " Specify either a list of colors or one of the color names. If shuffle\n", 890 | " is True then the colors will be shuffled randomly.\n", 891 | " \"\"\"\n", 892 | " self.mapping = {}\n", 893 | " self.colors = colors\n", 894 | "\n", 895 | " if shuffle:\n", 896 | " random.shuffle(self._colors)\n", 897 | "\n", 898 | " @property\n", 899 | " def colors(self):\n", 900 | " return self._colors\n", 901 | "\n", 902 | " @colors.setter\n", 903 | " def colors(self, value):\n", 904 | " \"\"\"\n", 905 | " Converts color strings into a color listing.\n", 906 | " \"\"\"\n", 907 | " if isinstance(value, basestring):\n", 908 | " if value not in PALETTES:\n", 909 | " raise ValueError(\"'{}' is not a registered color palette\")\n", 910 | " self._colors = copy(PALETTES[value])\n", 911 | " elif isinstance(value, list):\n", 912 | " self._colors = value\n", 913 | " else:\n", 914 | " self._colors = list(value)\n", 915 | "\n", 916 | " def __call__(self, category):\n", 917 | " if category not in self.mapping:\n", 918 | " if self.colors:\n", 919 | " self.mapping[category] = self.colors.pop()\n", 920 | " else:\n", 921 | " raise ValueError(\n", 922 | " \"Not enough colors for this many categories!\"\n", 923 | " )\n", 924 | "\n", 925 | " return self.mapping[category]" 926 | ] 927 | }, 928 | { 929 | "cell_type": "code", 930 | "execution_count": null, 931 | "metadata": { 932 | "collapsed": true 933 | }, 934 | "outputs": [], 935 | "source": [ 936 | "def plot_community_graph(graph, community_df, width, length):\n", 937 | " g = nx2gt(G)\n", 938 | " vlabel = g.vp['id']\n", 939 | " vcolor = g.new_vertex_property('string') \n", 940 | " vcmap = ColorMap('flatui', 
shuffle=False)\n", 941 | " for vertex in g.vertices():\n", 942 | " vcolor[vertex] = vcmap(community_df.community[vertex])\n", 943 | " gt.graph_draw(g, output_size=(width,length), vertex_text=vlabel, vertex_font_weight=0.2, \n", 944 | " vertex_size=5, vertex_fill_color=vcolor)\n", 945 | "\n", 946 | "plot_community_graph(G, make_communities, 1200, 800)" 947 | ] 948 | }, 949 | { 950 | "cell_type": "markdown", 951 | "metadata": {}, 952 | "source": [ 953 | "## Exploring Connections Over Time" 954 | ] 955 | }, 956 | { 957 | "cell_type": "code", 958 | "execution_count": null, 959 | "metadata": { 960 | "collapsed": true 961 | }, 962 | "outputs": [], 963 | "source": [ 964 | "columns = ['make_x','make_y', 'edge','year']\n", 965 | "graph_all_years = pd.DataFrame(columns=columns)" 966 | ] 967 | }, 968 | { 969 | "cell_type": "code", 970 | "execution_count": null, 971 | "metadata": { 972 | "collapsed": true 973 | }, 974 | "outputs": [], 975 | "source": [ 976 | "for i in vehicles['year'].unique():\n", 977 | " vehicles_year = vehicles[vehicles.year==i]\n", 978 | "\n", 979 | " graph_year = pd.DataFrame(vehicles_year.groupby([entity,'cylinders','displ','trantype','drive',\n", 980 | " 'comb08','VClass', 'cluster'], \n", 981 | " as_index=False).size()).reset_index()\n", 982 | "\n", 983 | " graph_year = graph_year.rename(columns={0: 'count'})\n", 984 | " graph_year['edge'] = (graph_year['cylinders'].map(str)\n", 985 | " + graph_year['displ'].map(str)\n", 986 | " + graph_year['trantype']\n", 987 | " + graph_year['drive']\n", 988 | " + graph_year['comb08'].map(str)\n", 989 | " + graph_year['VClass']\n", 990 | " + graph_year['cluster']\n", 991 | " )\n", 992 | "\n", 993 | " graph_year = graph_year[[entity, 'edge', 'count']]\n", 994 | " vehicle_make_graph = df_to_graph(graph_year, entity, 'edge')\n", 995 | " vehicle_make_graph['year'] = i\n", 996 | " graph_all_years = graph_all_years.append(vehicle_make_graph)" 997 | ] 998 | }, 999 | { 1000 | "cell_type": "code", 1001 | "execution_count": 
null, 1002 | "metadata": { 1003 | "collapsed": false 1004 | }, 1005 | "outputs": [], 1006 | "source": [ 1007 | "graph_summary = graph_all_years.groupby(['make_x', 'year'], \n", 1008 | " as_index=False).sum()\n", 1009 | "\n", 1010 | "graph_summary.head()" 1011 | ] 1012 | }, 1013 | { 1014 | "cell_type": "code", 1015 | "execution_count": null, 1016 | "metadata": { 1017 | "collapsed": false 1018 | }, 1019 | "outputs": [], 1020 | "source": [ 1021 | "def graph_multi_line(df, x, y):\n", 1022 | " ax = df.groupby([x, y]).sum().unstack(y).plot(figsize=(15,8), cmap=\"jet\")\n", 1023 | " ax.legend(loc='center', bbox_to_anchor=(0.5, -0.35),\n", 1024 | " ncol=5, fancybox=True, shadow=True, labels=df[y].unique())\n", 1025 | "\n", 1026 | "graph_multi_line(graph_summary, 'year', 'make_x')" 1027 | ] 1028 | }, 1029 | { 1030 | "cell_type": "code", 1031 | "execution_count": null, 1032 | "metadata": { 1033 | "collapsed": false 1034 | }, 1035 | "outputs": [], 1036 | "source": [ 1037 | "makes = ['Chevrolet', 'Ford', 'Toyota', 'Honda', 'Nissan']\n", 1038 | "\n", 1039 | "def graph_multi_line_makes(df, x, y):\n", 1040 | " ax = df.groupby([x, y]).sum().unstack(y).plot(figsize=(15,8), cmap=\"jet\")\n", 1041 | " ax.legend(loc='center', bbox_to_anchor=(0.5, -0.15),\n", 1042 | " ncol=5, fancybox=True, shadow=True, labels=df[y].unique())\n", 1043 | "\n", 1044 | "graph_summary_makes = graph_summary[graph_summary.make_x.isin(makes)]\n", 1045 | "graph_multi_line_makes(graph_summary_makes, 'year', 'make_x')" 1046 | ] 1047 | } 1048 | ], 1049 | "metadata": { 1050 | "anaconda-cloud": {}, 1051 | "kernelspec": { 1052 | "display_name": "Python [py27]", 1053 | "language": "python", 1054 | "name": "Python [py27]" 1055 | }, 1056 | "language_info": { 1057 | "codemirror_mode": { 1058 | "name": "ipython", 1059 | "version": 2 1060 | }, 1061 | "file_extension": ".py", 1062 | "mimetype": "text/x-python", 1063 | "name": "python", 1064 | "nbconvert_exporter": "python", 1065 | "pygments_lexer": "ipython2", 1066 | 
"version": "2.7.12" 1067 | } 1068 | }, 1069 | "nbformat": 4, 1070 | "nbformat_minor": 0 1071 | } 1072 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/Transforming Data to Unlock Its Latent Value-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Transforming Data to Unlock Its Latent Value" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import os\n", 19 | "import zipfile\n", 20 | "import requests\n", 21 | "import pandas as pd\n", 22 | "import numpy as np\n", 23 | "\n", 24 | "import warnings\n", 25 | "warnings.filterwarnings('ignore')\n", 26 | "\n", 27 | "import matplotlib.pyplot as plt\n", 28 | "import seaborn as sns\n", 29 | "\n", 30 | "path = 'data'\n", 31 | "\n", 32 | "%matplotlib inline" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "## Download the Data" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": { 46 | "collapsed": true 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "def download_data(url, name, path='data'):\n", 51 | " if not os.path.exists(path):\n", 52 | " os.mkdir(path)\n", 53 | "\n", 54 | " response = requests.get(url)\n", 55 | " with open(os.path.join(path, name), 'wb') as f:\n", 56 | " f.write(response.content)\n", 57 | " \n", 58 | " z = zipfile.ZipFile(os.path.join(path, 'vehicles.zip'))\n", 59 | " z.extractall(path)\n", 60 | "\n", 61 | "VEHICLES = 'http://www.fueleconomy.gov/feg/epadata/vehicles.csv.zip'\n", 62 | "\n", 63 | "download_data(VEHICLES, 'vehicles.zip')" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": { 70 | "collapsed": true 71 | }, 72 | "outputs": [], 73 | "source": [ 74 
| "vehicles = pd.read_csv(os.path.join(path, 'vehicles.csv'))" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "## Clean and Reorganize the Data" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": { 88 | "collapsed": false 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "select_columns = ['make', 'model', 'year', 'displ', 'cylinders', 'trany', 'drive', 'VClass','fuelType', \n", 93 | " 'barrels08', 'city08', 'highway08', 'comb08', 'co2TailpipeGpm', 'fuelCost08']\n", 94 | "\n", 95 | "vehicles = vehicles[select_columns][vehicles.year <= 2016].drop_duplicates().dropna()\n", 96 | "vehicles = vehicles.sort_values(['make', 'model', 'year'])\n", 97 | "vehicles.head()" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "## Create Category Aggregations\n", 105 | "\n", 106 | "Hint: Look for object fields that have many categories. " 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": { 113 | "collapsed": false 114 | }, 115 | "outputs": [], 116 | "source": [ 117 | "def unique_col_values(df):\n", 118 | " for column in df:\n", 119 | " print(str(df[column].name) + \" | \" + str(len(df[column].unique())) + \" | \" + str(df[column].dtype))\n", 120 | "\n", 121 | "unique_col_values(vehicles)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": { 128 | "collapsed": true 129 | }, 130 | "outputs": [], 131 | "source": [ 132 | "#Create new trantype field that specifies whether the vehicle is Automatic or Manual. \n", 133 | "vehicles.loc[vehicles.trany.str[0] == 'A', 'trantype'] = 'Automatic'\n", 134 | "vehicles.loc[vehicles.trany.str[0] == 'M', 'trantype'] = 'Manual'\n", 135 | "\n", 136 | "#Create new model_type field that parses the model type from the model field. 
\n", 137 | "vehicles['model_type'] = vehicles.make + \" \" + vehicles.model.str.split().str.get(0)\n", 138 | "\n", 139 | "#Create new category field that rolls up VClass into more general categories. \n", 140 | "small = ['Compact Cars','Subcompact Cars','Two Seaters','Minicompact Cars']\n", 141 | "midsize = ['Midsize Cars']\n", 142 | "large = ['Large Cars']\n", 143 | "\n", 144 | "vehicles.loc[vehicles.VClass.isin(small), 'category'] = 'Small Cars'\n", 145 | "vehicles.loc[vehicles.VClass.isin(midsize), 'category'] = 'Midsize Cars'\n", 146 | "vehicles.loc[vehicles.VClass.isin(large), 'category'] = 'Large Cars'\n", 147 | "vehicles.loc[vehicles.VClass.str.contains('Station'), 'category'] = 'Station Wagons'\n", 148 | "vehicles.loc[vehicles.VClass.str.contains('Pickup'), 'category'] = 'Pickup Trucks'\n", 149 | "vehicles.loc[vehicles.VClass.str.contains('Special Purpose'), 'category'] = 'Special Purpose'\n", 150 | "vehicles.loc[vehicles.VClass.str.contains('Sport Utility'), 'category'] = 'Sport Utility'\n", 151 | "vehicles.loc[(vehicles.VClass.str.contains('van')) | (vehicles.VClass.str.contains('van')),\n", 152 | " 'category'] = 'Vans & Minivans'\n", 153 | "\n", 154 | "#Create new fuel_category field that rolls up fuelType into more general categories. 
\n", 155 | "vehicles['fuel_category'] = ''\n", 156 | "gas = ['Regular', 'Premium', 'Midgrade']\n", 157 | "vehicles.loc[vehicles.fuelType.isin(gas), 'fuel_category'] = 'Gasoline'\n", 158 | "vehicles.loc[vehicles.fuelType == 'Diesel', 'fuel_category'] = 'Diesel'\n", 159 | "vehicles.loc[vehicles.fuel_category == '', 'fuel_category'] = 'Alternative/Hybrid'" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "## Create Categorical Fields from Continuous" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": { 173 | "collapsed": true 174 | }, 175 | "outputs": [], 176 | "source": [ 177 | "engine_categories = ['Very Small Engine', 'Small Engine','Moderate Engine', \n", 178 | " 'Large Engine', 'Very Large Engine']\n", 179 | "vehicles['engine_size'] = pd.qcut(vehicles.displ, 5, engine_categories)\n", 180 | "\n", 181 | "efficiency_categories = ['Very Low Efficiency', 'Low Efficiency', 'Moderate Efficiency',\n", 182 | " 'High Efficiency', 'Very High Efficiency']\n", 183 | "vehicles['fuel_efficiency'] = pd.qcut(vehicles.comb08, 5, efficiency_categories)\n", 184 | "\n", 185 | "emmission_categories = ['Very Low Emmissions', 'Low Emmissions', 'Moderate Emmissions',\n", 186 | " 'High Emmissions', 'Very High Emmissions']\n", 187 | "vehicles['emmission'] = pd.qcut(vehicles.co2TailpipeGpm, 5, emmission_categories)\n", 188 | "\n", 189 | "fuelcost_categories = ['Very Low Fuel Cost', 'Low Fuel Cost', 'Moderate Fuel Cost',\n", 190 | " 'High Fuel Cost', 'Very High Fuel Cost']\n", 191 | "vehicles['fuel_cost'] = pd.qcut(vehicles.fuelCost08, 5, fuelcost_categories)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "## Cluster to Create Additional Categories" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": { 205 | "collapsed": true 206 | }, 207 | "outputs": [], 208 | "source": [ 209 | 
"vehicles_numeric = vehicles._get_numeric_data()\n", 210 | "del vehicles_numeric['year']\n", 211 | "\n", 212 | "vehicles_numeric_norm = vehicles_numeric.apply(lambda x: (x / x.max()))" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": { 219 | "collapsed": false 220 | }, 221 | "outputs": [], 222 | "source": [ 223 | "from sklearn.cluster import KMeans\n", 224 | "\n", 225 | "model = KMeans(n_clusters=8)\n", 226 | "clusters = model.fit_predict(vehicles_numeric_norm)\n", 227 | "vehicles_numeric_norm['cluster'] = clusters\n", 228 | "\n", 229 | "cluster_means = vehicles_numeric_norm.groupby(['cluster'], as_index=False).mean()\n", 230 | "cluster_columns = ['displ','cylinders','barrels08','city08','highway08','comb08','co2TailpipeGpm','fuelCost08']\n", 231 | "\n", 232 | "fig, ax = plt.subplots(figsize=(20,10))\n", 233 | "sns.heatmap(cluster_means[cluster_columns], annot=True)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": { 240 | "collapsed": false 241 | }, 242 | "outputs": [], 243 | "source": [ 244 | "model = KMeans(n_clusters=4)\n", 245 | "clusters = model.fit_predict(vehicles_numeric_norm)\n", 246 | "vehicles_numeric_norm['cluster'] = clusters\n", 247 | "\n", 248 | "cluster_means = vehicles_numeric_norm.groupby(['cluster'], as_index=False).mean()\n", 249 | "cluster_columns = ['displ','cylinders','barrels08','city08','highway08','comb08','co2TailpipeGpm','fuelCost08']\n", 250 | "\n", 251 | "fig, ax = plt.subplots(figsize=(20,10))\n", 252 | "sns.heatmap(cluster_means[cluster_columns], annot=True)" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": { 259 | "collapsed": true 260 | }, 261 | "outputs": [], 262 | "source": [ 263 | "vehicles['cluster'] = clusters\n", 264 | "vehicles['cluster'][vehicles['cluster']==0] = 'Small Very Efficient'\n", 265 | "vehicles['cluster'][vehicles['cluster']==1] = 'Large Inefficient'\n", 266 
| "vehicles['cluster'][vehicles['cluster']==2] = 'Midsized Balanced'\n", 267 | "vehicles['cluster'][vehicles['cluster']==3] = 'Small Moderately Efficient'" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "## Aggregate and Filter" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "metadata": { 281 | "collapsed": true 282 | }, 283 | "outputs": [], 284 | "source": [ 285 | "def barchart(df, group_field, calc_field, calc, length, width):\n", 286 | " grouped = pd.DataFrame(zip_agg.groupby(group_field).agg({calc_field: {calc_field: calc}}).to_records())\n", 287 | " grouped.columns = [group_field, calc_field]\n", 288 | " grouped = grouped.sort_values(calc_field, ascending=False)\n", 289 | "\n", 290 | " fig = plt.subplots(figsize=(width,length))\n", 291 | " ax = sns.barplot(x=calc_field, y=group_field, data=grouped)\n", 292 | " ax.set(xlabel=calc + '(' + calc_field + ')', ylabel=group_field )\n", 293 | " plt.show()" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": { 300 | "collapsed": false 301 | }, 302 | "outputs": [], 303 | "source": [ 304 | "barchart(vehicles, 2016, 'category','count', 6,8)" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": { 311 | "collapsed": false 312 | }, 313 | "outputs": [], 314 | "source": [ 315 | "count_barchart(vehicles, 1985, 'category', 'count', 6,8)" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": { 322 | "collapsed": false 323 | }, 324 | "outputs": [], 325 | "source": [ 326 | "count_barchart(vehicles, 2016, 'engine_size', 'count', 6, 8)" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": null, 332 | "metadata": { 333 | "collapsed": false 334 | }, 335 | "outputs": [], 336 | "source": [ 337 | "count_barchart(vehicles, 2016, 'fuel_efficiency', 'count',6, 8)" 338 | ] 339 | }, 
340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "metadata": { 344 | "collapsed": false 345 | }, 346 | "outputs": [], 347 | "source": [ 348 | "count_barchart(vehicles, 2016, 'cluster', 'count',6,8)" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": null, 354 | "metadata": { 355 | "collapsed": false 356 | }, 357 | "outputs": [], 358 | "source": [ 359 | "count_barchart(vehicles, 2016, 'make', 'count',12, 12)" 360 | ] 361 | }, 362 | { 363 | "cell_type": "markdown", 364 | "metadata": {}, 365 | "source": [ 366 | "## More Details with Pivoting" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": null, 372 | "metadata": { 373 | "collapsed": true 374 | }, 375 | "outputs": [], 376 | "source": [ 377 | "def pivot_heatmap(df, year, rows, columns, values, width, length):\n", 378 | " df_year = df[df.year == year]\n", 379 | " df_pivot = df_year.pivot_table(values=values, index=rows, columns=columns, \n", 380 | " aggfunc=np.size).dropna(axis=0, how='all')\n", 381 | " \n", 382 | " fig = plt.subplots(figsize=(width,length))\n", 383 | " ax = sns.heatmap(df_pivot, annot=True, fmt='g')\n", 384 | " plt.show()" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": null, 390 | "metadata": { 391 | "collapsed": false 392 | }, 393 | "outputs": [], 394 | "source": [ 395 | "pivot_heatmap(vehicles, 2016, 'fuel_efficiency','engine_size','comb08',15, 8)" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": null, 401 | "metadata": { 402 | "collapsed": false 403 | }, 404 | "outputs": [], 405 | "source": [ 406 | "pivot_heatmap(vehicles, 1985, 'fuel_efficiency','engine_size','comb08',15, 8)" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": null, 412 | "metadata": { 413 | "collapsed": false 414 | }, 415 | "outputs": [], 416 | "source": [ 417 | "pivot_heatmap(vehicles, 2016, 'cluster','category', 'comb08', 15, 10)" 418 | ] 419 | }, 420 | { 421 | 
"cell_type": "code", 422 | "execution_count": null, 423 | "metadata": { 424 | "collapsed": false 425 | }, 426 | "outputs": [], 427 | "source": [ 428 | "pivot_heatmap(vehicles, 2016, ['engine_size', 'fuel_efficiency'],'category', 'comb08', 15, 15)" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": null, 434 | "metadata": { 435 | "collapsed": false 436 | }, 437 | "outputs": [], 438 | "source": [ 439 | "pivot_heatmap(vehicles, 2016, 'make','category', 'comb08', 10, 10)" 440 | ] 441 | }, 442 | { 443 | "cell_type": "markdown", 444 | "metadata": {}, 445 | "source": [ 446 | "## Exploring Aggregations Over Time" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": null, 452 | "metadata": { 453 | "collapsed": true 454 | }, 455 | "outputs": [], 456 | "source": [ 457 | "def multi_line(df, x, y):\n", 458 | " ax = df.groupby([x, y]).size().unstack(y).plot(figsize=(15,8), cmap=\"Set2\")" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": null, 464 | "metadata": { 465 | "collapsed": false 466 | }, 467 | "outputs": [], 468 | "source": [ 469 | "multi_line(vehicles, 'year', 'category')" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": null, 475 | "metadata": { 476 | "collapsed": false 477 | }, 478 | "outputs": [], 479 | "source": [ 480 | "bmw = vehicles[vehicles.make == 'BMW']\n", 481 | "multi_line(bmw, 'year', 'category')" 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": null, 487 | "metadata": { 488 | "collapsed": false 489 | }, 490 | "outputs": [], 491 | "source": [ 492 | "toyota = vehicles[vehicles.make == 'Toyota']\n", 493 | "multi_line(toyota, 'year', 'category')" 494 | ] 495 | }, 496 | { 497 | "cell_type": "markdown", 498 | "metadata": {}, 499 | "source": [ 500 | "## Exploring Field Relationships" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": null, 506 | "metadata": { 507 | "collapsed": false 508 | }, 509 | 
"outputs": [], 510 | "source": [ 511 | "def scatter_matrix(df, labels=None):\n", 512 | " ax = sns.pairplot(df, hue=labels, diag_kind='kde', size=2)\n", 513 | " plt.show()\n", 514 | "\n", 515 | "scatter_matrix(vehicles_numeric_norm)" 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": null, 521 | "metadata": { 522 | "collapsed": false 523 | }, 524 | "outputs": [], 525 | "source": [ 526 | "scatter_matrix(vehicles_numeric_norm, labels=\"cluster\")" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": null, 532 | "metadata": { 533 | "collapsed": false 534 | }, 535 | "outputs": [], 536 | "source": [ 537 | "vehicles_numeric_norm['Cluster'] = vehicles['cluster']\n", 538 | "sns.lmplot('displ', 'comb08', data=vehicles_numeric_norm, hue='Cluster', size=8, fit_reg=False)" 539 | ] 540 | }, 541 | { 542 | "cell_type": "code", 543 | "execution_count": null, 544 | "metadata": { 545 | "collapsed": false 546 | }, 547 | "outputs": [], 548 | "source": [ 549 | "sns.lmplot('displ', 'fuelCost08', data=vehicles_numeric_norm, hue='Cluster', size=8, fit_reg=False)" 550 | ] 551 | }, 552 | { 553 | "cell_type": "markdown", 554 | "metadata": {}, 555 | "source": [ 556 | "## Exploring Entity Relationships (Graph Analysis)" 557 | ] 558 | }, 559 | { 560 | "cell_type": "code", 561 | "execution_count": null, 562 | "metadata": { 563 | "collapsed": true 564 | }, 565 | "outputs": [], 566 | "source": [ 567 | "entity = 'make'\n", 568 | "year = 2016\n", 569 | "\n", 570 | "vehicles_year = vehicles[vehicles.year==year]\n", 571 | "\n", 572 | "graph_year = pd.DataFrame(vehicles_year.groupby([entity,'cylinders','displ','trantype','drive',\n", 573 | " 'comb08','VClass', 'cluster'], \n", 574 | " as_index=False).size()).reset_index()\n", 575 | "\n", 576 | "graph_year = graph_year.rename(columns={0: 'count'})\n", 577 | "graph_year['edge'] = (graph_year['cylinders'].map(str)\n", 578 | " + graph_year['displ'].map(str)\n", 579 | " + graph_year['trantype']\n", 580 | " + 
graph_year['drive']\n", 581 | " + graph_year['comb08'].map(str)\n", 582 | " + graph_year['VClass']\n", 583 | " + graph_year['cluster']\n", 584 | " )\n", 585 | "\n", 586 | "graph_year = graph_year[[entity, 'edge', 'count']]" 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": null, 592 | "metadata": { 593 | "collapsed": true 594 | }, 595 | "outputs": [], 596 | "source": [ 597 | "def df_to_graph(df, entity, edge):\n", 598 | " df2 = df.copy()\n", 599 | " graph_df = pd.merge(df, df2, how='inner', on=edge)\n", 600 | " graph_df = graph_df.groupby([entity + '_x', entity + '_y']).count().reset_index()\n", 601 | " graph_df = graph_df[graph_df[entity + '_x'] != graph_df[entity + '_y']]\n", 602 | " graph_df = graph_df[[entity + '_x', entity + '_y', edge]]\n", 603 | " return graph_df" 604 | ] 605 | }, 606 | { 607 | "cell_type": "code", 608 | "execution_count": null, 609 | "metadata": { 610 | "collapsed": false 611 | }, 612 | "outputs": [], 613 | "source": [ 614 | "vehicle_make_graph = df_to_graph(graph_year, entity, 'edge')\n", 615 | "vehicle_make_graph.head(10)" 616 | ] 617 | }, 618 | { 619 | "cell_type": "code", 620 | "execution_count": null, 621 | "metadata": { 622 | "collapsed": false 623 | }, 624 | "outputs": [], 625 | "source": [ 626 | "import networkx as nx\n", 627 | "import graph_tool.all as gt\n", 628 | "import graph_tool as gt\n", 629 | "from graph_tool import *\n", 630 | "\n", 631 | "G = nx.from_pandas_dataframe(vehicle_make_graph, entity + '_x', entity + '_y', 'edge')" 632 | ] 633 | }, 634 | { 635 | "cell_type": "markdown", 636 | "metadata": {}, 637 | "source": [ 638 | "[Converting NetworkX to Graph-Tool](http://bbengfort.github.io/snippets/2016/06/23/graph-tool-from-networkx.html) by Benjamin Bengfort (converts NetworkX graphs to much prettier Graph-Tool graphs). 
" 639 | ] 640 | }, 641 | { 642 | "cell_type": "code", 643 | "execution_count": null, 644 | "metadata": { 645 | "collapsed": true 646 | }, 647 | "outputs": [], 648 | "source": [ 649 | "def get_prop_type(value, key=None):\n", 650 | " \"\"\"\n", 651 | " Performs typing and value conversion for the graph_tool PropertyMap class.\n", 652 | " If a key is provided, it also ensures the key is in a format that can be\n", 653 | " used with the PropertyMap. Returns a tuple, (type name, value, key)\n", 654 | " \"\"\"\n", 655 | " if isinstance(key, unicode):\n", 656 | " # Encode the key as ASCII\n", 657 | " key = key.encode('ascii', errors='replace')\n", 658 | "\n", 659 | " # Deal with the value\n", 660 | " if isinstance(value, bool):\n", 661 | " tname = 'bool'\n", 662 | "\n", 663 | " elif isinstance(value, int):\n", 664 | " tname = 'float'\n", 665 | " value = float(value)\n", 666 | "\n", 667 | " elif isinstance(value, float):\n", 668 | " tname = 'float'\n", 669 | "\n", 670 | " elif isinstance(value, unicode):\n", 671 | " tname = 'string'\n", 672 | " value = value.encode('ascii', errors='replace')\n", 673 | "\n", 674 | " elif isinstance(value, dict):\n", 675 | " tname = 'object'\n", 676 | "\n", 677 | " else:\n", 678 | " tname = 'string'\n", 679 | " value = str(value)\n", 680 | "\n", 681 | " return tname, value, key\n", 682 | "\n", 683 | "\n", 684 | "def nx2gt(nxG):\n", 685 | " \"\"\"\n", 686 | " Converts a networkx graph to a graph-tool graph.\n", 687 | " \"\"\"\n", 688 | " # Phase 0: Create a directed or undirected graph-tool Graph\n", 689 | " gtG = gt.Graph(directed=nxG.is_directed())\n", 690 | "\n", 691 | " # Add the Graph properties as \"internal properties\"\n", 692 | " for key, value in nxG.graph.items():\n", 693 | " # Convert the value and key into a type for graph-tool\n", 694 | " tname, value, key = get_prop_type(value, key)\n", 695 | "\n", 696 | " prop = gtG.new_graph_property(tname) # Create the PropertyMap\n", 697 | " gtG.graph_properties[key] = prop # Set the 
PropertyMap\n", 698 | " gtG.graph_properties[key] = value # Set the actual value\n", 699 | "\n", 700 | " # Phase 1: Add the vertex and edge property maps\n", 701 | " # Go through all nodes and edges and add seen properties\n", 702 | " # Add the node properties first\n", 703 | " nprops = set() # cache keys to only add properties once\n", 704 | " for node, data in nxG.nodes_iter(data=True):\n", 705 | "\n", 706 | " # Go through all the properties if not seen and add them.\n", 707 | " for key, val in data.items():\n", 708 | " if key in nprops: continue # Skip properties already added\n", 709 | "\n", 710 | " # Convert the value and key into a type for graph-tool\n", 711 | " tname, _, key = get_prop_type(val, key)\n", 712 | "\n", 713 | " prop = gtG.new_vertex_property(tname) # Create the PropertyMap\n", 714 | " gtG.vertex_properties[key] = prop # Set the PropertyMap\n", 715 | "\n", 716 | " # Add the key to the already seen properties\n", 717 | " nprops.add(key)\n", 718 | "\n", 719 | " # Also add the node id: in NetworkX a node can be any hashable type, but\n", 720 | " # in graph-tool node are defined as indices. 
So we capture any strings\n", 721 | " # in a special PropertyMap called 'id' -- modify as needed!\n", 722 | " gtG.vertex_properties['id'] = gtG.new_vertex_property('string')\n", 723 | "\n", 724 | " # Add the edge properties second\n", 725 | " eprops = set() # cache keys to only add properties once\n", 726 | " for src, dst, data in nxG.edges_iter(data=True):\n", 727 | "\n", 728 | " # Go through all the edge properties if not seen and add them.\n", 729 | " for key, val in data.items():\n", 730 | " if key in eprops: continue # Skip properties already added\n", 731 | "\n", 732 | " # Convert the value and key into a type for graph-tool\n", 733 | " tname, _, key = get_prop_type(val, key)\n", 734 | "\n", 735 | " prop = gtG.new_edge_property(tname) # Create the PropertyMap\n", 736 | " gtG.edge_properties[key] = prop # Set the PropertyMap\n", 737 | "\n", 738 | " # Add the key to the already seen properties\n", 739 | " eprops.add(key)\n", 740 | "\n", 741 | " # Phase 2: Actually add all the nodes and vertices with their properties\n", 742 | " # Add the nodes\n", 743 | " vertices = {} # vertex mapping for tracking edges later\n", 744 | " for node, data in nxG.nodes_iter(data=True):\n", 745 | "\n", 746 | " # Create the vertex and annotate for our edges later\n", 747 | " v = gtG.add_vertex()\n", 748 | " vertices[node] = v\n", 749 | "\n", 750 | " # Set the vertex properties, not forgetting the id property\n", 751 | " data['id'] = str(node)\n", 752 | " for key, value in data.items():\n", 753 | " gtG.vp[key][v] = value # vp is short for vertex_properties\n", 754 | "\n", 755 | " # Add the edges\n", 756 | " for src, dst, data in nxG.edges_iter(data=True):\n", 757 | "\n", 758 | " # Look up the vertex structs from our vertices mapping and add edge.\n", 759 | " e = gtG.add_edge(vertices[src], vertices[dst])\n", 760 | "\n", 761 | " # Add the edge properties\n", 762 | " for key, value in data.items():\n", 763 | " gtG.ep[key][e] = value # ep is short for edge_properties\n", 764 | "\n", 765 
| " # Done, finally!\n", 766 | " return gtG\n", 767 | "\n", 768 | "\n", 769 | "if __name__ == '__main__':\n", 770 | "\n", 771 | " # Create the networkx graph\n", 772 | " nxG = nx.Graph(name=\"Undirected Graph\")\n", 773 | " nxG.add_node(\"v1\", name=\"alpha\", color=\"red\")\n", 774 | " nxG.add_node(\"v2\", name=\"bravo\", color=\"blue\")\n", 775 | " nxG.add_node(\"v3\", name=\"charlie\", color=\"blue\")\n", 776 | " nxG.add_node(\"v4\", name=\"hub\", color=\"purple\")\n", 777 | " nxG.add_node(\"v5\", name=\"delta\", color=\"red\")\n", 778 | " nxG.add_node(\"v6\", name=\"echo\", color=\"red\")\n", 779 | "\n", 780 | " nxG.add_edge(\"v1\", \"v2\", weight=0.5, label=\"follows\")\n", 781 | " nxG.add_edge(\"v1\", \"v3\", weight=0.25, label=\"follows\")\n", 782 | " nxG.add_edge(\"v2\", \"v4\", weight=0.05, label=\"follows\")\n", 783 | " nxG.add_edge(\"v3\", \"v4\", weight=0.35, label=\"follows\")\n", 784 | " nxG.add_edge(\"v5\", \"v4\", weight=0.65, label=\"follows\")\n", 785 | " nxG.add_edge(\"v6\", \"v4\", weight=0.53, label=\"follows\")\n", 786 | " nxG.add_edge(\"v5\", \"v6\", weight=0.21, label=\"follows\")\n", 787 | "\n", 788 | " for item in nxG.edges_iter(data=True):\n", 789 | " print(item)\n", 790 | "\n", 791 | " # Convert to graph-tool graph\n", 792 | " gtG = nx2gt(nxG)\n", 793 | " gtG.list_properties()" 794 | ] 795 | }, 796 | { 797 | "cell_type": "code", 798 | "execution_count": null, 799 | "metadata": { 800 | "collapsed": true 801 | }, 802 | "outputs": [], 803 | "source": [ 804 | "def plot_graph(graph, width, length):\n", 805 | " g = nx2gt(graph)\n", 806 | " vlabel = g.vp['id']\n", 807 | " gt.graph_draw(g, output_size=(width,length), vertex_text=vlabel, vertex_font_weight=0.2, \n", 808 | " vertex_size=5, vertex_fill_color='cyan')\n", 809 | "\n", 810 | "plot_graph(G, 1200, 800)" 811 | ] 812 | }, 813 | { 814 | "cell_type": "code", 815 | "execution_count": null, 816 | "metadata": { 817 | "collapsed": true 818 | }, 819 | "outputs": [], 820 | "source": [ 821 | "ego = 
nx.ego_graph(G, 'Nissan', 1)\n", 822 | "plot_graph(ego, 500, 500)" 823 | ] 824 | }, 825 | { 826 | "cell_type": "code", 827 | "execution_count": null, 828 | "metadata": { 829 | "collapsed": true 830 | }, 831 | "outputs": [], 832 | "source": [ 833 | "import community\n", 834 | "\n", 835 | "def detect_communities(graph):\n", 836 | " partition = community.best_partition(graph)\n", 837 | " nx.set_node_attributes(graph, 'partition', partition)\n", 838 | " return graph, partition\n", 839 | "\n", 840 | "make_communities = pd.DataFrame(detect_communities(G)[1].items(), \n", 841 | " columns=['make', 'community']).sort_values('community', ascending=True)\n", 842 | "\n", 843 | "make_communities.head()" 844 | ] 845 | }, 846 | { 847 | "cell_type": "code", 848 | "execution_count": null, 849 | "metadata": { 850 | "collapsed": true 851 | }, 852 | "outputs": [], 853 | "source": [ 854 | "import random\n", 855 | "from copy import copy\n", 856 | "\n", 857 | "##########################################################################\n", 858 | "## Color Palettes\n", 859 | "##########################################################################\n", 860 | "\n", 861 | "FLATUI = [\"#9b59b6\", \"#3498db\", \"#95a5a6\", \"#e74c3c\", \"#34495e\", \"#2ecc71\"]\n", 862 | "PAIRED = [\n", 863 | " \"#a6cee3\", \"#1f78b4\", \"#b2df8a\", \"#33a02c\", \"#fb9a99\", \"#e31a1c\",\n", 864 | " \"#fdbf6f\", \"#ff7f00\", \"#cab2d6\", \"#6a3d9a\", \"#ffff99\", \"#b15928\",\n", 865 | "]\n", 866 | "SET1 = [\n", 867 | " \"#e41a1c\", \"#377eb8\", \"#4daf4a\",\n", 868 | " \"#984ea3\", \"#ff7f00\", \"#ffff33\",\n", 869 | " \"#a65628\", \"#f781bf\", \"#999999\"\n", 870 | "]\n", 871 | "\n", 872 | "PALETTES = {\n", 873 | " 'flatui': FLATUI,\n", 874 | " 'paired': PAIRED,\n", 875 | " 'set1': SET1,\n", 876 | "}\n", 877 | "\n", 878 | "##########################################################################\n", 879 | "## Color Utilities\n", 880 | 
"##########################################################################\n", 881 | "\n", 882 | "class ColorMap(object):\n", 883 | " \"\"\"\n", 884 | " A helper for mapping categorical values to colors on demand.\n", 885 | " \"\"\"\n", 886 | "\n", 887 | " def __init__(self, colors='flatui', shuffle=False):\n", 888 | " \"\"\"\n", 889 | " Specify either a list of colors or one of the color names. If shuffle\n", 890 | " is True then the colors will be shuffled randomly.\n", 891 | " \"\"\"\n", 892 | " self.mapping = {}\n", 893 | " self.colors = colors\n", 894 | "\n", 895 | " if shuffle:\n", 896 | " random.shuffle(self._colors)\n", 897 | "\n", 898 | " @property\n", 899 | " def colors(self):\n", 900 | " return self._colors\n", 901 | "\n", 902 | " @colors.setter\n", 903 | " def colors(self, value):\n", 904 | " \"\"\"\n", 905 | " Converts color strings into a color listing.\n", 906 | " \"\"\"\n", 907 | " if isinstance(value, basestring):\n", 908 | " if value not in PALETTES:\n", 909 | " raise ValueError(\"'{}' is not a registered color palette\")\n", 910 | " self._colors = copy(PALETTES[value])\n", 911 | " elif isinstance(value, list):\n", 912 | " self._colors = value\n", 913 | " else:\n", 914 | " self._colors = list(value)\n", 915 | "\n", 916 | " def __call__(self, category):\n", 917 | " if category not in self.mapping:\n", 918 | " if self.colors:\n", 919 | " self.mapping[category] = self.colors.pop()\n", 920 | " else:\n", 921 | " raise ValueError(\n", 922 | " \"Not enough colors for this many categories!\"\n", 923 | " )\n", 924 | "\n", 925 | " return self.mapping[category]" 926 | ] 927 | }, 928 | { 929 | "cell_type": "code", 930 | "execution_count": null, 931 | "metadata": { 932 | "collapsed": true 933 | }, 934 | "outputs": [], 935 | "source": [ 936 | "def plot_community_graph(graph, community_df, width, length):\n", 937 | " g = nx2gt(G)\n", 938 | " vlabel = g.vp['id']\n", 939 | " vcolor = g.new_vertex_property('string') \n", 940 | " vcmap = ColorMap('flatui', 
shuffle=False)\n", 941 | " for vertex in g.vertices():\n", 942 | " vcolor[vertex] = vcmap(community_df.community[vertex])\n", 943 | " gt.graph_draw(g, output_size=(width,length), vertex_text=vlabel, vertex_font_weight=0.2, \n", 944 | " vertex_size=5, vertex_fill_color=vcolor)\n", 945 | "\n", 946 | "plot_community_graph(G, make_communities, 1200, 800)" 947 | ] 948 | }, 949 | { 950 | "cell_type": "markdown", 951 | "metadata": {}, 952 | "source": [ 953 | "## Exploring Connections Over Time" 954 | ] 955 | }, 956 | { 957 | "cell_type": "code", 958 | "execution_count": null, 959 | "metadata": { 960 | "collapsed": true 961 | }, 962 | "outputs": [], 963 | "source": [ 964 | "columns = ['make_x','make_y', 'edge','year']\n", 965 | "graph_all_years = pd.DataFrame(columns=columns)" 966 | ] 967 | }, 968 | { 969 | "cell_type": "code", 970 | "execution_count": null, 971 | "metadata": { 972 | "collapsed": true 973 | }, 974 | "outputs": [], 975 | "source": [ 976 | "for i in vehicles['year'].unique():\n", 977 | " vehicles_year = vehicles[vehicles.year==i]\n", 978 | "\n", 979 | " graph_year = pd.DataFrame(vehicles_year.groupby([entity,'cylinders','displ','trantype','drive',\n", 980 | " 'comb08','VClass', 'cluster'], \n", 981 | " as_index=False).size()).reset_index()\n", 982 | "\n", 983 | " graph_year = graph_year.rename(columns={0: 'count'})\n", 984 | " graph_year['edge'] = (graph_year['cylinders'].map(str)\n", 985 | " + graph_year['displ'].map(str)\n", 986 | " + graph_year['trantype']\n", 987 | " + graph_year['drive']\n", 988 | " + graph_year['comb08'].map(str)\n", 989 | " + graph_year['VClass']\n", 990 | " + graph_year['cluster']\n", 991 | " )\n", 992 | "\n", 993 | " graph_year = graph_year[[entity, 'edge', 'count']]\n", 994 | " vehicle_make_graph = df_to_graph(graph_year, entity, 'edge')\n", 995 | " vehicle_make_graph['year'] = i\n", 996 | " graph_all_years = graph_all_years.append(vehicle_make_graph)" 997 | ] 998 | }, 999 | { 1000 | "cell_type": "code", 1001 | "execution_count": 
null, 1002 | "metadata": { 1003 | "collapsed": false 1004 | }, 1005 | "outputs": [], 1006 | "source": [ 1007 | "graph_summary = graph_all_years.groupby(['make_x', 'year'], \n", 1008 | " as_index=False).sum()\n", 1009 | "\n", 1010 | "graph_summary.head()" 1011 | ] 1012 | }, 1013 | { 1014 | "cell_type": "code", 1015 | "execution_count": null, 1016 | "metadata": { 1017 | "collapsed": false 1018 | }, 1019 | "outputs": [], 1020 | "source": [ 1021 | "def graph_multi_line(df, x, y):\n", 1022 | " ax = df.groupby([x, y]).sum().unstack(y).plot(figsize=(15,8), cmap=\"jet\")\n", 1023 | " ax.legend(loc='center', bbox_to_anchor=(0.5, -0.35),\n", 1024 | " ncol=5, fancybox=True, shadow=True, labels=df[y].unique())\n", 1025 | "\n", 1026 | "graph_multi_line(graph_summary, 'year', 'make_x')" 1027 | ] 1028 | }, 1029 | { 1030 | "cell_type": "code", 1031 | "execution_count": null, 1032 | "metadata": { 1033 | "collapsed": false 1034 | }, 1035 | "outputs": [], 1036 | "source": [ 1037 | "makes = ['Chevrolet', 'Ford', 'Toyota', 'Honda', 'Nissan']\n", 1038 | "\n", 1039 | "def graph_multi_line_makes(df, x, y):\n", 1040 | " ax = df.groupby([x, y]).sum().unstack(y).plot(figsize=(15,8), cmap=\"jet\")\n", 1041 | " ax.legend(loc='center', bbox_to_anchor=(0.5, -0.15),\n", 1042 | " ncol=5, fancybox=True, shadow=True, labels=df[y].unique())\n", 1043 | "\n", 1044 | "graph_summary_makes = graph_summary[graph_summary.make_x.isin(makes)]\n", 1045 | "graph_multi_line_makes(graph_summary_makes, 'year', 'make_x')" 1046 | ] 1047 | } 1048 | ], 1049 | "metadata": { 1050 | "anaconda-cloud": {}, 1051 | "kernelspec": { 1052 | "display_name": "Python [py27]", 1053 | "language": "python", 1054 | "name": "Python [py27]" 1055 | }, 1056 | "language_info": { 1057 | "codemirror_mode": { 1058 | "name": "ipython", 1059 | "version": 2 1060 | }, 1061 | "file_extension": ".py", 1062 | "mimetype": "text/x-python", 1063 | "name": "python", 1064 | "nbconvert_exporter": "python", 1065 | "pygments_lexer": "ipython2", 1066 | 
"version": "2.7.12" 1067 | } 1068 | }, 1069 | "nbformat": 4, 1070 | "nbformat_minor": 0 1071 | } 1072 | --------------------------------------------------------------------------------