└── Idiomatic Pandas.ipynb /Idiomatic Pandas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "pycharm": { 7 | "name": "#%% md\n" 8 | } 9 | }, 10 | "source": [ 11 | "# Idiomatic Pandas\n", 12 | "## 5 Tips for Better Pandas Code" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": { 18 | "pycharm": { 19 | "name": "#%% md\n" 20 | } 21 | }, 22 | "source": [ 23 | "## About Matt Harrison @\\_\\_mharrison\\_\\_\n", 24 | "\n", 25 | "* Author of Effective Pandas, Machine Learning Pocket Reference, and Illustrated Guide to Python 3.\n", 26 | "* Advisor at Ponder (creators of Modin)\n", 27 | "* Corporate trainer at MetaSnake. Taught Pandas to 1000's of students.\n", 28 | "* Upcoming Live Course https://maven.com/matt-harrison/data-analysis-using-pandas" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "pycharm": { 36 | "name": "#%%\n" 37 | } 38 | }, 39 | "outputs": [], 40 | "source": [] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": { 46 | "pycharm": { 47 | "name": "#%%\n" 48 | } 49 | }, 50 | "outputs": [], 51 | "source": [] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": { 57 | "pycharm": { 58 | "name": "#%%\n" 59 | } 60 | }, 61 | "outputs": [], 62 | "source": [] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": { 68 | "pycharm": { 69 | "name": "#%%\n" 70 | } 71 | }, 72 | "outputs": [], 73 | "source": [] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": { 79 | "pycharm": { 80 | "name": "#%%\n" 81 | } 82 | }, 83 | "outputs": [], 84 | "source": [] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": { 90 | "pycharm": { 91 | "name": "#%%\n" 92 | } 93 | }, 94 | "outputs": [], 95 | "source": [] 96 | }, 97 | 
{ 98 | "cell_type": "markdown", 99 | "metadata": { 100 | "pycharm": { 101 | "name": "#%% md\n" 102 | } 103 | }, 104 | "source": [ 105 | "## Practice this on your data with your team!\n", 106 | "* https://maven.com/matt-harrison/data-analysis-using-pandas\n", 107 | "* Contact me matt@metasnake.com\n", 108 | "* Follow on Twitter @\\_\\_mharrison\\_\\_" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": { 115 | "pycharm": { 116 | "name": "#%%\n" 117 | } 118 | }, 119 | "outputs": [], 120 | "source": [] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": { 126 | "pycharm": { 127 | "name": "#%%\n" 128 | } 129 | }, 130 | "outputs": [], 131 | "source": [] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": { 136 | "pycharm": { 137 | "name": "#%% md\n" 138 | } 139 | }, 140 | "source": [ 141 | "## Outline\n", 142 | "\n", 143 | "* Load Data\n", 144 | "* Types\n", 145 | "* Chaining\n", 146 | "* Mutation\n", 147 | "* Apply\n", 148 | "* Aggregation" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": { 154 | "pycharm": { 155 | "name": "#%% md\n" 156 | } 157 | }, 158 | "source": [ 159 | "## Data" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 2, 165 | "metadata": { 166 | "lines_to_next_cell": 2, 167 | "pycharm": { 168 | "name": "#%%\n" 169 | } 170 | }, 171 | "outputs": [], 172 | "source": [ 173 | "from IPython.display import display\n", 174 | "import numpy as np\n", 175 | "import pandas as pd\n", 176 | "#import modin.pandas as pd" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 3, 182 | "metadata": { 183 | "pycharm": { 184 | "name": "#%%\n" 185 | } 186 | }, 187 | "outputs": [ 188 | { 189 | "data": { 190 | "text/plain": "'1.4.2'" 191 | }, 192 | "execution_count": 3, 193 | "metadata": {}, 194 | "output_type": "execute_result" 195 | } 196 | ], 197 | "source": [ 198 | "pd.__version__" 199 | ] 200 | }, 
201 | { 202 | "cell_type": "code", 203 | "execution_count": 4, 204 | "metadata": { 205 | "pycharm": { 206 | "name": "#%%\n" 207 | } 208 | }, 209 | "outputs": [], 210 | "source": [ 211 | "pd.options.display.min_rows = 20" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "outputs": [], 218 | "source": [ 219 | "import ray\n", 220 | "ray.init()" 221 | ], 222 | "metadata": { 223 | "collapsed": false, 224 | "pycharm": { 225 | "name": "#%%\n" 226 | } 227 | } 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 5, 232 | "metadata": { 233 | "pycharm": { 234 | "name": "#%%\n" 235 | } 236 | }, 237 | "outputs": [ 238 | { 239 | "name": "stderr", 240 | "output_type": "stream", 241 | "text": [ 242 | "C:\\Users\\matt\\AppData\\Local\\Temp\\ipykernel_16404\\373335790.py:1: DtypeWarning: Columns (68,70,71,72,73,74,76,79) have mixed types. Specify dtype option on import or set low_memory=False.\n", 243 | " autos = pd.read_csv('https://github.com/mattharrison/datasets/raw/master/data/vehicles.csv.zip')\n" 244 | ] 245 | } 246 | ], 247 | "source": [ 248 | "autos = pd.read_csv('https://github.com/mattharrison/datasets/raw/master/data/vehicles.csv.zip')" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 6, 254 | "metadata": { 255 | "scrolled": true, 256 | "pycharm": { 257 | "name": "#%%\n" 258 | } 259 | }, 260 | "outputs": [ 261 | { 262 | "data": { 263 | "text/plain": " barrels08 barrelsA08 charge120 charge240 city08 city08U cityA08 \\\n0 15.695714 0.0 0.0 0.0 19 0.0 0 \n1 29.964545 0.0 0.0 0.0 9 0.0 0 \n2 12.207778 0.0 0.0 0.0 23 0.0 0 \n3 29.964545 0.0 0.0 0.0 10 0.0 0 \n4 17.347895 0.0 0.0 0.0 17 0.0 0 \n5 14.982273 0.0 0.0 0.0 21 0.0 0 \n6 13.184400 0.0 0.0 0.0 22 0.0 0 \n7 13.733750 0.0 0.0 0.0 23 0.0 0 \n8 12.677308 0.0 0.0 0.0 23 0.0 0 \n9 13.184400 0.0 0.0 0.0 23 0.0 0 \n... ... ... ... ... ... ... ... 
\n41134 16.480500 0.0 0.0 0.0 18 0.0 0 \n41135 12.677308 0.0 0.0 0.0 23 0.0 0 \n41136 13.733750 0.0 0.0 0.0 21 0.0 0 \n41137 11.771786 0.0 0.0 0.0 24 0.0 0 \n41138 13.184400 0.0 0.0 0.0 21 0.0 0 \n41139 14.982273 0.0 0.0 0.0 19 0.0 0 \n41140 14.330870 0.0 0.0 0.0 20 0.0 0 \n41141 15.695714 0.0 0.0 0.0 18 0.0 0 \n41142 15.695714 0.0 0.0 0.0 18 0.0 0 \n41143 18.311667 0.0 0.0 0.0 16 0.0 0 \n\n cityA08U cityCD cityE ... mfrCode c240Dscr charge240b c240bDscr \\\n0 0.0 0.0 0.0 ... NaN NaN 0.0 NaN \n1 0.0 0.0 0.0 ... NaN NaN 0.0 NaN \n2 0.0 0.0 0.0 ... NaN NaN 0.0 NaN \n3 0.0 0.0 0.0 ... NaN NaN 0.0 NaN \n4 0.0 0.0 0.0 ... NaN NaN 0.0 NaN \n5 0.0 0.0 0.0 ... NaN NaN 0.0 NaN \n6 0.0 0.0 0.0 ... NaN NaN 0.0 NaN \n7 0.0 0.0 0.0 ... NaN NaN 0.0 NaN \n8 0.0 0.0 0.0 ... NaN NaN 0.0 NaN \n9 0.0 0.0 0.0 ... NaN NaN 0.0 NaN \n... ... ... ... ... ... ... ... ... \n41134 0.0 0.0 0.0 ... NaN NaN 0.0 NaN \n41135 0.0 0.0 0.0 ... NaN NaN 0.0 NaN \n41136 0.0 0.0 0.0 ... NaN NaN 0.0 NaN \n41137 0.0 0.0 0.0 ... NaN NaN 0.0 NaN \n41138 0.0 0.0 0.0 ... NaN NaN 0.0 NaN \n41139 0.0 0.0 0.0 ... NaN NaN 0.0 NaN \n41140 0.0 0.0 0.0 ... NaN NaN 0.0 NaN \n41141 0.0 0.0 0.0 ... NaN NaN 0.0 NaN \n41142 0.0 0.0 0.0 ... NaN NaN 0.0 NaN \n41143 0.0 0.0 0.0 ... NaN NaN 0.0 NaN \n\n createdOn modifiedOn startStop \\\n0 Tue Jan 01 00:00:00 EST 2013 Tue Jan 01 00:00:00 EST 2013 NaN \n1 Tue Jan 01 00:00:00 EST 2013 Tue Jan 01 00:00:00 EST 2013 NaN \n2 Tue Jan 01 00:00:00 EST 2013 Tue Jan 01 00:00:00 EST 2013 NaN \n3 Tue Jan 01 00:00:00 EST 2013 Tue Jan 01 00:00:00 EST 2013 NaN \n4 Tue Jan 01 00:00:00 EST 2013 Tue Jan 01 00:00:00 EST 2013 NaN \n5 Tue Jan 01 00:00:00 EST 2013 Tue Jan 01 00:00:00 EST 2013 NaN \n6 Tue Jan 01 00:00:00 EST 2013 Tue Jan 01 00:00:00 EST 2013 NaN \n7 Tue Jan 01 00:00:00 EST 2013 Tue Jan 01 00:00:00 EST 2013 NaN \n8 Tue Jan 01 00:00:00 EST 2013 Tue Jan 01 00:00:00 EST 2013 NaN \n9 Tue Jan 01 00:00:00 EST 2013 Tue Jan 01 00:00:00 EST 2013 NaN \n... ... ... ... 
\n41134 Tue Jan 01 00:00:00 EST 2013 Tue Jan 01 00:00:00 EST 2013 NaN \n41135 Tue Jan 01 00:00:00 EST 2013 Tue Jan 01 00:00:00 EST 2013 NaN \n41136 Tue Jan 01 00:00:00 EST 2013 Tue Jan 01 00:00:00 EST 2013 NaN \n41137 Tue Jan 01 00:00:00 EST 2013 Tue Jan 01 00:00:00 EST 2013 NaN \n41138 Tue Jan 01 00:00:00 EST 2013 Tue Jan 01 00:00:00 EST 2013 NaN \n41139 Tue Jan 01 00:00:00 EST 2013 Tue Jan 01 00:00:00 EST 2013 NaN \n41140 Tue Jan 01 00:00:00 EST 2013 Tue Jan 01 00:00:00 EST 2013 NaN \n41141 Tue Jan 01 00:00:00 EST 2013 Tue Jan 01 00:00:00 EST 2013 NaN \n41142 Tue Jan 01 00:00:00 EST 2013 Tue Jan 01 00:00:00 EST 2013 NaN \n41143 Tue Jan 01 00:00:00 EST 2013 Tue Jan 01 00:00:00 EST 2013 NaN \n\n phevCity phevHwy phevComb \n0 0 0 0 \n1 0 0 0 \n2 0 0 0 \n3 0 0 0 \n4 0 0 0 \n5 0 0 0 \n6 0 0 0 \n7 0 0 0 \n8 0 0 0 \n9 0 0 0 \n... ... ... ... \n41134 0 0 0 \n41135 0 0 0 \n41136 0 0 0 \n41137 0 0 0 \n41138 0 0 0 \n41139 0 0 0 \n41140 0 0 0 \n41141 0 0 0 \n41142 0 0 0 \n41143 0 0 0 \n\n[41144 rows x 83 columns]", 264 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
barrels08barrelsA08charge120charge240city08city08UcityA08cityA08UcityCDcityE...mfrCodec240Dscrcharge240bc240bDscrcreatedOnmodifiedOnstartStopphevCityphevHwyphevComb
015.6957140.00.00.0190.000.00.00.0...NaNNaN0.0NaNTue Jan 01 00:00:00 EST 2013Tue Jan 01 00:00:00 EST 2013NaN000
129.9645450.00.00.090.000.00.00.0...NaNNaN0.0NaNTue Jan 01 00:00:00 EST 2013Tue Jan 01 00:00:00 EST 2013NaN000
212.2077780.00.00.0230.000.00.00.0...NaNNaN0.0NaNTue Jan 01 00:00:00 EST 2013Tue Jan 01 00:00:00 EST 2013NaN000
329.9645450.00.00.0100.000.00.00.0...NaNNaN0.0NaNTue Jan 01 00:00:00 EST 2013Tue Jan 01 00:00:00 EST 2013NaN000
417.3478950.00.00.0170.000.00.00.0...NaNNaN0.0NaNTue Jan 01 00:00:00 EST 2013Tue Jan 01 00:00:00 EST 2013NaN000
514.9822730.00.00.0210.000.00.00.0...NaNNaN0.0NaNTue Jan 01 00:00:00 EST 2013Tue Jan 01 00:00:00 EST 2013NaN000
613.1844000.00.00.0220.000.00.00.0...NaNNaN0.0NaNTue Jan 01 00:00:00 EST 2013Tue Jan 01 00:00:00 EST 2013NaN000
713.7337500.00.00.0230.000.00.00.0...NaNNaN0.0NaNTue Jan 01 00:00:00 EST 2013Tue Jan 01 00:00:00 EST 2013NaN000
812.6773080.00.00.0230.000.00.00.0...NaNNaN0.0NaNTue Jan 01 00:00:00 EST 2013Tue Jan 01 00:00:00 EST 2013NaN000
913.1844000.00.00.0230.000.00.00.0...NaNNaN0.0NaNTue Jan 01 00:00:00 EST 2013Tue Jan 01 00:00:00 EST 2013NaN000
..................................................................
4113416.4805000.00.00.0180.000.00.00.0...NaNNaN0.0NaNTue Jan 01 00:00:00 EST 2013Tue Jan 01 00:00:00 EST 2013NaN000
4113512.6773080.00.00.0230.000.00.00.0...NaNNaN0.0NaNTue Jan 01 00:00:00 EST 2013Tue Jan 01 00:00:00 EST 2013NaN000
4113613.7337500.00.00.0210.000.00.00.0...NaNNaN0.0NaNTue Jan 01 00:00:00 EST 2013Tue Jan 01 00:00:00 EST 2013NaN000
4113711.7717860.00.00.0240.000.00.00.0...NaNNaN0.0NaNTue Jan 01 00:00:00 EST 2013Tue Jan 01 00:00:00 EST 2013NaN000
4113813.1844000.00.00.0210.000.00.00.0...NaNNaN0.0NaNTue Jan 01 00:00:00 EST 2013Tue Jan 01 00:00:00 EST 2013NaN000
4113914.9822730.00.00.0190.000.00.00.0...NaNNaN0.0NaNTue Jan 01 00:00:00 EST 2013Tue Jan 01 00:00:00 EST 2013NaN000
4114014.3308700.00.00.0200.000.00.00.0...NaNNaN0.0NaNTue Jan 01 00:00:00 EST 2013Tue Jan 01 00:00:00 EST 2013NaN000
4114115.6957140.00.00.0180.000.00.00.0...NaNNaN0.0NaNTue Jan 01 00:00:00 EST 2013Tue Jan 01 00:00:00 EST 2013NaN000
4114215.6957140.00.00.0180.000.00.00.0...NaNNaN0.0NaNTue Jan 01 00:00:00 EST 2013Tue Jan 01 00:00:00 EST 2013NaN000
4114318.3116670.00.00.0160.000.00.00.0...NaNNaN0.0NaNTue Jan 01 00:00:00 EST 2013Tue Jan 01 00:00:00 EST 2013NaN000
\n

41144 rows × 83 columns

\n
" 265 | }, 266 | "execution_count": 6, 267 | "metadata": {}, 268 | "output_type": "execute_result" 269 | } 270 | ], 271 | "source": [ 272 | "autos" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 7, 278 | "metadata": { 279 | "pycharm": { 280 | "name": "#%%\n" 281 | } 282 | }, 283 | "outputs": [ 284 | { 285 | "data": { 286 | "text/plain": "Index(['barrels08', 'barrelsA08', 'charge120', 'charge240', 'city08',\n 'city08U', 'cityA08', 'cityA08U', 'cityCD', 'cityE', 'cityUF', 'co2',\n 'co2A', 'co2TailpipeAGpm', 'co2TailpipeGpm', 'comb08', 'comb08U',\n 'combA08', 'combA08U', 'combE', 'combinedCD', 'combinedUF', 'cylinders',\n 'displ', 'drive', 'engId', 'eng_dscr', 'feScore', 'fuelCost08',\n 'fuelCostA08', 'fuelType', 'fuelType1', 'ghgScore', 'ghgScoreA',\n 'highway08', 'highway08U', 'highwayA08', 'highwayA08U', 'highwayCD',\n 'highwayE', 'highwayUF', 'hlv', 'hpv', 'id', 'lv2', 'lv4', 'make',\n 'model', 'mpgData', 'phevBlended', 'pv2', 'pv4', 'range', 'rangeCity',\n 'rangeCityA', 'rangeHwy', 'rangeHwyA', 'trany', 'UCity', 'UCityA',\n 'UHighway', 'UHighwayA', 'VClass', 'year', 'youSaveSpend', 'guzzler',\n 'trans_dscr', 'tCharger', 'sCharger', 'atvType', 'fuelType2', 'rangeA',\n 'evMotor', 'mfrCode', 'c240Dscr', 'charge240b', 'c240bDscr',\n 'createdOn', 'modifiedOn', 'startStop', 'phevCity', 'phevHwy',\n 'phevComb'],\n dtype='object')" 287 | }, 288 | "execution_count": 7, 289 | "metadata": {}, 290 | "output_type": "execute_result" 291 | } 292 | ], 293 | "source": [ 294 | "autos.columns" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": { 301 | "pycharm": { 302 | "name": "#%%\n" 303 | } 304 | }, 305 | "outputs": [], 306 | "source": [] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": { 312 | "pycharm": { 313 | "name": "#%%\n" 314 | } 315 | }, 316 | "outputs": [], 317 | "source": [] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 
null, 322 | "metadata": { 323 | "pycharm": { 324 | "name": "#%%\n" 325 | } 326 | }, 327 | "outputs": [], 328 | "source": [] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "metadata": { 333 | "pycharm": { 334 | "name": "#%% md\n" 335 | } 336 | }, 337 | "source": [ 338 | "## Types\n", 339 | "Getting the right types will enable analysis and correctness." 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 8, 345 | "metadata": { 346 | "pycharm": { 347 | "name": "#%%\n" 348 | } 349 | }, 350 | "outputs": [], 351 | "source": [ 352 | "cols = ['city08', 'comb08', 'highway08', 'cylinders', 'displ', 'drive', 'eng_dscr', \n", 353 | " 'fuelCost08', 'make', 'model', 'trany', 'range', 'createdOn', 'year']" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": 9, 359 | "metadata": { 360 | "pycharm": { 361 | "name": "#%%\n" 362 | } 363 | }, 364 | "outputs": [ 365 | { 366 | "data": { 367 | "text/plain": "city08 int64\ncomb08 int64\nhighway08 int64\ncylinders float64\ndispl float64\ndrive object\neng_dscr object\nfuelCost08 int64\nmake object\nmodel object\ntrany object\nrange int64\ncreatedOn object\nyear int64\ndtype: object" 368 | }, 369 | "execution_count": 9, 370 | "metadata": {}, 371 | "output_type": "execute_result" 372 | } 373 | ], 374 | "source": [ 375 | "autos[cols].dtypes" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": 10, 381 | "metadata": { 382 | "pycharm": { 383 | "name": "#%%\n" 384 | } 385 | }, 386 | "outputs": [ 387 | { 388 | "data": { 389 | "text/plain": "Index 128\ncity08 329152\ncomb08 329152\nhighway08 329152\ncylinders 329152\ndispl 329152\ndrive 3028369\neng_dscr 2135693\nfuelCost08 329152\nmake 2606267\nmodel 2813134\ntrany 2933276\nrange 329152\ncreatedOn 3497240\nyear 329152\ndtype: int64" 390 | }, 391 | "execution_count": 10, 392 | "metadata": {}, 393 | "output_type": "execute_result" 394 | } 395 | ], 396 | "source": [ 397 | "autos[cols].memory_usage(deep=True)" 398 | ] 399 
| }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": 11, 403 | "metadata": { 404 | "pycharm": { 405 | "name": "#%%\n" 406 | } 407 | }, 408 | "outputs": [ 409 | { 410 | "data": { 411 | "text/plain": "19647323" 412 | }, 413 | "execution_count": 11, 414 | "metadata": {}, 415 | "output_type": "execute_result" 416 | } 417 | ], 418 | "source": [ 419 | "autos[cols].memory_usage(deep=True).sum()" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "metadata": { 426 | "pycharm": { 427 | "name": "#%%\n" 428 | } 429 | }, 430 | "outputs": [], 431 | "source": [] 432 | }, 433 | { 434 | "cell_type": "markdown", 435 | "metadata": { 436 | "pycharm": { 437 | "name": "#%% md\n" 438 | } 439 | }, 440 | "source": [ 441 | "### Ints" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": 12, 447 | "metadata": { 448 | "pycharm": { 449 | "name": "#%%\n" 450 | } 451 | }, 452 | "outputs": [ 453 | { 454 | "data": { 455 | "text/plain": " city08 comb08 highway08 fuelCost08 range \\\ncount 41144.000000 41144.000000 41144.000000 41144.000000 41144.000000 \nmean 18.369045 20.616396 24.504667 2362.335942 0.793506 \nstd 7.905886 7.674535 7.730364 654.981925 13.041592 \nmin 6.000000 7.000000 9.000000 500.000000 0.000000 \n25% 15.000000 17.000000 20.000000 1900.000000 0.000000 \n50% 17.000000 20.000000 24.000000 2350.000000 0.000000 \n75% 20.000000 23.000000 28.000000 2700.000000 0.000000 \nmax 150.000000 136.000000 124.000000 7400.000000 370.000000 \n\n year \ncount 41144.000000 \nmean 2001.535266 \nstd 11.142414 \nmin 1984.000000 \n25% 1991.000000 \n50% 2002.000000 \n75% 2011.000000 \nmax 2020.000000 ", 456 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
city08comb08highway08fuelCost08rangeyear
count41144.00000041144.00000041144.00000041144.00000041144.00000041144.000000
mean18.36904520.61639624.5046672362.3359420.7935062001.535266
std7.9058867.6745357.730364654.98192513.04159211.142414
min6.0000007.0000009.000000500.0000000.0000001984.000000
25%15.00000017.00000020.0000001900.0000000.0000001991.000000
50%17.00000020.00000024.0000002350.0000000.0000002002.000000
75%20.00000023.00000028.0000002700.0000000.0000002011.000000
max150.000000136.000000124.0000007400.000000370.0000002020.000000
\n
" 457 | }, 458 | "execution_count": 12, 459 | "metadata": {}, 460 | "output_type": "execute_result" 461 | } 462 | ], 463 | "source": [ 464 | "autos[cols].select_dtypes(int).describe()" 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": 16, 470 | "metadata": { 471 | "scrolled": true, 472 | "pycharm": { 473 | "name": "#%%\n" 474 | } 475 | }, 476 | "outputs": [ 477 | { 478 | "data": { 479 | "text/plain": " city08 comb08 highway08 fuelCost08 range \\\ncount 41144.000000 41144.000000 41144.000000 41144.000000 41144.000000 \nmean 18.369045 20.616396 24.504667 2362.335942 0.793506 \nstd 7.905886 7.674535 7.730364 654.981925 13.041592 \nmin 6.000000 7.000000 9.000000 500.000000 0.000000 \n25% 15.000000 17.000000 20.000000 1900.000000 0.000000 \n50% 17.000000 20.000000 24.000000 2350.000000 0.000000 \n75% 20.000000 23.000000 28.000000 2700.000000 0.000000 \nmax 150.000000 136.000000 124.000000 7400.000000 370.000000 \n\n year \ncount 41144.000000 \nmean 2001.535266 \nstd 11.142414 \nmin 1984.000000 \n25% 1991.000000 \n50% 2002.000000 \n75% 2011.000000 \nmax 2020.000000 ", 480 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
city08comb08highway08fuelCost08rangeyear
count41144.00000041144.00000041144.00000041144.00000041144.00000041144.000000
mean18.36904520.61639624.5046672362.3359420.7935062001.535266
std7.9058867.6745357.730364654.98192513.04159211.142414
min6.0000007.0000009.000000500.0000000.0000001984.000000
25%15.00000017.00000020.0000001900.0000000.0000001991.000000
50%17.00000020.00000024.0000002350.0000000.0000002002.000000
75%20.00000023.00000028.0000002700.0000000.0000002011.000000
max150.000000136.000000124.0000007400.000000370.0000002020.000000
\n
" 481 | }, 482 | "execution_count": 16, 483 | "metadata": {}, 484 | "output_type": "execute_result" 485 | } 486 | ], 487 | "source": [ 488 | "# chaining\n", 489 | "(autos\n", 490 | " [cols]\n", 491 | " .select_dtypes(int)\n", 492 | " .describe()\n", 493 | ")" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": 17, 499 | "metadata": { 500 | "scrolled": true, 501 | "pycharm": { 502 | "name": "#%%\n" 503 | } 504 | }, 505 | "outputs": [ 506 | { 507 | "data": { 508 | "text/plain": "iinfo(min=-128, max=127, dtype=int8)" 509 | }, 510 | "execution_count": 17, 511 | "metadata": {}, 512 | "output_type": "execute_result" 513 | } 514 | ], 515 | "source": [ 516 | "# can comb08 be an int8?\n", 517 | "np.iinfo(np.int8)" 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": 18, 523 | "metadata": { 524 | "pycharm": { 525 | "name": "#%%\n" 526 | } 527 | }, 528 | "outputs": [ 529 | { 530 | "data": { 531 | "text/plain": "iinfo(min=-32768, max=32767, dtype=int16)" 532 | }, 533 | "execution_count": 18, 534 | "metadata": {}, 535 | "output_type": "execute_result" 536 | } 537 | ], 538 | "source": [ 539 | "np.iinfo(np.int16)" 540 | ] 541 | }, 542 | { 543 | "cell_type": "code", 544 | "execution_count": 19, 545 | "metadata": { 546 | "scrolled": true, 547 | "pycharm": { 548 | "name": "#%%\n" 549 | } 550 | }, 551 | "outputs": [ 552 | { 553 | "data": { 554 | "text/plain": " highway08 fuelCost08 range year\ncount 41144.000000 41144.000000 41144.000000 41144.000000\nmean 24.504667 2362.335942 0.793506 2001.535266\nstd 7.730364 654.981925 13.041592 11.142414\nmin 9.000000 500.000000 0.000000 1984.000000\n25% 20.000000 1900.000000 0.000000 1991.000000\n50% 24.000000 2350.000000 0.000000 2002.000000\n75% 28.000000 2700.000000 0.000000 2011.000000\nmax 124.000000 7400.000000 370.000000 2020.000000", 555 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
highway08fuelCost08rangeyear
count41144.00000041144.00000041144.00000041144.000000
mean24.5046672362.3359420.7935062001.535266
std7.730364654.98192513.04159211.142414
min9.000000500.0000000.0000001984.000000
25%20.0000001900.0000000.0000001991.000000
50%24.0000002350.0000000.0000002002.000000
75%28.0000002700.0000000.0000002011.000000
max124.0000007400.000000370.0000002020.000000
\n
" 556 | }, 557 | "execution_count": 19, 558 | "metadata": {}, 559 | "output_type": "execute_result" 560 | } 561 | ], 562 | "source": [ 563 | "# chaining\n", 564 | "(autos\n", 565 | " [cols]\n", 566 | " .astype({'highway08': 'int8', 'city08': 'int16', 'comb08': 'int16' })\n", 567 | " .select_dtypes([int, 'int8'])\n", 568 | " .describe()\n", 569 | ")" 570 | ] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": null, 575 | "metadata": { 576 | "scrolled": true, 577 | "pycharm": { 578 | "name": "#%%\n" 579 | } 580 | }, 581 | "outputs": [], 582 | "source": [ 583 | "# chaining\n", 584 | "# use 'integer' to see all int-like columns\n", 585 | "(autos\n", 586 | " [cols]\n", 587 | " .astype({'highway08': 'int8', 'city08': 'int16', 'comb08': 'int16', 'fuelCost08': 'int16', \n", 588 | " 'range': 'int16', 'year': 'int16'})\n", 589 | " .select_dtypes(['integer']) # see https://numpy.org/doc/stable/reference/arrays.scalars.html\n", 590 | " .describe()\n", 591 | ")" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": 20, 597 | "metadata": { 598 | "scrolled": true, 599 | "pycharm": { 600 | "name": "#%%\n" 601 | } 602 | }, 603 | "outputs": [ 604 | { 605 | "data": { 606 | "text/plain": "18124995" 607 | }, 608 | "execution_count": 20, 609 | "metadata": {}, 610 | "output_type": "execute_result" 611 | } 612 | ], 613 | "source": [ 614 | "# chaining\n", 615 | "(autos\n", 616 | " [cols]\n", 617 | " .astype({'highway08': 'int8', 'city08': 'int16', 'comb08': 'int16', 'fuelCost08': 'int16', \n", 618 | " 'range': 'int16', 'year': 'int16'})\n", 619 | " .memory_usage(deep=True)\n", 620 | " .sum() # was 19,647,323\n", 621 | ")" 622 | ] 623 | }, 624 | { 625 | "cell_type": "code", 626 | "execution_count": null, 627 | "metadata": { 628 | "pycharm": { 629 | "name": "#%%\n" 630 | } 631 | }, 632 | "outputs": [], 633 | "source": [] 634 | }, 635 | { 636 | "cell_type": "markdown", 637 | "metadata": { 638 | "pycharm": { 639 | "name": "#%% md\n" 640 | } 641 | }, 642 
| "source": [ 643 | "### Floats" 644 | ] 645 | }, 646 | { 647 | "cell_type": "code", 648 | "execution_count": 21, 649 | "metadata": { 650 | "scrolled": true, 651 | "pycharm": { 652 | "name": "#%%\n" 653 | } 654 | }, 655 | "outputs": [ 656 | { 657 | "data": { 658 | "text/plain": " cylinders displ\n0 4.0 2.0\n1 12.0 4.9\n2 4.0 2.2\n3 8.0 5.2\n4 4.0 2.2\n5 4.0 1.8\n6 4.0 1.8\n7 4.0 1.6\n8 4.0 1.6\n9 4.0 1.8\n... ... ...\n41134 4.0 2.1\n41135 4.0 1.9\n41136 4.0 1.9\n41137 4.0 1.9\n41138 4.0 1.9\n41139 4.0 2.2\n41140 4.0 2.2\n41141 4.0 2.2\n41142 4.0 2.2\n41143 4.0 2.2\n\n[41144 rows x 2 columns]", 659 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
cylindersdispl
04.02.0
112.04.9
24.02.2
38.05.2
44.02.2
54.01.8
64.01.8
74.01.6
84.01.6
94.01.8
.........
411344.02.1
411354.01.9
411364.01.9
411374.01.9
411384.01.9
411394.02.2
411404.02.2
411414.02.2
411424.02.2
411434.02.2
\n

41144 rows × 2 columns

\n
" 660 | }, 661 | "execution_count": 21, 662 | "metadata": {}, 663 | "output_type": "execute_result" 664 | } 665 | ], 666 | "source": [ 667 | "(autos\n", 668 | "[cols]\n", 669 | ".select_dtypes('float'))" 670 | ] 671 | }, 672 | { 673 | "cell_type": "code", 674 | "execution_count": 22, 675 | "metadata": { 676 | "pycharm": { 677 | "name": "#%%\n" 678 | } 679 | }, 680 | "outputs": [ 681 | { 682 | "data": { 683 | "text/plain": "count 40938.000000\nmean 5.717084\nstd 1.755517\nmin 2.000000\n25% 4.000000\n50% 6.000000\n75% 6.000000\nmax 16.000000\nName: cylinders, dtype: float64" 684 | }, 685 | "execution_count": 22, 686 | "metadata": {}, 687 | "output_type": "execute_result" 688 | } 689 | ], 690 | "source": [ 691 | "# surprise! cylinders looks int-like\n", 692 | "autos.cylinders.describe()" 693 | ] 694 | }, 695 | { 696 | "cell_type": "code", 697 | "execution_count": 23, 698 | "metadata": { 699 | "pycharm": { 700 | "name": "#%%\n" 701 | } 702 | }, 703 | "outputs": [ 704 | { 705 | "data": { 706 | "text/plain": "4.0 15938\n6.0 14284\n8.0 8801\n5.0 771\n12.0 626\n3.0 279\nNaN 206\n10.0 170\n2.0 59\n16.0 10\nName: cylinders, dtype: int64" 707 | }, 708 | "execution_count": 23, 709 | "metadata": {}, 710 | "output_type": "execute_result" 711 | } 712 | ], 713 | "source": [ 714 | "# oops! 
missing values\n", 715 | "autos.cylinders.value_counts(dropna=False)" 716 | ] 717 | }, 718 | { 719 | "cell_type": "code", 720 | "execution_count": 24, 721 | "metadata": { 722 | "scrolled": true, 723 | "pycharm": { 724 | "name": "#%%\n" 725 | } 726 | }, 727 | "outputs": [ 728 | { 729 | "data": { 730 | "text/plain": " city08 comb08 highway08 cylinders displ drive \\\n7138 81 85 91 NaN NaN NaN \n7139 81 72 64 NaN NaN 2-Wheel Drive \n8143 81 72 64 NaN NaN 2-Wheel Drive \n8144 74 65 58 NaN NaN NaN \n8146 45 39 33 NaN NaN 2-Wheel Drive \n8147 84 75 66 NaN NaN NaN \n9212 87 78 69 NaN NaN 2-Wheel Drive \n9213 45 39 33 NaN NaN 2-Wheel Drive \n10329 87 78 69 NaN NaN 2-Wheel Drive \n21413 22 24 28 NaN NaN 4-Wheel Drive \n... ... ... ... ... ... ... \n34407 73 72 71 NaN NaN Front-Wheel Drive \n34408 118 108 97 NaN NaN Front-Wheel Drive \n34409 114 104 94 NaN NaN Front-Wheel Drive \n34538 74 74 73 NaN NaN All-Wheel Drive \n34561 80 76 72 NaN NaN 4-Wheel Drive \n34563 138 131 124 NaN NaN Rear-Wheel Drive \n34564 140 133 124 NaN NaN Rear-Wheel Drive \n34565 115 111 107 NaN NaN All-Wheel Drive \n34566 104 104 104 NaN NaN All-Wheel Drive \n34567 98 97 96 NaN NaN All-Wheel Drive \n\n eng_dscr fuelCost08 make model \\\n7138 NaN 800 Nissan Altra EV \n7139 NaN 900 Toyota RAV4 EV \n8143 NaN 900 Toyota RAV4 EV \n8144 NaN 1000 Ford Th!nk \n8146 NaN 1700 Ford Explorer USPS Electric \n8147 NaN 900 Nissan Hyper-Mini \n9212 NaN 850 Toyota RAV4 EV \n9213 NaN 1700 Ford Explorer USPS Electric \n10329 NaN 850 Toyota RAV4 EV \n21413 NaN 1750 Subaru RX Turbo \n... ... ... ... ... 
\n34407 NaN 900 BYD e6 \n34408 NaN 600 Nissan Leaf (62 kW-hr battery pack) \n34409 NaN 650 Nissan Leaf SV/SL (62 kW-hr battery pack) \n34538 NaN 900 Audi e-tron \n34561 NaN 850 Jaguar I-Pace \n34563 NaN 500 Tesla Model 3 Standard Range \n34564 NaN 500 Tesla Model 3 Standard Range Plus \n34565 NaN 600 Tesla Model S Long Range \n34566 NaN 650 Tesla Model S Performance (19\" Wheels) \n34567 NaN 700 Tesla Model S Performance (21\" Wheels) \n\n trany range createdOn year \n7138 NaN 90 Tue Jan 01 00:00:00 EST 2013 2000 \n7139 NaN 88 Tue Jan 01 00:00:00 EST 2013 2000 \n8143 NaN 88 Tue Jan 01 00:00:00 EST 2013 2001 \n8144 NaN 29 Tue Jan 01 00:00:00 EST 2013 2001 \n8146 NaN 38 Tue Jan 01 00:00:00 EST 2013 2001 \n8147 NaN 33 Tue Jan 01 00:00:00 EST 2013 2001 \n9212 NaN 95 Tue Jan 01 00:00:00 EST 2013 2002 \n9213 NaN 38 Tue Jan 01 00:00:00 EST 2013 2002 \n10329 NaN 95 Tue Jan 01 00:00:00 EST 2013 2003 \n21413 Manual 5-spd 0 Tue Jan 01 00:00:00 EST 2013 1985 \n... ... ... ... ... \n34407 Automatic (A1) 187 Wed Mar 13 00:00:00 EDT 2019 2019 \n34408 Automatic (A1) 226 Wed Mar 13 00:00:00 EDT 2019 2019 \n34409 Automatic (A1) 215 Wed Mar 13 00:00:00 EDT 2019 2019 \n34538 Automatic (A1) 204 Tue Apr 16 00:00:00 EDT 2019 2019 \n34561 Automatic (A1) 234 Thu May 02 00:00:00 EDT 2019 2020 \n34563 Automatic (A1) 220 Thu May 02 00:00:00 EDT 2019 2019 \n34564 Automatic (A1) 240 Thu May 02 00:00:00 EDT 2019 2019 \n34565 Automatic (A1) 370 Thu May 02 00:00:00 EDT 2019 2019 \n34566 Automatic (A1) 345 Thu May 02 00:00:00 EDT 2019 2019 \n34567 Automatic (A1) 325 Thu May 02 00:00:00 EDT 2019 2019 \n\n[206 rows x 14 columns]", 731 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
city08comb08highway08cylindersdispldriveeng_dscrfuelCost08makemodeltranyrangecreatedOnyear
7138818591NaNNaNNaNNaN800NissanAltra EVNaN90Tue Jan 01 00:00:00 EST 20132000
7139817264NaNNaN2-Wheel DriveNaN900ToyotaRAV4 EVNaN88Tue Jan 01 00:00:00 EST 20132000
8143817264NaNNaN2-Wheel DriveNaN900ToyotaRAV4 EVNaN88Tue Jan 01 00:00:00 EST 20132001
8144746558NaNNaNNaNNaN1000FordTh!nkNaN29Tue Jan 01 00:00:00 EST 20132001
8146453933NaNNaN2-Wheel DriveNaN1700FordExplorer USPS ElectricNaN38Tue Jan 01 00:00:00 EST 20132001
8147847566NaNNaNNaNNaN900NissanHyper-MiniNaN33Tue Jan 01 00:00:00 EST 20132001
9212877869NaNNaN2-Wheel DriveNaN850ToyotaRAV4 EVNaN95Tue Jan 01 00:00:00 EST 20132002
9213453933NaNNaN2-Wheel DriveNaN1700FordExplorer USPS ElectricNaN38Tue Jan 01 00:00:00 EST 20132002
10329877869NaNNaN2-Wheel DriveNaN850ToyotaRAV4 EVNaN95Tue Jan 01 00:00:00 EST 20132003
21413222428NaNNaN4-Wheel DriveNaN1750SubaruRX TurboManual 5-spd0Tue Jan 01 00:00:00 EST 20131985
.............................................
34407737271NaNNaNFront-Wheel DriveNaN900BYDe6Automatic (A1)187Wed Mar 13 00:00:00 EDT 20192019
3440811810897NaNNaNFront-Wheel DriveNaN600NissanLeaf (62 kW-hr battery pack)Automatic (A1)226Wed Mar 13 00:00:00 EDT 20192019
3440911410494NaNNaNFront-Wheel DriveNaN650NissanLeaf SV/SL (62 kW-hr battery pack)Automatic (A1)215Wed Mar 13 00:00:00 EDT 20192019
34538747473NaNNaNAll-Wheel DriveNaN900Audie-tronAutomatic (A1)204Tue Apr 16 00:00:00 EDT 20192019
34561807672NaNNaN4-Wheel DriveNaN850JaguarI-PaceAutomatic (A1)234Thu May 02 00:00:00 EDT 20192020
34563138131124NaNNaNRear-Wheel DriveNaN500TeslaModel 3 Standard RangeAutomatic (A1)220Thu May 02 00:00:00 EDT 20192019
34564140133124NaNNaNRear-Wheel DriveNaN500TeslaModel 3 Standard Range PlusAutomatic (A1)240Thu May 02 00:00:00 EDT 20192019
34565115111107NaNNaNAll-Wheel DriveNaN600TeslaModel S Long RangeAutomatic (A1)370Thu May 02 00:00:00 EDT 20192019
34566104104104NaNNaNAll-Wheel DriveNaN650TeslaModel S Performance (19\" Wheels)Automatic (A1)345Thu May 02 00:00:00 EDT 20192019
34567989796NaNNaNAll-Wheel DriveNaN700TeslaModel S Performance (21\" Wheels)Automatic (A1)325Thu May 02 00:00:00 EDT 20192019
\n

206 rows × 14 columns

\n
" 732 | }, 733 | "execution_count": 24, 734 | "metadata": {}, 735 | "output_type": "execute_result" 736 | } 737 | ], 738 | "source": [ 739 | "# where are they missing?\n", 740 | "(autos\n", 741 | " [cols]\n", 742 | " .query('cylinders.isna()')\n", 743 | ")" 744 | ] 745 | }, 746 | { 747 | "cell_type": "code", 748 | "execution_count": 25, 749 | "metadata": { 750 | "pycharm": { 751 | "name": "#%%\n" 752 | } 753 | }, 754 | "outputs": [ 755 | { 756 | "data": { 757 | "text/plain": " city08 comb08 highway08 cylinders displ \\\ncount 41144.000000 41144.000000 41144.000000 41144.000000 41144.000000 \nmean 18.369045 20.616396 24.504667 5.688460 3.277904 \nstd 7.905886 7.674535 7.730364 1.797009 1.373415 \nmin 6.000000 7.000000 9.000000 0.000000 0.000000 \n25% 15.000000 17.000000 20.000000 4.000000 2.200000 \n50% 17.000000 20.000000 24.000000 6.000000 3.000000 \n75% 20.000000 23.000000 28.000000 6.000000 4.300000 \nmax 150.000000 136.000000 124.000000 16.000000 8.400000 \n\n fuelCost08 range year \ncount 41144.000000 41144.000000 41144.000000 \nmean 2362.335942 0.793506 2001.535266 \nstd 654.981925 13.041592 11.142414 \nmin 500.000000 0.000000 1984.000000 \n25% 1900.000000 0.000000 1991.000000 \n50% 2350.000000 0.000000 2002.000000 \n75% 2700.000000 0.000000 2011.000000 \nmax 7400.000000 370.000000 2020.000000 ", 758 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
city08comb08highway08cylindersdisplfuelCost08rangeyear
count41144.00000041144.00000041144.00000041144.00000041144.00000041144.00000041144.00000041144.000000
mean18.36904520.61639624.5046675.6884603.2779042362.3359420.7935062001.535266
std7.9058867.6745357.7303641.7970091.373415654.98192513.04159211.142414
min6.0000007.0000009.0000000.0000000.000000500.0000000.0000001984.000000
25%15.00000017.00000020.0000004.0000002.2000001900.0000000.0000001991.000000
50%17.00000020.00000024.0000006.0000003.0000002350.0000000.0000002002.000000
75%20.00000023.00000028.0000006.0000004.3000002700.0000000.0000002011.000000
max150.000000136.000000124.00000016.0000008.4000007400.000000370.0000002020.000000
\n
" 759 | }, 760 | "execution_count": 25, 761 | "metadata": {}, 762 | "output_type": "execute_result" 763 | } 764 | ], 765 | "source": [ 766 | "# chaining - add cylinders and displ columns\n", 767 | "(autos\n", 768 | " [cols]\n", 769 | " .assign(cylinders=autos.cylinders.fillna(0).astype('int8'),\n", 770 | " displ=autos.displ.fillna(0))\n", 771 | " .astype({'highway08': 'int8', 'city08': 'int16', 'comb08': 'int16', \n", 772 | " 'fuelCost08': 'int16', 'range': 'int16', 'year': 'int16', })\n", 773 | " .describe()\n", 774 | ")" 775 | ] 776 | }, 777 | { 778 | "cell_type": "code", 779 | "execution_count": null, 780 | "metadata": { 781 | "pycharm": { 782 | "name": "#%%\n" 783 | } 784 | }, 785 | "outputs": [], 786 | "source": [ 787 | "autos[cols].describe()" 788 | ] 789 | }, 790 | { 791 | "cell_type": "code", 792 | "execution_count": null, 793 | "metadata": { 794 | "scrolled": true, 795 | "pycharm": { 796 | "name": "#%%\n" 797 | } 798 | }, 799 | "outputs": [], 800 | "source": [ 801 | "# use this to inspect float sizes\n", 802 | "np.finfo(np.float16)" 803 | ] 804 | }, 805 | { 806 | "cell_type": "code", 807 | "execution_count": null, 808 | "metadata": { 809 | "scrolled": true, 810 | "pycharm": { 811 | "name": "#%%\n" 812 | } 813 | }, 814 | "outputs": [], 815 | "source": [ 816 | "# chaining - convert displ to float16\n", 817 | "(autos\n", 818 | " [cols]\n", 819 | " .assign(cylinders=autos.cylinders.fillna(0).astype('int8'),\n", 820 | " displ=autos.displ.fillna(0).astype('float16'))\n", 821 | " .astype({'highway08': 'int8', 'city08': 'int16', 'comb08': 'int16', 'fuelCost08': 'int16', \n", 822 | " 'range': 'int16', 'year': 'int16'})\n", 823 | ")" 824 | ] 825 | }, 826 | { 827 | "cell_type": "code", 828 | "execution_count": 26, 829 | "metadata": { 830 | "scrolled": false, 831 | "pycharm": { 832 | "name": "#%%\n" 833 | } 834 | }, 835 | "outputs": [ 836 | { 837 | "data": { 838 | "text/plain": "17590123" 839 | }, 840 | "execution_count": 26, 841 | "metadata": {}, 842 | "output_type": 
"execute_result" 843 | } 844 | ], 845 | "source": [ 846 | "# new memory usage\n", 847 | "(autos\n", 848 | " .loc[:, cols]\n", 849 | " .assign(cylinders=autos.cylinders.fillna(0).astype('int8'),\n", 850 | " displ=autos.displ.fillna(0).astype('float16'))\n", 851 | " .astype({'highway08': 'int8', 'city08': 'int16', 'comb08': 'int16', 'fuelCost08': 'int16', \n", 852 | " 'range': 'int16', 'year': 'int16'})\n", 853 | " .memory_usage(deep=True)\n", 854 | " .sum() # was 19,647,323\n", 855 | ")" 856 | ] 857 | }, 858 | { 859 | "cell_type": "code", 860 | "execution_count": null, 861 | "metadata": { 862 | "pycharm": { 863 | "name": "#%%\n" 864 | } 865 | }, 866 | "outputs": [], 867 | "source": [] 868 | }, 869 | { 870 | "cell_type": "code", 871 | "execution_count": null, 872 | "metadata": { 873 | "pycharm": { 874 | "name": "#%%\n" 875 | } 876 | }, 877 | "outputs": [], 878 | "source": [] 879 | }, 880 | { 881 | "cell_type": "markdown", 882 | "metadata": { 883 | "pycharm": { 884 | "name": "#%% md\n" 885 | } 886 | }, 887 | "source": [ 888 | "### Objects" 889 | ] 890 | }, 891 | { 892 | "cell_type": "code", 893 | "execution_count": 27, 894 | "metadata": { 895 | "scrolled": true, 896 | "pycharm": { 897 | "name": "#%%\n" 898 | } 899 | }, 900 | "outputs": [ 901 | { 902 | "data": { 903 | "text/plain": " drive eng_dscr make \\\n0 Rear-Wheel Drive (FFS) Alfa Romeo \n1 Rear-Wheel Drive (GUZZLER) Ferrari \n2 Front-Wheel Drive (FFS) Dodge \n3 Rear-Wheel Drive NaN Dodge \n4 4-Wheel or All-Wheel Drive (FFS,TRBO) Subaru \n5 Front-Wheel Drive (FFS) Subaru \n6 Front-Wheel Drive (FFS) Subaru \n7 Front-Wheel Drive (FFS) Toyota \n8 Front-Wheel Drive (FFS) Toyota \n9 Front-Wheel Drive (FFS) Toyota \n... ... ... ... 
\n41134 Front-Wheel Drive (FFS) Saab \n41135 Front-Wheel Drive (TBI) (FFS) Saturn \n41136 Front-Wheel Drive (MFI) (FFS) Saturn \n41137 Front-Wheel Drive (TBI) (FFS) Saturn \n41138 Front-Wheel Drive (MFI) (FFS) Saturn \n41139 Front-Wheel Drive (FFS) Subaru \n41140 Front-Wheel Drive (FFS) Subaru \n41141 4-Wheel or All-Wheel Drive (FFS) Subaru \n41142 4-Wheel or All-Wheel Drive (FFS) Subaru \n41143 4-Wheel or All-Wheel Drive (FFS,TRBO) Subaru \n\n model trany createdOn \n0 Spider Veloce 2000 Manual 5-spd Tue Jan 01 00:00:00 EST 2013 \n1 Testarossa Manual 5-spd Tue Jan 01 00:00:00 EST 2013 \n2 Charger Manual 5-spd Tue Jan 01 00:00:00 EST 2013 \n3 B150/B250 Wagon 2WD Automatic 3-spd Tue Jan 01 00:00:00 EST 2013 \n4 Legacy AWD Turbo Manual 5-spd Tue Jan 01 00:00:00 EST 2013 \n5 Loyale Automatic 3-spd Tue Jan 01 00:00:00 EST 2013 \n6 Loyale Manual 5-spd Tue Jan 01 00:00:00 EST 2013 \n7 Corolla Automatic 3-spd Tue Jan 01 00:00:00 EST 2013 \n8 Corolla Manual 5-spd Tue Jan 01 00:00:00 EST 2013 \n9 Corolla Automatic 4-spd Tue Jan 01 00:00:00 EST 2013 \n... ... ... ... \n41134 900 Manual 5-spd Tue Jan 01 00:00:00 EST 2013 \n41135 SL Automatic 4-spd Tue Jan 01 00:00:00 EST 2013 \n41136 SL Automatic 4-spd Tue Jan 01 00:00:00 EST 2013 \n41137 SL Manual 5-spd Tue Jan 01 00:00:00 EST 2013 \n41138 SL Manual 5-spd Tue Jan 01 00:00:00 EST 2013 \n41139 Legacy Automatic 4-spd Tue Jan 01 00:00:00 EST 2013 \n41140 Legacy Manual 5-spd Tue Jan 01 00:00:00 EST 2013 \n41141 Legacy AWD Automatic 4-spd Tue Jan 01 00:00:00 EST 2013 \n41142 Legacy AWD Manual 5-spd Tue Jan 01 00:00:00 EST 2013 \n41143 Legacy AWD Turbo Automatic 4-spd Tue Jan 01 00:00:00 EST 2013 \n\n[41144 rows x 6 columns]", 904 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
driveeng_dscrmakemodeltranycreatedOn
0Rear-Wheel Drive(FFS)Alfa RomeoSpider Veloce 2000Manual 5-spdTue Jan 01 00:00:00 EST 2013
1Rear-Wheel Drive(GUZZLER)FerrariTestarossaManual 5-spdTue Jan 01 00:00:00 EST 2013
2Front-Wheel Drive(FFS)DodgeChargerManual 5-spdTue Jan 01 00:00:00 EST 2013
3Rear-Wheel DriveNaNDodgeB150/B250 Wagon 2WDAutomatic 3-spdTue Jan 01 00:00:00 EST 2013
44-Wheel or All-Wheel Drive(FFS,TRBO)SubaruLegacy AWD TurboManual 5-spdTue Jan 01 00:00:00 EST 2013
5Front-Wheel Drive(FFS)SubaruLoyaleAutomatic 3-spdTue Jan 01 00:00:00 EST 2013
6Front-Wheel Drive(FFS)SubaruLoyaleManual 5-spdTue Jan 01 00:00:00 EST 2013
7Front-Wheel Drive(FFS)ToyotaCorollaAutomatic 3-spdTue Jan 01 00:00:00 EST 2013
8Front-Wheel Drive(FFS)ToyotaCorollaManual 5-spdTue Jan 01 00:00:00 EST 2013
9Front-Wheel Drive(FFS)ToyotaCorollaAutomatic 4-spdTue Jan 01 00:00:00 EST 2013
.....................
41134Front-Wheel Drive(FFS)Saab900Manual 5-spdTue Jan 01 00:00:00 EST 2013
41135Front-Wheel Drive(TBI) (FFS)SaturnSLAutomatic 4-spdTue Jan 01 00:00:00 EST 2013
41136Front-Wheel Drive(MFI) (FFS)SaturnSLAutomatic 4-spdTue Jan 01 00:00:00 EST 2013
41137Front-Wheel Drive(TBI) (FFS)SaturnSLManual 5-spdTue Jan 01 00:00:00 EST 2013
41138Front-Wheel Drive(MFI) (FFS)SaturnSLManual 5-spdTue Jan 01 00:00:00 EST 2013
41139Front-Wheel Drive(FFS)SubaruLegacyAutomatic 4-spdTue Jan 01 00:00:00 EST 2013
41140Front-Wheel Drive(FFS)SubaruLegacyManual 5-spdTue Jan 01 00:00:00 EST 2013
411414-Wheel or All-Wheel Drive(FFS)SubaruLegacy AWDAutomatic 4-spdTue Jan 01 00:00:00 EST 2013
411424-Wheel or All-Wheel Drive(FFS)SubaruLegacy AWDManual 5-spdTue Jan 01 00:00:00 EST 2013
411434-Wheel or All-Wheel Drive(FFS,TRBO)SubaruLegacy AWD TurboAutomatic 4-spdTue Jan 01 00:00:00 EST 2013
\n

41144 rows × 6 columns

\n
" 905 | }, 906 | "execution_count": 27, 907 | "metadata": {}, 908 | "output_type": "execute_result" 909 | } 910 | ], 911 | "source": [ 912 | "(autos\n", 913 | " [cols]\n", 914 | " .select_dtypes(object)\n", 915 | ")" 916 | ] 917 | }, 918 | { 919 | "cell_type": "code", 920 | "execution_count": 28, 921 | "metadata": { 922 | "scrolled": true, 923 | "pycharm": { 924 | "name": "#%%\n" 925 | } 926 | }, 927 | "outputs": [ 928 | { 929 | "data": { 930 | "text/plain": "Front-Wheel Drive 14236\nRear-Wheel Drive 13831\n4-Wheel or All-Wheel Drive 6648\nAll-Wheel Drive 3015\n4-Wheel Drive 1460\nNaN 1189\n2-Wheel Drive 507\nPart-time 4-Wheel Drive 258\nName: drive, dtype: int64" 931 | }, 932 | "execution_count": 28, 933 | "metadata": {}, 934 | "output_type": "execute_result" 935 | } 936 | ], 937 | "source": [ 938 | "# looks categorical\n", 939 | "(autos.drive.value_counts(dropna=False))" 940 | ] 941 | }, 942 | { 943 | "cell_type": "code", 944 | "execution_count": 29, 945 | "metadata": { 946 | "scrolled": true, 947 | "pycharm": { 948 | "name": "#%%\n" 949 | } 950 | }, 951 | "outputs": [ 952 | { 953 | "data": { 954 | "text/plain": " city08 comb08 highway08 cylinders displ drive eng_dscr \\\n7138 81 85 91 NaN NaN NaN NaN \n8144 74 65 58 NaN NaN NaN NaN \n8147 84 75 66 NaN NaN NaN NaN \n18217 18 21 25 4.0 2.0 NaN (FFS) \n18218 20 22 26 4.0 1.5 NaN (FFS) \n18219 13 15 20 8.0 5.7 NaN (350 V8) (FFS) \n18220 13 15 20 8.0 5.7 NaN (350 V8) (FFS) \n18221 15 17 20 6.0 3.0 NaN (FFS,TRBO) \n18222 16 18 20 6.0 3.0 NaN (FFS) \n18223 16 18 22 6.0 3.0 NaN (FFS,TRBO) \n... ... ... ... ... ... ... ... 
\n20063 13 15 19 8.0 5.0 NaN (FFS) CA model \n20064 13 15 20 8.0 5.0 NaN (GM-OLDS) CA model \n20065 14 16 19 8.0 5.0 NaN (GM-CHEV) CA model \n20387 14 14 15 4.0 2.4 NaN (FFS) CA model \n21129 14 16 21 8.0 3.5 NaN GUZZLER FFS,TURBO \n23029 79 85 94 NaN NaN NaN Lead Acid \n23030 35 37 39 NaN NaN NaN NiMH \n23032 49 48 46 NaN NaN NaN NaN \n23037 49 48 46 NaN NaN NaN NaN \n23040 102 98 94 NaN NaN NaN NaN \n\n fuelCost08 make model trany range \\\n7138 800 Nissan Altra EV NaN 90 \n8144 1000 Ford Th!nk NaN 29 \n8147 900 Nissan Hyper-Mini NaN 33 \n18217 2000 Alfa Romeo Spider Veloce 2000 Manual 5-spd 0 \n18218 1900 Bertone X1/9 Manual 5-spd 0 \n18219 2800 Chevrolet Corvette Automatic 4-spd 0 \n18220 2800 Chevrolet Corvette Manual 4-spd 0 \n18221 2500 Nissan 300ZX Automatic 4-spd 0 \n18222 2350 Nissan 300ZX Automatic 4-spd 0 \n18223 2350 Nissan 300ZX Manual 5-spd 0 \n... ... ... ... ... ... \n20063 2800 Mercury Grand Marquis Wagon Automatic 4-spd 0 \n20064 2800 Oldsmobile Custom Cruiser Wagon Automatic 4-spd 0 \n20065 2650 Pontiac Parisienne Wagon Automatic 4-spd 0 \n20387 3000 Nissan Pickup Cab Chassis Manual 5-spd 0 \n21129 3250 Lotus Esprit V8 Manual 5-spd 0 \n23029 800 GMC EV1 Automatic (A1) 55 \n23030 1750 GMC EV1 Automatic (A1) 105 \n23032 1400 Honda EV Plus Automatic (A1) 81 \n23037 1400 Honda EV Plus Automatic (A1) 81 \n23040 650 MINI MiniE Automatic (A1) 100 \n\n createdOn year \n7138 Tue Jan 01 00:00:00 EST 2013 2000 \n8144 Tue Jan 01 00:00:00 EST 2013 2001 \n8147 Tue Jan 01 00:00:00 EST 2013 2001 \n18217 Tue Jan 01 00:00:00 EST 2013 1984 \n18218 Tue Jan 01 00:00:00 EST 2013 1984 \n18219 Tue Jan 01 00:00:00 EST 2013 1984 \n18220 Tue Jan 01 00:00:00 EST 2013 1984 \n18221 Tue Jan 01 00:00:00 EST 2013 1984 \n18222 Tue Jan 01 00:00:00 EST 2013 1984 \n18223 Tue Jan 01 00:00:00 EST 2013 1984 \n... ... ... 
\n20063 Tue Jan 01 00:00:00 EST 2013 1984 \n20064 Tue Jan 01 00:00:00 EST 2013 1984 \n20065 Tue Jan 01 00:00:00 EST 2013 1984 \n20387 Tue Jan 01 00:00:00 EST 2013 1984 \n21129 Tue Jan 01 00:00:00 EST 2013 2002 \n23029 Tue Jan 01 00:00:00 EST 2013 1999 \n23030 Tue Jan 01 00:00:00 EST 2013 1999 \n23032 Tue Jan 01 00:00:00 EST 2013 1999 \n23037 Tue Jan 01 00:00:00 EST 2013 1998 \n23040 Tue Jan 01 00:00:00 EST 2013 2008 \n\n[1189 rows x 14 columns]", 955 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
city08comb08highway08cylindersdispldriveeng_dscrfuelCost08makemodeltranyrangecreatedOnyear
7138818591NaNNaNNaNNaN800NissanAltra EVNaN90Tue Jan 01 00:00:00 EST 20132000
8144746558NaNNaNNaNNaN1000FordTh!nkNaN29Tue Jan 01 00:00:00 EST 20132001
8147847566NaNNaNNaNNaN900NissanHyper-MiniNaN33Tue Jan 01 00:00:00 EST 20132001
182171821254.02.0NaN(FFS)2000Alfa RomeoSpider Veloce 2000Manual 5-spd0Tue Jan 01 00:00:00 EST 20131984
182182022264.01.5NaN(FFS)1900BertoneX1/9Manual 5-spd0Tue Jan 01 00:00:00 EST 20131984
182191315208.05.7NaN(350 V8) (FFS)2800ChevroletCorvetteAutomatic 4-spd0Tue Jan 01 00:00:00 EST 20131984
182201315208.05.7NaN(350 V8) (FFS)2800ChevroletCorvetteManual 4-spd0Tue Jan 01 00:00:00 EST 20131984
182211517206.03.0NaN(FFS,TRBO)2500Nissan300ZXAutomatic 4-spd0Tue Jan 01 00:00:00 EST 20131984
182221618206.03.0NaN(FFS)2350Nissan300ZXAutomatic 4-spd0Tue Jan 01 00:00:00 EST 20131984
182231618226.03.0NaN(FFS,TRBO)2350Nissan300ZXManual 5-spd0Tue Jan 01 00:00:00 EST 20131984
.............................................
200631315198.05.0NaN(FFS) CA model2800MercuryGrand Marquis WagonAutomatic 4-spd0Tue Jan 01 00:00:00 EST 20131984
200641315208.05.0NaN(GM-OLDS) CA model2800OldsmobileCustom Cruiser WagonAutomatic 4-spd0Tue Jan 01 00:00:00 EST 20131984
200651416198.05.0NaN(GM-CHEV) CA model2650PontiacParisienne WagonAutomatic 4-spd0Tue Jan 01 00:00:00 EST 20131984
203871414154.02.4NaN(FFS) CA model3000NissanPickup Cab ChassisManual 5-spd0Tue Jan 01 00:00:00 EST 20131984
211291416218.03.5NaNGUZZLER FFS,TURBO3250LotusEsprit V8Manual 5-spd0Tue Jan 01 00:00:00 EST 20132002
23029798594NaNNaNNaNLead Acid800GMCEV1Automatic (A1)55Tue Jan 01 00:00:00 EST 20131999
23030353739NaNNaNNaNNiMH1750GMCEV1Automatic (A1)105Tue Jan 01 00:00:00 EST 20131999
23032494846NaNNaNNaNNaN1400HondaEV PlusAutomatic (A1)81Tue Jan 01 00:00:00 EST 20131999
23037494846NaNNaNNaNNaN1400HondaEV PlusAutomatic (A1)81Tue Jan 01 00:00:00 EST 20131998
230401029894NaNNaNNaNNaN650MINIMiniEAutomatic (A1)100Tue Jan 01 00:00:00 EST 20132008
\n

1189 rows × 14 columns

\n
" 956 | }, 957 | "execution_count": 29, 958 | "metadata": {}, 959 | "output_type": "execute_result" 960 | } 961 | ], 962 | "source": [ 963 | "# where are the values missing for drive?\n", 964 | "(autos\n", 965 | " [cols]\n", 966 | " .query('drive.isna()'))" 967 | ] 968 | }, 969 | { 970 | "cell_type": "code", 971 | "execution_count": 30, 972 | "metadata": { 973 | "scrolled": false, 974 | "pycharm": { 975 | "name": "#%%\n" 976 | } 977 | }, 978 | "outputs": [ 979 | { 980 | "data": { 981 | "text/plain": "12093275" 982 | }, 983 | "execution_count": 30, 984 | "metadata": {}, 985 | "output_type": "execute_result" 986 | } 987 | ], 988 | "source": [ 989 | "# drive and make (in .astype) to category\n", 990 | "(autos\n", 991 | " [cols]\n", 992 | " .assign(cylinders=autos.cylinders.fillna(0).astype('int8'),\n", 993 | " displ=autos.displ.fillna(0).astype('float16'),\n", 994 | " drive=autos.drive.fillna('Other').astype('category')\n", 995 | " )\n", 996 | " .astype({'highway08': 'int8', 'city08': 'int16', 'comb08': 'int16', 'fuelCost08': 'int16', \n", 997 | " 'range': 'int16', 'year': 'int16', 'make': 'category'})\n", 998 | " .memory_usage(deep=True)\n", 999 | " .sum() # was 19,647,323\n", 1000 | ")" 1001 | ] 1002 | }, 1003 | { 1004 | "cell_type": "code", 1005 | "execution_count": 31, 1006 | "metadata": { 1007 | "scrolled": true, 1008 | "pycharm": { 1009 | "name": "#%%\n" 1010 | } 1011 | }, 1012 | "outputs": [ 1013 | { 1014 | "data": { 1015 | "text/plain": "Automatic 4-spd 11047\nManual 5-spd 8361\nAutomatic 3-spd 3151\nAutomatic (S6) 3106\nManual 6-spd 2757\nAutomatic 5-spd 2203\nAutomatic (S8) 1665\nAutomatic 6-spd 1619\nManual 4-spd 1483\nAutomatic (S5) 833\nAutomatic (variable gear ratios) 826\nAutomatic 7-spd 724\nAutomatic 8-spd 433\nAutomatic (AM-S7) 424\nAutomatic (S7) 327\nAutomatic 9-spd 293\nAutomatic (AM7) 245\nAutomatic (S4) 233\nAutomatic (AV-S6) 208\nAutomatic (A1) 201\nAutomatic (AM6) 151\nAutomatic (AV-S7) 139\nAutomatic (S10) 124\nAutomatic (AM-S6) 116\nManual 
7-spd 114\nAutomatic (S9) 86\nManual 3-spd 77\nAutomatic (AM-S8) 60\nAutomatic (AV-S8) 47\nAutomatic 10-spd 25\nManual 4-spd Doubled 17\nAutomatic (AM5) 14\nNaN 11\nAutomatic (AV-S10) 11\nAutomatic (AM8) 6\nAutomatic (AM-S9) 3\nAutomatic (L3) 2\nAutomatic (L4) 2\nName: trany, dtype: int64" 1016 | }, 1017 | "execution_count": 31, 1018 | "metadata": {}, 1019 | "output_type": "execute_result" 1020 | } 1021 | ], 1022 | "source": [ 1023 | "# let's inspect trany\n", 1024 | "# looks like it has two pieces of information embedded in column\n", 1025 | "(autos.trany.value_counts(dropna=False))" 1026 | ] 1027 | }, 1028 | { 1029 | "cell_type": "code", 1030 | "execution_count": 32, 1031 | "metadata": { 1032 | "scrolled": false, 1033 | "pycharm": { 1034 | "name": "#%%\n" 1035 | } 1036 | }, 1037 | "outputs": [ 1038 | { 1039 | "data": { 1040 | "text/plain": "10631047" 1041 | }, 1042 | "execution_count": 32, 1043 | "metadata": {}, 1044 | "output_type": "execute_result" 1045 | } 1046 | ], 1047 | "source": [ 1048 | "# add automatic, speeds from trany (whole digit run, so 10-spd -> 10), then drop trany\n", 1049 | "(autos\n", 1050 | " [cols]\n", 1051 | " .assign(cylinders=autos.cylinders.fillna(0).astype('int8'),\n", 1052 | " displ=autos.displ.fillna(0).astype('float16'),\n", 1053 | " drive=autos.drive.fillna('Other').astype('category'),\n", 1054 | " automatic=autos.trany.str.contains('Auto'),\n", 1055 | " speeds=autos.trany.str.extract(r'(\\d+)').fillna('20').astype('int8')\n", 1056 | " )\n", 1057 | " .astype({'highway08': 'int8', 'city08': 'int16', 'comb08': 'int16', 'fuelCost08': 'int16', \n", 1058 | " 'range': 'int16', 'year': 'int16', 'make': 'category'})\n", 1059 | " .drop(columns=['trany'])\n", 1060 | " .memory_usage(deep=True)\n", 1061 | " .sum() # was 19,647,323\n", 1062 | ")" 1063 | ] 1064 | }, 1065 | { 1066 | "cell_type": "code", 1067 | "execution_count": null, 1068 | "metadata": { 1069 | "pycharm": { 1070 | "name": "#%%\n" 1071 | } 1072 | }, 1073 | "outputs": [], 1074 | "source": [] 1075 | }, 1076 | { 1077 | 
"cell_type": "code", 1078 | "execution_count": null, 1079 | "metadata": { 1080 | "pycharm": { 1081 | "name": "#%%\n" 1082 | } 1083 | }, 1084 | "outputs": [], 1085 | "source": [] 1086 | }, 1087 | { 1088 | "cell_type": "markdown", 1089 | "metadata": { 1090 | "pycharm": { 1091 | "name": "#%% md\n" 1092 | } 1093 | }, 1094 | "source": [ 1095 | "### Dates" 1096 | ] 1097 | }, 1098 | { 1099 | "cell_type": "code", 1100 | "execution_count": 33, 1101 | "metadata": { 1102 | "scrolled": false, 1103 | "pycharm": { 1104 | "name": "#%%\n" 1105 | } 1106 | }, 1107 | "outputs": [ 1108 | { 1109 | "name": "stderr", 1110 | "output_type": "stream", 1111 | "text": [ 1112 | "C:\\Users\\matt\\AppData\\Roaming\\JetBrains\\DataSpell2022.1\\projects\\workspace\\venv\\lib\\site-packages\\dateutil\\parser\\_parser.py:1207: UnknownTimezoneWarning: tzname EST identified but not understood. Pass `tzinfos` argument in order to correctly return a timezone-aware datetime. In a future version, this will raise an exception.\n", 1113 | " warnings.warn(\"tzname {tzname} identified but not understood. \"\n", 1114 | "C:\\Users\\matt\\AppData\\Roaming\\JetBrains\\DataSpell2022.1\\projects\\workspace\\venv\\lib\\site-packages\\dateutil\\parser\\_parser.py:1207: UnknownTimezoneWarning: tzname EDT identified but not understood. Pass `tzinfos` argument in order to correctly return a timezone-aware datetime. In a future version, this will raise an exception.\n", 1115 | " warnings.warn(\"tzname {tzname} identified but not understood. 
\"\n" 1116 | ] 1117 | }, 1118 | { 1119 | "data": { 1120 | "text/plain": "7462959" 1121 | }, 1122 | "execution_count": 33, 1123 | "metadata": {}, 1124 | "output_type": "execute_result" 1125 | } 1126 | ], 1127 | "source": [ 1128 | "# add createdOn\n", 1129 | "(autos\n", 1130 | " [cols]\n", 1131 | " .assign(cylinders=autos.cylinders.fillna(0).astype('int8'),\n", 1132 | " displ=autos.displ.fillna(0).astype('float16'),\n", 1133 | " drive=autos.drive.fillna('Other').astype('category'),\n", 1134 | " automatic=autos.trany.str.contains('Auto'),\n", 1135 | " speeds=autos.trany.str.extract(r'(\\d+)').fillna('20').astype('int8'),\n", 1136 | " createdOn=pd.to_datetime(autos.createdOn).dt.tz_localize('America/New_York')\n", 1137 | " )\n", 1138 | " .astype({'highway08': 'int8', 'city08': 'int16', 'comb08': 'int16', 'fuelCost08': 'int16', \n", 1139 | " 'range': 'int16', 'year': 'int16', 'make': 'category'})\n", 1140 | " .drop(columns=['trany'])\n", 1141 | " .memory_usage(deep=True)\n", 1142 | " .sum() # was 19,647,323\n", 1143 | ")" 1144 | ] 1145 | }, 1146 | { 1147 | "cell_type": "code", 1148 | "execution_count": 34, 1149 | "metadata": { 1150 | "scrolled": true, 1151 | "pycharm": { 1152 | "name": "#%%\n" 1153 | } 1154 | }, 1155 | "outputs": [ 1156 | { 1157 | "data": { 1158 | "text/plain": "0 Tue Jan 01 00:00:00 EST 2013\n1 Tue Jan 01 00:00:00 EST 2013\n2 Tue Jan 01 00:00:00 EST 2013\n3 Tue Jan 01 00:00:00 EST 2013\n4 Tue Jan 01 00:00:00 EST 2013\n5 Tue Jan 01 00:00:00 EST 2013\n6 Tue Jan 01 00:00:00 EST 2013\n7 Tue Jan 01 00:00:00 EST 2013\n8 Tue Jan 01 00:00:00 EST 2013\n9 Tue Jan 01 00:00:00 EST 2013\n ... 
\n41134 Tue Jan 01 00:00:00 EST 2013\n41135 Tue Jan 01 00:00:00 EST 2013\n41136 Tue Jan 01 00:00:00 EST 2013\n41137 Tue Jan 01 00:00:00 EST 2013\n41138 Tue Jan 01 00:00:00 EST 2013\n41139 Tue Jan 01 00:00:00 EST 2013\n41140 Tue Jan 01 00:00:00 EST 2013\n41141 Tue Jan 01 00:00:00 EST 2013\n41142 Tue Jan 01 00:00:00 EST 2013\n41143 Tue Jan 01 00:00:00 EST 2013\nName: createdOn, Length: 41144, dtype: object" 1159 | }, 1160 | "execution_count": 34, 1161 | "metadata": {}, 1162 | "output_type": "execute_result" 1163 | } 1164 | ], 1165 | "source": [ 1166 | "# Python doesn't like EST/EDT\n", 1167 | "autos[cols].createdOn" 1168 | ] 1169 | }, 1170 | { 1171 | "cell_type": "code", 1172 | "execution_count": 35, 1173 | "metadata": { 1174 | "scrolled": false, 1175 | "pycharm": { 1176 | "name": "#%%\n" 1177 | } 1178 | }, 1179 | "outputs": [ 1180 | { 1181 | "data": { 1182 | "text/plain": "NaN 16153\n(FFS) 8827\nSIDI 5526\n(FFS) CA model 926\n(FFS) (MPFI) 734\nFFV 701\n(FFS,TRBO) 666\n(350 V8) (FFS) 411\n(GUZZLER) (FFS) 366\nSOHC 354\n ... 
\nB234L/R4 (FFS,TRBO) 1\nGUZZLER V8 FFS,TURBO 1\n4.6M FFS MPFI 1\nCNG FFS 1\nPOLICE FFS MPFI 1\nB308E5 FFS,TURBO 1\n5.4E-R FFS MPFI 1\nV-6 FFS 1\n(GUZZLER) (FFS) (S-CHARGE) 1\nR-ENG (FFS,TRBO) 1\nName: eng_dscr, Length: 558, dtype: int64" 1183 | }, 1184 | "execution_count": 35, 1185 | "metadata": {}, 1186 | "output_type": "execute_result" 1187 | } 1188 | ], 1189 | "source": [ 1190 | "# Fix date warnings - move on to eng_dscr\n", 1191 | "# http://www.fueleconomy.gov/feg/findacarhelp.shtml#trany\n", 1192 | "(autos\n", 1193 | " [cols]\n", 1194 | " .assign(cylinders=autos.cylinders.fillna(0).astype('int8'),\n", 1195 | " displ=autos.displ.fillna(0).astype('float16'),\n", 1196 | " drive=autos.drive.fillna('Other').astype('category'),\n", 1197 | " automatic=autos.trany.str.contains('Auto'),\n", 1198 | " speeds=autos.trany.str.extract(r'(\\d+)').fillna('20').astype('int8'),\n", 1199 | " createdOn=pd.to_datetime(autos.createdOn.replace({' EDT': '-04:00',\n", 1200 | " ' EST': '-05:00'}, regex=True), utc=True).dt.tz_convert('America/New_York')\n", 1201 | " )\n", 1202 | " .astype({'highway08': 'int8', 'city08': 'int16', 'comb08': 'int16', 'fuelCost08': 'int16', \n", 1203 | " 'range': 'int16', 'year': 'int16', 'make': 'category'})\n", 1204 | " .drop(columns=['trany'])\n", 1205 | " .eng_dscr\n", 1206 | " .value_counts(dropna=False)\n", 1207 | ")" 1208 | ] 1209 | }, 1210 | { 1211 | "cell_type": "code", 1212 | "execution_count": 36, 1213 | "metadata": { 1214 | "scrolled": false, 1215 | "pycharm": { 1216 | "name": "#%%\n" 1217 | } 1218 | }, 1219 | "outputs": [ 1220 | { 1221 | "data": { 1222 | "text/plain": "6701302" 1223 | }, 1224 | "execution_count": 36, 1225 | "metadata": {}, 1226 | "output_type": "execute_result" 1227 | } 1228 | ], 1229 | "source": [ 1230 | "# add ffs (Feedback fuel system), drop eng_dscr\n", 1231 | "(autos\n", 1232 | " [cols]\n", 1233 | " .assign(cylinders=autos.cylinders.fillna(0).astype('int8'),\n", 1234 | " displ=autos.displ.fillna(0).astype('float16'),\n", 
1235 | " drive=autos.drive.fillna('Other').astype('category'),\n", 1236 | " automatic=autos.trany.str.contains('Auto'),\n", 1237 | " speeds=autos.trany.str.extract(r'(\\d+)').fillna('20').astype('int8'),\n", 1238 | " createdOn=pd.to_datetime(autos.createdOn.replace({' EDT': '-04:00',\n", 1239 | " ' EST': '-05:00'}, regex=True), utc=True).dt.tz_convert('America/New_York'),\n", 1240 | " ffs=autos.eng_dscr.str.contains('FFS')\n", 1241 | " )\n", 1242 | " .astype({'highway08': 'int8', 'city08': 'int16', 'comb08': 'int16', 'fuelCost08': 'int16', \n", 1243 | " 'range': 'int16', 'year': 'int16', 'make': 'category'})\n", 1244 | " .drop(columns=['trany', 'eng_dscr'])\n", 1245 | " .memory_usage(deep=True)\n", 1246 | " .sum() # was 19,647,323\n", 1247 | ")" 1248 | ] 1249 | }, 1250 | { 1251 | "cell_type": "code", 1252 | "execution_count": 37, 1253 | "metadata": { 1254 | "lines_to_next_cell": 0, 1255 | "scrolled": true, 1256 | "pycharm": { 1257 | "name": "#%%\n" 1258 | } 1259 | }, 1260 | "outputs": [ 1261 | { 1262 | "data": { 1263 | "text/plain": " city08 comb08 highway08 cylinders displ \\\n0 19 21 25 4 2.000000 \n1 9 11 14 12 4.898438 \n2 23 27 33 4 2.199219 \n3 10 11 12 8 5.199219 \n4 17 19 23 4 2.199219 \n5 21 22 24 4 1.799805 \n6 22 25 29 4 1.799805 \n7 23 24 26 4 1.599609 \n8 23 26 31 4 1.599609 \n9 23 25 30 4 1.799805 \n... ... ... ... ... ... 
\n41134 18 20 24 4 2.099609 \n41135 23 26 33 4 1.900391 \n41136 21 24 30 4 1.900391 \n41137 24 28 33 4 1.900391 \n41138 21 25 32 4 1.900391 \n41139 19 22 26 4 2.199219 \n41140 20 23 28 4 2.199219 \n41141 18 21 24 4 2.199219 \n41142 18 21 24 4 2.199219 \n41143 16 18 21 4 2.199219 \n\n drive fuelCost08 make \\\n0 Rear-Wheel Drive 2000 Alfa Romeo \n1 Rear-Wheel Drive 3850 Ferrari \n2 Front-Wheel Drive 1550 Dodge \n3 Rear-Wheel Drive 3850 Dodge \n4 4-Wheel or All-Wheel Drive 2700 Subaru \n5 Front-Wheel Drive 1900 Subaru \n6 Front-Wheel Drive 1700 Subaru \n7 Front-Wheel Drive 1750 Toyota \n8 Front-Wheel Drive 1600 Toyota \n9 Front-Wheel Drive 1700 Toyota \n... ... ... ... \n41134 Front-Wheel Drive 2100 Saab \n41135 Front-Wheel Drive 1600 Saturn \n41136 Front-Wheel Drive 1750 Saturn \n41137 Front-Wheel Drive 1500 Saturn \n41138 Front-Wheel Drive 1700 Saturn \n41139 Front-Wheel Drive 1900 Subaru \n41140 Front-Wheel Drive 1850 Subaru \n41141 4-Wheel or All-Wheel Drive 2000 Subaru \n41142 4-Wheel or All-Wheel Drive 2000 Subaru \n41143 4-Wheel or All-Wheel Drive 2900 Subaru \n\n model range createdOn year automatic \\\n0 Spider Veloce 2000 0 2013-01-01 00:00:00-05:00 1985 False \n1 Testarossa 0 2013-01-01 00:00:00-05:00 1985 False \n2 Charger 0 2013-01-01 00:00:00-05:00 1985 False \n3 B150/B250 Wagon 2WD 0 2013-01-01 00:00:00-05:00 1985 True \n4 Legacy AWD Turbo 0 2013-01-01 00:00:00-05:00 1993 False \n5 Loyale 0 2013-01-01 00:00:00-05:00 1993 True \n6 Loyale 0 2013-01-01 00:00:00-05:00 1993 False \n7 Corolla 0 2013-01-01 00:00:00-05:00 1993 True \n8 Corolla 0 2013-01-01 00:00:00-05:00 1993 False \n9 Corolla 0 2013-01-01 00:00:00-05:00 1993 True \n... ... ... ... ... ... 
\n41134 900 0 2013-01-01 00:00:00-05:00 1993 False \n41135 SL 0 2013-01-01 00:00:00-05:00 1993 True \n41136 SL 0 2013-01-01 00:00:00-05:00 1993 True \n41137 SL 0 2013-01-01 00:00:00-05:00 1993 False \n41138 SL 0 2013-01-01 00:00:00-05:00 1993 False \n41139 Legacy 0 2013-01-01 00:00:00-05:00 1993 True \n41140 Legacy 0 2013-01-01 00:00:00-05:00 1993 False \n41141 Legacy AWD 0 2013-01-01 00:00:00-05:00 1993 True \n41142 Legacy AWD 0 2013-01-01 00:00:00-05:00 1993 False \n41143 Legacy AWD Turbo 0 2013-01-01 00:00:00-05:00 1993 True \n\n speeds ffs \n0 5 True \n1 5 False \n2 5 True \n3 3 NaN \n4 5 True \n5 3 True \n6 5 True \n7 3 True \n8 5 True \n9 4 True \n... ... ... \n41134 5 True \n41135 4 True \n41136 4 True \n41137 5 True \n41138 5 True \n41139 4 True \n41140 5 True \n41141 4 True \n41142 5 True \n41143 4 True \n\n[41144 rows x 15 columns]", 1264 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
city08comb08highway08cylindersdispldrivefuelCost08makemodelrangecreatedOnyearautomaticspeedsffs
019212542.000000Rear-Wheel Drive2000Alfa RomeoSpider Veloce 200002013-01-01 00:00:00-05:001985False5True
191114124.898438Rear-Wheel Drive3850FerrariTestarossa02013-01-01 00:00:00-05:001985False5False
223273342.199219Front-Wheel Drive1550DodgeCharger02013-01-01 00:00:00-05:001985False5True
310111285.199219Rear-Wheel Drive3850DodgeB150/B250 Wagon 2WD02013-01-01 00:00:00-05:001985True3NaN
417192342.1992194-Wheel or All-Wheel Drive2700SubaruLegacy AWD Turbo02013-01-01 00:00:00-05:001993False5True
521222441.799805Front-Wheel Drive1900SubaruLoyale02013-01-01 00:00:00-05:001993True3True
622252941.799805Front-Wheel Drive1700SubaruLoyale02013-01-01 00:00:00-05:001993False5True
723242641.599609Front-Wheel Drive1750ToyotaCorolla02013-01-01 00:00:00-05:001993True3True
823263141.599609Front-Wheel Drive1600ToyotaCorolla02013-01-01 00:00:00-05:001993False5True
923253041.799805Front-Wheel Drive1700ToyotaCorolla02013-01-01 00:00:00-05:001993True4True
................................................
4113418202442.099609Front-Wheel Drive2100Saab90002013-01-01 00:00:00-05:001993False5True
4113523263341.900391Front-Wheel Drive1600SaturnSL02013-01-01 00:00:00-05:001993True4True
4113621243041.900391Front-Wheel Drive1750SaturnSL02013-01-01 00:00:00-05:001993True4True
4113724283341.900391Front-Wheel Drive1500SaturnSL02013-01-01 00:00:00-05:001993False5True
4113821253241.900391Front-Wheel Drive1700SaturnSL02013-01-01 00:00:00-05:001993False5True
4113919222642.199219Front-Wheel Drive1900SubaruLegacy02013-01-01 00:00:00-05:001993True4True
4114020232842.199219Front-Wheel Drive1850SubaruLegacy02013-01-01 00:00:00-05:001993False5True
4114118212442.1992194-Wheel or All-Wheel Drive2000SubaruLegacy AWD02013-01-01 00:00:00-05:001993True4True
4114218212442.1992194-Wheel or All-Wheel Drive2000SubaruLegacy AWD02013-01-01 00:00:00-05:001993False5True
4114316182142.1992194-Wheel or All-Wheel Drive2900SubaruLegacy AWD Turbo02013-01-01 00:00:00-05:001993True4True
\n

41144 rows × 15 columns

\n
# a glorious function
def tweak_autos(autos):
    """Return a trimmed, memory-friendly version of the raw ``autos`` frame.

    Selects a fixed subset of columns, fills missing values, derives the
    ``automatic``, ``speeds`` and ``ffs`` columns from the ``trany`` and
    ``eng_dscr`` text columns, downcasts the numeric columns, and finally
    drops the two text columns.  Every step returns a new object, so the
    ``autos`` argument itself is not modified.

    Parameters
    ----------
    autos : pandas.DataFrame
        Raw vehicles frame; must contain every column listed in ``cols``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame: the ``cols`` columns minus ``trany`` and
        ``eng_dscr``, plus ``automatic``, ``speeds`` and ``ffs``.
    """
    cols = ['city08', 'comb08', 'highway08', 'cylinders', 'displ', 'drive', 'eng_dscr',
            'fuelCost08', 'make', 'model', 'trany', 'range', 'createdOn', 'year']
    return (autos
        [cols]
        .assign(cylinders=autos.cylinders.fillna(0).astype('int8'),
                displ=autos.displ.fillna(0).astype('float16'),
                drive=autos.drive.fillna('Other').astype('category'),
                # Stays NaN where trany is missing (str.contains propagates NaN).
                automatic=autos.trany.str.contains('Auto'),
                # Capture the whole run of digits.  The earlier pattern,
                # (\d)+, repeated a one-digit group, so only the LAST digit
                # survived ('Automatic 10-spd' -> '0').  Rows whose trany has
                # no digit at all get the placeholder value 20.
                speeds=autos.trany.str.extract(r'(\d+)').fillna('20').astype('int8'),
                # Swap the ' EST'/' EDT' abbreviations for numeric offsets
                # before parsing, normalise to UTC, then convert to
                # New York local time.
                createdOn=pd.to_datetime(autos.createdOn.replace({' EDT': '-04:00',
                          ' EST': '-05:00'}, regex=True), utc=True).dt.tz_convert('America/New_York'),
                # Stays NaN where eng_dscr is missing.
                ffs=autos.eng_dscr.str.contains('FFS')
               )
        .astype({'highway08': 'int8', 'city08': 'int16', 'comb08': 'int16', 'fuelCost08': 'int16',
                 'range': 'int16', 'year': 'int16', 'make': 'category'})
        # The raw text columns are only needed to derive the columns above.
        .drop(columns=['trany', 'eng_dscr'])
    )

tweak_autos(autos)
"markdown", 1331 | "metadata": { 1332 | "pycharm": { 1333 | "name": "#%% md\n" 1334 | } 1335 | }, 1336 | "source": [ 1337 | "## Chain\n", 1338 | "\n", 1339 | "Chaining is also called \"flow\" programming. Rather than making intermediate variables, just leverage the fact that most operations return a new object and work on that.\n", 1340 | "\n", 1341 | "The chain should read like a recipe of ordered steps.\n", 1342 | "\n", 1343 | "(BTW, this is actually what we did above.)\n", 1344 | "\n", 1345 | "
\n", 1346 | " Hint: Leverage .pipe if you can't find a way to chain 😉🐼💪\n", 1347 | "
def tweak_autos(autos):
    """Clean the raw ``autos`` frame in one chain of ordered steps.

    The chain reads like a recipe:

    1. keep only the columns named in ``cols``;
    2. ``assign`` the filled / derived columns (``cylinders``, ``displ``,
       ``drive``, ``automatic``, ``speeds``, ``createdOn``, ``ffs``);
    3. downcast the remaining numeric columns and make ``make`` categorical;
    4. drop ``trany`` and ``eng_dscr``, which were only needed as inputs.

    Parameters
    ----------
    autos : pandas.DataFrame
        Raw vehicles frame containing every column listed in ``cols``.
        It is not modified; each step returns a new object.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.
    """
    cols = ['city08', 'comb08', 'highway08', 'cylinders', 'displ', 'drive', 'eng_dscr',
            'fuelCost08', 'make', 'model', 'trany', 'range', 'createdOn', 'year']
    return (autos
        [cols]
        .assign(cylinders=autos.cylinders.fillna(0).astype('int8'),
                displ=autos.displ.fillna(0).astype('float16'),
                drive=autos.drive.fillna('Other').astype('category'),
                automatic=autos.trany.str.contains('Auto'),
                # (\d+) grabs the complete number.  The former (\d)+ repeated
                # a single-digit group, and a repeated group only reports its
                # last repetition, so '10-spd' was read as 0.  Transmissions
                # with no digit in the description fall back to 20.
                speeds=autos.trany.str.extract(r'(\d+)').fillna('20').astype('int8'),
                # Replace the zone abbreviations with explicit UTC offsets,
                # parse as UTC, then express the result in New York time.
                createdOn=pd.to_datetime(autos.createdOn.replace({' EDT': '-04:00',
                          ' EST': '-05:00'}, regex=True), utc=True).dt.tz_convert('America/New_York'),
                ffs=autos.eng_dscr.str.contains('FFS')
               )
        .astype({'highway08': 'int8', 'city08': 'int16', 'comb08': 'int16', 'fuelCost08': 'int16',
                 'range': 'int16', 'year': 'int16', 'make': 'category'})
        .drop(columns=['trany', 'eng_dscr'])
    )

tweak_autos(autos)
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", 1404 | " a1['cylinders'] = cyls2\n", 1405 | "C:\\Users\\matt\\AppData\\Local\\Temp\\ipykernel_16404\\3557971291.py:9: SettingWithCopyWarning: \n", 1406 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 1407 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 1408 | "\n", 1409 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", 1410 | " a1.displ = displ3\n", 1411 | "C:\\Users\\matt\\AppData\\Local\\Temp\\ipykernel_16404\\3557971291.py:10: SettingWithCopyWarning: \n", 1412 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 1413 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 1414 | "\n", 1415 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", 1416 | " a1.drive = autos.drive.fillna('Other').astype('category')\n", 1417 | "C:\\Users\\matt\\AppData\\Local\\Temp\\ipykernel_16404\\3557971291.py:11: SettingWithCopyWarning: \n", 1418 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 1419 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 1420 | "\n", 1421 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", 1422 | " a1['automatic'] = autos.trany.str.contains('Auto')\n", 1423 | "C:\\Users\\matt\\AppData\\Local\\Temp\\ipykernel_16404\\3557971291.py:15: SettingWithCopyWarning: \n", 1424 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 1425 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 1426 | "\n", 1427 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", 1428 
| " a1['speeds'] = speedint\n", 1429 | "C:\\Users\\matt\\AppData\\Roaming\\JetBrains\\DataSpell2022.1\\projects\\workspace\\venv\\lib\\site-packages\\dateutil\\parser\\_parser.py:1207: UnknownTimezoneWarning: tzname EST identified but not understood. Pass `tzinfos` argument in order to correctly return a timezone-aware datetime. In a future version, this will raise an exception.\n", 1430 | " warnings.warn(\"tzname {tzname} identified but not understood. \"\n", 1431 | "C:\\Users\\matt\\AppData\\Roaming\\JetBrains\\DataSpell2022.1\\projects\\workspace\\venv\\lib\\site-packages\\dateutil\\parser\\_parser.py:1207: UnknownTimezoneWarning: tzname EDT identified but not understood. Pass `tzinfos` argument in order to correctly return a timezone-aware datetime. In a future version, this will raise an exception.\n", 1432 | " warnings.warn(\"tzname {tzname} identified but not understood. \"\n", 1433 | "C:\\Users\\matt\\AppData\\Local\\Temp\\ipykernel_16404\\3557971291.py:16: SettingWithCopyWarning: \n", 1434 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 1435 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 1436 | "\n", 1437 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", 1438 | " a1.createdOn=pd.to_datetime(autos.createdOn).dt.tz_localize('America/New_York')\n", 1439 | "C:\\Users\\matt\\AppData\\Local\\Temp\\ipykernel_16404\\3557971291.py:17: UserWarning: Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access\n", 1440 | " a1.ffs=autos.eng_dscr.str.contains('FFS')\n", 1441 | "C:\\Users\\matt\\AppData\\Local\\Temp\\ipykernel_16404\\3557971291.py:18: SettingWithCopyWarning: \n", 1442 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 1443 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 1444 | "\n", 1445 | "See 
the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", 1446 | " a1['highway08'] = autos.highway08.astype('int8')\n", 1447 | "C:\\Users\\matt\\AppData\\Local\\Temp\\ipykernel_16404\\3557971291.py:19: SettingWithCopyWarning: \n", 1448 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 1449 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 1450 | "\n", 1451 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", 1452 | " a1['city08'] = autos.city08.astype('int8')\n", 1453 | "C:\\Users\\matt\\AppData\\Local\\Temp\\ipykernel_16404\\3557971291.py:20: SettingWithCopyWarning: \n", 1454 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 1455 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 1456 | "\n", 1457 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", 1458 | " a1['comb08'] = autos.comb08.astype('int16')\n", 1459 | "C:\\Users\\matt\\AppData\\Local\\Temp\\ipykernel_16404\\3557971291.py:21: SettingWithCopyWarning: \n", 1460 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 1461 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 1462 | "\n", 1463 | "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", 1464 | " a1['fuelCost08'] = autos.fuelCost08.astype('int16')\n", 1465 | "C:\\Users\\matt\\AppData\\Local\\Temp\\ipykernel_16404\\3557971291.py:22: SettingWithCopyWarning: \n", 1466 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 1467 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 1468 | "\n", 1469 | "See the caveats in the documentation: 
# compare chain to this mess
#
# The same clean-up as tweak_autos, written step by step with throw-away
# intermediate variables.  It is kept in this style on purpose so it can be
# contrasted with the chained version, but each step now yields the same
# values the chain does.

# Explicit copy: the column assignments below then write to an independent
# frame rather than to a slice of `autos` (the SettingWithCopyWarning case).
a1 = autos[cols].copy()
cyls = autos.cylinders.fillna(0)
cyls2 = cyls.astype('int8')
a1['cylinders'] = cyls2
displ = a1.displ
displ2 = displ.fillna(0)
displ3 = displ2.astype('float16')
a1['displ'] = displ3
a1['drive'] = autos.drive.fillna('Other').astype('category')
a1['automatic'] = autos.trany.str.contains('Auto')
# (\d+) captures the whole number; (\d)+ kept only the last digit of it.
speed = autos.trany.str.extract(r'(\d+)')
speedfill = speed.fillna('20')  # no digit in trany -> placeholder 20
speedint = speedfill.astype('int8')
a1['speeds'] = speedint
# The date parser does not understand the bare EST/EDT abbreviations (it warns
# that this will become an error), so turn them into numeric offsets first.
created = autos.createdOn.replace({' EDT': '-04:00', ' EST': '-05:00'}, regex=True)
a1['createdOn'] = pd.to_datetime(created, utc=True).dt.tz_convert('America/New_York')
# Bracket assignment is required to create a NEW column; `a1.ffs = ...` only
# sets a Python attribute on the frame and leaves the data unchanged.
a1['ffs'] = autos.eng_dscr.str.contains('FFS')
a1['highway08'] = autos.highway08.astype('int8')
# int16, matching tweak_autos (int8 cannot hold values above 127).
a1['city08'] = autos.city08.astype('int16')
a1['comb08'] = autos.comb08.astype('int16')
a1['fuelCost08'] = autos.fuelCost08.astype('int16')
a1['range'] = autos.range.astype('int16')
a1['make'] = autos.make.astype('category')
a3 = a1.drop(columns=['trany', 'eng_dscr'])
"scrolled": false, 1512 | "pycharm": { 1513 | "name": "#%%\n" 1514 | } 1515 | }, 1516 | "outputs": [ 1517 | { 1518 | "data": { 1519 | "text/plain": " city08 comb08 highway08 cylinders displ \\\n0 19 21 25 4 2.000000 \n1 9 11 14 12 4.898438 \n2 23 27 33 4 2.199219 \n3 10 11 12 8 5.199219 \n4 17 19 23 4 2.199219 \n5 21 22 24 4 1.799805 \n6 22 25 29 4 1.799805 \n7 23 24 26 4 1.599609 \n8 23 26 31 4 1.599609 \n9 23 25 30 4 1.799805 \n... ... ... ... ... ... \n41134 18 20 24 4 2.099609 \n41135 23 26 33 4 1.900391 \n41136 21 24 30 4 1.900391 \n41137 24 28 33 4 1.900391 \n41138 21 25 32 4 1.900391 \n41139 19 22 26 4 2.199219 \n41140 20 23 28 4 2.199219 \n41141 18 21 24 4 2.199219 \n41142 18 21 24 4 2.199219 \n41143 16 18 21 4 2.199219 \n\n drive eng_dscr fuelCost08 make \\\n0 Rear-Wheel Drive (FFS) 2000 Alfa Romeo \n1 Rear-Wheel Drive (GUZZLER) 3850 Ferrari \n2 Front-Wheel Drive (FFS) 1550 Dodge \n3 Rear-Wheel Drive NaN 3850 Dodge \n4 4-Wheel or All-Wheel Drive (FFS,TRBO) 2700 Subaru \n5 Front-Wheel Drive (FFS) 1900 Subaru \n6 Front-Wheel Drive (FFS) 1700 Subaru \n7 Front-Wheel Drive (FFS) 1750 Toyota \n8 Front-Wheel Drive (FFS) 1600 Toyota \n9 Front-Wheel Drive (FFS) 1700 Toyota \n... ... ... ... ... 
\n41134 Front-Wheel Drive (FFS) 2100 Saab \n41135 Front-Wheel Drive (TBI) (FFS) 1600 Saturn \n41136 Front-Wheel Drive (MFI) (FFS) 1750 Saturn \n41137 Front-Wheel Drive (TBI) (FFS) 1500 Saturn \n41138 Front-Wheel Drive (MFI) (FFS) 1700 Saturn \n41139 Front-Wheel Drive (FFS) 1900 Subaru \n41140 Front-Wheel Drive (FFS) 1850 Subaru \n41141 4-Wheel or All-Wheel Drive (FFS) 2000 Subaru \n41142 4-Wheel or All-Wheel Drive (FFS) 2000 Subaru \n41143 4-Wheel or All-Wheel Drive (FFS,TRBO) 2900 Subaru \n\n model trany range createdOn \\\n0 Spider Veloce 2000 Manual 5-spd 0 2013-01-01 00:00:00-05:00 \n1 Testarossa Manual 5-spd 0 2013-01-01 00:00:00-05:00 \n2 Charger Manual 5-spd 0 2013-01-01 00:00:00-05:00 \n3 B150/B250 Wagon 2WD Automatic 3-spd 0 2013-01-01 00:00:00-05:00 \n4 Legacy AWD Turbo Manual 5-spd 0 2013-01-01 00:00:00-05:00 \n5 Loyale Automatic 3-spd 0 2013-01-01 00:00:00-05:00 \n6 Loyale Manual 5-spd 0 2013-01-01 00:00:00-05:00 \n7 Corolla Automatic 3-spd 0 2013-01-01 00:00:00-05:00 \n8 Corolla Manual 5-spd 0 2013-01-01 00:00:00-05:00 \n9 Corolla Automatic 4-spd 0 2013-01-01 00:00:00-05:00 \n... ... ... ... ... \n41134 900 Manual 5-spd 0 2013-01-01 00:00:00-05:00 \n41135 SL Automatic 4-spd 0 2013-01-01 00:00:00-05:00 \n41136 SL Automatic 4-spd 0 2013-01-01 00:00:00-05:00 \n41137 SL Manual 5-spd 0 2013-01-01 00:00:00-05:00 \n41138 SL Manual 5-spd 0 2013-01-01 00:00:00-05:00 \n41139 Legacy Automatic 4-spd 0 2013-01-01 00:00:00-05:00 \n41140 Legacy Manual 5-spd 0 2013-01-01 00:00:00-05:00 \n41141 Legacy AWD Automatic 4-spd 0 2013-01-01 00:00:00-05:00 \n41142 Legacy AWD Manual 5-spd 0 2013-01-01 00:00:00-05:00 \n41143 Legacy AWD Turbo Automatic 4-spd 0 2013-01-01 00:00:00-05:00 \n\n year automatic speeds ffs \n0 1985 False 5 True \n1 1985 False 5 False \n2 1985 False 5 True \n3 1985 True 3 NaN \n4 1993 False 5 True \n5 1993 True 3 True \n6 1993 False 5 True \n7 1993 True 3 True \n8 1993 False 5 True \n9 1993 True 4 True \n... ... ... ... ... 
\n41134 1993 False 5 True \n41135 1993 True 4 True \n41136 1993 True 4 True \n41137 1993 False 5 True \n41138 1993 False 5 True \n41139 1993 True 4 True \n41140 1993 False 5 True \n41141 1993 True 4 True \n41142 1993 False 5 True \n41143 1993 True 4 True \n\n[41144 rows x 17 columns]", 1520 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
city08comb08highway08cylindersdispldriveeng_dscrfuelCost08makemodeltranyrangecreatedOnyearautomaticspeedsffs
019212542.000000Rear-Wheel Drive(FFS)2000Alfa RomeoSpider Veloce 2000Manual 5-spd02013-01-01 00:00:00-05:001985False5True
191114124.898438Rear-Wheel Drive(GUZZLER)3850FerrariTestarossaManual 5-spd02013-01-01 00:00:00-05:001985False5False
223273342.199219Front-Wheel Drive(FFS)1550DodgeChargerManual 5-spd02013-01-01 00:00:00-05:001985False5True
310111285.199219Rear-Wheel DriveNaN3850DodgeB150/B250 Wagon 2WDAutomatic 3-spd02013-01-01 00:00:00-05:001985True3NaN
417192342.1992194-Wheel or All-Wheel Drive(FFS,TRBO)2700SubaruLegacy AWD TurboManual 5-spd02013-01-01 00:00:00-05:001993False5True
521222441.799805Front-Wheel Drive(FFS)1900SubaruLoyaleAutomatic 3-spd02013-01-01 00:00:00-05:001993True3True
622252941.799805Front-Wheel Drive(FFS)1700SubaruLoyaleManual 5-spd02013-01-01 00:00:00-05:001993False5True
723242641.599609Front-Wheel Drive(FFS)1750ToyotaCorollaAutomatic 3-spd02013-01-01 00:00:00-05:001993True3True
823263141.599609Front-Wheel Drive(FFS)1600ToyotaCorollaManual 5-spd02013-01-01 00:00:00-05:001993False5True
923253041.799805Front-Wheel Drive(FFS)1700ToyotaCorollaAutomatic 4-spd02013-01-01 00:00:00-05:001993True4True
......................................................
4113418202442.099609Front-Wheel Drive(FFS)2100Saab900Manual 5-spd02013-01-01 00:00:00-05:001993False5True
4113523263341.900391Front-Wheel Drive(TBI) (FFS)1600SaturnSLAutomatic 4-spd02013-01-01 00:00:00-05:001993True4True
4113621243041.900391Front-Wheel Drive(MFI) (FFS)1750SaturnSLAutomatic 4-spd02013-01-01 00:00:00-05:001993True4True
4113724283341.900391Front-Wheel Drive(TBI) (FFS)1500SaturnSLManual 5-spd02013-01-01 00:00:00-05:001993False5True
4113821253241.900391Front-Wheel Drive(MFI) (FFS)1700SaturnSLManual 5-spd02013-01-01 00:00:00-05:001993False5True
4113919222642.199219Front-Wheel Drive(FFS)1900SubaruLegacyAutomatic 4-spd02013-01-01 00:00:00-05:001993True4True
4114020232842.199219Front-Wheel Drive(FFS)1850SubaruLegacyManual 5-spd02013-01-01 00:00:00-05:001993False5True
4114118212442.1992194-Wheel or All-Wheel Drive(FFS)2000SubaruLegacy AWDAutomatic 4-spd02013-01-01 00:00:00-05:001993True4True
4114218212442.1992194-Wheel or All-Wheel Drive(FFS)2000SubaruLegacy AWDManual 5-spd02013-01-01 00:00:00-05:001993False5True
4114316182142.1992194-Wheel or All-Wheel Drive(FFS,TRBO)2900SubaruLegacy AWD TurboAutomatic 4-spd02013-01-01 00:00:00-05:001993True4True
\n

41144 rows × 17 columns

\n
" 1521 | }, 1522 | "metadata": {}, 1523 | "output_type": "display_data" 1524 | }, 1525 | { 1526 | "data": { 1527 | "text/plain": " city08 comb08 highway08 cylinders displ \\\n0 19 21 25 4 2.000000 \n1 9 11 14 12 4.898438 \n2 23 27 33 4 2.199219 \n3 10 11 12 8 5.199219 \n4 17 19 23 4 2.199219 \n5 21 22 24 4 1.799805 \n6 22 25 29 4 1.799805 \n7 23 24 26 4 1.599609 \n8 23 26 31 4 1.599609 \n9 23 25 30 4 1.799805 \n... ... ... ... ... ... \n41134 18 20 24 4 2.099609 \n41135 23 26 33 4 1.900391 \n41136 21 24 30 4 1.900391 \n41137 24 28 33 4 1.900391 \n41138 21 25 32 4 1.900391 \n41139 19 22 26 4 2.199219 \n41140 20 23 28 4 2.199219 \n41141 18 21 24 4 2.199219 \n41142 18 21 24 4 2.199219 \n41143 16 18 21 4 2.199219 \n\n drive fuelCost08 make \\\n0 Rear-Wheel Drive 2000 Alfa Romeo \n1 Rear-Wheel Drive 3850 Ferrari \n2 Front-Wheel Drive 1550 Dodge \n3 Rear-Wheel Drive 3850 Dodge \n4 4-Wheel or All-Wheel Drive 2700 Subaru \n5 Front-Wheel Drive 1900 Subaru \n6 Front-Wheel Drive 1700 Subaru \n7 Front-Wheel Drive 1750 Toyota \n8 Front-Wheel Drive 1600 Toyota \n9 Front-Wheel Drive 1700 Toyota \n... ... ... ... 
\n41134 Front-Wheel Drive 2100 Saab \n41135 Front-Wheel Drive 1600 Saturn \n41136 Front-Wheel Drive 1750 Saturn \n41137 Front-Wheel Drive 1500 Saturn \n41138 Front-Wheel Drive 1700 Saturn \n41139 Front-Wheel Drive 1900 Subaru \n41140 Front-Wheel Drive 1850 Subaru \n41141 4-Wheel or All-Wheel Drive 2000 Subaru \n41142 4-Wheel or All-Wheel Drive 2000 Subaru \n41143 4-Wheel or All-Wheel Drive 2900 Subaru \n\n model range createdOn year automatic \\\n0 Spider Veloce 2000 0 2013-01-01 00:00:00-05:00 1985 False \n1 Testarossa 0 2013-01-01 00:00:00-05:00 1985 False \n2 Charger 0 2013-01-01 00:00:00-05:00 1985 False \n3 B150/B250 Wagon 2WD 0 2013-01-01 00:00:00-05:00 1985 True \n4 Legacy AWD Turbo 0 2013-01-01 00:00:00-05:00 1993 False \n5 Loyale 0 2013-01-01 00:00:00-05:00 1993 True \n6 Loyale 0 2013-01-01 00:00:00-05:00 1993 False \n7 Corolla 0 2013-01-01 00:00:00-05:00 1993 True \n8 Corolla 0 2013-01-01 00:00:00-05:00 1993 False \n9 Corolla 0 2013-01-01 00:00:00-05:00 1993 True \n... ... ... ... ... ... \n41134 900 0 2013-01-01 00:00:00-05:00 1993 False \n41135 SL 0 2013-01-01 00:00:00-05:00 1993 True \n41136 SL 0 2013-01-01 00:00:00-05:00 1993 True \n41137 SL 0 2013-01-01 00:00:00-05:00 1993 False \n41138 SL 0 2013-01-01 00:00:00-05:00 1993 False \n41139 Legacy 0 2013-01-01 00:00:00-05:00 1993 True \n41140 Legacy 0 2013-01-01 00:00:00-05:00 1993 False \n41141 Legacy AWD 0 2013-01-01 00:00:00-05:00 1993 True \n41142 Legacy AWD 0 2013-01-01 00:00:00-05:00 1993 False \n41143 Legacy AWD Turbo 0 2013-01-01 00:00:00-05:00 1993 True \n\n speeds ffs \n0 5 True \n1 5 False \n2 5 True \n3 3 NaN \n4 5 True \n5 3 True \n6 5 True \n7 3 True \n8 5 True \n9 4 True \n... ... ... \n41134 5 True \n41135 4 True \n41136 4 True \n41137 5 True \n41138 5 True \n41139 4 True \n41140 5 True \n41141 4 True \n41142 5 True \n41143 4 True \n\n[41144 rows x 15 columns]", 1528 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
city08comb08highway08cylindersdispldrivefuelCost08makemodelrangecreatedOnyearautomaticspeedsffs
019212542.000000Rear-Wheel Drive2000Alfa RomeoSpider Veloce 200002013-01-01 00:00:00-05:001985False5True
191114124.898438Rear-Wheel Drive3850FerrariTestarossa02013-01-01 00:00:00-05:001985False5False
223273342.199219Front-Wheel Drive1550DodgeCharger02013-01-01 00:00:00-05:001985False5True
310111285.199219Rear-Wheel Drive3850DodgeB150/B250 Wagon 2WD02013-01-01 00:00:00-05:001985True3NaN
417192342.1992194-Wheel or All-Wheel Drive2700SubaruLegacy AWD Turbo02013-01-01 00:00:00-05:001993False5True
521222441.799805Front-Wheel Drive1900SubaruLoyale02013-01-01 00:00:00-05:001993True3True
622252941.799805Front-Wheel Drive1700SubaruLoyale02013-01-01 00:00:00-05:001993False5True
723242641.599609Front-Wheel Drive1750ToyotaCorolla02013-01-01 00:00:00-05:001993True3True
823263141.599609Front-Wheel Drive1600ToyotaCorolla02013-01-01 00:00:00-05:001993False5True
923253041.799805Front-Wheel Drive1700ToyotaCorolla02013-01-01 00:00:00-05:001993True4True
................................................
4113418202442.099609Front-Wheel Drive2100Saab90002013-01-01 00:00:00-05:001993False5True
4113523263341.900391Front-Wheel Drive1600SaturnSL02013-01-01 00:00:00-05:001993True4True
4113621243041.900391Front-Wheel Drive1750SaturnSL02013-01-01 00:00:00-05:001993True4True
4113724283341.900391Front-Wheel Drive1500SaturnSL02013-01-01 00:00:00-05:001993False5True
4113821253241.900391Front-Wheel Drive1700SaturnSL02013-01-01 00:00:00-05:001993False5True
4113919222642.199219Front-Wheel Drive1900SubaruLegacy02013-01-01 00:00:00-05:001993True4True
4114020232842.199219Front-Wheel Drive1850SubaruLegacy02013-01-01 00:00:00-05:001993False5True
4114118212442.1992194-Wheel or All-Wheel Drive2000SubaruLegacy AWD02013-01-01 00:00:00-05:001993True4True
4114218212442.1992194-Wheel or All-Wheel Drive2000SubaruLegacy AWD02013-01-01 00:00:00-05:001993False5True
4114316182142.1992194-Wheel or All-Wheel Drive2900SubaruLegacy AWD Turbo02013-01-01 00:00:00-05:001993True4True
\n

41144 rows × 15 columns

\n
# easy to debug
#  - assign to var (df3)
#  - comment out
#  - pipe to display


from IPython.display import display

def get_var(df, var_name):
    """Expose an intermediate frame of a chain as a global variable.

    Meant to be used as ``.pipe(get_var, 'name')``: it binds ``df`` to
    ``var_name`` in this module's globals and hands ``df`` back unchanged so
    the chain carries on.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame flowing through the chain at this point.
    var_name : str
        Name of the global variable to create or overwrite.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself (the same object, not a copy).
    """
    globals()[var_name] = df
    return df

def tweak_autos(autos):
    """Clean ``autos`` like the plain chain, with two debugging taps added.

    * After the column selection the intermediate frame is stored in the
      global ``df3`` (via ``get_var``).
    * After ``assign`` the intermediate frame is rendered with ``display``.

    Neither tap changes the data; the returned frame has the ``cols``
    columns minus ``trany`` and ``eng_dscr``, plus ``automatic``,
    ``speeds`` and ``ffs``.

    Parameters
    ----------
    autos : pandas.DataFrame
        Raw vehicles frame containing every column listed in ``cols``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.
    """
    # Defined locally, as in the other tweak_autos definitions, so the
    # function does not depend on a `cols` global left over in the kernel.
    cols = ['city08', 'comb08', 'highway08', 'cylinders', 'displ', 'drive', 'eng_dscr',
            'fuelCost08', 'make', 'model', 'trany', 'range', 'createdOn', 'year']
    return (autos
        [cols]
        # create var
        .pipe(get_var, 'df3')
        .assign(cylinders=autos.cylinders.fillna(0).astype('int8'),
                displ=autos.displ.fillna(0).astype('float16'),
                drive=autos.drive.fillna('Other').astype('category'),
                automatic=autos.trany.str.contains('Auto'),
                # (\d+) captures the whole number; the earlier (\d)+ kept only
                # its last digit ('10-spd' -> 0).  No digit at all -> 20.
                speeds=autos.trany.str.extract(r'(\d+)').fillna('20').astype('int8'),
                createdOn=pd.to_datetime(autos.createdOn.replace({' EDT': '-04:00',
                          ' EST': '-05:00'}, regex=True), utc=True).dt.tz_convert('America/New_York'),
                ffs=autos.eng_dscr.str.contains('FFS')
               )
        # debug pipe: display() returns None, so `or df` passes the frame on
        .pipe(lambda df: display(df) or df)
        .astype({'highway08': 'int8', 'city08': 'int16', 'comb08': 'int16', 'fuelCost08': 'int16',
                 'range': 'int16', 'year': 'int16', 'make': 'category'})
        .drop(columns=['trany', 'eng_dscr'])
    )

tweak_autos(autos)
\n7 23 24 26 4.0 1.6 \n8 23 26 31 4.0 1.6 \n9 23 25 30 4.0 1.8 \n... ... ... ... ... ... \n41134 18 20 24 4.0 2.1 \n41135 23 26 33 4.0 1.9 \n41136 21 24 30 4.0 1.9 \n41137 24 28 33 4.0 1.9 \n41138 21 25 32 4.0 1.9 \n41139 19 22 26 4.0 2.2 \n41140 20 23 28 4.0 2.2 \n41141 18 21 24 4.0 2.2 \n41142 18 21 24 4.0 2.2 \n41143 16 18 21 4.0 2.2 \n\n drive eng_dscr fuelCost08 make \\\n0 Rear-Wheel Drive (FFS) 2000 Alfa Romeo \n1 Rear-Wheel Drive (GUZZLER) 3850 Ferrari \n2 Front-Wheel Drive (FFS) 1550 Dodge \n3 Rear-Wheel Drive NaN 3850 Dodge \n4 4-Wheel or All-Wheel Drive (FFS,TRBO) 2700 Subaru \n5 Front-Wheel Drive (FFS) 1900 Subaru \n6 Front-Wheel Drive (FFS) 1700 Subaru \n7 Front-Wheel Drive (FFS) 1750 Toyota \n8 Front-Wheel Drive (FFS) 1600 Toyota \n9 Front-Wheel Drive (FFS) 1700 Toyota \n... ... ... ... ... \n41134 Front-Wheel Drive (FFS) 2100 Saab \n41135 Front-Wheel Drive (TBI) (FFS) 1600 Saturn \n41136 Front-Wheel Drive (MFI) (FFS) 1750 Saturn \n41137 Front-Wheel Drive (TBI) (FFS) 1500 Saturn \n41138 Front-Wheel Drive (MFI) (FFS) 1700 Saturn \n41139 Front-Wheel Drive (FFS) 1900 Subaru \n41140 Front-Wheel Drive (FFS) 1850 Subaru \n41141 4-Wheel or All-Wheel Drive (FFS) 2000 Subaru \n41142 4-Wheel or All-Wheel Drive (FFS) 2000 Subaru \n41143 4-Wheel or All-Wheel Drive (FFS,TRBO) 2900 Subaru \n\n model trany range \\\n0 Spider Veloce 2000 Manual 5-spd 0 \n1 Testarossa Manual 5-spd 0 \n2 Charger Manual 5-spd 0 \n3 B150/B250 Wagon 2WD Automatic 3-spd 0 \n4 Legacy AWD Turbo Manual 5-spd 0 \n5 Loyale Automatic 3-spd 0 \n6 Loyale Manual 5-spd 0 \n7 Corolla Automatic 3-spd 0 \n8 Corolla Manual 5-spd 0 \n9 Corolla Automatic 4-spd 0 \n... ... ... ... 
\n41134 900 Manual 5-spd 0 \n41135 SL Automatic 4-spd 0 \n41136 SL Automatic 4-spd 0 \n41137 SL Manual 5-spd 0 \n41138 SL Manual 5-spd 0 \n41139 Legacy Automatic 4-spd 0 \n41140 Legacy Manual 5-spd 0 \n41141 Legacy AWD Automatic 4-spd 0 \n41142 Legacy AWD Manual 5-spd 0 \n41143 Legacy AWD Turbo Automatic 4-spd 0 \n\n createdOn year \n0 Tue Jan 01 00:00:00 EST 2013 1985 \n1 Tue Jan 01 00:00:00 EST 2013 1985 \n2 Tue Jan 01 00:00:00 EST 2013 1985 \n3 Tue Jan 01 00:00:00 EST 2013 1985 \n4 Tue Jan 01 00:00:00 EST 2013 1993 \n5 Tue Jan 01 00:00:00 EST 2013 1993 \n6 Tue Jan 01 00:00:00 EST 2013 1993 \n7 Tue Jan 01 00:00:00 EST 2013 1993 \n8 Tue Jan 01 00:00:00 EST 2013 1993 \n9 Tue Jan 01 00:00:00 EST 2013 1993 \n... ... ... \n41134 Tue Jan 01 00:00:00 EST 2013 1993 \n41135 Tue Jan 01 00:00:00 EST 2013 1993 \n41136 Tue Jan 01 00:00:00 EST 2013 1993 \n41137 Tue Jan 01 00:00:00 EST 2013 1993 \n41138 Tue Jan 01 00:00:00 EST 2013 1993 \n41139 Tue Jan 01 00:00:00 EST 2013 1993 \n41140 Tue Jan 01 00:00:00 EST 2013 1993 \n41141 Tue Jan 01 00:00:00 EST 2013 1993 \n41142 Tue Jan 01 00:00:00 EST 2013 1993 \n41143 Tue Jan 01 00:00:00 EST 2013 1993 \n\n[41144 rows x 14 columns]", 1585 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
city08comb08highway08cylindersdispldriveeng_dscrfuelCost08makemodeltranyrangecreatedOnyear
01921254.02.0Rear-Wheel Drive(FFS)2000Alfa RomeoSpider Veloce 2000Manual 5-spd0Tue Jan 01 00:00:00 EST 20131985
19111412.04.9Rear-Wheel Drive(GUZZLER)3850FerrariTestarossaManual 5-spd0Tue Jan 01 00:00:00 EST 20131985
22327334.02.2Front-Wheel Drive(FFS)1550DodgeChargerManual 5-spd0Tue Jan 01 00:00:00 EST 20131985
31011128.05.2Rear-Wheel DriveNaN3850DodgeB150/B250 Wagon 2WDAutomatic 3-spd0Tue Jan 01 00:00:00 EST 20131985
41719234.02.24-Wheel or All-Wheel Drive(FFS,TRBO)2700SubaruLegacy AWD TurboManual 5-spd0Tue Jan 01 00:00:00 EST 20131993
52122244.01.8Front-Wheel Drive(FFS)1900SubaruLoyaleAutomatic 3-spd0Tue Jan 01 00:00:00 EST 20131993
62225294.01.8Front-Wheel Drive(FFS)1700SubaruLoyaleManual 5-spd0Tue Jan 01 00:00:00 EST 20131993
72324264.01.6Front-Wheel Drive(FFS)1750ToyotaCorollaAutomatic 3-spd0Tue Jan 01 00:00:00 EST 20131993
82326314.01.6Front-Wheel Drive(FFS)1600ToyotaCorollaManual 5-spd0Tue Jan 01 00:00:00 EST 20131993
92325304.01.8Front-Wheel Drive(FFS)1700ToyotaCorollaAutomatic 4-spd0Tue Jan 01 00:00:00 EST 20131993
.............................................
411341820244.02.1Front-Wheel Drive(FFS)2100Saab900Manual 5-spd0Tue Jan 01 00:00:00 EST 20131993
411352326334.01.9Front-Wheel Drive(TBI) (FFS)1600SaturnSLAutomatic 4-spd0Tue Jan 01 00:00:00 EST 20131993
411362124304.01.9Front-Wheel Drive(MFI) (FFS)1750SaturnSLAutomatic 4-spd0Tue Jan 01 00:00:00 EST 20131993
411372428334.01.9Front-Wheel Drive(TBI) (FFS)1500SaturnSLManual 5-spd0Tue Jan 01 00:00:00 EST 20131993
411382125324.01.9Front-Wheel Drive(MFI) (FFS)1700SaturnSLManual 5-spd0Tue Jan 01 00:00:00 EST 20131993
411391922264.02.2Front-Wheel Drive(FFS)1900SubaruLegacyAutomatic 4-spd0Tue Jan 01 00:00:00 EST 20131993
411402023284.02.2Front-Wheel Drive(FFS)1850SubaruLegacyManual 5-spd0Tue Jan 01 00:00:00 EST 20131993
411411821244.02.24-Wheel or All-Wheel Drive(FFS)2000SubaruLegacy AWDAutomatic 4-spd0Tue Jan 01 00:00:00 EST 20131993
411421821244.02.24-Wheel or All-Wheel Drive(FFS)2000SubaruLegacy AWDManual 5-spd0Tue Jan 01 00:00:00 EST 20131993
411431618214.02.24-Wheel or All-Wheel Drive(FFS,TRBO)2900SubaruLegacy AWD TurboAutomatic 4-spd0Tue Jan 01 00:00:00 EST 20131993
\n

41144 rows × 14 columns

\n
" 1586 | }, 1587 | "execution_count": 40, 1588 | "metadata": {}, 1589 | "output_type": "execute_result" 1590 | } 1591 | ], 1592 | "source": [ 1593 | "# inspect intermediate data frame\n", 1594 | "df3" 1595 | ] 1596 | }, 1597 | { 1598 | "cell_type": "code", 1599 | "execution_count": null, 1600 | "metadata": { 1601 | "pycharm": { 1602 | "name": "#%%\n" 1603 | } 1604 | }, 1605 | "outputs": [], 1606 | "source": [] 1607 | }, 1608 | { 1609 | "cell_type": "code", 1610 | "execution_count": null, 1611 | "metadata": { 1612 | "pycharm": { 1613 | "name": "#%%\n" 1614 | } 1615 | }, 1616 | "outputs": [], 1617 | "source": [] 1618 | }, 1619 | { 1620 | "cell_type": "code", 1621 | "execution_count": null, 1622 | "metadata": { 1623 | "pycharm": { 1624 | "name": "#%%\n" 1625 | } 1626 | }, 1627 | "outputs": [], 1628 | "source": [] 1629 | }, 1630 | { 1631 | "cell_type": "markdown", 1632 | "metadata": { 1633 | "pycharm": { 1634 | "name": "#%% md\n" 1635 | } 1636 | }, 1637 | "source": [ 1638 | "## Don't Mutate\n", 1639 | "\n", 1640 | "> \"you are missing the point, inplace rarely actually does something inplace, you are thinking that you are saving memory but you are not.\"\n", 1641 | ">\n", 1642 | "> **jreback** - Pandas core dev\n", 1643 | "\n", 1644 | "\n", 1645 | "\n", 1646 | "https://github.com/pandas-dev/pandas/issues/16529#issuecomment-676518136\n", 1647 | "\n", 1648 | "* In general, no performance benefits\n", 1649 | "* Prohibits chaining\n", 1650 | "* ``SettingWithCopyWarning`` fun\n" 1651 | ] 1652 | }, 1653 | { 1654 | "cell_type": "code", 1655 | "execution_count": null, 1656 | "metadata": { 1657 | "pycharm": { 1658 | "name": "#%%\n" 1659 | } 1660 | }, 1661 | "outputs": [], 1662 | "source": [] 1663 | }, 1664 | { 1665 | "cell_type": "code", 1666 | "execution_count": null, 1667 | "metadata": { 1668 | "pycharm": { 1669 | "name": "#%%\n" 1670 | } 1671 | }, 1672 | "outputs": [], 1673 | "source": [] 1674 | }, 1675 | { 1676 | "cell_type": "code", 1677 | "execution_count": null, 1678 | 
"metadata": { 1679 | "pycharm": { 1680 | "name": "#%%\n" 1681 | } 1682 | }, 1683 | "outputs": [], 1684 | "source": [] 1685 | }, 1686 | { 1687 | "cell_type": "markdown", 1688 | "metadata": { 1689 | "pycharm": { 1690 | "name": "#%% md\n" 1691 | } 1692 | }, 1693 | "source": [ 1694 | "## Don't Apply (if you can)" 1695 | ] 1696 | }, 1697 | { 1698 | "cell_type": "code", 1699 | "execution_count": null, 1700 | "metadata": { 1701 | "pycharm": { 1702 | "name": "#%%\n" 1703 | } 1704 | }, 1705 | "outputs": [], 1706 | "source": [ 1707 | "def tweak_autos(autos):\n", 1708 | " return (autos\n", 1709 | " [cols]\n", 1710 | " .assign(cylinders=autos.cylinders.fillna(0).astype('int8'),\n", 1711 | " displ=autos.displ.fillna(0).astype('float16'),\n", 1712 | " drive=autos.drive.fillna('Other').astype('category'),\n", 1713 | " automatic=autos.trany.str.contains('Auto'),\n", 1714 | " speeds=autos.trany.str.extract(r'(\\d)+').fillna('20').astype('int8'),\n", 1715 | " createdOn=pd.to_datetime(autos.createdOn.replace({' EDT': '-04:00',\n", 1716 | " ' EST': '-05:00'}, regex=True), utc=True).dt.tz_convert('America/New_York'),\n", 1717 | " ffs=autos.eng_dscr.str.contains('FFS')\n", 1718 | " )\n", 1719 | " .astype({'highway08': 'int8', 'city08': 'int16', 'comb08': 'int16', 'fuelCost08': 'int16',\n", 1720 | " 'range': 'int16', 'year': 'int16', 'make': 'category'})\n", 1721 | " .drop(columns=['trany', 'eng_dscr'])\n", 1722 | " )\n", 1723 | "\n", 1724 | "\n", 1725 | "autos2 = tweak_autos(autos)" 1726 | ] 1727 | }, 1728 | { 1729 | "cell_type": "code", 1730 | "execution_count": null, 1731 | "metadata": { 1732 | "scrolled": true, 1733 | "pycharm": { 1734 | "name": "#%%\n" 1735 | } 1736 | }, 1737 | "outputs": [], 1738 | "source": [ 1739 | "# try to me more Euro-centric\n", 1740 | "def to_lper100km(val):\n", 1741 | " return 235.215 / val\n", 1742 | "autos2.city08.apply(to_lper100km)" 1743 | ] 1744 | }, 1745 | { 1746 | "cell_type": "code", 1747 | "execution_count": null, 1748 | "metadata": { 1749 | 
"scrolled": true, 1750 | "pycharm": { 1751 | "name": "#%%\n" 1752 | } 1753 | }, 1754 | "outputs": [], 1755 | "source": [ 1756 | "# this gives the sames results\n", 1757 | "235.215 / autos2.city08 " 1758 | ] 1759 | }, 1760 | { 1761 | "cell_type": "code", 1762 | "execution_count": null, 1763 | "metadata": { 1764 | "pycharm": { 1765 | "name": "#%%\n" 1766 | } 1767 | }, 1768 | "outputs": [], 1769 | "source": [ 1770 | "%%timeit\n", 1771 | "autos2.city08.apply(to_lper100km)" 1772 | ] 1773 | }, 1774 | { 1775 | "cell_type": "code", 1776 | "execution_count": null, 1777 | "metadata": { 1778 | "pycharm": { 1779 | "name": "#%%\n" 1780 | } 1781 | }, 1782 | "outputs": [], 1783 | "source": [ 1784 | "%%timeit\n", 1785 | "235.215 / autos2.city08 " 1786 | ] 1787 | }, 1788 | { 1789 | "cell_type": "code", 1790 | "execution_count": null, 1791 | "metadata": { 1792 | "pycharm": { 1793 | "name": "#%%\n" 1794 | } 1795 | }, 1796 | "outputs": [], 1797 | "source": [ 1798 | "# ~50x slower!\n", 1799 | "6_220 / 110" 1800 | ] 1801 | }, 1802 | { 1803 | "cell_type": "code", 1804 | "execution_count": null, 1805 | "metadata": { 1806 | "pycharm": { 1807 | "name": "#%%\n" 1808 | } 1809 | }, 1810 | "outputs": [], 1811 | "source": [ 1812 | "def is_american(val):\n", 1813 | " return val in {'Chevrolet', 'Ford', 'Dodge', 'GMC', 'Tesla'}" 1814 | ] 1815 | }, 1816 | { 1817 | "cell_type": "code", 1818 | "execution_count": null, 1819 | "metadata": { 1820 | "pycharm": { 1821 | "name": "#%%\n" 1822 | } 1823 | }, 1824 | "outputs": [], 1825 | "source": [ 1826 | "%%timeit\n", 1827 | "autos2.make.apply(is_american)" 1828 | ] 1829 | }, 1830 | { 1831 | "cell_type": "code", 1832 | "execution_count": null, 1833 | "metadata": { 1834 | "pycharm": { 1835 | "name": "#%%\n" 1836 | } 1837 | }, 1838 | "outputs": [], 1839 | "source": [ 1840 | "%%timeit\n", 1841 | "autos2.make.isin({'Chevrolet', 'Ford', 'Dodge', 'GMC', 'Tesla'})" 1842 | ] 1843 | }, 1844 | { 1845 | "cell_type": "code", 1846 | "execution_count": null, 1847 | 
"metadata": { 1848 | "pycharm": { 1849 | "name": "#%%\n" 1850 | } 1851 | }, 1852 | "outputs": [], 1853 | "source": [ 1854 | "autos3 = autos2.assign(make=autos2.make.astype(str))" 1855 | ] 1856 | }, 1857 | { 1858 | "cell_type": "code", 1859 | "execution_count": null, 1860 | "metadata": { 1861 | "pycharm": { 1862 | "name": "#%%\n" 1863 | } 1864 | }, 1865 | "outputs": [], 1866 | "source": [ 1867 | "%%timeit\n", 1868 | "# converted to string\n", 1869 | "autos3.make.isin({'Chevrolet', 'Ford', 'Dodge', 'GMC', 'Tesla'})" 1870 | ] 1871 | }, 1872 | { 1873 | "cell_type": "code", 1874 | "execution_count": null, 1875 | "metadata": { 1876 | "pycharm": { 1877 | "name": "#%%\n" 1878 | } 1879 | }, 1880 | "outputs": [], 1881 | "source": [ 1882 | "%%timeit\n", 1883 | "autos3.make.apply(is_american)" 1884 | ] 1885 | }, 1886 | { 1887 | "cell_type": "code", 1888 | "execution_count": null, 1889 | "metadata": { 1890 | "pycharm": { 1891 | "name": "#%%\n" 1892 | } 1893 | }, 1894 | "outputs": [], 1895 | "source": [ 1896 | "def country(val):\n", 1897 | " if val in {'Chevrolet', 'Ford', 'Dodge', 'GMC', 'Tesla'}:\n", 1898 | " return 'US'\n", 1899 | " return 'Other'" 1900 | ] 1901 | }, 1902 | { 1903 | "cell_type": "code", 1904 | "execution_count": null, 1905 | "metadata": { 1906 | "scrolled": true, 1907 | "pycharm": { 1908 | "name": "#%%\n" 1909 | } 1910 | }, 1911 | "outputs": [], 1912 | "source": [ 1913 | "%%timeit\n", 1914 | "# Might be ok for strings, since they are not vectorized...\n", 1915 | "(autos2\n", 1916 | " .assign(country=autos2.make.apply(country))\n", 1917 | ")" 1918 | ] 1919 | }, 1920 | { 1921 | "cell_type": "code", 1922 | "execution_count": null, 1923 | "metadata": { 1924 | "pycharm": { 1925 | "name": "#%%\n" 1926 | } 1927 | }, 1928 | "outputs": [], 1929 | "source": [ 1930 | "%%timeit\n", 1931 | "values = {'Chevrolet', 'Ford', 'Dodge', 'GMC', 'Tesla'}\n", 1932 | "(autos2\n", 1933 | " .assign(country='US')\n", 1934 | " .assign(country=lambda 
df_:df_.country.where(df_.make.isin(values), 'Other'))\n", 1935 | ")" 1936 | ] 1937 | }, 1938 | { 1939 | "cell_type": "code", 1940 | "execution_count": null, 1941 | "metadata": { 1942 | "pycharm": { 1943 | "name": "#%%\n" 1944 | } 1945 | }, 1946 | "outputs": [], 1947 | "source": [ 1948 | "%%timeit\n", 1949 | "\n", 1950 | "(autos2\n", 1951 | " .assign(country=np.select([autos2.make.isin({'Chevrolet', 'Ford', 'Dodge', 'GMC', 'Tesla'})], \n", 1952 | " ['US'], 'Other'))\n", 1953 | ")" 1954 | ] 1955 | }, 1956 | { 1957 | "cell_type": "code", 1958 | "execution_count": null, 1959 | "metadata": { 1960 | "lines_to_next_cell": 0, 1961 | "pycharm": { 1962 | "name": "#%%\n" 1963 | } 1964 | }, 1965 | "outputs": [], 1966 | "source": [ 1967 | "%%timeit\n", 1968 | "\n", 1969 | "(autos2\n", 1970 | " .assign(country=np.where(autos2.make.isin({'Chevrolet', 'Ford', 'Dodge', 'GMC', 'Tesla'}), \n", 1971 | " 'US', 'Other'))\n", 1972 | ")" 1973 | ] 1974 | }, 1975 | { 1976 | "cell_type": "code", 1977 | "execution_count": null, 1978 | "metadata": { 1979 | "lines_to_next_cell": 2, 1980 | "pycharm": { 1981 | "name": "#%%\n" 1982 | } 1983 | }, 1984 | "outputs": [], 1985 | "source": [] 1986 | }, 1987 | { 1988 | "cell_type": "code", 1989 | "execution_count": null, 1990 | "metadata": { 1991 | "pycharm": { 1992 | "name": "#%%\n" 1993 | } 1994 | }, 1995 | "outputs": [], 1996 | "source": [] 1997 | }, 1998 | { 1999 | "cell_type": "code", 2000 | "execution_count": null, 2001 | "metadata": { 2002 | "lines_to_next_cell": 2, 2003 | "pycharm": { 2004 | "name": "#%%\n" 2005 | } 2006 | }, 2007 | "outputs": [], 2008 | "source": [] 2009 | }, 2010 | { 2011 | "cell_type": "code", 2012 | "execution_count": null, 2013 | "metadata": { 2014 | "pycharm": { 2015 | "name": "#%%\n" 2016 | } 2017 | }, 2018 | "outputs": [], 2019 | "source": [] 2020 | }, 2021 | { 2022 | "cell_type": "code", 2023 | "execution_count": null, 2024 | "metadata": { 2025 | "pycharm": { 2026 | "name": "#%%\n" 2027 | } 2028 | }, 2029 | "outputs": 
[], 2030 | "source": [] 2031 | }, 2032 | { 2033 | "cell_type": "code", 2034 | "execution_count": null, 2035 | "metadata": { 2036 | "pycharm": { 2037 | "name": "#%%\n" 2038 | } 2039 | }, 2040 | "outputs": [], 2041 | "source": [] 2042 | }, 2043 | { 2044 | "cell_type": "code", 2045 | "execution_count": null, 2046 | "metadata": { 2047 | "pycharm": { 2048 | "name": "#%%\n" 2049 | } 2050 | }, 2051 | "outputs": [], 2052 | "source": [] 2053 | }, 2054 | { 2055 | "cell_type": "code", 2056 | "execution_count": null, 2057 | "metadata": { 2058 | "pycharm": { 2059 | "name": "#%%\n" 2060 | } 2061 | }, 2062 | "outputs": [], 2063 | "source": [] 2064 | }, 2065 | { 2066 | "cell_type": "code", 2067 | "execution_count": null, 2068 | "metadata": { 2069 | "lines_to_next_cell": 2, 2070 | "pycharm": { 2071 | "name": "#%%\n" 2072 | } 2073 | }, 2074 | "outputs": [], 2075 | "source": [] 2076 | }, 2077 | { 2078 | "cell_type": "code", 2079 | "execution_count": null, 2080 | "metadata": { 2081 | "lines_to_next_cell": 2, 2082 | "pycharm": { 2083 | "name": "#%%\n" 2084 | } 2085 | }, 2086 | "outputs": [], 2087 | "source": [] 2088 | }, 2089 | { 2090 | "cell_type": "code", 2091 | "execution_count": null, 2092 | "metadata": { 2093 | "pycharm": { 2094 | "name": "#%%\n" 2095 | } 2096 | }, 2097 | "outputs": [], 2098 | "source": [] 2099 | }, 2100 | { 2101 | "cell_type": "code", 2102 | "execution_count": null, 2103 | "metadata": { 2104 | "pycharm": { 2105 | "name": "#%%\n" 2106 | } 2107 | }, 2108 | "outputs": [], 2109 | "source": [] 2110 | }, 2111 | { 2112 | "cell_type": "code", 2113 | "execution_count": null, 2114 | "metadata": { 2115 | "pycharm": { 2116 | "name": "#%%\n" 2117 | } 2118 | }, 2119 | "outputs": [], 2120 | "source": [] 2121 | }, 2122 | { 2123 | "cell_type": "code", 2124 | "execution_count": null, 2125 | "metadata": { 2126 | "pycharm": { 2127 | "name": "#%%\n" 2128 | } 2129 | }, 2130 | "outputs": [], 2131 | "source": [] 2132 | }, 2133 | { 2134 | "cell_type": "code", 2135 | "execution_count": 
null, 2136 | "metadata": { 2137 | "pycharm": { 2138 | "name": "#%%\n" 2139 | } 2140 | }, 2141 | "outputs": [], 2142 | "source": [] 2143 | }, 2144 | { 2145 | "cell_type": "code", 2146 | "execution_count": null, 2147 | "metadata": { 2148 | "pycharm": { 2149 | "name": "#%%\n" 2150 | } 2151 | }, 2152 | "outputs": [], 2153 | "source": [] 2154 | }, 2155 | { 2156 | "cell_type": "code", 2157 | "execution_count": null, 2158 | "metadata": { 2159 | "lines_to_next_cell": 2, 2160 | "pycharm": { 2161 | "name": "#%%\n" 2162 | } 2163 | }, 2164 | "outputs": [], 2165 | "source": [] 2166 | }, 2167 | { 2168 | "cell_type": "code", 2169 | "execution_count": null, 2170 | "metadata": { 2171 | "pycharm": { 2172 | "name": "#%%\n" 2173 | } 2174 | }, 2175 | "outputs": [], 2176 | "source": [] 2177 | }, 2178 | { 2179 | "cell_type": "markdown", 2180 | "metadata": { 2181 | "pycharm": { 2182 | "name": "#%% md\n" 2183 | } 2184 | }, 2185 | "source": [ 2186 | "## Master Aggregation\n", 2187 | "\n", 2188 | "Let's compare mileage by country by year...🤔" 2189 | ] 2190 | }, 2191 | { 2192 | "cell_type": "code", 2193 | "execution_count": null, 2194 | "metadata": { 2195 | "scrolled": true, 2196 | "pycharm": { 2197 | "name": "#%%\n" 2198 | } 2199 | }, 2200 | "outputs": [], 2201 | "source": [ 2202 | "(autos2\n", 2203 | " .groupby('year')\n", 2204 | " .mean()\n", 2205 | ")" 2206 | ] 2207 | }, 2208 | { 2209 | "cell_type": "code", 2210 | "execution_count": null, 2211 | "metadata": { 2212 | "scrolled": true, 2213 | "pycharm": { 2214 | "name": "#%%\n" 2215 | } 2216 | }, 2217 | "outputs": [], 2218 | "source": [ 2219 | "# watch order of column filtering/aggregation\n", 2220 | "(autos2\n", 2221 | " .groupby('year')\n", 2222 | " [['comb08', 'speeds']]\n", 2223 | " .mean()\n", 2224 | ")" 2225 | ] 2226 | }, 2227 | { 2228 | "cell_type": "code", 2229 | "execution_count": null, 2230 | "metadata": { 2231 | "scrolled": true, 2232 | "pycharm": { 2233 | "name": "#%%\n" 2234 | } 2235 | }, 2236 | "outputs": [], 2237 | "source": 
[ 2238 | "%%timeit\n", 2239 | "# watch order of column filtering/aggregation\n", 2240 | "(autos2\n", 2241 | " .groupby('year')\n", 2242 | " [['comb08', 'speeds']]\n", 2243 | " .mean()\n", 2244 | ")" 2245 | ] 2246 | }, 2247 | { 2248 | "cell_type": "code", 2249 | "execution_count": null, 2250 | "metadata": { 2251 | "scrolled": true, 2252 | "pycharm": { 2253 | "name": "#%%\n" 2254 | } 2255 | }, 2256 | "outputs": [], 2257 | "source": [ 2258 | "%%timeit\n", 2259 | "# watch order of column filtering/aggregation\n", 2260 | "(autos2\n", 2261 | " .groupby('year')\n", 2262 | " .mean()\n", 2263 | " [['comb08', 'speeds']]\n", 2264 | ")" 2265 | ] 2266 | }, 2267 | { 2268 | "cell_type": "code", 2269 | "execution_count": null, 2270 | "metadata": { 2271 | "pycharm": { 2272 | "name": "#%%\n" 2273 | } 2274 | }, 2275 | "outputs": [], 2276 | "source": [ 2277 | "import matplotlib.pyplot as plt\n", 2278 | "import seaborn as sns\n", 2279 | "plt.style.use('pandas1book') \n", 2280 | "sns.set_context('talk')\n", 2281 | "plt.plot(range(10))" 2282 | ] 2283 | }, 2284 | { 2285 | "cell_type": "code", 2286 | "execution_count": null, 2287 | "metadata": { 2288 | "scrolled": true, 2289 | "pycharm": { 2290 | "name": "#%%\n" 2291 | } 2292 | }, 2293 | "outputs": [], 2294 | "source": [ 2295 | "(autos2\n", 2296 | " .groupby('year')\n", 2297 | " [['comb08', 'speeds']]\n", 2298 | " .mean()\n", 2299 | " .plot()\n", 2300 | ")" 2301 | ] 2302 | }, 2303 | { 2304 | "cell_type": "code", 2305 | "execution_count": null, 2306 | "metadata": { 2307 | "scrolled": true, 2308 | "pycharm": { 2309 | "name": "#%%\n" 2310 | } 2311 | }, 2312 | "outputs": [], 2313 | "source": [ 2314 | "(autos2\n", 2315 | " .groupby('year')\n", 2316 | " [['comb08', 'speeds']]\n", 2317 | " #.mean()\n", 2318 | " #.median()\n", 2319 | " .quantile(.3)\n", 2320 | " #.std()\n", 2321 | " #.var()\n", 2322 | " .plot()\n", 2323 | ")" 2324 | ] 2325 | }, 2326 | { 2327 | "cell_type": "code", 2328 | "execution_count": null, 2329 | "metadata": { 2330 | 
"scrolled": true, 2331 | "pycharm": { 2332 | "name": "#%%\n" 2333 | } 2334 | }, 2335 | "outputs": [], 2336 | "source": [ 2337 | "# add country\n", 2338 | "(autos2\n", 2339 | " .assign(country=autos2.make.apply(country))\n", 2340 | " .groupby(['year', 'country'])\n", 2341 | " .mean()\n", 2342 | ")" 2343 | ] 2344 | }, 2345 | { 2346 | "cell_type": "code", 2347 | "execution_count": null, 2348 | "metadata": { 2349 | "scrolled": false, 2350 | "pycharm": { 2351 | "name": "#%%\n" 2352 | } 2353 | }, 2354 | "outputs": [], 2355 | "source": [ 2356 | "# can go deeper and apply multiple aggregates\n", 2357 | "def second_to_last(ser):\n", 2358 | " return ser.iloc[-2]\n", 2359 | "\n", 2360 | "(autos2\n", 2361 | " .assign(country=autos2.make.apply(country))\n", 2362 | " .groupby(['year', 'country'])\n", 2363 | " .agg(['min', 'mean', second_to_last])\n", 2364 | ")" 2365 | ] 2366 | }, 2367 | { 2368 | "cell_type": "code", 2369 | "execution_count": null, 2370 | "metadata": { 2371 | "pycharm": { 2372 | "name": "#%%\n" 2373 | } 2374 | }, 2375 | "outputs": [], 2376 | "source": [ 2377 | "# back to simpler example, adding plots\n", 2378 | "(autos2\n", 2379 | " .assign(country=autos2.make.apply(country))\n", 2380 | " .groupby(['year', 'country'])\n", 2381 | " .mean()\n", 2382 | " .plot()\n", 2383 | ")" 2384 | ] 2385 | }, 2386 | { 2387 | "cell_type": "code", 2388 | "execution_count": null, 2389 | "metadata": { 2390 | "scrolled": true, 2391 | "pycharm": { 2392 | "name": "#%%\n" 2393 | } 2394 | }, 2395 | "outputs": [], 2396 | "source": [ 2397 | "(autos2\n", 2398 | " .assign(country=autos2.make.apply(country))\n", 2399 | " .groupby(['year', 'country'])\n", 2400 | " .mean()\n", 2401 | " .unstack()\n", 2402 | ")" 2403 | ] 2404 | }, 2405 | { 2406 | "cell_type": "code", 2407 | "execution_count": null, 2408 | "metadata": { 2409 | "scrolled": true, 2410 | "pycharm": { 2411 | "name": "#%%\n" 2412 | } 2413 | }, 2414 | "outputs": [], 2415 | "source": [ 2416 | "(autos2\n", 2417 | " 
.assign(country=autos2.make.apply(country))\n", 2418 | " .groupby(['year', 'country'])\n", 2419 | " .mean()\n", 2420 | " #.std()\n", 2421 | " .unstack()\n", 2422 | " .city08\n", 2423 | " .plot()\n", 2424 | " .legend(bbox_to_anchor=(1,1))\n", 2425 | ")" 2426 | ] 2427 | }, 2428 | { 2429 | "cell_type": "code", 2430 | "execution_count": null, 2431 | "metadata": { 2432 | "scrolled": true, 2433 | "pycharm": { 2434 | "name": "#%%\n" 2435 | } 2436 | }, 2437 | "outputs": [], 2438 | "source": [ 2439 | "# smooth it out a bit w/ rolling\n", 2440 | "(autos2\n", 2441 | " .assign(country=autos2.make.apply(country))\n", 2442 | " .groupby(['year', 'country'])\n", 2443 | " .mean()\n", 2444 | " .unstack()\n", 2445 | " .city08\n", 2446 | " .rolling(3)\n", 2447 | " .mean()\n", 2448 | " .plot()\n", 2449 | " .legend(bbox_to_anchor=(1,1))\n", 2450 | ")" 2451 | ] 2452 | }, 2453 | { 2454 | "cell_type": "code", 2455 | "execution_count": null, 2456 | "metadata": { 2457 | "lines_to_next_cell": 2, 2458 | "pycharm": { 2459 | "name": "#%%\n" 2460 | } 2461 | }, 2462 | "outputs": [], 2463 | "source": [] 2464 | }, 2465 | { 2466 | "cell_type": "code", 2467 | "execution_count": null, 2468 | "metadata": { 2469 | "pycharm": { 2470 | "name": "#%%\n" 2471 | } 2472 | }, 2473 | "outputs": [], 2474 | "source": [] 2475 | }, 2476 | { 2477 | "cell_type": "code", 2478 | "execution_count": null, 2479 | "metadata": { 2480 | "pycharm": { 2481 | "name": "#%%\n" 2482 | } 2483 | }, 2484 | "outputs": [], 2485 | "source": [] 2486 | }, 2487 | { 2488 | "cell_type": "code", 2489 | "execution_count": null, 2490 | "metadata": { 2491 | "pycharm": { 2492 | "name": "#%%\n" 2493 | } 2494 | }, 2495 | "outputs": [], 2496 | "source": [] 2497 | }, 2498 | { 2499 | "cell_type": "code", 2500 | "execution_count": null, 2501 | "metadata": { 2502 | "pycharm": { 2503 | "name": "#%%\n" 2504 | } 2505 | }, 2506 | "outputs": [], 2507 | "source": [] 2508 | }, 2509 | { 2510 | "cell_type": "code", 2511 | "execution_count": null, 2512 | 
"metadata": { 2513 | "pycharm": { 2514 | "name": "#%%\n" 2515 | } 2516 | }, 2517 | "outputs": [], 2518 | "source": [] 2519 | }, 2520 | { 2521 | "cell_type": "code", 2522 | "execution_count": null, 2523 | "metadata": { 2524 | "pycharm": { 2525 | "name": "#%%\n" 2526 | } 2527 | }, 2528 | "outputs": [], 2529 | "source": [] 2530 | }, 2531 | { 2532 | "cell_type": "code", 2533 | "execution_count": null, 2534 | "metadata": { 2535 | "pycharm": { 2536 | "name": "#%%\n" 2537 | } 2538 | }, 2539 | "outputs": [], 2540 | "source": [] 2541 | }, 2542 | { 2543 | "cell_type": "code", 2544 | "execution_count": null, 2545 | "metadata": { 2546 | "pycharm": { 2547 | "name": "#%%\n" 2548 | } 2549 | }, 2550 | "outputs": [], 2551 | "source": [] 2552 | }, 2553 | { 2554 | "cell_type": "code", 2555 | "execution_count": null, 2556 | "metadata": { 2557 | "pycharm": { 2558 | "name": "#%%\n" 2559 | } 2560 | }, 2561 | "outputs": [], 2562 | "source": [] 2563 | }, 2564 | { 2565 | "cell_type": "markdown", 2566 | "metadata": { 2567 | "pycharm": { 2568 | "name": "#%% md\n" 2569 | } 2570 | }, 2571 | "source": [ 2572 | "## Summary\n", 2573 | "\n", 2574 | "* Correct types save space and enable convenient math, string, and date functionality\n", 2575 | "* Chaining operations will:\n", 2576 | " * Make code readable\n", 2577 | " * Remove bugs\n", 2578 | " * Easier to debug\n", 2579 | "* Don't mutate (there's no point). Embrace chaining.\n", 2580 | "* ``.apply`` is slow for math\n", 2581 | "* Aggregations are powerful. 
Play with them until they make sense\n", 2582 | "* Upcoming course https://maven.com/matt-harrison/data-analysis-using-pandas\n", 2583 | "\n", 2584 | "Follow me on Twitter ``@__mharrison__``\n", 2585 | "\n", 2586 | "Book giveaway!\n" 2587 | ] 2588 | }, 2589 | { 2590 | "cell_type": "code", 2591 | "execution_count": null, 2592 | "metadata": { 2593 | "pycharm": { 2594 | "name": "#%%\n" 2595 | } 2596 | }, 2597 | "outputs": [], 2598 | "source": [ 2599 | "import random\n", 2600 | "random.randrange(1,13)" 2601 | ] 2602 | }, 2603 | { 2604 | "cell_type": "code", 2605 | "execution_count": null, 2606 | "metadata": { 2607 | "pycharm": { 2608 | "name": "#%%\n" 2609 | } 2610 | }, 2611 | "outputs": [], 2612 | "source": [] 2613 | }, 2614 | { 2615 | "cell_type": "code", 2616 | "execution_count": null, 2617 | "metadata": { 2618 | "pycharm": { 2619 | "name": "#%%\n" 2620 | } 2621 | }, 2622 | "outputs": [], 2623 | "source": [] 2624 | }, 2625 | { 2626 | "cell_type": "code", 2627 | "execution_count": null, 2628 | "metadata": { 2629 | "pycharm": { 2630 | "name": "#%%\n" 2631 | } 2632 | }, 2633 | "outputs": [], 2634 | "source": [] 2635 | }, 2636 | { 2637 | "cell_type": "code", 2638 | "execution_count": null, 2639 | "metadata": { 2640 | "pycharm": { 2641 | "name": "#%%\n" 2642 | } 2643 | }, 2644 | "outputs": [], 2645 | "source": [] 2646 | } 2647 | ], 2648 | "metadata": { 2649 | "jupytext": { 2650 | "encoding": "# -*- coding: utf-8 -*-", 2651 | "formats": "ipynb,py:light" 2652 | }, 2653 | "kernelspec": { 2654 | "display_name": "Python 3", 2655 | "language": "python", 2656 | "name": "python3" 2657 | }, 2658 | "language_info": { 2659 | "codemirror_mode": { 2660 | "name": "ipython", 2661 | "version": 3 2662 | }, 2663 | "file_extension": ".py", 2664 | "mimetype": "text/x-python", 2665 | "name": "python", 2666 | "nbconvert_exporter": "python", 2667 | "pygments_lexer": "ipython3", 2668 | "version": "3.8.5" 2669 | } 2670 | }, 2671 | "nbformat": 4, 2672 | "nbformat_minor": 4 2673 | } 
--------------------------------------------------------------------------------