├── 1.md ├── 2.1.md ├── 2.2.md ├── 2.3.md ├── 2.4.md ├── 2.5.md ├── 2.6.md ├── 2.7.md ├── 2.8.md ├── 2.9.md ├── 2.md ├── 3.1.md ├── 3.10.md ├── 3.11.md ├── 3.3.md ├── 3.4.md ├── 3.5.md ├── 3.6.md ├── 3.7.md ├── 3.8.md ├── 3.9.md └── 3.md /1.md: -------------------------------------------------------------------------------- 1 | # 十分钟搞定 pandas 2 | 3 | > 原文:[10 Minutes to pandas](http://pandas.pydata.org/pandas-docs/stable/10min.html) 4 | 5 | > 译者:[ChaoSimple](http://home.cnblogs.com/u/chaosimple/) 6 | 7 | > 来源:[【原】十分钟搞定pandas](http://www.cnblogs.com/chaosimple/p/4153083.html) 8 | 9 | 官方网站上《10 Minutes to pandas》的一个简单的翻译,原文在[这里](http://pandas.pydata.org/pandas-docs/stable/10min.html)。这篇文章是对 pandas 的一个简单的介绍,详细的介绍请参考:[秘籍](http://pandas.pydata.org/pandas-docs/stable/cookbook.html#cookbook) 。习惯上,我们会按下面格式引入所需要的包: 10 | 11 | ```py 12 | In [1]: import pandas as pd 13 | 14 | In [2]: import numpy as np 15 | 16 | In [3]: import matplotlib.pyplot as plt 17 | ``` 18 | 19 | 20 | # 一、 创建对象 21 | 22 | 可以通过 [数据结构入门](http://pandas.pydata.org/pandas-docs/stable/dsintro.html#dsintro) 来查看有关该节内容的详细信息。 23 | 24 | 1、可以通过传递一个`list`对象来创建一个`Series`,pandas 会默认创建整型索引: 25 | 26 | ```py 27 | In [4]: s = pd.Series([1,3,5,np.nan,6,8]) 28 | 29 | In [5]: s 30 | Out[5]: 31 | 0 1.0 32 | 1 3.0 33 | 2 5.0 34 | 3 NaN 35 | 4 6.0 36 | 5 8.0 37 | dtype: float64 38 | ``` 39 | 40 | 2、通过传递一个 numpy`array`,时间索引以及列标签来创建一个`DataFrame`: 41 | 42 | ```py 43 | In [6]: dates = pd.date_range('20130101', periods=6) 44 | 45 | In [7]: dates 46 | Out[7]: 47 | DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04', 48 | '2013-01-05', '2013-01-06'], 49 | dtype='datetime64[ns]', freq='D') 50 | 51 | In [8]: df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD')) 52 | 53 | In [9]: df 54 | Out[9]: 55 | A B C D 56 | 2013-01-01 0.469112 -0.282863 -1.509059 -1.135632 57 | 2013-01-02 1.212112 -0.173215 0.119209 -1.044236 58 | 2013-01-03 -0.861849 -2.104569 -0.494929 1.071804 59 | 2013-01-04 0.721555 -0.706771 -1.039575 0.271860 60 | 2013-01-05 -0.424972 0.567020 0.276232 -1.087401 61 | 2013-01-06 -0.673690 0.113648 -1.478427 0.524988 62 | ``` 63 | 64 | 3、通过传递一个能够被转换成类似序列结构的字典对象来创建一个`DataFrame`: 65 | 66 | ```py 67 | In [10]: df2 = pd.DataFrame({ 'A' : 1., 68 | ....: 'B' : pd.Timestamp('20130102'), 69 | ....: 'C' : pd.Series(1,index=list(range(4)),dtype='float32'), 70 | ....: 'D' : np.array([3] * 4,dtype='int32'), 71 | ....: 'E' : pd.Categorical(["test","train","test","train"]), 72 | ....: 'F' : 'foo' }) 73 | ....: 74 | 75 | In [11]: df2 76 | Out[11]: 77 | A B C D E F 78 | 0 1.0 2013-01-02 1.0 3 test foo 79 | 1 1.0 2013-01-02 1.0 3 train foo 80 | 2 1.0 2013-01-02 1.0 3 test foo 81 | 3 1.0 2013-01-02 1.0 3 train foo 82 | ``` 83 | 84 | 85 | 4、查看不同列的数据类型: 86 | 87 | ```py 88 | In [12]: df2.dtypes 89 | Out[12]: 90 | A float64 91 | B datetime64[ns] 92 | C float32 93 | D int32 94 | E category 95 | F object 96 | dtype: object 97 | ``` 98 | 99 | 100 | 5、如果你使用的是 IPython,使用 Tab 自动补全功能会自动识别所有的属性以及自定义的列,下图中是所有能够被自动识别的属性的一个子集: 101 | 102 | ```py 103 | In [13]: df2. 
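# 注:下面列出的部分方法(如 combineAdd、convert_objects、as_matrix 等)是当时旧版 pandas 的 API,在后来的版本中已被弃用或移除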
104 | df2.A df2.boxplot 105 | df2.abs df2.C 106 | df2.add df2.clip 107 | df2.add_prefix df2.clip_lower 108 | df2.add_suffix df2.clip_upper 109 | df2.align df2.columns 110 | df2.all df2.combine 111 | df2.any df2.combineAdd 112 | df2.append df2.combine_first 113 | df2.apply df2.combineMult 114 | df2.applymap df2.compound 115 | df2.as_blocks df2.consolidate 116 | df2.asfreq df2.convert_objects 117 | df2.as_matrix df2.copy 118 | df2.astype df2.corr 119 | df2.at df2.corrwith 120 | df2.at_time df2.count 121 | df2.axes df2.cov 122 | df2.B df2.cummax 123 | df2.between_time df2.cummin 124 | df2.bfill df2.cumprod 125 | df2.blocks df2.cumsum 126 | df2.bool df2.D 127 | ``` 128 | 129 | 130 | # 二、 查看数据 131 | 132 | 详情请参阅:[基础](http://pandas.pydata.org/pandas-docs/stable/basics.html#basics)。 133 | 134 | 1、 查看`DataFrame`中头部和尾部的行: 135 | 136 | ```py 137 | In [14]: df.head() 138 | Out[14]: 139 | A B C D 140 | 2013-01-01 0.469112 -0.282863 -1.509059 -1.135632 141 | 2013-01-02 1.212112 -0.173215 0.119209 -1.044236 142 | 2013-01-03 -0.861849 -2.104569 -0.494929 1.071804 143 | 2013-01-04 0.721555 -0.706771 -1.039575 0.271860 144 | 2013-01-05 -0.424972 0.567020 0.276232 -1.087401 145 | 146 | In [15]: df.tail(3) 147 | Out[15]: 148 | A B C D 149 | 2013-01-04 0.721555 -0.706771 -1.039575 0.271860 150 | 2013-01-05 -0.424972 0.567020 0.276232 -1.087401 151 | 2013-01-06 -0.673690 0.113648 -1.478427 0.524988 152 | ``` 153 | 154 | 2、 显示索引、列和底层的 numpy 数据: 155 | 156 | ```py 157 | In [16]: df.index 158 | Out[16]: 159 | DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04', 160 | '2013-01-05', '2013-01-06'], 161 | dtype='datetime64[ns]', freq='D') 162 | 163 | In [17]: df.columns 164 | Out[17]: Index([u'A', u'B', u'C', u'D'], dtype='object') 165 | 166 | In [18]: df.values 167 | Out[18]: 168 | array([[ 0.4691, -0.2829, -1.5091, -1.1356], 169 | [ 1.2121, -0.1732, 0.1192, -1.0442], 170 | [-0.8618, -2.1046, -0.4949, 1.0718], 171 | [ 0.7216, -0.7068, -1.0396, 0.2719], 172 | [-0.425 , 0.567 , 0.2762, -1.0874], 173 | [-0.6737, 0.1136, -1.4784, 0.525 ]]) 174 | ``` 175 | 176 | 3、 `describe()`函数对于数据的快速统计汇总: 177 | 178 | ```py 179 | In [19]: df.describe() 180 | Out[19]: 181 | A B C D 182 | count 6.000000 6.000000 6.000000 6.000000 183 | mean 0.073711 -0.431125 -0.687758 -0.233103 184 | std 0.843157 0.922818 0.779887 0.973118 185 | min -0.861849 -2.104569 -1.509059 -1.135632 186 | 25% -0.611510 -0.600794 -1.368714 -1.076610 187 | 50% 0.022070 -0.228039 -0.767252 -0.386188 188 | 75% 0.658444 0.041933 -0.034326 0.461706 189 | max 1.212112 0.567020 0.276232 1.071804 190 | ``` 191 | 192 | 193 | 4、 对数据的转置: 194 | 195 | ```py 196 | In [20]: df.T 197 | Out[20]: 198 | 2013-01-01 2013-01-02 2013-01-03 2013-01-04 2013-01-05 2013-01-06 199 | A 0.469112 1.212112 -0.861849 0.721555 -0.424972 -0.673690 200 | B -0.282863 -0.173215 -2.104569 -0.706771 0.567020 0.113648 201 | C -1.509059 0.119209 -0.494929 -1.039575 0.276232 -1.478427 202 | D -1.135632 -1.044236 1.071804 0.271860 -1.087401 0.524988 203 | ``` 204 | 205 | 5、 按轴进行排序 206 | 207 | ```py 208 | In [21]: df.sort_index(axis=1, ascending=False) 209 | Out[21]: 210 | D C B A 211 | 2013-01-01 -1.135632 -1.509059 -0.282863 0.469112 212 | 2013-01-02 -1.044236 0.119209 -0.173215 1.212112 213 | 2013-01-03 1.071804 -0.494929 -2.104569 -0.861849 214 | 2013-01-04 0.271860 -1.039575 -0.706771 0.721555 215 | 2013-01-05 -1.087401 0.276232 0.567020 -0.424972 216 | 2013-01-06 0.524988 -1.478427 0.113648 -0.673690 217 | ``` 218 | 219 | 220 | 6、 按值进行排序 221 | 222 | ```py 223 | In [22]: 
df.sort_values(by='B') 224 | Out[22]: 225 | A B C D 226 | 2013-01-03 -0.861849 -2.104569 -0.494929 1.071804 227 | 2013-01-04 0.721555 -0.706771 -1.039575 0.271860 228 | 2013-01-01 0.469112 -0.282863 -1.509059 -1.135632 229 | 2013-01-02 1.212112 -0.173215 0.119209 -1.044236 230 | 2013-01-06 -0.673690 0.113648 -1.478427 0.524988 231 | 2013-01-05 -0.424972 0.567020 0.276232 -1.087401 232 | ``` 233 | 234 | 235 | # 三、 选择 236 | 237 | 虽然标准的 Python/Numpy 的选择和设置表达式都能够直接派上用场,但是作为工程使用的代码,我们推荐使用经过优化的 pandas 数据访问方式: `.at`, `.iat`, `.loc`, `.iloc` 和 `.ix`。详情请参阅[索引和选取数据](http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing) 和 [多重索引/高级索引](http://pandas.pydata.org/pandas-docs/stable/advanced.html#advanced)。 238 | 239 | ## 获取 240 | 241 | 1、 选择一个单独的列,这将会返回一个`Series`,等同于`df.A`: 242 | 243 | ```py 244 | In [23]: df['A'] 245 | Out[23]: 246 | 2013-01-01 0.469112 247 | 2013-01-02 1.212112 248 | 2013-01-03 -0.861849 249 | 2013-01-04 0.721555 250 | 2013-01-05 -0.424972 251 | 2013-01-06 -0.673690 252 | Freq: D, Name: A, dtype: float64 253 | ``` 254 | 255 | 2、 通过`[]`进行选择,这将会对行进行切片 256 | 257 | ```py 258 | In [24]: df[0:3] 259 | Out[24]: 260 | A B C D 261 | 2013-01-01 0.469112 -0.282863 -1.509059 -1.135632 262 | 2013-01-02 1.212112 -0.173215 0.119209 -1.044236 263 | 2013-01-03 -0.861849 -2.104569 -0.494929 1.071804 264 | 265 | In [25]: df['20130102':'20130104'] 266 | Out[25]: 267 | A B C D 268 | 2013-01-02 1.212112 -0.173215 0.119209 -1.044236 269 | 2013-01-03 -0.861849 -2.104569 -0.494929 1.071804 270 | 2013-01-04 0.721555 -0.706771 -1.039575 0.271860 271 | ``` 272 | 273 | ## 通过标签选择 274 | 275 | 1、 使用标签来获取一个交叉的区域 276 | 277 | ```py 278 | In [26]: df.loc[dates[0]] 279 | Out[26]: 280 | A 0.469112 281 | B -0.282863 282 | C -1.509059 283 | D -1.135632 284 | Name: 2013-01-01 00:00:00, dtype: float64 285 | ``` 286 | 287 | 2、 通过标签来在多个轴上进行选择 288 | 289 | ```py 290 | In [27]: df.loc[:,['A','B']] 291 | Out[27]: 292 | A B 293 | 2013-01-01 0.469112 -0.282863 294 | 2013-01-02 1.212112 -0.173215 295 | 2013-01-03 -0.861849 -2.104569 296 | 2013-01-04 0.721555 -0.706771 297 | 2013-01-05 -0.424972 0.567020 298 | 2013-01-06 -0.673690 0.113648 299 | ``` 300 | 301 | 3、 标签切片 302 | 303 | ```py 304 | In [28]: df.loc['20130102':'20130104',['A','B']] 305 | Out[28]: 306 | A B 307 | 2013-01-02 1.212112 -0.173215 308 | 2013-01-03 -0.861849 -2.104569 309 | 2013-01-04 0.721555 -0.706771 310 | ``` 311 | 312 | 4、 对于返回的对象进行维度缩减 313 | 314 | ```py 315 | In [29]: df.loc['20130102',['A','B']] 316 | Out[29]: 317 | A 1.212112 318 | B -0.173215 319 | Name: 2013-01-02 00:00:00, dtype: float64 320 | ``` 321 | 322 | 5、 获取一个标量 323 | 324 | ```py 325 | In [30]: df.loc[dates[0],'A'] 326 | Out[30]: 0.46911229990718628 327 | ``` 328 | 329 | 6、 快速访问一个标量(与上一个方法等价) 330 | 331 | ```py 332 | In [31]: df.at[dates[0],'A'] 333 | Out[31]: 0.46911229990718628 334 | ``` 335 | 336 | ## 通过位置选择 337 | 338 | 1、 通过传递数值进行位置选择(选择的是行) 339 | 340 | ```py 341 | In [32]: df.iloc[3] 342 | Out[32]: 343 | A 0.721555 344 | B -0.706771 345 | C -1.039575 346 | D 0.271860 347 | Name: 2013-01-04 00:00:00, dtype: float64 348 | ``` 349 | 350 | 2、 通过数值进行切片,与 numpy/python 中的情况类似 351 | 352 | ```py 353 | In [33]: df.iloc[3:5,0:2] 354 | Out[33]: 355 | A B 356 | 2013-01-04 0.721555 -0.706771 357 | 2013-01-05 -0.424972 0.567020 358 | ``` 359 | 360 | 3、 通过指定一个位置的列表,与 numpy/python 中的情况类似 361 | 362 | ```py 363 | In [34]: df.iloc[[1,2,4],[0,2]] 364 | Out[34]: 365 | A C 366 | 2013-01-02 1.212112 0.119209 367 | 2013-01-03 -0.861849 -0.494929 368 | 2013-01-05 -0.424972 0.276232 369 | ``` 370 | 371 | 4、 
对行进行切片 372 | 373 | ```py 374 | In [35]: df.iloc[1:3,:] 375 | Out[35]: 376 | A B C D 377 | 2013-01-02 1.212112 -0.173215 0.119209 -1.044236 378 | 2013-01-03 -0.861849 -2.104569 -0.494929 1.071804 379 | ``` 380 | 381 | 5、 对列进行切片 382 | 383 | ```py 384 | In [36]: df.iloc[:,1:3] 385 | Out[36]: 386 | B C 387 | 2013-01-01 -0.282863 -1.509059 388 | 2013-01-02 -0.173215 0.119209 389 | 2013-01-03 -2.104569 -0.494929 390 | 2013-01-04 -0.706771 -1.039575 391 | 2013-01-05 0.567020 0.276232 392 | 2013-01-06 0.113648 -1.478427 393 | ``` 394 | 395 | 6、 获取特定的值 396 | 397 | ```py 398 | In [37]: df.iloc[1,1] 399 | Out[37]: -0.17321464905330858 400 | ``` 401 | 402 | 快速访问标量(等同于前一个方法): 403 | 404 | ```py 405 | In [38]: df.iat[1,1] 406 | Out[38]: -0.17321464905330858 407 | ``` 408 | 409 | ## 布尔索引 410 | 411 | 1、 使用一个单独列的值来选择数据: 412 | 413 | ```py 414 | In [39]: df[df.A > 0] 415 | Out[39]: 416 | A B C D 417 | 2013-01-01 0.469112 -0.282863 -1.509059 -1.135632 418 | 2013-01-02 1.212112 -0.173215 0.119209 -1.044236 419 | 2013-01-04 0.721555 -0.706771 -1.039575 0.271860 420 | ``` 421 | 422 | 2、 使用`where`操作来选择数据: 423 | 424 | ```py 425 | In [40]: df[df > 0] 426 | Out[40]: 427 | A B C D 428 | 2013-01-01 0.469112 NaN NaN NaN 429 | 2013-01-02 1.212112 NaN 0.119209 NaN 430 | 2013-01-03 NaN NaN NaN 1.071804 431 | 2013-01-04 0.721555 NaN NaN 0.271860 432 | 2013-01-05 NaN 0.567020 0.276232 NaN 433 | 2013-01-06 NaN 0.113648 NaN 0.524988 434 | ``` 435 | 436 | 3、 使用`isin()`方法来过滤: 437 | 438 | ```py 439 | In [41]: df2 = df.copy() 440 | 441 | In [42]: df2['E'] = ['one', 'one','two','three','four','three'] 442 | 443 | In [43]: df2 444 | Out[43]: 445 | A B C D E 446 | 2013-01-01 0.469112 -0.282863 -1.509059 -1.135632 one 447 | 2013-01-02 1.212112 -0.173215 0.119209 -1.044236 one 448 | 2013-01-03 -0.861849 -2.104569 -0.494929 1.071804 two 449 | 2013-01-04 0.721555 -0.706771 -1.039575 0.271860 three 450 | 2013-01-05 -0.424972 0.567020 0.276232 -1.087401 four 451 | 2013-01-06 -0.673690 0.113648 -1.478427 0.524988 three 452 | 453 | In [44]: df2[df2['E'].isin(['two','four'])] 454 | Out[44]: 455 | A B C D E 456 | 2013-01-03 -0.861849 -2.104569 -0.494929 1.071804 two 457 | 2013-01-05 -0.424972 0.567020 0.276232 -1.087401 four 458 | ``` 459 | 460 | ## 设置 461 | 462 | 1、 设置一个新的列: 463 | 464 | ```py 465 | In [45]: s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6)) 466 | 467 | In [46]: s1 468 | Out[46]: 469 | 2013-01-02 1 470 | 2013-01-03 2 471 | 2013-01-04 3 472 | 2013-01-05 4 473 | 2013-01-06 5 474 | 2013-01-07 6 475 | Freq: D, dtype: int64 476 | 477 | In [47]: df['F'] = s1 478 | ``` 479 | 480 | 2、 通过标签设置新的值: 481 | 482 | ```py 483 | In [48]: df.at[dates[0],'A'] = 0 484 | ``` 485 | 486 | 3、 通过位置设置新的值: 487 | 488 | ```py 489 | In [49]: df.iat[0,1] = 0 490 | ``` 491 | 492 | 4、 通过一个numpy数组设置一组新值: 493 | 494 | ```py 495 | In [50]: df.loc[:,'D'] = np.array([5] * len(df)) 496 | ``` 497 | 498 | 上述操作结果如下: 499 | 500 | ```py 501 | In [51]: df 502 | Out[51]: 503 | A B C D F 504 | 2013-01-01 0.000000 0.000000 -1.509059 5 NaN 505 | 2013-01-02 1.212112 -0.173215 0.119209 5 1.0 506 | 2013-01-03 -0.861849 -2.104569 -0.494929 5 2.0 507 | 2013-01-04 0.721555 -0.706771 -1.039575 5 3.0 508 | 2013-01-05 -0.424972 0.567020 0.276232 5 4.0 509 | 2013-01-06 -0.673690 0.113648 -1.478427 5 5.0 510 | ``` 511 | 512 | 5、 通过where操作来设置新的值: 513 | 514 | ```py 515 | In [52]: df2 = df.copy() 516 | 517 | In [53]: df2[df2 > 0] = -df2 518 | 519 | In [54]: df2 520 | Out[54]: 521 | A B C D F 522 | 2013-01-01 0.000000 0.000000 -1.509059 -5 NaN 523 | 2013-01-02 -1.212112 
-0.173215 -0.119209 -5 -1.0 524 | 2013-01-03 -0.861849 -2.104569 -0.494929 -5 -2.0 525 | 2013-01-04 -0.721555 -0.706771 -1.039575 -5 -3.0 526 | 2013-01-05 -0.424972 -0.567020 -0.276232 -5 -4.0 527 | 2013-01-06 -0.673690 -0.113648 -1.478427 -5 -5.0 528 | ``` 529 | 530 | 531 | # 四、 缺失值处理 532 | 533 | 在 pandas 中,使用`np.nan`来代替缺失值,这些值将默认不会包含在计算中,详情请参阅:[缺失的数据](http://pandas.pydata.org/pandas-docs/stable/missing_data.html#missing-data)。 534 | 535 | 1、 `reindex()`方法可以对指定轴上的索引进行改变/增加/删除操作,这将返回原始数据的一个拷贝: 536 | 537 | ```py 538 | In [55]: df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E']) 539 | 540 | In [56]: df1.loc[dates[0]:dates[1],'E'] = 1 541 | 542 | In [57]: df1 543 | Out[57]: 544 | A B C D F E 545 | 2013-01-01 0.000000 0.000000 -1.509059 5 NaN 1.0 546 | 2013-01-02 1.212112 -0.173215 0.119209 5 1.0 1.0 547 | 2013-01-03 -0.861849 -2.104569 -0.494929 5 2.0 NaN 548 | 2013-01-04 0.721555 -0.706771 -1.039575 5 3.0 NaN 549 | ``` 550 | 551 | 2、 去掉包含缺失值的行: 552 | 553 | ```py 554 | In [58]: df1.dropna(how='any') 555 | Out[58]: 556 | A B C D F E 557 | 2013-01-02 1.212112 -0.173215 0.119209 5 1.0 1.0 558 | ``` 559 | 560 | 3、 对缺失值进行填充: 561 | 562 | ```py 563 | In [59]: df1.fillna(value=5) 564 | Out[59]: 565 | A B C D F E 566 | 2013-01-01 0.000000 0.000000 -1.509059 5 5.0 1.0 567 | 2013-01-02 1.212112 -0.173215 0.119209 5 1.0 1.0 568 | 2013-01-03 -0.861849 -2.104569 -0.494929 5 2.0 5.0 569 | 2013-01-04 0.721555 -0.706771 -1.039575 5 3.0 5.0 570 | ``` 571 | 572 | 4、 对数据进行布尔填充: 573 | 574 | ```py 575 | n [60]: pd.isnull(df1) 576 | Out[60]: 577 | A B C D F E 578 | 2013-01-01 False False False False True False 579 | 2013-01-02 False False False False False False 580 | 2013-01-03 False False False False False True 581 | 2013-01-04 False False False False False True 582 | ``` 583 | 584 | 585 | # 五、 相关操作 586 | 587 | 详情请参与 [基本的二进制操作](http://pandas.pydata.org/pandas-docs/stable/basics.html#basics-binop) 588 | 589 | ## 统计(相关操作通常情况下不包括缺失值) 590 | 591 | 1、 执行描述性统计: 592 | 593 | ```py 594 | In [61]: df.mean() 595 | Out[61]: 596 | A -0.004474 597 | B -0.383981 598 | C -0.687758 599 | D 5.000000 600 | F 3.000000 601 | dtype: float64 602 | ``` 603 | 604 | 2、 在其他轴上进行相同的操作: 605 | 606 | ```py 607 | In [62]: df.mean(1) 608 | Out[62]: 609 | 2013-01-01 0.872735 610 | 2013-01-02 1.431621 611 | 2013-01-03 0.707731 612 | 2013-01-04 1.395042 613 | 2013-01-05 1.883656 614 | 2013-01-06 1.592306 615 | Freq: D, dtype: float64 616 | ``` 617 | 618 | 3、 对于拥有不同维度,需要对齐的对象进行操作。Pandas 会自动的沿着指定的维度进行广播: 619 | 620 | ```py 621 | In [63]: s = pd.Series([1,3,5,np.nan,6,8], index=dates).shift(2) 622 | 623 | In [64]: s 624 | Out[64]: 625 | 2013-01-01 NaN 626 | 2013-01-02 NaN 627 | 2013-01-03 1.0 628 | 2013-01-04 3.0 629 | 2013-01-05 5.0 630 | 2013-01-06 NaN 631 | Freq: D, dtype: float64 632 | 633 | In [65]: df.sub(s, axis='index') 634 | Out[65]: 635 | A B C D F 636 | 2013-01-01 NaN NaN NaN NaN NaN 637 | 2013-01-02 NaN NaN NaN NaN NaN 638 | 2013-01-03 -1.861849 -3.104569 -1.494929 4.0 1.0 639 | 2013-01-04 -2.278445 -3.706771 -4.039575 2.0 0.0 640 | 2013-01-05 -5.424972 -4.432980 -4.723768 0.0 -1.0 641 | 2013-01-06 NaN NaN NaN NaN NaN 642 | ``` 643 | 644 | ## `Apply` 645 | 646 | 1、 对数据应用函数: 647 | 648 | ```py 649 | In [66]: df.apply(np.cumsum) 650 | Out[66]: 651 | A B C D F 652 | 2013-01-01 0.000000 0.000000 -1.509059 5 NaN 653 | 2013-01-02 1.212112 -0.173215 -1.389850 10 1.0 654 | 2013-01-03 0.350263 -2.277784 -1.884779 15 3.0 655 | 2013-01-04 1.071818 -2.984555 -2.924354 20 6.0 656 | 2013-01-05 0.646846 -2.417535 -2.648122 25 10.0 657 | 
2013-01-06 -0.026844 -2.303886 -4.126549 30 15.0 658 | 659 | In [67]: df.apply(lambda x: x.max() - x.min()) 660 | Out[67]: 661 | A 2.073961 662 | B 2.671590 663 | C 1.785291 664 | D 0.000000 665 | F 4.000000 666 | dtype: float64 667 | ``` 668 | 669 | ## 直方图 670 | 671 | 具体请参照:[直方图和离散化](http://pandas.pydata.org/pandas-docs/stable/basics.html#basics-discretization)。 672 | 673 | ```py 674 | In [68]: s = pd.Series(np.random.randint(0, 7, size=10)) 675 | 676 | In [69]: s 677 | Out[69]: 678 | 0 4 679 | 1 2 680 | 2 1 681 | 3 2 682 | 4 6 683 | 5 4 684 | 6 4 685 | 7 6 686 | 8 4 687 | 9 4 688 | dtype: int64 689 | 690 | In [70]: s.value_counts() 691 | Out[70]: 692 | 4 5 693 | 6 2 694 | 2 2 695 | 1 1 696 | dtype: int64 697 | ``` 698 | 699 | ## 字符串方法 700 | 701 | `Series`对象在其`str`属性中配备了一组字符串处理方法,可以很容易的应用到数组中的每个元素,如下段代码所示。更多详情请参考:[字符串向量化方法](http://pandas.pydata.org/pandas-docs/stable/text.html#text-string-methods)。 702 | 703 | ```py 704 | In [71]: s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat']) 705 | 706 | In [72]: s.str.lower() 707 | Out[72]: 708 | 0 a 709 | 1 b 710 | 2 c 711 | 3 aaba 712 | 4 baca 713 | 5 NaN 714 | 6 caba 715 | 7 dog 716 | 8 cat 717 | dtype: object 718 | ``` 719 | 720 | 721 | # 六、 合并 722 | 723 | Pandas 提供了大量的方法能够轻松的对`Series`,`DataFrame`和`Panel`对象进行各种符合各种逻辑关系的合并操作。具体请参阅:[合并](http://pandas.pydata.org/pandas-docs/stable/merging.html#merging)。 724 | 725 | ## `Concat` 726 | 727 | ```py 728 | In [73]: df = pd.DataFrame(np.random.randn(10, 4)) 729 | 730 | In [74]: df 731 | Out[74]: 732 | 0 1 2 3 733 | 0 -0.548702 1.467327 -1.015962 -0.483075 734 | 1 1.637550 -1.217659 -0.291519 -1.745505 735 | 2 -0.263952 0.991460 -0.919069 0.266046 736 | 3 -0.709661 1.669052 1.037882 -1.705775 737 | 4 -0.919854 -0.042379 1.247642 -0.009920 738 | 5 0.290213 0.495767 0.362949 1.548106 739 | 6 -1.131345 -0.089329 0.337863 -0.945867 740 | 7 -0.932132 1.956030 0.017587 -0.016692 741 | 8 -0.575247 0.254161 -1.143704 0.215897 742 | 9 1.193555 -0.077118 -0.408530 -0.862495 743 | 744 | # break it into pieces 745 | In [75]: pieces = [df[:3], df[3:7], df[7:]] 746 | 747 | In [76]: pd.concat(pieces) 748 | Out[76]: 749 | 0 1 2 3 750 | 0 -0.548702 1.467327 -1.015962 -0.483075 751 | 1 1.637550 -1.217659 -0.291519 -1.745505 752 | 2 -0.263952 0.991460 -0.919069 0.266046 753 | 3 -0.709661 1.669052 1.037882 -1.705775 754 | 4 -0.919854 -0.042379 1.247642 -0.009920 755 | 5 0.290213 0.495767 0.362949 1.548106 756 | 6 -1.131345 -0.089329 0.337863 -0.945867 757 | 7 -0.932132 1.956030 0.017587 -0.016692 758 | 8 -0.575247 0.254161 -1.143704 0.215897 759 | 9 1.193555 -0.077118 -0.408530 -0.862495 760 | ``` 761 | 762 | ## `Join` 763 | 764 | 类似于 SQL 类型的合并,具体请参阅:[数据库风格的连接](http://pandas.pydata.org/pandas-docs/stable/merging.html#merging-join) 765 | 766 | ```py 767 | In [77]: left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]}) 768 | 769 | In [78]: right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]}) 770 | 771 | In [79]: left 772 | Out[79]: 773 | key lval 774 | 0 foo 1 775 | 1 foo 2 776 | 777 | In [80]: right 778 | Out[80]: 779 | key rval 780 | 0 foo 4 781 | 1 foo 5 782 | 783 | In [81]: pd.merge(left, right, on='key') 784 | Out[81]: 785 | key lval rval 786 | 0 foo 1 4 787 | 1 foo 1 5 788 | 2 foo 2 4 789 | 3 foo 2 5 790 | ``` 791 | 792 | 另一个例子: 793 | 794 | ```py 795 | In [82]: left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]}) 796 | 797 | In [83]: right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]}) 798 | 799 | In [84]: left 800 | Out[84]: 801 | key lval 802 | 0 foo 1 803 | 1 
bar 2 804 | 805 | In [85]: right 806 | Out[85]: 807 | key rval 808 | 0 foo 4 809 | 1 bar 5 810 | 811 | In [86]: pd.merge(left, right, on='key') 812 | Out[86]: 813 | key lval rval 814 | 0 foo 1 4 815 | 1 bar 2 5 816 | ``` 817 | 818 | ## `Append` 819 | 820 | 将一行连接到一个`DataFrame`上,具体请参阅[附加](http://pandas.pydata.org/pandas-docs/stable/merging.html#merging-concatenation): 821 | 822 | ```py 823 | In [87]: df = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D']) 824 | 825 | In [88]: df 826 | Out[88]: 827 | A B C D 828 | 0 1.346061 1.511763 1.627081 -0.990582 829 | 1 -0.441652 1.211526 0.268520 0.024580 830 | 2 -1.577585 0.396823 -0.105381 -0.532532 831 | 3 1.453749 1.208843 -0.080952 -0.264610 832 | 4 -0.727965 -0.589346 0.339969 -0.693205 833 | 5 -0.339355 0.593616 0.884345 1.591431 834 | 6 0.141809 0.220390 0.435589 0.192451 835 | 7 -0.096701 0.803351 1.715071 -0.708758 836 | 837 | In [89]: s = df.iloc[3] 838 | 839 | In [90]: df.append(s, ignore_index=True) 840 | Out[90]: 841 | A B C D 842 | 0 1.346061 1.511763 1.627081 -0.990582 843 | 1 -0.441652 1.211526 0.268520 0.024580 844 | 2 -1.577585 0.396823 -0.105381 -0.532532 845 | 3 1.453749 1.208843 -0.080952 -0.264610 846 | 4 -0.727965 -0.589346 0.339969 -0.693205 847 | 5 -0.339355 0.593616 0.884345 1.591431 848 | 6 0.141809 0.220390 0.435589 0.192451 849 | 7 -0.096701 0.803351 1.715071 -0.708758 850 | 8 1.453749 1.208843 -0.080952 -0.264610 851 | ``` 852 | 853 | 854 | # 七、 分组 855 | 856 | 对于”group by”操作,我们通常是指以下一个或多个操作步骤: 857 | 858 | + (Splitting)按照一些规则将数据分为不同的组; 859 | 860 | + (Applying)对于每组数据分别执行一个函数; 861 | 862 | + (Combining)将结果组合到一个数据结构中; 863 | 864 | 详情请参阅:[_Grouping section_](http://pandas.pydata.org/pandas-docs/stable/groupby.html#groupby) 865 | 866 | ```py 867 | In [91]: df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', 868 | ....: 'foo', 'bar', 'foo', 'foo'], 869 | ....: 'B' : ['one', 'one', 'two', 'three', 870 | ....: 'two', 'two', 'one', 'three'], 871 | ....: 'C' : np.random.randn(8), 872 | ....: 'D' : np.random.randn(8)}) 873 | ....: 874 | 875 | In [92]: df 876 | Out[92]: 877 | A B C D 878 | 0 foo one -1.202872 -0.055224 879 | 1 bar one -1.814470 2.395985 880 | 2 foo two 1.018601 1.552825 881 | 3 bar three -0.595447 0.166599 882 | 4 foo two 1.395433 0.047609 883 | 5 bar two -0.392670 -0.136473 884 | 6 foo one 0.007207 -0.561757 885 | 7 foo three 1.928123 -1.623033 886 | ``` 887 | 888 | 1、 分组并对每个分组执行`sum`函数: 889 | 890 | ```py 891 | In [93]: df.groupby('A').sum() 892 | Out[93]: 893 | C D 894 | A 895 | bar -2.802588 2.42611 896 | foo 3.146492 -0.63958 897 | ``` 898 | 899 | 2、 通过多个列进行分组形成一个层次索引,然后执行函数: 900 | 901 | ```py 902 | In [94]: df.groupby(['A','B']).sum() 903 | Out[94]: 904 | C D 905 | A B 906 | bar one -1.814470 2.395985 907 | three -0.595447 0.166599 908 | two -0.392670 -0.136473 909 | foo one -1.195665 -0.616981 910 | three 1.928123 -1.623033 911 | two 2.414034 1.600434 912 | ``` 913 | 914 | 915 | # 八、 改变形状 916 | 917 | 详情请参阅 [层次索引](http://pandas.pydata.org/pandas-docs/stable/advanced.html#advanced-hierarchical) 和 [改变形状](http://pandas.pydata.org/pandas-docs/stable/reshaping.html#reshaping-stacking)。 918 | 919 | ## `Stack` 920 | 921 | ```py 922 | In [95]: tuples = list(zip(*[['bar', 'bar', 'baz', 'baz', 923 | ....: 'foo', 'foo', 'qux', 'qux'], 924 | ....: ['one', 'two', 'one', 'two', 925 | ....: 'one', 'two', 'one', 'two']])) 926 | ....: 927 | 928 | In [96]: index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second']) 929 | 930 | In [97]: df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B']) 
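# 上面先用 zip(*[...]) 把两组标签逐位配成 ('bar', 'one') 这样的元组,
# 再由 from_tuples 构造出 first/second 两层的 MultiIndex 作为行索引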
931 | 932 | In [98]: df2 = df[:4] 933 | 934 | In [99]: df2 935 | Out[99]: 936 | A B 937 | first second 938 | bar one 0.029399 -0.542108 939 | two 0.282696 -0.087302 940 | baz one -1.575170 1.771208 941 | two 0.816482 1.100230 942 | ``` 943 | 944 | ```py 945 | In [100]: stacked = df2.stack() 946 | 947 | In [101]: stacked 948 | Out[101]: 949 | first second 950 | bar one A 0.029399 951 | B -0.542108 952 | two A 0.282696 953 | B -0.087302 954 | baz one A -1.575170 955 | B 1.771208 956 | two A 0.816482 957 | B 1.100230 958 | dtype: float64 959 | ``` 960 | 961 | ```py 962 | In [102]: stacked.unstack() 963 | Out[102]: 964 | A B 965 | first second 966 | bar one 0.029399 -0.542108 967 | two 0.282696 -0.087302 968 | baz one -1.575170 1.771208 969 | two 0.816482 1.100230 970 | 971 | In [103]: stacked.unstack(1) 972 | Out[103]: 973 | second one two 974 | first 975 | bar A 0.029399 0.282696 976 | B -0.542108 -0.087302 977 | baz A -1.575170 0.816482 978 | B 1.771208 1.100230 979 | 980 | In [104]: stacked.unstack(0) 981 | Out[104]: 982 | first bar baz 983 | second 984 | one A 0.029399 -1.575170 985 | B -0.542108 1.771208 986 | two A 0.282696 0.816482 987 | B -0.087302 1.100230 988 | ``` 989 | 990 | ## 数据透视表 991 | 992 | 详情请参阅:[数据透视表](http://pandas.pydata.org/pandas-docs/stable/reshaping.html#reshaping-pivot). 993 | 994 | ```py 995 | In [105]: df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3, 996 | .....: 'B' : ['A', 'B', 'C'] * 4, 997 | .....: 'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2, 998 | .....: 'D' : np.random.randn(12), 999 | .....: 'E' : np.random.randn(12)}) 1000 | .....: 1001 | 1002 | In [106]: df 1003 | Out[106]: 1004 | A B C D E 1005 | 0 one A foo 1.418757 -0.179666 1006 | 1 one B foo -1.879024 1.291836 1007 | 2 two C foo 0.536826 -0.009614 1008 | 3 three A bar 1.006160 0.392149 1009 | 4 one B bar -0.029716 0.264599 1010 | 5 one C bar -1.146178 -0.057409 1011 | 6 two A foo 0.100900 -1.425638 1012 | 7 three B foo -1.035018 1.024098 1013 | 8 one C foo 0.314665 -0.106062 1014 | 9 one A bar -0.773723 1.824375 1015 | 10 two B bar -1.170653 0.595974 1016 | 11 three C bar 0.648740 1.167115 1017 | ``` 1018 | 1019 | 可以从这个数据中轻松的生成数据透视表: 1020 | 1021 | ```py 1022 | In [107]: pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C']) 1023 | Out[107]: 1024 | C bar foo 1025 | A B 1026 | one A -0.773723 1.418757 1027 | B -0.029716 -1.879024 1028 | C -1.146178 0.314665 1029 | three A 1.006160 NaN 1030 | B NaN -1.035018 1031 | C 0.648740 NaN 1032 | two A NaN 0.100900 1033 | B -1.170653 NaN 1034 | C NaN 0.536826 1035 | ``` 1036 | 1037 | 1038 | # 九、 时间序列 1039 | 1040 | Pandas 在对频率转换进行重新采样时拥有简单、强大且高效的功能(如将按秒采样的数据转换为按5分钟为单位进行采样的数据)。这种操作在金融领域非常常见。具体参考:[时间序列](http://pandas.pydata.org/pandas-docs/stable/timeseries.html#timeseries)。 1041 | 1042 | ```py 1043 | In [108]: rng = pd.date_range('1/1/2012', periods=100, freq='S') 1044 | 1045 | In [109]: ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng) 1046 | 1047 | In [110]: ts.resample('5Min').sum() 1048 | Out[110]: 1049 | 2012-01-01 25083 1050 | Freq: 5T, dtype: int64 1051 | ``` 1052 | 1053 | 1、 时区表示: 1054 | 1055 | ```py 1056 | In [111]: rng = pd.date_range('3/6/2012 00:00', periods=5, freq='D') 1057 | 1058 | In [112]: ts = pd.Series(np.random.randn(len(rng)), rng) 1059 | 1060 | In [113]: ts 1061 | Out[113]: 1062 | 2012-03-06 0.464000 1063 | 2012-03-07 0.227371 1064 | 2012-03-08 -0.496922 1065 | 2012-03-09 0.306389 1066 | 2012-03-10 -2.290613 1067 | Freq: D, dtype: float64 1068 | 1069 | In [114]: ts_utc = ts.tz_localize('UTC') 1070 | 
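# tz_localize 只是给原本不带时区的时间戳打上 UTC 标签,时刻本身不变;
# 要换算成其它时区的本地时间,用随后介绍的 tz_convert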
1071 | In [115]: ts_utc 1072 | Out[115]: 1073 | 2012-03-06 00:00:00+00:00 0.464000 1074 | 2012-03-07 00:00:00+00:00 0.227371 1075 | 2012-03-08 00:00:00+00:00 -0.496922 1076 | 2012-03-09 00:00:00+00:00 0.306389 1077 | 2012-03-10 00:00:00+00:00 -2.290613 1078 | Freq: D, dtype: float64 1079 | ``` 1080 | 1081 | 2、 时区转换: 1082 | 1083 | ```py 1084 | In [116]: ts_utc.tz_convert('US/Eastern') 1085 | Out[116]: 1086 | 2012-03-05 19:00:00-05:00 0.464000 1087 | 2012-03-06 19:00:00-05:00 0.227371 1088 | 2012-03-07 19:00:00-05:00 -0.496922 1089 | 2012-03-08 19:00:00-05:00 0.306389 1090 | 2012-03-09 19:00:00-05:00 -2.290613 1091 | Freq: D, dtype: float64 1092 | ``` 1093 | 1094 | 3、 时间跨度转换: 1095 | 1096 | ```py 1097 | In [117]: rng = pd.date_range('1/1/2012', periods=5, freq='M') 1098 | 1099 | In [118]: ts = pd.Series(np.random.randn(len(rng)), index=rng) 1100 | 1101 | In [119]: ts 1102 | Out[119]: 1103 | 2012-01-31 -1.134623 1104 | 2012-02-29 -1.561819 1105 | 2012-03-31 -0.260838 1106 | 2012-04-30 0.281957 1107 | 2012-05-31 1.523962 1108 | Freq: M, dtype: float64 1109 | 1110 | In [120]: ps = ts.to_period() 1111 | 1112 | In [121]: ps 1113 | Out[121]: 1114 | 2012-01 -1.134623 1115 | 2012-02 -1.561819 1116 | 2012-03 -0.260838 1117 | 2012-04 0.281957 1118 | 2012-05 1.523962 1119 | Freq: M, dtype: float64 1120 | 1121 | In [122]: ps.to_timestamp() 1122 | Out[122]: 1123 | 2012-01-01 -1.134623 1124 | 2012-02-01 -1.561819 1125 | 2012-03-01 -0.260838 1126 | 2012-04-01 0.281957 1127 | 2012-05-01 1.523962 1128 | Freq: MS, dtype: float64 1129 | ``` 1130 | 1131 | 4、 时期和时间戳之间的转换使得可以使用一些方便的算术函数。 1132 | 1133 | ```py 1134 | In [123]: prng = pd.period_range('1990Q1', '2000Q4', freq='Q-NOV') 1135 | 1136 | In [124]: ts = pd.Series(np.random.randn(len(prng)), prng) 1137 | 1138 | In [125]: ts.index = (prng.asfreq('M', 'e') + 1).asfreq('H', 's') + 9 1139 | 1140 | In [126]: ts.head() 1141 | Out[126]: 1142 | 1990-03-01 09:00 -0.902937 1143 | 1990-06-01 09:00 0.068159 1144 | 1990-09-01 09:00 -0.057873 1145 | 1990-12-01 09:00 -0.368204 1146 | 1991-03-01 09:00 -1.144073 1147 | Freq: H, dtype: float64 1148 | ``` 1149 | 1150 | 1151 | # 十、 Categorical 1152 | 1153 | 从 0.15 版本开始,pandas 可以在`DataFrame`中支持 Categorical 类型的数据,详细 介绍参看:[Categorical 简介](http://pandas.pydata.org/pandas-docs/stable/categorical.html#categorical)和[_API documentation_](http://pandas.pydata.org/pandas-docs/stable/api.html#api-categorical)。 1154 | 1155 | ```py 1156 | In [127]: df = pd.DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']}) 1157 | ``` 1158 | 1159 | 1、 将原始的`grade`转换为 Categorical 数据类型: 1160 | 1161 | ```py 1162 | In [128]: df["grade"] = df["raw_grade"].astype("category") 1163 | 1164 | In [129]: df["grade"] 1165 | Out[129]: 1166 | 0 a 1167 | 1 b 1168 | 2 b 1169 | 3 a 1170 | 4 a 1171 | 5 e 1172 | Name: grade, dtype: category 1173 | Categories (3, object): [a, b, e] 1174 | ``` 1175 | 1176 | 2、 将 Categorical 类型数据重命名为更有意义的名称: 1177 | 1178 | ```py 1179 | In [130]: df["grade"].cat.categories = ["very good", "good", "very bad"] 1180 | ``` 1181 | 1182 | 3、 对类别进行重新排序,增加缺失的类别: 1183 | 1184 | ```py 1185 | In [131]: df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"]) 1186 | 1187 | In [132]: df["grade"] 1188 | Out[132]: 1189 | 0 very good 1190 | 1 good 1191 | 2 good 1192 | 3 very good 1193 | 4 very good 1194 | 5 very bad 1195 | Name: grade, dtype: category 1196 | Categories (5, object): [very bad, bad, medium, good, very good] 1197 | ``` 1198 | 1199 | 4、 排序是按照 Categorical 的顺序进行的而不是按照字典顺序进行: 1200 | 1201 | ```py 
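# 这里的排序遵循类别声明的逻辑顺序 very bad < bad < medium < good < very good,而非字符串的字典序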
1202 | In [133]: df.sort_values(by="grade") 1203 | Out[133]: 1204 | id raw_grade grade 1205 | 5 6 e very bad 1206 | 1 2 b good 1207 | 2 3 b good 1208 | 0 1 a very good 1209 | 3 4 a very good 1210 | 4 5 a very good 1211 | ``` 1212 | 1213 | 1214 | 5、 对 Categorical 列进行排序时存在空的类别: 1215 | 1216 | ```py 1217 | In [134]: df.groupby("grade").size() 1218 | Out[134]: 1219 | grade 1220 | very bad 1 1221 | bad 0 1222 | medium 0 1223 | good 2 1224 | very good 3 1225 | dtype: int64 1226 | ``` 1227 | 1228 | 1229 | # 十一、 画图 1230 | 1231 | 具体文档参看:[绘图](http://pandas.pydata.org/pandas-docs/stable/visualization.html#visualization)文档。 1232 | 1233 | ```py 1234 | In [135]: ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000)) 1235 | 1236 | In [136]: ts = ts.cumsum() 1237 | 1238 | In [137]: ts.plot() 1239 | Out[137]: 1240 | ``` 1241 | 1242 | ![](http://pandas.pydata.org/pandas-docs/stable/_images/series_plot_basic.png) 1243 | 1244 | 对于`DataFrame`来说,`plot`是一种将所有列及其标签进行绘制的简便方法: 1245 | 1246 | ```py 1247 | In [138]: df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index, 1248 | .....: columns=['A', 'B', 'C', 'D']) 1249 | .....: 1250 | 1251 | In [139]: df = df.cumsum() 1252 | 1253 | In [140]: plt.figure(); df.plot(); plt.legend(loc='best') 1254 | Out[140]: 1255 | ``` 1256 | 1257 | ![](http://pandas.pydata.org/pandas-docs/stable/_images/frame_plot_basic.png) 1258 | 1259 | 1260 | 1261 | # 十二、 导入和保存数据 1262 | 1263 | ## CSV 1264 | 1265 | 参考:[写入 CSV 文件](http://pandas.pydata.org/pandas-docs/stable/io.html#io-store-in-csv)。 1266 | 1267 | 1、 写入 csv 文件: 1268 | 1269 | ```py 1270 | In [141]: df.to_csv('foo.csv') 1271 | ``` 1272 | 1273 | 2、 从 csv 文件中读取: 1274 | 1275 | ```py 1276 | In [142]: pd.read_csv('foo.csv') 1277 | Out[142]: 1278 | Unnamed: 0 A B C D 1279 | 0 2000-01-01 0.266457 -0.399641 -0.219582 1.186860 1280 | 1 2000-01-02 -1.170732 -0.345873 1.653061 -0.282953 1281 | 2 2000-01-03 -1.734933 0.530468 2.060811 -0.515536 1282 | 3 2000-01-04 -1.555121 1.452620 0.239859 -1.156896 1283 | 4 2000-01-05 0.578117 0.511371 0.103552 -2.428202 1284 | 5 2000-01-06 0.478344 0.449933 -0.741620 -1.962409 1285 | 6 2000-01-07 1.235339 -0.091757 -1.543861 -1.084753 1286 | .. ... ... ... ... ... 1287 | 993 2002-09-20 -10.628548 -9.153563 -7.883146 28.313940 1288 | 994 2002-09-21 -10.390377 -8.727491 -6.399645 30.914107 1289 | 995 2002-09-22 -8.985362 -8.485624 -4.669462 31.367740 1290 | 996 2002-09-23 -9.558560 -8.781216 -4.499815 30.518439 1291 | 997 2002-09-24 -9.902058 -9.340490 -4.386639 30.105593 1292 | 998 2002-09-25 -10.216020 -9.480682 -3.933802 29.758560 1293 | 999 2002-09-26 -11.856774 -10.671012 -3.216025 29.369368 1294 | 1295 | [1000 rows x 5 columns] 1296 | ``` 1297 | 1298 | ## HDF5 1299 | 1300 | 参考:[HDF5 存储](http://pandas.pydata.org/pandas-docs/stable/io.html#io-hdf5) 1301 | 1302 | 1、 写入 HDF5 存储: 1303 | 1304 | ```py 1305 | In [143]: df.to_hdf('foo.h5','df') 1306 | ``` 1307 | 1308 | 2、 从 HDF5 存储中读取: 1309 | 1310 | ```py 1311 | In [144]: pd.read_hdf('foo.h5','df') 1312 | Out[144]: 1313 | A B C D 1314 | 2000-01-01 0.266457 -0.399641 -0.219582 1.186860 1315 | 2000-01-02 -1.170732 -0.345873 1.653061 -0.282953 1316 | 2000-01-03 -1.734933 0.530468 2.060811 -0.515536 1317 | 2000-01-04 -1.555121 1.452620 0.239859 -1.156896 1318 | 2000-01-05 0.578117 0.511371 0.103552 -2.428202 1319 | 2000-01-06 0.478344 0.449933 -0.741620 -1.962409 1320 | 2000-01-07 1.235339 -0.091757 -1.543861 -1.084753 1321 | ... ... ... ... ... 
1322 | 2002-09-20 -10.628548 -9.153563 -7.883146 28.313940 1323 | 2002-09-21 -10.390377 -8.727491 -6.399645 30.914107 1324 | 2002-09-22 -8.985362 -8.485624 -4.669462 31.367740 1325 | 2002-09-23 -9.558560 -8.781216 -4.499815 30.518439 1326 | 2002-09-24 -9.902058 -9.340490 -4.386639 30.105593 1327 | 2002-09-25 -10.216020 -9.480682 -3.933802 29.758560 1328 | 2002-09-26 -11.856774 -10.671012 -3.216025 29.369368 1329 | 1330 | [1000 rows x 4 columns] 1331 | ``` 1332 | 1333 | ## Excel 1334 | 1335 | 参考:[_MS Excel_](http://pandas.pydata.org/pandas-docs/stable/io.html#io-excel) 1336 | 1337 | 1、 写入excel文件: 1338 | 1339 | ```py 1340 | In [145]: df.to_excel('foo.xlsx', sheet_name='Sheet1') 1341 | ``` 1342 | 1343 | 2、 从excel文件中读取: 1344 | 1345 | ```py 1346 | In [146]: pd.read_excel('foo.xlsx', 'Sheet1', index_col=None, na_values=['NA']) 1347 | Out[146]: 1348 | A B C D 1349 | 2000-01-01 0.266457 -0.399641 -0.219582 1.186860 1350 | 2000-01-02 -1.170732 -0.345873 1.653061 -0.282953 1351 | 2000-01-03 -1.734933 0.530468 2.060811 -0.515536 1352 | 2000-01-04 -1.555121 1.452620 0.239859 -1.156896 1353 | 2000-01-05 0.578117 0.511371 0.103552 -2.428202 1354 | 2000-01-06 0.478344 0.449933 -0.741620 -1.962409 1355 | 2000-01-07 1.235339 -0.091757 -1.543861 -1.084753 1356 | ... ... ... ... ... 1357 | 2002-09-20 -10.628548 -9.153563 -7.883146 28.313940 1358 | 2002-09-21 -10.390377 -8.727491 -6.399645 30.914107 1359 | 2002-09-22 -8.985362 -8.485624 -4.669462 31.367740 1360 | 2002-09-23 -9.558560 -8.781216 -4.499815 30.518439 1361 | 2002-09-24 -9.902058 -9.340490 -4.386639 30.105593 1362 | 2002-09-25 -10.216020 -9.480682 -3.933802 29.758560 1363 | 2002-09-26 -11.856774 -10.671012 -3.216025 29.369368 1364 | 1365 | [1000 rows x 4 columns] 1366 | ``` 1367 | 1368 | # 十三、陷阱 1369 | 1370 | 如果你尝试某个操作并且看到如下异常: 1371 | 1372 | ```py 1373 | >>> if pd.Series([False, True, False]): 1374 | print("I was true") 1375 | Traceback 1376 | ... 1377 | ValueError: The truth value of an array is ambiguous. Use a.empty, a.any() or a.all(). 
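>>> # 若本意是"任一为真"或"全部为真",应显式用 .any() 或 .all() 归约为单个布尔值:
>>> if pd.Series([False, True, False]).any():
...     print("I was true")
I was true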
1378 | ``` 1379 | 1380 | 解释及处理方式请见[比较](http://pandas.pydata.org/pandas-docs/stable/basics.html#basics-compare)。 1381 | 1382 | 同时请见[陷阱](http://pandas.pydata.org/pandas-docs/stable/gotchas.html#gotchas)。 -------------------------------------------------------------------------------- /2.1.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/secoba/pandas-official-tutorials-zh/ddc60f38c28e4a72be682eb6682191de0023f6c8/2.1.md -------------------------------------------------------------------------------- /2.2.md: -------------------------------------------------------------------------------- 1 | # 第二章 2 | 3 | > 原文:[Chapter 2](http://nbviewer.ipython.org/github/jvns/pandas-cookbook/blob/v0.1/cookbook/Chapter%202%20-%20Selecting%20data%20&%20finding%20the%20most%20common%20complaint%20type.ipynb) 4 | 5 | > 译者:[飞龙](https://github.com/wizardforcel) 6 | 7 | > 协议:[CC BY-NC-SA 4.0](http://creativecommons.org/licenses/by-nc-sa/4.0/) 8 | 9 | ```py 10 | # 通常的开头 11 | import pandas as pd 12 | # 使图表更大更漂亮 13 | pd.set_option('display.mpl_style', 'default') 14 | pd.set_option('display.line_width', 5000) 15 | pd.set_option('display.max_columns', 60) 16 | 17 | figsize(15, 5) 18 | ``` 19 | 20 | 我们将在这里使用一个新的数据集,来演示如何处理更大的数据集。 这是来自 [NYC Open Data](https://nycopendata.socrata.com/Social-Services/311-Service-Requests-from-2010-to-Present/erm2-nwe9) 的 311 个服务请求的子集。 21 | 22 | 23 | ```py 24 | complaints = pd.read_csv('../data/311-service-requests.csv') 25 | ``` 26 | 27 | ## 2.1 里面究竟有什么?(总结) 28 | 29 | 当你查看一个大型数据框架,而不是显示数据框架的内容,它会显示一个摘要。 这包括所有列,以及每列中有多少非空值。 30 | 31 | ```py 32 | complaints 33 | ``` 34 | 35 | ``` 36 | 37 | Int64Index: 111069 entries, 0 to 111068 38 | Data columns (total 52 columns): 39 | Unique Key 111069 non-null values 40 | Created Date 111069 non-null values 41 | Closed Date 60270 non-null values 42 | Agency 111069 non-null values 43 | Agency Name 111069 non-null values 44 | Complaint Type 111069 non-null values 45 | Descriptor 111068 non-null values 46 | Location Type 79048 non-null values 47 | Incident Zip 98813 non-null values 48 | Incident Address 84441 non-null values 49 | Street Name 84438 non-null values 50 | Cross Street 1 84728 non-null values 51 | Cross Street 2 84005 non-null values 52 | Intersection Street 1 19364 non-null values 53 | Intersection Street 2 19366 non-null values 54 | Address Type 102247 non-null values 55 | City 98860 non-null values 56 | Landmark 95 non-null values 57 | Facility Type 110938 non-null values 58 | Status 111069 non-null values 59 | Due Date 39239 non-null values 60 | Resolution Action Updated Date 96507 non-null values 61 | Community Board 111069 non-null values 62 | Borough 111069 non-null values 63 | X Coordinate (State Plane) 98143 non-null values 64 | Y Coordinate (State Plane) 98143 non-null values 65 | Park Facility Name 111069 non-null values 66 | Park Borough 111069 non-null values 67 | School Name 111069 non-null values 68 | School Number 111052 non-null values 69 | School Region 110524 non-null values 70 | School Code 110524 non-null values 71 | School Phone Number 111069 non-null values 72 | School Address 111069 non-null values 73 | School City 111069 non-null values 74 | School State 111069 non-null values 75 | School Zip 111069 non-null values 76 | School Not Found 38984 non-null values 77 | School or Citywide Complaint 0 non-null values 78 | Vehicle Type 99 non-null values 79 | Taxi Company Borough 117 non-null values 80 | Taxi Pick Up Location 1059 non-null values 81 | Bridge 
Highway Name 185 non-null values 82 | Bridge Highway Direction 185 non-null values 83 | Road Ramp 184 non-null values 84 | Bridge Highway Segment 223 non-null values 85 | Garage Lot Name 49 non-null values 86 | Ferry Direction 37 non-null values 87 | Ferry Terminal Name 336 non-null values 88 | Latitude 98143 non-null values 89 | Longitude 98143 non-null values 90 | Location 98143 non-null values 91 | dtypes: float64(5), int64(1), object(46) 92 | ``` 93 | 94 | ## 2.2 选择列和行 95 | 96 | 为了选择一列,使用列名称作为索引,像这样: 97 | 98 | ```py 99 | complaints['Complaint Type'] 100 | ``` 101 | 102 | ```py 103 | 0 Noise - Street/Sidewalk 104 | 1 Illegal Parking 105 | 2 Noise - Commercial 106 | 3 Noise - Vehicle 107 | 4 Rodent 108 | 5 Noise - Commercial 109 | 6 Blocked Driveway 110 | 7 Noise - Commercial 111 | 8 Noise - Commercial 112 | 9 Noise - Commercial 113 | 10 Noise - House of Worship 114 | 11 Noise - Commercial 115 | 12 Illegal Parking 116 | 13 Noise - Vehicle 117 | 14 Rodent 118 | ... 119 | 111054 Noise - Street/Sidewalk 120 | 111055 Noise - Commercial 121 | 111056 Street Sign - Missing 122 | 111057 Noise 123 | 111058 Noise - Commercial 124 | 111059 Noise - Street/Sidewalk 125 | 111060 Noise 126 | 111061 Noise - Commercial 127 | 111062 Water System 128 | 111063 Water System 129 | 111064 Maintenance or Facility 130 | 111065 Illegal Parking 131 | 111066 Noise - Street/Sidewalk 132 | 111067 Noise - Commercial 133 | 111068 Blocked Driveway 134 | Name: Complaint Type, Length: 111069, dtype: object 135 | ``` 136 | 137 | 要获得`DataFrame`的前 5 行,我们可以使用切片:`df [:5]`。 138 | 139 | 这是一个了解数据框架中存在什么信息的很好方式 - 花一点时间来查看内容并获得此数据集的感觉。 140 | 141 | ```py 142 | complaints[:5] 143 | ``` 144 | 145 | 146 | | | Unique Key | Created Date | Closed Date | Agency | Agency Name | Complaint Type | Descriptor | Location Type | Incident Zip | Incident Address | Street Name | Cross Street 1 | Cross Street 2 | Intersection Street 1 | Intersection Street 2 | Address Type | City | Landmark | Facility Type | Status | Due Date | Resolution Action Updated Date | Community Board | Borough | X Coordinate (State Plane) | Y Coordinate (State Plane) | Park Facility Name | Park Borough | School Name | School Number | School Region | School Code | School Phone Number | School Address | School City | School State | School Zip | School Not Found | School or Citywide Complaint | Vehicle Type | Taxi Company Borough | Taxi Pick Up Location | Bridge Highway Name | Bridge Highway Direction | Road Ramp | Bridge Highway Segment | Garage Lot Name | Ferry Direction | Ferry Terminal Name | Latitude | Longitude | Location | 147 | | --- | --- | 148 | | 0 | 26589651 | 10/31/2013 02:08:41 AM | NaN | NYPD | New York City Police Department | Noise - Street/Sidewalk | Loud Talking | Street/Sidewalk | 11432 | 90-03 169 STREET | 169 STREET | 90 AVENUE | 91 AVENUE | NaN | NaN | ADDRESS | JAMAICA | NaN | Precinct | Assigned | 10/31/2013 10:08:41 AM | 10/31/2013 02:35:17 AM | 12 QUEENS | QUEENS | 1042027 | 197389 | Unspecified | QUEENS | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 40.708275 | -73.791604 | (40.70827532593202, -73.79160395779721) | 149 | | 1 | 26593698 | 10/31/2013 02:01:04 AM | NaN | NYPD | New York City Police Department | Illegal Parking | Commercial Overnight Parking | Street/Sidewalk | 11378 | 58 AVENUE | 58 AVENUE | 58 PLACE | 59 STREET | NaN | NaN | BLOCKFACE | MASPETH | NaN | Precinct | Open | 10/31/2013 
10:01:04 AM | NaN | 05 QUEENS | QUEENS | 1009349 | 201984 | Unspecified | QUEENS | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 40.721041 | -73.909453 | (40.721040535628305, -73.90945306791765) | 150 | | 2 | 26594139 | 10/31/2013 02:00:24 AM | 10/31/2013 02:40:32 AM | NYPD | New York City Police Department | Noise - Commercial | Loud Music/Party | Club/Bar/Restaurant | 10032 | 4060 BROADWAY | BROADWAY | WEST 171 STREET | WEST 172 STREET | NaN | NaN | ADDRESS | NEW YORK | NaN | Precinct | Closed | 10/31/2013 10:00:24 AM | 10/31/2013 02:39:42 AM | 12 MANHATTAN | MANHATTAN | 1001088 | 246531 | Unspecified | MANHATTAN | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 40.843330 | -73.939144 | (40.84332975466513, -73.93914371913482) | 151 | | 3 | 26595721 | 10/31/2013 01:56:23 AM | 10/31/2013 02:21:48 AM | NYPD | New York City Police Department | Noise - Vehicle | Car/Truck Horn | Street/Sidewalk | 10023 | WEST 72 STREET | WEST 72 STREET | COLUMBUS AVENUE | AMSTERDAM AVENUE | NaN | NaN | BLOCKFACE | NEW YORK | NaN | Precinct | Closed | 10/31/2013 09:56:23 AM | 10/31/2013 02:21:10 AM | 07 MANHATTAN | MANHATTAN | 989730 | 222727 | Unspecified | MANHATTAN | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 40.778009 | -73.980213 | (40.7780087446372, -73.98021349023975) | 152 | | 4 | 26590930 | 10/31/2013 01:53:44 AM | NaN | DOHMH | Department of Health and Mental Hygiene | Rodent | Condition Attracting Rodents | Vacant Lot | 10027 | WEST 124 STREET | WEST 124 STREET | LENOX AVENUE | ADAM CLAYTON POWELL JR BOULEVARD | NaN | NaN | BLOCKFACE | NEW YORK | NaN | N/A | Pending | 11/30/2013 01:53:44 AM | 10/31/2013 01:59:54 AM | 10 MANHATTAN | MANHATTAN | 998815 | 233545 | Unspecified | MANHATTAN | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 40.807691 | -73.947387 | (40.80769092704951, | 153 | 154 | 我们可以组合它们来获得一列的前五行。 155 | 156 | ```py 157 | complaints['Complaint Type'][:5] 158 | ``` 159 | 160 | ``` 161 | 0 Noise - Street/Sidewalk 162 | 1 Illegal Parking 163 | 2 Noise - Commercial 164 | 3 Noise - Vehicle 165 | 4 Rodent 166 | Name: Complaint Type, dtype: object 167 | ``` 168 | 169 | 并且无论我们以什么方向: 170 | 171 | ```py 172 | complaints[:5]['Complaint Type'] 173 | ``` 174 | 175 | ``` 176 | 0 Noise - Street/Sidewalk 177 | 1 Illegal Parking 178 | 2 Noise - Commercial 179 | 3 Noise - Vehicle 180 | 4 Rodent 181 | Name: Complaint Type, dtype: object 182 | ``` 183 | 184 | ## 2.3 选择多列 185 | 186 | 如果我们只关心投诉类型和区,但不关心其余的信息怎么办? 
Pandas 使它很容易选择列的一个子集:只需将所需列的列表用作索引。 187 | 188 | ```py 189 | complaints[['Complaint Type', 'Borough']] 190 | ``` 191 | 192 | ``` 193 | 194 | Int64Index: 111069 entries, 0 to 111068 195 | Data columns (total 2 columns): 196 | Complaint Type 111069 non-null values 197 | Borough 111069 non-null values 198 | dtypes: object(2) 199 | ``` 200 | 201 | 这会向我们展示总结,我们可以获取前 10 列: 202 | 203 | ```py 204 | complaints[['Complaint Type', 'Borough']][:10] 205 | ``` 206 | 207 | | | Complaint Type | Borough | 208 | | --- | --- | --- | 209 | | 0 | Noise - Street/Sidewalk | QUEENS | 210 | | 1 | Illegal Parking | QUEENS | 211 | | 2 | Noise - Commercial | MANHATTAN | 212 | | 3 | Noise - Vehicle | MANHATTAN | 213 | | 4 | Rodent | MANHATTAN | 214 | | 5 | Noise - Commercial | QUEENS | 215 | | 6 | Blocked Driveway | QUEENS | 216 | | 7 | Noise - Commercial | QUEENS | 217 | | 8 | Noise - Commercial | MANHATTAN | 218 | | 9 | Noise - Commercial | BROOKLYN | 219 | 220 | ## 2.4 什么是最常见的投诉类型? 221 | 222 | 这是个易于回答的问题,我们可以调用`.value_counts()`方法: 223 | 224 | ```py 225 | complaints['Complaint Type'].value_counts() 226 | ``` 227 | 228 | ``` 229 | HEATING 14200 230 | GENERAL CONSTRUCTION 7471 231 | Street Light Condition 7117 232 | DOF Literature Request 5797 233 | PLUMBING 5373 234 | PAINT - PLASTER 5149 235 | Blocked Driveway 4590 236 | NONCONST 3998 237 | Street Condition 3473 238 | Illegal Parking 3343 239 | Noise 3321 240 | Traffic Signal Condition 3145 241 | Dirty Conditions 2653 242 | Water System 2636 243 | Noise - Commercial 2578 244 | ... 245 | Opinion for the Mayor 2 246 | Window Guard 2 247 | DFTA Literature Request 2 248 | Legal Services Provider Complaint 2 249 | Open Flame Permit 1 250 | Snow 1 251 | Municipal Parking Facility 1 252 | X-Ray Machine/Equipment 1 253 | Stalled Sites 1 254 | DHS Income Savings Requirement 1 255 | Tunnel Condition 1 256 | Highway Sign - Damaged 1 257 | Ferry Permit 1 258 | Trans Fat 1 259 | DWD 1 260 | Length: 165, dtype: int64 261 | ``` 262 | 263 | 如果我们想要最常见的 10 个投诉类型,我们可以这样: 264 | 265 | ```py 266 | complaint_counts = complaints['Complaint Type'].value_counts() 267 | complaint_counts[:10] 268 | ``` 269 | 270 | ``` 271 | HEATING 14200 272 | GENERAL CONSTRUCTION 7471 273 | Street Light Condition 7117 274 | DOF Literature Request 5797 275 | PLUMBING 5373 276 | PAINT - PLASTER 5149 277 | Blocked Driveway 4590 278 | NONCONST 3998 279 | Street Condition 3473 280 | Illegal Parking 3343 281 | dtype: int64 282 | ``` 283 | 284 | 但是还可以更好,我们可以绘制出来! 
285 | 286 | ```py 287 | complaint_counts[:10].plot(kind='bar') 288 | ``` 289 | 290 | ``` 291 | 292 | ``` 293 | 294 | ![](http://upload-images.jianshu.io/upload_images/118142-ef3c651f8cdb4496.png) 295 | 296 | -------------------------------------------------------------------------------- /2.3.md: -------------------------------------------------------------------------------- 1 | # 第三章 2 | 3 | > 原文:[Chapter 3](http://nbviewer.jupyter.org/github/jvns/pandas-cookbook/blob/v0.1/cookbook/Chapter%203%20-%20Which%20borough%20has%20the%20most%20noise%20complaints%3F%20%28or%2C%20more%20selecting%20data%29.ipynb) 4 | 5 | > 译者:[飞龙](https://github.com/wizardforcel) 6 | 7 | > 协议:[CC BY-NC-SA 4.0](http://creativecommons.org/licenses/by-nc-sa/4.0/) 8 | 9 | ``` 10 | # 通常的开头 11 | import pandas as pd 12 | 13 | # 使图表更大更漂亮 14 | pd.set_option('display.mpl_style', 'default') 15 | figsize(15, 5) 16 | 17 | 18 | # 始终展示所有列 19 | pd.set_option('display.line_width', 5000) 20 | pd.set_option('display.max_columns', 60) 21 | ``` 22 | 23 | 让我们继续 NYC 311 服务请求的例子。 24 | 25 | ```py 26 | complaints = pd.read_csv('../data/311-service-requests.csv') 27 | ``` 28 | 29 | ## 3.1 仅仅选择噪音投诉 30 | 31 | 我想知道哪个区有最多的噪音投诉。 首先,我们来看看数据,看看它是什么样子: 32 | 33 | ```py 34 | complaints[:5] 35 | ``` 36 | 37 | | | Unique Key | Created Date | Closed Date | Agency | Agency Name | Complaint Type | Descriptor | Location Type | Incident Zip | Incident Address | Street Name | Cross Street 1 | Cross Street 2 | Intersection Street 1 | Intersection Street 2 | Address Type | City | Landmark | Facility Type | Status | Due Date | Resolution Action Updated Date | Community Board | Borough | X Coordinate (State Plane) | Y Coordinate (State Plane) | Park Facility Name | Park Borough | School Name | School Number | School Region | School Code | School Phone Number | School Address | School City | School State | School Zip | School Not Found | School or Citywide Complaint | Vehicle Type | Taxi Company Borough | Taxi Pick Up Location | Bridge Highway Name | Bridge Highway Direction | Road Ramp | Bridge Highway Segment | Garage Lot Name | Ferry Direction | Ferry Terminal Name | Latitude | Longitude | Location | 38 | | --- | --- | 39 | | 0 | 26589651 | 10/31/2013 02:08:41 AM | NaN | NYPD | New York City Police Department | Noise - Street/Sidewalk | Loud Talking | Street/Sidewalk | 11432 | 90-03 169 STREET | 169 STREET | 90 AVENUE | 91 AVENUE | NaN | NaN | ADDRESS | JAMAICA | NaN | Precinct | Assigned | 10/31/2013 10:08:41 AM | 10/31/2013 02:35:17 AM | 12 QUEENS | QUEENS | 1042027 | 197389 | Unspecified | QUEENS | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 40.708275 | -73.791604 | (40.70827532593202, -73.79160395779721) | 40 | | 1 | 26593698 | 10/31/2013 02:01:04 AM | NaN | NYPD | New York City Police Department | Illegal Parking | Commercial Overnight Parking | Street/Sidewalk | 11378 | 58 AVENUE | 58 AVENUE | 58 PLACE | 59 STREET | NaN | NaN | BLOCKFACE | MASPETH | NaN | Precinct | Open | 10/31/2013 10:01:04 AM | NaN | 05 QUEENS | QUEENS | 1009349 | 201984 | Unspecified | QUEENS | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 40.721041 | -73.909453 | (40.721040535628305, -73.90945306791765) | 41 | | 2 | 26594139 | 10/31/2013 02:00:24 AM | 10/31/2013 02:40:32 AM | NYPD | New 
York City Police Department | Noise - Commercial | Loud Music/Party | Club/Bar/Restaurant | 10032 | 4060 BROADWAY | BROADWAY | WEST 171 STREET | WEST 172 STREET | NaN | NaN | ADDRESS | NEW YORK | NaN | Precinct | Closed | 10/31/2013 10:00:24 AM | 10/31/2013 02:39:42 AM | 12 MANHATTAN | MANHATTAN | 1001088 | 246531 | Unspecified | MANHATTAN | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 40.843330 | -73.939144 | (40.84332975466513, -73.93914371913482) | 42 | | 3 | 26595721 | 10/31/2013 01:56:23 AM | 10/31/2013 02:21:48 AM | NYPD | New York City Police Department | Noise - Vehicle | Car/Truck Horn | Street/Sidewalk | 10023 | WEST 72 STREET | WEST 72 STREET | COLUMBUS AVENUE | AMSTERDAM AVENUE | NaN | NaN | BLOCKFACE | NEW YORK | NaN | Precinct | Closed | 10/31/2013 09:56:23 AM | 10/31/2013 02:21:10 AM | 07 MANHATTAN | MANHATTAN | 989730 | 222727 | Unspecified | MANHATTAN | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 40.778009 | -73.980213 | (40.7780087446372, -73.98021349023975) | 43 | | 4 | 26590930 | 10/31/2013 01:53:44 AM | NaN | DOHMH | Department of Health and Mental Hygiene | Rodent | Condition Attracting Rodents | Vacant Lot | 10027 | WEST 124 STREET | WEST 124 STREET | LENOX AVENUE | ADAM CLAYTON POWELL JR BOULEVARD | NaN | NaN | BLOCKFACE | NEW YORK | NaN | N/A | Pending | 11/30/2013 01:53:44 AM | 10/31/2013 01:59:54 AM | 10 MANHATTAN | MANHATTAN | 998815 | 233545 | Unspecified | MANHATTAN | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 40.807691 | -73.947387 | (40.80769092704951, -73.94738703491433) | 44 | 45 | 为了得到噪音投诉,我们需要找到`Complaint Type`列为`Noise - Street/Sidewalk`的行。 我会告诉你如何做,然后解释发生了什么。 46 | 47 | ```py 48 | noise_complaints = complaints[complaints['Complaint Type'] == "Noise - Street/Sidewalk"] 49 | noise_complaints[:3] 50 | ``` 51 | 52 | | | Unique Key | Created Date | Closed Date | Agency | Agency Name | Complaint Type | Descriptor | Location Type | Incident Zip | Incident Address | Street Name | Cross Street 1 | Cross Street 2 | Intersection Street 1 | Intersection Street 2 | Address Type | City | Landmark | Facility Type | Status | Due Date | Resolution Action Updated Date | Community Board | Borough | X Coordinate (State Plane) | Y Coordinate (State Plane) | Park Facility Name | Park Borough | School Name | School Number | School Region | School Code | School Phone Number | School Address | School City | School State | School Zip | School Not Found | School or Citywide Complaint | Vehicle Type | Taxi Company Borough | Taxi Pick Up Location | Bridge Highway Name | Bridge Highway Direction | Road Ramp | Bridge Highway Segment | Garage Lot Name | Ferry Direction | Ferry Terminal Name | Latitude | Longitude | Location | 53 | | --- | --- | 54 | | 0 | 26589651 | 10/31/2013 02:08:41 AM | NaN | NYPD | New York City Police Department | Noise - Street/Sidewalk | Loud Talking | Street/Sidewalk | 11432 | 90-03 169 STREET | 169 STREET | 90 AVENUE | 91 AVENUE | NaN | NaN | ADDRESS | JAMAICA | NaN | Precinct | Assigned | 10/31/2013 10:08:41 AM | 10/31/2013 02:35:17 AM | 12 QUEENS | QUEENS | 1042027 | 197389 | Unspecified | QUEENS | Unspecified 
| Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 40.708275 | -73.791604 | (40.70827532593202, -73.79160395779721) | 55 | | 16 | 26594086 | 10/31/2013 12:54:03 AM | 10/31/2013 02:16:39 AM | NYPD | New York City Police Department | Noise - Street/Sidewalk | Loud Music/Party | Street/Sidewalk | 10310 | 173 CAMPBELL AVENUE | CAMPBELL AVENUE | HENDERSON AVENUE | WINEGAR LANE | NaN | NaN | ADDRESS | STATEN ISLAND | NaN | Precinct | Closed | 10/31/2013 08:54:03 AM | 10/31/2013 02:07:14 AM | 01 STATEN ISLAND | STATEN ISLAND | 952013 | 171076 | Unspecified | STATEN ISLAND | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 40.636182 | -74.116150 | (40.63618202176914, -74.1161500428337) | 56 | | 25 | 26591573 | 10/31/2013 12:35:18 AM | 10/31/2013 02:41:35 AM | NYPD | New York City Police Department | Noise - Street/Sidewalk | Loud Talking | Street/Sidewalk | 10312 | 24 PRINCETON LANE | PRINCETON LANE | HAMPTON GREEN | DEAD END | NaN | NaN | ADDRESS | STATEN ISLAND | NaN | Precinct | Closed | 10/31/2013 08:35:18 AM | 10/31/2013 01:45:17 AM | 03 STATEN ISLAND | STATEN ISLAND | 929577 | 140964 | Unspecified | STATEN ISLAND | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 40.553421 | -74.196743 | (40.55342078716953, -74.19674315017886) | 57 | 58 | 如果你查看`noise_complaints`,你会看到它生效了,它只包含带有正确的投诉类型的投诉。 但是这是如何工作的? 让我们把它解构成两部分 59 | 60 | ```py 61 | complaints['Complaint Type'] == "Noise - Street/Sidewalk" 62 | ``` 63 | 64 | ``` 65 | 0 True 66 | 1 False 67 | 2 False 68 | 3 False 69 | 4 False 70 | 5 False 71 | 6 False 72 | 7 False 73 | 8 False 74 | 9 False 75 | 10 False 76 | 11 False 77 | 12 False 78 | 13 False 79 | 14 False 80 | ... 
81 | 111054 True 82 | 111055 False 83 | 111056 False 84 | 111057 False 85 | 111058 False 86 | 111059 True 87 | 111060 False 88 | 111061 False 89 | 111062 False 90 | 111063 False 91 | 111064 False 92 | 111065 False 93 | 111066 True 94 | 111067 False 95 | 111068 False 96 | Name: Complaint Type, Length: 111069, dtype: bool 97 | ``` 98 | 99 | 这是一个`True`和`False`的大数组,对应`DataFrame`中的每一行。 当我们用这个数组索引我们的`DataFrame`时,我们只得到其中为`True`行。 100 | 101 | 您还可以将多个条件与`&`运算符组合,如下所示: 102 | 103 | ```py 104 | is_noise = complaints['Complaint Type'] == "Noise - Street/Sidewalk" 105 | in_brooklyn = complaints['Borough'] == "BROOKLYN" 106 | complaints[is_noise & in_brooklyn][:5] 107 | ``` 108 | 109 | | | Unique Key | Created Date | Closed Date | Agency | Agency Name | Complaint Type | Descriptor | Location Type | Incident Zip | Incident Address | Street Name | Cross Street 1 | Cross Street 2 | Intersection Street 1 | Intersection Street 2 | Address Type | City | Landmark | Facility Type | Status | Due Date | Resolution Action Updated Date | Community Board | Borough | X Coordinate (State Plane) | Y Coordinate (State Plane) | Park Facility Name | Park Borough | School Name | School Number | School Region | School Code | School Phone Number | School Address | School City | School State | School Zip | School Not Found | School or Citywide Complaint | Vehicle Type | Taxi Company Borough | Taxi Pick Up Location | Bridge Highway Name | Bridge Highway Direction | Road Ramp | Bridge Highway Segment | Garage Lot Name | Ferry Direction | Ferry Terminal Name | Latitude | Longitude | Location | 110 | | --- | --- | 111 | | 31 | 26595564 | 10/31/2013 12:30:36 AM | NaN | NYPD | New York City Police Department | Noise - Street/Sidewalk | Loud Music/Party | Street/Sidewalk | 11236 | AVENUE J | AVENUE J | EAST 80 STREET | EAST 81 STREET | NaN | NaN | BLOCKFACE | BROOKLYN | NaN | Precinct | Open | 10/31/2013 08:30:36 AM | NaN | 18 BROOKLYN | BROOKLYN | 1008937 | 170310 | Unspecified | BROOKLYN | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 40.634104 | -73.911055 | (40.634103775951736, -73.91105541883589) | 112 | | 49 | 26595553 | 10/31/2013 12:05:10 AM | 10/31/2013 02:43:43 AM | NYPD | New York City Police Department | Noise - Street/Sidewalk | Loud Talking | Street/Sidewalk | 11225 | 25 LEFFERTS AVENUE | LEFFERTS AVENUE | WASHINGTON AVENUE | BEDFORD AVENUE | NaN | NaN | ADDRESS | BROOKLYN | NaN | Precinct | Closed | 10/31/2013 08:05:10 AM | 10/31/2013 01:29:29 AM | 09 BROOKLYN | BROOKLYN | 995366 | 180388 | Unspecified | BROOKLYN | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 40.661793 | -73.959934 | (40.6617931276793, -73.95993363978067) | 113 | | 109 | 26594653 | 10/30/2013 11:26:32 PM | 10/31/2013 12:18:54 AM | NYPD | New York City Police Department | Noise - Street/Sidewalk | Loud Music/Party | Street/Sidewalk | 11222 | NaN | NaN | NaN | NaN | DOBBIN STREET | NORMAN STREET | INTERSECTION | BROOKLYN | NaN | Precinct | Closed | 10/31/2013 07:26:32 AM | 10/31/2013 12:18:54 AM | 01 BROOKLYN | BROOKLYN | 996925 | 203271 | Unspecified | BROOKLYN | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 
40.724600 | -73.954271 | (40.724599563793525, -73.95427134534344) | 114 | | 236 | 26591992 | 10/30/2013 10:02:58 PM | 10/30/2013 10:23:20 PM | NYPD | New York City Police Department | Noise - Street/Sidewalk | Loud Talking | Street/Sidewalk | 11218 | DITMAS AVENUE | DITMAS AVENUE | NaN | NaN | NaN | NaN | LATLONG | BROOKLYN | NaN | Precinct | Closed | 10/31/2013 06:02:58 AM | 10/30/2013 10:23:20 PM | 01 BROOKLYN | BROOKLYN | 991895 | 171051 | Unspecified | BROOKLYN | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 40.636169 | -73.972455 | (40.63616876563881, -73.97245504682485) | 115 | | 370 | 26594167 | 10/30/2013 08:38:25 PM | 10/30/2013 10:26:28 PM | NYPD | New York City Police Department | Noise - Street/Sidewalk | Loud Music/Party | Street/Sidewalk | 11218 | 126 BEVERLY ROAD | BEVERLY ROAD | CHURCH AVENUE | EAST 2 STREET | NaN | NaN | ADDRESS | BROOKLYN | NaN | Precinct | Closed | 10/31/2013 04:38:25 AM | 10/30/2013 10:26:28 PM | 12 BROOKLYN | BROOKLYN | 990144 | 173511 | Unspecified | BROOKLYN | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 40.642922 | -73.978762 | (40.6429222774404, -73.97876175474585) | 116 | 117 | 或者如果我们只需要几列: 118 | 119 | ```py 120 | complaints[is_noise & in_brooklyn][['Complaint Type', 'Borough', 'Created Date', 'Descriptor']][:10] 121 | ``` 122 | 123 | | | Complaint Type | Borough | Created Date | Descriptor | 124 | | --- | --- | 125 | | 31 | Noise - Street/Sidewalk | BROOKLYN | 10/31/2013 12:30:36 AM | Loud Music/Party | 126 | | 49 | Noise - Street/Sidewalk | BROOKLYN | 10/31/2013 12:05:10 AM | Loud Talking | 127 | | 109 | Noise - Street/Sidewalk | BROOKLYN | 10/30/2013 11:26:32 PM | Loud Music/Party | 128 | | 236 | Noise - Street/Sidewalk | BROOKLYN | 10/30/2013 10:02:58 PM | Loud Talking | 129 | | 370 | Noise - Street/Sidewalk | BROOKLYN | 10/30/2013 08:38:25 PM | Loud Music/Party | 130 | | 378 | Noise - Street/Sidewalk | BROOKLYN | 10/30/2013 08:32:13 PM | Loud Talking | 131 | | 656 | Noise - Street/Sidewalk | BROOKLYN | 10/30/2013 06:07:39 PM | Loud Music/Party | 132 | | 1251 | Noise - Street/Sidewalk | BROOKLYN | 10/30/2013 03:04:51 PM | Loud Talking | 133 | | 5416 | Noise - Street/Sidewalk | BROOKLYN | 10/29/2013 10:07:02 PM | Loud Talking | 134 | | 5584 | Noise - Street/Sidewalk | BROOKLYN | 10/29/2013 08:15:59 PM | Loud Music/Party | 135 | 136 | ## 3.2 numpy 数组的注解 137 | 138 | 在内部,列的类型是`pd.Series`。 139 | 140 | ```py 141 | pd.Series([1,2,3]) 142 | ``` 143 | 144 | ``` 145 | 0 1 146 | 1 2 147 | 2 3 148 | dtype: int64 149 | ``` 150 | 151 | 而且`pandas.Series`的内部是 numpy 数组。 如果将`.values`添加到任何`Series`的末尾,你将得到它的内部 numpy 数组。 152 | 153 | ```py 154 | np.array([1,2,3]) 155 | ``` 156 | 157 | ``` 158 | array([1, 2, 3]) 159 | ``` 160 | 161 | ```py 162 | pd.Series([1,2,3]).values 163 | ``` 164 | 165 | ``` 166 | array([1, 2, 3]) 167 | ``` 168 | 169 | 所以这个二进制数组选择的操作,实际上适用于任何 NumPy 数组: 170 | 171 | ```py 172 | arr = np.array([1,2,3]) 173 | ``` 174 | 175 | ```py 176 | arr != 2 177 | ``` 178 | 179 | ``` 180 | array([ True, False, True], dtype=bool) 181 | ``` 182 | 183 | ```py 184 | arr[arr != 2] 185 | ``` 186 | 187 | ``` 188 | array([1, 3]) 189 | ``` 190 | 191 | ## 3.3 所以,哪个区的噪音投诉最多? 
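回答这种“哪类最多”的问题,标准工具是序列的`value_counts()`方法:它统计每个值出现的次数,并从多到少排序。先看一个玩具速写(这是补充的假设小例子,真实数据的版本紧随其后):

```py
import pandas as pd

# value_counts 统计每个值出现的次数,并按降序排列
pd.Series(['MANHATTAN', 'BROOKLYN', 'MANHATTAN']).value_counts()
```

```
MANHATTAN    2
BROOKLYN     1
dtype: int64
```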
192 | 193 | ```py 194 | is_noise = complaints['Complaint Type'] == "Noise - Street/Sidewalk" 195 | noise_complaints = complaints[is_noise] 196 | noise_complaints['Borough'].value_counts() 197 | ``` 198 | 199 | ``` 200 | MANHATTAN        917 201 | BROOKLYN         456 202 | BRONX            292 203 | QUEENS           226 204 | STATEN ISLAND     36 205 | Unspecified        1 206 | dtype: int64 207 | ``` 208 | 209 | 是曼哈顿最多! 但是,如果我们想除以每个区的总投诉数量,让这个数字更有意义呢? 这也很容易: 210 | 211 | ```py 212 | noise_complaint_counts = noise_complaints['Borough'].value_counts() 213 | complaint_counts = complaints['Borough'].value_counts() 214 | ``` 215 | 216 | ```py 217 | noise_complaint_counts / complaint_counts 218 | ``` 219 | 220 | ``` 221 | BRONX            0 222 | BROOKLYN         0 223 | MANHATTAN        0 224 | QUEENS           0 225 | STATEN ISLAND    0 226 | Unspecified      0 227 | dtype: int64 228 | ``` 229 | 230 | 糟糕,为什么是零?这是因为 Python 2 中的整数除法。让我们通过将`complaint_counts`转换为浮点数组来解决它。 231 | 232 | ```py 233 | noise_complaint_counts / complaint_counts.astype(float) 234 | ``` 235 | 236 | ``` 237 | BRONX            0.014833 238 | BROOKLYN         0.013864 239 | MANHATTAN        0.037755 240 | QUEENS           0.010143 241 | STATEN ISLAND    0.007474 242 | Unspecified      0.000141 243 | dtype: float64 244 | ``` 245 | 246 | ```py 247 | (noise_complaint_counts / complaint_counts.astype(float)).plot(kind='bar') 248 | ``` 249 | 250 | ``` 251 | 252 | ``` 253 | 254 | ![](http://upload-images.jianshu.io/upload_images/118142-f5a44c5d4d3e26d1.png) 255 | 256 | 所以曼哈顿的噪音投诉比其他区要多。 257 | -------------------------------------------------------------------------------- /2.4.md: -------------------------------------------------------------------------------- 1 | # 第四章 2 | 3 | > 原文:[Chapter 4](http://nbviewer.ipython.org/github/jvns/pandas-cookbook/blob/v0.1/cookbook/Chapter%204%20-%20Find%20out%20on%20which%20weekday%20people%20bike%20the%20most%20with%20groupby%20and%20aggregate.ipynb) 4 | 5 | > 译者:[飞龙](https://github.com/wizardforcel) 6 | 7 | > 协议:[CC BY-NC-SA 4.0](http://creativecommons.org/licenses/by-nc-sa/4.0/) 8 | 9 | ```py 10 | import pandas as pd 11 | pd.set_option('display.mpl_style', 'default') # 使图表漂亮一些 12 | figsize(15, 5) 13 | ``` 14 | 15 | 好的! 我们将在这里回顾我们的自行车道数据集。 我住在蒙特利尔,我很好奇我们是一个通勤城市,还是以骑自行车为乐趣的城市 - 人们在周末还是工作日骑自行车? 
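顺带一提,上面设置代码里的`figsize(15, 5)`只有在 IPython 的 pylab 模式(例如`ipython notebook --pylab inline`)下才能直接使用。如果你在普通的 Python 环境中运行这些例子,可以用 matplotlib 自己的接口达到同样的效果(一个最简速写,假设已经安装了 matplotlib):

```py
import matplotlib.pyplot as plt

# 与 pylab 模式下的 figsize(15, 5) 等价
plt.rcParams['figure.figsize'] = (15, 5)
```

本书第七章的开头用的就是这种写法。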
16 | 17 | ## 4.1 向我们的`DataFrame`中刚添加`weekday`列 18 | 19 | 首先我们需要加载数据,我们之前已经做过了。 20 | 21 | ```py 22 | bikes = pd.read_csv('../data/bikes.csv', sep=';', encoding='latin1', parse_dates=['Date'], dayfirst=True, index_col='Date') 23 | bikes['Berri 1'].plot() 24 | ``` 25 | 26 | ``` 27 | 28 | ``` 29 | 30 | ![](http://upload-images.jianshu.io/upload_images/118142-df8849e858fab6fc.png) 31 | 32 | 33 | ![](http://upload-images.jianshu.io/upload_images/118142-180a224dabf330b7.png) 34 | 35 | 接下来,我们只是看看 Berri 自行车道。 Berri 是蒙特利尔的一条街道,是一个相当重要的自行车道。 现在我习惯走这条路去图书馆,但我在旧蒙特利尔工作时,我习惯于走这条路去上班。 36 | 37 | 所以我们要创建一个只有 Berri 自行车道的`DataFrame`。 38 | 39 | ```py 40 | berri_bikes = bikes[['Berri 1']] 41 | ``` 42 | 43 | ``` 44 | berri_bikes[:5] 45 | ``` 46 | 47 | 48 | | | Berri 1 | 49 | | --- | --- | 50 | | Date | | 51 | | 2012-01-01 | 35 | 52 | | 2012-01-02 | 83 | 53 | | 2012-01-03 | 135 | 54 | | 2012-01-04 | 144 | 55 | | 2012-01-05 | 197 | 56 | 57 | 接下来,我们需要添加一列`weekday`。 首先,我们可以从索引得到星期。 我们还没有谈到索引,但索引在上面的`DataFrame`中是左边的东西,在`Date`下面。 它基本上是一年中的所有日子。 58 | 59 | ```py 60 | berri_bikes.index 61 | ``` 62 | 63 | ``` 64 | 65 | [2012-01-01 00:00:00, ..., 2012-11-05 00:00:00] 66 | Length: 310, Freq: None, Timezone: None 67 | ``` 68 | 69 | 你可以看到,实际上缺少一些日期 - 实际上只有一年的 310 天。 天知道为什么。 70 | 71 | Pandas 有一堆非常棒的时间序列功能,所以如果我们想得到每一行的月份中的日期,我们可以这样做: 72 | 73 | ```py 74 | berri_bikes.index.day 75 | ``` 76 | 77 | ``` 78 | array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 79 | 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 1, 2, 3, 80 | 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 81 | 21, 22, 23, 24, 25, 26, 27, 28, 29, 1, 2, 3, 4, 5, 6, 7, 8, 82 | 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 83 | 26, 27, 28, 29, 30, 31, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 84 | 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 85 | 29, 30, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 86 | 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 1, 87 | 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 88 | 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 1, 2, 3, 4, 5, 89 | 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 90 | 23, 24, 25, 26, 27, 28, 29, 30, 31, 1, 2, 3, 4, 5, 6, 7, 8, 91 | 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 92 | 26, 27, 28, 29, 30, 31, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 93 | 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 94 | 29, 30, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 95 | 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 1, 96 | 2, 3, 4, 5], dtype=int32) 97 | ``` 98 | 99 | 我们实际上想要星期: 100 | 101 | ```py 102 | berri_bikes.index.weekday 103 | ``` 104 | 105 | ```py 106 | array([6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 107 | 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 108 | 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 109 | 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 110 | 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 111 | 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 112 | 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 113 | 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 114 | 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 115 | 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 116 | 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 
3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 117 | 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 118 | 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 119 | 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0], dtype=int32) 120 | ``` 121 | 122 | 这是周中的日期,其中 0 是星期一。我通过查询日历得到 0 是星期一。 123 | 124 | 现在我们知道了如何获取星期,我们可以将其添加到我们的`DataFrame`中作为一列: 125 | 126 | ```py 127 | berri_bikes['weekday'] = berri_bikes.index.weekday 128 | berri_bikes[:5] 129 | ``` 130 | 131 | | | Berri 1 | weekday | 132 | | --- | --- | 133 | | Date | | | 134 | | 2012-01-01 | 35 | 6 | 135 | | 2012-01-02 | 83 | 0 | 136 | | 2012-01-03 | 135 | 1 | 137 | | 2012-01-04 | 144 | 2 | 138 | | 2012-01-05 | 197 | 3 | 139 | 140 | ## 4.2 按星期统计骑手 141 | 142 | 这很易于实现! 143 | 144 | `Dataframe`有一个类似于 SQL`groupby`的`.groupby()`方法,如果你熟悉的话。 我现在不打算解释更多 - 如果你想知道更多,请见[文档](http://pandas.pydata.org/pandas-docs/stable/groupby.html)。 145 | 146 | 在这种情况下,`berri_bikes.groupby('weekday')`.aggregate(sum)`意味着“按星期对行分组,然后将星期相同的所有值相加”。 147 | 148 | ```py 149 | weekday_counts = berri_bikes.groupby('weekday').aggregate(sum) 150 | weekday_counts 151 | ``` 152 | 153 | | | Berri 1 | 154 | | --- | --- | 155 | | weekday | | 156 | | 0 | 134298 | 157 | | 1 | 135305 | 158 | | 2 | 152972 | 159 | | 3 | 160131 | 160 | | 4 | 141771 | 161 | | 5 | 101578 | 162 | | 6 | 99310 | 163 | 164 | 很难记住`0, 1, 2, 3, 4, 5, 6`是什么,所以让我们修复它并绘制出来: 165 | 166 | ```py 167 | weekday_counts.index = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'] 168 | weekday_counts 169 | ``` 170 | 171 | 172 | | | Berri 1 | 173 | | --- | --- | 174 | | Monday | 134298 | 175 | | Tuesday | 135305 | 176 | | Wednesday | 152972 | 177 | | Thursday | 160131 | 178 | | Friday | 141771 | 179 | | Saturday | 101578 | 180 | | Sunday | 99310 | 181 | 182 | ```py 183 | weekday_counts.plot(kind='bar') 184 | ``` 185 | 186 | ``` 187 | 188 | ``` 189 | 190 | ![](http://upload-images.jianshu.io/upload_images/118142-14bd94cef2d645cb.png) 191 | 192 | 所以看起来蒙特利尔是通勤骑自行车的人 - 他们在工作日骑自行车更多。 193 | 194 | ## 4.3 放到一起 195 | 196 | 让我们把所有的一起,证明它是多么容易。 6 行的神奇 Pandas! 
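在照抄下面这段代码之前,有个小提示:在较新版本的 pandas 中,像`bikes[['Berri 1']]`这样先切片、再直接往结果上添加新列的写法,可能会触发`SettingWithCopyWarning`。一个更稳妥的写法是先显式复制(速写,基于较新 pandas 的假设,并非原文的做法):

```py
# 显式 .copy(),让 pandas 知道我们想要一个独立的 DataFrame
berri_bikes = bikes[['Berri 1']].copy()
berri_bikes['weekday'] = berri_bikes.index.weekday
```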
197 | 198 | 如果你想玩一玩,尝试将`sum`变为`max`,`np.median`,或任何你喜欢的其他函数。 199 | 200 | ```py 201 | bikes = pd.read_csv('../data/bikes.csv', 202 |                     sep=';', encoding='latin1', 203 |                     parse_dates=['Date'], dayfirst=True, 204 |                     index_col='Date') 205 | # 添加 weekday 列 206 | berri_bikes = bikes[['Berri 1']] 207 | berri_bikes['weekday'] = berri_bikes.index.weekday 208 | 209 | # 按照星期累计骑手,并绘制出来 210 | weekday_counts = berri_bikes.groupby('weekday').aggregate(sum) 211 | weekday_counts.index = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'] 212 | weekday_counts.plot(kind='bar') 213 | ``` -------------------------------------------------------------------------------- /2.5.md: -------------------------------------------------------------------------------- 1 | # 第五章 2 | 3 | > 原文:[Chapter 5](http://nbviewer.jupyter.org/github/jvns/pandas-cookbook/blob/v0.1/cookbook/Chapter%205%20-%20Combining%20dataframes%20and%20scraping%20Canadian%20weather%20data.ipynb) 4 | 5 | > 译者:[飞龙](https://github.com/wizardforcel) 6 | 7 | > 协议:[CC BY-NC-SA 4.0](http://creativecommons.org/licenses/by-nc-sa/4.0/) 8 | 9 | ## 5.1 下载一个月的天气数据 10 | 11 | 在处理自行车数据时,我需要温度和降水数据,来弄清楚人们下雨时是否喜欢骑自行车。 所以我访问了加拿大历史天气数据的网站,并想出如何自动获得它们。 12 | 13 | 这里我们将获取 2012 年 3 月的数据,并清理它们。 14 | 15 | 以下是可用于在蒙特利尔获取数据的网址模板。 16 | 17 | ```py 18 | url_template = "http://climate.weather.gc.ca/climateData/bulkdata_e.html?format=csv&stationID=5415&Year={year}&Month={month}&timeframe=1&submit=Download+Data" 19 | ``` 20 | 21 | 为了获取 2012 年 3 月的数据,我们需要以`month=3, year=2012`对它格式化: 22 | 23 | ```py 24 | url = url_template.format(month=3, year=2012) 25 | weather_mar2012 = pd.read_csv(url, skiprows=16, index_col='Date/Time', parse_dates=True, encoding='latin1') 26 | ``` 27 | 28 | 这非常不错! 我们可以使用和以前一样的`read_csv`函数,并且只是给它一个 URL 作为文件名。 真棒。 29 | 30 | 在这个 CSV 的顶部有 16 行元数据,但是 Pandas 知道 CSV 很奇怪,所以有一个`skiprows`选项。 我们再次解析日期,并将`Date/Time`设置为索引列。 这是产生的`DataFrame`。 31 | 32 | ```py 33 | weather_mar2012 34 | ``` 35 | 36 | ``` 37 |  38 | DatetimeIndex: 744 entries, 2012-03-01 00:00:00 to 2012-03-31 23:00:00 39 | Data columns (total 24 columns): 40 | Year                   744  non-null values 41 | Month                  744  non-null values 42 | Day                    744  non-null values 43 | Time                   744  non-null values 44 | Data Quality           744  non-null values 45 | Temp (°C)              744  non-null values 46 | Temp Flag              0  non-null values 47 | Dew Point Temp (°C)    744  non-null values 48 | Dew Point Temp Flag    0  non-null values 49 | Rel Hum (%)            744  non-null values 50 | Rel Hum Flag           0  non-null values 51 | Wind Dir (10s deg)     715  non-null values 52 | Wind Dir Flag          0  non-null values 53 | Wind Spd (km/h)        744  non-null values 54 | Wind Spd Flag          3  non-null values 55 | Visibility (km)        744  non-null values 56 | Visibility Flag        0  non-null values 57 | Stn Press (kPa)        744  non-null values 58 | Stn Press Flag         0  non-null values 59 | Hmdx                   12  non-null values 60 | Hmdx Flag              0  non-null values 61 | Wind Chill             242  non-null values 62 | Wind Chill Flag        1  non-null values 63 | Weather                744  non-null values 64 | dtypes: float64(14), int64(5), object(5) 65 | ``` 66 | 67 | 让我们绘制它吧! 
68 | 69 | ```py 70 | weather_mar2012[u"Temp (\xb0C)"].plot(figsize=(15, 5)) 71 | ``` 72 | 73 | ``` 74 | 75 | ``` 76 | 77 | ![](http://upload-images.jianshu.io/upload_images/118142-388bc1a04030c146.png) 78 | 79 | 注意它在中间升高到25°C。这是一个大问题。 这是三月,人们在外面穿着短裤。 80 | 81 | 我出城了,而且错过了。真是伤心啊。 82 | 83 | 我需要将度数字符`°`写为`'\xb0'`。 让我们去掉它,让它更容易键入。 84 | 85 | ```py 86 | weather_mar2012.columns = [s.replace(u'\xb0', '') for s in weather_mar2012.columns] 87 | ``` 88 | 89 | 你会注意到在上面的摘要中,有几个列完全是空的,或其中只有几个值。 让我们使用`dropna`去掉它们。 90 | 91 | `dropna `中的`axis=1`意味着“删除列,而不是行”,以及`how ='any'`意味着“如果任何值为空,则删除列”。 92 | 93 | 现在更好了 - 我们只有带有真实数据的列。 94 | 95 | 96 | | | Year | Month | Day | Time | Data Quality | Temp (C) | Dew Point Temp (C) | Rel Hum (%) | Wind Spd (km/h) | Visibility (km) | Stn Press (kPa) | Weather | 97 | | --- | --- | 98 | | Date/Time | | | | | | | | | | | | | 99 | | 2012-03-01 00:00:00 | 2012 | 3 | 1 | 00:00 | | -5.5 | -9.7 | 72 | 24 | 4.0 | 100.97 | Snow | 100 | | 2012-03-01 01:00:00 | 2012 | 3 | 1 | 01:00 | | -5.7 | -8.7 | 79 | 26 | 2.4 | 100.87 | Snow | 101 | | 2012-03-01 02:00:00 | 2012 | 3 | 1 | 02:00 | | -5.4 | -8.3 | 80 | 28 | 4.8 | 100.80 | Snow | 102 | | 2012-03-01 03:00:00 | 2012 | 3 | 1 | 03:00 | | -4.7 | -7.7 | 79 | 28 | 4.0 | 100.69 | Snow | 103 | | 2012-03-01 04:00:00 | 2012 | 3 | 1 | 04:00 | | -5.4 | -7.8 | 83 | 35 | 1.6 | 100.62 | Snow | 104 | 105 | `Year/Month/Day/Time`列是冗余的,但`Data Quality`列看起来不太有用。 让我们去掉他们。 106 | 107 | `axis = 1`参数意味着“删除列”,像以前一样。 `dropna`和`drop`等操作的默认值总是对行进行操作。 108 | 109 | ```py 110 | weather_mar2012 = weather_mar2012.drop(['Year', 'Month', 'Day', 'Time', 'Data Quality'], axis=1) 111 | weather_mar2012[:5] 112 | ``` 113 | 114 | | | Temp (C) | Dew Point Temp (C) | Rel Hum (%) | Wind Spd (km/h) | Visibility (km) | Stn Press (kPa) | Weather | 115 | | --- | --- | 116 | | Date/Time | | | | | | | | 117 | | 2012-03-01 00:00:00 | -5.5 | -9.7 | 72 | 24 | 4.0 | 100.97 | Snow | 118 | | 2012-03-01 01:00:00 | -5.7 | -8.7 | 79 | 26 | 2.4 | 100.87 | Snow | 119 | | 2012-03-01 02:00:00 | -5.4 | -8.3 | 80 | 28 | 4.8 | 100.80 | Snow | 120 | | 2012-03-01 03:00:00 | -4.7 | -7.7 | 79 | 28 | 4.0 | 100.69 | Snow | 121 | | 2012-03-01 04:00:00 | -5.4 | -7.8 | 83 | 35 | 1.6 | 100.62 | Snow | 122 | 123 | ## 5.2 按一天中的小时绘制温度 124 | 125 | 这只是为了好玩 - 我们以前已经做过,使用`groupby`和`aggregate`! 我们将了解它是否在夜间变冷。 好吧,这是显然的。 但是让我们这样做。 126 | 127 | ```py 128 | temperatures = weather_mar2012[[u'Temp (C)']] 129 | temperatures['Hour'] = weather_mar2012.index.hour 130 | temperatures.groupby('Hour').aggregate(np.median).plot() 131 | ``` 132 | 133 | ![](http://upload-images.jianshu.io/upload_images/118142-55fcf7ebf4a66b16.png) 134 | 135 | 所以温度中位数在 2pm 时达到峰值。 136 | 137 | ## 5.3 获取整年的数据 138 | 139 | 好吧,那么如果我们想要全年的数据呢? 
理想情况下 API 会让我们下载,但我不能找出一种方法来实现它。 140 | 141 | 首先,让我们将上面的成果放到一个函数中,函数按照给定月份获取天气。 142 | 143 | 我注意到有一个烦人的 bug,当我请求一月时,它给我上一年的数据,所以我们要解决这个问题。 【真的是这样。你可以检查一下 =)】 144 | 145 | ```py 146 | def download_weather_month(year, month): 147 | if month == 1: 148 | year += 1 149 | url = url_template.format(year=year, month=month) 150 | weather_data = pd.read_csv(url, skiprows=16, index_col='Date/Time', parse_dates=True) 151 | weather_data = weather_data.dropna(axis=1) 152 | weather_data.columns = [col.replace('\xb0', '') for col in weather_data.columns] 153 | weather_data = weather_data.drop(['Year', 'Day', 'Month', 'Time', 'Data Quality'], axis=1) 154 | return weather_data 155 | ``` 156 | 157 | 我们可以测试这个函数是否行为正确: 158 | 159 | ```py 160 | download_weather_month(2012, 1)[:5] 161 | ``` 162 | 163 | 164 | | | Temp (C) | Dew Point Temp (C) | Rel Hum (%) | Wind Spd (km/h) | Visibility (km) | Stn Press (kPa) | Weather | 165 | | --- | --- | 166 | | Date/Time | | | | | | | | 167 | | 2012-01-01 00:00:00 | -1.8 | -3.9 | 86 | 4 | 8.0 | 101.24 | Fog | 168 | | 2012-01-01 01:00:00 | -1.8 | -3.7 | 87 | 4 | 8.0 | 101.24 | Fog | 169 | | 2012-01-01 02:00:00 | -1.8 | -3.4 | 89 | 7 | 4.0 | 101.26 | Freezing Drizzle,Fog | 170 | | 2012-01-01 03:00:00 | -1.5 | -3.2 | 88 | 6 | 4.0 | 101.27 | Freezing Drizzle,Fog | 171 | | 2012-01-01 04:00:00 | -1.5 | -3.3 | 88 | 7 | 4.8 | 101.23 | Fog | 172 | 173 | 现在我们一次性获取了所有月份,需要一些时间来运行。 174 | 175 | ```py 176 | data_by_month = [download_weather_month(2012, i) for i in range(1, 13)] 177 | ``` 178 | 179 | 一旦我们完成之后,可以轻易使用`pd.concat`将所有`DataFrame`连接成一个大`DataFrame`。 现在我们有整年的数据了! 180 | 181 | ```py 182 | weather_2012 = pd.concat(data_by_month) 183 | weather_2012 184 | ``` 185 | 186 | ``` 187 | 188 | DatetimeIndex: 8784 entries, 2012-01-01 00:00:00 to 2012-12-31 23:00:00 189 | Data columns (total 7 columns): 190 | Temp (C) 8784 non-null values 191 | Dew Point Temp (C) 8784 non-null values 192 | Rel Hum (%) 8784 non-null values 193 | Wind Spd (km/h) 8784 non-null values 194 | Visibility (km) 8784 non-null values 195 | Stn Press (kPa) 8784 non-null values 196 | Weather 8784 non-null values 197 | dtypes: float64(4), int64(2), object(1) 198 | ``` 199 | 200 | ## 5.4 保存到 CSV 201 | 202 | 每次下载数据会非常慢,所以让我们保存`DataFrame`: 203 | 204 | ```py 205 | weather_2012.to_csv('../data/weather_2012.csv') 206 | ``` 207 | 208 | 这就完成了! 209 | 210 | ## 5.5 总结 211 | 212 | 在这一章末尾,我们下载了加拿大 2012 年的所有天气数据,并保存到了 CSV 中。 213 | 214 | 我们通过一次下载一个月份,之后组合所有月份来实现。 215 | 216 | 这里是 2012 年每一个小时的天气数据! 217 | 218 | ```py 219 | weather_2012_final = pd.read_csv('../data/weather_2012.csv', index_col='Date/Time') 220 | weather_2012_final['Temp (C)'].plot(figsize=(15, 6)) 221 | ``` 222 | 223 | ``` 224 | 225 | ``` 226 | 227 | ![](http://upload-images.jianshu.io/upload_images/118142-79e416b5c76b0f4f.png) -------------------------------------------------------------------------------- /2.6.md: -------------------------------------------------------------------------------- 1 | # 第六章 2 | 3 | > 原文:[Chapter 6](http://nbviewer.jupyter.org/github/jvns/pandas-cookbook/blob/v0.1/cookbook/Chapter%206%20-%20String%20operations%21%20Which%20month%20was%20the%20snowiest%3F.ipynb) 4 | 5 | > 译者:[飞龙](https://github.com/wizardforcel) 6 | 7 | > 协议:[CC BY-NC-SA 4.0](http://creativecommons.org/licenses/by-nc-sa/4.0/) 8 | 9 | ```py 10 | import pandas as pd 11 | pd.set_option('display.mpl_style', 'default') 12 | figsize(15, 3) 13 | ``` 14 | 15 | 我们前面看到,Pandas 真的很善于处理日期。 它也善于处理字符串! 
我们从第 5 章回顾我们的天气数据。 16 | 17 | ```py 18 | weather_2012 = pd.read_csv('../data/weather_2012.csv', parse_dates=True, index_col='Date/Time') 19 | weather_2012[:5] 20 | ``` 21 | 22 | | | Temp (C) | Dew Point Temp (C) | Rel Hum (%) | Wind Spd (km/h) | Visibility (km) | Stn Press (kPa) | Weather | 23 | | --- | --- | 24 | | Date/Time |  |  |  |  |  |  |  | 25 | | 2012-01-01 00:00:00 | -1.8 | -3.9 | 86 | 4 | 8.0 | 101.24 | Fog | 26 | | 2012-01-01 01:00:00 | -1.8 | -3.7 | 87 | 4 | 8.0 | 101.24 | Fog | 27 | | 2012-01-01 02:00:00 | -1.8 | -3.4 | 89 | 7 | 4.0 | 101.26 | Freezing Drizzle,Fog | 28 | | 2012-01-01 03:00:00 | -1.5 | -3.2 | 88 | 6 | 4.0 | 101.27 | Freezing Drizzle,Fog | 29 | | 2012-01-01 04:00:00 | -1.5 | -3.3 | 88 | 7 | 4.8 | 101.23 | Fog | 30 | 31 | ## 6.1 字符串操作 32 | 33 | 您会看到`Weather`列会显示每小时发生的天气的文字说明。 如果文本描述包含`Snow`,我们将假设它是下雪的。 34 | 35 | pandas 提供了向量化的字符串函数,以便于对包含文本的列进行操作。 [文档](http://pandas.pydata.org/pandas-docs/stable/basics.html#vectorized-string-methods)中有一些很好的例子。 36 | 37 | ```py 38 | weather_description = weather_2012['Weather'] 39 | is_snowing = weather_description.str.contains('Snow') 40 | ``` 41 | 42 | 这会给我们一个二进制向量,很难看出里面的东西,所以我们绘制它: 43 | 44 | ```py 45 | # Not super useful 46 | is_snowing[:5] 47 | ``` 48 | 49 | ``` 50 | Date/Time 51 | 2012-01-01 00:00:00    False 52 | 2012-01-01 01:00:00    False 53 | 2012-01-01 02:00:00    False 54 | 2012-01-01 03:00:00    False 55 | 2012-01-01 04:00:00    False 56 | Name: Weather, dtype: bool 57 | ``` 58 | 59 | ```py 60 | # More useful! 61 | is_snowing.plot() 62 | ``` 63 | 64 | ``` 65 | 66 | ``` 67 | 68 | ![](http://upload-images.jianshu.io/upload_images/118142-1b069cba8676dee4.png) 69 | 70 | ## 6.2 使用`resample`找到下雪最多的月份 71 | 72 | 如果我们想要每个月的温度中值,我们可以使用`resample()`方法,如下所示: 73 | 74 | ```py 75 | weather_2012['Temp (C)'].resample('M', how=np.median).plot(kind='bar') 76 | ``` 77 | 78 | ``` 79 | 80 | ``` 81 | 82 | ![](http://upload-images.jianshu.io/upload_images/118142-0b1823080da895fe.png) 83 | 84 | 毫无奇怪,七月和八月是最暖和的。 85 | 86 | 所以我们可以用`is_snowing.astype(float)[:10]`,将`is_snowing`的前几个值转化为一堆 0 和 1,而不是`True`和`False`: 87 | 88 | ``` 89 | Date/Time 90 | 2012-01-01 00:00:00    0 91 | 2012-01-01 01:00:00    0 92 | 2012-01-01 02:00:00    0 93 | 2012-01-01 03:00:00    0 94 | 2012-01-01 04:00:00    0 95 | 2012-01-01 05:00:00    0 96 | 2012-01-01 06:00:00    0 97 | 2012-01-01 07:00:00    0 98 | 2012-01-01 08:00:00    0 99 | 2012-01-01 09:00:00    0 100 | Name: Weather, dtype: float64 101 | ``` 102 | 103 | 然后使用`resample`寻找每个月下雪的时间比例。 104 | 105 | ```py 106 | is_snowing.astype(float).resample('M', how=np.mean) 107 | ``` 108 | 109 | ``` 110 | Date/Time 111 | 2012-01-31    0.240591 112 | 2012-02-29    0.162356 113 | 2012-03-31    0.087366 114 | 2012-04-30    0.015278 115 | 2012-05-31    0.000000 116 | 2012-06-30    0.000000 117 | 2012-07-31    0.000000 118 | 2012-08-31    0.000000 119 | 2012-09-30    0.000000 120 | 2012-10-31    0.000000 121 | 2012-11-30    0.038889 122 | 2012-12-31    0.251344 123 | Freq: M, dtype: float64 124 | ``` 125 | 126 | ```py 127 | is_snowing.astype(float).resample('M', how=np.mean).plot(kind='bar') 128 | ``` 129 | 130 | ``` 131 | 132 | ``` 133 | 134 | ![](http://upload-images.jianshu.io/upload_images/118142-b7121e50d8ccbf5e.png) 135 | 136 | 所以现在我们知道了! 
2012 年 12 月是下雪最多的一个月。 此外,这个图表暗示着我感觉到的东西 - 11 月突然开始下雪,然后慢慢变慢,需要很长时间停止,最后下雪的月份通常在 4 月或 5 月。 137 | 138 | ## 6.3 将温度和降雪绘制在一起 139 | 140 | 我们还可以将这两个统计(温度和降雪)合并为一个`DataFrame`,并将它们绘制在一起: 141 | 142 | ```py 143 | temperature = weather_2012['Temp (C)'].resample('M', how=np.median) 144 | is_snowing = weather_2012['Weather'].str.contains('Snow') 145 | snowiness = is_snowing.astype(float).resample('M', how=np.mean) 146 | 147 | # Name the columns 148 | temperature.name = "Temperature" 149 | snowiness.name = "Snowiness" 150 | ``` 151 | 152 | 我们再次使用`concat `,将两个统计连接为一个`DataFrame`。 153 | 154 | ```py 155 | stats = pd.concat([temperature, snowiness], axis=1) 156 | stats 157 | ``` 158 | 159 | 160 | | | Temperature | Snowiness | 161 | | --- | --- | --- | 162 | | Date/Time | | | 163 | | 2012-01-31 | -7.05 | 0.240591 | 164 | | 2012-02-29 | -4.10 | 0.162356 | 165 | | 2012-03-31 | 2.60 | 0.087366 | 166 | | 2012-04-30 | 6.30 | 0.015278 | 167 | | 2012-05-31 | 16.05 | 0.000000 | 168 | | 2012-06-30 | 19.60 | 0.000000 | 169 | | 2012-07-31 | 22.90 | 0.000000 | 170 | | 2012-08-31 | 22.20 | 0.000000 | 171 | | 2012-09-30 | 16.10 | 0.000000 | 172 | | 2012-10-31 | 11.30 | 0.000000 | 173 | | 2012-11-30 | 1.05 | 0.038889 | 174 | | 2012-12-31 | -2.85 | 0.251344 | 175 | 176 | ```py 177 | stats.plot(kind='bar') 178 | ``` 179 | 180 | ``` 181 | 182 | ``` 183 | 184 | ![](http://upload-images.jianshu.io/upload_images/118142-73924abb44aca794.png) 185 | 186 | 这并不能正常工作,因为比例不对,我们可以在两个图表中分别绘制它们,这样会更好: 187 | 188 | ```py 189 | stats.plot(kind='bar', subplots=True, figsize=(15, 10)) 190 | ``` 191 | 192 | ``` 193 | array([, 194 | ], dtype=object) 195 | ``` 196 | 197 | ![](http://upload-images.jianshu.io/upload_images/118142-f7add95986df2c65.png) 198 | 199 | -------------------------------------------------------------------------------- /2.7.md: -------------------------------------------------------------------------------- 1 | # 第七章 2 | 3 | > 原文:[Chapter 7](http://nbviewer.jupyter.org/github/jvns/pandas-cookbook/blob/master/cookbook/Chapter%207%20-%20Cleaning%20up%20messy%20data.ipynb) 4 | 5 | > 译者:[飞龙](https://github.com/wizardforcel) 6 | 7 | > 协议:[CC BY-NC-SA 4.0](http://creativecommons.org/licenses/by-nc-sa/4.0/) 8 | 9 | ```py 10 | # 通常的开头 11 | %matplotlib inline 12 | 13 | import pandas as pd 14 | import matplotlib.pyplot as plt 15 | import numpy as np 16 | 17 | # 使图表更大更漂亮 18 | pd.set_option('display.mpl_style', 'default') 19 | plt.rcParams['figure.figsize'] = (15, 5) 20 | plt.rcParams['font.family'] = 'sans-serif' 21 | 22 | # 在 Pandas 0.12 中需要展示大量的列 23 | # 在 Pandas 0.13 中不需要 24 | pd.set_option('display.width', 5000) 25 | pd.set_option('display.max_columns', 60) 26 | ``` 27 | 28 | 杂乱数据的主要问题之一是:你怎么知道它是否杂乱呢? 29 | 30 | 我们将在这里使用 NYC 311 服务请求数据集,因为它很大,有点不方便。 31 | 32 | ```py 33 | requests = pd.read_csv('../data/311-service-requests.csv') 34 | ``` 35 | 36 | ## 7.1 我怎么知道它是否杂乱? 
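在逐列细看之前,还有一个快速“体检”的小技巧值得一提(这是补充的速写,不是原文的做法):`isnull().sum()`可以告诉我们每列有多少缺失值,让我们对数据有多“脏”先有个粗略感受。

```py
# 统计每一列的缺失值数量
requests.isnull().sum()
```

不过缺失值只是杂乱的一种形式,下面会看到更隐蔽的问题。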
37 | 38 | 我们在这里查看几列。 我知道邮政编码有一些问题,所以让我们先看看它。 39 | 40 | 要了解列是否有问题,我通常使用`.unique()`来查看所有的值。 如果它是一列数字,我将绘制一个直方图来获得分布的感觉。 41 | 42 | 当我们看看`Incident Zip`中的唯一值时,很快就会清楚这是一个混乱。 43 | 44 | 一些问题: 45 | 46 | + 一些已经解析为字符串,一些是浮点 47 | + 存在`nan` 48 | + 部分邮政编码为`29616-0759`或`83` 49 | + 有一些 Pandas 无法识别的 N/A 值 ,如`'N/A'`和`'NO CLUE'` 50 | 51 | 我们可以做的事情: 52 | 53 | + 将`N/A`和`NO CLUE`规格化为`nan`值 54 | + 看看 83 处发生了什么,并决定做什么 55 | + 将一切转化为字符串 56 | 57 | ```py 58 | requests['Incident Zip'].unique() 59 | ``` 60 | 61 | ``` 62 | array([11432.0, 11378.0, 10032.0, 10023.0, 10027.0, 11372.0, 11419.0, 63 | 11417.0, 10011.0, 11225.0, 11218.0, 10003.0, 10029.0, 10466.0, 64 | 11219.0, 10025.0, 10310.0, 11236.0, nan, 10033.0, 11216.0, 10016.0, 65 | 10305.0, 10312.0, 10026.0, 10309.0, 10036.0, 11433.0, 11235.0, 66 | 11213.0, 11379.0, 11101.0, 10014.0, 11231.0, 11234.0, 10457.0, 67 | 10459.0, 10465.0, 11207.0, 10002.0, 10034.0, 11233.0, 10453.0, 68 | 10456.0, 10469.0, 11374.0, 11221.0, 11421.0, 11215.0, 10007.0, 69 | 10019.0, 11205.0, 11418.0, 11369.0, 11249.0, 10005.0, 10009.0, 70 | 11211.0, 11412.0, 10458.0, 11229.0, 10065.0, 10030.0, 11222.0, 71 | 10024.0, 10013.0, 11420.0, 11365.0, 10012.0, 11214.0, 11212.0, 72 | 10022.0, 11232.0, 11040.0, 11226.0, 10281.0, 11102.0, 11208.0, 73 | 10001.0, 10472.0, 11414.0, 11223.0, 10040.0, 11220.0, 11373.0, 74 | 11203.0, 11691.0, 11356.0, 10017.0, 10452.0, 10280.0, 11217.0, 75 | 10031.0, 11201.0, 11358.0, 10128.0, 11423.0, 10039.0, 10010.0, 76 | 11209.0, 10021.0, 10037.0, 11413.0, 11375.0, 11238.0, 10473.0, 77 | 11103.0, 11354.0, 11361.0, 11106.0, 11385.0, 10463.0, 10467.0, 78 | 11204.0, 11237.0, 11377.0, 11364.0, 11434.0, 11435.0, 11210.0, 79 | 11228.0, 11368.0, 11694.0, 10464.0, 11415.0, 10314.0, 10301.0, 80 | 10018.0, 10038.0, 11105.0, 11230.0, 10468.0, 11104.0, 10471.0, 81 | 11416.0, 10075.0, 11422.0, 11355.0, 10028.0, 10462.0, 10306.0, 82 | 10461.0, 11224.0, 11429.0, 10035.0, 11366.0, 11362.0, 11206.0, 83 | 10460.0, 10304.0, 11360.0, 11411.0, 10455.0, 10475.0, 10069.0, 84 | 10303.0, 10308.0, 10302.0, 11357.0, 10470.0, 11367.0, 11370.0, 85 | 10454.0, 10451.0, 11436.0, 11426.0, 10153.0, 11004.0, 11428.0, 86 | 11427.0, 11001.0, 11363.0, 10004.0, 10474.0, 11430.0, 10000.0, 87 | 10307.0, 11239.0, 10119.0, 10006.0, 10048.0, 11697.0, 11692.0, 88 | 11693.0, 10573.0, 83.0, 11559.0, 10020.0, 77056.0, 11776.0, 70711.0, 89 | 10282.0, 11109.0, 10044.0, '10452', '11233', '10468', '10310', 90 | '11105', '10462', '10029', '10301', '10457', '10467', '10469', 91 | '11225', '10035', '10031', '11226', '10454', '11221', '10025', 92 | '11229', '11235', '11422', '10472', '11208', '11102', '10032', 93 | '11216', '10473', '10463', '11213', '10040', '10302', '11231', 94 | '10470', '11204', '11104', '11212', '10466', '11416', '11214', 95 | '10009', '11692', '11385', '11423', '11201', '10024', '11435', 96 | '10312', '10030', '11106', '10033', '10303', '11215', '11222', 97 | '11354', '10016', '10034', '11420', '10304', '10019', '11237', 98 | '11249', '11230', '11372', '11207', '11378', '11419', '11361', 99 | '10011', '11357', '10012', '11358', '10003', '10002', '11374', 100 | '10007', '11234', '10065', '11369', '11434', '11205', '11206', 101 | '11415', '11236', '11218', '11413', '10458', '11101', '10306', 102 | '11355', '10023', '11368', '10314', '11421', '10010', '10018', 103 | '11223', '10455', '11377', '11433', '11375', '10037', '11209', 104 | '10459', '10128', '10014', '10282', '11373', '10451', '11238', 105 | '11211', '10038', '11694', '11203', '11691', '11232', '10305', 106 | '10021', '11228', '10036', '10001', 
'10017', '11217', '11219', 107 | '10308', '10465', '11379', '11414', '10460', '11417', '11220', 108 | '11366', '10027', '11370', '10309', '11412', '11356', '10456', 109 | '11432', '10022', '10013', '11367', '11040', '10026', '10475', 110 | '11210', '11364', '11426', '10471', '10119', '11224', '11418', 111 | '11429', '11365', '10461', '11239', '10039', '00083', '11411', 112 | '10075', '11004', '11360', '10453', '10028', '11430', '10307', 113 | '11103', '10004', '10069', '10005', '10474', '11428', '11436', 114 | '10020', '11001', '11362', '11693', '10464', '11427', '10044', 115 | '11363', '10006', '10000', '02061', '77092-2016', '10280', '11109', 116 | '14225', '55164-0737', '19711', '07306', '000000', 'NO CLUE', 117 | '90010', '10281', '11747', '23541', '11776', '11697', '11788', 118 | '07604', 10112.0, 11788.0, 11563.0, 11580.0, 7087.0, 11042.0, 119 | 7093.0, 11501.0, 92123.0, 0.0, 11575.0, 7109.0, 11797.0, '10803', 120 | '11716', '11722', '11549-3650', '10162', '92123', '23502', '11518', 121 | '07020', '08807', '11577', '07114', '11003', '07201', '11563', 122 | '61702', '10103', '29616-0759', '35209-3114', '11520', '11735', 123 | '10129', '11005', '41042', '11590', 6901.0, 7208.0, 11530.0, 124 | 13221.0, 10954.0, 11735.0, 10103.0, 7114.0, 11111.0, 10107.0], dtype=object) 125 | ``` 126 | 127 | ## 7.3 修复`nan`值和字符串/浮点混淆 128 | 129 | 我们可以将`na_values`选项传递到`pd.read_csv`来清理它们。 我们还可以指定`Incident Zip`的类型是字符串,而不是浮点。 130 | 131 | ```py 132 | na_values = ['NO CLUE', 'N/A', '0'] 133 | requests = pd.read_csv('../data/311-service-requests.csv', na_values=na_values, dtype={'Incident Zip': str}) 134 | ``` 135 | 136 | ```py 137 | requests['Incident Zip'].unique() 138 | ``` 139 | 140 | ``` 141 | array(['11432', '11378', '10032', '10023', '10027', '11372', '11419', 142 | '11417', '10011', '11225', '11218', '10003', '10029', '10466', 143 | '11219', '10025', '10310', '11236', nan, '10033', '11216', '10016', 144 | '10305', '10312', '10026', '10309', '10036', '11433', '11235', 145 | '11213', '11379', '11101', '10014', '11231', '11234', '10457', 146 | '10459', '10465', '11207', '10002', '10034', '11233', '10453', 147 | '10456', '10469', '11374', '11221', '11421', '11215', '10007', 148 | '10019', '11205', '11418', '11369', '11249', '10005', '10009', 149 | '11211', '11412', '10458', '11229', '10065', '10030', '11222', 150 | '10024', '10013', '11420', '11365', '10012', '11214', '11212', 151 | '10022', '11232', '11040', '11226', '10281', '11102', '11208', 152 | '10001', '10472', '11414', '11223', '10040', '11220', '11373', 153 | '11203', '11691', '11356', '10017', '10452', '10280', '11217', 154 | '10031', '11201', '11358', '10128', '11423', '10039', '10010', 155 | '11209', '10021', '10037', '11413', '11375', '11238', '10473', 156 | '11103', '11354', '11361', '11106', '11385', '10463', '10467', 157 | '11204', '11237', '11377', '11364', '11434', '11435', '11210', 158 | '11228', '11368', '11694', '10464', '11415', '10314', '10301', 159 | '10018', '10038', '11105', '11230', '10468', '11104', '10471', 160 | '11416', '10075', '11422', '11355', '10028', '10462', '10306', 161 | '10461', '11224', '11429', '10035', '11366', '11362', '11206', 162 | '10460', '10304', '11360', '11411', '10455', '10475', '10069', 163 | '10303', '10308', '10302', '11357', '10470', '11367', '11370', 164 | '10454', '10451', '11436', '11426', '10153', '11004', '11428', 165 | '11427', '11001', '11363', '10004', '10474', '11430', '10000', 166 | '10307', '11239', '10119', '10006', '10048', '11697', '11692', 167 | '11693', '10573', '00083', '11559', '10020', 
'77056', '11776', 168 | '70711', '10282', '11109', '10044', '02061', '77092-2016', '14225', 169 | '55164-0737', '19711', '07306', '000000', '90010', '11747', '23541', 170 | '11788', '07604', '10112', '11563', '11580', '07087', '11042', 171 | '07093', '11501', '92123', '00000', '11575', '07109', '11797', 172 | '10803', '11716', '11722', '11549-3650', '10162', '23502', '11518', 173 | '07020', '08807', '11577', '07114', '11003', '07201', '61702', 174 | '10103', '29616-0759', '35209-3114', '11520', '11735', '10129', 175 | '11005', '41042', '11590', '06901', '07208', '11530', '13221', 176 | '10954', '11111', '10107'], dtype=object) 177 | ``` 178 | 179 | ## 7.4 短横线处发生了什么 180 | 181 | ```py 182 | rows_with_dashes = requests['Incident Zip'].str.contains('-').fillna(False) 183 | len(requests[rows_with_dashes]) 184 | ``` 185 | 186 | ``` 187 | 5 188 | ``` 189 | 190 | ```py 191 | requests[rows_with_dashes] 192 | ``` 193 | 194 | 195 | | | Unique Key | Created Date | Closed Date | Agency | Agency Name | Complaint Type | Descriptor | Location Type | Incident Zip | Incident Address | Street Name | Cross Street 1 | Cross Street 2 | Intersection Street 1 | Intersection Street 2 | Address Type | City | Landmark | Facility Type | Status | Due Date | Resolution Action Updated Date | Community Board | Borough | X Coordinate (State Plane) | Y Coordinate (State Plane) | Park Facility Name | Park Borough | School Name | School Number | School Region | School Code | School Phone Number | School Address | School City | School State | School Zip | School Not Found | School or Citywide Complaint | Vehicle Type | Taxi Company Borough | Taxi Pick Up Location | Bridge Highway Name | Bridge Highway Direction | Road Ramp | Bridge Highway Segment | Garage Lot Name | Ferry Direction | Ferry Terminal Name | Latitude | Longitude | Location | 196 | | --- | --- | 197 | | 29136 | 26550551 | 10/24/2013 06:16:34 PM | NaN | DCA | Department of Consumer Affairs | Consumer Complaint | False Advertising | NaN | 77092-2016 | 2700 EAST SELTICE WAY | EAST SELTICE WAY | NaN | NaN | NaN | NaN | NaN | HOUSTON | NaN | NaN | Assigned | 11/13/2013 11:15:20 AM | 10/29/2013 11:16:16 AM | 0 Unspecified | Unspecified | NaN | NaN | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 198 | | 30939 | 26548831 | 10/24/2013 09:35:10 AM | NaN | DCA | Department of Consumer Affairs | Consumer Complaint | Harassment | NaN | 55164-0737 | P.O. BOX 64437 | 64437 | NaN | NaN | NaN | NaN | NaN | ST. 
PAUL | NaN | NaN | Assigned | 11/13/2013 02:30:21 PM | 10/29/2013 02:31:06 PM | 0 Unspecified | Unspecified | NaN | NaN | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 199 | | 70539 | 26488417 | 10/15/2013 03:40:33 PM | NaN | TLC | Taxi and Limousine Commission | Taxi Complaint | Driver Complaint | Street | 11549-3650 | 365 HOFSTRA UNIVERSITY | HOFSTRA UNIVERSITY | NaN | NaN | NaN | NaN | NaN | HEMSTEAD | NaN | NaN | Assigned | 11/30/2013 01:20:33 PM | 10/16/2013 01:21:39 PM | 0 Unspecified | Unspecified | NaN | NaN | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | La Guardia Airport | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 200 | | 85821 | 26468296 | 10/10/2013 12:36:43 PM | 10/26/2013 01:07:07 AM | DCA | Department of Consumer Affairs | Consumer Complaint | Debt Not Owed | NaN | 29616-0759 | PO BOX 25759 | BOX 25759 | NaN | NaN | NaN | NaN | NaN | GREENVILLE | NaN | NaN | Closed | 10/26/2013 09:20:28 AM | 10/26/2013 01:07:07 AM | 0 Unspecified | Unspecified | NaN | NaN | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 201 | | 89304 | 26461137 | 10/09/2013 05:23:46 PM | 10/25/2013 01:06:41 AM | DCA | Department of Consumer Affairs | Consumer Complaint | Harassment | NaN | 35209-3114 | 600 BEACON PKWY | BEACON PKWY | NaN | NaN | NaN | NaN | NaN | BIRMINGHAM | NaN | NaN | Closed | 10/25/2013 02:43:42 PM | 10/25/2013 01:06:41 AM | 0 Unspecified | Unspecified | NaN | NaN | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 202 | 203 | 204 | 我认为这些都是缺失的数据,像这样删除它们: 205 | 206 | ```py 207 | requests['Incident Zip'][rows_with_dashes] = np.nan 208 | ``` 209 | 210 | 但是我的朋友 Dave 指出,9 位邮政编码是正常的。 让我们看看所有超过 5 位数的邮政编码,确保它们没问题,然后截断它们。 211 | 212 | ```py 213 | long_zip_codes = requests['Incident Zip'].str.len() > 5 214 | requests['Incident Zip'][long_zip_codes].unique() 215 | ``` 216 | 217 | ``` 218 | array(['77092-2016', '55164-0737', '000000', '11549-3650', '29616-0759', 219 | '35209-3114'], dtype=object) 220 | ``` 221 | 222 | 这些看起来可以截断: 223 | 224 | ```py 225 | requests['Incident Zip'] = requests['Incident Zip'].str.slice(0, 5) 226 | ``` 227 | 228 | 就可以了。 229 | 230 | 231 | 早些时候我认为 00083 是一个损坏的邮政编码,但事实证明中央公园的邮政编码是 00083! 
显示我知道的吧。 我仍然关心 00000 邮政编码,但是:让我们看看。 232 | 233 | ```py 234 | requests[requests['Incident Zip'] == '00000'] 235 | ``` 236 | 237 | 238 | | | Unique Key | Created Date | Closed Date | Agency | Agency Name | Complaint Type | Descriptor | Location Type | Incident Zip | Incident Address | Street Name | Cross Street 1 | Cross Street 2 | Intersection Street 1 | Intersection Street 2 | Address Type | City | Landmark | Facility Type | Status | Due Date | Resolution Action Updated Date | Community Board | Borough | X Coordinate (State Plane) | Y Coordinate (State Plane) | Park Facility Name | Park Borough | School Name | School Number | School Region | School Code | School Phone Number | School Address | School City | School State | School Zip | School Not Found | School or Citywide Complaint | Vehicle Type | Taxi Company Borough | Taxi Pick Up Location | Bridge Highway Name | Bridge Highway Direction | Road Ramp | Bridge Highway Segment | Garage Lot Name | Ferry Direction | Ferry Terminal Name | Latitude | Longitude | Location | 239 | | --- | --- | 240 | | 42600 | 26529313 | 10/22/2013 02:51:06 PM | NaN | TLC | Taxi and Limousine Commission | Taxi Complaint | Driver Complaint | NaN | 00000 | EWR EWR | EWR | NaN | NaN | NaN | NaN | NaN | NEWARK | NaN | NaN | Assigned | 12/07/2013 09:53:51 AM | 10/23/2013 09:54:43 AM | 0 Unspecified | Unspecified | NaN | NaN | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | Other | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 241 | | 60843 | 26507389 | 10/17/2013 05:48:44 PM | NaN | TLC | Taxi and Limousine Commission | Taxi Complaint | Driver Complaint | Street | 00000 | 1 NEWARK AIRPORT | NEWARK AIRPORT | NaN | NaN | NaN | NaN | NaN | NEWARK | NaN | NaN | Assigned | 12/02/2013 11:59:46 AM | 10/18/2013 12:01:08 PM | 0 Unspecified | Unspecified | NaN | NaN | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | Other | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 242 | 243 | 这看起来对我来说很糟糕,让我将它们设为`NaN`。 244 | 245 | ```py 246 | zero_zips = requests['Incident Zip'] == '00000' 247 | requests.loc[zero_zips, 'Incident Zip'] = np.nan 248 | ``` 249 | 250 | 太棒了,让我们看看现在在哪里。 251 | 252 | ```py 253 | unique_zips = requests['Incident Zip'].unique() 254 | unique_zips.sort() 255 | unique_zips 256 | ``` 257 | 258 | ``` 259 | array([nan, '00083', '02061', '06901', '07020', '07087', '07093', '07109', 260 | '07114', '07201', '07208', '07306', '07604', '08807', '10000', 261 | '10001', '10002', '10003', '10004', '10005', '10006', '10007', 262 | '10009', '10010', '10011', '10012', '10013', '10014', '10016', 263 | '10017', '10018', '10019', '10020', '10021', '10022', '10023', 264 | '10024', '10025', '10026', '10027', '10028', '10029', '10030', 265 | '10031', '10032', '10033', '10034', '10035', '10036', '10037', 266 | '10038', '10039', '10040', '10044', '10048', '10065', '10069', 267 | '10075', '10103', '10107', '10112', '10119', '10128', '10129', 268 | '10153', '10162', '10280', '10281', '10282', '10301', '10302', 269 | '10303', '10304', '10305', '10306', '10307', '10308', '10309', 270 | '10310', '10312', '10314', '10451', '10452', '10453', '10454', 271 | '10455', '10456', '10457', '10458', '10459', '10460', '10461', 272 | '10462', '10463', '10464', '10465', '10466', '10467', '10468', 273 | '10469', '10470', '10471', 
'10472', '10473', '10474', '10475', 274 | '10573', '10803', '10954', '11001', '11003', '11004', '11005', 275 | '11040', '11042', '11101', '11102', '11103', '11104', '11105', 276 | '11106', '11109', '11111', '11201', '11203', '11204', '11205', 277 | '11206', '11207', '11208', '11209', '11210', '11211', '11212', 278 | '11213', '11214', '11215', '11216', '11217', '11218', '11219', 279 | '11220', '11221', '11222', '11223', '11224', '11225', '11226', 280 | '11228', '11229', '11230', '11231', '11232', '11233', '11234', 281 | '11235', '11236', '11237', '11238', '11239', '11249', '11354', 282 | '11355', '11356', '11357', '11358', '11360', '11361', '11362', 283 | '11363', '11364', '11365', '11366', '11367', '11368', '11369', 284 | '11370', '11372', '11373', '11374', '11375', '11377', '11378', 285 | '11379', '11385', '11411', '11412', '11413', '11414', '11415', 286 | '11416', '11417', '11418', '11419', '11420', '11421', '11422', 287 | '11423', '11426', '11427', '11428', '11429', '11430', '11432', 288 | '11433', '11434', '11435', '11436', '11501', '11518', '11520', 289 | '11530', '11549', '11559', '11563', '11575', '11577', '11580', 290 | '11590', '11691', '11692', '11693', '11694', '11697', '11716', 291 | '11722', '11735', '11747', '11776', '11788', '11797', '13221', 292 | '14225', '19711', '23502', '23541', '29616', '35209', '41042', 293 | '55164', '61702', '70711', '77056', '77092', '90010', '92123'], dtype=object) 294 | ``` 295 | 296 | 太棒了! 这更加干净。 虽然这里有一些奇怪的东西 - 我在谷歌地图上查找 77056,这是在德克萨斯州。 297 | 298 | 让我们仔细看看: 299 | 300 | ```py 301 | zips = requests['Incident Zip'] 302 | # Let's say the zips starting with '0' and '1' are okay, for now. (this isn't actually true -- 13221 is in Syracuse, and why?) 303 | is_close = zips.str.startswith('0') | zips.str.startswith('1') 304 | # There are a bunch of NaNs, but we're not interested in them right now, so we'll say they're False 305 | is_far = ~(is_close) & zips.notnull() 306 | ``` 307 | 308 | ```py 309 | zips[is_far] 310 | ``` 311 | 312 | ``` 313 | 12102 77056 314 | 13450 70711 315 | 29136 77092 316 | 30939 55164 317 | 44008 90010 318 | 47048 23541 319 | 57636 92123 320 | 71001 92123 321 | 71834 23502 322 | 80573 61702 323 | 85821 29616 324 | 89304 35209 325 | 94201 41042 326 | Name: Incident Zip, dtype: object 327 | ``` 328 | 329 | ```py 330 | requests[is_far][['Incident Zip', 'Descriptor', 'City']].sort('Incident Zip') 331 | ``` 332 | 333 | 334 | | | Incident Zip | Descriptor | City | 335 | | --- | --- | 336 | | 71834 | 23502 | Harassment | NORFOLK | 337 | | 47048 | 23541 | Harassment | NORFOLK | 338 | | 85821 | 29616 | Debt Not Owed | GREENVILLE | 339 | | 89304 | 35209 | Harassment | BIRMINGHAM | 340 | | 94201 | 41042 | Harassment | FLORENCE | 341 | | 30939 | 55164 | Harassment | ST. PAUL | 342 | | 80573 | 61702 | Billing Dispute | BLOOMIGTON | 343 | | 13450 | 70711 | Contract Dispute | CLIFTON | 344 | | 12102 | 77056 | Debt Not Owed | HOUSTON | 345 | | 29136 | 77092 | False Advertising | HOUSTON | 346 | | 44008 | 90010 | Billing Dispute | LOS ANGELES | 347 | | 57636 | 92123 | Harassment | SAN DIEGO | 348 | | 71001 | 92123 | Billing Dispute | SAN DIEGO | 349 | 350 | 好吧,真的有来自 LA 和休斯敦的请求! 
很高兴知道它们。 按邮政编码过滤可能是处理它的一个糟糕的方式 - 我们真的应该看着城市。 351 | 352 | ```py 353 | requests['City'].str.upper().value_counts() 354 | ``` 355 | 356 | ``` 357 | BROOKLYN 31662 358 | NEW YORK 22664 359 | BRONX 18438 360 | STATEN ISLAND 4766 361 | JAMAICA 2246 362 | FLUSHING 1803 363 | ASTORIA 1568 364 | RIDGEWOOD 1073 365 | CORONA 707 366 | OZONE PARK 693 367 | LONG ISLAND CITY 678 368 | FAR ROCKAWAY 652 369 | ELMHURST 647 370 | WOODSIDE 609 371 | EAST ELMHURST 562 372 | ... 373 | MELVILLE 1 374 | PORT JEFFERSON STATION 1 375 | NORWELL 1 376 | EAST ROCKAWAY 1 377 | BIRMINGHAM 1 378 | ROSLYN 1 379 | LOS ANGELES 1 380 | MINEOLA 1 381 | JERSEY CITY 1 382 | ST. PAUL 1 383 | CLIFTON 1 384 | COL.ANVURES 1 385 | EDGEWATER 1 386 | ROSELYN 1 387 | CENTRAL ISLIP 1 388 | Length: 100, dtype: int64 389 | ``` 390 | 391 | 看起来这些是合法的投诉,所以我们只是把它们放在一边。 392 | 393 | ## 7.5 把它们放到一起 394 | 395 | 这里是我们最后所做的事情,用于清理我们的邮政编码,都在一起: 396 | 397 | ```py 398 | na_values = ['NO CLUE', 'N/A', '0'] 399 | requests = pd.read_csv('../data/311-service-requests.csv', 400 | na_values=na_values, 401 | dtype={'Incident Zip': str}) 402 | ``` 403 | 404 | ```py 405 | def fix_zip_codes(zips): 406 | # Truncate everything to length 5 407 | zips = zips.str.slice(0, 5) 408 | 409 | # Set 00000 zip codes to nan 410 | zero_zips = zips == '00000' 411 | zips[zero_zips] = np.nan 412 | 413 | return zips 414 | ``` 415 | 416 | ```py 417 | requests['Incident Zip'] = fix_zip_codes(requests['Incident Zip']) 418 | ``` 419 | 420 | ```py 421 | requests['Incident Zip'].unique() 422 | ``` 423 | 424 | ``` 425 | array(['11432', '11378', '10032', '10023', '10027', '11372', '11419', 426 | '11417', '10011', '11225', '11218', '10003', '10029', '10466', 427 | '11219', '10025', '10310', '11236', nan, '10033', '11216', '10016', 428 | '10305', '10312', '10026', '10309', '10036', '11433', '11235', 429 | '11213', '11379', '11101', '10014', '11231', '11234', '10457', 430 | '10459', '10465', '11207', '10002', '10034', '11233', '10453', 431 | '10456', '10469', '11374', '11221', '11421', '11215', '10007', 432 | '10019', '11205', '11418', '11369', '11249', '10005', '10009', 433 | '11211', '11412', '10458', '11229', '10065', '10030', '11222', 434 | '10024', '10013', '11420', '11365', '10012', '11214', '11212', 435 | '10022', '11232', '11040', '11226', '10281', '11102', '11208', 436 | '10001', '10472', '11414', '11223', '10040', '11220', '11373', 437 | '11203', '11691', '11356', '10017', '10452', '10280', '11217', 438 | '10031', '11201', '11358', '10128', '11423', '10039', '10010', 439 | '11209', '10021', '10037', '11413', '11375', '11238', '10473', 440 | '11103', '11354', '11361', '11106', '11385', '10463', '10467', 441 | '11204', '11237', '11377', '11364', '11434', '11435', '11210', 442 | '11228', '11368', '11694', '10464', '11415', '10314', '10301', 443 | '10018', '10038', '11105', '11230', '10468', '11104', '10471', 444 | '11416', '10075', '11422', '11355', '10028', '10462', '10306', 445 | '10461', '11224', '11429', '10035', '11366', '11362', '11206', 446 | '10460', '10304', '11360', '11411', '10455', '10475', '10069', 447 | '10303', '10308', '10302', '11357', '10470', '11367', '11370', 448 | '10454', '10451', '11436', '11426', '10153', '11004', '11428', 449 | '11427', '11001', '11363', '10004', '10474', '11430', '10000', 450 | '10307', '11239', '10119', '10006', '10048', '11697', '11692', 451 | '11693', '10573', '00083', '11559', '10020', '77056', '11776', 452 | '70711', '10282', '11109', '10044', '02061', '77092', '14225', 453 | '55164', '19711', '07306', '90010', '11747', '23541', 
'11788', 454 |        '07604', '10112', '11563', '11580', '07087', '11042', '07093', 455 |        '11501', '92123', '11575', '07109', '11797', '10803', '11716', 456 |        '11722', '11549', '10162', '23502', '11518', '07020', '08807', 457 |        '11577', '07114', '11003', '07201', '61702', '10103', '29616', 458 |        '35209', '11520', '11735', '10129', '11005', '41042', '11590', 459 |        '06901', '07208', '11530', '13221', '10954', '11111', '10107'], dtype=object) 460 | ``` 461 | -------------------------------------------------------------------------------- /2.8.md: -------------------------------------------------------------------------------- 1 | # 第八章 2 | 3 | > 原文:[Chapter 8](http://nbviewer.jupyter.org/github/jvns/pandas-cookbook/blob/master/cookbook/Chapter%208%20-%20How%20to%20deal%20with%20timestamps.ipynb) 4 | 5 | > 译者:[飞龙](https://github.com/wizardforcel) 6 | 7 | > 协议:[CC BY-NC-SA 4.0](http://creativecommons.org/licenses/by-nc-sa/4.0/) 8 | 9 | ```py 10 | import pandas as pd 11 | ``` 12 | 13 | ## 8.1 解析 Unix 时间戳 14 | 15 | 在 pandas 中处理 Unix 时间戳不是很容易 - 我花了相当长的时间来解决这个问题。 我们在这里使用的文件是一个软件包流行度文件,我在我的系统上的`/var/log/popularity-contest`找到的。 16 | 17 | [这里](http://popcon.ubuntu.com/README)解释了这个文件是什么。 18 | 19 | 20 | ```py 21 | # Read it, and remove the last row 22 | popcon = pd.read_csv('../data/popularity-contest', sep=' ', )[:-1] 23 | popcon.columns = ['atime', 'ctime', 'package-name', 'mru-program', 'tag'] 24 | ``` 25 | 26 | 列是访问时间,创建时间,包名称,最近使用的程序,以及标签。 27 | 28 | ```py 29 | popcon[:5] 30 | ``` 31 | 32 | 33 | | | atime | ctime | package-name | mru-program | tag | 34 | | --- | --- | 35 | | 0 | 1387295797 | 1367633260 | perl-base | /usr/bin/perl | NaN | 36 | | 1 | 1387295796 | 1354370480 | login | /bin/su | NaN | 37 | | 2 | 1387295743 | 1354341275 | libtalloc2 | /usr/lib/x86_64-linux-gnu/libtalloc.so.2.0.7 | NaN | 38 | | 3 | 1387295743 | 1387224204 | libwbclient0 | /usr/lib/x86_64-linux-gnu/libwbclient.so.0 |  | 39 | | 4 | 1387295742 | 1354341253 | libselinux1 | /lib/x86_64-linux-gnu/libselinux.so.1 | NaN | 40 | 41 | pandas 中的时间戳解析的神奇部分是 numpy `datetime`已经存储为 Unix 时间戳。 所以我们需要做的是告诉 pandas 这些整数实际上是日期时间 - 它不需要做任何转换。 42 | 43 | 我们需要首先将这些转换为整数: 44 | 45 | ```py 46 | popcon['atime'] = popcon['atime'].astype(int) 47 | popcon['ctime'] = popcon['ctime'].astype(int) 48 | ``` 49 | 50 | 每个 numpy 数组和 pandas 序列都有一个`dtype` - 这通常是`int64`,`float64`或`object`。 一些可用的时间类型是`datetime64`[s],`datetime64`[ms]和`datetime64`[us]。 与之相似,也有`timedelta`类型。 51 | 52 | 我们可以使用`pd.to_datetime`函数将我们的整数时间戳转换为`datetimes`。 这是一个常量时间操作 - 我们实际上并不改变任何数据,只是改变了 Pandas 如何看待它。 53 | 54 | ```py 55 | popcon['atime'] = pd.to_datetime(popcon['atime'], unit='s') 56 | popcon['ctime'] = pd.to_datetime(popcon['ctime'], unit='s') 57 | ``` 58 | 59 | 如果我们现在查看`dtype`,它是`<M8[ns]`: 60 | 61 | ```py 62 | popcon['atime'].dtype 63 | ``` 64 | 65 | ``` 66 | dtype('<M8[ns]') 67 | ``` 68 | 69 | `<M8[ns]`是 numpy 中`datetime64[ns]`的简写。现在这两列终于是日期时间了,我们再看一眼数据: 70 | 71 | ```py 72 | popcon[:5] 73 | ``` 74 | 75 | 76 | | | atime | ctime | package-name | mru-program | tag | 77 | | --- | --- | 78 | | 0 | 2013-12-17 15:56:37 | 2013-05-04 02:07:40 | perl-base | /usr/bin/perl | NaN | 79 | | 1 | 2013-12-17 15:56:36 | 2012-12-01 14:01:20 | login | /bin/su | NaN | 80 | | 2 | 2013-12-17 15:55:43 | 2012-12-01 05:54:35 | libtalloc2 | /usr/lib/x86_64-linux-gnu/libtalloc.so.2.0.7 | NaN | 81 | | 3 | 2013-12-17 15:55:43 | 2013-12-16 20:03:24 | libwbclient0 | /usr/lib/x86_64-linux-gnu/libwbclient.so.0 |  | 82 | | 4 | 2013-12-17 15:55:42 | 2012-12-01 05:54:13 | libselinux1 | /lib/x86_64-linux-gnu/libselinux.so.1 | NaN | 83 | 84 | 现在假设我们要查看所有不是库的软件包。 85 | 86 | 首先,我想去掉一切带有时间戳 0 的东西。注意,我们可以在这个比较中使用一个字符串,即使它实际上在里面是一个时间戳。这是因为 Pandas 是非常厉害的。 87 | 88 | ```py 89 | popcon = popcon[popcon['atime'] > '1970-01-01'] 90 | ``` 91 | 92 | 现在我们可以使用 pandas 的魔法字符串功能来查看包名称不包含`lib`的行。 93 | 94 | ```py 95 | nonlibraries = popcon[~popcon['package-name'].str.contains('lib')] 96 | ``` 97 | 98 | ```py 99 | nonlibraries.sort('ctime', ascending=False)[:10] 100 | ``` 101 | 102 | | | atime | ctime | package-name | mru-program | tag | 103 | | --- | --- | 104 | | 57 | 2013-12-17 04:55:39 | 2013-12-17 04:55:42 | ddd | /usr/bin/ddd |  | 105 | | 450 | 2013-12-16 20:03:20 | 2013-12-16 20:05:13 | nodejs | /usr/bin/npm |  | 106 | | 454 | 2013-12-16 20:03:20 | 
2013-12-16 20:05:04 | switchboard-plug-keyboard | /usr/lib/plugs/pantheon/keyboard/options.txt | | 107 | | 445 | 2013-12-16 20:03:20 | 2013-12-16 20:05:04 | thunderbird-locale-en | /usr/lib/thunderbird-addons/extensions/langpac... | | 108 | | 396 | 2013-12-16 20:08:27 | 2013-12-16 20:05:03 | software-center | /usr/sbin/update-software-center | | 109 | | 449 | 2013-12-16 20:03:20 | 2013-12-16 20:05:00 | samba-common-bin | /usr/bin/net.samba3 | | 110 | | 397 | 2013-12-16 20:08:25 | 2013-12-16 20:04:59 | postgresql-client-9.1 | /usr/lib/postgresql/9.1/bin/psql | | 111 | | 398 | 2013-12-16 20:08:23 | 2013-12-16 20:04:58 | postgresql-9.1 | /usr/lib/postgresql/9.1/bin/postmaster | | 112 | | 452 | 2013-12-16 20:03:20 | 2013-12-16 20:04:55 | php5-dev | /usr/include/php5/main/snprintf.h | | 113 | | 440 | 2013-12-16 20:03:20 | 2013-12-16 20:04:54 | php-pear | /usr/share/php/XML/Util.php | | 114 | 115 | 好吧,很酷,它说我最近安装了`ddd`。 和`postgresql`! 我记得安装这些东西。 116 | 117 | 这里的整个消息是,如果你有一个以秒或毫秒或纳秒为单位的时间戳,那么你可以“转换”到`datetime64 [the-right-thing]`,并且 pandas/numpy 将处理其余的事情。 118 | -------------------------------------------------------------------------------- /2.9.md: -------------------------------------------------------------------------------- 1 | # 第九章 2 | 3 | > 原文:[Chapter 9](http://nbviewer.jupyter.org/github/jvns/pandas-cookbook/blob/master/cookbook/Chapter%209%20-%20Loading%20data%20from%20SQL%20databases.ipynb) 4 | 5 | > 译者:[飞龙](https://github.com/wizardforcel) 6 | 7 | > 协议:[CC BY-NC-SA 4.0](http://creativecommons.org/licenses/by-nc-sa/4.0/) 8 | 9 | ```py 10 | import pandas as pd 11 | import sqlite3 12 | ``` 13 | 14 | 到目前为止,我们只涉及从 CSV 文件中读取数据。 这是一个存储数据的常见方式,但有很多其它方式! Pandas 可以从 HTML,JSON,SQL,Excel(!!!),HDF5,Stata 和其他一些东西中读取数据。 在本章中,我们将讨论从 SQL 数据库读取数据。 15 | 16 | 您可以使用`pd.read_sql`函数从 SQL 数据库读取数据。 `read_sql`将自动将 SQL 列名转换为`DataFrame`列名。 17 | 18 | `read_sql`需要 2 个参数:`SELECT`语句和数据库连接对象。 这是极好的,因为它意味着你可以从任何种类的 SQL 数据库读取 - 无论是 MySQL,SQLite,PostgreSQL 或其他东西。 19 | 20 | 此示例从 SQLite 数据库读取,但任何其他数据库将以相同的方式工作。 21 | 22 | ```py 23 | con = sqlite3.connect("../data/weather_2012.sqlite") 24 | df = pd.read_sql("SELECT * from weather_2012 LIMIT 3", con) 25 | df 26 | ``` 27 | 28 | 29 | | | id | date_time | temp | 30 | | --- | --- | 31 | | 0 | 1 | 2012-01-01 00:00:00 | -1.8 | 32 | | 1 | 2 | 2012-01-01 01:00:00 | -1.8 | 33 | | 2 | 3 | 2012-01-01 02:00:00 | -1.8 | 34 | 35 | `read_sql`不会自动将主键(`id`)设置为`DataFrame`的索引。 你可以通过向`read_sql`添加一个`index_col`参数来实现。 36 | 37 | 如果你大量使用`read_csv`,你可能已经看到它有一个`index_col`参数。 这个行为是一样的。 38 | 39 | ```py 40 | df = pd.read_sql("SELECT * from weather_2012 LIMIT 3", con, index_col='id') 41 | df 42 | ``` 43 | 44 | 45 | | | date_time | temp | 46 | | --- | --- | 47 | | id | | | 48 | | 1 | 2012-01-01 00:00:00 | -1.8 | 49 | | 2 | 2012-01-01 01:00:00 | -1.8 | 50 | | 3 | 2012-01-01 02:00:00 | -1.8 | 51 | 52 | 如果希望`DataFrame`由多个列索引,可以将列的列表提供给`index_col`: 53 | 54 | ```py 55 | df = pd.read_sql("SELECT * from weather_2012 LIMIT 3", con, 56 | index_col=['id', 'date_time']) 57 | df 58 | ``` 59 | 60 | 61 | | | | temp | 62 | | --- | --- | 63 | | id | date_time | | 64 | | 1 | 2012-01-01 00:00:00 | -1.8 | 65 | | 2 | 2012-01-01 01:00:00 | -1.8 | 66 | | 3 | 2012-01-01 02:00:00 | -1.8 | 67 | 68 | 69 | ## 9.2 写入 SQLite 数据库 70 | 71 | Pandas 拥有`write_frame`函数,它从`DataFrame`创建一个数据库表。 现在这只适用于 SQLite 数据库。 让我们使用它,来将我们的 2012 天气数据转换为 SQL。 72 | 73 | 你会注意到这个函数在`pd.io.sql`中。 在`pd.io`中有很多有用的函数,用于读取和写入各种类型的数据,值得花一些时间来探索它们。 ([请参阅文档!](http://pandas.pydata.org/pandas-docs/stable/io.html)) 74 | 75 | ```py 76 | weather_df = 
69 | ## 9.2 写入 SQLite 数据库 70 | 71 | Pandas 拥有`to_sql`函数(旧版本中叫做`write_frame`),它从`DataFrame`创建一个数据库表。 现在这只适用于 SQLite 数据库。 让我们使用它,来将我们的 2012 天气数据转换为 SQL。 72 | 73 | 你会注意到这个函数在`pd.io.sql`中。 在`pd.io`中有很多有用的函数,用于读取和写入各种类型的数据,值得花一些时间来探索它们。 ([请参阅文档!](http://pandas.pydata.org/pandas-docs/stable/io.html)) 74 | 75 | ```py 76 | weather_df = pd.read_csv('../data/weather_2012.csv') 77 | con = sqlite3.connect("../data/test_db.sqlite") 78 | con.execute("DROP TABLE IF EXISTS weather_2012") 79 | weather_df.to_sql("weather_2012", con) 80 | ``` 81 | 82 | 我们现在可以从`test_db.sqlite`中的`weather_2012`表中读取数据,我们看到我们得到了相同的数据: 83 | 84 | ```py 85 | con = sqlite3.connect("../data/test_db.sqlite") 86 | df = pd.read_sql("SELECT * from weather_2012 LIMIT 3", con) 87 | df 88 | ``` 89 | 90 | 91 | | | index | Date/Time | Temp (C) | Dew Point Temp (C) | Rel Hum (%) | Wind Spd (km/h) | Visibility (km) | Stn Press (kPa) | Weather | 92 | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | 93 | | 0 | 0 | 2012-01-01 00:00:00 | -1.8 | -3.9 | 86 | 4 | 8 | 101.24 | Fog | 94 | | 1 | 1 | 2012-01-01 01:00:00 | -1.8 | -3.7 | 87 | 4 | 8 | 101.24 | Fog | 95 | | 2 | 2 | 2012-01-01 02:00:00 | -1.8 | -3.4 | 89 | 7 | 4 | 101.26 | Freezing Drizzle,Fog | 96 | 97 | 在数据库中保存数据的好处在于,可以执行任意的 SQL 查询。 这非常酷,特别是当你更熟悉 SQL 的时候。 以下是按`Weather`列排序的示例结果(执行的查询形如`SELECT * from weather_2012 ORDER BY Weather LIMIT 3`): 98 | 99 | 100 | 101 | | | index | Date/Time | Temp (C) | Dew Point Temp (C) | Rel Hum (%) | Wind Spd (km/h) | Visibility (km) | Stn Press (kPa) | Weather | 102 | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | 103 | | 0 | 67 | 2012-01-03 19:00:00 | -16.9 | -24.8 | 50 | 24 | 25 | 101.74 | Clear | 104 | | 1 | 114 | 2012-01-05 18:00:00 | -7.1 | -14.4 | 56 | 11 | 25 | 100.71 | Clear | 105 | | 2 | 115 | 2012-01-05 19:00:00 | -9.2 | -15.4 | 61 | 7 | 25 | 100.80 | Clear | 106 | 107 | 如果你有一个 PostgreSQL 数据库或 MySQL 数据库,从它读取的工作方式与从 SQLite 数据库读取完全相同。 使用`psycopg2.connect()`或`MySQLdb.connect()`创建连接,然后使用: 108 | 109 | ```py 110 | pd.read_sql("SELECT whatever from your_table", con) 111 | ``` 112 | 113 | ## 9.3 连接到其它类型的数据库 114 | 115 | 为了连接到 MySQL 数据库: 116 | 117 | 注:为了使其正常工作,你需要拥有 MySQL/PostgreSQL 数据库,并带有正确的`localhost`,数据库名称,以及其他。 118 | 119 | ```py 120 | import MySQLdb 121 | con = MySQLdb.connect(host="localhost", db="test") 122 | ``` 123 | 124 | 为了连接到 PostgreSQL 数据库: 125 | 126 | ```py 127 | import psycopg2 128 | con = psycopg2.connect(host="localhost") 129 | ``` 130 | -------------------------------------------------------------------------------- /2.md: -------------------------------------------------------------------------------- 1 | # Pandas 秘籍 -------------------------------------------------------------------------------- /3.1.md: -------------------------------------------------------------------------------- 1 | # 第一课 2 | 3 | + 创建数据 - 我们以创建用于分析的数据集开始。这使阅读这篇教程的用户不必下载任何文件来重复结果。我们会将数据集导出到文本文件,以便你从文本文件获取一些数据来进行实验。 4 | 5 | + 获取数据 - 我们会学到如何读取文本文件。数据包含婴儿名称,以及 1880 年出生的婴儿名称数量。 6 | 7 | + 准备数据 - 这里我们会简单查看数据,并确保它是干净的。干净的意思是,我们会查看文本文件的内容,并寻找任何异常。这些可能包括缺失的数据、数据中的不一致、或者任何其他不合适的数据。如果我们找到了,我们之后必须决定如何处理这些记录。 8 | 9 | + 分析数据 - 我们会简单地寻找特定年份的流行名称。 10 | 11 | + 展示数据 - 通过表格和图像,清晰向用户展示特定年份的流行名称是什么。 12 | 13 | > Pandas 库用于全部数据分析,除了数据展示的一小部分。Matplotlib 库仅仅用于数据展示部分。课程中的第一步就是导入这些库。 14 | 15 | ```py 16 | # 导入所有教程所需的库 17 | 18 | # 导入库中特定函数的通用语法: 19 | ##from (library) import (specific library function) 20 | from pandas import DataFrame, read_csv 21 | 22 | # 导入库而不是函数的通用语法 23 | ##import (library) as (give the library a nickname/alias) 24 | import matplotlib.pyplot as plt 25 | import pandas as pd #this is how I usually import pandas 26 | import sys #only needed to determine Python version number 27 | import matplotlib #only needed to determine Matplotlib version number 28 | 29 | # 开启内联绘图 30 | %matplotlib inline 31 | ``` 32 | 33 | ```py 34 | print('Python version ' + sys.version) 35 | print('Pandas version ' + pd.__version__) 36 | print('Matplotlib version ' + matplotlib.__version__) 37 | ``` 38 | 39 | ``` 40 | Python version 3.5.1 |Anaconda custom (64-bit)| (default, 
Feb 16 2016, 09:49:46) [MSC v.1900 64 bit (AMD64)] 41 | Pandas version 0.18.1 42 | Matplotlib version 1.5.1 43 | ``` 44 | 45 | ## 创建数据 46 | 47 | 数据集含有 5 个婴儿名称,以及当年(1880)记录的出生数量。 48 | 49 | ```py 50 | # 婴儿名称和出生数量的初始数据集 51 | names = ['Bob','Jessica','Mary','John','Mel'] 52 | births = [968, 155, 77, 578, 973] 53 | ``` 54 | 55 | 为了合并两个列表,我们使用`zip`函数。 56 | 57 | ```py 58 | zip? 59 | ``` 60 | 61 | ```py 62 | BabyDataSet = list(zip(names,births)) 63 | BabyDataSet 64 | ``` 65 | 66 | ``` 67 | [('Bob', 968), ('Jessica', 155), ('Mary', 77), ('John', 578), ('Mel', 973)] 68 | ``` 69 | 70 | 我们基本完成了数据集的创建,现在使用 Pandas 库将数据集导出到 CSV 文件。 71 | 72 | `df`是`DataFrame`对象,你可以将这个对象当做含有`BabyDataSet`内容的对象,它的格式和 SQL 表或者 Excel 表格相似。让我们在下面查看`df`中的所有内容: 73 | 74 | ```py 75 | df = pd.DataFrame(data = BabyDataSet, columns=['Names', 'Births']) 76 | df 77 | ``` 78 | 79 | 80 | | | Names | Births | 81 | | --- | --- | --- | 82 | | 0 | Bob | 968 | 83 | | 1 | Jessica | 155 | 84 | | 2 | Mary | 77 | 85 | | 3 | John | 578 | 86 | | 4 | Mel | 973 | 87 | 88 | 导出`DataFrame`到 CSV 文件。我们可以将文件命名为`births1880.csv`。函数`to_csv`用于导出文件,文件保存在笔记本的相同目录中,除非另行规定。 89 | 90 | ```py 91 | df.to_csv? 92 | ``` 93 | 94 | 我们所用的参数仅仅是`index`和`header`。将这些参数设为`False`会防止索引和标题名称被导出。修改这些参数的值来深入理解它们。 95 | 96 | ```py 97 | df.to_csv('births1880.csv',index=False,header=False) 98 | ``` 99 | 100 | ## 获取数据 101 | 102 | 为了拉取 CSV 文件,我们使用 Pandas 函数`read_csv`,让我们查看这个函数,以及它接受什么输入。 103 | 104 | ```py 105 | read_csv? 106 | ``` 107 | 108 | 即使这个函数拥有许多参数,我们仅仅向其传递文本文件的位置。 109 | 110 | ``` 111 | Location = C:\Users\ENTER_USER_NAME.xy\startups\births1880.csv 112 | ``` 113 | 114 | 注:取决于你保存笔记本的位置,你可能需要修改上面的位置。 115 | 116 | ```py 117 | Location = r'C:\Users\david\notebooks\births1880.csv' 118 | df = pd.read_csv(Location) 119 | ``` 120 | 121 | 要注意字符串前面的`r`。因为反斜杠是特殊字符,将`r`放在字符串前面,整个字符串会被当做原始字符串处理。 122 | 123 | ```py 124 | df 125 | ``` 126 | 127 | | | Bob | 968 | 128 | | --- | --- | --- | 129 | | 0 | Jessica | 155 | 130 | | 1 | Mary | 77 | 131 | | 2 | John | 578 | 132 | | 3 | Mel | 973 | 133 | 134 | 这会带给我们该练习中的第一个问题:`read_csv`函数将 CSV 文件的第一条记录作为标题名称。这显然是不对的,因为文本文件没有提供给我们标题名称。 135 | 136 | 为了纠正它,我们将`header`参数传递给`read_csv`函数并将其设为`None`(Python 中的意思是 Null)。 137 | 138 | ```py 139 | df = pd.read_csv(Location, header=None) 140 | df 141 | ``` 142 | 143 | 144 | | | 0 | 1 | 145 | | --- | --- | --- | 146 | | 0 | Bob | 968 | 147 | | 1 | Jessica | 155 | 148 | | 2 | Mary | 77 | 149 | | 3 | John | 578 | 150 | | 4 | Mel | 973 | 151 | 152 | 如果我们打算为列提供特定名称,我们需要传递另一个叫做`names`的参数。我们也可以忽略`header`参数。 153 | 154 | ```py 155 | df = pd.read_csv(Location, names=['Names','Births']) 156 | df 157 | ``` 158 | 159 | 160 | | | Names | Births | 161 | | --- | --- | --- | 162 | | 0 | Bob | 968 | 163 | | 1 | Jessica | 155 | 164 | | 2 | Mary | 77 | 165 | | 3 | John | 578 | 166 | | 4 | Mel | 973 | 167 | 168 | 我们可以将数值`[0,1,2,3,4]`看做 Excel 文件中的行号。在 Pandas 中这些是`DataFrame`索引的一部分。你可以将索引看做 SQL 表的主键,不同之处在于索引允许重复。 169 | 170 | `[Names, Births]`可以看做列标题,类似于可以在 Excel 表格和 SQL 数据库中找到的标题。 171 | 172 | > 既然我们用完了 CSV 文件,我们将其删除。 173 | 174 | ```py 175 | import os 176 | os.remove(Location) 177 | ``` 178 | 179 | ## 准备数据 180 | 181 | 数据含有婴儿名称和 1880 年出生数量。我们已经知道了我们拥有 5 个记录,没有记录是缺失的(非空的值)。 182 | 183 | `Names`列在这里不用担心,因为它多半只由字母和数字的字符串组成。这一列中有可能有损坏的数据,但我们在分析中不必担心它。`Births`列应该只含有整数,表示特定年份中特定名称的婴儿的出生数量。我们可以检查所有数据是否都是整数类型。这一列的数据类型为浮点是没有意义的。在分析中,我并不担心任何可能的离群点。 184 | 185 | 要意识到,除了我们对`Names`列所做的检查之外,简单查看`DataFrame`中的数据就是这个阶段所需要做的全部事情。随着我们在数据分析生命周期中继续前进,我们会拥有大量的机会来寻找数据集的任何问题。 186 | 187 | ```py 188 | # 检查列的数据类型 189 | df.dtypes 190 | ``` 191 | 192 | ``` 193 | Names object 194 | Births int64 195 | dtype: object 196 | ``` 197 | 198 | ```py 199 | # 检查 Births 列的数据类型 200 | df.Births.dtype 201 | ``` 202 | 203 | ``` 204 | dtype('int64') 205 | ``` 206 | 207 | 你可以看到,`Births`列的类型是`int64`,这一列中不会出现浮点(小数)或者字母或数字字符。 208 |
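作为准备数据阶段的一个补充检查(非原文内容,沿用上面的`df`):还可以用`isnull`确认确实没有缺失值。

```py
# 统计每列的缺失值数量,结果全为 0 说明 5 条记录都是完整的
df.isnull().sum()
```

这类快速检查在数据量大、无法逐行目视检查时尤其有用。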
209 | ## 分析数据 210 | 211 | 为了寻找最受欢迎的名称,或者出生数量最高的婴儿名称,我们可以这样做。 212 | 213 | + 对`DataFrame`进行排序,之后选取前面几行。 214 | + 使用`max`属性来寻找最大值。 215 | 216 | ```py 217 | # 方法 1: 218 | Sorted = df.sort_values(['Births'], ascending=False) 219 | Sorted.head(1) 220 | ``` 221 | 222 | 223 | | | Names | Births | 224 | | --- | --- | --- | 225 | | 4 | Mel | 973 | 226 | 227 | ```py 228 | # 方法 2: 229 | df['Births'].max() 230 | ``` 231 | 232 | ``` 233 | 973 234 | ``` 235 | 236 | ## 展示数据 237 | 238 | 这里我们绘制`Births`列,并标记图表来向用户展示图表上的最高值。和表格一起使用,用户就可以清晰了解,`Mel`是数据集中的最受欢迎的婴儿名称。 239 | 240 | `plot`是`DataFrame`的一个便利属性,让你轻松绘制其中的数据。我们在上一节了解了如何寻找`Births`列的最大值。现在为了寻找值为 973 的实际的婴儿名称,这看起来需要一些技巧,让我们来解决它。 241 | 242 | 代码段的解释: 243 | 244 | + `df['Names']` - 这是婴儿名称的完整列表,即完整的`Names`列 245 | + `df['Births']` - 这是 1880 年的出生数量的完整列表,即完整的`Births`列 246 | + `df['Births'].max()` - 这是`Births`列的值中发现的最大值 247 | + `[df['Births'] == df['Births'].max()]` - 等同于寻找`Births`列等于 973 的所有记录 248 | + `df['Names'][df['Births'] == df['Births'].max()]` - 等同于选择`Names`列的所有记录,其中`Births`列为 973 249 | 250 | 一种替代方式为,对`DataFrame`排序:`Sorted['Names'].head(1).values`。 251 | 252 | `str`函数将对象简单转换为字符串。 253 | 254 | ```py 255 | # 创建图表 256 | df['Births'].plot() 257 | 258 | # 数据集中的最大值 259 | MaxValue = df['Births'].max() 260 | 261 | # 和最大值相关的名称 262 | MaxName = df['Names'][df['Births'] == df['Births'].max()].values 263 | 264 | # 在图表上展示的文本 265 | Text = str(MaxValue) + " - " + MaxName 266 | 267 | # 向图表添加文本 268 | plt.annotate(Text, xy=(1, MaxValue), xytext=(8, 0), 269 | xycoords=('axes fraction', 'data'), textcoords='offset points') 270 | 271 | print("The most popular name") 272 | df[df['Births'] == df['Births'].max()] 273 | # 也可以使用 Sorted.head(1) 274 | ``` 275 | 276 | ``` 277 | The most popular name 278 | ``` 279 | 280 | 281 | | | Names | Births | 282 | | --- | --- | --- | 283 | | 4 | Mel | 973 | 284 | 285 | ![](http://upload-images.jianshu.io/upload_images/118142-ad80314f46a0aeea.png) 286 | -------------------------------------------------------------------------------- /3.10.md: -------------------------------------------------------------------------------- 1 | # 第十课 2 | 3 | ```py 4 | import pandas as pd 5 | import sys 6 | ``` 7 | 8 | ```py 9 | print('Python version ' + sys.version) 10 | print('Pandas version ' + pd.__version__) 11 | ``` 12 | 13 | ``` 14 | Python version 3.5.1 |Anaconda custom (64-bit)| (default, Feb 16 2016, 09:49:46) [MSC v.1900 64 bit (AMD64)] 15 | Pandas version 0.18.1 16 | ``` 17 | 18 | ## 从`DataFrame`转换为 Excel 19 | 20 | ```py 21 | # 创建 DataFrame 22 | d = [1,2,3,4,5,6,7,8,9] 23 | df = pd.DataFrame(d, columns = ['Number']) 24 | df 25 | ``` 26 | 27 | | | Number | 28 | | --- | --- | 29 | | 0 | 1 | 30 | | 1 | 2 | 31 | | 2 | 3 | 32 | | 3 | 4 | 33 | | 4 | 5 | 34 | | 5 | 6 | 35 | | 6 | 7 | 36 | | 7 | 8 | 37 | | 8 | 9 | 38 | 39 | ```py 40 | # 导出到 Excel 41 | df.to_excel('Lesson10.xlsx', sheet_name = 'testing', index = False) 42 | print('Done') 43 | ``` 44 | 45 | ``` 46 | Done 47 | ``` 48 | 49 | ## 从 Excel 转换为`DataFrame` 50 | 51 | ```py 52 | # Excel 文件路径 53 | # 你的路径可能不同,请修改下面的路径 54 | location = r'C:\Users\david\notebooks\Lesson10.xlsx' 55 | 56 | # 解析 Excel 文件 57 | df = pd.read_excel(location, 0) 58 | df.head() 59 | ``` 60 | 61 | 62 | | | Number | 63 | | --- | --- | 64 | | 0 | 1 | 65 | | 1 | 2 | 66 | | 2 | 3 | 67 | | 3 | 4 | 68 | | 4 | 5 | 69 | 70 | ```py 71 | df.dtypes 72 | ``` 73 | 74 | ``` 75 | Number int64 
76 | dtype: object 77 | ``` 78 | 79 | ```py 80 | df.tail() 81 | ``` 82 | 83 | 84 | | | Number | 85 | | --- | --- | 86 | | 4 | 5 | 87 | | 5 | 6 | 88 | | 6 | 7 | 89 | | 7 | 8 | 90 | | 8 | 9 | 91 | 92 | ## 从`DataFrame`转换为 JSON 93 | 94 | ```py 95 | df.to_json('Lesson10.json') 96 | print('Done') 97 | ``` 98 | 99 | ``` 100 | Done 101 | ``` 102 | 103 | ## 从 JSON 转换为`DataFrame` 104 | 105 | ```py 106 | # 你的路径可能不同,请修改下面的路径 107 | jsonloc = r'C:\Users\david\notebooks\Lesson10.json' 108 | 109 | # 读取 JSON 文件 110 | df2 = pd.read_json(jsonloc) 111 | ``` 112 | 113 | ``` 114 | df2 115 | ``` 116 | 117 | 118 | | | Number | 119 | | --- | --- | 120 | | 0 | 1 | 121 | | 1 | 2 | 122 | | 2 | 3 | 123 | | 3 | 4 | 124 | | 4 | 5 | 125 | | 5 | 6 | 126 | | 6 | 7 | 127 | | 7 | 8 | 128 | | 8 | 9 | 129 | 130 | ```py 131 | df2.dtypes 132 | ``` 133 | 134 | ``` 135 | Number int64 136 | dtype: object 137 | ``` -------------------------------------------------------------------------------- /3.11.md: -------------------------------------------------------------------------------- 1 | # 第十一课 2 | 3 | 从多个 excel 文件获取数据并将它们合并到单个`DataFrame`中。 4 | 5 | ```py 6 | import pandas as pd 7 | import matplotlib 8 | import os 9 | import sys 10 | %matplotlib inline 11 | ``` 12 | 13 | ```py 14 | print('Python version ' + sys.version) 15 | print('Pandas version ' + pd.__version__) 16 | print('Matplotlib version ' + matplotlib.__version__) 17 | ``` 18 | 19 | ``` 20 | Python version 3.5.1 |Anaconda custom (64-bit)| (default, Feb 16 2016, 09:49:46) [MSC v.1900 64 bit (AMD64)] 21 | Pandas version 0.18.1 22 | Matplotlib version 1.5.1 23 | ``` 24 | 25 | ## 创建 3 个 Excel 文件 26 | 27 | ```py 28 | # 创建 DataFrame 29 | d = {'Channel':[1], 'Number':[255]} 30 | df = pd.DataFrame(d) 31 | df 32 | ``` 33 | 34 | 35 | | | Channel | Number | 36 | | --- | --- | 37 | | 0 | 1 | 255 | 38 | 39 | 40 | ```py 41 | # 导出到 Excel 42 | 43 | df.to_excel('test1.xlsx', sheet_name = 'test1', index = False) 44 | df.to_excel('test2.xlsx', sheet_name = 'test2', index = False) 45 | df.to_excel('test3.xlsx', sheet_name = 'test3', index = False) 46 | print('Done') 47 | ``` 48 | 49 | ``` 50 | Done 51 | ``` 52 | 53 | ## 把三个 Excel 文件放入一个`DataFrame` 54 | 55 | 获取文件名称的列表,但是确保文件夹中不存在其他 Excel 文件。 56 | 57 | ```py 58 | # 保存文件名称的列表 59 | FileNames = [] 60 | 61 | # 你的路径可能不同,请修改下面的路径 62 | os.chdir(r"C:\Users\david\notebooks") 63 | 64 | # 寻找所有以 ".xlsx" 结尾的文件 65 | for files in os.listdir("."): 66 | if files.endswith(".xlsx"): 67 | FileNames.append(files) 68 | 69 | FileNames 70 | ``` 71 | 72 | ``` 73 | ['test1.xlsx', 'test2.xlsx', 'test3.xlsx'] 74 | ``` 75 | 76 | 创建函数来处理所有其他的 Excel 文件。 77 | 78 | ```py 79 | def GetFile(fnombre): 80 | 81 | # 你的 Excel 文件路径 82 | # 你的路径可能不同,请修改下面的路径 83 | location = r'C:\Users\david\notebooks\\' + fnombre 84 | 85 | # 解析 Excel 文件 86 | # 0 为第一张表格 87 | df = pd.read_excel(location, 0) 88 | 89 | # 文件名称的标签记录 90 | df['File'] = fnombre 91 | 92 | # 使 "File" 列成为 df 的索引 93 | return df.set_index(['File']) 94 | ``` 95 | 96 | 遍历每个文件的名称,创建一个`DataFrame`,将其添加到列表。 97 | 98 | 例如,`df_list = [df, df, df]`。 99 | 100 | ```py 101 | # 创建 DataFrame 的列表 102 | df_list = [GetFile(fname) for fname in FileNames] 103 | df_list 104 | ``` 105 | 106 | ``` 107 | [ Channel Number 108 | File 109 | test1.xlsx 1 255, Channel Number 110 | File 111 | test2.xlsx 1 255, Channel Number 112 | File 113 | test3.xlsx 1 255] 114 | ``` 115 | 116 | ```py 117 | # 将所有 DataFrame 合并为一个 118 | big_df = pd.concat(df_list) 119 | big_df 120 | ``` 121 | 122 | | | Channel | Number | 123 | | --- | --- | 124 | | File | | | 125 | | test1.xlsx | 1 
| 255 | 126 | | test2.xlsx | 1 | 255 | 127 | | test3.xlsx | 1 | 255 | 128 | 129 | ```py 130 | big_df.dtypes 131 | ``` 132 | 133 | ``` 134 | Channel int64 135 | Number int64 136 | dtype: object 137 | ``` 138 | 139 | ```py 140 | # 绘制它! 141 | big_df['Channel'].plot.bar(); 142 | ``` 143 | 144 | ![](http://upload-images.jianshu.io/upload_images/118142-c70707db951ba677.png) 145 | 146 |
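作为本课的一个补充(非原文内容):收集文件名还可以用标准库的 glob 模块来完成,它直接支持通配符匹配。

```py
import glob

# 匹配当前目录下所有以 .xlsx 结尾的文件
FileNames = glob.glob('*.xlsx')
FileNames
```

效果与上面用 os.listdir 加 endswith 过滤相同,但代码更简短。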
-------------------------------------------------------------------------------- /3.3.md: -------------------------------------------------------------------------------- 1 | # 第三课 2 | 3 | 读取数据 - 我们的数据由 Excel 文件组成,包含每天的顾客数量。我们会了解如何读取 Excel 文件来处理。 4 | 5 | 准备数据 - 数据是不规则的时间序列,拥有重复的时间。我们的挑战是压缩这些数据,并预测下一年的顾客数量。 6 | 7 | 分析数据 - 我们使用图表来将趋势可视化,并标出离群点。我们会使用一些内建的计算工具来预测下一年的顾客数量。 8 | 9 | 展示数据 - 我们会绘制结果。 10 | 11 | 注意:确保你已经浏览了前面所有课程。这个练习需要前面所学的所有知识。 12 | 13 | ```py 14 | # 导入库 15 | import pandas as pd 16 | import matplotlib.pyplot as plt 17 | import numpy.random as np 18 | import sys 19 | import matplotlib 20 | 21 | %matplotlib inline 22 | ``` 23 | 24 | ```py 25 | print('Python version ' + sys.version) 26 | print('Pandas version: ' + pd.__version__) 27 | print('Matplotlib version ' + matplotlib.__version__) 28 | ``` 29 | 30 | ``` 31 | Python version 3.5.1 |Anaconda custom (64-bit)| (default, Feb 16 2016, 09:49:46) [MSC v.1900 64 bit (AMD64)] 32 | Pandas version: 0.18.1 33 | Matplotlib version 1.5.1 34 | ``` 35 | 36 | > 我们会创建自己的测试数据来分析。 37 | 38 | ```py 39 | # 设置种子 40 | np.seed(111) 41 | 42 | # 生成测试数据的函数 43 | def CreateDataSet(Number=1): 44 | 45 | Output = [] 46 | 47 | for i in range(Number): 48 | 49 | # 创建按周(周一)的数据范围 50 | rng = pd.date_range(start='1/1/2009', end='12/31/2012', freq='W-MON') 51 | 52 | # 创建随机数据 53 | data = np.randint(low=25,high=1000,size=len(rng)) 54 | 55 | # 状态池 56 | status = [1,2,3] 57 | 58 | # 创建状态的随机列表 59 | random_status = [status[np.randint(low=0,high=len(status))] for i in range(len(rng))] 60 | 61 | # 州池 62 | states = ['GA','FL','fl','NY','NJ','TX'] 63 | 64 | # 创建州的随机列表 65 | random_states = [states[np.randint(low=0,high=len(states))] for i in range(len(rng))] 66 | 67 | Output.extend(zip(random_states, random_status, data, rng)) 68 | 69 | return Output 70 | ``` 71 | 72 | 现在我们拥有了生成测试数据的函数,让我们创建一些数据,并将其放入`DataFrame`。 73 | 74 | ```py 75 | dataset = CreateDataSet(4) 76 | df = pd.DataFrame(data=dataset, columns=['State','Status','CustomerCount','StatusDate']) 77 | df.info() 78 | ``` 79 | 80 | ``` 81 | <class 'pandas.core.frame.DataFrame'> 82 | RangeIndex: 836 entries, 0 to 835 83 | Data columns (total 4 columns): 84 | State 836 non-null object 85 | Status 836 non-null int64 86 | CustomerCount 836 non-null int64 87 | StatusDate 836 non-null datetime64[ns] 88 | dtypes: datetime64[ns](1), int64(2), object(1) 89 | memory usage: 26.2+ KB 90 | ``` 91 | 92 | ```py 93 | df.head() 94 | ``` 95 | 96 | 97 | | | State | Status | CustomerCount | StatusDate | 98 | | --- | --- | --- | --- | --- | 99 | | 0 | GA | 1 | 877 | 2009-01-05 | 100 | | 1 | FL | 1 | 901 | 2009-01-12 | 101 | | 2 | fl | 3 | 749 | 2009-01-19 | 102 | | 3 | FL | 3 | 111 | 2009-01-26 | 103 | | 4 | GA | 1 | 300 | 2009-02-02 | 104 | 105 | 我们现在要将此`DataFrame`保存到 Excel 文件中,然后将其恢复到`DataFrame`。 我们这样做只是为了向你展示如何读取和写入 Excel 文件。 106 | 107 | 我们不将`DataFrame`的索引值写入 Excel 文件,因为它们不是我们初始测试数据集的一部分。 108 | 109 | ```py 110 | # 将结果保存到 Excel 111 | df.to_excel('Lesson3.xlsx', index=False) 112 | print('Done') 113 | ``` 114 | 115 | ``` 116 | Done 117 | ``` 118 | 119 | ## 从 Excel 抓取数据 120 | 121 | 我们将使用`read_excel`函数从 Excel 文件读入数据。 该函数允许你按名称或位置读入特定的工作表。 122 | 123 | ```py 124 | pd.read_excel? 125 | ``` 126 | 127 | 注意:Excel 文件的位置必须和该笔记本一样,除非另行规定。 128 | 129 | ```py 130 | # 文件位置 131 | Location = r'C:\Users\david\notebooks\Lesson3.xlsx' 132 | 133 | # 解析特定的表格 134 | df = pd.read_excel(Location, 0, index_col='StatusDate') 135 | df.dtypes 136 | ``` 137 | 138 | ``` 139 | State object 140 | Status int64 141 | CustomerCount int64 142 | dtype: object 143 | ``` 144 | 145 | ```py 146 | df.index 147 | ``` 148 | 149 | ``` 150 | DatetimeIndex(['2009-01-05', '2009-01-12', '2009-01-19', '2009-01-26', 151 | '2009-02-02', '2009-02-09', '2009-02-16', '2009-02-23', 152 | '2009-03-02', '2009-03-09', 153 | ... 154 | '2012-10-29', '2012-11-05', '2012-11-12', '2012-11-19', 155 | '2012-11-26', '2012-12-03', '2012-12-10', '2012-12-17', 156 | '2012-12-24', '2012-12-31'], 157 | dtype='datetime64[ns]', name='StatusDate', length=836, freq=None) 158 | ``` 159 | 160 | ```py 161 | df.head() 162 | ``` 163 | 164 | 165 | | | State | Status | CustomerCount | 166 | | --- | --- | --- | --- | 167 | | StatusDate | | | | 168 | | 2009-01-05 | GA | 1 | 877 | 169 | | 2009-01-12 | FL | 1 | 901 | 170 | | 2009-01-19 | fl | 3 | 749 | 171 | | 2009-01-26 | FL | 3 | 111 | 172 | | 2009-02-02 | GA | 1 | 300 | 173 | 174 | ## 准备数据 175 | 176 | 此部分尝试清理要分析的数据。 177 | 178 | + 确保`State`列全部大写 179 | + 仅选择帐户状态等于`1`的记录 180 | + 在`State`列中合并(`NJ`和`NY`)为`NY` 181 | + 删除任何异常值(数据集中的任何奇怪结果) 182 | 183 | 让我们快速查看`State`值是大写还是小写 184 | 185 | ```py 186 | df['State'].unique() 187 | ``` 188 | 189 | ``` 190 | array(['GA', 'FL', 'fl', 'TX', 'NY', 'NJ'], dtype=object) 191 | ``` 192 | 193 | 要将所有`State`值转换为大写,我们将使用`upper()`函数和`DataFrame`的`apply`属性。 `lambda`函数简单地将上面的函数应用到`State`列中的每个值。 194 | 195 | ```py 196 | # 清理 State 列,转换为大写 197 | df['State'] = df.State.apply(lambda x: x.upper()) 198 | ``` 199 | 200 | ```py 201 | df['State'].unique() 202 | ``` 203 | 204 | ``` 205 | array(['GA', 'FL', 'TX', 'NY', 'NJ'], dtype=object) 206 | ``` 207 | 208 | ```py 209 | # 仅仅抓取 Status == 1 的值 210 | mask = df['Status'] == 1 211 | df = df[mask] 212 | ``` 213 | 214 | 为了将`NJ`变成`NY`,我们只需要: 215 | 216 | + `[df.State == 'NJ']` - 找到`State`列为`NJ`的所有记录 217 | + `df.State[df.State == 'NJ'] = 'NY'` - 对于所有`State`为`NJ`的记录,把它们的值变为`NY` 218 | 219 | ```py 220 | # 将 NJ 变为 NY 221 | mask = df.State == 'NJ' 222 | df['State'][mask] = 'NY' 223 | ``` 224 | 225 | 现在我们可以看到,我们要处理的数据集更加干净了。 226 | 227 | ```py 228 | df['State'].unique() 229 | ``` 230 | 231 | ``` 232 | array(['GA', 'FL', 'NY', 'TX'], dtype=object) 233 | ``` 234 | 235 | 这时,我们可能想要绘制数据,来检查数据中的任何离群点或不一致的值。 我们将使用`DataFrame`的`plot()`属性。 236 | 237 | 从下面的图表可以看出,结果并不是非常明确,这可能说明我们需要执行更多的数据准备工作。 238 | 239 | ```py 240 | df['CustomerCount'].plot(figsize=(15,5)); 241 | ``` 242 | 243 | ![](http://upload-images.jianshu.io/upload_images/118142-273be9e5f7b61ff7.png) 244 | 245 | 如果我们看一下数据,我们开始意识到同一个`State`,`StatusDate`和`Status`组合有多个值。 这可能意味着你正在使用的数据是脏的/坏的/不准确的,但我们假设是其他情况。 我们可以假设此数据集是较大数据集的子集,如果我们按`State`,`StatusDate`和`Status`把`CustomerCount`列中的值加总,我们就会得到每天的客户总数(见本节末尾的示例)。 246 | 247 | ```py 248 | sortdf = df[df['State']=='NY'].sort_index(axis=0) 249 | sortdf.head(10) 250 | ``` 251 | 252 | 253 | | | State | Status | CustomerCount | 254 | | --- | --- | --- | --- | 255 | | StatusDate | | | | 256 | | 2009-01-19 | NY | 1 | 522 | 257 | | 2009-02-23 | NY | 1 | 710 | 258 | | 2009-03-09 | NY | 1 | 992 | 259 | | 2009-03-16 | NY | 1 | 355 | 260 | | 2009-03-23 | NY | 1 | 728 | 261 | | 2009-03-30 | NY | 1 | 863 | 262 | | 2009-04-13 | NY | 1 | 520 | 263 | | 2009-04-20 | NY | 1 | 820 | 264 | | 2009-04-20 | NY | 1 | 937 | 265 | | 2009-04-27 | NY | 1 | 447 | 
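下面按照这个思路给出一个简单示例(非原文内容,沿用上面准备好的`df`),按`State`和`StatusDate`把`CustomerCount`加总:

```py
# 先把 StatusDate 从索引还原为列,再按 State 和 StatusDate 分组求和
Daily = df.reset_index().groupby(['State', 'StatusDate']).sum()

# Status 列求和之后没有意义,删掉它
del Daily['Status']

Daily.head()
```

这样同一个州同一天的重复记录就被合并成了每天的客户总数。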
-------------------------------------------------------------------------------- /3.4.md: -------------------------------------------------------------------------------- 1 | # 第四课 2 | 3 | 在这一课中我们会回到基础部分,我们会处理一个小型数据集,所以你可以轻易理解我尝试解释的东西。我们会添加列、删除列、并且以多种方式对数据切片。祝愉快! 4 | 5 | ```py 6 | # 导入库 7 | import pandas as pd 8 | import sys 9 | ``` 10 | 11 | ```py 12 | print('Python version ' + sys.version) 13 | print('Pandas version: ' + pd.__version__) 14 | ``` 15 | 16 | ``` 17 | Python version 3.5.1 |Anaconda custom (64-bit)| (default, Feb 16 2016, 09:49:46) [MSC v.1900 64 bit (AMD64)] 18 | Pandas version: 0.18.1 19 | ``` 20 | 21 | ```py 22 | # 我们的小型数据集 23 | d = [0,1,2,3,4,5,6,7,8,9] 24 | 25 | # 创建 DataFrame 26 | df = pd.DataFrame(d) 27 | df 28 | ``` 29 | 30 | 31 | | | 0 | 32 | | --- | --- | 33 | | 0 | 0 | 34 | | 1 | 1 | 35 | | 2 | 2 | 36 | | 3 | 3 | 37 | | 4 | 4 | 38 | | 5 | 5 | 39 | | 6 | 6 | 40 | | 7 | 7 | 41 | | 8 | 8 | 42 | | 9 | 9 | 43 | 44 | ``` 45 | # 让我们修改列名称 46 | df.columns = ['Rev'] 47 | df 48 | ``` 49 | 50 | 51 | | | Rev | 52 | | --- | --- | 53 | | 0 | 0 | 54 | | 1 | 1 | 55 | | 2 | 2 | 56 | | 3 | 3 | 57 | | 4 | 4 | 58 | | 5 | 5 | 59 | | 6 | 6 | 60 | | 7 | 7 | 61 | | 8 | 8 | 62 | | 9 | 9 | 63 | 64 | ``` 65 | # 让我们添加一列 66 | df['NewCol'] = 5 67 | df 68 | ``` 69 | 70 | 71 | | | Rev | NewCol | 72 | | --- | --- | 73 | | 0 | 0 | 5 | 74 | | 1 | 1 | 5 | 75 | | 2 | 2 | 5 | 76 | | 3 | 3 | 5 | 77 | | 4 | 4 | 5 | 78 | | 5 | 5 | 5 | 79 | | 6 | 6 | 5 | 80 | | 7 | 7 | 5 | 81 | | 8 | 8 | 5 | 82 | | 9 | 9 | 5 | 83 | 84 | ``` 85 | # 让我们修改新的列 86 | df['NewCol'] = df['NewCol'] + 1 87 | df 88 | ``` 89 | 90 | 91 | | | Rev | NewCol | 92 | | --- | --- | 93 | | 0 | 0 | 6 | 94 | | 1 | 1 | 6 | 95 | | 2 | 2 | 6 | 96 | | 3 | 3 | 6 | 97 | | 4 | 4 | 6 | 98 | | 5 | 5 | 6 | 99 | | 6 | 6 | 6 | 100 | | 7 | 7 | 6 | 101 | | 8 | 8 | 6 | 102 | | 9 | 9 | 6 | 103 | 104 | ``` 105 | # 我们可以删除列 106 | del df['NewCol'] 107 | df 108 | ``` 109 | 110 | 111 | | | Rev | 112 | | --- | --- | 113 | | 0 | 0 | 114 | | 1 | 1 | 115 | | 2 | 2 | 116 | | 3 | 3 | 117 | | 4 | 4 | 118 | | 5 | 5 | 119 | | 6 | 6 | 120 | | 7 | 7 | 121 | | 8 | 8 | 122 | | 9 | 9 | 123 | 124 | ``` 125 | # 让我们添加几列 126 | df['test'] = 3 127 | df['col'] = df['Rev'] 128 | df 129 | ``` 130 | 131 | | | Rev | test | col | 132 | | --- | --- | 133 | | 0 | 0 | 3 | 0 | 134 | | 1 | 1 | 3 | 1 | 135 | | 2 | 2 | 3 | 2 | 136 | | 3 | 3 | 3 | 3 | 137 | | 4 | 4 | 3 | 4 | 138 | | 5 | 5 | 3 | 5 | 139 | | 6 | 6 | 3 | 6 | 140 | | 7 | 7 | 3 | 7 | 141 | | 8 | 8 | 3 | 8 | 142 | | 9 | 9 | 3 | 9 | 143 | 144 | ```py 145 | # 如果我们希望,我们可以修改索引的名称 146 | i = ['a','b','c','d','e','f','g','h','i','j'] 147 | df.index = i 148 | df 149 | ``` 150 | 151 | 152 | | | Rev | test | col | 153 | | --- | --- | 154 | | a | 0 | 3 | 0 | 155 | | b | 1 | 3 | 1 | 156 | | c | 2 | 3 | 2 | 157 | | d | 3 | 3 | 3 | 158 | | e | 4 | 3 | 4 | 159 | | f | 5 | 3 | 5 | 160 | | g | 6 | 3 | 6 | 161 | | h | 7 | 3 | 7 | 162 | | i | 8 | 3 | 8 | 163 | | j | 9 | 3 | 9 | 164 | 165 | 我们现在可以使用`loc`选取`DataFrame`的片段。 166 | 167 | ```py 168 | df.loc['a'] 169 | ``` 170 | 171 | ``` 172 | Rev 0 173 | test 3 174 | col 0 175 | Name: a, dtype: int64 176 | ``` 177 | 178 | ```py 179 | # df.loc[inclusive:inclusive] 180 | df.loc['a':'d'] 181 | ``` 182 | 183 | 184 | | | Rev | test | col | 185 | | --- | --- | 186 | | a | 0 | 3 | 0 | 187 | | b | 1 | 3 | 1 | 188 | | c | 2 | 3 | 2 | 189 | | d | 3 | 3 | 3 | 190 | 191 | ```py 192 | # df.iloc[inclusive:exclusive] 193 | # 注意:.iloc 是严格基于整数位置的,它只在版本 [0.11.0] 
(http://pandas.pydata.org/pandas-docs/stable/whatsnew.html#v0-11-0-april-22-2013) 及以上可用 194 | df.iloc[0:3] 195 | ``` 196 | 197 | 198 | | | Rev | test | col | 199 | | --- | --- | --- | --- | 200 | | a | 0 | 3 | 0 | 201 | | b | 1 | 3 | 1 | 202 | | c | 2 | 3 | 2 | 203 | 204 | 我们也可以使用列名称来选取。 205 | 206 | ```py 207 | df['Rev'] 208 | ``` 209 | 210 | ``` 211 | a 0 212 | b 1 213 | c 2 214 | d 3 215 | e 4 216 | f 5 217 | g 6 218 | h 7 219 | i 8 220 | j 9 221 | Name: Rev, dtype: int64 222 | ``` 223 | 224 | ```py 225 | df[['Rev', 'test']] 226 | ``` 227 | 228 | 229 | | | Rev | test | 230 | | --- | --- | --- | 231 | | a | 0 | 3 | 232 | | b | 1 | 3 | 233 | | c | 2 | 3 | 234 | | d | 3 | 3 | 235 | | e | 4 | 3 | 236 | | f | 5 | 3 | 237 | | g | 6 | 3 | 238 | | h | 7 | 3 | 239 | | i | 8 | 3 | 240 | | j | 9 | 3 | 241 | 242 | ```py 243 | # df.ix[rows,columns] 244 | df.ix[0:3,'Rev'] 245 | ``` 246 | 247 | ``` 248 | a 0 249 | b 1 250 | c 2 251 | Name: Rev, dtype: int64 252 | ``` 253 | 254 | ```py 255 | df.ix[5:,'col'] 256 | ``` 257 | 258 | ``` 259 | f 5 260 | g 6 261 | h 7 262 | i 8 263 | j 9 264 | Name: col, dtype: int64 265 | ``` 266 | 267 | ```py 268 | df.ix[:3,['col', 'test']] 269 | ``` 270 | 271 | 272 | | | col | test | 273 | | --- | --- | --- | 274 | | a | 0 | 3 | 275 | | b | 1 | 3 | 276 | | c | 2 | 3 | 277 | 278 | 这里是一些便利的方法,用于选择`DataFrame`顶端和底端的数据集。 279 | 280 | ```py 281 | # 选择前 N 条记录(默认为 5) 282 | df.head() 283 | ``` 284 | 285 | 286 | | | Rev | test | col | 287 | | --- | --- | --- | --- | 288 | | a | 0 | 3 | 0 | 289 | | b | 1 | 3 | 1 | 290 | | c | 2 | 3 | 2 | 291 | | d | 3 | 3 | 3 | 292 | | e | 4 | 3 | 4 | 293 | 294 | ```py 295 | # 选择后 N 条记录(默认为 5) 296 | df.tail() 297 | ``` 298 | 299 | 300 | | | Rev | test | col | 301 | | --- | --- | --- | --- | 302 | | f | 5 | 3 | 5 | 303 | | g | 6 | 3 | 6 | 304 | | h | 7 | 3 | 7 | 305 | | i | 8 | 3 | 8 | 306 | | j | 9 | 3 | 9 | -------------------------------------------------------------------------------- /3.5.md: -------------------------------------------------------------------------------- 1 | # 第五课 2 | 3 | > 我们来看看`stack`和`unstack`函数。 4 | 5 | ```py 6 | # 导入库 7 | import pandas as pd 8 | import sys 9 | ``` 10 | 11 | ```py 12 | print('Python version ' + sys.version) 13 | print('Pandas version: ' + pd.__version__) 14 | ``` 15 | 16 | ``` 17 | Python version 3.5.1 |Anaconda custom (64-bit)| (default, Feb 16 2016, 09:49:46) [MSC v.1900 64 bit (AMD64)] 18 | Pandas version: 0.18.1 19 | ``` 20 | 21 | ```py 22 | # 我们的小型数据集 23 | d = {'one':[1,1],'two':[2,2]} 24 | i = ['a','b'] 25 | 26 | # 创建 DataFrame 27 | df = pd.DataFrame(data = d, index = i) 28 | df 29 | ``` 30 | 31 | 32 | | | one | two | 33 | | --- | --- | --- | 34 | | a | 1 | 2 | 35 | | b | 1 | 2 | 36 | 37 | ```py 38 | df.index 39 | ``` 40 | 41 | ``` 42 | Index(['a', 'b'], dtype='object') 43 | ``` 44 | 45 | ```py 46 | # 将列去掉,把它们变成索引 47 | stack = df.stack() 48 | stack 49 | ``` 50 | 51 | ``` 52 | a one 1 53 | two 2 54 | b one 1 55 | two 2 56 | dtype: int64 57 | ``` 58 | 59 | ```py 60 | # 现在索引包含列的名称 61 | stack.index 62 | ``` 63 | 64 | ``` 65 | MultiIndex(levels=[['a', 'b'], ['one', 'two']], 66 | labels=[[0, 0, 1, 1], [0, 1, 0, 1]]) 67 | ``` 68 | 69 | ```py 70 | unstack = df.unstack() 71 | unstack 72 | ``` 73 | 74 | ``` 75 | one a 1 76 | b 1 77 | two a 2 78 | b 2 79 | dtype: int64 80 | ``` 81 | 82 | ```py 83 | unstack.index 84 | ``` 85 | 86 | ``` 87 | MultiIndex(levels=[['one', 'two'], ['a', 'b']], 88 | labels=[[0, 0, 1, 1], [0, 1, 0, 1]]) 89 | ``` 90 | 91 | 我们也可以使用`T`(转置)函数来翻转列名称。 92 | 93 | ```py 94 | transpose = df.T 95 | transpose 96 | ``` 97 | 98 | 99 | | | a | b | 100 | | --- | --- | --- | 101 | | one | 1 | 1 | 102 | | two | 2 | 2 | 103 | 104 | ```py 105 | transpose.index 106 | ``` 107 | 108 | ``` 109 | Index(['one', 'two'], dtype='object') 110 | ```
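作为本课的一个补充(非原文内容,沿用上面的`df`和`stack`):`unstack`刚好是`stack`的逆操作,可以把堆叠后的结果还原回原来的`DataFrame`。

```py
# stack 的结果是带 MultiIndex 的 Series,
# 对它调用 unstack 会还原出原来的 DataFrame
stack.unstack()
```

还原的结果与最初的`df`完全相同,`one`、`two`重新成为列标签。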
-------------------------------------------------------------------------------- /3.6.md: -------------------------------------------------------------------------------- 1 | # 第六课 2 | 3 | > 让我们来看看`groupby`函数 4 | 5 | ```py 6 | # 导入库 7 | import pandas as pd 8 | import sys 9 | ``` 10 | 11 | ```py 12 | print('Python version ' + sys.version) 13 | print('Pandas version ' + pd.__version__) 14 | ``` 15 | 16 | ``` 17 | Python version 3.5.1 |Anaconda custom (64-bit)| (default, Feb 16 2016, 09:49:46) [MSC v.1900 64 bit (AMD64)] 18 | Pandas version 0.18.1 19 | ``` 20 | 21 | ```py 22 | # 我们的小型数据集 23 | d = {'one':[1,1,1,1,1], 24 | 'two':[2,2,2,2,2], 25 | 'letter':['a','a','b','b','c']} 26 | 27 | # Create dataframe 28 | df = pd.DataFrame(d) 29 | df 30 | ``` 31 | 32 | 33 | | | letter | one | two | 34 | | --- | --- | --- | --- | 35 | | 0 | a | 1 | 2 | 36 | | 1 | a | 1 | 2 | 37 | | 2 | b | 1 | 2 | 38 | | 3 | b | 1 | 2 | 39 | | 4 | c | 1 | 2 | 40 | 41 | ```py 42 | # 创建分组对象 43 | one = df.groupby('letter') 44 | 45 | # 调用 sum 函数 46 | one.sum() 47 | ``` 48 | 49 | 50 | | | one | two | 51 | | --- | --- | --- | 52 | | letter | | | 53 | | a | 2 | 4 | 54 | | b | 2 | 4 | 55 | | c | 1 | 2 | 56 | 57 | ```py 58 | letterone = df.groupby(['letter','one']).sum() 59 | letterone 60 | ``` 61 | 62 | 63 | | | | two | 64 | | --- | --- | --- | 65 | | letter | one | | 66 | | a | 1 | 4 | 67 | | b | 1 | 4 | 68 | | c | 1 | 2 | 69 | 70 | ```py 71 | letterone.index 72 | ``` 73 | 74 | ``` 75 | MultiIndex(levels=[['a', 'b', 'c'], [1]], 76 | labels=[[0, 1, 2], [0, 0, 0]], 77 | names=['letter', 'one']) 78 | ``` 79 | 80 | 你可能不希望分组的列成为索引,这可以使用下面的语句轻易实现。 81 | 82 | ```py 83 | letterone = df.groupby(['letter','one'], as_index=False).sum() 84 | letterone 85 | ``` 86 | 87 | | | letter | one | two | 88 | | --- | --- | --- | --- | 89 | | 0 | a | 1 | 4 | 90 | | 1 | b | 1 | 4 | 91 | | 2 | c | 1 | 2 | 92 | 93 | ```py 94 | letterone.index 95 | ``` 96 | 97 | ``` 98 | Int64Index([0, 1, 2], dtype='int64') 99 | ```
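作为本课的一个补充(非原文内容,沿用上面的`df`):`groupby`对象不止能调用`sum`,还可以用`agg`一次计算多个聚合。

```py
# 对每个 letter 同时计算 one、two 两列的和与计数
df.groupby('letter').agg(['sum', 'count'])
```

结果的列是一个两层的`MultiIndex`:外层是原来的列名,内层是聚合函数名。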
-------------------------------------------------------------------------------- /3.7.md: -------------------------------------------------------------------------------- 1 | # 第七课 2 | 3 | ## 离群点 4 | 5 | ```py 6 | import pandas as pd 7 | import sys 8 | ``` 9 | 10 | ```py 11 | print('Python version ' + sys.version) 12 | print('Pandas version ' + pd.__version__) 13 | ``` 14 | 15 | ``` 16 | Python version 3.5.1 |Anaconda custom (64-bit)| (default, Feb 16 2016, 09:49:46) [MSC v.1900 64 bit (AMD64)] 17 | Pandas version 0.18.1 18 | ``` 19 | 20 | ```py 21 | # 创建以日期为索引的 DataFrame 22 | States = ['NY', 'NY', 'NY', 'NY', 'FL', 'FL', 'GA', 'GA', 'FL', 'FL'] 23 | data = [1.0, 2, 3, 4, 5, 6, 7, 8, 9, 10] 24 | idx = pd.date_range('1/1/2012', periods=10, freq='MS') 25 | df1 = pd.DataFrame(data, index=idx, columns=['Revenue']) 26 | df1['State'] = States 27 | 28 | # 创建第二个 DataFrame 29 | data2 = [10.0, 10.0, 9, 9, 8, 8, 7, 7, 6, 6] 30 | idx2 = pd.date_range('1/1/2013', periods=10, freq='MS') 31 | df2 = pd.DataFrame(data2, index=idx2, columns=['Revenue']) 32 | df2['State'] = States 33 | ``` 34 | 35 | ```py 36 | # 合并 DataFrame 37 | df = pd.concat([df1,df2]) 38 | df 39 | ``` 40 | 41 | 42 | | | Revenue | State | 43 | | --- | --- | --- | 44 | | 2012-01-01 | 1.0 | NY | 45 | | 2012-02-01 | 2.0 | NY | 46 | | 2012-03-01 | 3.0 | NY | 47 | | 2012-04-01 | 4.0 | NY | 48 | | 2012-05-01 | 5.0 | FL | 49 | | 2012-06-01 | 6.0 | FL | 50 | | 2012-07-01 | 7.0 | GA | 51 | | 2012-08-01 | 8.0 | GA | 52 | | 2012-09-01 | 9.0 | FL | 53 | | 2012-10-01 | 10.0 | FL | 54 | | 2013-01-01 | 10.0 | NY | 55 | | 2013-02-01 | 10.0 | NY | 56 | | 2013-03-01 | 9.0 | NY | 57 | | 2013-04-01 | 9.0 | NY | 58 | | 2013-05-01 | 8.0 | FL | 59 | | 2013-06-01 | 8.0 | FL | 60 | | 2013-07-01 | 7.0 | GA | 61 | | 2013-08-01 | 7.0 | GA | 62 | | 2013-09-01 | 6.0 | FL | 63 | | 2013-10-01 | 6.0 | FL | 64 | 65 | ## 计算离群点的方式 66 | 67 | 注意:均值和标准差只适用于高斯分布。 68 | 69 | ```py 70 | # 方法 1 71 | 72 | # 创建原始 df 的副本 73 | newdf = df.copy() 74 | 75 | newdf['x-Mean'] = abs(newdf['Revenue'] - newdf['Revenue'].mean()) 76 | newdf['1.96*std'] = 1.96*newdf['Revenue'].std() 77 | newdf['Outlier'] = abs(newdf['Revenue'] - newdf['Revenue'].mean()) > 1.96*newdf['Revenue'].std() 78 | newdf 79 | ``` 80 | 81 | 82 | | | Revenue | State | x-Mean | 1.96*std | Outlier | 83 | | --- | --- | --- | --- | --- | --- | 84 | | 2012-01-01 | 1.0 | NY | 5.75 | 5.200273 | True | 85 | | 2012-02-01 | 2.0 | NY | 4.75 | 5.200273 | False | 86 | | 2012-03-01 | 3.0 | NY | 3.75 | 5.200273 | False | 87 | | 2012-04-01 | 4.0 | NY | 2.75 | 5.200273 | False | 88 | | 2012-05-01 | 5.0 | FL | 1.75 | 5.200273 | False | 89 | | 2012-06-01 | 6.0 | FL | 0.75 | 5.200273 | False | 90 | | 2012-07-01 | 7.0 | GA | 0.25 | 5.200273 | False | 91 | | 2012-08-01 | 8.0 | GA | 1.25 | 5.200273 | False | 92 | | 2012-09-01 | 9.0 | FL | 2.25 | 5.200273 | False | 93 | | 2012-10-01 | 10.0 | FL | 3.25 | 5.200273 | False | 94 | | 2013-01-01 | 10.0 | NY | 3.25 | 5.200273 | False | 95 | | 2013-02-01 | 10.0 | NY | 3.25 | 5.200273 | False | 96 | | 2013-03-01 | 9.0 | NY | 2.25 | 5.200273 | False | 97 | | 2013-04-01 | 9.0 | NY | 2.25 | 5.200273 | False | 98 | | 2013-05-01 | 8.0 | FL | 1.25 | 5.200273 | False | 99 | | 2013-06-01 | 8.0 | FL | 1.25 | 5.200273 | False | 100 | | 2013-07-01 | 7.0 | GA | 0.25 | 5.200273 | False | 101 | | 2013-08-01 | 7.0 | GA | 0.25 | 5.200273 | False | 102 | | 2013-09-01 | 6.0 | FL | 0.75 | 5.200273 | False | 103 | | 2013-10-01 | 6.0 | FL | 0.75 | 5.200273 | False | 104 | 105 | ```py 106 | # 方法 2 107 | # 按照州分组 108 | 109 | # 创建原始 df 的副本 110 | newdf = df.copy() 111 | 112 | State = newdf.groupby('State') 113 | 114 | newdf['Outlier'] = State.transform( lambda x: abs(x-x.mean()) > 1.96*x.std() ) 115 | newdf['x-Mean'] = State.transform( lambda x: abs(x-x.mean()) ) 116 | newdf['1.96*std'] = State.transform( lambda x: 1.96*x.std() ) 117 | newdf 118 | ``` 119 | 120 | 121 | | | Revenue | State | Outlier | x-Mean | 1.96*std | 122 | | --- | --- | --- | --- | --- | --- | 123 | | 2012-01-01 | 1.0 | NY | False | 4.5 | 12.473364 | 124 | | 2012-02-01 | 2.0 | NY | False | 4.0 | 11.087434 | 125 | | 2012-03-01 | 3.0 | NY | False | 3.0 | 8.315576 | 126 | | 2012-04-01 | 4.0 | NY | False | 2.5 | 6.929646 | 127 | | 2012-05-01 | 5.0 | FL | False | 1.5 | 4.157788 | 128 | | 2012-06-01 | 6.0 | FL | False | 1.0 | 2.771859 | 129 | | 2012-07-01 | 7.0 | GA | False | 0.0 | 0.000000 | 130 | | 2012-08-01 | 8.0 | GA | False | 0.5 | 1.385929 | 131 | | 2012-09-01 | 9.0 | FL | False | 1.5 | 4.157788 | 132 | | 2012-10-01 | 10.0 | FL | False | 2.0 | 5.543717 | 133 | | 2013-01-01 | 10.0 | NY | False | 4.5 | 12.473364 | 134 | | 2013-02-01 | 10.0 | NY | False | 4.0 | 11.087434 | 135 | | 2013-03-01 | 9.0 | NY | False | 3.0 | 8.315576 | 136 | | 2013-04-01 | 9.0 | NY | False | 2.5 | 6.929646 | 137 | | 2013-05-01 | 8.0 | FL | False | 1.5 | 4.157788 | 138 | | 2013-06-01 | 8.0 | FL | False | 1.0 | 2.771859 | 139 | | 2013-07-01 | 7.0 | GA | False | 0.0 | 0.000000 | 140 | | 2013-08-01 | 7.0 | GA | False | 0.5 | 1.385929 | 
141 | | 2013-09-01 | 6.0 | FL | False | 1.5 | 4.157788 | 142 | | 2013-10-01 | 6.0 | FL | False | 2.0 | 5.543717 | 143 | 144 | ```py 145 | # 方法 3 146 | # 按照项目分组 147 | 148 | # 创建原始 df 的副本 149 | newdf = df.copy() 150 | 151 | State = newdf.groupby('State') 152 | 153 | def s(group): 154 | group['x-Mean'] = abs(group['Revenue'] - group['Revenue'].mean()) 155 | group['1.96*std'] = 1.96*group['Revenue'].std() 156 | group['Outlier'] = abs(group['Revenue'] - group['Revenue'].mean()) > 1.96*group['Revenue'].std() 157 | return group 158 | 159 | Newdf2 = State.apply(s) 160 | Newdf2 161 | ``` 162 | 163 | | | Revenue | State | x-Mean | 1.96*std | Outlier | 164 | | --- | --- | 165 | | 2012-01-01 | 1.0 | NY | 4.5 | 12.473364 | False | 166 | | 2012-02-01 | 2.0 | NY | 4.0 | 11.087434 | False | 167 | | 2012-03-01 | 3.0 | NY | 3.0 | 8.315576 | False | 168 | | 2012-04-01 | 4.0 | NY | 2.5 | 6.929646 | False | 169 | | 2012-05-01 | 5.0 | FL | 1.5 | 4.157788 | False | 170 | | 2012-06-01 | 6.0 | FL | 1.0 | 2.771859 | False | 171 | | 2012-07-01 | 7.0 | GA | 0.0 | 0.000000 | False | 172 | | 2012-08-01 | 8.0 | GA | 0.5 | 1.385929 | False | 173 | | 2012-09-01 | 9.0 | FL | 1.5 | 4.157788 | False | 174 | | 2012-10-01 | 10.0 | FL | 2.0 | 5.543717 | False | 175 | | 2013-01-01 | 10.0 | NY | 4.5 | 12.473364 | False | 176 | | 2013-02-01 | 10.0 | NY | 4.0 | 11.087434 | False | 177 | | 2013-03-01 | 9.0 | NY | 3.0 | 8.315576 | False | 178 | | 2013-04-01 | 9.0 | NY | 2.5 | 6.929646 | False | 179 | | 2013-05-01 | 8.0 | FL | 1.5 | 4.157788 | False | 180 | | 2013-06-01 | 8.0 | FL | 1.0 | 2.771859 | False | 181 | | 2013-07-01 | 7.0 | GA | 0.0 | 0.000000 | False | 182 | | 2013-08-01 | 7.0 | GA | 0.5 | 1.385929 | False | 183 | | 2013-09-01 | 6.0 | FL | 1.5 | 4.157788 | False | 184 | | 2013-10-01 | 6.0 | FL | 2.0 | 5.543717 | False | 185 | 186 | 假设不是高斯分布(如果你绘制它,你会觉得不像) 187 | 188 | ```py 189 | # 创建原始 df 的副本 190 | newdf = df.copy() 191 | 192 | State = newdf.groupby('State') 193 | 194 | newdf['Lower'] = State['Revenue'].transform( lambda x: x.quantile(q=.25) - (1.5*(x.quantile(q=.75)-x.quantile(q=.25))) ) 195 | newdf['Upper'] = State['Revenue'].transform( lambda x: x.quantile(q=.75) + (1.5*(x.quantile(q=.75)-x.quantile(q=.25))) ) 196 | newdf['Outlier'] = (newdf['Revenue'] < newdf['Lower']) | (newdf['Revenue'] > newdf['Upper']) 197 | newdf 198 | ``` 199 | 200 | 201 | | | Revenue | State | Lower | Upper | Outlier | 202 | | --- | --- | 203 | | 2012-01-01 | 1.0 | NY | -7.000 | 19.000 | False | 204 | | 2012-02-01 | 2.0 | NY | -7.000 | 19.000 | False | 205 | | 2012-03-01 | 3.0 | NY | -7.000 | 19.000 | False | 206 | | 2012-04-01 | 4.0 | NY | -7.000 | 19.000 | False | 207 | | 2012-05-01 | 5.0 | FL | 2.625 | 11.625 | False | 208 | | 2012-06-01 | 6.0 | FL | 2.625 | 11.625 | False | 209 | | 2012-07-01 | 7.0 | GA | 6.625 | 7.625 | False | 210 | | 2012-08-01 | 8.0 | GA | 6.625 | 7.625 | True | 211 | | 2012-09-01 | 9.0 | FL | 2.625 | 11.625 | False | 212 | | 2012-10-01 | 10.0 | FL | 2.625 | 11.625 | False | 213 | | 2013-01-01 | 10.0 | NY | -7.000 | 19.000 | False | 214 | | 2013-02-01 | 10.0 | NY | -7.000 | 19.000 | False | 215 | | 2013-03-01 | 9.0 | NY | -7.000 | 19.000 | False | 216 | | 2013-04-01 | 9.0 | NY | -7.000 | 19.000 | False | 217 | | 2013-05-01 | 8.0 | FL | 2.625 | 11.625 | False | 218 | | 2013-06-01 | 8.0 | FL | 2.625 | 11.625 | False | 219 | | 2013-07-01 | 7.0 | GA | 6.625 | 7.625 | False | 220 | | 2013-08-01 | 7.0 | GA | 6.625 | 7.625 | False | 221 | | 2013-09-01 | 6.0 | FL | 2.625 | 11.625 | False | 222 | | 2013-10-01 | 6.0 | FL | 2.625 | 
11.625 | False | -------------------------------------------------------------------------------- /3.8.md: -------------------------------------------------------------------------------- 1 | # 第八课 2 | 3 | > 如何从小型 SQL 数据库拉取数据。 4 | 5 | ```py 6 | # 导入库 7 | import pandas as pd 8 | import sys 9 | from sqlalchemy import create_engine, MetaData, Table, select, engine 10 | ``` 11 | 12 | ```py 13 | print('Python version ' + sys.version) 14 | print('Pandas version ' + pd.__version__) 15 | ``` 16 | 17 | ``` 18 | Python version 3.5.1 |Anaconda custom (64-bit)| (default, Feb 16 2016, 09:49:46) [MSC v.1900 64 bit (AMD64)] 19 | Pandas version 0.18.1 20 | ``` 21 | 22 | ## 版本 1 23 | 24 | 这一节中,我们使用 sqlalchemy 库从 SQL 数据库中抓取数据。要确保使用你自己的`ServerName`,`Database`,`TableName`。 25 | 26 | ```py 27 | # 参数 28 | TableName = "data" 29 | 30 | DB = { 31 | 'drivername': 'mssql+pyodbc', 32 | 'servername': 'DAVID-THINK', 33 | #'port': '5432', 34 | #'username': 'lynn', 35 | #'password': '', 36 | 'database': 'BizIntel', 37 | 'driver': 'SQL Server Native Client 11.0', 38 | 'trusted_connection': 'yes', 39 | 'legacy_schema_aliasing': False 40 | } 41 | 42 | # 创建连接 43 | engine = create_engine(DB['drivername'] + '://' + DB['servername'] + '/' + DB['database'] + '?' + 'driver=' + DB['driver'] + ';' + 'trusted_connection=' + DB['trusted_connection'], legacy_schema_aliasing=DB['legacy_schema_aliasing']) 44 | conn = engine.connect() 45 | 46 | # 需要用于表的查询 47 | metadata = MetaData(conn) 48 | 49 | # 要查询的表 50 | tbl = Table(TableName, metadata, autoload=True, schema="dbo") 51 | #tbl.create(checkfirst=True) 52 | 53 | # 选择全部 54 | sql = tbl.select() 55 | 56 | # 执行 SQL 语句 57 | result = conn.execute(sql) 58 | 59 | # 插入到 DataFrame 60 | df = pd.DataFrame(data=list(result), columns=result.keys()) 61 | 62 | # 关闭连接 63 | conn.close() 64 | 65 | print('Done') 66 | ``` 67 | 68 | ``` 69 | Done 70 | ``` 71 | 72 | 查看`DataFrame`中的内容: 73 | 74 | ```py 75 | df.head() 76 | ``` 77 | 78 | 79 | | | Date | Symbol | Volume | 80 | | --- | --- | --- | --- | 81 | | 0 | 2013-01-01 | A | 0.00 | 82 | | 1 | 2013-01-02 | A | 200.00 | 83 | | 2 | 2013-01-03 | A | 1200.00 | 84 | | 3 | 2013-01-04 | A | 1001.00 | 85 | | 4 | 2013-01-05 | A | 1300.00 | 86 | 87 | ```py 88 | df.dtypes 89 | ``` 90 | 91 | ``` 92 | Date datetime64[ns] 93 | Symbol object 94 | Volume object 95 | ​dtype: object 96 | ``` 97 | 98 | 接下来可以把列转换为特定的数据类型,例如把上面仍为`object`的`Volume`列转换为数值(类似`df['Volume'] = df['Volume'].astype(float)`的语句);具体代码需要修改来匹配你的表。 99 | 100 | ## 版本 2 101 | 102 | ```py 103 | import pandas.io.sql 104 | import pyodbc 105 | ``` 106 | 107 | ```py 108 | # 参数 109 | server = 'DAVID-THINK' 110 | db = 'BizIntel' 111 | 112 | # 创建连接 113 | conn = pyodbc.connect('DRIVER={SQL Server};SERVER=' + server + ';DATABASE=' + db + ';Trusted_Connection=yes') 114 | 115 | # 查询语句 116 | sql = """ 117 | 118 | SELECT top 5 * 119 | FROM data 120 | 121 | """ 122 | df = pandas.io.sql.read_sql(sql, conn) 123 | df.head() 124 | ``` 125 | 126 | 127 | | | Date | Symbol | Volume | 128 | | --- | --- | --- | --- | 129 | | 0 | 2013-01-01 | A | 0.0 | 130 | | 1 | 2013-01-02 | A | 200.0 | 131 | | 2 | 2013-01-03 | A | 1200.0 | 132 | | 3 | 2013-01-04 | A | 1001.0 | 133 | | 4 | 2013-01-05 | A | 1300.0 | 134 | 135 | ## 版本 3 136 | 137 | ```py 138 | from sqlalchemy import create_engine 139 | ``` 140 | 141 | ```py 142 | # 参数 143 | ServerName = "DAVID-THINK" 144 | Database = "BizIntel" 145 | Driver = "driver=SQL Server Native Client 11.0" 146 | 147 | # 创建连接 148 | engine = create_engine('mssql+pyodbc://' + ServerName + '/' + Database + "?" + Driver) 149 | 150 | df = pd.read_sql_query("SELECT top 5 * FROM data", engine) 151 | df 152 | ``` 153 | 154 | 155 | | | Date | Symbol | Volume | 156 | | --- | --- | --- | --- | 157 | | 0 | 2013-01-01 | A | 0.0 | 158 | | 1 | 2013-01-02 | A | 200.0 | 159 | | 2 | 2013-01-03 | A | 1200.0 | 160 | | 3 | 2013-01-04 | A | 1001.0 | 161 | | 4 | 2013-01-05 | A | 1300.0 |
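作为本课的一个补充(非原文内容):同样的`read_sql_query`加引擎的模式也适用于其他数据库。下面用 SQLite 内存数据库演示,不需要真实的 SQL Server:

```py
import pandas as pd
from sqlalchemy import create_engine

# 创建 SQLite 内存数据库引擎
engine = create_engine('sqlite://')

# 写入一张演示表(数据为虚构)
pd.DataFrame({'Symbol': ['A', 'A'], 'Volume': [0.0, 200.0]}).to_sql('data', engine, index=False)

# 用同样的模式读回来
df = pd.read_sql_query("SELECT * FROM data", engine)
print(df)
```

注意 SQLite 不支持`SELECT top 5`这种 T-SQL 语法,对应的写法是在查询末尾加`LIMIT 5`。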
-------------------------------------------------------------------------------- /3.9.md: -------------------------------------------------------------------------------- 1 | # 第九课 2 | 3 | > 将数据从小型 SQL 数据库导出到 CSV、Excel 和 TXT。 4 | 5 | ```py 6 | # 导入库 7 | import pandas as pd 8 | import sys 9 | from sqlalchemy import create_engine, MetaData, Table, select 10 | ``` 11 | 12 | ```py 13 | print('Python version ' + sys.version) 14 | print('Pandas version ' + pd.__version__) 15 | ``` 16 | 17 | ``` 18 | Python version 3.5.1 |Anaconda custom (64-bit)| (default, Feb 16 2016, 09:49:46) [MSC v.1900 64 bit (AMD64)] 19 | Pandas version 0.18.1 20 | ``` 21 | 22 | ## 从 SQL 抓取数据 23 | 24 | 这一节中我们使用 sqlalchemy 库从 SQL 数据库中抓取数据。要注意参数部分需要修改。 25 | 26 | ```py 27 | # 参数 28 | TableName = "data" 29 | 30 | DB = { 31 | 'drivername': 'mssql+pyodbc', 32 | 'servername': 'DAVID-THINK', 33 | #'port': '5432', 34 | #'username': 'lynn', 35 | #'password': '', 36 | 'database': 'BizIntel', 37 | 'driver': 'SQL Server Native Client 11.0', 38 | 'trusted_connection': 'yes', 39 | 'legacy_schema_aliasing': False 40 | } 41 | 42 | # 创建连接 43 | engine = create_engine(DB['drivername'] + '://' + DB['servername'] + '/' + DB['database'] + '?' + 'driver=' + DB['driver'] + ';' + 'trusted_connection=' + DB['trusted_connection'], legacy_schema_aliasing=DB['legacy_schema_aliasing']) 44 | conn = engine.connect() 45 | 46 | # 需要用于表的查询 47 | metadata = MetaData(conn) 48 | 49 | # 要查询的表 50 | tbl = Table(TableName, metadata, autoload=True, schema="dbo") 51 | #tbl.create(checkfirst=True) 52 | 53 | # 选择全部 54 | sql = tbl.select() 55 | 56 | # 执行 SQL 语句 57 | result = conn.execute(sql) 58 | 59 | # 插入到 DataFrame 60 | df = pd.DataFrame(data=list(result), columns=result.keys()) 61 | 62 | # 关闭连接 63 | conn.close() 64 | 65 | print('Done') 66 | ``` 67 | 68 | ``` 69 | Done 70 | ``` 71 | 72 | 所有下面的文件都保存在该笔记本的相同文件夹中。 73 | 74 | ## 导出到 CSV 75 | 76 | ```py 77 | df.to_csv('DimDate.csv', index=False) 78 | print('Done') 79 | ``` 80 | 81 | ``` 82 | Done 83 | ``` 84 | 85 | ## 导出到 Excel 86 | 87 | ```py 88 | df.to_excel('DimDate.xls', index=False) 89 | print('Done') 90 | ``` 91 | 92 | ``` 93 | Done 94 | ``` 95 | 96 | ## 导出到 TXT 97 | 98 | ```py 99 | df.to_csv('DimDate.txt', index=False) 100 | print('Done') 101 | ``` 102 | 103 | ``` 104 | Done 105 | ``` 106 | 107 | -------------------------------------------------------------------------------- /3.md: -------------------------------------------------------------------------------- 1 | # 新手入门课程 --------------------------------------------------------------------------------