├── Chapter01 ├── IMDB.p ├── IMDB.xlsx └── IMDB.sqlite ├── LICENSE ├── Chapter03 ├── data-movies-top-grossing.csv ├── Handling SettingWithCopyWarning.ipynb ├── Renaming columns in a pandas DataFrame.ipynb ├── Handling missing values in pandas.ipynb ├── Work with dates and times data.ipynb └── Merging and concatenating multiple data frames into one.ipynb ├── README.md ├── Chapter04 └── data-alcohol.csv └── Chapter02 ├── Changing the datatype of a Pandas Series.ipynb ├── Using string methods in pandas.ipynb ├── Filter rows of a pandas DataFrame by column value.ipynb ├── Apply multiple filter criteria to a pandas DataFrame.ipynb ├── Using pandas Series data structure to select a subset of the data.ipynb └── Using the axis parameter in pandas.ipynb /Chapter01/IMDB.p: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Mastering-Exploratory-Analysis-with-pandas/HEAD/Chapter01/IMDB.p -------------------------------------------------------------------------------- /Chapter01/IMDB.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Mastering-Exploratory-Analysis-with-pandas/HEAD/Chapter01/IMDB.xlsx -------------------------------------------------------------------------------- /Chapter01/IMDB.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Mastering-Exploratory-Analysis-with-pandas/HEAD/Chapter01/IMDB.sqlite -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without 
restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Chapter03/data-movies-top-grossing.csv: -------------------------------------------------------------------------------- 1 | Rank,Title,Worldwide gross,Year 2 | 1,Avatar,"$2,787,965,087",2009 3 | 2,Titanic,"$2,186,772,302",1997 4 | 3,Star Wars: The Force Awakens,"$2,068,223,624",2015 5 | 4,Jurassic World,"$1,671,713,208",2015 6 | 5,The Avengers,"$1,518,812,988",2012 7 | 6,Furious 7,"$1,516,045,911",2015 8 | 7,Avengers: Age of Ultron,"$1,405,403,694",2015 9 | 8,Harry Potter and the Deathly Hallows – Part 2,"$1,341,511,219",2011 10 | 9,Frozen,"$1,287,000,000",2013 11 | 10,Beauty and the Beast,"$1,257,024,611",2017 12 | 11,The Fate of the Furious,"$1,238,290,870",2017 13 | 12,Iron Man 3,"$1,214,811,252",2013 14 | 13,Minions,"$1,159,398,397",2015 15 | 14,Captain America: Civil War,"$1,153,304,495",2016 16 | 15,Transformers: Dark of the Moon,"$1,123,794,079",2011 17 | 16,The Lord of the Rings: The Return of the King,"$1,120,237,002",2003 18 | 17,Skyfall,"$1,108,561,013",2012 19 | 
18,Transformers: Age of Extinction,"$1,104,054,072",2014 20 | 19,The Dark Knight Rises,"$1,084,939,099",2012 21 | 20,Toy Story 3,"$1,066,969,703",2010 22 | 21,Pirates of the Caribbean: Dead Man's Chest,"$1,066,179,725",2006 23 | 22,Rogue One: A Star Wars Story,"$1,056,057,273",2016 24 | 23,Pirates of the Caribbean: On Stranger Tides,"$1,045,713,802",2011 25 | 24,Jurassic Park,"$1,029,939,903",1993 26 | 25,Finding Dory,"$1,028,570,889",2016 27 | 26,Star Wars: Episode I – The Phantom Menace,"$1,027,044,677",1999 28 | 27,Alice in Wonderland,"$1,025,467,110",2010 29 | 28,Zootopia,"$1,023,784,195",2016 30 | 29,The Hobbit: An Unexpected Journey,"$1,021,103,568",2012 31 | 30,The Dark Knight,"$1,004,558,444",2008 32 | 31,Harry Potter and the Philosopher's Stone,"$974,755,371",2001 33 | 32,Despicable Me 2,"$970,761,885",2013 34 | 33,The Lion King,"$968,483,777",1994 35 | 34,The Jungle Book,"$966,550,600",2016 36 | 35,Pirates of the Caribbean: At World's End,"$963,420,425",2007 37 | 36,Harry Potter and the Deathly Hallows – Part 1,"$960,283,305",2010 38 | 37,The Hobbit: The Desolation of Smaug,"$958,366,855",2013 39 | 38,The Hobbit: The Battle of the Five Armies,"$956,019,788",2014 40 | 39,Finding Nemo,"$940,335,536",2003 41 | 40,Harry Potter and the Order of the Phoenix,"$939,885,929",2007 42 | 41,Harry Potter and the Half-Blood Prince,"$934,416,487",2009 43 | 42,The Lord of the Rings: The Two Towers,"$926,349,708",2002 44 | 43,Shrek 2,"$919,838,758",2004 45 | 44,Harry Potter and the Goblet of Fire,"$896,911,078",2005 46 | 45,Spider-Man 3,"$890,871,626",2007 47 | 46,Ice Age: Dawn of the Dinosaurs,"$886,686,817",2009 48 | 47,Spectre,"$880,674,609",2015 49 | 48,Harry Potter and the Chamber of Secrets,"$878,979,634",2002 50 | 49,Ice Age: Continental Drift,"$877,244,782",2012 51 | 50,The Secret Life of Pets,"$875,457,937",2016 -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | ## $5 Tech Unlocked 2021! 2 | [Buy and download this Book for only $5 on PacktPub.com](https://www.packtpub.com/product/mastering-exploratory-analysis-with-pandas/9781789619638) 3 | ----- 4 | *If you have read this book, please leave a review on [Amazon.com](https://www.amazon.com/gp/product/1789619637). Potential readers can then use your unbiased opinion to help them make purchase decisions. Thank you. The $5 campaign runs from __December 15th 2020__ to __January 13th 2021.__* 5 | 6 | # Mastering Exploratory Analysis with pandas 7 | 8 | Mastering Exploratory Analysis with pandas 9 | 10 | This is the code repository for [Mastering Exploratory Analysis with pandas](https://www.packtpub.com/big-data-and-business-intelligence/mastering-exploratory-analysis-pandas?utm_source=github&utm_medium=repository&utm_campaign=9781789619638), published by Packt. 11 | 12 | **Build an end-to-end data analysis workflow with Python** 13 | 14 | ## What is this book about? 15 | 16 | pandas is a Python library that lets you manipulate, transform, and analyze data. It is a popular framework for exploratory data visualization and analyzing datasets and data pipelines based on their properties. 17 | 18 | This book covers the following exciting features: 19 | 20 | * Learn how to read different kinds of data into pandas DataFrames for data analysis 21 | * Manipulate, transform, and apply formulas to data imported into pandas DataFrames 22 | * Use pandas to analyze and visualize different kinds of data to gain real-world insights 23 | * Extract transformed data from pandas DataFrames and convert it into the formats your application expects 24 | * Manipulate and model time-series data, perform algorithmic trading, derive results on fixed and moving windows, and more 25 | 26 | If you feel this book is for you, get your [copy](https://www.amazon.com/dp/1789619637) today! 
27 | 28 | https://www.packtpub.com/ 30 | 31 | 32 | ## Instructions and Navigations 33 | All of the code is organized into folders. For example, Chapter02. 34 | 35 | The code will look like the following: 36 | ``` 37 | df = pd.read_csv('IMDB.csv', encoding = "ISO-8859-1") 38 | df.head() 39 | 40 | ``` 41 | 42 | **Following is what you need for this book:** 43 | 44 | If you are a budding data scientist looking to learn the popular pandas library, or a Python developer looking to step into the world of data analysis, this book is the ideal resource you need to get started. Some programming experience in Python will be helpful to get the most out of this book. 45 | 46 | With the following software and hardware list you can run all code files present in the book (Chapter 1-4). 47 | 48 | ### Software and Hardware List 49 | 50 | | Chapter | Software required | OS required | 51 | | -------- | ------------------------------------| -----------------------------------| 52 | | 1-4 | Python 2.7.x and above | Windows/Ubuntu | 53 | 54 | We also provide a PDF file that has color images of the screenshots/diagrams used in this book. [Click here to download it](https://www.packtpub.com/sites/default/files/downloads/9781789619638_ColorImages.pdf). 55 | 56 | 57 | ### Related products 58 | * Artificial Intelligence By Example [[Packt]](https://www.packtpub.com/big-data-and-business-intelligence/artificial-intelligence-example?utm_source=github&utm_medium=repository&utm_campaign=9781788990547) [[Amazon]](https://www.amazon.com/dp/1788990544) 59 | 60 | * Machine Learning Algorithms [[Packt]](https://www.packtpub.com/big-data-and-business-intelligence/machine-learning-algorithms?utm_source=github&utm_medium=repository&utm_campaign=9781785889622) [[Amazon]](https://www.amazon.com/dp/1785889621) 61 | 62 | ## Get to Know the Author 63 | **Harish Garg** is a data analyst, author, and software developer who is really passionate about data science and Python. 
He is a graduate of Udacity's Data Analyst Nanodegree program. He has 17 years of industry experience in data analysis using Python, developing and testing enterprise and consumer software, managing projects and software teams, and creating training material and tutorials. He also worked for 11 years for Intel Security (previously McAfee, Inc.). He regularly contributes articles and tutorials on data analysis and Python. He is also active in the open data community and is a contributing member of the Data4Democracy open data initiative. He has written data analysis pieces for the Takshashila think tank. 64 | 65 | 66 | 67 | 68 | ### Suggestions and Feedback 69 | [Click here](https://docs.google.com/forms/d/e/1FAIpQLSdy7dATC6QmEL81FIUuymZ0Wy9vH1jHkvpY57OiMeKGqib_Ow/viewform) if you have any feedback or suggestions. 70 | -------------------------------------------------------------------------------- /Chapter04/data-alcohol.csv: -------------------------------------------------------------------------------- 1 | country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol 2 | Afghanistan,0,0,0,0.0 3 | Albania,89,132,54,4.9 4 | Algeria,25,0,14,0.7 5 | Andorra,245,138,312,12.4 6 | Angola,217,57,45,5.9 7 | Antigua & Barbuda,102,128,45,4.9 8 | Argentina,193,25,221,8.3 9 | Armenia,21,179,11,3.8 10 | Australia,261,72,212,10.4 11 | Austria,279,75,191,9.7 12 | Azerbaijan,21,46,5,1.3 13 | Bahamas,122,176,51,6.3 14 | Bahrain,42,63,7,2.0 15 | Bangladesh,0,0,0,0.0 16 | Barbados,143,173,36,6.3 17 | Belarus,142,373,42,14.4 18 | Belgium,295,84,212,10.5 19 | Belize,263,114,8,6.8 20 | Benin,34,4,13,1.1 21 | Bhutan,23,0,0,0.4 22 | Bolivia,167,41,8,3.8 23 | Bosnia-Herzegovina,76,173,8,4.6 24 | Botswana,173,35,35,5.4 25 | Brazil,245,145,16,7.2 26 | Brunei,31,2,1,0.6 27 | Bulgaria,231,252,94,10.3 28 | Burkina Faso,25,7,7,4.3 29 | Burundi,88,0,0,6.3 30 | Cote d'Ivoire,37,1,7,4.0 31 | Cabo Verde,144,56,16,4.0 32 | Cambodia,57,65,1,2.2 33 | Cameroon,147,1,4,5.8 34 | 
Canada,240,122,100,8.2 35 | Central African Republic,17,2,1,1.8 36 | Chad,15,1,1,0.4 37 | Chile,130,124,172,7.6 38 | China,79,192,8,5.0 39 | Colombia,159,76,3,4.2 40 | Comoros,1,3,1,0.1 41 | Congo,76,1,9,1.7 42 | Cook Islands,0,254,74,5.9 43 | Costa Rica,149,87,11,4.4 44 | Croatia,230,87,254,10.2 45 | Cuba,93,137,5,4.2 46 | Cyprus,192,154,113,8.2 47 | Czech Republic,361,170,134,11.8 48 | North Korea,0,0,0,0.0 49 | DR Congo,32,3,1,2.3 50 | Denmark,224,81,278,10.4 51 | Djibouti,15,44,3,1.1 52 | Dominica,52,286,26,6.6 53 | Dominican Republic,193,147,9,6.2 54 | Ecuador,162,74,3,4.2 55 | Egypt,6,4,1,0.2 56 | El Salvador,52,69,2,2.2 57 | Equatorial Guinea,92,0,233,5.8 58 | Eritrea,18,0,0,0.5 59 | Estonia,224,194,59,9.5 60 | Ethiopia,20,3,0,0.7 61 | Fiji,77,35,1,2.0 62 | Finland,263,133,97,10.0 63 | France,127,151,370,11.8 64 | Gabon,347,98,59,8.9 65 | Gambia,8,0,1,2.4 66 | Georgia,52,100,149,5.4 67 | Germany,346,117,175,11.3 68 | Ghana,31,3,10,1.8 69 | Greece,133,112,218,8.3 70 | Grenada,199,438,28,11.9 71 | Guatemala,53,69,2,2.2 72 | Guinea,9,0,2,0.2 73 | Guinea-Bissau,28,31,21,2.5 74 | Guyana,93,302,1,7.1 75 | Haiti,1,326,1,5.9 76 | Honduras,69,98,2,3.0 77 | Hungary,234,215,185,11.3 78 | Iceland,233,61,78,6.6 79 | India,9,114,0,2.2 80 | Indonesia,5,1,0,0.1 81 | Iran,0,0,0,0.0 82 | Iraq,9,3,0,0.2 83 | Ireland,313,118,165,11.4 84 | Israel,63,69,9,2.5 85 | Italy,85,42,237,6.5 86 | Jamaica,82,97,9,3.4 87 | Japan,77,202,16,7.0 88 | Jordan,6,21,1,0.5 89 | Kazakhstan,124,246,12,6.8 90 | Kenya,58,22,2,1.8 91 | Kiribati,21,34,1,1.0 92 | Kuwait,0,0,0,0.0 93 | Kyrgyzstan,31,97,6,2.4 94 | Laos,62,0,123,6.2 95 | Latvia,281,216,62,10.5 96 | Lebanon,20,55,31,1.9 97 | Lesotho,82,29,0,2.8 98 | Liberia,19,152,2,3.1 99 | Libya,0,0,0,0.0 100 | Lithuania,343,244,56,12.9 101 | Luxembourg,236,133,271,11.4 102 | Madagascar,26,15,4,0.8 103 | Malawi,8,11,1,1.5 104 | Malaysia,13,4,0,0.3 105 | Maldives,0,0,0,0.0 106 | Mali,5,1,1,0.6 107 | Malta,149,100,120,6.6 108 | Marshall Islands,0,0,0,0.0 109 
| Mauritania,0,0,0,0.0 110 | Mauritius,98,31,18,2.6 111 | Mexico,238,68,5,5.5 112 | Micronesia,62,50,18,2.3 113 | Monaco,0,0,0,0.0 114 | Mongolia,77,189,8,4.9 115 | Montenegro,31,114,128,4.9 116 | Morocco,12,6,10,0.5 117 | Mozambique,47,18,5,1.3 118 | Myanmar,5,1,0,0.1 119 | Namibia,376,3,1,6.8 120 | Nauru,49,0,8,1.0 121 | Nepal,5,6,0,0.2 122 | Netherlands,251,88,190,9.4 123 | New Zealand,203,79,175,9.3 124 | Nicaragua,78,118,1,3.5 125 | Niger,3,2,1,0.1 126 | Nigeria,42,5,2,9.1 127 | Niue,188,200,7,7.0 128 | Norway,169,71,129,6.7 129 | Oman,22,16,1,0.7 130 | Pakistan,0,0,0,0.0 131 | Palau,306,63,23,6.9 132 | Panama,285,104,18,7.2 133 | Papua New Guinea,44,39,1,1.5 134 | Paraguay,213,117,74,7.3 135 | Peru,163,160,21,6.1 136 | Philippines,71,186,1,4.6 137 | Poland,343,215,56,10.9 138 | Portugal,194,67,339,11.0 139 | Qatar,1,42,7,0.9 140 | South Korea,140,16,9,9.8 141 | Moldova,109,226,18,6.3 142 | Romania,297,122,167,10.4 143 | Russian Federation,247,326,73,11.5 144 | Rwanda,43,2,0,6.8 145 | St. Kitts & Nevis,194,205,32,7.7 146 | St. Lucia,171,315,71,10.1 147 | St. 
Vincent & the Grenadines,120,221,11,6.3 148 | Samoa,105,18,24,2.6 149 | San Marino,0,0,0,0.0 150 | Sao Tome & Principe,56,38,140,4.2 151 | Saudi Arabia,0,5,0,0.1 152 | Senegal,9,1,7,0.3 153 | Serbia,283,131,127,9.6 154 | Seychelles,157,25,51,4.1 155 | Sierra Leone,25,3,2,6.7 156 | Singapore,60,12,11,1.5 157 | Slovakia,196,293,116,11.4 158 | Slovenia,270,51,276,10.6 159 | Solomon Islands,56,11,1,1.2 160 | Somalia,0,0,0,0.0 161 | South Africa,225,76,81,8.2 162 | Spain,284,157,112,10.0 163 | Sri Lanka,16,104,0,2.2 164 | Sudan,8,13,0,1.7 165 | Suriname,128,178,7,5.6 166 | Swaziland,90,2,2,4.7 167 | Sweden,152,60,186,7.2 168 | Switzerland,185,100,280,10.2 169 | Syria,5,35,16,1.0 170 | Tajikistan,2,15,0,0.3 171 | Thailand,99,258,1,6.4 172 | Macedonia,106,27,86,3.9 173 | Timor-Leste,1,1,4,0.1 174 | Togo,36,2,19,1.3 175 | Tonga,36,21,5,1.1 176 | Trinidad & Tobago,197,156,7,6.4 177 | Tunisia,51,3,20,1.3 178 | Turkey,51,22,7,1.4 179 | Turkmenistan,19,71,32,2.2 180 | Tuvalu,6,41,9,1.0 181 | Uganda,45,9,0,8.3 182 | Ukraine,206,237,45,8.9 183 | United Arab Emirates,16,135,5,2.8 184 | United Kingdom,219,126,195,10.4 185 | Tanzania,36,6,1,5.7 186 | USA,249,158,84,8.7 187 | Uruguay,115,35,220,6.6 188 | Uzbekistan,25,101,8,2.4 189 | Vanuatu,21,18,11,0.9 190 | Venezuela,333,100,3,7.7 191 | Vietnam,111,2,1,2.0 192 | Yemen,6,0,0,0.1 193 | Zambia,32,19,4,2.5 194 | Zimbabwe,64,18,4,4.7 -------------------------------------------------------------------------------- /Chapter02/Changing the datatype of a Pandas Series.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "collapsed": false, 8 | "deletable": true, 9 | "editable": true, 10 | "slideshow": { 11 | "slide_type": "skip" 12 | } 13 | }, 14 | "outputs": [ 15 | { 16 | "data": { 17 | "text/html": [ 18 | "" 28 | ], 29 | "text/plain": [ 30 | "" 31 | ] 32 | }, 33 | "metadata": {}, 34 | "output_type": 
"display_data" 35 | } 36 | ], 37 | "source": [ 38 | "%%html\n", 39 | "" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 3, 54 | "metadata": { 55 | "collapsed": true, 56 | "deletable": true, 57 | "editable": true, 58 | "slideshow": { 59 | "slide_type": "skip" 60 | } 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "from IPython.core.interactiveshell import InteractiveShell\n", 65 | "InteractiveShell.ast_node_interactivity = \"all\"" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": { 71 | "deletable": true, 72 | "editable": true, 73 | "slideshow": { 74 | "slide_type": "slide" 75 | } 76 | }, 77 | "source": [ 78 | "## Import Pandas" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 4, 84 | "metadata": { 85 | "collapsed": true, 86 | "deletable": true, 87 | "editable": true, 88 | "slideshow": { 89 | "slide_type": "fragment" 90 | } 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "import pandas as pd" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": { 100 | "deletable": true, 101 | "editable": true, 102 | "slideshow": { 103 | "slide_type": "slide" 104 | } 105 | }, 106 | "source": [ 107 | "## Read in the dataset" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 5, 113 | "metadata": { 114 | "collapsed": false, 115 | "deletable": true, 116 | "editable": true, 117 | "slideshow": { 118 | "slide_type": "fragment" 119 | } 120 | }, 121 | "outputs": [ 122 | { 123 | "data": { 124 | "text/html": [ 125 | "
\n", 126 | "\n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | "
DateRegionIDRegionNameStateMetroCountySizeRankZhvi
02017-05-316181New YorkNYNew YorkQueens0672400
12017-05-3112447Los AngelesCALos Angeles-Long Beach-AnaheimLos Angeles1629900
22017-05-3117426ChicagoILChicagoCook2222700
32017-05-3113271PhiladelphiaPAPhiladelphiaPhiladelphia3137300
42017-05-3140326PhoenixAZPhoenixMaricopa4211300
\n", 198 | "
" 199 | ], 200 | "text/plain": [ 201 | " Date RegionID RegionName State Metro \\\n", 202 | "0 2017-05-31 6181 New York NY New York \n", 203 | "1 2017-05-31 12447 Los Angeles CA Los Angeles-Long Beach-Anaheim \n", 204 | "2 2017-05-31 17426 Chicago IL Chicago \n", 205 | "3 2017-05-31 13271 Philadelphia PA Philadelphia \n", 206 | "4 2017-05-31 40326 Phoenix AZ Phoenix \n", 207 | "\n", 208 | " County SizeRank Zhvi \n", 209 | "0 Queens 0 672400 \n", 210 | "1 Los Angeles 1 629900 \n", 211 | "2 Cook 2 222700 \n", 212 | "3 Philadelphia 3 137300 \n", 213 | "4 Maricopa 4 211300 " 214 | ] 215 | }, 216 | "execution_count": 5, 217 | "metadata": {}, 218 | "output_type": "execute_result" 219 | } 220 | ], 221 | "source": [ 222 | "data = pd.read_table('data-zillow.csv', sep=',')\n", 223 | "data.head()" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": { 229 | "slideshow": { 230 | "slide_type": "slide" 231 | } 232 | }, 233 | "source": [ 234 | "## Changing an int column to float" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 6, 240 | "metadata": { 241 | "collapsed": false, 242 | "deletable": true, 243 | "editable": true, 244 | "slideshow": { 245 | "slide_type": "fragment" 246 | } 247 | }, 248 | "outputs": [ 249 | { 250 | "data": { 251 | "text/plain": [ 252 | "Date object\n", 253 | "RegionID int64\n", 254 | "RegionName object\n", 255 | "State object\n", 256 | "Metro object\n", 257 | "County object\n", 258 | "SizeRank int64\n", 259 | "Zhvi int64\n", 260 | "dtype: object" 261 | ] 262 | }, 263 | "execution_count": 6, 264 | "metadata": {}, 265 | "output_type": "execute_result" 266 | } 267 | ], 268 | "source": [ 269 | "data.dtypes" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 7, 275 | "metadata": { 276 | "collapsed": true, 277 | "deletable": true, 278 | "editable": true, 279 | "slideshow": { 280 | "slide_type": "fragment" 281 | } 282 | }, 283 | "outputs": [], 284 | "source": [ 285 | "data['Zhvi'] = 
data.Zhvi.astype(float)" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 8, 291 | "metadata": { 292 | "collapsed": false, 293 | "deletable": true, 294 | "editable": true, 295 | "slideshow": { 296 | "slide_type": "fragment" 297 | } 298 | }, 299 | "outputs": [ 300 | { 301 | "data": { 302 | "text/plain": [ 303 | "Date object\n", 304 | "RegionID int64\n", 305 | "RegionName object\n", 306 | "State object\n", 307 | "Metro object\n", 308 | "County object\n", 309 | "SizeRank int64\n", 310 | "Zhvi float64\n", 311 | "dtype: object" 312 | ] 313 | }, 314 | "execution_count": 8, 315 | "metadata": {}, 316 | "output_type": "execute_result" 317 | } 318 | ], 319 | "source": [ 320 | "data.dtypes" 321 | ] 322 | }, 323 | { 324 | "cell_type": "markdown", 325 | "metadata": { 326 | "slideshow": { 327 | "slide_type": "slide" 328 | } 329 | }, 330 | "source": [ 331 | "## Changing datatype while reading data" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": 9, 337 | "metadata": { 338 | "collapsed": false, 339 | "deletable": true, 340 | "editable": true, 341 | "slideshow": { 342 | "slide_type": "fragment" 343 | } 344 | }, 345 | "outputs": [ 346 | { 347 | "data": { 348 | "text/plain": [ 349 | "Date object\n", 350 | "RegionID int64\n", 351 | "RegionName object\n", 352 | "State object\n", 353 | "Metro object\n", 354 | "County object\n", 355 | "SizeRank int64\n", 356 | "Zhvi float64\n", 357 | "dtype: object" 358 | ] 359 | }, 360 | "execution_count": 9, 361 | "metadata": {}, 362 | "output_type": "execute_result" 363 | } 364 | ], 365 | "source": [ 366 | "data2 = pd.read_csv('data-zillow.csv', sep=',', dtype={'Zhvi':float})\n", 367 | "data2.dtypes" 368 | ] 369 | }, 370 | { 371 | "cell_type": "markdown", 372 | "metadata": { 373 | "slideshow": { 374 | "slide_type": "slide" 375 | } 376 | }, 377 | "source": [ 378 | "## Converting string to datetime " 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": 16, 384 | "metadata": 
{ 385 | "collapsed": false, 386 | "slideshow": { 387 | "slide_type": "fragment" 388 | } 389 | }, 390 | "outputs": [ 391 | { 392 | "data": { 393 | "text/plain": [ 394 | "0 2017-05-31\n", 395 | "1 2017-05-31\n", 396 | "2 2017-05-31\n", 397 | "3 2017-05-31\n", 398 | "4 2017-05-31\n", 399 | "Name: Date, dtype: datetime64[ns]" 400 | ] 401 | }, 402 | "execution_count": 16, 403 | "metadata": {}, 404 | "output_type": "execute_result" 405 | } 406 | ], 407 | "source": [ 408 | "pd.to_datetime(data2.Date,infer_datetime_format=True).head()" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": null, 414 | "metadata": { 415 | "collapsed": true, 416 | "deletable": true, 417 | "editable": true, 418 | "slideshow": { 419 | "slide_type": "skip" 420 | } 421 | }, 422 | "outputs": [], 423 | "source": [] 424 | } 425 | ], 426 | "metadata": { 427 | "celltoolbar": "Slideshow", 428 | "kernelspec": { 429 | "display_name": "Python 3", 430 | "language": "python", 431 | "name": "python3" 432 | }, 433 | "language_info": { 434 | "codemirror_mode": { 435 | "name": "ipython", 436 | "version": 3 437 | }, 438 | "file_extension": ".py", 439 | "mimetype": "text/x-python", 440 | "name": "python", 441 | "nbconvert_exporter": "python", 442 | "pygments_lexer": "ipython3", 443 | "version": "3.6.1" 444 | } 445 | }, 446 | "nbformat": 4, 447 | "nbformat_minor": 2 448 | } 449 | -------------------------------------------------------------------------------- /Chapter03/Handling SettingWithCopyWarning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false, 8 | "deletable": true, 9 | "editable": true, 10 | "slideshow": { 11 | "slide_type": "skip" 12 | } 13 | }, 14 | "outputs": [ 15 | { 16 | "data": { 17 | "text/html": [ 18 | "" 28 | ], 29 | "text/plain": [ 30 | "" 31 | ] 32 | }, 33 | "metadata": {}, 34 | "output_type": "display_data" 35 | } 36 
| ], 37 | "source": [ 38 | "%%html\n", 39 | "" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 2, 54 | "metadata": { 55 | "collapsed": true, 56 | "deletable": true, 57 | "editable": true, 58 | "slideshow": { 59 | "slide_type": "skip" 60 | } 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "from IPython.core.interactiveshell import InteractiveShell\n", 65 | "InteractiveShell.ast_node_interactivity = \"all\"" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": { 71 | "deletable": true, 72 | "editable": true, 73 | "slideshow": { 74 | "slide_type": "slide" 75 | } 76 | }, 77 | "source": [ 78 | "## Import Pandas" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 6, 84 | "metadata": { 85 | "collapsed": true, 86 | "deletable": true, 87 | "editable": true, 88 | "slideshow": { 89 | "slide_type": "fragment" 90 | } 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "import pandas as pd" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": { 100 | "deletable": true, 101 | "editable": true, 102 | "slideshow": { 103 | "slide_type": "slide" 104 | } 105 | }, 106 | "source": [ 107 | "## Read in the dataset" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 35, 113 | "metadata": { 114 | "collapsed": false, 115 | "deletable": true, 116 | "editable": true, 117 | "slideshow": { 118 | "slide_type": "fragment" 119 | } 120 | }, 121 | "outputs": [ 122 | { 123 | "data": { 124 | "text/html": [ 125 | "
\n", 126 | "\n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS
\n", 222 | "
" 223 | ], 224 | "text/plain": [ 225 | " PassengerId Survived Pclass \\\n", 226 | "0 1 0 3 \n", 227 | "1 2 1 1 \n", 228 | "2 3 1 3 \n", 229 | "3 4 1 1 \n", 230 | "4 5 0 3 \n", 231 | "\n", 232 | " Name Sex Age SibSp \\\n", 233 | "0 Braund, Mr. Owen Harris male 22.0 1 \n", 234 | "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", 235 | "2 Heikkinen, Miss. Laina female 26.0 0 \n", 236 | "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", 237 | "4 Allen, Mr. William Henry male 35.0 0 \n", 238 | "\n", 239 | " Parch Ticket Fare Cabin Embarked \n", 240 | "0 0 A/5 21171 7.2500 NaN S \n", 241 | "1 0 PC 17599 71.2833 C85 C \n", 242 | "2 0 STON/O2. 3101282 7.9250 NaN S \n", 243 | "3 0 113803 53.1000 C123 S \n", 244 | "4 0 373450 8.0500 NaN S " 245 | ] 246 | }, 247 | "execution_count": 35, 248 | "metadata": {}, 249 | "output_type": "execute_result" 250 | } 251 | ], 252 | "source": [ 253 | "data = pd.read_csv('data-titanic.csv')\n", 254 | "data.head()" 255 | ] 256 | }, 257 | { 258 | "cell_type": "markdown", 259 | "metadata": { 260 | "slideshow": { 261 | "slide_type": "slide" 262 | } 263 | }, 264 | "source": [ 265 | "## A `SettingWithCopyWarning` scenario" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 36, 271 | "metadata": { 272 | "collapsed": false, 273 | "deletable": true, 274 | "editable": true, 275 | "slideshow": { 276 | "slide_type": "fragment" 277 | } 278 | }, 279 | "outputs": [ 280 | { 281 | "name": "stderr", 282 | "output_type": "stream", 283 | "text": [ 284 | "C:\\Users\\harish\\Anaconda2\\envs\\python3\\lib\\site-packages\\pandas\\core\\generic.py:2773: SettingWithCopyWarning: \n", 285 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 286 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 287 | "\n", 288 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 289 | " self[name] = value\n" 290 | ] 
291 | } 292 | ], 293 | "source": [ 294 | "data[data.Age.isnull()].Age = data.Age.mean()" 295 | ] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "metadata": { 300 | "slideshow": { 301 | "slide_type": "slide" 302 | } 303 | }, 304 | "source": [ 305 | "## Handling the `SettingWithCopyWarning`" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": 39, 311 | "metadata": { 312 | "collapsed": false, 313 | "slideshow": { 314 | "slide_type": "fragment" 315 | } 316 | }, 317 | "outputs": [ 318 | { 319 | "data": { 320 | "text/plain": [ 321 | "5 NaN\n", 322 | "17 NaN\n", 323 | "19 NaN\n", 324 | "26 NaN\n", 325 | "28 NaN\n", 326 | "Name: Age, dtype: float64" 327 | ] 328 | }, 329 | "execution_count": 39, 330 | "metadata": {}, 331 | "output_type": "execute_result" 332 | } 333 | ], 334 | "source": [ 335 | "data[data.Age.isnull()].Age.head()" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 41, 341 | "metadata": { 342 | "collapsed": false, 343 | "slideshow": { 344 | "slide_type": "fragment" 345 | } 346 | }, 347 | "outputs": [], 348 | "source": [ 349 | "data.loc[data.Age.isnull(), 'Age'] = data.Age.mean" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": 42, 355 | "metadata": { 356 | "collapsed": false, 357 | "slideshow": { 358 | "slide_type": "fragment" 359 | } 360 | }, 361 | "outputs": [ 362 | { 363 | "data": { 364 | "text/html": [ 365 | "
\n", 366 | "\n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
\n", 387 | "
" 388 | ], 389 | "text/plain": [ 390 | "Empty DataFrame\n", 391 | "Columns: [PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked]\n", 392 | "Index: []" 393 | ] 394 | }, 395 | "execution_count": 42, 396 | "metadata": {}, 397 | "output_type": "execute_result" 398 | } 399 | ], 400 | "source": [ 401 | "data[data.Age.isnull()]" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": null, 407 | "metadata": { 408 | "collapsed": true, 409 | "slideshow": { 410 | "slide_type": "skip" 411 | } 412 | }, 413 | "outputs": [], 414 | "source": [] 415 | } 416 | ], 417 | "metadata": { 418 | "celltoolbar": "Slideshow", 419 | "kernelspec": { 420 | "display_name": "Python 3", 421 | "language": "python", 422 | "name": "python3" 423 | }, 424 | "language_info": { 425 | "codemirror_mode": { 426 | "name": "ipython", 427 | "version": 3 428 | }, 429 | "file_extension": ".py", 430 | "mimetype": "text/x-python", 431 | "name": "python", 432 | "nbconvert_exporter": "python", 433 | "pygments_lexer": "ipython3", 434 | "version": "3.6.1" 435 | } 436 | }, 437 | "nbformat": 4, 438 | "nbformat_minor": 2 439 | } 440 | -------------------------------------------------------------------------------- /Chapter02/Using string methods in pandas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "collapsed": false, 8 | "deletable": true, 9 | "editable": true, 10 | "slideshow": { 11 | "slide_type": "skip" 12 | } 13 | }, 14 | "outputs": [ 15 | { 16 | "data": { 17 | "text/html": [ 18 | "" 28 | ], 29 | "text/plain": [ 30 | "" 31 | ] 32 | }, 33 | "metadata": {}, 34 | "output_type": "display_data" 35 | } 36 | ], 37 | "source": [ 38 | "%%html\n", 39 | "" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 3, 54 | "metadata": { 55 | "collapsed": true, 56 | "deletable": true, 57 | "editable": true, 58 | 
"slideshow": { 59 | "slide_type": "skip" 60 | } 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "from IPython.core.interactiveshell import InteractiveShell\n", 65 | "InteractiveShell.ast_node_interactivity = \"all\"" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": { 71 | "deletable": true, 72 | "editable": true, 73 | "slideshow": { 74 | "slide_type": "slide" 75 | } 76 | }, 77 | "source": [ 78 | "## Import Pandas" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 4, 84 | "metadata": { 85 | "collapsed": true, 86 | "deletable": true, 87 | "editable": true, 88 | "slideshow": { 89 | "slide_type": "fragment" 90 | } 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "import pandas as pd" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": { 100 | "deletable": true, 101 | "editable": true, 102 | "slideshow": { 103 | "slide_type": "slide" 104 | } 105 | }, 106 | "source": [ 107 | "## Read in the dataset" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 5, 113 | "metadata": { 114 | "collapsed": false, 115 | "deletable": true, 116 | "editable": true, 117 | "slideshow": { 118 | "slide_type": "fragment" 119 | } 120 | }, 121 | "outputs": [ 122 | { 123 | "data": { 124 | "text/html": [ 125 | "
\n", 126 | "\n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | "
DateRegionIDRegionNameStateMetroCountySizeRankZhvi
02017-05-316181New YorkNYNew YorkQueens0672400
12017-05-3112447Los AngelesCALos Angeles-Long Beach-AnaheimLos Angeles1629900
22017-05-3117426ChicagoILChicagoCook2222700
32017-05-3113271PhiladelphiaPAPhiladelphiaPhiladelphia3137300
42017-05-3140326PhoenixAZPhoenixMaricopa4211300
\n", 198 | "
" 199 | ], 200 | "text/plain": [ 201 | " Date RegionID RegionName State Metro \\\n", 202 | "0 2017-05-31 6181 New York NY New York \n", 203 | "1 2017-05-31 12447 Los Angeles CA Los Angeles-Long Beach-Anaheim \n", 204 | "2 2017-05-31 17426 Chicago IL Chicago \n", 205 | "3 2017-05-31 13271 Philadelphia PA Philadelphia \n", 206 | "4 2017-05-31 40326 Phoenix AZ Phoenix \n", 207 | "\n", 208 | " County SizeRank Zhvi \n", 209 | "0 Queens 0 672400 \n", 210 | "1 Los Angeles 1 629900 \n", 211 | "2 Cook 2 222700 \n", 212 | "3 Philadelphia 3 137300 \n", 213 | "4 Maricopa 4 211300 " 214 | ] 215 | }, 216 | "execution_count": 5, 217 | "metadata": {}, 218 | "output_type": "execute_result" 219 | } 220 | ], 221 | "source": [ 222 | "data = pd.read_table('data-zillow.csv', sep=',')\n", 223 | "data.head()" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": { 229 | "slideshow": { 230 | "slide_type": "slide" 231 | } 232 | }, 233 | "source": [ 234 | "\n", 235 | "## Check for a substring" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 13, 241 | "metadata": { 242 | "collapsed": false, 243 | "deletable": true, 244 | "editable": true, 245 | "slideshow": { 246 | "slide_type": "fragment" 247 | } 248 | }, 249 | "outputs": [ 250 | { 251 | "data": { 252 | "text/plain": [ 253 | "0 True\n", 254 | "1 False\n", 255 | "2 False\n", 256 | "3 False\n", 257 | "4 False\n", 258 | "Name: RegionName, dtype: bool" 259 | ] 260 | }, 261 | "execution_count": 13, 262 | "metadata": {}, 263 | "output_type": "execute_result" 264 | } 265 | ], 266 | "source": [ 267 | "data.RegionName.str.contains('New').head()" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": { 273 | "slideshow": { 274 | "slide_type": "slide" 275 | } 276 | }, 277 | "source": [ 278 | "## Make values of a series or column uppercase" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 7, 284 | "metadata": { 285 | "collapsed": false, 286 | "deletable": 
true, 287 | "editable": true, 288 | "slideshow": { 289 | "slide_type": "fragment" 290 | } 291 | }, 292 | "outputs": [ 293 | { 294 | "data": { 295 | "text/plain": [ 296 | "0 NEW YORK\n", 297 | "1 LOS ANGELES\n", 298 | "2 CHICAGO\n", 299 | "3 PHILADELPHIA\n", 300 | "4 PHOENIX\n", 301 | "Name: RegionName, dtype: object" 302 | ] 303 | }, 304 | "execution_count": 7, 305 | "metadata": {}, 306 | "output_type": "execute_result" 307 | } 308 | ], 309 | "source": [ 310 | "data.RegionName.str.upper().head()" 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "metadata": { 316 | "slideshow": { 317 | "slide_type": "slide" 318 | } 319 | }, 320 | "source": [ 321 | "## Make values lowercase" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 8, 327 | "metadata": { 328 | "collapsed": false, 329 | "deletable": true, 330 | "editable": true, 331 | "slideshow": { 332 | "slide_type": "fragment" 333 | } 334 | }, 335 | "outputs": [ 336 | { 337 | "data": { 338 | "text/plain": [ 339 | "0 queens\n", 340 | "1 los angeles\n", 341 | "2 cook\n", 342 | "3 philadelphia\n", 343 | "4 maricopa\n", 344 | "Name: County, dtype: object" 345 | ] 346 | }, 347 | "execution_count": 8, 348 | "metadata": {}, 349 | "output_type": "execute_result" 350 | } 351 | ], 352 | "source": [ 353 | "data.County.str.lower().head()" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": { 359 | "slideshow": { 360 | "slide_type": "slide" 361 | } 362 | }, 363 | "source": [ 364 | "## Get the length of each value in a column" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": 10, 370 | "metadata": { 371 | "collapsed": false, 372 | "deletable": true, 373 | "editable": true, 374 | "slideshow": { 375 | "slide_type": "fragment" 376 | } 377 | }, 378 | "outputs": [ 379 | { 380 | "data": { 381 | "text/plain": [ 382 | "0 6\n", 383 | "1 11\n", 384 | "2 4\n", 385 | "3 12\n", 386 | "4 8\n", 387 | "Name: County, dtype: int64" 388 | ] 389 | }, 390 | 
"execution_count": 10, 391 | "metadata": {}, 392 | "output_type": "execute_result" 393 | } 394 | ], 395 | "source": [ 396 | "data.County.str.len().head()" 397 | ] 398 | }, 399 | { 400 | "cell_type": "markdown", 401 | "metadata": { 402 | "slideshow": { 403 | "slide_type": "slide" 404 | } 405 | }, 406 | "source": [ 407 | "## Remove all whitespace from the beginning" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": 16, 413 | "metadata": { 414 | "collapsed": false, 415 | "slideshow": { 416 | "slide_type": "fragment" 417 | } 418 | }, 419 | "outputs": [ 420 | { 421 | "data": { 422 | "text/plain": [ 423 | "0 New York\n", 424 | "1 Los Angeles\n", 425 | "2 Chicago\n", 426 | "3 Philadelphia\n", 427 | "4 Phoenix\n", 428 | "Name: RegionName, dtype: object" 429 | ] 430 | }, 431 | "execution_count": 16, 432 | "metadata": {}, 433 | "output_type": "execute_result" 434 | } 435 | ], 436 | "source": [ 437 | "data.RegionName.str.lstrip().head()" 438 | ] 439 | }, 440 | { 441 | "cell_type": "markdown", 442 | "metadata": { 443 | "slideshow": { 444 | "slide_type": "slide" 445 | } 446 | }, 447 | "source": [ 448 | "## Replace parts of a column's values" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": 18, 454 | "metadata": { 455 | "collapsed": false, 456 | "slideshow": { 457 | "slide_type": "fragment" 458 | } 459 | }, 460 | "outputs": [ 461 | { 462 | "data": { 463 | "text/plain": [ 464 | "0 NewYork\n", 465 | "1 LosAngeles\n", 466 | "2 Chicago\n", 467 | "3 Philadelphia\n", 468 | "4 Phoenix\n", 469 | "Name: RegionName, dtype: object" 470 | ] 471 | }, 472 | "execution_count": 18, 473 | "metadata": {}, 474 | "output_type": "execute_result" 475 | } 476 | ], 477 | "source": [ 478 | "data.RegionName.str.replace(' ', '').head()" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": null, 484 | "metadata": { 485 | "collapsed": true, 486 | "slideshow": { 487 | "slide_type": "skip" 488 | } 489 | }, 490 | "outputs": [], 491 
| "source": [] 492 | } 493 | ], 494 | "metadata": { 495 | "celltoolbar": "Slideshow", 496 | "kernelspec": { 497 | "display_name": "Python 3", 498 | "language": "python", 499 | "name": "python3" 500 | }, 501 | "language_info": { 502 | "codemirror_mode": { 503 | "name": "ipython", 504 | "version": 3 505 | }, 506 | "file_extension": ".py", 507 | "mimetype": "text/x-python", 508 | "name": "python", 509 | "nbconvert_exporter": "python", 510 | "pygments_lexer": "ipython3", 511 | "version": "3.6.1" 512 | } 513 | }, 514 | "nbformat": 4, 515 | "nbformat_minor": 2 516 | } 517 | -------------------------------------------------------------------------------- /Chapter03/Renaming columns in a pandas DataFrame.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false, 8 | "deletable": true, 9 | "editable": true, 10 | "slideshow": { 11 | "slide_type": "skip" 12 | } 13 | }, 14 | "outputs": [ 15 | { 16 | "data": { 17 | "text/html": [ 18 | "" 28 | ], 29 | "text/plain": [ 30 | "" 31 | ] 32 | }, 33 | "metadata": {}, 34 | "output_type": "display_data" 35 | } 36 | ], 37 | "source": [ 38 | "%%html\n", 39 | "" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": { 54 | "deletable": true, 55 | "editable": true, 56 | "slideshow": { 57 | "slide_type": "slide" 58 | } 59 | }, 60 | "source": [ 61 | "## Import Pandas" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 2, 67 | "metadata": { 68 | "collapsed": true, 69 | "deletable": true, 70 | "editable": true, 71 | "slideshow": { 72 | "slide_type": "fragment" 73 | } 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "import pandas as pd" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": { 83 | "deletable": true, 84 | "editable": true, 85 | "slideshow": { 86 | "slide_type": "slide" 87 | } 88 | }, 89 | "source": [ 90 | "## Renaming Columns" 91 | ] 
92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": { 96 | "deletable": true, 97 | "editable": true, 98 | "slideshow": { 99 | "slide_type": "slide" 100 | } 101 | }, 102 | "source": [ 103 | "## Rename columns while reading the data" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 5, 109 | "metadata": { 110 | "collapsed": false, 111 | "deletable": true, 112 | "editable": true, 113 | "slideshow": { 114 | "slide_type": "fragment" 115 | } 116 | }, 117 | "outputs": [ 118 | { 119 | "data": { 120 | "text/html": [ 121 | "
\n", 122 | "\n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | "
DateRegion IDRegion NameStateCityCountySize RankPrice
0DateRegionIDRegionNameStateMetroCountySizeRankPrice
12017-05-316181New YorkNYNew YorkQueens0672400
22017-05-3112447Los AngelesCALos Angeles-Long Beach-AnaheimLos Angeles1629900
32017-05-3117426ChicagoILChicagoCook2222700
42017-05-3113271PhiladelphiaPAPhiladelphiaPhiladelphia3137300
\n", 194 | "
" 195 | ], 196 | "text/plain": [ 197 | " Date Region ID Region Name State City \\\n", 198 | "0 Date RegionID RegionName State Metro \n", 199 | "1 2017-05-31 6181 New York NY New York \n", 200 | "2 2017-05-31 12447 Los Angeles CA Los Angeles-Long Beach-Anaheim \n", 201 | "3 2017-05-31 17426 Chicago IL Chicago \n", 202 | "4 2017-05-31 13271 Philadelphia PA Philadelphia \n", 203 | "\n", 204 | " County Size Rank Price \n", 205 | "0 County SizeRank Price \n", 206 | "1 Queens 0 672400 \n", 207 | "2 Los Angeles 1 629900 \n", 208 | "3 Cook 2 222700 \n", 209 | "4 Philadelphia 3 137300 " 210 | ] 211 | }, 212 | "execution_count": 5, 213 | "metadata": {}, 214 | "output_type": "execute_result" 215 | } 216 | ], 217 | "source": [ 218 | "list_columns = ['Date', 'Region ID', 'Region Name', 'State',\n", 219 | " 'City', 'County', 'Size Rank','Price']\n", 220 | "data = pd.read_csv('data-zillow.csv', names = list_columns)\n", 221 | "data.head()" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": { 227 | "deletable": true, 228 | "editable": true, 229 | "slideshow": { 230 | "slide_type": "slide" 231 | } 232 | }, 233 | "source": [ 234 | "## Rename columns using rename method" 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": { 240 | "deletable": true, 241 | "editable": true, 242 | "slideshow": { 243 | "slide_type": "slide" 244 | } 245 | }, 246 | "source": [ 247 | "### Read in the dataset again" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 4, 253 | "metadata": { 254 | "collapsed": false, 255 | "deletable": true, 256 | "editable": true, 257 | "slideshow": { 258 | "slide_type": "fragment" 259 | } 260 | }, 261 | "outputs": [ 262 | { 263 | "data": { 264 | "text/html": [ 265 | "
\n", 266 | "\n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | "
DateRegionIDRegionNameStateMetroCountySizeRankPrice
02017-05-316181New YorkNYNew YorkQueens0672400
12017-05-3112447Los AngelesCALos Angeles-Long Beach-AnaheimLos Angeles1629900
22017-05-3117426ChicagoILChicagoCook2222700
32017-05-3113271PhiladelphiaPAPhiladelphiaPhiladelphia3137300
42017-05-3140326PhoenixAZPhoenixMaricopa4211300
\n", 338 | "
" 339 | ], 340 | "text/plain": [ 341 | " Date RegionID RegionName State Metro \\\n", 342 | "0 2017-05-31 6181 New York NY New York \n", 343 | "1 2017-05-31 12447 Los Angeles CA Los Angeles-Long Beach-Anaheim \n", 344 | "2 2017-05-31 17426 Chicago IL Chicago \n", 345 | "3 2017-05-31 13271 Philadelphia PA Philadelphia \n", 346 | "4 2017-05-31 40326 Phoenix AZ Phoenix \n", 347 | "\n", 348 | " County SizeRank Price \n", 349 | "0 Queens 0 672400 \n", 350 | "1 Los Angeles 1 629900 \n", 351 | "2 Cook 2 222700 \n", 352 | "3 Philadelphia 3 137300 \n", 353 | "4 Maricopa 4 211300 " 354 | ] 355 | }, 356 | "execution_count": 4, 357 | "metadata": {}, 358 | "output_type": "execute_result" 359 | } 360 | ], 361 | "source": [ 362 | "data = pd.read_csv('data-zillow.csv')\n", 363 | "data.head()" 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": { 369 | "slideshow": { 370 | "slide_type": "slide" 371 | } 372 | }, 373 | "source": [ 374 | "### Rename " 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": 7, 380 | "metadata": { 381 | "collapsed": false, 382 | "deletable": true, 383 | "editable": true, 384 | "slideshow": { 385 | "slide_type": "fragment" 386 | } 387 | }, 388 | "outputs": [ 389 | { 390 | "data": { 391 | "text/plain": [ 392 | "Index(['Date', 'Region ID', 'Region Name', 'State', 'City', 'County',\n", 393 | " 'Size Rank', 'Price'],\n", 394 | " dtype='object')" 395 | ] 396 | }, 397 | "execution_count": 7, 398 | "metadata": {}, 399 | "output_type": "execute_result" 400 | } 401 | ], 402 | "source": [ 403 | "data.columns" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": 8, 409 | "metadata": { 410 | "collapsed": true, 411 | "deletable": true, 412 | "editable": true, 413 | "slideshow": { 414 | "slide_type": "fragment" 415 | } 416 | }, 417 | "outputs": [], 418 | "source": [ 419 | "data.rename(columns={'RegionName':'Region', 'Metro':'City'}, inplace=True)" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 
| "execution_count": 9, 425 | "metadata": { 426 | "collapsed": false, 427 | "deletable": true, 428 | "editable": true, 429 | "slideshow": { 430 | "slide_type": "fragment" 431 | } 432 | }, 433 | "outputs": [ 434 | { 435 | "data": { 436 | "text/plain": [ 437 | "Index(['Date', 'Region ID', 'Region Name', 'State', 'City', 'County',\n", 438 | " 'Size Rank', 'Price'],\n", 439 | " dtype='object')" 440 | ] 441 | }, 442 | "execution_count": 9, 443 | "metadata": {}, 444 | "output_type": "execute_result" 445 | } 446 | ], 447 | "source": [ 448 | "data.columns" 449 | ] 450 | }, 451 | { 452 | "cell_type": "markdown", 453 | "metadata": { 454 | "deletable": true, 455 | "editable": true, 456 | "slideshow": { 457 | "slide_type": "slide" 458 | } 459 | }, 460 | "source": [ 461 | "## Rename all columns" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": 12, 467 | "metadata": { 468 | "collapsed": false, 469 | "deletable": true, 470 | "editable": true, 471 | "slideshow": { 472 | "slide_type": "fragment" 473 | } 474 | }, 475 | "outputs": [], 476 | "source": [ 477 | "data.columns = ['Date', 'Region ID', 'Region Name', 'State',\n", 478 | " 'City', 'County', 'Size Rank','Price']" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": null, 484 | "metadata": { 485 | "collapsed": true, 486 | "deletable": true, 487 | "editable": true, 488 | "slideshow": { 489 | "slide_type": "skip" 490 | } 491 | }, 492 | "outputs": [], 493 | "source": [] 494 | } 495 | ], 496 | "metadata": { 497 | "celltoolbar": "Slideshow", 498 | "kernelspec": { 499 | "display_name": "Python 3", 500 | "language": "python", 501 | "name": "python3" 502 | }, 503 | "language_info": { 504 | "codemirror_mode": { 505 | "name": "ipython", 506 | "version": 3 507 | }, 508 | "file_extension": ".py", 509 | "mimetype": "text/x-python", 510 | "name": "python", 511 | "nbconvert_exporter": "python", 512 | "pygments_lexer": "ipython3", 513 | "version": "3.6.1" 514 | } 515 | }, 516 | "nbformat": 4, 
517 | "nbformat_minor": 2 518 | } 519 | -------------------------------------------------------------------------------- /Chapter03/Handling missing values in pandas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 4, 6 | "metadata": { 7 | "collapsed": false, 8 | "deletable": true, 9 | "editable": true, 10 | "slideshow": { 11 | "slide_type": "skip" 12 | } 13 | }, 14 | "outputs": [ 15 | { 16 | "data": { 17 | "text/html": [ 18 | "" 28 | ], 29 | "text/plain": [ 30 | "" 31 | ] 32 | }, 33 | "metadata": {}, 34 | "output_type": "display_data" 35 | } 36 | ], 37 | "source": [ 38 | "%%html\n", 39 | "" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 5, 54 | "metadata": { 55 | "collapsed": true, 56 | "deletable": true, 57 | "editable": true, 58 | "slideshow": { 59 | "slide_type": "skip" 60 | } 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "from IPython.core.interactiveshell import InteractiveShell\n", 65 | "InteractiveShell.ast_node_interactivity = \"all\"" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": { 71 | "deletable": true, 72 | "editable": true, 73 | "slideshow": { 74 | "slide_type": "slide" 75 | } 76 | }, 77 | "source": [ 78 | "## Import Pandas" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 6, 84 | "metadata": { 85 | "collapsed": true, 86 | "deletable": true, 87 | "editable": true, 88 | "slideshow": { 89 | "slide_type": "fragment" 90 | } 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "import pandas as pd" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": { 100 | "deletable": true, 101 | "editable": true, 102 | "slideshow": { 103 | "slide_type": "slide" 104 | } 105 | }, 106 | "source": [ 107 | "## Read in the dataset\n", 108 | "\n", 109 | "Titanic Survival Dataset from https://www.kaggle.com/c/titanic/data" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 
| "execution_count": 7, 115 | "metadata": { 116 | "collapsed": false, 117 | "deletable": true, 118 | "editable": true, 119 | "slideshow": { 120 | "slide_type": "fragment" 121 | } 122 | }, 123 | "outputs": [ 124 | { 125 | "data": { 126 | "text/html": [ 127 | "
\n", 128 | "\n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS
\n", 224 | "
" 225 | ], 226 | "text/plain": [ 227 | " PassengerId Survived Pclass \\\n", 228 | "0 1 0 3 \n", 229 | "1 2 1 1 \n", 230 | "2 3 1 3 \n", 231 | "3 4 1 1 \n", 232 | "4 5 0 3 \n", 233 | "\n", 234 | " Name Sex Age SibSp \\\n", 235 | "0 Braund, Mr. Owen Harris male 22.0 1 \n", 236 | "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", 237 | "2 Heikkinen, Miss. Laina female 26.0 0 \n", 238 | "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", 239 | "4 Allen, Mr. William Henry male 35.0 0 \n", 240 | "\n", 241 | " Parch Ticket Fare Cabin Embarked \n", 242 | "0 0 A/5 21171 7.2500 NaN S \n", 243 | "1 0 PC 17599 71.2833 C85 C \n", 244 | "2 0 STON/O2. 3101282 7.9250 NaN S \n", 245 | "3 0 113803 53.1000 C123 S \n", 246 | "4 0 373450 8.0500 NaN S " 247 | ] 248 | }, 249 | "execution_count": 7, 250 | "metadata": {}, 251 | "output_type": "execute_result" 252 | } 253 | ], 254 | "source": [ 255 | "data = pd.read_csv('data-titanic.csv')\n", 256 | "data.head()" 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "metadata": { 262 | "deletable": true, 263 | "editable": true, 264 | "slideshow": { 265 | "slide_type": "slide" 266 | } 267 | }, 268 | "source": [ 269 | "## Missing Records" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": { 275 | "deletable": true, 276 | "editable": true, 277 | "slideshow": { 278 | "slide_type": "slide" 279 | } 280 | }, 281 | "source": [ 282 | "### Find out total records in the dataset" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 8, 288 | "metadata": { 289 | "collapsed": false, 290 | "deletable": true, 291 | "editable": true, 292 | "slideshow": { 293 | "slide_type": "fragment" 294 | } 295 | }, 296 | "outputs": [ 297 | { 298 | "data": { 299 | "text/plain": [ 300 | "(891, 12)" 301 | ] 302 | }, 303 | "execution_count": 8, 304 | "metadata": {}, 305 | "output_type": "execute_result" 306 | } 307 | ], 308 | "source": [ 309 | "data.shape" 310 | ] 311 | }, 312 | { 313 | 
"cell_type": "markdown", 314 | "metadata": { 315 | "collapsed": true, 316 | "deletable": true, 317 | "editable": true, 318 | "slideshow": { 319 | "slide_type": "slide" 320 | } 321 | }, 322 | "source": [ 323 | "### Number of valid records per column" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 9, 329 | "metadata": { 330 | "collapsed": false, 331 | "deletable": true, 332 | "editable": true, 333 | "slideshow": { 334 | "slide_type": "fragment" 335 | } 336 | }, 337 | "outputs": [ 338 | { 339 | "data": { 340 | "text/plain": [ 341 | "PassengerId 891\n", 342 | "Survived 891\n", 343 | "Pclass 891\n", 344 | "Name 891\n", 345 | "Sex 891\n", 346 | "Age 714\n", 347 | "SibSp 891\n", 348 | "Parch 891\n", 349 | "Ticket 891\n", 350 | "Fare 891\n", 351 | "Cabin 204\n", 352 | "Embarked 889\n", 353 | "dtype: int64" 354 | ] 355 | }, 356 | "execution_count": 9, 357 | "metadata": {}, 358 | "output_type": "execute_result" 359 | } 360 | ], 361 | "source": [ 362 | "data.count()" 363 | ] 364 | }, 365 | { 366 | "cell_type": "markdown", 367 | "metadata": { 368 | "deletable": true, 369 | "editable": true, 370 | "slideshow": { 371 | "slide_type": "slide" 372 | } 373 | }, 374 | "source": [ 375 | "## Dropping missing records" 376 | ] 377 | }, 378 | { 379 | "cell_type": "markdown", 380 | "metadata": { 381 | "deletable": true, 382 | "editable": true, 383 | "slideshow": { 384 | "slide_type": "slide" 385 | } 386 | }, 387 | "source": [ 388 | "### Drop all records that have one or more missing values" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": 10, 394 | "metadata": { 395 | "collapsed": false, 396 | "deletable": true, 397 | "editable": true, 398 | "slideshow": { 399 | "slide_type": "fragment" 400 | } 401 | }, 402 | "outputs": [ 403 | { 404 | "data": { 405 | "text/plain": [ 406 | "(183, 12)" 407 | ] 408 | }, 409 | "execution_count": 10, 410 | "metadata": {}, 411 | "output_type": "execute_result" 412 | } 413 | ], 414 | "source": [ 415 | 
"data_missing_dropped = data.dropna()\n", 416 | "data_missing_dropped.shape" 417 | ] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "metadata": { 422 | "deletable": true, 423 | "editable": true, 424 | "slideshow": { 425 | "slide_type": "slide" 426 | } 427 | }, 428 | "source": [ 429 | "### Drop only those rows that have all records missing" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": 11, 435 | "metadata": { 436 | "collapsed": false, 437 | "deletable": true, 438 | "editable": true, 439 | "slideshow": { 440 | "slide_type": "fragment" 441 | } 442 | }, 443 | "outputs": [ 444 | { 445 | "data": { 446 | "text/plain": [ 447 | "(891, 12)" 448 | ] 449 | }, 450 | "execution_count": 11, 451 | "metadata": {}, 452 | "output_type": "execute_result" 453 | } 454 | ], 455 | "source": [ 456 | "data_all_missing_dropped = data.dropna(how=\"all\")\n", 457 | "data_all_missing_dropped.shape" 458 | ] 459 | }, 460 | { 461 | "cell_type": "markdown", 462 | "metadata": { 463 | "deletable": true, 464 | "editable": true, 465 | "slideshow": { 466 | "slide_type": "slide" 467 | } 468 | }, 469 | "source": [ 470 | "## Fill in missing data" 471 | ] 472 | }, 473 | { 474 | "cell_type": "markdown", 475 | "metadata": { 476 | "deletable": true, 477 | "editable": true, 478 | "slideshow": { 479 | "slide_type": "slide" 480 | } 481 | }, 482 | "source": [ 483 | "### Fill in missing data with zeros" 484 | ] 485 | }, 486 | { 487 | "cell_type": "code", 488 | "execution_count": 10, 489 | "metadata": { 490 | "collapsed": false, 491 | "deletable": true, 492 | "editable": true, 493 | "slideshow": { 494 | "slide_type": "fragment" 495 | } 496 | }, 497 | "outputs": [ 498 | { 499 | "data": { 500 | "text/plain": [ 501 | "PassengerId 891\n", 502 | "Survived 891\n", 503 | "Pclass 891\n", 504 | "Name 891\n", 505 | "Sex 891\n", 506 | "Age 891\n", 507 | "SibSp 891\n", 508 | "Parch 891\n", 509 | "Ticket 891\n", 510 | "Fare 891\n", 511 | "Cabin 891\n", 512 | "Embarked 891\n", 513 | "dtype: 
int64" 514 | ] 515 | }, 516 | "execution_count": 10, 517 | "metadata": {}, 518 | "output_type": "execute_result" 519 | } 520 | ], 521 | "source": [ 522 | "data_filled_zeros = data.fillna(0)\n", 523 | "data_filled_zeros.count()" 524 | ] 525 | }, 526 | { 527 | "cell_type": "markdown", 528 | "metadata": { 529 | "deletable": true, 530 | "editable": true, 531 | "slideshow": { 532 | "slide_type": "slide" 533 | } 534 | }, 535 | "source": [ 536 | "### Fill in missing data with a mean of the values from other rows" 537 | ] 538 | }, 539 | { 540 | "cell_type": "code", 541 | "execution_count": 12, 542 | "metadata": { 543 | "collapsed": false, 544 | "deletable": true, 545 | "editable": true, 546 | "slideshow": { 547 | "slide_type": "fragment" 548 | } 549 | }, 550 | "outputs": [ 551 | { 552 | "data": { 553 | "text/plain": [ 554 | "PassengerId 891\n", 555 | "Survived 891\n", 556 | "Pclass 891\n", 557 | "Name 891\n", 558 | "Sex 891\n", 559 | "Age 891\n", 560 | "SibSp 891\n", 561 | "Parch 891\n", 562 | "Ticket 891\n", 563 | "Fare 891\n", 564 | "Cabin 204\n", 565 | "Embarked 889\n", 566 | "dtype: int64" 567 | ] 568 | }, 569 | "execution_count": 12, 570 | "metadata": {}, 571 | "output_type": "execute_result" 572 | } 573 | ], 574 | "source": [ 575 | "data_filled_in_mean = data.copy()\n", 576 | "data_filled_in_mean['Age'] = data_filled_in_mean['Age'].fillna(data['Age'].mean())\n", 577 | "data_filled_in_mean.count()" 578 | ] 579 | }, 580 | { 581 | "cell_type": "code", 582 | "execution_count": null, 583 | "metadata": { 584 | "collapsed": true, 585 | "deletable": true, 586 | "editable": true, 587 | "slideshow": { 588 | "slide_type": "skip" 589 | } 590 | }, 591 | "outputs": [], 592 | "source": [] 593 | } 594 | ], 595 | "metadata": { 596 | "celltoolbar": "Slideshow", 597 | "kernelspec": { 598 | "display_name": "Python 3", 599 | "language": "python", 600 | "name": "python3" 601 | }, 602 | "language_info": { 603 | "codemirror_mode": { 604 | "name": "ipython", 605 | "version": 3 606 | }, 607 | "file_extension": 
".py", 608 | "mimetype": "text/x-python", 609 | "name": "python", 610 | "nbconvert_exporter": "python", 611 | "pygments_lexer": "ipython3", 612 | "version": "3.6.1" 613 | } 614 | }, 615 | "nbformat": 4, 616 | "nbformat_minor": 2 617 | } 618 | -------------------------------------------------------------------------------- /Chapter02/Filter rows of a pandas DataFrame by column value.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false, 8 | "deletable": true, 9 | "editable": true, 10 | "slideshow": { 11 | "slide_type": "skip" 12 | } 13 | }, 14 | "outputs": [ 15 | { 16 | "data": { 17 | "text/html": [ 18 | "" 28 | ], 29 | "text/plain": [ 30 | "" 31 | ] 32 | }, 33 | "metadata": {}, 34 | "output_type": "display_data" 35 | } 36 | ], 37 | "source": [ 38 | "%%html\n", 39 | "" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 2, 54 | "metadata": { 55 | "collapsed": true, 56 | "deletable": true, 57 | "editable": true, 58 | "slideshow": { 59 | "slide_type": "skip" 60 | } 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "from IPython.core.interactiveshell import InteractiveShell\n", 65 | "InteractiveShell.ast_node_interactivity = \"all\"" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": { 71 | "deletable": true, 72 | "editable": true, 73 | "slideshow": { 74 | "slide_type": "slide" 75 | } 76 | }, 77 | "source": [ 78 | "## Import Pandas" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 3, 84 | "metadata": { 85 | "collapsed": true, 86 | "deletable": true, 87 | "editable": true, 88 | "slideshow": { 89 | "slide_type": "fragment" 90 | } 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "import pandas as pd" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": { 100 | "deletable": true, 101 | "editable": true, 102 | "slideshow": { 103 | "slide_type": 
"slide" 104 | } 105 | }, 106 | "source": [ 107 | "## Read in the dataset" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 4, 113 | "metadata": { 114 | "collapsed": false, 115 | "deletable": true, 116 | "editable": true, 117 | "slideshow": { 118 | "slide_type": "fragment" 119 | } 120 | }, 121 | "outputs": [ 122 | { 123 | "data": { 124 | "text/html": [ 125 | "
\n", 126 | "\n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | "
DateRegionIDRegionNameStateMetroCountySizeRankZhvi
02017-05-316181New YorkNYNew YorkQueens0672400
12017-05-3112447Los AngelesCALos Angeles-Long Beach-AnaheimLos Angeles1629900
22017-05-3117426ChicagoILChicagoCook2222700
32017-05-3113271PhiladelphiaPAPhiladelphiaPhiladelphia3137300
42017-05-3140326PhoenixAZPhoenixMaricopa4211300
\n", 198 | "
" 199 | ], 200 | "text/plain": [ 201 | " Date RegionID RegionName State Metro \\\n", 202 | "0 2017-05-31 6181 New York NY New York \n", 203 | "1 2017-05-31 12447 Los Angeles CA Los Angeles-Long Beach-Anaheim \n", 204 | "2 2017-05-31 17426 Chicago IL Chicago \n", 205 | "3 2017-05-31 13271 Philadelphia PA Philadelphia \n", 206 | "4 2017-05-31 40326 Phoenix AZ Phoenix \n", 207 | "\n", 208 | " County SizeRank Zhvi \n", 209 | "0 Queens 0 672400 \n", 210 | "1 Los Angeles 1 629900 \n", 211 | "2 Cook 2 222700 \n", 212 | "3 Philadelphia 3 137300 \n", 213 | "4 Maricopa 4 211300 " 214 | ] 215 | }, 216 | "execution_count": 4, 217 | "metadata": {}, 218 | "output_type": "execute_result" 219 | } 220 | ], 221 | "source": [ 222 | "data = pd.read_table('data-zillow.csv', sep=',')\n", 223 | "data.head()" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": { 229 | "collapsed": true, 230 | "deletable": true, 231 | "editable": true, 232 | "slideshow": { 233 | "slide_type": "slide" 234 | } 235 | }, 236 | "source": [ 237 | "## Filter columns by name using filter()" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 11, 243 | "metadata": { 244 | "collapsed": false, 245 | "deletable": true, 246 | "editable": true, 247 | "slideshow": { 248 | "slide_type": "fragment" 249 | } 250 | }, 251 | "outputs": [ 252 | { 253 | "data": { 254 | "text/html": [ 255 | "
\n", 256 | "\n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | "
StateMetro
0NYNew York
1CALos Angeles-Long Beach-Anaheim
2ILChicago
3PAPhiladelphia
4AZPhoenix
\n", 292 | "
" 293 | ], 294 | "text/plain": [ 295 | " State Metro\n", 296 | "0 NY New York\n", 297 | "1 CA Los Angeles-Long Beach-Anaheim\n", 298 | "2 IL Chicago\n", 299 | "3 PA Philadelphia\n", 300 | "4 AZ Phoenix" 301 | ] 302 | }, 303 | "execution_count": 11, 304 | "metadata": {}, 305 | "output_type": "execute_result" 306 | } 307 | ], 308 | "source": [ 309 | "filtered_data = data.filter(items=['State', 'Metro'])\n", 310 | "filtered_data.head()" 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "metadata": { 316 | "deletable": true, 317 | "editable": true, 318 | "slideshow": { 319 | "slide_type": "slide" 320 | } 321 | }, 322 | "source": [ 323 | "## Filter columns by regular expression using filter()" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 12, 329 | "metadata": { 330 | "collapsed": false, 331 | "deletable": true, 332 | "editable": true, 333 | "slideshow": { 334 | "slide_type": "fragment" 335 | } 336 | }, 337 | "outputs": [ 338 | { 339 | "data": { 340 | "text/html": [ 341 | "
\n", 342 | "\n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | "
RegionIDRegionName
06181New York
112447Los Angeles
217426Chicago
313271Philadelphia
440326Phoenix
\n", 378 | "
" 379 | ], 380 | "text/plain": [ 381 | " RegionID RegionName\n", 382 | "0 6181 New York\n", 383 | "1 12447 Los Angeles\n", 384 | "2 17426 Chicago\n", 385 | "3 13271 Philadelphia\n", 386 | "4 40326 Phoenix" 387 | ] 388 | }, 389 | "execution_count": 12, 390 | "metadata": {}, 391 | "output_type": "execute_result" 392 | } 393 | ], 394 | "source": [ 395 | "filtered_data = data.filter(regex='Region', axis=1)\n", 396 | "filtered_data.head()" 397 | ] 398 | }, 399 | { 400 | "cell_type": "markdown", 401 | "metadata": { 402 | "deletable": true, 403 | "editable": true, 404 | "slideshow": { 405 | "slide_type": "slide" 406 | } 407 | }, 408 | "source": [ 409 | "## Filter data using boolean indexing" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": 8, 415 | "metadata": { 416 | "collapsed": false, 417 | "deletable": true, 418 | "editable": true, 419 | "slideshow": { 420 | "slide_type": "fragment" 421 | } 422 | }, 423 | "outputs": [ 424 | { 425 | "data": { 426 | "text/plain": [ 427 | "0 True\n", 428 | "1 True\n", 429 | "2 False\n", 430 | "3 False\n", 431 | "4 False\n", 432 | "Name: Zhvi, dtype: bool" 433 | ] 434 | }, 435 | "execution_count": 8, 436 | "metadata": {}, 437 | "output_type": "execute_result" 438 | } 439 | ], 440 | "source": [ 441 | "price_filter_series = data['Zhvi'] > 500000\n", 442 | "price_filter_series.head()" 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": 9, 448 | "metadata": { 449 | "collapsed": false, 450 | "deletable": true, 451 | "editable": true, 452 | "slideshow": { 453 | "slide_type": "fragment" 454 | } 455 | }, 456 | "outputs": [ 457 | { 458 | "data": { 459 | "text/html": [ 460 | "
\n", 461 | "\n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | "
DateRegionIDRegionNameStateMetroCountySizeRankZhvi
02017-05-316181New YorkNYNew YorkQueens0672400
12017-05-3112447Los AngelesCALos Angeles-Long Beach-AnaheimLos Angeles1629900
62017-05-3154296San DiegoCASan DiegoSan Diego6572100
82017-05-3133839San JoseCASan JoseSanta Clara8877400
102017-05-3120330San FranciscoCASan FranciscoSan Francisco101194300
\n", 533 | "
" 534 | ], 535 | "text/plain": [ 536 | " Date RegionID RegionName State Metro \\\n", 537 | "0 2017-05-31 6181 New York NY New York \n", 538 | "1 2017-05-31 12447 Los Angeles CA Los Angeles-Long Beach-Anaheim \n", 539 | "6 2017-05-31 54296 San Diego CA San Diego \n", 540 | "8 2017-05-31 33839 San Jose CA San Jose \n", 541 | "10 2017-05-31 20330 San Francisco CA San Francisco \n", 542 | "\n", 543 | " County SizeRank Zhvi \n", 544 | "0 Queens 0 672400 \n", 545 | "1 Los Angeles 1 629900 \n", 546 | "6 San Diego 6 572100 \n", 547 | "8 Santa Clara 8 877400 \n", 548 | "10 San Francisco 10 1194300 " 549 | ] 550 | }, 551 | "execution_count": 9, 552 | "metadata": {}, 553 | "output_type": "execute_result" 554 | } 555 | ], 556 | "source": [ 557 | "data[price_filter_series].head()" 558 | ] 559 | }, 560 | { 561 | "cell_type": "markdown", 562 | "metadata": { 563 | "slideshow": { 564 | "slide_type": "slide" 565 | } 566 | }, 567 | "source": [ 568 | "## An alternative way to filter" 569 | ] 570 | }, 571 | { 572 | "cell_type": "code", 573 | "execution_count": 10, 574 | "metadata": { 575 | "collapsed": false, 576 | "deletable": true, 577 | "editable": true, 578 | "slideshow": { 579 | "slide_type": "fragment" 580 | } 581 | }, 582 | "outputs": [ 583 | { 584 | "data": { 585 | "text/html": [ 586 | "
\n", 587 | "\n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | "
DateRegionIDRegionNameStateMetroCountySizeRankZhvi
102017-05-3120330San FranciscoCASan FranciscoSan Francisco101194300
1812017-05-3154626SunnyvaleCASan JoseSanta Clara1811509300
2342017-05-3113713Santa ClaraCASan JoseSanta Clara2341071500
2382017-05-3116992BerkeleyCASan FranciscoAlameda2381102000
3082017-05-3113699San MateoCASan FranciscoSan Mateo3081198300
\n", 659 | "
" 660 | ], 661 | "text/plain": [ 662 | " Date RegionID RegionName State Metro County \\\n", 663 | "10 2017-05-31 20330 San Francisco CA San Francisco San Francisco \n", 664 | "181 2017-05-31 54626 Sunnyvale CA San Jose Santa Clara \n", 665 | "234 2017-05-31 13713 Santa Clara CA San Jose Santa Clara \n", 666 | "238 2017-05-31 16992 Berkeley CA San Francisco Alameda \n", 667 | "308 2017-05-31 13699 San Mateo CA San Francisco San Mateo \n", 668 | "\n", 669 | " SizeRank Zhvi \n", 670 | "10 10 1194300 \n", 671 | "181 181 1509300 \n", 672 | "234 234 1071500 \n", 673 | "238 238 1102000 \n", 674 | "308 308 1198300 " 675 | ] 676 | }, 677 | "execution_count": 10, 678 | "metadata": {}, 679 | "output_type": "execute_result" 680 | } 681 | ], 682 | "source": [ 683 | "data[data.Zhvi >= 1000000].head()" 684 | ] 685 | }, 686 | { 687 | "cell_type": "code", 688 | "execution_count": null, 689 | "metadata": { 690 | "collapsed": true, 691 | "deletable": true, 692 | "editable": true, 693 | "slideshow": { 694 | "slide_type": "skip" 695 | } 696 | }, 697 | "outputs": [], 698 | "source": [] 699 | } 700 | ], 701 | "metadata": { 702 | "celltoolbar": "Slideshow", 703 | "kernelspec": { 704 | "display_name": "Python 3", 705 | "language": "python", 706 | "name": "python3" 707 | }, 708 | "language_info": { 709 | "codemirror_mode": { 710 | "name": "ipython", 711 | "version": 3 712 | }, 713 | "file_extension": ".py", 714 | "mimetype": "text/x-python", 715 | "name": "python", 716 | "nbconvert_exporter": "python", 717 | "pygments_lexer": "ipython3", 718 | "version": "3.6.1" 719 | } 720 | }, 721 | "nbformat": 4, 722 | "nbformat_minor": 2 723 | } 724 | -------------------------------------------------------------------------------- /Chapter03/Work with dates and times data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false, 8 | "deletable": true, 9 | 
"editable": true, 10 | "slideshow": { 11 | "slide_type": "skip" 12 | } 13 | }, 14 | "outputs": [ 15 | { 16 | "data": { 17 | "text/html": [ 18 | "" 28 | ], 29 | "text/plain": [ 30 | "" 31 | ] 32 | }, 33 | "metadata": {}, 34 | "output_type": "display_data" 35 | } 36 | ], 37 | "source": [ 38 | "%%html\n", 39 | "" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 2, 54 | "metadata": { 55 | "collapsed": true, 56 | "deletable": true, 57 | "editable": true, 58 | "slideshow": { 59 | "slide_type": "skip" 60 | } 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "from IPython.core.interactiveshell import InteractiveShell\n", 65 | "InteractiveShell.ast_node_interactivity = \"all\"" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": { 71 | "deletable": true, 72 | "editable": true, 73 | "slideshow": { 74 | "slide_type": "slide" 75 | } 76 | }, 77 | "source": [ 78 | "## Import Pandas" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 3, 84 | "metadata": { 85 | "collapsed": true, 86 | "deletable": true, 87 | "editable": true, 88 | "slideshow": { 89 | "slide_type": "fragment" 90 | } 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "import pandas as pd" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": { 100 | "deletable": true, 101 | "editable": true, 102 | "slideshow": { 103 | "slide_type": "slide" 104 | } 105 | }, 106 | "source": [ 107 | "## Our dataset" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 4, 113 | "metadata": { 114 | "collapsed": false, 115 | "deletable": true, 116 | "editable": true, 117 | "slideshow": { 118 | "slide_type": "fragment" 119 | } 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "dataset = pd.DataFrame({'DOB': ['1976-06-01', '1980-09-23', '1984-03-30', '1991-12-31', '1994-10-2', '1973-11-11'],\n", 124 | " 'Sex': ['F', 'M', 'F', 'M', 'M', 'F'],\n", 125 | " 'State': ['CA', 'NY', 'OH', 'OR', 'TX', 'CA'],\n", 126 | " 'Name': ['Jane', 'John', 
'Cathy', 'Jo', 'Sam', 'Tai']})" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 5, 132 | "metadata": { 133 | "collapsed": false, 134 | "deletable": true, 135 | "editable": true, 136 | "slideshow": { 137 | "slide_type": "fragment" 138 | } 139 | }, 140 | "outputs": [ 141 | { 142 | "data": { 143 | "text/html": [ 144 | "
\n", 145 | "\n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | "
DOBNameSexState
01976-06-01JaneFCA
11980-09-23JohnMNY
21984-03-30CathyFOH
31991-12-31JoMOR
41994-10-2SamMTX
51973-11-11TaiFCA
\n", 200 | "
" 201 | ], 202 | "text/plain": [ 203 | " DOB Name Sex State\n", 204 | "0 1976-06-01 Jane F CA\n", 205 | "1 1980-09-23 John M NY\n", 206 | "2 1984-03-30 Cathy F OH\n", 207 | "3 1991-12-31 Jo M OR\n", 208 | "4 1994-10-2 Sam M TX\n", 209 | "5 1973-11-11 Tai F CA" 210 | ] 211 | }, 212 | "execution_count": 5, 213 | "metadata": {}, 214 | "output_type": "execute_result" 215 | } 216 | ], 217 | "source": [ 218 | "dataset" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": { 224 | "deletable": true, 225 | "editable": true, 226 | "slideshow": { 227 | "slide_type": "slide" 228 | } 229 | }, 230 | "source": [ 231 | "## let's first convert our date column to `datetime`" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 6, 237 | "metadata": { 238 | "collapsed": false, 239 | "deletable": true, 240 | "editable": true, 241 | "slideshow": { 242 | "slide_type": "fragment" 243 | } 244 | }, 245 | "outputs": [ 246 | { 247 | "data": { 248 | "text/plain": [ 249 | "DOB object\n", 250 | "Name object\n", 251 | "Sex object\n", 252 | "State object\n", 253 | "dtype: object" 254 | ] 255 | }, 256 | "execution_count": 6, 257 | "metadata": {}, 258 | "output_type": "execute_result" 259 | } 260 | ], 261 | "source": [ 262 | "dataset.dtypes" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 38, 268 | "metadata": { 269 | "collapsed": false, 270 | "deletable": true, 271 | "editable": true, 272 | "slideshow": { 273 | "slide_type": "fragment" 274 | } 275 | }, 276 | "outputs": [], 277 | "source": [ 278 | "dataset.DOB = pd.to_datetime(dataset.DOB)" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 7, 284 | "metadata": { 285 | "collapsed": false, 286 | "deletable": true, 287 | "editable": true, 288 | "slideshow": { 289 | "slide_type": "fragment" 290 | } 291 | }, 292 | "outputs": [ 293 | { 294 | "data": { 295 | "text/plain": [ 296 | "DOB object\n", 297 | "Name object\n", 298 | "Sex object\n", 299 | "State 
object\n", 300 | "dtype: object" 301 | ] 302 | }, 303 | "execution_count": 7, 304 | "metadata": {}, 305 | "output_type": "execute_result" 306 | } 307 | ], 308 | "source": [ 309 | "dataset.dtypes" 310 | ] 311 | }, 312 | { 313 | "cell_type": "markdown", 314 | "metadata": { 315 | "slideshow": { 316 | "slide_type": "slide" 317 | } 318 | }, 319 | "source": [ 320 | "### Let's set index to the date column" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 8, 326 | "metadata": { 327 | "collapsed": false, 328 | "deletable": true, 329 | "editable": true, 330 | "slideshow": { 331 | "slide_type": "fragment" 332 | } 333 | }, 334 | "outputs": [], 335 | "source": [ 336 | "dataset.set_index('DOB', inplace=True)" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": 9, 342 | "metadata": { 343 | "collapsed": false, 344 | "deletable": true, 345 | "editable": true, 346 | "slideshow": { 347 | "slide_type": "fragment" 348 | } 349 | }, 350 | "outputs": [ 351 | { 352 | "data": { 353 | "text/html": [ 354 | "
\n", 355 | "\n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | "
NameSexState
DOB
1976-06-01JaneFCA
1980-09-23JohnMNY
1984-03-30CathyFOH
1991-12-31JoMOR
1994-10-2SamMTX
1973-11-11TaiFCA
\n", 409 | "
" 410 | ], 411 | "text/plain": [ 412 | " Name Sex State\n", 413 | "DOB \n", 414 | "1976-06-01 Jane F CA\n", 415 | "1980-09-23 John M NY\n", 416 | "1984-03-30 Cathy F OH\n", 417 | "1991-12-31 Jo M OR\n", 418 | "1994-10-2 Sam M TX\n", 419 | "1973-11-11 Tai F CA" 420 | ] 421 | }, 422 | "execution_count": 9, 423 | "metadata": {}, 424 | "output_type": "execute_result" 425 | } 426 | ], 427 | "source": [ 428 | "dataset" 429 | ] 430 | }, 431 | { 432 | "cell_type": "markdown", 433 | "metadata": { 434 | "slideshow": { 435 | "slide_type": "slide" 436 | } 437 | }, 438 | "source": [ 439 | "### Filter and select time series Data" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": 41, 445 | "metadata": { 446 | "collapsed": false, 447 | "deletable": true, 448 | "editable": true, 449 | "slideshow": { 450 | "slide_type": "fragment" 451 | } 452 | }, 453 | "outputs": [ 454 | { 455 | "data": { 456 | "text/html": [ 457 | "
\n", 458 | "\n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | "
NameSexState
DOB
1980-09-23JohnMNY
\n", 482 | "
" 483 | ], 484 | "text/plain": [ 485 | " Name Sex State\n", 486 | "DOB \n", 487 | "1980-09-23 John M NY" 488 | ] 489 | }, 490 | "execution_count": 41, 491 | "metadata": {}, 492 | "output_type": "execute_result" 493 | } 494 | ], 495 | "source": [ 496 | "dataset['1980']" 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": 42, 502 | "metadata": { 503 | "collapsed": false, 504 | "deletable": true, 505 | "editable": true, 506 | "slideshow": { 507 | "slide_type": "fragment" 508 | } 509 | }, 510 | "outputs": [ 511 | { 512 | "data": { 513 | "text/html": [ 514 | "
\n", 515 | "\n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | "
NameSexState
DOB
1980-09-23JohnMNY
1984-03-30CathyFOH
1991-12-31JoMOR
1994-10-02SamMTX
\n", 557 | "
" 558 | ], 559 | "text/plain": [ 560 | " Name Sex State\n", 561 | "DOB \n", 562 | "1980-09-23 John M NY\n", 563 | "1984-03-30 Cathy F OH\n", 564 | "1991-12-31 Jo M OR\n", 565 | "1994-10-02 Sam M TX" 566 | ] 567 | }, 568 | "execution_count": 42, 569 | "metadata": {}, 570 | "output_type": "execute_result" 571 | } 572 | ], 573 | "source": [ 574 | "dataset['1980':]" 575 | ] 576 | }, 577 | { 578 | "cell_type": "code", 579 | "execution_count": 43, 580 | "metadata": { 581 | "collapsed": false, 582 | "deletable": true, 583 | "editable": true, 584 | "slideshow": { 585 | "slide_type": "fragment" 586 | } 587 | }, 588 | "outputs": [ 589 | { 590 | "data": { 591 | "text/html": [ 592 | "
\n", 593 | "\n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | "
NameSexState
DOB
1976-06-01JaneFCA
1980-09-23JohnMNY
1973-11-11TaiFCA
\n", 629 | "
" 630 | ], 631 | "text/plain": [ 632 | " Name Sex State\n", 633 | "DOB \n", 634 | "1976-06-01 Jane F CA\n", 635 | "1980-09-23 John M NY\n", 636 | "1973-11-11 Tai F CA" 637 | ] 638 | }, 639 | "execution_count": 43, 640 | "metadata": {}, 641 | "output_type": "execute_result" 642 | } 643 | ], 644 | "source": [ 645 | "dataset[:'1980']" 646 | ] 647 | }, 648 | { 649 | "cell_type": "code", 650 | "execution_count": 45, 651 | "metadata": { 652 | "collapsed": false, 653 | "deletable": true, 654 | "editable": true, 655 | "slideshow": { 656 | "slide_type": "fragment" 657 | } 658 | }, 659 | "outputs": [ 660 | { 661 | "data": { 662 | "text/html": [ 663 | "
\n", 664 | "\n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | "
NameSexState
DOB
1980-09-23JohnMNY
1984-03-30CathyFOH
\n", 694 | "
" 695 | ], 696 | "text/plain": [ 697 | " Name Sex State\n", 698 | "DOB \n", 699 | "1980-09-23 John M NY\n", 700 | "1984-03-30 Cathy F OH" 701 | ] 702 | }, 703 | "execution_count": 45, 704 | "metadata": {}, 705 | "output_type": "execute_result" 706 | } 707 | ], 708 | "source": [ 709 | "\n", 710 | "dataset['1980':'1984']" 711 | ] 712 | }, 713 | { 714 | "cell_type": "code", 715 | "execution_count": 51, 716 | "metadata": { 717 | "collapsed": true, 718 | "deletable": true, 719 | "editable": true, 720 | "slideshow": { 721 | "slide_type": "skip" 722 | } 723 | }, 724 | "outputs": [], 725 | "source": [ 726 | "dataset.reset_index(inplace=True)" 727 | ] 728 | }, 729 | { 730 | "cell_type": "markdown", 731 | "metadata": { 732 | "slideshow": { 733 | "slide_type": "slide" 734 | } 735 | }, 736 | "source": [ 737 | "### Get properties of date-time series data" 738 | ] 739 | }, 740 | { 741 | "cell_type": "code", 742 | "execution_count": 55, 743 | "metadata": { 744 | "collapsed": false, 745 | "deletable": true, 746 | "editable": true, 747 | "slideshow": { 748 | "slide_type": "fragment" 749 | } 750 | }, 751 | "outputs": [ 752 | { 753 | "data": { 754 | "text/plain": [ 755 | "0 153\n", 756 | "1 267\n", 757 | "2 90\n", 758 | "3 365\n", 759 | "4 275\n", 760 | "5 315\n", 761 | "Name: DOB, dtype: int64" 762 | ] 763 | }, 764 | "execution_count": 55, 765 | "metadata": {}, 766 | "output_type": "execute_result" 767 | } 768 | ], 769 | "source": [ 770 | "dataset.DOB.dt.dayofyear" 771 | ] 772 | }, 773 | { 774 | "cell_type": "code", 775 | "execution_count": 56, 776 | "metadata": { 777 | "collapsed": false, 778 | "deletable": true, 779 | "editable": true, 780 | "slideshow": { 781 | "slide_type": "fragment" 782 | } 783 | }, 784 | "outputs": [ 785 | { 786 | "data": { 787 | "text/plain": [ 788 | "0 Tuesday\n", 789 | "1 Tuesday\n", 790 | "2 Friday\n", 791 | "3 Tuesday\n", 792 | "4 Sunday\n", 793 | "5 Sunday\n", 794 | "Name: DOB, dtype: object" 795 | ] 796 | }, 797 | "execution_count": 56, 798 | 
"metadata": {}, 799 | "output_type": "execute_result" 800 | } 801 | ], 802 | "source": [ 803 | "dataset.DOB.dt.weekday_name" 804 | ] 805 | }, 806 | { 807 | "cell_type": "code", 808 | "execution_count": null, 809 | "metadata": { 810 | "collapsed": true, 811 | "deletable": true, 812 | "editable": true, 813 | "slideshow": { 814 | "slide_type": "skip" 815 | } 816 | }, 817 | "outputs": [], 818 | "source": [] 819 | } 820 | ], 821 | "metadata": { 822 | "celltoolbar": "Slideshow", 823 | "kernelspec": { 824 | "display_name": "Python 3", 825 | "language": "python", 826 | "name": "python3" 827 | }, 828 | "language_info": { 829 | "codemirror_mode": { 830 | "name": "ipython", 831 | "version": 3 832 | }, 833 | "file_extension": ".py", 834 | "mimetype": "text/x-python", 835 | "name": "python", 836 | "nbconvert_exporter": "python", 837 | "pygments_lexer": "ipython3", 838 | "version": "3.6.1" 839 | } 840 | }, 841 | "nbformat": 4, 842 | "nbformat_minor": 2 843 | } 844 | -------------------------------------------------------------------------------- /Chapter02/Apply multiple filter criteria to a pandas DataFrame.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false, 8 | "deletable": true, 9 | "editable": true, 10 | "slideshow": { 11 | "slide_type": "skip" 12 | } 13 | }, 14 | "outputs": [ 15 | { 16 | "data": { 17 | "text/html": [ 18 | "" 28 | ], 29 | "text/plain": [ 30 | "" 31 | ] 32 | }, 33 | "metadata": {}, 34 | "output_type": "display_data" 35 | } 36 | ], 37 | "source": [ 38 | "%%html\n", 39 | "" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 2, 54 | "metadata": { 55 | "collapsed": true, 56 | "deletable": true, 57 | "editable": true, 58 | "slideshow": { 59 | "slide_type": "skip" 60 | } 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "from IPython.core.interactiveshell import InteractiveShell\n", 65 | 
"InteractiveShell.ast_node_interactivity = \"all\"" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": { 71 | "deletable": true, 72 | "editable": true, 73 | "slideshow": { 74 | "slide_type": "slide" 75 | } 76 | }, 77 | "source": [ 78 | "## Import Pandas" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 3, 84 | "metadata": { 85 | "collapsed": true, 86 | "deletable": true, 87 | "editable": true, 88 | "slideshow": { 89 | "slide_type": "fragment" 90 | } 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "import pandas as pd" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": { 100 | "deletable": true, 101 | "editable": true, 102 | "slideshow": { 103 | "slide_type": "slide" 104 | } 105 | }, 106 | "source": [ 107 | "## Read in the dataset" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 4, 113 | "metadata": { 114 | "collapsed": false, 115 | "deletable": true, 116 | "editable": true, 117 | "slideshow": { 118 | "slide_type": "fragment" 119 | } 120 | }, 121 | "outputs": [ 122 | { 123 | "data": { 124 | "text/html": [ 125 | "
\n", 126 | "\n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | "
DateRegionIDRegionNameStateMetroCountySizeRankZhvi
02017-05-316181New YorkNYNew YorkQueens0672400
12017-05-3112447Los AngelesCALos Angeles-Long Beach-AnaheimLos Angeles1629900
22017-05-3117426ChicagoILChicagoCook2222700
32017-05-3113271PhiladelphiaPAPhiladelphiaPhiladelphia3137300
42017-05-3140326PhoenixAZPhoenixMaricopa4211300
\n", 198 | "
" 199 | ], 200 | "text/plain": [ 201 | " Date RegionID RegionName State Metro \\\n", 202 | "0 2017-05-31 6181 New York NY New York \n", 203 | "1 2017-05-31 12447 Los Angeles CA Los Angeles-Long Beach-Anaheim \n", 204 | "2 2017-05-31 17426 Chicago IL Chicago \n", 205 | "3 2017-05-31 13271 Philadelphia PA Philadelphia \n", 206 | "4 2017-05-31 40326 Phoenix AZ Phoenix \n", 207 | "\n", 208 | " County SizeRank Zhvi \n", 209 | "0 Queens 0 672400 \n", 210 | "1 Los Angeles 1 629900 \n", 211 | "2 Cook 2 222700 \n", 212 | "3 Philadelphia 3 137300 \n", 213 | "4 Maricopa 4 211300 " 214 | ] 215 | }, 216 | "execution_count": 4, 217 | "metadata": {}, 218 | "output_type": "execute_result" 219 | } 220 | ], 221 | "source": [ 222 | "data = pd.read_table('data-zillow.csv', sep=',')\n", 223 | "data.head()" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": { 229 | "slideshow": { 230 | "slide_type": "slide" 231 | } 232 | }, 233 | "source": [ 234 | "## Filtering based on multiple conditions " 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 8, 240 | "metadata": { 241 | "collapsed": false, 242 | "deletable": true, 243 | "editable": true, 244 | "slideshow": { 245 | "slide_type": "fragment" 246 | } 247 | }, 248 | "outputs": [ 249 | { 250 | "data": { 251 | "text/html": [ 252 | "
\n", 253 | "\n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | "
DateRegionIDRegionNameStateMetroCountySizeRankZhvi
11322017-05-3118375Great NeckNYNew YorkNassau11321235800
24052017-05-3154333ScarsdaleNYNew YorkWestchester24051468100
26192017-05-3147495RyeNYNew YorkWestchester26191736400
30322017-05-3125725ManhassetNYNew YorkNassau30321483400
30642017-05-3118955LarchmontNYNew YorkWestchester30641052200
\n", 325 | "
" 326 | ], 327 | "text/plain": [ 328 | " Date RegionID RegionName State Metro County SizeRank \\\n", 329 | "1132 2017-05-31 18375 Great Neck NY New York Nassau 1132 \n", 330 | "2405 2017-05-31 54333 Scarsdale NY New York Westchester 2405 \n", 331 | "2619 2017-05-31 47495 Rye NY New York Westchester 2619 \n", 332 | "3032 2017-05-31 25725 Manhasset NY New York Nassau 3032 \n", 333 | "3064 2017-05-31 18955 Larchmont NY New York Westchester 3064 \n", 334 | "\n", 335 | " Zhvi \n", 336 | "1132 1235800 \n", 337 | "2405 1468100 \n", 338 | "2619 1736400 \n", 339 | "3032 1483400 \n", 340 | "3064 1052200 " 341 | ] 342 | }, 343 | "execution_count": 8, 344 | "metadata": {}, 345 | "output_type": "execute_result" 346 | } 347 | ], 348 | "source": [ 349 | "data[(data['Zhvi'] > 1000000) & (data['State'] == 'NY')].head()" 350 | ] 351 | }, 352 | { 353 | "cell_type": "markdown", 354 | "metadata": { 355 | "slideshow": { 356 | "slide_type": "slide" 357 | } 358 | }, 359 | "source": [ 360 | "## Filtering on multiple conditions - OR" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": 6, 366 | "metadata": { 367 | "collapsed": false, 368 | "deletable": true, 369 | "editable": true, 370 | "slideshow": { 371 | "slide_type": "fragment" 372 | } 373 | }, 374 | "outputs": [ 375 | { 376 | "data": { 377 | "text/html": [ 378 | "
\n", 379 | "\n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | "
DateRegionIDRegionNameStateMetroCountySizeRankZhvi
02017-05-316181New YorkNYNew YorkQueens0672400
12017-05-3112447Los AngelesCALos Angeles-Long Beach-AnaheimLos Angeles1629900
62017-05-3154296San DiegoCASan DiegoSan Diego6572100
82017-05-3133839San JoseCASan JoseSanta Clara8877400
102017-05-3120330San FranciscoCASan FranciscoSan Francisco101194300
\n", 451 | "
" 452 | ], 453 | "text/plain": [ 454 | " Date RegionID RegionName State Metro \\\n", 455 | "0 2017-05-31 6181 New York NY New York \n", 456 | "1 2017-05-31 12447 Los Angeles CA Los Angeles-Long Beach-Anaheim \n", 457 | "6 2017-05-31 54296 San Diego CA San Diego \n", 458 | "8 2017-05-31 33839 San Jose CA San Jose \n", 459 | "10 2017-05-31 20330 San Francisco CA San Francisco \n", 460 | "\n", 461 | " County SizeRank Zhvi \n", 462 | "0 Queens 0 672400 \n", 463 | "1 Los Angeles 1 629900 \n", 464 | "6 San Diego 6 572100 \n", 465 | "8 Santa Clara 8 877400 \n", 466 | "10 San Francisco 10 1194300 " 467 | ] 468 | }, 469 | "execution_count": 6, 470 | "metadata": {}, 471 | "output_type": "execute_result" 472 | } 473 | ], 474 | "source": [ 475 | "data[((data['State'] == 'CA') | (data['State'] == 'NY'))].head()" 476 | ] 477 | }, 478 | { 479 | "cell_type": "markdown", 480 | "metadata": { 481 | "slideshow": { 482 | "slide_type": "slide" 483 | } 484 | }, 485 | "source": [ 486 | "## Filtering using `isin` method" 487 | ] 488 | }, 489 | { 490 | "cell_type": "code", 491 | "execution_count": 12, 492 | "metadata": { 493 | "collapsed": false, 494 | "deletable": true, 495 | "editable": true, 496 | "slideshow": { 497 | "slide_type": "fragment" 498 | } 499 | }, 500 | "outputs": [ 501 | { 502 | "data": { 503 | "text/html": [ 504 | "
\n", 505 | "\n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | "
DateRegionIDRegionNameStateMetroCountySizeRankZhvi
02017-05-316181New YorkNYNew YorkQueens0672400
102017-05-3120330San FranciscoCASan FranciscoSan Francisco101194300
382017-05-3113072OaklandCASan FranciscoAlameda38680100
632017-05-3112970NewarkNJNew YorkEssex63232800
722017-05-3125320Jersey CityNJNew YorkHudson72380000
\n", 577 | "
" 578 | ], 579 | "text/plain": [ 580 | " Date RegionID RegionName State Metro County \\\n", 581 | "0 2017-05-31 6181 New York NY New York Queens \n", 582 | "10 2017-05-31 20330 San Francisco CA San Francisco San Francisco \n", 583 | "38 2017-05-31 13072 Oakland CA San Francisco Alameda \n", 584 | "63 2017-05-31 12970 Newark NJ New York Essex \n", 585 | "72 2017-05-31 25320 Jersey City NJ New York Hudson \n", 586 | "\n", 587 | " SizeRank Zhvi \n", 588 | "0 0 672400 \n", 589 | "10 10 1194300 \n", 590 | "38 38 680100 \n", 591 | "63 63 232800 \n", 592 | "72 72 380000 " 593 | ] 594 | }, 595 | "execution_count": 12, 596 | "metadata": {}, 597 | "output_type": "execute_result" 598 | } 599 | ], 600 | "source": [ 601 | "filter = data['Metro'].isin(['New York', 'San Francisco'])\n", 602 | "data[filter].head()" 603 | ] 604 | }, 605 | { 606 | "cell_type": "markdown", 607 | "metadata": { 608 | "slideshow": { 609 | "slide_type": "slide" 610 | } 611 | }, 612 | "source": [ 613 | "## Using `isin` method with multiple conditions" 614 | ] 615 | }, 616 | { 617 | "cell_type": "code", 618 | "execution_count": 14, 619 | "metadata": { 620 | "collapsed": false, 621 | "slideshow": { 622 | "slide_type": "fragment" 623 | } 624 | }, 625 | "outputs": [ 626 | { 627 | "data": { 628 | "text/html": [ 629 | "
\n", 630 | "\n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | "
DateRegionIDRegionNameStateMetroCountySizeRankZhvi
0NaNNaNNaNNaNNaNNaNNaNNaN
1NaNNaNNaNCANaNNaNNaNNaN
2NaNNaNNaNNaNNaNNaNNaNNaN
3NaNNaNNaNNaNNaNNaNNaNNaN
4NaNNaNNaNNaNNaNNaNNaNNaN
\n", 702 | "
" 703 | ], 704 | "text/plain": [ 705 | " Date RegionID RegionName State Metro County SizeRank Zhvi\n", 706 | "0 NaN NaN NaN NaN NaN NaN NaN NaN\n", 707 | "1 NaN NaN NaN CA NaN NaN NaN NaN\n", 708 | "2 NaN NaN NaN NaN NaN NaN NaN NaN\n", 709 | "3 NaN NaN NaN NaN NaN NaN NaN NaN\n", 710 | "4 NaN NaN NaN NaN NaN NaN NaN NaN" 711 | ] 712 | }, 713 | "execution_count": 14, 714 | "metadata": {}, 715 | "output_type": "execute_result" 716 | } 717 | ], 718 | "source": [ 719 | "filter = data.isin({'State': ['CA'], 'Metro': ['San Francisco']})\n", 720 | "data[filter].head()" 721 | ] 722 | }, 723 | { 724 | "cell_type": "code", 725 | "execution_count": null, 726 | "metadata": { 727 | "collapsed": true, 728 | "deletable": true, 729 | "editable": true, 730 | "slideshow": { 731 | "slide_type": "skip" 732 | } 733 | }, 734 | "outputs": [], 735 | "source": [] 736 | } 737 | ], 738 | "metadata": { 739 | "celltoolbar": "Slideshow", 740 | "kernelspec": { 741 | "display_name": "Python 3", 742 | "language": "python", 743 | "name": "python3" 744 | }, 745 | "language_info": { 746 | "codemirror_mode": { 747 | "name": "ipython", 748 | "version": 3 749 | }, 750 | "file_extension": ".py", 751 | "mimetype": "text/x-python", 752 | "name": "python", 753 | "nbconvert_exporter": "python", 754 | "pygments_lexer": "ipython3", 755 | "version": "3.6.1" 756 | } 757 | }, 758 | "nbformat": 4, 759 | "nbformat_minor": 2 760 | } 761 | -------------------------------------------------------------------------------- /Chapter02/Using pandas Series data structure to select a subset of the data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 12, 6 | "metadata": { 7 | "collapsed": false, 8 | "deletable": true, 9 | "editable": true, 10 | "slideshow": { 11 | "slide_type": "skip" 12 | } 13 | }, 14 | "outputs": [ 15 | { 16 | "data": { 17 | "text/html": [ 18 | "" 28 | ], 29 | "text/plain": [ 30 | "" 31 | ] 32 
| }, 33 | "metadata": {}, 34 | "output_type": "display_data" 35 | } 36 | ], 37 | "source": [ 38 | "%%html\n", 39 | "" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 13, 54 | "metadata": { 55 | "collapsed": true, 56 | "deletable": true, 57 | "editable": true, 58 | "slideshow": { 59 | "slide_type": "skip" 60 | } 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "from IPython.core.interactiveshell import InteractiveShell\n", 65 | "InteractiveShell.ast_node_interactivity = \"all\"" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": { 71 | "deletable": true, 72 | "editable": true, 73 | "slideshow": { 74 | "slide_type": "slide" 75 | } 76 | }, 77 | "source": [ 78 | "## Introduce our dataset" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": { 84 | "deletable": true, 85 | "editable": true, 86 | "slideshow": { 87 | "slide_type": "fragment" 88 | } 89 | }, 90 | "source": [ 91 | "* Data from Zillow.com, a real estate marketplace.\n", 92 | "* A public dataset that is free to use with attribution.\n", 93 | "* A real dataset that lists mean house prices in various locations in the US.\n", 94 | "* The dataset is a CSV (comma-separated values) text file.\n", 95 | "* Available here - https://www.zillow.com/research/data/" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": { 101 | "deletable": true, 102 | "editable": true, 103 | "slideshow": { 104 | "slide_type": "slide" 105 | } 106 | }, 107 | "source": [ 108 | "## Import Pandas" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 1, 114 | "metadata": { 115 | "collapsed": true, 116 | "deletable": true, 117 | "editable": true, 118 | "slideshow": { 119 | "slide_type": "fragment" 120 | } 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "import pandas as pd" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": { 130 | "deletable": true, 131 | "editable": true, 132 | "slideshow": { 133 | "slide_type": "slide" 134
| } 135 | }, 136 | "source": [ 137 | "## Read in the dataset" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 2, 143 | "metadata": { 144 | "collapsed": false, 145 | "deletable": true, 146 | "editable": true, 147 | "slideshow": { 148 | "slide_type": "fragment" 149 | } 150 | }, 151 | "outputs": [], 152 | "source": [ 153 | "data = pd.read_table('data-zillow.csv', sep=',')" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": { 159 | "deletable": true, 160 | "editable": true, 161 | "slideshow": { 162 | "slide_type": "slide" 163 | } 164 | }, 165 | "source": [ 166 | "## View the dataset" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 14, 172 | "metadata": { 173 | "collapsed": false, 174 | "deletable": true, 175 | "editable": true, 176 | "slideshow": { 177 | "slide_type": "fragment" 178 | } 179 | }, 180 | "outputs": [ 181 | { 182 | "data": { 183 | "text/html": [ 184 | "
\n", 185 | "\n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | "
DateRegionIDRegionNameStateMetroCountySizeRankZhvi
02017-05-316181New YorkNYNew YorkQueens0672400
12017-05-3112447Los AngelesCALos Angeles-Long Beach-AnaheimLos Angeles1629900
22017-05-3117426ChicagoILChicagoCook2222700
32017-05-3113271PhiladelphiaPAPhiladelphiaPhiladelphia3137300
42017-05-3140326PhoenixAZPhoenixMaricopa4211300
\n", 257 | "
" 258 | ], 259 | "text/plain": [ 260 | " Date RegionID RegionName State Metro \\\n", 261 | "0 2017-05-31 6181 New York NY New York \n", 262 | "1 2017-05-31 12447 Los Angeles CA Los Angeles-Long Beach-Anaheim \n", 263 | "2 2017-05-31 17426 Chicago IL Chicago \n", 264 | "3 2017-05-31 13271 Philadelphia PA Philadelphia \n", 265 | "4 2017-05-31 40326 Phoenix AZ Phoenix \n", 266 | "\n", 267 | " County SizeRank Zhvi \n", 268 | "0 Queens 0 672400 \n", 269 | "1 Los Angeles 1 629900 \n", 270 | "2 Cook 2 222700 \n", 271 | "3 Philadelphia 3 137300 \n", 272 | "4 Maricopa 4 211300 " 273 | ] 274 | }, 275 | "execution_count": 14, 276 | "metadata": {}, 277 | "output_type": "execute_result" 278 | } 279 | ], 280 | "source": [ 281 | "data.head()" 282 | ] 283 | }, 284 | { 285 | "cell_type": "markdown", 286 | "metadata": { 287 | "slideshow": { 288 | "slide_type": "slide" 289 | } 290 | }, 291 | "source": [ 292 | "# Select data" 293 | ] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": { 298 | "deletable": true, 299 | "editable": true, 300 | "slideshow": { 301 | "slide_type": "slide" 302 | } 303 | }, 304 | "source": [ 305 | "### Select a Series with bracket notation" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": 20, 311 | "metadata": { 312 | "collapsed": false, 313 | "deletable": true, 314 | "editable": true, 315 | "slideshow": { 316 | "slide_type": "fragment" 317 | } 318 | }, 319 | "outputs": [], 320 | "source": [ 321 | "regions = data['RegionName']" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 21, 327 | "metadata": { 328 | "collapsed": false, 329 | "slideshow": { 330 | "slide_type": "fragment" 331 | } 332 | }, 333 | "outputs": [ 334 | { 335 | "data": { 336 | "text/plain": [ 337 | "pandas.core.series.Series" 338 | ] 339 | }, 340 | "execution_count": 21, 341 | "metadata": {}, 342 | "output_type": "execute_result" 343 | } 344 | ], 345 | "source": [ 346 | "type(regions)" 347 | ] 348 | }, 349 | { 350 | 
"cell_type": "code", 351 | "execution_count": 22, 352 | "metadata": { 353 | "collapsed": false, 354 | "slideshow": { 355 | "slide_type": "fragment" 356 | } 357 | }, 358 | "outputs": [ 359 | { 360 | "data": { 361 | "text/plain": [ 362 | "0 New York\n", 363 | "1 Los Angeles\n", 364 | "2 Chicago\n", 365 | "3 Philadelphia\n", 366 | "4 Phoenix\n", 367 | "5 Las Vegas\n", 368 | "6 San Diego\n", 369 | "7 Dallas\n", 370 | "8 San Jose\n", 371 | "9 Jacksonville\n", 372 | "10 San Francisco\n", 373 | "11 Austin\n", 374 | "12 Detroit\n", 375 | "13 Columbus\n", 376 | "14 Memphis\n", 377 | "15 Charlotte\n", 378 | "16 El Paso\n", 379 | "17 Boston\n", 380 | "18 Seattle\n", 381 | "19 Baltimore\n", 382 | "20 Denver\n", 383 | "21 Washington\n", 384 | "22 Nashville\n", 385 | "23 Milwaukee\n", 386 | "24 Tucson\n", 387 | "25 Portland\n", 388 | "26 Oklahoma City\n", 389 | "27 Omaha\n", 390 | "28 Albuquerque\n", 391 | "29 Fresno\n", 392 | " ... \n", 393 | "10800 Mount Sidney\n", 394 | "10801 Cedar Grove\n", 395 | "10802 Clark\n", 396 | "10803 Daleville\n", 397 | "10804 Piney Point\n", 398 | "10805 Almo\n", 399 | "10806 Bois D Arc\n", 400 | "10807 Tyaskin\n", 401 | "10808 Inverness\n", 402 | "10809 Otwell\n", 403 | "10810 Lake Hughes\n", 404 | "10811 Marble Hill\n", 405 | "10812 Mead\n", 406 | "10813 Hesperus\n", 407 | "10814 Midland\n", 408 | "10815 Lavaca\n", 409 | "10816 Mays Landing\n", 410 | "10817 Stinson Beach\n", 411 | "10818 Upper Brookville\n", 412 | "10819 Decatur\n", 413 | "10820 Horse Shoe\n", 414 | "10821 Laotto\n", 415 | "10822 Goldsboro\n", 416 | "10823 Phoenicia\n", 417 | "10824 Jones\n", 418 | "10825 Nehalem\n", 419 | "10826 Clear Lake Shores\n", 420 | "10827 Lebanon Borough\n", 421 | "10828 Henrico\n", 422 | "10829 East Hampstead\n", 423 | "Name: RegionName, dtype: object" 424 | ] 425 | }, 426 | "execution_count": 22, 427 | "metadata": {}, 428 | "output_type": "execute_result" 429 | } 430 | ], 431 | "source": [ 432 | "regions" 433 | ] 434 | }, 435 | { 436 | "cell_type": 
"markdown", 437 | "metadata": { 438 | "slideshow": { 439 | "slide_type": "slide" 440 | } 441 | }, 442 | "source": [ 443 | "## DataFrame vs Series" 444 | ] 445 | }, 446 | { 447 | "cell_type": "markdown", 448 | "metadata": { 449 | "slideshow": { 450 | "slide_type": "slide" 451 | } 452 | }, 453 | "source": [ 454 | "## Multi Column Selection - Series or DataFrame" 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": 25, 460 | "metadata": { 461 | "collapsed": false, 462 | "slideshow": { 463 | "slide_type": "fragment" 464 | } 465 | }, 466 | "outputs": [ 467 | { 468 | "data": { 469 | "text/html": [ 470 | "
\n", 471 | "\n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | "
RegionNameState
0New YorkNY
1Los AngelesCA
2ChicagoIL
3PhiladelphiaPA
4PhoenixAZ
\n", 507 | "
" 508 | ], 509 | "text/plain": [ 510 | " RegionName State\n", 511 | "0 New York NY\n", 512 | "1 Los Angeles CA\n", 513 | "2 Chicago IL\n", 514 | "3 Philadelphia PA\n", 515 | "4 Phoenix AZ" 516 | ] 517 | }, 518 | "execution_count": 25, 519 | "metadata": {}, 520 | "output_type": "execute_result" 521 | } 522 | ], 523 | "source": [ 524 | "region_n_state = data[['RegionName', 'State']]\n", 525 | "region_n_state.head()" 526 | ] 527 | }, 528 | { 529 | "cell_type": "code", 530 | "execution_count": 26, 531 | "metadata": { 532 | "collapsed": false, 533 | "slideshow": { 534 | "slide_type": "fragment" 535 | } 536 | }, 537 | "outputs": [ 538 | { 539 | "data": { 540 | "text/plain": [ 541 | "pandas.core.frame.DataFrame" 542 | ] 543 | }, 544 | "execution_count": 26, 545 | "metadata": {}, 546 | "output_type": "execute_result" 547 | } 548 | ], 549 | "source": [ 550 | "type(region_n_state)" 551 | ] 552 | }, 553 | { 554 | "cell_type": "markdown", 555 | "metadata": { 556 | "deletable": true, 557 | "editable": true, 558 | "slideshow": { 559 | "slide_type": "slide" 560 | } 561 | }, 562 | "source": [ 563 | "## Select using dot notation" 564 | ] 565 | }, 566 | { 567 | "cell_type": "code", 568 | "execution_count": 28, 569 | "metadata": { 570 | "collapsed": false, 571 | "deletable": true, 572 | "editable": true, 573 | "slideshow": { 574 | "slide_type": "fragment" 575 | } 576 | }, 577 | "outputs": [ 578 | { 579 | "data": { 580 | "text/plain": [ 581 | "0 NY\n", 582 | "1 CA\n", 583 | "2 IL\n", 584 | "3 PA\n", 585 | "4 AZ\n", 586 | "5 NV\n", 587 | "6 CA\n", 588 | "7 TX\n", 589 | "8 CA\n", 590 | "9 FL\n", 591 | "10 CA\n", 592 | "11 TX\n", 593 | "12 MI\n", 594 | "13 OH\n", 595 | "14 TN\n", 596 | "15 NC\n", 597 | "16 TX\n", 598 | "17 MA\n", 599 | "18 WA\n", 600 | "19 MD\n", 601 | "20 CO\n", 602 | "21 DC\n", 603 | "22 TN\n", 604 | "23 WI\n", 605 | "24 AZ\n", 606 | "25 OR\n", 607 | "26 OK\n", 608 | "27 NE\n", 609 | "28 NM\n", 610 | "29 CA\n", 611 | " ..\n", 612 | "10800 VA\n", 613 | "10801 WV\n", 614 
| "10802 CO\n", 615 | "10803 VA\n", 616 | "10804 MD\n", 617 | "10805 KY\n", 618 | "10806 MO\n", 619 | "10807 MD\n", 620 | "10808 CA\n", 621 | "10809 IN\n", 622 | "10810 CA\n", 623 | "10811 GA\n", 624 | "10812 CO\n", 625 | "10813 CO\n", 626 | "10814 GA\n", 627 | "10815 AR\n", 628 | "10816 NJ\n", 629 | "10817 CA\n", 630 | "10818 NY\n", 631 | "10819 AR\n", 632 | "10820 NC\n", 633 | "10821 IN\n", 634 | "10822 MD\n", 635 | "10823 NY\n", 636 | "10824 AL\n", 637 | "10825 OR\n", 638 | "10826 TX\n", 639 | "10827 NJ\n", 640 | "10828 VA\n", 641 | "10829 NH\n", 642 | "Name: State, dtype: object" 643 | ] 644 | }, 645 | "execution_count": 28, 646 | "metadata": {}, 647 | "output_type": "execute_result" 648 | } 649 | ], 650 | "source": [ 651 | "data.State" 652 | ] 653 | }, 654 | { 655 | "cell_type": "markdown", 656 | "metadata": { 657 | "deletable": true, 658 | "editable": true, 659 | "slideshow": { 660 | "slide_type": "slide" 661 | } 662 | }, 663 | "source": [ 664 | "## Creating a new series by selection" 665 | ] 666 | }, 667 | { 668 | "cell_type": "code", 669 | "execution_count": 29, 670 | "metadata": { 671 | "collapsed": true, 672 | "deletable": true, 673 | "editable": true, 674 | "slideshow": { 675 | "slide_type": "fragment" 676 | } 677 | }, 678 | "outputs": [], 679 | "source": [ 680 | "data['Address'] = data.County + ', ' + data.Metro + ', ' + data.State" 681 | ] 682 | }, 683 | { 684 | "cell_type": "code", 685 | "execution_count": 31, 686 | "metadata": { 687 | "collapsed": false, 688 | "deletable": true, 689 | "editable": true, 690 | "slideshow": { 691 | "slide_type": "fragment" 692 | } 693 | }, 694 | "outputs": [ 695 | { 696 | "data": { 697 | "text/plain": [ 698 | "0 Queens, New York, NY\n", 699 | "1 Los Angeles, Los Angeles-Long Beach-Anaheim, CA\n", 700 | "2 Cook, Chicago, IL\n", 701 | "3 Philadelphia, Philadelphia, PA\n", 702 | "4 Maricopa, Phoenix, AZ\n", 703 | "5 Clark, Las Vegas, NV\n", 704 | "6 San Diego, San Diego, CA\n", 705 | "7 Dallas, Dallas-Fort Worth, TX\n", 
706 | "8 Santa Clara, San Jose, CA\n", 707 | "9 Duval, Jacksonville, FL\n", 708 | "10 San Francisco, San Francisco, CA\n", 709 | "11 Travis, Austin, TX\n", 710 | "12 Wayne, Detroit, MI\n", 711 | "13 Franklin, Columbus, OH\n", 712 | "14 Shelby, Memphis, TN\n", 713 | "15 Mecklenburg, Charlotte, NC\n", 714 | "16 El Paso, El Paso, TX\n", 715 | "17 Suffolk, Boston, MA\n", 716 | "18 King, Seattle, WA\n", 717 | "19 Baltimore City, Baltimore, MD\n", 718 | "20 Denver, Denver, CO\n", 719 | "21 District of Columbia, Washington, DC\n", 720 | "22 Davidson, Nashville, TN\n", 721 | "23 Milwaukee, Milwaukee, WI\n", 722 | "24 Pima, Tucson, AZ\n", 723 | "25 Multnomah, Portland, OR\n", 724 | "26 Oklahoma, Oklahoma City, OK\n", 725 | "27 Douglas, Omaha, NE\n", 726 | "28 Bernalillo, Albuquerque, NM\n", 727 | "29 Fresno, Fresno, CA\n", 728 | " ... \n", 729 | "10800 Augusta, Staunton, VA\n", 730 | "10801 Kanawha, Charleston, WV\n", 731 | "10802 Routt, Steamboat Springs, CO\n", 732 | "10803 Botetourt, Roanoke, VA\n", 733 | "10804 Saint Marys, California-Lexington Park, MD\n", 734 | "10805 Calloway, Murray, KY\n", 735 | "10806 Greene, Springfield, MO\n", 736 | "10807 Wicomico, Salisbury, MD\n", 737 | "10808 Marin, San Francisco, CA\n", 738 | "10809 Pike, Jasper, IN\n", 739 | "10810 Los Angeles, Los Angeles-Long Beach-Anaheim, CA\n", 740 | "10811 Dawson, Atlanta, GA\n", 741 | "10812 Weld, Greeley, CO\n", 742 | "10813 La Plata, Durango, CO\n", 743 | "10814 Muscogee, Columbus, GA\n", 744 | "10815 Sebastian, Fort Smith, AR\n", 745 | "10816 Atlantic, Atlantic City, NJ\n", 746 | "10817 Marin, San Francisco, CA\n", 747 | "10818 Nassau, New York, NY\n", 748 | "10819 Benton, Fayetteville, AR\n", 749 | "10820 Henderson, Asheville, NC\n", 750 | "10821 Noble, Kendallville, IN\n", 751 | "10822 NaN\n", 752 | "10823 Ulster, Kingston, NY\n", 753 | "10824 Autauga, Montgomery, AL\n", 754 | "10825 NaN\n", 755 | "10826 Galveston, Houston, TX\n", 756 | "10827 Hunterdon, New York, NJ\n", 757 | "10828 Henrico, 
Richmond, VA\n", 758 | "10829 Rockingham, Boston, NH\n", 759 | "Name: Address, dtype: object" 760 | ] 761 | }, 762 | "execution_count": 31, 763 | "metadata": {}, 764 | "output_type": "execute_result" 765 | } 766 | ], 767 | "source": [ 768 | "data.Address" 769 | ] 770 | }, 771 | { 772 | "cell_type": "code", 773 | "execution_count": null, 774 | "metadata": { 775 | "collapsed": true, 776 | "deletable": true, 777 | "editable": true, 778 | "slideshow": { 779 | "slide_type": "skip" 780 | } 781 | }, 782 | "outputs": [], 783 | "source": [] 784 | } 785 | ], 786 | "metadata": { 787 | "celltoolbar": "Slideshow", 788 | "kernelspec": { 789 | "display_name": "Python 3", 790 | "language": "python", 791 | "name": "python3" 792 | }, 793 | "language_info": { 794 | "codemirror_mode": { 795 | "name": "ipython", 796 | "version": 3 797 | }, 798 | "file_extension": ".py", 799 | "mimetype": "text/x-python", 800 | "name": "python", 801 | "nbconvert_exporter": "python", 802 | "pygments_lexer": "ipython3", 803 | "version": "3.6.1" 804 | } 805 | }, 806 | "nbformat": 4, 807 | "nbformat_minor": 2 808 | } 809 | -------------------------------------------------------------------------------- /Chapter03/Merging and concatenating multiple data frames into one.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 17, 6 | "metadata": { 7 | "collapsed": false, 8 | "deletable": true, 9 | "editable": true, 10 | "slideshow": { 11 | "slide_type": "skip" 12 | } 13 | }, 14 | "outputs": [ 15 | { 16 | "data": { 17 | "text/html": [ 18 | "" 28 | ], 29 | "text/plain": [ 30 | "" 31 | ] 32 | }, 33 | "metadata": {}, 34 | "output_type": "display_data" 35 | } 36 | ], 37 | "source": [ 38 | "%%html\n", 39 | "" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 18, 54 | "metadata": { 55 | "collapsed": true, 56 | "deletable": true, 57 | "editable": true, 58 | "slideshow": { 59 | "slide_type": 
"skip" 60 | } 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "from IPython.core.interactiveshell import InteractiveShell\n", 65 | "InteractiveShell.ast_node_interactivity = \"all\"" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": { 71 | "deletable": true, 72 | "editable": true, 73 | "slideshow": { 74 | "slide_type": "slide" 75 | } 76 | }, 77 | "source": [ 78 | "## Import Pandas" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 19, 84 | "metadata": { 85 | "collapsed": true, 86 | "deletable": true, 87 | "editable": true, 88 | "slideshow": { 89 | "slide_type": "fragment" 90 | } 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "import pandas as pd" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": { 100 | "collapsed": true, 101 | "deletable": true, 102 | "editable": true, 103 | "slideshow": { 104 | "slide_type": "slide" 105 | } 106 | }, 107 | "source": [ 108 | "## Concatenate Dataset DataFrames" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 38, 114 | "metadata": { 115 | "collapsed": false, 116 | "deletable": true, 117 | "editable": true, 118 | "slideshow": { 119 | "slide_type": "fragment" 120 | } 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "dataset1 = pd.DataFrame({'Age': ['32', '26', '29'],\n", 125 | " 'Sex': ['F', 'M', 'F'],\n", 126 | " 'State': ['CA', 'NY', 'OH']},\n", 127 | " index=['Jane', 'John', 'Cathy'])\n", 128 | " \n", 129 | "dataset2 = pd.DataFrame({'Age': ['34', '23', '24', '21'],\n", 130 | " 'Sex': ['M', 'F', 'F', 'F'],\n", 131 | " 'State': ['AZ', 'OR', 'CA', 'WA']},\n", 132 | " index=['Dave', 'Kris', 'Xi', 'Jo'])" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 39, 138 | "metadata": { 139 | "collapsed": false, 140 | "deletable": true, 141 | "editable": true, 142 | "slideshow": { 143 | "slide_type": "fragment" 144 | } 145 | }, 146 | "outputs": [ 147 | { 148 | "data": { 149 | "text/html": [ 150 | "
\n", 151 | "\n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | "
AgeSexState
Jane32FCA
John26MNY
Cathy29FOH
Dave34MAZ
Kris23FOR
Xi24FCA
Jo21FWA
\n", 205 | "
" 206 | ], 207 | "text/plain": [ 208 | " Age Sex State\n", 209 | "Jane 32 F CA\n", 210 | "John 26 M NY\n", 211 | "Cathy 29 F OH\n", 212 | "Dave 34 M AZ\n", 213 | "Kris 23 F OR\n", 214 | "Xi 24 F CA\n", 215 | "Jo 21 F WA" 216 | ] 217 | }, 218 | "execution_count": 39, 219 | "metadata": {}, 220 | "output_type": "execute_result" 221 | } 222 | ], 223 | "source": [ 224 | "pd.concat([dataset1, dataset2])" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": { 230 | "slideshow": { 231 | "slide_type": "slide" 232 | } 233 | }, 234 | "source": [ 235 | "## Concatenate using append()" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 32, 241 | "metadata": { 242 | "collapsed": false, 243 | "deletable": true, 244 | "editable": true, 245 | "slideshow": { 246 | "slide_type": "fragment" 247 | } 248 | }, 249 | "outputs": [ 250 | { 251 | "data": { 252 | "text/html": [ 253 | "
\n", 254 | "\n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | "
AgeSexState
Jane32FCA
John26MNY
Cathy29FOH
Dave34MAZ
Kris23FOR
Xi24FCA
Jo21FWA
\n", 308 | "
" 309 | ], 310 | "text/plain": [ 311 | " Age Sex State\n", 312 | "Jane 32 F CA\n", 313 | "John 26 M NY\n", 314 | "Cathy 29 F OH\n", 315 | "Dave 34 M AZ\n", 316 | "Kris 23 F OR\n", 317 | "Xi 24 F CA\n", 318 | "Jo 21 F WA" 319 | ] 320 | }, 321 | "execution_count": 32, 322 | "metadata": {}, 323 | "output_type": "execute_result" 324 | } 325 | ], 326 | "source": [ 327 | "dataset1.append(dataset2)" 328 | ] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "metadata": { 333 | "slideshow": { 334 | "slide_type": "slide" 335 | } 336 | }, 337 | "source": [ 338 | "## Concatenate on columns" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 40, 344 | "metadata": { 345 | "collapsed": true, 346 | "deletable": true, 347 | "editable": true, 348 | "slideshow": { 349 | "slide_type": "fragment" 350 | } 351 | }, 352 | "outputs": [], 353 | "source": [ 354 | "dataset1 = pd.DataFrame({'Age': ['32', '26', '29'],\n", 355 | " 'Sex': ['F', 'M', 'F'],\n", 356 | " 'State': ['CA', 'NY', 'OH']},\n", 357 | " index=['Jane', 'John', 'Cathy'])\n", 358 | "\n", 359 | "dataset2 = pd.DataFrame({'City': ['SF', 'NY', 'Columbus'],\n", 360 | " 'Work Status': ['No', 'Yes', 'Yes']},\n", 361 | " index=['Jane', 'John', 'Cathy']) " 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": 41, 367 | "metadata": { 368 | "collapsed": false, 369 | "deletable": true, 370 | "editable": true, 371 | "slideshow": { 372 | "slide_type": "fragment" 373 | } 374 | }, 375 | "outputs": [ 376 | { 377 | "data": { 378 | "text/html": [ 379 | "
\n", 380 | "\n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | "
AgeSexStateCityWork Status
Jane32FCASFNo
John26MNYNYYes
Cathy29FOHColumbusYes
\n", 418 | "
" 419 | ], 420 | "text/plain": [ 421 | " Age Sex State City Work Status\n", 422 | "Jane 32 F CA SF No\n", 423 | "John 26 M NY NY Yes\n", 424 | "Cathy 29 F OH Columbus Yes" 425 | ] 426 | }, 427 | "execution_count": 41, 428 | "metadata": {}, 429 | "output_type": "execute_result" 430 | } 431 | ], 432 | "source": [ 433 | "pd.concat([dataset1, dataset2], axis=1)" 434 | ] 435 | }, 436 | { 437 | "cell_type": "markdown", 438 | "metadata": { 439 | "slideshow": { 440 | "slide_type": "slide" 441 | } 442 | }, 443 | "source": [ 444 | "### Merging DataFrames" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": 20, 450 | "metadata": { 451 | "collapsed": true, 452 | "deletable": true, 453 | "editable": true, 454 | "slideshow": { 455 | "slide_type": "fragment" 456 | } 457 | }, 458 | "outputs": [], 459 | "source": [ 460 | "dataset1 = pd.DataFrame({'Name': ['Jane', 'John', 'Cathy', 'Sarah'],\n", 461 | " 'Age': ['32', '26', '29', '23'],\n", 462 | " 'Sex': ['F', 'M', 'F', 'F'],\n", 463 | " 'State': ['CA', 'NY', 'OH', 'TX']})\n", 464 | "\n", 465 | "dataset2 = pd.DataFrame({'Name': ['Jane', 'John', 'Cathy', 'Rob'],\n", 466 | " 'City': ['SF', 'NY', 'Columbus', 'Austin'],\n", 467 | " 'Work Status': ['No', 'Yes', 'Yes', 'Yes']})" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": 25, 473 | "metadata": { 474 | "collapsed": false, 475 | "deletable": true, 476 | "editable": true, 477 | "slideshow": { 478 | "slide_type": "fragment" 479 | } 480 | }, 481 | "outputs": [ 482 | { 483 | "data": { 484 | "text/html": [ 485 | "
\n", 486 | "\n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | "
AgeNameSexStateCityWork Status
032JaneFCASFNo
126JohnMNYNYYes
229CathyFOHColumbusYes
\n", 528 | "
" 529 | ], 530 | "text/plain": [ 531 | " Age Name Sex State City Work Status\n", 532 | "0 32 Jane F CA SF No\n", 533 | "1 26 John M NY NY Yes\n", 534 | "2 29 Cathy F OH Columbus Yes" 535 | ] 536 | }, 537 | "execution_count": 25, 538 | "metadata": {}, 539 | "output_type": "execute_result" 540 | } 541 | ], 542 | "source": [ 543 | "pd.merge(dataset1, dataset2, on='Name', how='inner')" 544 | ] 545 | }, 546 | { 547 | "cell_type": "markdown", 548 | "metadata": { 549 | "slideshow": { 550 | "slide_type": "slide" 551 | } 552 | }, 553 | "source": [ 554 | "### Left outer merge" 555 | ] 556 | }, 557 | { 558 | "cell_type": "code", 559 | "execution_count": 26, 560 | "metadata": { 561 | "collapsed": false, 562 | "deletable": true, 563 | "editable": true, 564 | "slideshow": { 565 | "slide_type": "fragment" 566 | } 567 | }, 568 | "outputs": [ 569 | { 570 | "data": { 571 | "text/html": [ 572 | "
\n", 573 | "\n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | "
AgeNameSexStateCityWork Status
032JaneFCASFNo
126JohnMNYNYYes
229CathyFOHColumbusYes
323SarahFTXNaNNaN
\n", 624 | "
" 625 | ], 626 | "text/plain": [ 627 | " Age Name Sex State City Work Status\n", 628 | "0 32 Jane F CA SF No\n", 629 | "1 26 John M NY NY Yes\n", 630 | "2 29 Cathy F OH Columbus Yes\n", 631 | "3 23 Sarah F TX NaN NaN" 632 | ] 633 | }, 634 | "execution_count": 26, 635 | "metadata": {}, 636 | "output_type": "execute_result" 637 | } 638 | ], 639 | "source": [ 640 | "pd.merge(dataset1, dataset2, on='Name', how='left')" 641 | ] 642 | }, 643 | { 644 | "cell_type": "markdown", 645 | "metadata": { 646 | "slideshow": { 647 | "slide_type": "slide" 648 | } 649 | }, 650 | "source": [ 651 | "### Right outer merge" 652 | ] 653 | }, 654 | { 655 | "cell_type": "code", 656 | "execution_count": 27, 657 | "metadata": { 658 | "collapsed": false, 659 | "deletable": true, 660 | "editable": true, 661 | "slideshow": { 662 | "slide_type": "fragment" 663 | } 664 | }, 665 | "outputs": [ 666 | { 667 | "data": { 668 | "text/html": [ 669 | "
\n", 670 | "\n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | "
AgeNameSexStateCityWork Status
032JaneFCASFNo
126JohnMNYNYYes
229CathyFOHColumbusYes
3NaNRobNaNNaNAustinYes
\n", 721 | "
" 722 | ], 723 | "text/plain": [ 724 | " Age Name Sex State City Work Status\n", 725 | "0 32 Jane F CA SF No\n", 726 | "1 26 John M NY NY Yes\n", 727 | "2 29 Cathy F OH Columbus Yes\n", 728 | "3 NaN Rob NaN NaN Austin Yes" 729 | ] 730 | }, 731 | "execution_count": 27, 732 | "metadata": {}, 733 | "output_type": "execute_result" 734 | } 735 | ], 736 | "source": [ 737 | "pd.merge(dataset1, dataset2, on='Name', how='right')" 738 | ] 739 | }, 740 | { 741 | "cell_type": "markdown", 742 | "metadata": { 743 | "slideshow": { 744 | "slide_type": "slide" 745 | } 746 | }, 747 | "source": [ 748 | "### Full outer merge" 749 | ] 750 | }, 751 | { 752 | "cell_type": "code", 753 | "execution_count": 28, 754 | "metadata": { 755 | "collapsed": false, 756 | "deletable": true, 757 | "editable": true, 758 | "slideshow": { 759 | "slide_type": "fragment" 760 | } 761 | }, 762 | "outputs": [ 763 | { 764 | "data": { 765 | "text/html": [ 766 | "
\n", 767 | "\n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | "
AgeNameSexStateCityWork Status
032JaneFCASFNo
126JohnMNYNYYes
229CathyFOHColumbusYes
323SarahFTXNaNNaN
4NaNRobNaNNaNAustinYes
\n", 827 | "
" 828 | ], 829 | "text/plain": [ 830 | " Age Name Sex State City Work Status\n", 831 | "0 32 Jane F CA SF No\n", 832 | "1 26 John M NY NY Yes\n", 833 | "2 29 Cathy F OH Columbus Yes\n", 834 | "3 23 Sarah F TX NaN NaN\n", 835 | "4 NaN Rob NaN NaN Austin Yes" 836 | ] 837 | }, 838 | "execution_count": 28, 839 | "metadata": {}, 840 | "output_type": "execute_result" 841 | } 842 | ], 843 | "source": [ 844 | "pd.merge(dataset1, dataset2, on='Name', how='outer')" 845 | ] 846 | }, 847 | { 848 | "cell_type": "code", 849 | "execution_count": null, 850 | "metadata": { 851 | "collapsed": true, 852 | "deletable": true, 853 | "editable": true, 854 | "slideshow": { 855 | "slide_type": "skip" 856 | } 857 | }, 858 | "outputs": [], 859 | "source": [] 860 | } 861 | ], 862 | "metadata": { 863 | "celltoolbar": "Slideshow", 864 | "kernelspec": { 865 | "display_name": "Python 3", 866 | "language": "python", 867 | "name": "python3" 868 | }, 869 | "language_info": { 870 | "codemirror_mode": { 871 | "name": "ipython", 872 | "version": 3 873 | }, 874 | "file_extension": ".py", 875 | "mimetype": "text/x-python", 876 | "name": "python", 877 | "nbconvert_exporter": "python", 878 | "pygments_lexer": "ipython3", 879 | "version": "3.6.1" 880 | } 881 | }, 882 | "nbformat": 4, 883 | "nbformat_minor": 2 884 | } 885 | -------------------------------------------------------------------------------- /Chapter02/Using the axis parameter in pandas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 13, 6 | "metadata": { 7 | "collapsed": false, 8 | "deletable": true, 9 | "editable": true, 10 | "slideshow": { 11 | "slide_type": "skip" 12 | } 13 | }, 14 | "outputs": [ 15 | { 16 | "data": { 17 | "text/html": [ 18 | "" 28 | ], 29 | "text/plain": [ 30 | "" 31 | ] 32 | }, 33 | "metadata": {}, 34 | "output_type": "display_data" 35 | } 36 | ], 37 | "source": [ 38 | "%%html\n", 39 | "" 49 | ] 50 | }, 51 | { 
52 | "cell_type": "code", 53 | "execution_count": 14, 54 | "metadata": { 55 | "collapsed": true, 56 | "deletable": true, 57 | "editable": true, 58 | "slideshow": { 59 | "slide_type": "skip" 60 | } 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "from IPython.core.interactiveshell import InteractiveShell\n", 65 | "InteractiveShell.ast_node_interactivity = \"all\"" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": { 71 | "deletable": true, 72 | "editable": true, 73 | "slideshow": { 74 | "slide_type": "slide" 75 | } 76 | }, 77 | "source": [ 78 | "## Import Pandas" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 15, 84 | "metadata": { 85 | "collapsed": true, 86 | "deletable": true, 87 | "editable": true, 88 | "slideshow": { 89 | "slide_type": "fragment" 90 | } 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "import pandas as pd" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": { 100 | "deletable": true, 101 | "editable": true, 102 | "slideshow": { 103 | "slide_type": "slide" 104 | } 105 | }, 106 | "source": [ 107 | "## Read in the dataset" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 16, 113 | "metadata": { 114 | "collapsed": false, 115 | "deletable": true, 116 | "editable": true, 117 | "slideshow": { 118 | "slide_type": "fragment" 119 | } 120 | }, 121 | "outputs": [ 122 | { 123 | "data": { 124 | "text/html": [ 125 | "
\n", 126 | "\n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | "
DateRegionIDRegionNameStateMetroCountySizeRankZhvi
02017-05-316181New YorkNYNew YorkQueens0672400
12017-05-3112447Los AngelesCALos Angeles-Long Beach-AnaheimLos Angeles1629900
22017-05-3117426ChicagoILChicagoCook2222700
32017-05-3113271PhiladelphiaPAPhiladelphiaPhiladelphia3137300
42017-05-3140326PhoenixAZPhoenixMaricopa4211300
\n", 198 | "
" 199 | ], 200 | "text/plain": [ 201 | " Date RegionID RegionName State Metro \\\n", 202 | "0 2017-05-31 6181 New York NY New York \n", 203 | "1 2017-05-31 12447 Los Angeles CA Los Angeles-Long Beach-Anaheim \n", 204 | "2 2017-05-31 17426 Chicago IL Chicago \n", 205 | "3 2017-05-31 13271 Philadelphia PA Philadelphia \n", 206 | "4 2017-05-31 40326 Phoenix AZ Phoenix \n", 207 | "\n", 208 | " County SizeRank Zhvi \n", 209 | "0 Queens 0 672400 \n", 210 | "1 Los Angeles 1 629900 \n", 211 | "2 Cook 2 222700 \n", 212 | "3 Philadelphia 3 137300 \n", 213 | "4 Maricopa 4 211300 " 214 | ] 215 | }, 216 | "execution_count": 16, 217 | "metadata": {}, 218 | "output_type": "execute_result" 219 | } 220 | ], 221 | "source": [ 222 | "data = pd.read_table('data-zillow.csv', sep=',')\n", 223 | "data.head()" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": { 229 | "collapsed": true, 230 | "deletable": true, 231 | "editable": true, 232 | "slideshow": { 233 | "slide_type": "slide" 234 | } 235 | }, 236 | "source": [ 237 | "## Usage of axis parameter" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 25, 243 | "metadata": { 244 | "collapsed": false, 245 | "slideshow": { 246 | "slide_type": "fragment" 247 | } 248 | }, 249 | "outputs": [ 250 | { 251 | "data": { 252 | "text/html": [ 253 | "
\n", 254 | "\n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | "
DateRegionIDRegionNameStateMetroCountySizeRankZhvi
02017-05-316181New YorkNYNew YorkQueens0672400
12017-05-3112447Los AngelesCALos Angeles-Long Beach-AnaheimLos Angeles1629900
22017-05-3117426ChicagoILChicagoCook2222700
32017-05-3113271PhiladelphiaPAPhiladelphiaPhiladelphia3137300
42017-05-3140326PhoenixAZPhoenixMaricopa4211300
\n", 326 | "
" 327 | ], 328 | "text/plain": [ 329 | " Date RegionID RegionName State Metro \\\n", 330 | "0 2017-05-31 6181 New York NY New York \n", 331 | "1 2017-05-31 12447 Los Angeles CA Los Angeles-Long Beach-Anaheim \n", 332 | "2 2017-05-31 17426 Chicago IL Chicago \n", 333 | "3 2017-05-31 13271 Philadelphia PA Philadelphia \n", 334 | "4 2017-05-31 40326 Phoenix AZ Phoenix \n", 335 | "\n", 336 | " County SizeRank Zhvi \n", 337 | "0 Queens 0 672400 \n", 338 | "1 Los Angeles 1 629900 \n", 339 | "2 Cook 2 222700 \n", 340 | "3 Philadelphia 3 137300 \n", 341 | "4 Maricopa 4 211300 " 342 | ] 343 | }, 344 | "execution_count": 25, 345 | "metadata": {}, 346 | "output_type": "execute_result" 347 | } 348 | ], 349 | "source": [ 350 | "data.head()" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": 17, 356 | "metadata": { 357 | "collapsed": false, 358 | "deletable": true, 359 | "editable": true, 360 | "slideshow": { 361 | "slide_type": "fragment" 362 | } 363 | }, 364 | "outputs": [ 365 | { 366 | "data": { 367 | "text/plain": [ 368 | "[RangeIndex(start=0, stop=10830, step=1),\n", 369 | " Index(['Date', 'RegionID', 'RegionName', 'State', 'Metro', 'County',\n", 370 | " 'SizeRank', 'Zhvi'],\n", 371 | " dtype='object')]" 372 | ] 373 | }, 374 | "execution_count": 17, 375 | "metadata": {}, 376 | "output_type": "execute_result" 377 | } 378 | ], 379 | "source": [ 380 | "data.axes" 381 | ] 382 | }, 383 | { 384 | "cell_type": "markdown", 385 | "metadata": { 386 | "deletable": true, 387 | "editable": true, 388 | "slideshow": { 389 | "slide_type": "slide" 390 | } 391 | }, 392 | "source": [ 393 | "## axis usage examples" 394 | ] 395 | }, 396 | { 397 | "cell_type": "markdown", 398 | "metadata": { 399 | "slideshow": { 400 | "slide_type": "slide" 401 | } 402 | }, 403 | "source": [ 404 | "## axis = 0" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": 18, 410 | "metadata": { 411 | "collapsed": false, 412 | "deletable": true, 413 | "editable": true, 414 
| "slideshow": { 415 | "slide_type": "fragment" 416 | } 417 | }, 418 | "outputs": [ 419 | { 420 | "data": { 421 | "text/plain": [ 422 | "RegionID 84344.818837\n", 423 | "SizeRank 5414.500000\n", 424 | "Zhvi 250307.590028\n", 425 | "dtype: float64" 426 | ] 427 | }, 428 | "execution_count": 18, 429 | "metadata": {}, 430 | "output_type": "execute_result" 431 | } 432 | ], 433 | "source": [ 434 | "data.mean(axis=0)" 435 | ] 436 | }, 437 | { 438 | "cell_type": "markdown", 439 | "metadata": { 440 | "slideshow": { 441 | "slide_type": "slide" 442 | } 443 | }, 444 | "source": [ 445 | "## axis = 1" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": 26, 451 | "metadata": { 452 | "collapsed": false, 453 | "deletable": true, 454 | "editable": true, 455 | "slideshow": { 456 | "slide_type": "fragment" 457 | } 458 | }, 459 | "outputs": [ 460 | { 461 | "data": { 462 | "text/plain": [ 463 | "0 226193.666667\n", 464 | "1 214116.000000\n", 465 | "2 80042.666667\n", 466 | "3 50191.333333\n", 467 | "4 83876.666667\n", 468 | "dtype: float64" 469 | ] 470 | }, 471 | "execution_count": 26, 472 | "metadata": {}, 473 | "output_type": "execute_result" 474 | } 475 | ], 476 | "source": [ 477 | "data.mean(axis=1).head()" 478 | ] 479 | }, 480 | { 481 | "cell_type": "markdown", 482 | "metadata": { 483 | "slideshow": { 484 | "slide_type": "slide" 485 | } 486 | }, 487 | "source": [ 488 | "## use labels instead of 0 and 1" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": 27, 494 | "metadata": { 495 | "collapsed": false, 496 | "deletable": true, 497 | "editable": true, 498 | "slideshow": { 499 | "slide_type": "fragment" 500 | } 501 | }, 502 | "outputs": [ 503 | { 504 | "data": { 505 | "text/plain": [ 506 | "RegionID 84344.818837\n", 507 | "SizeRank 5414.500000\n", 508 | "Zhvi 250307.590028\n", 509 | "dtype: float64" 510 | ] 511 | }, 512 | "execution_count": 27, 513 | "metadata": {}, 514 | "output_type": "execute_result" 515 | } 516 | ], 517 | 
"source": [ 518 | "data.mean(axis='rows')" 519 | ] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": 29, 524 | "metadata": { 525 | "collapsed": false, 526 | "deletable": true, 527 | "editable": true, 528 | "slideshow": { 529 | "slide_type": "fragment" 530 | } 531 | }, 532 | "outputs": [ 533 | { 534 | "data": { 535 | "text/plain": [ 536 | "0 226193.666667\n", 537 | "1 214116.000000\n", 538 | "2 80042.666667\n", 539 | "3 50191.333333\n", 540 | "4 83876.666667\n", 541 | "dtype: float64" 542 | ] 543 | }, 544 | "execution_count": 29, 545 | "metadata": {}, 546 | "output_type": "execute_result" 547 | } 548 | ], 549 | "source": [ 550 | "data.mean(axis='columns').head()" 551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": 22, 556 | "metadata": { 557 | "collapsed": false, 558 | "deletable": true, 559 | "editable": true, 560 | "slideshow": { 561 | "slide_type": "slide" 562 | } 563 | }, 564 | "outputs": [ 565 | { 566 | "data": { 567 | "text/html": [ 568 | "
\n", 569 | "\n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | "
DateRegionIDRegionNameStateMetroCountySizeRankZhvi
12017-05-3112447Los AngelesCALos Angeles-Long Beach-AnaheimLos Angeles1629900
22017-05-3117426ChicagoILChicagoCook2222700
32017-05-3113271PhiladelphiaPAPhiladelphiaPhiladelphia3137300
42017-05-3140326PhoenixAZPhoenixMaricopa4211300
52017-05-3118959Las VegasNVLas VegasClark5216500
\n", 641 | "
" 642 | ], 643 | "text/plain": [ 644 | " Date RegionID RegionName State Metro \\\n", 645 | "1 2017-05-31 12447 Los Angeles CA Los Angeles-Long Beach-Anaheim \n", 646 | "2 2017-05-31 17426 Chicago IL Chicago \n", 647 | "3 2017-05-31 13271 Philadelphia PA Philadelphia \n", 648 | "4 2017-05-31 40326 Phoenix AZ Phoenix \n", 649 | "5 2017-05-31 18959 Las Vegas NV Las Vegas \n", 650 | "\n", 651 | " County SizeRank Zhvi \n", 652 | "1 Los Angeles 1 629900 \n", 653 | "2 Cook 2 222700 \n", 654 | "3 Philadelphia 3 137300 \n", 655 | "4 Maricopa 4 211300 \n", 656 | "5 Clark 5 216500 " 657 | ] 658 | }, 659 | "execution_count": 22, 660 | "metadata": {}, 661 | "output_type": "execute_result" 662 | } 663 | ], 664 | "source": [ 665 | "data.drop(0, axis=0).head()" 666 | ] 667 | }, 668 | { 669 | "cell_type": "code", 670 | "execution_count": 23, 671 | "metadata": { 672 | "collapsed": false, 673 | "deletable": true, 674 | "editable": true, 675 | "slideshow": { 676 | "slide_type": "slide" 677 | } 678 | }, 679 | "outputs": [ 680 | { 681 | "data": { 682 | "text/html": [ 683 | "
\n", 684 | "\n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | "
RegionIDRegionNameStateMetroCountySizeRankZhvi
06181New YorkNYNew YorkQueens0672400
112447Los AngelesCALos Angeles-Long Beach-AnaheimLos Angeles1629900
217426ChicagoILChicagoCook2222700
313271PhiladelphiaPAPhiladelphiaPhiladelphia3137300
440326PhoenixAZPhoenixMaricopa4211300
\n", 750 | "
" 751 | ], 752 | "text/plain": [ 753 | " RegionID RegionName State Metro County \\\n", 754 | "0 6181 New York NY New York Queens \n", 755 | "1 12447 Los Angeles CA Los Angeles-Long Beach-Anaheim Los Angeles \n", 756 | "2 17426 Chicago IL Chicago Cook \n", 757 | "3 13271 Philadelphia PA Philadelphia Philadelphia \n", 758 | "4 40326 Phoenix AZ Phoenix Maricopa \n", 759 | "\n", 760 | " SizeRank Zhvi \n", 761 | "0 0 672400 \n", 762 | "1 1 629900 \n", 763 | "2 2 222700 \n", 764 | "3 3 137300 \n", 765 | "4 4 211300 " 766 | ] 767 | }, 768 | "execution_count": 23, 769 | "metadata": {}, 770 | "output_type": "execute_result" 771 | } 772 | ], 773 | "source": [ 774 | "data.drop('Date', axis=1).head()" 775 | ] 776 | }, 777 | { 778 | "cell_type": "code", 779 | "execution_count": 24, 780 | "metadata": { 781 | "collapsed": false, 782 | "deletable": true, 783 | "editable": true, 784 | "slideshow": { 785 | "slide_type": "slide" 786 | } 787 | }, 788 | "outputs": [ 789 | { 790 | "data": { 791 | "text/html": [ 792 | "
\n", 793 | "\n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | "
RegionIDRegionName
06181New York
112447Los Angeles
217426Chicago
313271Philadelphia
440326Phoenix
\n", 829 | "
" 830 | ], 831 | "text/plain": [ 832 | " RegionID RegionName\n", 833 | "0 6181 New York\n", 834 | "1 12447 Los Angeles\n", 835 | "2 17426 Chicago\n", 836 | "3 13271 Philadelphia\n", 837 | "4 40326 Phoenix" 838 | ] 839 | }, 840 | "execution_count": 24, 841 | "metadata": {}, 842 | "output_type": "execute_result" 843 | } 844 | ], 845 | "source": [ 846 | "data.filter(regex='Region', axis=1).head()" 847 | ] 848 | }, 849 | { 850 | "cell_type": "code", 851 | "execution_count": null, 852 | "metadata": { 853 | "collapsed": true, 854 | "deletable": true, 855 | "editable": true, 856 | "slideshow": { 857 | "slide_type": "skip" 858 | } 859 | }, 860 | "outputs": [], 861 | "source": [] 862 | } 863 | ], 864 | "metadata": { 865 | "celltoolbar": "Slideshow", 866 | "kernelspec": { 867 | "display_name": "Python 3", 868 | "language": "python", 869 | "name": "python3" 870 | }, 871 | "language_info": { 872 | "codemirror_mode": { 873 | "name": "ipython", 874 | "version": 3 875 | }, 876 | "file_extension": ".py", 877 | "mimetype": "text/x-python", 878 | "name": "python", 879 | "nbconvert_exporter": "python", 880 | "pygments_lexer": "ipython3", 881 | "version": "3.6.1" 882 | } 883 | }, 884 | "nbformat": 4, 885 | "nbformat_minor": 2 886 | } 887 | --------------------------------------------------------------------------------