├── .flake8 ├── .github └── ISSUE_TEMPLATE │ ├── config.yml │ └── issue_template.md ├── .gitignore ├── .travis.yml ├── README.md ├── datasets ├── ds1.jpg ├── ds2.jpg ├── mycsv.csv ├── mycsv_few_columns.csv ├── myexcel.xls ├── season.csv ├── titanic.xls ├── travel.csv ├── weather-dataset.zip ├── weather.csv └── weather_and_house.xls ├── pandas1.png ├── pandas_part1.ipynb ├── pandas_part10.ipynb ├── pandas_part2.ipynb ├── pandas_part3.ipynb ├── pandas_part4.ipynb ├── pandas_part5.ipynb ├── pandas_part6.ipynb ├── pandas_part7.ipynb ├── pandas_part8.ipynb └── pandas_part9.ipynb /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 100 3 | ignore = E121,E123,E126,E221,E222,E225,E226,E242,E701,E702,E704,E731,W503,F405,F841 4 | exclude = tests 5 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: true 2 | contact_links: 3 | - name: GitHub Community Support 4 | url: https://github.community/ 5 | about: Please ask and answer questions here. 6 | - name: GitHub Security Bug Bounty 7 | url: https://bounty.github.com/ 8 | about: Please report security vulnerabilities here. 9 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/issue_template.md: -------------------------------------------------------------------------------- 1 | 2 | ### Issue type 3 | Put an 'x' (without quotes) in the boxes below that you want to check. 4 | - [ ] Code improvements 5 | - [ ] I want to add files 6 | - [ ] Suggestions 7 | 8 | ##### Briefly explain what you have selected. 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: 2 | - python 3 | 4 | python: 5 | - "3.6" 6 | 7 | install: 8 | - pip install pandas 9 | - pip install numpy 10 | - pip install flake8 11 | - pip install ipython 12 | - pip install matplotlib 13 | - pip install ipywidgets 14 | script: 15 | - python script.py 16 | 17 | after_success: 18 | - flake8 --max-line-length 100 --ignore=E121,E123,E126,E221,E222,E225,E226,E242,E701,E702,E704,E731,W503 . 19 | 20 | notifications: 21 | email: false 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![pandas for data analysis](pandas1.png) 2 |
**Fig. 1** 3 | 4 | Welcome to the tutorial Data Analysis with pandas. In this tutorial I have covered the main topics of pandas and tried to explain each concept in as few words as possible in Jupyter notebooks, so that you can observe what every pandas method does from the ground up.
5 |
6 |
First of all, let's understand "what is data analysis, and why should we use pandas for it?". 7 | 8 | ### What is data analysis ? 9 | Suppose you are working in a company that generates a lot of customer data every day, and you are assigned the task of extracting some useful information out of it by a certain deadline. What will you do? With very limited time you cannot extract the information just by looking at the dataset, because the data is huge. So you ask a colleague for help, and he says: just read about pandas for data analysis. You study pandas and find that it makes your life much easier than staring at the dataset and hunting for useful information by hand. 10 | ### What does pandas actually do ? 11 | The official documentation says-
12 | pandas is a Python package providing fast, flexible, and expressive data structures designed to make working with “relational” or “labeled” data both easy and intuitive. It aims to be the fundamental high-level building block for doing practical, real world data analysis in Python. Additionally, it has the broader goal of becoming the most powerful and flexible open source data analysis / manipulation tool available in any language. It is already well on its way toward this goal. 13 |
14 |
15 | pandas is well suited for many different kinds of data: 16 | * Tabular data with heterogeneously-typed columns, as in an SQL table or Excel spreadsheet. 17 | * Ordered and unordered (not necessarily fixed-frequency) time series data. 18 | * Arbitrary matrix data (homogeneously typed or heterogeneous) with row and column labels. 19 | * Any other form of observational / statistical data sets. The data actually need not be labeled at all to be placed into a pandas data structure. 20 | 21 | ### Dependencies- 22 | ``` 23 | pip install pandas 24 | pip install numpy 25 | ``` 26 | 27 | ### Table of contents - 28 | * [Introduction to pandas](https://github.com/dshahid380/Data-analysis-with-pandas/blob/master/pandas_part1.ipynb) 29 | * [Dataframe Object](https://github.com/dshahid380/Data-analysis-with-pandas/blob/master/pandas_part2.ipynb) 30 | * [Reading, Writing CSV and EXCEL file](https://github.com/dshahid380/Data-analysis-with-pandas/blob/master/pandas_part3.ipynb) 31 | * [Handling Missing Data part-1](https://github.com/dshahid380/Data-analysis-with-pandas/blob/master/pandas_part4.ipynb) 32 | * [Handling Missing Data part-2](https://github.com/dshahid380/Data-analysis-with-pandas/blob/master/pandas_part5.ipynb) 33 | * [Groupby : Split, Combine and Apply](https://github.com/dshahid380/Data-analysis-with-pandas/blob/master/pandas_part6.ipynb) 34 | * [Concat Dataframes](https://github.com/dshahid380/Data-analysis-with-pandas/blob/master/pandas_part7.ipynb) 35 | * [Merging Dataframes](https://github.com/dshahid380/Data-analysis-with-pandas/blob/master/pandas_part8.ipynb) 36 | * [Pivot and Pivot table](https://github.com/dshahid380/Data-analysis-with-pandas/blob/master/pandas_part9.ipynb) 37 | * [Reshaping Dataframes](https://github.com/dshahid380/Data-analysis-with-pandas/blob/master/pandas_part10.ipynb) 38 |
39 | 40 | ### Introduction to pandas : 41 | ![](https://i1.wp.com/www.ugandaletsgotravel.com/holidays/wp-content/uploads/2018/04/holidays-panda-breeding-china-600x400.jpg) 42 |
**Fig.2**

43 | Pandas is used as a data cleaning tool in the field of data science. You can perform whatever operation you want on a dataset with this tool. Now the question arises: can we clean or change the values in a dataset manually? The answer is yes, we can, if the dataset is small. But if we have a large dataset, we cannot do it manually; it would take far too long. Pandas makes this part of data science easy and effective.
45 | To use pandas you need to first import the pandas module in your program 46 | ``` 47 | import pandas as pd 48 | ``` 49 |
50 | 51 | 52 | #### Reading CSV and Excel sheets: 53 | **d=pd.read_csv("path"):** 54 | * pd.read_csv() is the function to read the CSV(Comma separated values) file from your computer. 55 | * In the function you have to pass "path" of the CSV file under quote. 56 | * Store the dataframe in any variable,here i stored it in variable "d". 57 | * read_csv() function makes the CSV file into dataframe so that you can access it just like a disctionary.
58 | 59 | **d=pd.read_excel("path") :** 60 | * It is the same as read_csv(), but it reads an Excel sheet or file. Here I am using the weather dataset. In my case the weather.csv file sits in the datasets folder under my current directory, which is why the path below is relative. 61 | ``` 62 | d=pd.read_csv('datasets/weather.csv') 63 | print(d) 64 | ``` 65 |
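
The block above uses read_csv; for completeness, here is a corresponding read_excel sketch (it assumes the titanic.xls file from this repo's datasets folder, the same file used in pandas_part3; depending on your pandas version, reading .xls files may additionally require the xlrd package):
```
import pandas as pd

# read_excel works just like read_csv, but for Excel workbooks
df = pd.read_excel('datasets/titanic.xls')

# Show the first few rows to confirm the sheet loaded as expected
print(df.head(3))
```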
66 | 67 | **For further tutorials, go to the links given in the Table of contents above, or click this [link](https://github.com/dshahid380/Data-analysis-with-pandas).** 68 | 69 | 70 | 71 | 72 | ### References : 73 | * [Pandas Official documentation](https://pandas.pydata.org/pandas-docs/stable/tutorials.html) 74 | * [Tutorials points](https://www.tutorialspoint.com/python_pandas) 75 | * [Datacamp](https://www.datacamp.com/courses/pandas-foundations) 76 | 77 |
78 | 79 | 80 | [dshahid380](https://github.com/dshahid380) 81 | [ Md Shahid](https://www.linkedin.com/in/dshahid380/) 82 | -------------------------------------------------------------------------------- /datasets/ds1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dshahid380/Data-analysis-with-pandas/84a92bb63ab285160668dc3a7003a45269736ef2/datasets/ds1.jpg -------------------------------------------------------------------------------- /datasets/ds2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dshahid380/Data-analysis-with-pandas/84a92bb63ab285160668dc3a7003a45269736ef2/datasets/ds2.jpg -------------------------------------------------------------------------------- /datasets/mycsv.csv: -------------------------------------------------------------------------------- 1 | dates,day,temp,wind-speed 2 | 02-01-12,sunny,45,12 3 | 03-01-12,rainy,46,34 4 | 04-01-12,hot,47,45 5 | 05-01-12,sunny,48,56 6 | 06-01-12,hot,49,67 7 | -------------------------------------------------------------------------------- /datasets/mycsv_few_columns.csv: -------------------------------------------------------------------------------- 1 | day,temp,wind-speed 2 | sunny,45,12 3 | rainy,46,34 4 | hot,47,45 5 | sunny,48,56 6 | hot,49,67 7 | -------------------------------------------------------------------------------- /datasets/myexcel.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dshahid380/Data-analysis-with-pandas/84a92bb63ab285160668dc3a7003a45269736ef2/datasets/myexcel.xls -------------------------------------------------------------------------------- /datasets/season.csv: -------------------------------------------------------------------------------- 1 | dates,day,temp,wind-speed 2 | 2/1/2012,sunny,45,12 3 | 3/1/2012,rainy,46,34 4 | 4/1/2012,hot,47,45 5 | 5/1/2012,NaN,NaN,56 6 | 6/1/2012,hot,49,Not available 7 | 7/1/2012,NaN,NaN,Not available 8 | 8/1/2012,hot,12,45 9 | 9/1/2012,rainy,23,41 10 | 10/1/2012,NaN,NaN,NaN 11 | 11/1/2012,NaN,NaN,NaN 12 | -------------------------------------------------------------------------------- /datasets/titanic.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dshahid380/Data-analysis-with-pandas/84a92bb63ab285160668dc3a7003a45269736ef2/datasets/titanic.xls -------------------------------------------------------------------------------- /datasets/travel.csv: -------------------------------------------------------------------------------- 1 | Age,Name,No_of_pkg,Package,travel_id 2 | 20yrs,Bikash Kumar,1 packages,$100 ,1 3 | 21yrs,Ashish Shaw,5 packages,$200 ,2 4 | 23years,Dipak Kumar,2pkgs,$100 ,3 5 | 20 Years,John Doe,3 pkgs,$100 ,4 6 | 2000,Elisha,5000,$400 ,5 7 | 5000,Md Shahid,10 packages,$200 ,6 8 | 21 yrs,Adrika Roy,7pkgs,$300 ,7 9 | 24 yrs,Shashi Kumar,2000,$500 ,8 10 | -------------------------------------------------------------------------------- /datasets/weather-dataset.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dshahid380/Data-analysis-with-pandas/84a92bb63ab285160668dc3a7003a45269736ef2/datasets/weather-dataset.zip -------------------------------------------------------------------------------- /datasets/weather_and_house.xls: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dshahid380/Data-analysis-with-pandas/84a92bb63ab285160668dc3a7003a45269736ef2/datasets/weather_and_house.xls -------------------------------------------------------------------------------- /pandas1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dshahid380/Data-analysis-with-pandas/84a92bb63ab285160668dc3a7003a45269736ef2/pandas1.png -------------------------------------------------------------------------------- /pandas_part10.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Reshaping Dataframes \n", 8 | " In this section we will discuss reshaping a dataframe with the melt() method.\n", 9 | " You have to pass the dataframe and the column(s) you want to keep fixed; the other arguments are optional.\n", 10 | " " 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import pandas as pd" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 5, 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "data": { 29 | "text/html": [ 30 | "
\n", 31 | "\n", 44 | "\n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | "
DayKeralaKolkataMumbai
0Mon231010
1Tue322020
2Wed133030
3Thu424040
4Fri133232
5Sat433434
6Sun232323
\n", 106 | "
" 107 | ], 108 | "text/plain": [ 109 | " Day Kerala Kolkata Mumbai\n", 110 | "0 Mon 23 10 10\n", 111 | "1 Tue 32 20 20\n", 112 | "2 Wed 13 30 30\n", 113 | "3 Thu 42 40 40\n", 114 | "4 Fri 13 32 32\n", 115 | "5 Sat 43 34 34\n", 116 | "6 Sun 23 23 23" 117 | ] 118 | }, 119 | "execution_count": 5, 120 | "metadata": {}, 121 | "output_type": "execute_result" 122 | } 123 | ], 124 | "source": [ 125 | "mydis={\n", 126 | " 'Day':['Mon','Tue','Wed','Thu','Fri','Sat','Sun'],\n", 127 | " 'Kolkata':[10,20,30,40,32,34,23],\n", 128 | " 'Kerala':[23,32,13,42,13,43,23],\n", 129 | " 'Mumbai':[10,20,30,40,32,34,23]\n", 130 | "}\n", 131 | "df=pd.DataFrame(mydis)\n", 132 | "df" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 10, 138 | "metadata": {}, 139 | "outputs": [ 140 | { 141 | "data": { 142 | "text/html": [ 143 | "
\n", 144 | "\n", 157 | "\n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | "
Dayvariablevalue
0MonKerala23
1TueKerala32
2WedKerala13
3ThuKerala42
4FriKerala13
5SatKerala43
6SunKerala23
7MonKolkata10
8TueKolkata20
9WedKolkata30
10ThuKolkata40
11FriKolkata32
12SatKolkata34
13SunKolkata23
14MonMumbai10
15TueMumbai20
16WedMumbai30
17ThuMumbai40
18FriMumbai32
19SatMumbai34
20SunMumbai23
\n", 295 | "
" 296 | ], 297 | "text/plain": [ 298 | " Day variable value\n", 299 | "0 Mon Kerala 23\n", 300 | "1 Tue Kerala 32\n", 301 | "2 Wed Kerala 13\n", 302 | "3 Thu Kerala 42\n", 303 | "4 Fri Kerala 13\n", 304 | "5 Sat Kerala 43\n", 305 | "6 Sun Kerala 23\n", 306 | "7 Mon Kolkata 10\n", 307 | "8 Tue Kolkata 20\n", 308 | "9 Wed Kolkata 30\n", 309 | "10 Thu Kolkata 40\n", 310 | "11 Fri Kolkata 32\n", 311 | "12 Sat Kolkata 34\n", 312 | "13 Sun Kolkata 23\n", 313 | "14 Mon Mumbai 10\n", 314 | "15 Tue Mumbai 20\n", 315 | "16 Wed Mumbai 30\n", 316 | "17 Thu Mumbai 40\n", 317 | "18 Fri Mumbai 32\n", 318 | "19 Sat Mumbai 34\n", 319 | "20 Sun Mumbai 23" 320 | ] 321 | }, 322 | "execution_count": 10, 323 | "metadata": {}, 324 | "output_type": "execute_result" 325 | } 326 | ], 327 | "source": [ 328 | "df2=pd.melt(df,id_vars=['Day'],var_name='City',value_name='Temp')\n", 329 | "df2" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "metadata": {}, 336 | "outputs": [], 337 | "source": [] 338 | } 339 | ], 340 | "metadata": { 341 | "kernelspec": { 342 | "display_name": "Python 3", 343 | "language": "python", 344 | "name": "python3" 345 | }, 346 | "language_info": { 347 | "codemirror_mode": { 348 | "name": "ipython", 349 | "version": 3 350 | }, 351 | "file_extension": ".py", 352 | "mimetype": "text/x-python", 353 | "name": "python", 354 | "nbconvert_exporter": "python", 355 | "pygments_lexer": "ipython3", 356 | "version": "3.6.4" 357 | } 358 | }, 359 | "nbformat": 4, 360 | "nbformat_minor": 2 361 | } 362 | -------------------------------------------------------------------------------- /pandas_part3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Reading, writing CSV and Excel file
" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 16, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 17, 22 | "metadata": { 23 | "scrolled": true 24 | }, 25 | "outputs": [ 26 | { 27 | "data": { 28 | "text/html": [ 29 | "
\n", 30 | "\n", 43 | "\n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | "
Formatted DateSummaryPrecip TypeTemperature (C)Apparent Temperature (C)HumidityWind Speed (km/h)Wind Bearing (degrees)Visibility (km)Loud CoverPressure (millibars)Daily Summary
02006-04-01 00:00:00.000 +0200Partly Cloudyrain9.4722227.3888890.8914.1197251.015.82630.01015.13Partly cloudy throughout the day.
12006-04-01 01:00:00.000 +0200Partly Cloudyrain9.3555567.2277780.8614.2646259.015.82630.01015.63Partly cloudy throughout the day.
22006-04-01 02:00:00.000 +0200Mostly Cloudyrain9.3777789.3777780.893.9284204.014.95690.01015.94Partly cloudy throughout the day.
\n", 109 | "
" 110 | ], 111 | "text/plain": [ 112 | " Formatted Date Summary Precip Type Temperature (C) \\\n", 113 | "0 2006-04-01 00:00:00.000 +0200 Partly Cloudy rain 9.472222 \n", 114 | "1 2006-04-01 01:00:00.000 +0200 Partly Cloudy rain 9.355556 \n", 115 | "2 2006-04-01 02:00:00.000 +0200 Mostly Cloudy rain 9.377778 \n", 116 | "\n", 117 | " Apparent Temperature (C) Humidity Wind Speed (km/h) \\\n", 118 | "0 7.388889 0.89 14.1197 \n", 119 | "1 7.227778 0.86 14.2646 \n", 120 | "2 9.377778 0.89 3.9284 \n", 121 | "\n", 122 | " Wind Bearing (degrees) Visibility (km) Loud Cover Pressure (millibars) \\\n", 123 | "0 251.0 15.8263 0.0 1015.13 \n", 124 | "1 259.0 15.8263 0.0 1015.63 \n", 125 | "2 204.0 14.9569 0.0 1015.94 \n", 126 | "\n", 127 | " Daily Summary \n", 128 | "0 Partly cloudy throughout the day. \n", 129 | "1 Partly cloudy throughout the day. \n", 130 | "2 Partly cloudy throughout the day. " 131 | ] 132 | }, 133 | "execution_count": 17, 134 | "metadata": {}, 135 | "output_type": "execute_result" 136 | } 137 | ], 138 | "source": [ 139 | "#Reading CSV file\n", 140 | "d=pd.read_csv('datasets/weather.csv')\n", 141 | "d.head(3)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 18, 147 | "metadata": {}, 148 | "outputs": [ 149 | { 150 | "data": { 151 | "text/html": [ 152 | "
\n", 153 | "\n", 166 | "\n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | "
pclasssurvivednamesexagesibspparchticketfarecabinembarkedboatbodyhome.dest
011Allen, Miss. Elisabeth Waltonfemale290024160211.3375B5S2NoneSt Louis, MO
111Allison, Master. Hudson Trevormale0.916712113781151.5500C22 C26S11NoneMontreal, PQ / Chesterville, ON
210Allison, Miss. Helen Lorainefemale212113781151.5500C22 C26SNoneNoneMontreal, PQ / Chesterville, ON
\n", 240 | "
" 241 | ], 242 | "text/plain": [ 243 | " pclass survived name sex age sibsp \\\n", 244 | "0 1 1 Allen, Miss. Elisabeth Walton female 29 0 \n", 245 | "1 1 1 Allison, Master. Hudson Trevor male 0.9167 1 \n", 246 | "2 1 0 Allison, Miss. Helen Loraine female 2 1 \n", 247 | "\n", 248 | " parch ticket fare cabin embarked boat body \\\n", 249 | "0 0 24160 211.3375 B5 S 2 None \n", 250 | "1 2 113781 151.5500 C22 C26 S 11 None \n", 251 | "2 2 113781 151.5500 C22 C26 S None None \n", 252 | "\n", 253 | " home.dest \n", 254 | "0 St Louis, MO \n", 255 | "1 Montreal, PQ / Chesterville, ON \n", 256 | "2 Montreal, PQ / Chesterville, ON " 257 | ] 258 | }, 259 | "execution_count": 18, 260 | "metadata": {}, 261 | "output_type": "execute_result" 262 | } 263 | ], 264 | "source": [ 265 | "#Reading excel file\n", 266 | "df=pd.read_excel('datasets/titanic.xls')\n", 267 | "df.head(3)" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "### If you want to read only few rows rather than all" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 19, 280 | "metadata": {}, 281 | "outputs": [ 282 | { 283 | "data": { 284 | "text/html": [ 285 | "
\n", 286 | "\n", 299 | "\n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | "
Formatted DateSummaryPrecip TypeTemperature (C)Apparent Temperature (C)HumidityWind Speed (km/h)Wind Bearing (degrees)Visibility (km)Loud CoverPressure (millibars)Daily Summary
02006-04-01 00:00:00.000 +0200Partly Cloudyrain9.4722227.3888890.8914.1197251.015.82630.01015.13Partly cloudy throughout the day.
12006-04-01 01:00:00.000 +0200Partly Cloudyrain9.3555567.2277780.8614.2646259.015.82630.01015.63Partly cloudy throughout the day.
22006-04-01 02:00:00.000 +0200Mostly Cloudyrain9.3777789.3777780.893.9284204.014.95690.01015.94Partly cloudy throughout the day.
32006-04-01 03:00:00.000 +0200Partly Cloudyrain8.2888895.9444440.8314.1036269.015.82630.01016.41Partly cloudy throughout the day.
42006-04-01 04:00:00.000 +0200Mostly Cloudyrain8.7555566.9777780.8311.0446259.015.82630.01016.51Partly cloudy throughout the day.
52006-04-01 05:00:00.000 +0200Partly Cloudyrain9.2222227.1111110.8513.9587258.014.95690.01016.66Partly cloudy throughout the day.
62006-04-01 06:00:00.000 +0200Partly Cloudyrain7.7333335.5222220.9512.3648259.09.98200.01016.72Partly cloudy throughout the day.
\n", 425 | "
" 426 | ], 427 | "text/plain": [ 428 | " Formatted Date Summary Precip Type Temperature (C) \\\n", 429 | "0 2006-04-01 00:00:00.000 +0200 Partly Cloudy rain 9.472222 \n", 430 | "1 2006-04-01 01:00:00.000 +0200 Partly Cloudy rain 9.355556 \n", 431 | "2 2006-04-01 02:00:00.000 +0200 Mostly Cloudy rain 9.377778 \n", 432 | "3 2006-04-01 03:00:00.000 +0200 Partly Cloudy rain 8.288889 \n", 433 | "4 2006-04-01 04:00:00.000 +0200 Mostly Cloudy rain 8.755556 \n", 434 | "5 2006-04-01 05:00:00.000 +0200 Partly Cloudy rain 9.222222 \n", 435 | "6 2006-04-01 06:00:00.000 +0200 Partly Cloudy rain 7.733333 \n", 436 | "\n", 437 | " Apparent Temperature (C) Humidity Wind Speed (km/h) \\\n", 438 | "0 7.388889 0.89 14.1197 \n", 439 | "1 7.227778 0.86 14.2646 \n", 440 | "2 9.377778 0.89 3.9284 \n", 441 | "3 5.944444 0.83 14.1036 \n", 442 | "4 6.977778 0.83 11.0446 \n", 443 | "5 7.111111 0.85 13.9587 \n", 444 | "6 5.522222 0.95 12.3648 \n", 445 | "\n", 446 | " Wind Bearing (degrees) Visibility (km) Loud Cover Pressure (millibars) \\\n", 447 | "0 251.0 15.8263 0.0 1015.13 \n", 448 | "1 259.0 15.8263 0.0 1015.63 \n", 449 | "2 204.0 14.9569 0.0 1015.94 \n", 450 | "3 269.0 15.8263 0.0 1016.41 \n", 451 | "4 259.0 15.8263 0.0 1016.51 \n", 452 | "5 258.0 14.9569 0.0 1016.66 \n", 453 | "6 259.0 9.9820 0.0 1016.72 \n", 454 | "\n", 455 | " Daily Summary \n", 456 | "0 Partly cloudy throughout the day. \n", 457 | "1 Partly cloudy throughout the day. \n", 458 | "2 Partly cloudy throughout the day. \n", 459 | "3 Partly cloudy throughout the day. \n", 460 | "4 Partly cloudy throughout the day. \n", 461 | "5 Partly cloudy throughout the day. \n", 462 | "6 Partly cloudy throughout the day. " 463 | ] 464 | }, 465 | "execution_count": 19, 466 | "metadata": {}, 467 | "output_type": "execute_result" 468 | } 469 | ], 470 | "source": [ 471 | "df=pd.read_csv('datasets/weather.csv',nrows=7)\n", 472 | "df" 473 | ] 474 | }, 475 | { 476 | "cell_type": "markdown", 477 | "metadata": {}, 478 | "source": [ 479 | "### Changing specific values with NaN while reading\n", 480 | " * While reading the dataset you can change the specific value with NaN\n", 481 | " > df = pd.read_excel ( \" titanic.xls \" , na_values = [ list of element which you want to change to NaN ] ) " 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": 20, 487 | "metadata": {}, 488 | "outputs": [ 489 | { 490 | "data": { 491 | "text/html": [ 492 | "
\n", 493 | "\n", 506 | "\n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | "
pclasssurvivednamesexagesibspparchticketfarecabinembarkedboatbodyhome.dest
011Allen, Miss. Elisabeth Waltonfemale29.00000024160211.3375B5S2NaNSt Louis, MO
111Allison, Master. Hudson Trevormale0.916712113781151.5500C22 C26S11NaNMontreal, PQ / Chesterville, ON
210Allison, Miss. Helen Lorainefemale2.000012113781151.5500C22 C26SNaNNaNMontreal, PQ / Chesterville, ON
310Allison, Mr. Hudson Joshua Creightonmale30.000012113781151.5500C22 C26SNaN135.0Montreal, PQ / Chesterville, ON
410Allison, Mrs. Hudson J C (Bessie Waldo Daniels)female25.000012113781151.5500C22 C26SNaNNaNMontreal, PQ / Chesterville, ON
\n", 614 | "
" 615 | ], 616 | "text/plain": [ 617 | " pclass survived name sex \\\n", 618 | "0 1 1 Allen, Miss. Elisabeth Walton female \n", 619 | "1 1 1 Allison, Master. Hudson Trevor male \n", 620 | "2 1 0 Allison, Miss. Helen Loraine female \n", 621 | "3 1 0 Allison, Mr. Hudson Joshua Creighton male \n", 622 | "4 1 0 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) female \n", 623 | "\n", 624 | " age sibsp parch ticket fare cabin embarked boat body \\\n", 625 | "0 29.0000 0 0 24160 211.3375 B5 S 2 NaN \n", 626 | "1 0.9167 1 2 113781 151.5500 C22 C26 S 11 NaN \n", 627 | "2 2.0000 1 2 113781 151.5500 C22 C26 S NaN NaN \n", 628 | "3 30.0000 1 2 113781 151.5500 C22 C26 S NaN 135.0 \n", 629 | "4 25.0000 1 2 113781 151.5500 C22 C26 S NaN NaN \n", 630 | "\n", 631 | " home.dest \n", 632 | "0 St Louis, MO \n", 633 | "1 Montreal, PQ / Chesterville, ON \n", 634 | "2 Montreal, PQ / Chesterville, ON \n", 635 | "3 Montreal, PQ / Chesterville, ON \n", 636 | "4 Montreal, PQ / Chesterville, ON " 637 | ] 638 | }, 639 | "execution_count": 20, 640 | "metadata": {}, 641 | "output_type": "execute_result" 642 | } 643 | ], 644 | "source": [ 645 | "df = pd.read_excel(\"datasets/titanic.xls\",na_values=[None]) \n", 646 | "df.head()" 647 | ] 648 | }, 649 | { 650 | "cell_type": "markdown", 651 | "metadata": {}, 652 | "source": [ 653 | "### Changing specific values with NaN columns wise while reading" 654 | ] 655 | }, 656 | { 657 | "cell_type": "code", 658 | "execution_count": 21, 659 | "metadata": {}, 660 | "outputs": [ 661 | { 662 | "data": { 663 | "text/html": [ 664 | "
\n", 665 | "\n", 678 | "\n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | "
pclasssurvivednamesexagesibspparchticketfarecabinembarkedboatbodyhome.dest
011Allen, Miss. Elisabeth Waltonfemale290024160211.3375B5S2NoneSt Louis, MO
111Allison, Master. Hudson Trevormale0.916712113781151.5500C22 C26S11NoneMontreal, PQ / Chesterville, ON
210Allison, Miss. Helen Lorainefemale212113781151.5500C22 C26SNoneNoneMontreal, PQ / Chesterville, ON
310Allison, Mr. Hudson Joshua Creightonmale3012113781151.5500C22 C26SNone135Montreal, PQ / Chesterville, ON
410Allison, Mrs. Hudson J C (Bessie Waldo Daniels)female2512113781151.5500C22 C26SNoneNoneMontreal, PQ / Chesterville, ON
\n", 786 | "
" 787 | ], 788 | "text/plain": [ 789 | " pclass survived name sex \\\n", 790 | "0 1 1 Allen, Miss. Elisabeth Walton female \n", 791 | "1 1 1 Allison, Master. Hudson Trevor male \n", 792 | "2 1 0 Allison, Miss. Helen Loraine female \n", 793 | "3 1 0 Allison, Mr. Hudson Joshua Creighton male \n", 794 | "4 1 0 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) female \n", 795 | "\n", 796 | " age sibsp parch ticket fare cabin embarked boat body \\\n", 797 | "0 29 0 0 24160 211.3375 B5 S 2 None \n", 798 | "1 0.9167 1 2 113781 151.5500 C22 C26 S 11 None \n", 799 | "2 2 1 2 113781 151.5500 C22 C26 S None None \n", 800 | "3 30 1 2 113781 151.5500 C22 C26 S None 135 \n", 801 | "4 25 1 2 113781 151.5500 C22 C26 S None None \n", 802 | "\n", 803 | " home.dest \n", 804 | "0 St Louis, MO \n", 805 | "1 Montreal, PQ / Chesterville, ON \n", 806 | "2 Montreal, PQ / Chesterville, ON \n", 807 | "3 Montreal, PQ / Chesterville, ON \n", 808 | "4 Montreal, PQ / Chesterville, ON " 809 | ] 810 | }, 811 | "execution_count": 21, 812 | "metadata": {}, 813 | "output_type": "execute_result" 814 | } 815 | ], 816 | "source": [ 817 | "df=pd.read_excel('datasets/titanic.xls')\n", 818 | "df.head()" 819 | ] 820 | }, 821 | { 822 | "cell_type": "code", 823 | "execution_count": 22, 824 | "metadata": {}, 825 | "outputs": [], 826 | "source": [ 827 | "df=pd.read_excel('datasets/titanic.xls',na_values={'body':[None],'boat':[None],'parch':[2,0]})\n" 828 | ] 829 | }, 830 | { 831 | "cell_type": "markdown", 832 | "metadata": {}, 833 | "source": [ 834 | " * The na_values argument is used to replace all the values passed in the list with NaN. You can also replace some invalid values column-wise, as in the code above. If you run the above code you will get a dataframe where 'None' in the body column is replaced by NaN, 'None' in the boat column is replaced by NaN, and 2 & 0 in the parch column are replaced by NaN." 835 | ] 836 | }, 837 | { 838 | "cell_type": "markdown", 839 | "metadata": {}, 840 | "source": [ 841 | "### Writing into CSV or excel \n", 842 | " * You can convert any dataframe into a new CSV or excel file" 843 | ] 844 | }, 845 | { 846 | "cell_type": "code", 847 | "execution_count": 23, 848 | "metadata": {}, 849 | "outputs": [ 850 | { 851 | "data": { 852 | "text/html": [ 853 | "
\n", 854 | "\n", 867 | "\n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | "
datesdaytempwind-speed
002-01-12sunny4512
103-01-12rainy4634
204-01-12hot4745
305-01-12sunny4856
406-01-12hot4967
\n", 915 | "
" 916 | ], 917 | "text/plain": [ 918 | " dates day temp wind-speed\n", 919 | "0 02-01-12 sunny 45 12\n", 920 | "1 03-01-12 rainy 46 34\n", 921 | "2 04-01-12 hot 47 45\n", 922 | "3 05-01-12 sunny 48 56\n", 923 | "4 06-01-12 hot 49 67" 924 | ] 925 | }, 926 | "execution_count": 23, 927 | "metadata": {}, 928 | "output_type": "execute_result" 929 | } 930 | ], 931 | "source": [ 932 | "#Creating my own dictionary\n", 933 | "mydict={\n", 934 | " 'dates':['02-01-12','03-01-12','04-01-12','05-01-12','06-01-12'],\n", 935 | " 'day':['sunny','rainy','hot','sunny','hot'],\n", 936 | " 'wind-speed':[12,34,45,56,67],\n", 937 | " 'temp':[45,46,47,48,49]\n", 938 | " }\n", 939 | "#Converting the dictionary to a dataframe object\n", 940 | "df=pd.DataFrame(mydict)\n", 941 | "\n", 942 | "#Printing the dataframe\n", 943 | "df" 944 | ] 945 | }, 946 | { 947 | "cell_type": "markdown", 948 | "metadata": {}, 949 | "source": [ 950 | "### Creating a new file\n" 951 | ] 952 | }, 953 | { 954 | "cell_type": "markdown", 955 | "metadata": {}, 956 | "source": [ 957 | "#### Creating a new csv file mycsv.csv" 958 | ] 959 | }, 960 | { 961 | "cell_type": "markdown", 962 | "metadata": {}, 963 | "source": [ 964 | "Suppose you have read a csv or excel file as a dataframe object and made some modifications. Now you want to write a new csv or excel file that contains the modified dataframe. In that case you will need the to_csv() or to_excel() function to create a new file. See the example below -" 965 | ] 966 | }, 967 | { 968 | "cell_type": "code", 969 | "execution_count": 24, 970 | "metadata": {}, 971 | "outputs": [], 972 | "source": [ 973 | "df.to_csv('datasets/mycsv.csv',index=False)\n" 974 | ] 975 | }, 976 | { 977 | "cell_type": "markdown", 978 | "metadata": {}, 979 | "source": [ 980 | "#### Creating a new excel file myexcel.xls" 981 | ] 982 | }, 983 | { 984 | "cell_type": "code", 985 | "execution_count": 25, 986 | "metadata": {}, 987 | "outputs": [], 988 | "source": [ 989 | "df.to_excel('datasets/myexcel.xls',index=False)" 990 | ] 991 | }, 992 | { 993 | "cell_type": "markdown", 994 | "metadata": {}, 995 | "source": [ 996 | "### Writing only selected columns into csv or excel\n", 997 | "
\n", 998 | "Suppose you modified the dataframe and you want to write it to another csv file with only selected columns. In the following example we create the new csv file mycsv_few_columns.csv from the \"df\" dataframe, keeping only the three columns we want, i.e. day, temp and wind-speed, and ignoring the dates column." 999 | ] 1000 | }, 1001 | { 1002 | "cell_type": "markdown", 1003 | "metadata": {}, 1004 | "source": [ 1005 | "#### How many columns are in dataframe df ?" 1006 | ] 1007 | }, 1008 | { 1009 | "cell_type": "code", 1010 | "execution_count": 26, 1011 | "metadata": {}, 1012 | "outputs": [ 1013 | { 1014 | "data": { 1015 | "text/plain": [ 1016 | "Index(['dates', 'day', 'temp', 'wind-speed'], dtype='object')" 1017 | ] 1018 | }, 1019 | "execution_count": 26, 1020 | "metadata": {}, 1021 | "output_type": "execute_result" 1022 | } 1023 | ], 1024 | "source": [ 1025 | "df.columns" 1026 | ] 1027 | }, 1028 | { 1029 | "cell_type": "code", 1030 | "execution_count": 27, 1031 | "metadata": {}, 1032 | "outputs": [ 1033 | { 1034 | "data": { 1035 | "text/html": [ 1036 | "
\n", 1037 | "\n", 1050 | "\n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | "
daytempwind-speed
0sunny4512
1rainy4634
2hot4745
3sunny4856
4hot4967
\n", 1092 | "
" 1093 | ], 1094 | "text/plain": [ 1095 | " day temp wind-speed\n", 1096 | "0 sunny 45 12\n", 1097 | "1 rainy 46 34\n", 1098 | "2 hot 47 45\n", 1099 | "3 sunny 48 56\n", 1100 | "4 hot 49 67" 1101 | ] 1102 | }, 1103 | "execution_count": 27, 1104 | "metadata": {}, 1105 | "output_type": "execute_result" 1106 | } 1107 | ], 1108 | "source": [ 1109 | "#Writing only three columns day,temp,wind-speed\n", 1110 | "df.to_csv('datasets/mycsv_few_columns.csv',columns=['day','temp','wind-speed'],index=False)\n", 1111 | "\n", 1112 | "#Again reading to see the result\n", 1113 | "d=pd.read_csv('datasets/mycsv_few_columns.csv')\n", 1114 | "d" 1115 | ] 1116 | }, 1117 | { 1118 | "cell_type": "markdown", 1119 | "metadata": {}, 1120 | "source": [ 1121 | "### Converters :" 1122 | ] 1123 | }, 1124 | { 1125 | "cell_type": "markdown", 1126 | "metadata": {}, 1127 | "source": [ 1128 | "In many cases you don't have a clean dataset. Invalid values in the dataset lead to a lot of problems while predicting or extracting meaningful information. To avoid this problem we use converters.\n", 1129 | "* Converters are basically functions which convert specific values of a column into your desired value.\n", 1130 | "* Converter functions are passed in a dictionary, just like in na_values.\n" 1131 | ] 1132 | }, 1133 | { 1134 | "cell_type": "markdown", 1135 | "metadata": {}, 1136 | "source": [ 1137 | "This is our dataset, in which you can see that a lot of invalid values are present. These values are noise in our dataset." 1138 | ] 1139 | }, 1140 | { 1141 | "cell_type": "code", 1142 | "execution_count": 28, 1143 | "metadata": {}, 1144 | "outputs": [ 1145 | { 1146 | "data": { 1147 | "text/html": [ 1148 | "
\n", 1149 | "\n", 1162 | "\n", 1163 | " \n", 1164 | " \n", 1165 | " \n", 1166 | " \n", 1167 | " \n", 1168 | " \n", 1169 | " \n", 1170 | " \n", 1171 | " \n", 1172 | " \n", 1173 | " \n", 1174 | " \n", 1175 | " \n", 1176 | " \n", 1177 | " \n", 1178 | " \n", 1179 | " \n", 1180 | " \n", 1181 | " \n", 1182 | " \n", 1183 | " \n", 1184 | " \n", 1185 | " \n", 1186 | " \n", 1187 | " \n", 1188 | " \n", 1189 | " \n", 1190 | " \n", 1191 | " \n", 1192 | " \n", 1193 | " \n", 1194 | " \n", 1195 | " \n", 1196 | " \n", 1197 | " \n", 1198 | " \n", 1199 | " \n", 1200 | " \n", 1201 | " \n", 1202 | " \n", 1203 | " \n", 1204 | " \n", 1205 | " \n", 1206 | " \n", 1207 | " \n", 1208 | " \n", 1209 | " \n", 1210 | " \n", 1211 | " \n", 1212 | " \n", 1213 | " \n", 1214 | " \n", 1215 | " \n", 1216 | " \n", 1217 | " \n", 1218 | " \n", 1219 | " \n", 1220 | " \n", 1221 | " \n", 1222 | " \n", 1223 | " \n", 1224 | " \n", 1225 | " \n", 1226 | " \n", 1227 | " \n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | "
datesdaytempwind-speed
02/1/2012sunny45.012
13/1/2012rainy46.034
24/1/2012hot47.045
35/1/2012NaNNaN56
46/1/2012hot49.0Not available
57/1/2012NaNNaNNot available
68/1/2012hot12.045
79/1/2012rainy23.041
810/1/2012NaNNaNNaN
911/1/2012NaNNaNNaN
\n", 1245 | "
" 1246 | ], 1247 | "text/plain": [ 1248 | " dates day temp wind-speed\n", 1249 | "0 2/1/2012 sunny 45.0 12\n", 1250 | "1 3/1/2012 rainy 46.0 34\n", 1251 | "2 4/1/2012 hot 47.0 45\n", 1252 | "3 5/1/2012 NaN NaN 56\n", 1253 | "4 6/1/2012 hot 49.0 Not available\n", 1254 | "5 7/1/2012 NaN NaN Not available\n", 1255 | "6 8/1/2012 hot 12.0 45\n", 1256 | "7 9/1/2012 rainy 23.0 41\n", 1257 | "8 10/1/2012 NaN NaN NaN\n", 1258 | "9 11/1/2012 NaN NaN NaN" 1259 | ] 1260 | }, 1261 | "execution_count": 28, 1262 | "metadata": {}, 1263 | "output_type": "execute_result" 1264 | } 1265 | ], 1266 | "source": [ 1267 | "df=pd.read_csv('datasets/season.csv')\n", 1268 | "df" 1269 | ] 1270 | }, 1271 | { 1272 | "cell_type": "markdown", 1273 | "metadata": {}, 1274 | "source": [ 1275 | "#### The following functions replace the string 'NaN' in a cell with a default value; we will apply one to the 'temp' column, and likewise to the 'day' and 'wind-speed' columns." 1276 | ] 1277 | }, 1278 | { 1279 | "cell_type": "code", 1280 | "execution_count": 37, 1281 | "metadata": {}, 1282 | "outputs": [], 1283 | "source": [ 1284 | "def converter_for_temp(col):\n", 1285 | " if col=='NaN':\n", 1286 | " return 40\n", 1287 | " else:\n", 1288 | " return col\n", 1289 | "def converter_for_day(col):\n", 1290 | " if col=='NaN':\n", 1291 | " return 'sunny'\n", 1292 | " else:\n", 1293 | " return col\n", 1294 | "def converter_for_wind_speed(col):\n", 1295 | " if col=='Not available':\n", 1296 | " return 30\n", 1297 | " elif col==\"NaN\":\n", 1298 | " return 48\n", 1299 | " else:\n", 1300 | " return col" 1301 | ] 1302 | }, 1303 | { 1304 | "cell_type": "code", 1305 | "execution_count": 38, 1306 | "metadata": {}, 1307 | "outputs": [ 1308 | { 1309 | "data": { 1310 | "text/html": [ 1311 | "
\n", 1312 | "\n", 1325 | "\n", 1326 | " \n", 1327 | " \n", 1328 | " \n", 1329 | " \n", 1330 | " \n", 1331 | " \n", 1332 | " \n", 1333 | " \n", 1334 | " \n", 1335 | " \n", 1336 | " \n", 1337 | " \n", 1338 | " \n", 1339 | " \n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | " \n", 1346 | " \n", 1347 | " \n", 1348 | " \n", 1349 | " \n", 1350 | " \n", 1351 | " \n", 1352 | " \n", 1353 | " \n", 1354 | " \n", 1355 | " \n", 1356 | " \n", 1357 | " \n", 1358 | " \n", 1359 | " \n", 1360 | " \n", 1361 | " \n", 1362 | " \n", 1363 | " \n", 1364 | " \n", 1365 | " \n", 1366 | " \n", 1367 | " \n", 1368 | " \n", 1369 | " \n", 1370 | " \n", 1371 | " \n", 1372 | " \n", 1373 | " \n", 1374 | " \n", 1375 | " \n", 1376 | " \n", 1377 | " \n", 1378 | " \n", 1379 | " \n", 1380 | " \n", 1381 | " \n", 1382 | " \n", 1383 | " \n", 1384 | " \n", 1385 | " \n", 1386 | " \n", 1387 | " \n", 1388 | " \n", 1389 | " \n", 1390 | " \n", 1391 | " \n", 1392 | " \n", 1393 | " \n", 1394 | " \n", 1395 | " \n", 1396 | " \n", 1397 | " \n", 1398 | " \n", 1399 | " \n", 1400 | " \n", 1401 | " \n", 1402 | " \n", 1403 | " \n", 1404 | " \n", 1405 | " \n", 1406 | " \n", 1407 | "
datesdaytempwind-speed
02/1/2012sunny4512
13/1/2012rainy4634
24/1/2012hot4745
35/1/2012sunny4056
46/1/2012hot4930
57/1/2012sunny4030
68/1/2012hot1245
79/1/2012rainy2341
810/1/2012sunny4048
911/1/2012sunny4048
\n", 1408 | "
" 1409 | ], 1410 | "text/plain": [ 1411 | " dates day temp wind-speed\n", 1412 | "0 2/1/2012 sunny 45 12\n", 1413 | "1 3/1/2012 rainy 46 34\n", 1414 | "2 4/1/2012 hot 47 45\n", 1415 | "3 5/1/2012 sunny 40 56\n", 1416 | "4 6/1/2012 hot 49 30\n", 1417 | "5 7/1/2012 sunny 40 30\n", 1418 | "6 8/1/2012 hot 12 45\n", 1419 | "7 9/1/2012 rainy 23 41\n", 1420 | "8 10/1/2012 sunny 40 48\n", 1421 | "9 11/1/2012 sunny 40 48" 1422 | ] 1423 | }, 1424 | "execution_count": 38, 1425 | "metadata": {}, 1426 | "output_type": "execute_result" 1427 | } 1428 | ], 1429 | "source": [ 1430 | "df=pd.read_csv('datasets/season.csv',converters={\n", 1431 | " 'day':converter_for_day,\n", 1432 | " 'temp':converter_for_temp,\n", 1433 | " 'wind-speed':converter_for_wind_speed\n", 1434 | " })\n", 1435 | "df" 1436 | ] 1437 | }, 1438 | { 1439 | "cell_type": "markdown", 1440 | "metadata": {}, 1441 | "source": [ 1442 | "You can observe that previously the columns 'day', 'temp' and 'wind-speed' had some invalid data like 'NaN' and 'Not available', but after applying the converter functions we get a cleaned dataset. Now we can apply data analysis techniques to predict something from our dataset." 1443 | ] 1444 | }, 1445 | { 1446 | "cell_type": "markdown", 1447 | "metadata": {}, 1448 | "source": [ 1449 | "### Writing different dataframes into one file but different sheet names" 1450 | ] 1451 | }, 1452 | { 1453 | "cell_type": "markdown", 1454 | "metadata": {}, 1455 | "source": [ 1456 | "Let's assume that you have two different dataframes and you want to write them into the same excel file but under different sheet names. \n", 1457 | "
\n", 1458 | "\n", 1459 | "Let's take two dictionaries, one called \"weather1\" and another called \"house1\", and turn them into two dataframes, \"weather\" & \"house\"" 1460 | ] 1461 | }, 1462 | { 1463 | "cell_type": "code", 1464 | "execution_count": 31, 1465 | "metadata": {}, 1466 | "outputs": [], 1467 | "source": [ 1468 | "#Creating the weather dictionary\n", 1469 | "weather1={\n", 1470 | " 'dates':['02-01-12','03-01-12','04-01-12','05-01-12','06-01-12'],\n", 1471 | " 'day':['sunny','rainy','hot','sunny','hot'],\n", 1472 | " 'wind-speed':[12,34,45,56,67],\n", 1473 | " 'temp':[45,46,47,48,49]\n", 1474 | " }\n", 1475 | "#Converting the dictionary to a dataframe object\n", 1476 | "weather=pd.DataFrame(weather1)\n", 1477 | "\n", 1478 | "#Creating the house dictionary\n", 1479 | "house1={\n", 1480 | " 'dates':['02-01-12','03-01-12','04-01-12','05-01-12','06-01-12'],\n", 1481 | " 'price':[20000,30000,40000,50000,60000],\n", 1482 | " 'bhk':[1,3,2,1,2],\n", 1483 | " 'how-old':[2,5,2,7,4]\n", 1484 | " }\n", 1485 | "\n", 1486 | "#Converting the house dictionary to a dataframe object\n", 1487 | "\n", 1488 | "house=pd.DataFrame(house1)\n" 1489 | ] 1490 | }, 1491 | { 1492 | "cell_type": "markdown", 1493 | "metadata": {}, 1494 | "source": [ 1495 | " Create an \"ExcelWriter\" object called \"writer\". Now call the to_excel() function and pass three arguments -\n", 1496 | " * 1. the \"writer\" object\n", 1497 | " * 2. sheet_name : the name of the sheet to write to\n", 1498 | " * 3. index : this one is optional\n", 1499 | " " 1500 | ] 1501 | }, 1502 | { 1503 | "cell_type": "code", 1504 | "execution_count": 32, 1505 | "metadata": {}, 1506 | "outputs": [], 1507 | "source": [ 1508 | "with pd.ExcelWriter('datasets/weather_and_house.xls') as writer:\n", 1509 | " weather.to_excel(writer,sheet_name='Weather',index=False)\n", 1510 | " house.to_excel(writer,sheet_name='House',index=False)\n" 1511 | ] 1512 | }, 1513 | { 1514 | "cell_type": "markdown", 1515 | "metadata": {}, 1516 | "source": [ 1517 | "#### Now access each sheet separately by passing its name while reading\n", 1518 | "See the example" 1519 | ] 1520 | }, 1521 | { 1522 | "cell_type": "code", 1523 | "execution_count": 33, 1524 | "metadata": {}, 1525 | "outputs": [ 1526 | { 1527 | "data": { 1528 | "text/html": [ 1529 | "
\n", 1530 | "\n", 1543 | "\n", 1544 | " \n", 1545 | " \n", 1546 | " \n", 1547 | " \n", 1548 | " \n", 1549 | " \n", 1550 | " \n", 1551 | " \n", 1552 | " \n", 1553 | " \n", 1554 | " \n", 1555 | " \n", 1556 | " \n", 1557 | " \n", 1558 | " \n", 1559 | " \n", 1560 | " \n", 1561 | " \n", 1562 | " \n", 1563 | " \n", 1564 | " \n", 1565 | " \n", 1566 | " \n", 1567 | " \n", 1568 | " \n", 1569 | " \n", 1570 | " \n", 1571 | " \n", 1572 | " \n", 1573 | " \n", 1574 | " \n", 1575 | " \n", 1576 | " \n", 1577 | " \n", 1578 | " \n", 1579 | " \n", 1580 | " \n", 1581 | " \n", 1582 | " \n", 1583 | " \n", 1584 | " \n", 1585 | " \n", 1586 | " \n", 1587 | " \n", 1588 | " \n", 1589 | " \n", 1590 | "
datesdaytempwind-speed
002-01-12sunny4512
103-01-12rainy4634
204-01-12hot4745
305-01-12sunny4856
406-01-12hot4967
\n", 1591 | "
" 1592 | ], 1593 | "text/plain": [ 1594 | " dates day temp wind-speed\n", 1595 | "0 02-01-12 sunny 45 12\n", 1596 | "1 03-01-12 rainy 46 34\n", 1597 | "2 04-01-12 hot 47 45\n", 1598 | "3 05-01-12 sunny 48 56\n", 1599 | "4 06-01-12 hot 49 67" 1600 | ] 1601 | }, 1602 | "execution_count": 33, 1603 | "metadata": {}, 1604 | "output_type": "execute_result" 1605 | } 1606 | ], 1607 | "source": [ 1608 | "d1=pd.read_excel('datasets/weather_and_house.xls','Weather')\n", 1609 | "d1" 1610 | ] 1611 | }, 1612 | { 1613 | "cell_type": "code", 1614 | "execution_count": 34, 1615 | "metadata": {}, 1616 | "outputs": [ 1617 | { 1618 | "data": { 1619 | "text/html": [ 1620 | "
\n", 1621 | "\n", 1634 | "\n", 1635 | " \n", 1636 | " \n", 1637 | " \n", 1638 | " \n", 1639 | " \n", 1640 | " \n", 1641 | " \n", 1642 | " \n", 1643 | " \n", 1644 | " \n", 1645 | " \n", 1646 | " \n", 1647 | " \n", 1648 | " \n", 1649 | " \n", 1650 | " \n", 1651 | " \n", 1652 | " \n", 1653 | " \n", 1654 | " \n", 1655 | " \n", 1656 | " \n", 1657 | " \n", 1658 | " \n", 1659 | " \n", 1660 | " \n", 1661 | " \n", 1662 | " \n", 1663 | " \n", 1664 | " \n", 1665 | " \n", 1666 | " \n", 1667 | " \n", 1668 | " \n", 1669 | " \n", 1670 | " \n", 1671 | " \n", 1672 | " \n", 1673 | " \n", 1674 | " \n", 1675 | " \n", 1676 | " \n", 1677 | " \n", 1678 | " \n", 1679 | " \n", 1680 | " \n", 1681 | "
bhkdateshow-oldprice
0102-01-12220000
1303-01-12530000
2204-01-12240000
3105-01-12750000
4206-01-12460000
\n", 1682 | "
" 1683 | ], 1684 | "text/plain": [ 1685 | " bhk dates how-old price\n", 1686 | "0 1 02-01-12 2 20000\n", 1687 | "1 3 03-01-12 5 30000\n", 1688 | "2 2 04-01-12 2 40000\n", 1689 | "3 1 05-01-12 7 50000\n", 1690 | "4 2 06-01-12 4 60000" 1691 | ] 1692 | }, 1693 | "execution_count": 34, 1694 | "metadata": {}, 1695 | "output_type": "execute_result" 1696 | } 1697 | ], 1698 | "source": [ 1699 | "d2=pd.read_excel('datasets/weather_and_house.xls','House')\n", 1700 | "d2" 1701 | ] 1702 | }, 1703 | { 1704 | "cell_type": "markdown", 1705 | "metadata": {}, 1706 | "source": [] 1707 | } 1708 | ], 1709 | "metadata": { 1710 | "kernelspec": { 1711 | "display_name": "Python 3", 1712 | "language": "python", 1713 | "name": "python3" 1714 | }, 1715 | "language_info": { 1716 | "codemirror_mode": { 1717 | "name": "ipython", 1718 | "version": 3 1719 | }, 1720 | "file_extension": ".py", 1721 | "mimetype": "text/x-python", 1722 | "name": "python", 1723 | "nbconvert_exporter": "python", 1724 | "pygments_lexer": "ipython3", 1725 | "version": "3.6.4" 1726 | } 1727 | }, 1728 | "nbformat": 4, 1729 | "nbformat_minor": 2 1730 | } 1731 | -------------------------------------------------------------------------------- /pandas_part5.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Handling Missing data part-2\n", 8 | "------------" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 7, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "import pandas as pd\n", 18 | "import numpy as np" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 17, 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "data": { 28 | "text/html": [ 29 | "
\n", 30 | "\n", 43 | "\n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | "
AgeNameNo_of_pkgPackagetravel_id
020yrsBikash Kumar1 packages$1001
121yrsAshish Shaw5 packages$2002
223yearsDipak Kumar2pkgs$1003
320 YearsJohn Doe3 pkgs$1004
42000Elisha5000$4005
55000Md Shahid10 packages$2006
621 yrsAdrika Roy7pkgs$3007
724 yrsShashi Kumar2000$5008
\n", 121 | "
" 122 | ], 123 | "text/plain": [ 124 | " Age Name No_of_pkg Package travel_id\n", 125 | "0 20yrs Bikash Kumar 1 packages $100 1\n", 126 | "1 21yrs Ashish Shaw 5 packages $200 2\n", 127 | "2 23years Dipak Kumar 2pkgs $100 3\n", 128 | "3 20 Years John Doe 3 pkgs $100 4\n", 129 | "4 2000 Elisha 5000 $400 5\n", 130 | "5 5000 Md Shahid 10 packages $200 6\n", 131 | "6 21 yrs Adrika Roy 7pkgs $300 7\n", 132 | "7 24 yrs Shashi Kumar 2000 $500 8" 133 | ] 134 | }, 135 | "execution_count": 17, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "df=pd.read_csv(\"datasets/travel.csv\")\n", 142 | "df" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "### Replacing specific value some another value" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | " Suppose your dataframe contains some invalid values and you want to replace it with some other values like 0 or NaN.\n", 157 | " In this case special values are 5000 and 2000.You can see the following result where 2000 and 5000 is replaced by NaN\n", 158 | " value\n" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 19, 164 | "metadata": {}, 165 | "outputs": [ 166 | { 167 | "data": { 168 | "text/html": [ 169 | "
\n", 170 | "\n", 183 | "\n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | "
AgeNameNo_of_pkgPackagetravel_id
020yrsBikash Kumar1 packages$1001
121yrsAshish Shaw5 packages$2002
223yearsDipak Kumar2pkgs$1003
320 YearsJohn Doe3 pkgs$1004
4NaNElishaNaN$4005
5NaNMd Shahid10 packages$2006
621 yrsAdrika Roy7pkgs$3007
724 yrsShashi KumarNaN$5008
\n", 261 | "
" 262 | ], 263 | "text/plain": [ 264 | " Age Name No_of_pkg Package travel_id\n", 265 | "0 20yrs Bikash Kumar 1 packages $100 1\n", 266 | "1 21yrs Ashish Shaw 5 packages $200 2\n", 267 | "2 23years Dipak Kumar 2pkgs $100 3\n", 268 | "3 20 Years John Doe 3 pkgs $100 4\n", 269 | "4 NaN Elisha NaN $400 5\n", 270 | "5 NaN Md Shahid 10 packages $200 6\n", 271 | "6 21 yrs Adrika Roy 7pkgs $300 7\n", 272 | "7 24 yrs Shashi Kumar NaN $500 8" 273 | ] 274 | }, 275 | "execution_count": 19, 276 | "metadata": {}, 277 | "output_type": "execute_result" 278 | } 279 | ], 280 | "source": [ 281 | "df2=df.replace([\"5000\",\"2000\"],np.NaN)\n", 282 | "df2" 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "metadata": {}, 288 | "source": [ 289 | " There is one problem with this approach is it will replace all the values which you have passed in the list with\n", 290 | " you your value but in many other cases you dont want it like if you have 50000 in price column it is valid but if\n", 291 | " 50000 is in name column it is not valid in this case.So you only want to replace 50000 of name column with NaN but\n", 292 | " not of price column.\n", 293 | " In that case you need to pass the disctionary in the replace column.This disctionary will contain name of the column\n", 294 | " and the value you want to replace" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 21, 300 | "metadata": {}, 301 | "outputs": [ 302 | { 303 | "data": { 304 | "text/html": [ 305 | "
\n", 306 | "\n", 319 | "\n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | "
AgeNameNo_of_pkgPackagetravel_id
020yrsBikash Kumar1 packages$1001
121yrsAshish Shaw5 packages$2002
223yearsDipak Kumar2pkgs$1003
320 YearsJohn Doe3 pkgs$1004
4NaNElishaNaN$4005
5NaNMd Shahid10 packages$2006
621 yrsAdrika Roy7pkgs$3007
724 yrsShashi KumarNaN$5008
\n", 397 | "
" 398 | ], 399 | "text/plain": [ 400 | " Age Name No_of_pkg Package travel_id\n", 401 | "0 20yrs Bikash Kumar 1 packages $100 1\n", 402 | "1 21yrs Ashish Shaw 5 packages $200 2\n", 403 | "2 23years Dipak Kumar 2pkgs $100 3\n", 404 | "3 20 Years John Doe 3 pkgs $100 4\n", 405 | "4 NaN Elisha NaN $400 5\n", 406 | "5 NaN Md Shahid 10 packages $200 6\n", 407 | "6 21 yrs Adrika Roy 7pkgs $300 7\n", 408 | "7 24 yrs Shashi Kumar NaN $500 8" 409 | ] 410 | }, 411 | "execution_count": 21, 412 | "metadata": {}, 413 | "output_type": "execute_result" 414 | } 415 | ], 416 | "source": [ 417 | "df2=df.replace({\n", 418 | " 'Age':[\"2000\",\"5000\"],\n", 419 | " 'No_of_pkg':[\"2000\",\"5000\"],\n", 420 | " \"travel_id\":[0]\n", 421 | "},np.NaN)\n", 422 | "df2" 423 | ] 424 | }, 425 | { 426 | "cell_type": "markdown", 427 | "metadata": {}, 428 | "source": [ 429 | " If you want to replace some specific value like 5000 with any other value and so on.In that case you need \n", 430 | " to pass the disctionary with all keys which you want to replace and values which you want to replace with.\n", 431 | " Here 5000,2000,8 are values to replaced are the keys and np.NaN & 10 are values to be replaced with are value\n", 432 | " of the disctionary." 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": 25, 438 | "metadata": {}, 439 | "outputs": [ 440 | { 441 | "data": { 442 | "text/html": [ 443 | "
\n", 444 | "\n", 457 | "\n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | "
AgeNameNo_of_pkgPackagetravel_id
020yrsBikash Kumar1 packages$1001
121yrsAshish Shaw5 packages$2002
223yearsDipak Kumar2pkgs$1003
320 YearsJohn Doe3 pkgs$1004
4NaNElishaNaN$4005
5NaNMd Shahid10 packages$2006
621 yrsAdrika Roy7pkgs$3007
724 yrsShashi KumarNaN$50010
\n", 535 | "
" 536 | ], 537 | "text/plain": [ 538 | " Age Name No_of_pkg Package travel_id\n", 539 | "0 20yrs Bikash Kumar 1 packages $100 1\n", 540 | "1 21yrs Ashish Shaw 5 packages $200 2\n", 541 | "2 23years Dipak Kumar 2pkgs $100 3\n", 542 | "3 20 Years John Doe 3 pkgs $100 4\n", 543 | "4 NaN Elisha NaN $400 5\n", 544 | "5 NaN Md Shahid 10 packages $200 6\n", 545 | "6 21 yrs Adrika Roy 7pkgs $300 7\n", 546 | "7 24 yrs Shashi Kumar NaN $500 10" 547 | ] 548 | }, 549 | "execution_count": 25, 550 | "metadata": {}, 551 | "output_type": "execute_result" 552 | } 553 | ], 554 | "source": [ 555 | "df2=df.replace({\n", 556 | " \"5000\":np.NaN,\n", 557 | " \"2000\":np.NaN,\n", 558 | " 8:10\n", 559 | "})\n", 560 | "df2" 561 | ] 562 | }, 563 | { 564 | "cell_type": "markdown", 565 | "metadata": {}, 566 | "source": [ 567 | "**Note:** All the values in the dataframe that belongs to keys of disctionary will be replaced no matter what column it is." 568 | ] 569 | }, 570 | { 571 | "cell_type": "markdown", 572 | "metadata": {}, 573 | "source": [ 574 | "### Removing unnecessary character from columns" 575 | ] 576 | }, 577 | { 578 | "cell_type": "markdown", 579 | "metadata": {}, 580 | "source": [ 581 | " Suppose your dataframe contains unnecessary characters with your data values.Here years/yrs/Yrs/Years\n", 582 | " in Age column, same in the No_of_pkg & Package columns are unnecessary charactors which you dont want \n", 583 | " and these charactor will prevent you from applying any kind of operation in data analysis.So you want\n", 584 | " to get rid of it.In that case you have to pass regex as a value and column name as a key of the \n", 585 | " disctionary which you have passed in the replace() function as well as you also have to set regex=True\n", 586 | " and a pass an empty string\n", 587 | " \n", 588 | "* ** [A-Za-z] ** : This is the regex of all the character from A to Z and a to z.\n", 589 | "* \\$ : This is the regex for **$** sign.\n", 590 | "* For futher information about regex go to this link https://medium.com/factory-mind/regex-tutorial-a-simple-cheatsheet-by-examples-649dc1c3f285\n" 591 | ] 592 | }, 593 | { 594 | "cell_type": "code", 595 | "execution_count": 26, 596 | "metadata": {}, 597 | "outputs": [ 598 | { 599 | "data": { 600 | "text/html": [ 601 | "
\n", 602 | "\n", 615 | "\n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | "
AgeNameNo_of_pkgPackagetravel_id
020Bikash Kumar11001
121Ashish Shaw52002
223Dipak Kumar21003
320John Doe31004
4NaNElishaNaN4005
5NaNMd Shahid102006
621Adrika Roy73007
724Shashi KumarNaN50010
\n", 693 | "
" 694 | ], 695 | "text/plain": [ 696 | " Age Name No_of_pkg Package travel_id\n", 697 | "0 20 Bikash Kumar 1 100 1\n", 698 | "1 21 Ashish Shaw 5 200 2\n", 699 | "2 23 Dipak Kumar 2 100 3\n", 700 | "3 20 John Doe 3 100 4\n", 701 | "4 NaN Elisha NaN 400 5\n", 702 | "5 NaN Md Shahid 10 200 6\n", 703 | "6 21 Adrika Roy 7 300 7\n", 704 | "7 24 Shashi Kumar NaN 500 10" 705 | ] 706 | }, 707 | "execution_count": 26, 708 | "metadata": {}, 709 | "output_type": "execute_result" 710 | } 711 | ], 712 | "source": [ 713 | "df3=df2.replace({\n", 714 | " 'Age':'[A-Za-z]',\n", 715 | " 'No_of_pkg':'[A-Za-z]',\n", 716 | " 'Package':'\\$'\n", 717 | "},\"\",regex=True)\n", 718 | "df3" 719 | ] 720 | }, 721 | { 722 | "cell_type": "markdown", 723 | "metadata": {}, 724 | "source": [ 725 | "### Mapping from one list to another list " 726 | ] 727 | }, 728 | { 729 | "cell_type": "markdown", 730 | "metadata": {}, 731 | "source": [ 732 | " If your dataset contains data which is repeating more than once or you want to change some set of string in to\n", 733 | " number then you have apply list mapping." 734 | ] 735 | }, 736 | { 737 | "cell_type": "code", 738 | "execution_count": 29, 739 | "metadata": {}, 740 | "outputs": [ 741 | { 742 | "data": { 743 | "text/html": [ 744 | "
\n", 745 | "\n", 758 | "\n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | "
gradesname
0poorShahid
1excellentAdrika
2very goodBikash
3averageAshish
4goodGanesh
5very goodZahid
6outstandingMohan
7poorSohan
\n", 809 | "
" 810 | ], 811 | "text/plain": [ 812 | " grades name\n", 813 | "0 poor Shahid\n", 814 | "1 excellent Adrika\n", 815 | "2 very good Bikash\n", 816 | "3 average Ashish\n", 817 | "4 good Ganesh\n", 818 | "5 very good Zahid\n", 819 | "6 outstanding Mohan\n", 820 | "7 poor Sohan" 821 | ] 822 | }, 823 | "execution_count": 29, 824 | "metadata": {}, 825 | "output_type": "execute_result" 826 | } 827 | ], 828 | "source": [ 829 | "mydis={\n", 830 | " \"name\":[\"Shahid\",\"Adrika\",\"Bikash\",\"Ashish\",\"Ganesh\",\"Zahid\",\"Mohan\",\"Sohan\"],\n", 831 | " \"grades\":[\"poor\",\"excellent\",\"very good\",\"average\",\"good\",\"very good\",\"outstanding\",\"poor\"]\n", 832 | " }\n", 833 | "df=pd.DataFrame(mydis)\n", 834 | "df" 835 | ] 836 | }, 837 | { 838 | "cell_type": "code", 839 | "execution_count": 30, 840 | "metadata": {}, 841 | "outputs": [ 842 | { 843 | "data": { 844 | "text/html": [ 845 | "
\n", 846 | "\n", 859 | "\n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | "
gradesname
05Shahid
19Adrika
28Bikash
36Ashish
47Ganesh
58Zahid
610Mohan
75Sohan
\n", 910 | "
" 911 | ], 912 | "text/plain": [ 913 | " grades name\n", 914 | "0 5 Shahid\n", 915 | "1 9 Adrika\n", 916 | "2 8 Bikash\n", 917 | "3 6 Ashish\n", 918 | "4 7 Ganesh\n", 919 | "5 8 Zahid\n", 920 | "6 10 Mohan\n", 921 | "7 5 Sohan" 922 | ] 923 | }, 924 | "execution_count": 30, 925 | "metadata": {}, 926 | "output_type": "execute_result" 927 | } 928 | ], 929 | "source": [ 930 | "df2=df.replace([\"poor\",\"average\",\"good\",\"very good\",\"excellent\",\"outstanding\"],[5,6,7,8,9,10])\n", 931 | "df2" 932 | ] 933 | } 934 | ], 935 | "metadata": { 936 | "kernelspec": { 937 | "display_name": "Python 3", 938 | "language": "python", 939 | "name": "python3" 940 | }, 941 | "language_info": { 942 | "codemirror_mode": { 943 | "name": "ipython", 944 | "version": 3 945 | }, 946 | "file_extension": ".py", 947 | "mimetype": "text/x-python", 948 | "name": "python", 949 | "nbconvert_exporter": "python", 950 | "pygments_lexer": "ipython3", 951 | "version": "3.6.4" 952 | } 953 | }, 954 | "nbformat": 4, 955 | "nbformat_minor": 2 956 | } 957 | -------------------------------------------------------------------------------- /pandas_part6.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Groupby : Split, Apply and Combine\n", 8 | "---------------\n", 9 | " Groupby is one of the important operations in data analysis.It includes three steps -
\n", 10 | " \n", 11 | " \n", 22 | " \n", 23 | " Groupby property is grouped the data according the column supplied to the function.In the following example\n", 24 | " you can see the i have grouped the dataframe df by its team." 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 28, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "import pandas as pd\n", 34 | "import numpy as np" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 22, 40 | "metadata": {}, 41 | "outputs": [ 42 | { 43 | "data": { 44 | "text/html": [ 45 | "
\n", 46 | "\n", 59 | "\n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | "
MatchRunYearteam
023302012India
142302012New zealand
223002012Australia
311802012India
452002013India
562502013New zealand
631902013Australia
724002013India
853402014New zealand
932902014Australia
1013902014New zealand
1113332014Australia
\n", 156 | "
" 157 | ], 158 | "text/plain": [ 159 | " Match Run Year team\n", 160 | "0 2 330 2012 India\n", 161 | "1 4 230 2012 New zealand\n", 162 | "2 2 300 2012 Australia\n", 163 | "3 1 180 2012 India\n", 164 | "4 5 200 2013 India\n", 165 | "5 6 250 2013 New zealand\n", 166 | "6 3 190 2013 Australia\n", 167 | "7 2 400 2013 India\n", 168 | "8 5 340 2014 New zealand\n", 169 | "9 3 290 2014 Australia\n", 170 | "10 1 390 2014 New zealand\n", 171 | "11 1 333 2014 Australia" 172 | ] 173 | }, 174 | "execution_count": 22, 175 | "metadata": {}, 176 | "output_type": "execute_result" 177 | } 178 | ], 179 | "source": [ 180 | "d={\n", 181 | " 'team':[\"India\",\"New zealand\",\"Australia\",\"India\",\n", 182 | " \"India\",\"New zealand\",\"Australia\",\"India\",\n", 183 | " \"New zealand\",\"Australia\",\"New zealand\",\"Australia\"],\n", 184 | " 'Run':[330,230,300,180,200,250,190,400,340,290,390,333],\n", 185 | " \"Match\":[2,4,2,1,5,6,3,2,5,3,1,1],\n", 186 | " \"Year\":['2012','2012','2012','2012','2013','2013','2013','2013','2014','2014','2014','2014']\n", 187 | " }\n", 188 | "df=pd.DataFrame(d)\n", 189 | "df" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | " groupby() function will return an object.we can imagine that every group is pointing to its dataframe.\n", 197 | " " 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 4, 203 | "metadata": {}, 204 | "outputs": [ 205 | { 206 | "data": { 207 | "text/plain": [ 208 | "" 209 | ] 210 | }, 211 | "execution_count": 4, 212 | "metadata": {}, 213 | "output_type": "execute_result" 214 | } 215 | ], 216 | "source": [ 217 | "g=df.groupby('team')\n", 218 | "g" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 14, 224 | "metadata": {}, 225 | "outputs": [ 226 | { 227 | "name": "stdout", 228 | "output_type": "stream", 229 | "text": [ 230 | "Australia\n", 231 | "----------\n", 232 | " Match Run Year team\n", 233 | "2 2 300 2012 Australia\n", 234 | "6 3 190 2013 Australia\n", 235 | "9 3 290 2014 Australia\n", 236 | "11 1 333 2014 Australia\n", 237 | "-----------------------------------\n", 238 | "India\n", 239 | "----------\n", 240 | " Match Run Year team\n", 241 | "0 2 330 2012 India\n", 242 | "3 1 180 2012 India\n", 243 | "4 5 200 2013 India\n", 244 | "7 2 400 2013 India\n", 245 | "-----------------------------------\n", 246 | "New zealand\n", 247 | "----------\n", 248 | " Match Run Year team\n", 249 | "1 4 230 2012 New zealand\n", 250 | "5 6 250 2013 New zealand\n", 251 | "8 5 340 2014 New zealand\n", 252 | "10 1 390 2014 New zealand\n", 253 | "-----------------------------------\n" 254 | ] 255 | } 256 | ], 257 | "source": [ 258 | "for team,teamdata in g:\n", 259 | " print(team)\n", 260 | " print(\"-\"*10)\n", 261 | " print(teamdata)\n", 262 | " print(\"-\"*35)" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | " **get_group('group name') :**
\n", 270 | " It will return dataframe of particular group" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 16, 276 | "metadata": {}, 277 | "outputs": [ 278 | { 279 | "data": { 280 | "text/html": [ 281 | "
\n", 282 | "\n", 295 | "\n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | "
MatchRunYear
023302012
311802012
452002013
724002013
\n", 331 | "
" 332 | ], 333 | "text/plain": [ 334 | " Match Run Year\n", 335 | "0 2 330 2012\n", 336 | "3 1 180 2012\n", 337 | "4 5 200 2013\n", 338 | "7 2 400 2013" 339 | ] 340 | }, 341 | "execution_count": 16, 342 | "metadata": {}, 343 | "output_type": "execute_result" 344 | } 345 | ], 346 | "source": [ 347 | "g.get_group('India')" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 17, 353 | "metadata": {}, 354 | "outputs": [ 355 | { 356 | "data": { 357 | "text/html": [ 358 | "
\n", 359 | "\n", 372 | "\n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | "
MatchRunYear
223002012
631902013
932902014
1113332014
\n", 408 | "
" 409 | ], 410 | "text/plain": [ 411 | " Match Run Year\n", 412 | "2 2 300 2012\n", 413 | "6 3 190 2013\n", 414 | "9 3 290 2014\n", 415 | "11 1 333 2014" 416 | ] 417 | }, 418 | "execution_count": 17, 419 | "metadata": {}, 420 | "output_type": "execute_result" 421 | } 422 | ], 423 | "source": [ 424 | "g.get_group('Australia')" 425 | ] 426 | }, 427 | { 428 | "cell_type": "markdown", 429 | "metadata": {}, 430 | "source": [ 431 | " We can also perform the operations which we were applying in dataframe.This only difference here you get is \n", 432 | " your operation will be applied to all of group and return the result of all groups." 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": 18, 438 | "metadata": {}, 439 | "outputs": [ 440 | { 441 | "data": { 442 | "text/html": [ 443 | "
\n", 444 | "\n", 457 | "\n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | "
MatchRunYear
team
Australia11902012
India11802012
New zealand12302012
\n", 493 | "
" 494 | ], 495 | "text/plain": [ 496 | " Match Run Year\n", 497 | "team \n", 498 | "Australia 1 190 2012\n", 499 | "India 1 180 2012\n", 500 | "New zealand 1 230 2012" 501 | ] 502 | }, 503 | "execution_count": 18, 504 | "metadata": {}, 505 | "output_type": "execute_result" 506 | } 507 | ], 508 | "source": [ 509 | "g.min()" 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": 20, 515 | "metadata": {}, 516 | "outputs": [ 517 | { 518 | "data": { 519 | "text/html": [ 520 | "
\n", 521 | "\n", 534 | "\n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | "
MatchRunYear
team
Australia2.25278.252013.25
India2.50277.502012.50
New zealand4.00302.502013.25
\n", 570 | "
" 571 | ], 572 | "text/plain": [ 573 | " Match Run Year\n", 574 | "team \n", 575 | "Australia 2.25 278.25 2013.25\n", 576 | "India 2.50 277.50 2012.50\n", 577 | "New zealand 4.00 302.50 2013.25" 578 | ] 579 | }, 580 | "execution_count": 20, 581 | "metadata": {}, 582 | "output_type": "execute_result" 583 | } 584 | ], 585 | "source": [ 586 | "g.mean()" 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": 30, 592 | "metadata": {}, 593 | "outputs": [ 594 | { 595 | "data": { 596 | "text/html": [ 597 | "
\n", 598 | "\n", 611 | "\n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | "
aminamaxmeansumstd
Year
2012180330260.00104067.823300
2013190400260.00104096.953597
2014290390338.25135340.974586
\n", 657 | "
" 658 | ], 659 | "text/plain": [ 660 | " amin amax mean sum std\n", 661 | "Year \n", 662 | "2012 180 330 260.00 1040 67.823300\n", 663 | "2013 190 400 260.00 1040 96.953597\n", 664 | "2014 290 390 338.25 1353 40.974586" 665 | ] 666 | }, 667 | "execution_count": 30, 668 | "metadata": {}, 669 | "output_type": "execute_result" 670 | } 671 | ], 672 | "source": [ 673 | "g_yr=df.groupby('Year')\n", 674 | "g_yr['Run'].agg([np.min,np.max,np.mean,np.sum,np.std])" 675 | ] 676 | }, 677 | { 678 | "cell_type": "code", 679 | "execution_count": null, 680 | "metadata": {}, 681 | "outputs": [], 682 | "source": [] 683 | }, 684 | { 685 | "cell_type": "code", 686 | "execution_count": null, 687 | "metadata": {}, 688 | "outputs": [], 689 | "source": [] 690 | }, 691 | { 692 | "cell_type": "code", 693 | "execution_count": null, 694 | "metadata": {}, 695 | "outputs": [], 696 | "source": [] 697 | }, 698 | { 699 | "cell_type": "code", 700 | "execution_count": null, 701 | "metadata": {}, 702 | "outputs": [], 703 | "source": [] 704 | } 705 | ], 706 | "metadata": { 707 | "kernelspec": { 708 | "display_name": "Python 3", 709 | "language": "python", 710 | "name": "python3" 711 | }, 712 | "language_info": { 713 | "codemirror_mode": { 714 | "name": "ipython", 715 | "version": 3 716 | }, 717 | "file_extension": ".py", 718 | "mimetype": "text/x-python", 719 | "name": "python", 720 | "nbconvert_exporter": "python", 721 | "pygments_lexer": "ipython3", 722 | "version": "3.6.4" 723 | } 724 | }, 725 | "nbformat": 4, 726 | "nbformat_minor": 2 727 | } 728 | -------------------------------------------------------------------------------- /pandas_part7.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Concat DataFrame \n", 8 | "------------\n", 9 | " Concat is very useful method of pandas.You can concatanate two dataframes in two way -\n", 10 | " 1)Append by row\n", 11 | " 2)Append by col\n", 12 | " \n", 13 | "### 1) Append by row :\n", 14 | " Concat is the method of pandas in which you can join two and more dataframes provided its indices are same.\n", 15 | " Let's take an example -\n", 16 | " You have two dataframes of weathers of two cities Kolkata and Chennai,\n", 17 | " \n", 18 | " df=pd.concat([dataframe_1,dataframe_2,...,dataframe_n]) \n", 19 | " for n number of dataframes," 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "import pandas as pd" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 3, 34 | "metadata": {}, 35 | "outputs": [ 36 | { 37 | "data": { 38 | "text/html": [ 39 | "
\n", 40 | "\n", 53 | "\n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | "
dateeventtempwind-speed
001-02-12hot2033
103-02-12sunny2123
204-02-12rainy1545
305-02-12cold1824
\n", 94 | "
" 95 | ], 96 | "text/plain": [ 97 | " date event temp wind-speed\n", 98 | "0 01-02-12 hot 20 33\n", 99 | "1 03-02-12 sunny 21 23\n", 100 | "2 04-02-12 rainy 15 45\n", 101 | "3 05-02-12 cold 18 24" 102 | ] 103 | }, 104 | "execution_count": 3, 105 | "metadata": {}, 106 | "output_type": "execute_result" 107 | } 108 | ], 109 | "source": [ 110 | "chennai={\n", 111 | " \"date\":['01-02-12','03-02-12','04-02-12','05-02-12'],\n", 112 | " \"event\":['hot','sunny','rainy','cold'],\n", 113 | " \"wind-speed\":[33,23,45,24],\n", 114 | " \"temp\":[20,21,15,18]\n", 115 | "}\n", 116 | "chen=pd.DataFrame(chennai)\n", 117 | "chen" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 5, 123 | "metadata": {}, 124 | "outputs": [ 125 | { 126 | "data": { 127 | "text/html": [ 128 | "
\n", 129 | "\n", 142 | "\n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | "
dateeventtempwind-speed
001-02-12sunny1412
103-02-12cold1610
204-02-12cold159
305-02-12rainy1014
\n", 183 | "
" 184 | ], 185 | "text/plain": [ 186 | " date event temp wind-speed\n", 187 | "0 01-02-12 sunny 14 12\n", 188 | "1 03-02-12 cold 16 10\n", 189 | "2 04-02-12 cold 15 9\n", 190 | "3 05-02-12 rainy 10 14" 191 | ] 192 | }, 193 | "execution_count": 5, 194 | "metadata": {}, 195 | "output_type": "execute_result" 196 | } 197 | ], 198 | "source": [ 199 | "kolkata={\n", 200 | " \"date\":['01-02-12','03-02-12','04-02-12','05-02-12'],\n", 201 | " \"event\":['sunny','cold','cold','rainy'],\n", 202 | " \"wind-speed\":[12,10,9,14],\n", 203 | " \"temp\":[14,16,15,10]\n", 204 | "}\n", 205 | "kol=pd.DataFrame(kolkata)\n", 206 | "kol" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 6, 212 | "metadata": {}, 213 | "outputs": [ 214 | { 215 | "data": { 216 | "text/html": [ 217 | "
\n", 218 | "\n", 231 | "\n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | "
dateeventtempwind-speed
001-02-12sunny1412
103-02-12cold1610
204-02-12cold159
305-02-12rainy1014
001-02-12hot2033
103-02-12sunny2123
204-02-12rainy1545
305-02-12cold1824
\n", 300 | "
" 301 | ], 302 | "text/plain": [ 303 | " date event temp wind-speed\n", 304 | "0 01-02-12 sunny 14 12\n", 305 | "1 03-02-12 cold 16 10\n", 306 | "2 04-02-12 cold 15 9\n", 307 | "3 05-02-12 rainy 10 14\n", 308 | "0 01-02-12 hot 20 33\n", 309 | "1 03-02-12 sunny 21 23\n", 310 | "2 04-02-12 rainy 15 45\n", 311 | "3 05-02-12 cold 18 24" 312 | ] 313 | }, 314 | "execution_count": 6, 315 | "metadata": {}, 316 | "output_type": "execute_result" 317 | } 318 | ], 319 | "source": [ 320 | "df=pd.concat([kol,chen])\n", 321 | "df" 322 | ] 323 | }, 324 | { 325 | "cell_type": "markdown", 326 | "metadata": {}, 327 | "source": [ 328 | " **You can observe the index first 0 to 3 then again 0 to 3.To ignore this you have to pass an extra argument.**" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": 8, 334 | "metadata": {}, 335 | "outputs": [ 336 | { 337 | "data": { 338 | "text/html": [ 339 | "
\n", 340 | "\n", 353 | "\n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | "
dateeventtempwind-speed
001-02-12sunny1412
103-02-12cold1610
204-02-12cold159
305-02-12rainy1014
401-02-12hot2033
503-02-12sunny2123
604-02-12rainy1545
705-02-12cold1824
\n", 422 | "
" 423 | ], 424 | "text/plain": [ 425 | " date event temp wind-speed\n", 426 | "0 01-02-12 sunny 14 12\n", 427 | "1 03-02-12 cold 16 10\n", 428 | "2 04-02-12 cold 15 9\n", 429 | "3 05-02-12 rainy 10 14\n", 430 | "4 01-02-12 hot 20 33\n", 431 | "5 03-02-12 sunny 21 23\n", 432 | "6 04-02-12 rainy 15 45\n", 433 | "7 05-02-12 cold 18 24" 434 | ] 435 | }, 436 | "execution_count": 8, 437 | "metadata": {}, 438 | "output_type": "execute_result" 439 | } 440 | ], 441 | "source": [ 442 | "df=pd.concat([kol,chen],ignore_index=True)\n", 443 | "df\n", 444 | "# Now see index column" 445 | ] 446 | }, 447 | { 448 | "cell_type": "markdown", 449 | "metadata": {}, 450 | "source": [ 451 | "##### If you want to join dataframes with dataframe name as an index name" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": 9, 457 | "metadata": {}, 458 | "outputs": [ 459 | { 460 | "data": { 461 | "text/html": [ 462 | "
\n", 463 | "\n", 476 | "\n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | "
dateeventtempwind-speed
Kolkata001-02-12sunny1412
103-02-12cold1610
204-02-12cold159
305-02-12rainy1014
Chennai001-02-12hot2033
103-02-12sunny2123
204-02-12rainy1545
305-02-12cold1824
\n", 548 | "
" 549 | ], 550 | "text/plain": [ 551 | " date event temp wind-speed\n", 552 | "Kolkata 0 01-02-12 sunny 14 12\n", 553 | " 1 03-02-12 cold 16 10\n", 554 | " 2 04-02-12 cold 15 9\n", 555 | " 3 05-02-12 rainy 10 14\n", 556 | "Chennai 0 01-02-12 hot 20 33\n", 557 | " 1 03-02-12 sunny 21 23\n", 558 | " 2 04-02-12 rainy 15 45\n", 559 | " 3 05-02-12 cold 18 24" 560 | ] 561 | }, 562 | "execution_count": 9, 563 | "metadata": {}, 564 | "output_type": "execute_result" 565 | } 566 | ], 567 | "source": [ 568 | "df=pd.concat([kol,chen],keys=['Kolkata','Chennai'])\n", 569 | "df" 570 | ] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": 11, 575 | "metadata": {}, 576 | "outputs": [ 577 | { 578 | "data": { 579 | "text/html": [ 580 | "
\n", 581 | "\n", 594 | "\n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | "
dateeventtempwind-speed
001-02-12sunny1412
103-02-12cold1610
204-02-12cold159
305-02-12rainy1014
\n", 635 | "
" 636 | ], 637 | "text/plain": [ 638 | " date event temp wind-speed\n", 639 | "0 01-02-12 sunny 14 12\n", 640 | "1 03-02-12 cold 16 10\n", 641 | "2 04-02-12 cold 15 9\n", 642 | "3 05-02-12 rainy 10 14" 643 | ] 644 | }, 645 | "execution_count": 11, 646 | "metadata": {}, 647 | "output_type": "execute_result" 648 | } 649 | ], 650 | "source": [ 651 | "df.loc['Kolkata']" 652 | ] 653 | }, 654 | { 655 | "cell_type": "code", 656 | "execution_count": 12, 657 | "metadata": {}, 658 | "outputs": [ 659 | { 660 | "data": { 661 | "text/html": [ 662 | "
\n", 663 | "\n", 676 | "\n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | "
dateeventtempwind-speed
001-02-12hot2033
103-02-12sunny2123
204-02-12rainy1545
305-02-12cold1824
\n", 717 | "
" 718 | ], 719 | "text/plain": [ 720 | " date event temp wind-speed\n", 721 | "0 01-02-12 hot 20 33\n", 722 | "1 03-02-12 sunny 21 23\n", 723 | "2 04-02-12 rainy 15 45\n", 724 | "3 05-02-12 cold 18 24" 725 | ] 726 | }, 727 | "execution_count": 12, 728 | "metadata": {}, 729 | "output_type": "execute_result" 730 | } 731 | ], 732 | "source": [ 733 | "df.loc['Chennai']" 734 | ] 735 | }, 736 | { 737 | "cell_type": "markdown", 738 | "metadata": {}, 739 | "source": [ 740 | "### 2) Append by column :\n", 741 | " if you have two dataframes and you want to append column wise. For example - if you have two dataframes of weather\n", 742 | " first dataframe having columns date,event & temp and second dataframe having columns date,event & wind-speed when you\n", 743 | " join both you will get one dataframe having columns date, event, temp, date, event and wind-speed.\n", 744 | " To join two dataframe column wise\n", 745 | " you have to pass axis=1 in concat() method.\n", 746 | " \n", 747 | " df=pd.concat([dataframe_1,dataframe_2,...,dataframe_n],axis=1) " 748 | ] 749 | }, 750 | { 751 | "cell_type": "code", 752 | "execution_count": 3, 753 | "metadata": {}, 754 | "outputs": [ 755 | { 756 | "data": { 757 | "text/html": [ 758 | "
\n", 759 | "\n", 772 | "\n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | "
dateeventtemp
001-02-12sunny14
103-02-12cold16
204-02-12cold15
305-02-12rainy10
\n", 808 | "
" 809 | ], 810 | "text/plain": [ 811 | " date event temp\n", 812 | "0 01-02-12 sunny 14\n", 813 | "1 03-02-12 cold 16\n", 814 | "2 04-02-12 cold 15\n", 815 | "3 05-02-12 rainy 10" 816 | ] 817 | }, 818 | "execution_count": 3, 819 | "metadata": {}, 820 | "output_type": "execute_result" 821 | } 822 | ], 823 | "source": [ 824 | "temp1={\n", 825 | " \"date\":['01-02-12','03-02-12','04-02-12','05-02-12'],\n", 826 | " \"event\":['sunny','cold','cold','rainy'],\n", 827 | " \"temp\":[14,16,15,10]\n", 828 | "}\n", 829 | "temp=pd.DataFrame(temp1)\n", 830 | "temp" 831 | ] 832 | }, 833 | { 834 | "cell_type": "code", 835 | "execution_count": 7, 836 | "metadata": {}, 837 | "outputs": [ 838 | { 839 | "data": { 840 | "text/html": [ 841 | "
\n", 842 | "\n", 855 | "\n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | "
dateeventwind-speed
001-02-12sunny12
103-02-12cold10
204-02-12cold9
305-02-12rainy14
\n", 891 | "
" 892 | ], 893 | "text/plain": [ 894 | " date event wind-speed\n", 895 | "0 01-02-12 sunny 12\n", 896 | "1 03-02-12 cold 10\n", 897 | "2 04-02-12 cold 9\n", 898 | "3 05-02-12 rainy 14" 899 | ] 900 | }, 901 | "execution_count": 7, 902 | "metadata": {}, 903 | "output_type": "execute_result" 904 | } 905 | ], 906 | "source": [ 907 | "ws={\n", 908 | " \"date\":['01-02-12','03-02-12','04-02-12','05-02-12'],\n", 909 | " \"event\":['sunny','cold','cold','rainy'],\n", 910 | " \"wind-speed\":[12,10,9,14],\n", 911 | "}\n", 912 | "wind_speed=pd.DataFrame(ws)\n", 913 | "wind_speed" 914 | ] 915 | }, 916 | { 917 | "cell_type": "code", 918 | "execution_count": 9, 919 | "metadata": {}, 920 | "outputs": [ 921 | { 922 | "data": { 923 | "text/html": [ 924 | "
\n", 925 | "\n", 938 | "\n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | "
dateeventtempdateeventwind-speed
001-02-12sunny1401-02-12sunny12
103-02-12cold1603-02-12cold10
204-02-12cold1504-02-12cold9
305-02-12rainy1005-02-12rainy14
\n", 989 | "
" 990 | ], 991 | "text/plain": [ 992 | " date event temp date event wind-speed\n", 993 | "0 01-02-12 sunny 14 01-02-12 sunny 12\n", 994 | "1 03-02-12 cold 16 03-02-12 cold 10\n", 995 | "2 04-02-12 cold 15 04-02-12 cold 9\n", 996 | "3 05-02-12 rainy 10 05-02-12 rainy 14" 997 | ] 998 | }, 999 | "execution_count": 9, 1000 | "metadata": {}, 1001 | "output_type": "execute_result" 1002 | } 1003 | ], 1004 | "source": [ 1005 | "df=pd.concat([temp,wind_speed],axis=1)\n", 1006 | "df" 1007 | ] 1008 | }, 1009 | { 1010 | "cell_type": "markdown", 1011 | "metadata": {}, 1012 | "source": [] 1013 | } 1014 | ], 1015 | "metadata": { 1016 | "kernelspec": { 1017 | "display_name": "Python 3", 1018 | "language": "python", 1019 | "name": "python3" 1020 | }, 1021 | "language_info": { 1022 | "codemirror_mode": { 1023 | "name": "ipython", 1024 | "version": 3 1025 | }, 1026 | "file_extension": ".py", 1027 | "mimetype": "text/x-python", 1028 | "name": "python", 1029 | "nbconvert_exporter": "python", 1030 | "pygments_lexer": "ipython3", 1031 | "version": "3.6.4" 1032 | } 1033 | }, 1034 | "nbformat": 4, 1035 | "nbformat_minor": 2 1036 | } 1037 | -------------------------------------------------------------------------------- /pandas_part8.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Merging dataframes \n", 8 | "-------\n", 9 | " merge() is a method in pandas in which you can merge two dataframes withou repeating columns as we did in\n", 10 | " concat() method.\n", 11 | " There are two types of merging -\n", 12 | " 1) Inner join\n", 13 | " 2) outer join\n", 14 | " ### 1) Inner join\n", 15 | " In this method you will get the intersetion of two dataframes with merged column.It is the default merge method.\n", 16 | " In merge() method you have to pass the dataframes as arguments and list of columns on which you want to merge\n", 17 | " and dataframes as arguments." 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 1, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "import pandas as pd" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "data": { 36 | "text/html": [ 37 | "
\n", 38 | "\n", 51 | "\n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | "
dateeventtemp
001-02-12sunny14
103-02-12cold16
204-02-12cold15
305-02-12rainy10
\n", 87 | "
" 88 | ], 89 | "text/plain": [ 90 | " date event temp\n", 91 | "0 01-02-12 sunny 14\n", 92 | "1 03-02-12 cold 16\n", 93 | "2 04-02-12 cold 15\n", 94 | "3 05-02-12 rainy 10" 95 | ] 96 | }, 97 | "execution_count": 2, 98 | "metadata": {}, 99 | "output_type": "execute_result" 100 | } 101 | ], 102 | "source": [ 103 | "temp1={\n", 104 | " \"date\":['01-02-12','03-02-12','04-02-12','05-02-12'],\n", 105 | " \"event\":['sunny','cold','cold','rainy'],\n", 106 | " \"temp\":[14,16,15,10]\n", 107 | "}\n", 108 | "temp=pd.DataFrame(temp1)\n", 109 | "temp" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 3, 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "data": { 119 | "text/html": [ 120 | "
\n", 121 | "\n", 134 | "\n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | "
dateeventwind-speed
001-02-12sunny12
103-02-12cold10
204-02-12cold9
305-02-12rainy14
\n", 170 | "
" 171 | ], 172 | "text/plain": [ 173 | " date event wind-speed\n", 174 | "0 01-02-12 sunny 12\n", 175 | "1 03-02-12 cold 10\n", 176 | "2 04-02-12 cold 9\n", 177 | "3 05-02-12 rainy 14" 178 | ] 179 | }, 180 | "execution_count": 3, 181 | "metadata": {}, 182 | "output_type": "execute_result" 183 | } 184 | ], 185 | "source": [ 186 | "ws={\n", 187 | " \"date\":['01-02-12','03-02-12','04-02-12','05-02-12'],\n", 188 | " \"event\":['sunny','cold','cold','rainy'],\n", 189 | " \"wind-speed\":[12,10,9,14],\n", 190 | "}\n", 191 | "wind_speed=pd.DataFrame(ws)\n", 192 | "wind_speed" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 5, 198 | "metadata": {}, 199 | "outputs": [ 200 | { 201 | "data": { 202 | "text/html": [ 203 | "
\n", 204 | "\n", 217 | "\n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | "
dateeventtempwind-speed
001-02-12sunny1412
103-02-12cold1610
204-02-12cold159
305-02-12rainy1014
\n", 258 | "
" 259 | ], 260 | "text/plain": [ 261 | " date event temp wind-speed\n", 262 | "0 01-02-12 sunny 14 12\n", 263 | "1 03-02-12 cold 16 10\n", 264 | "2 04-02-12 cold 15 9\n", 265 | "3 05-02-12 rainy 10 14" 266 | ] 267 | }, 268 | "execution_count": 5, 269 | "metadata": {}, 270 | "output_type": "execute_result" 271 | } 272 | ], 273 | "source": [ 274 | "df=pd.merge(temp,wind_speed,on=['date','event'])\n", 275 | "df" 276 | ] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "metadata": {}, 281 | "source": [ 282 | " If you have different data in the common columns then you can not do inner merge in this case.\n", 283 | " Assume the following case:\n", 284 | " \n", 285 | " temp \n", 286 | " ___________________\n", 287 | " | event | temp |\n", 288 | " |--------|----------|\n", 289 | " | sunny | 40 |\n", 290 | " | hot | 30 |\n", 291 | " | rainy | 25 |\n", 292 | " |________|__________|\n", 293 | " \n", 294 | " wind-speed\n", 295 | " ___________________\n", 296 | " | event |wind-speed|\n", 297 | " |--------|----------|\n", 298 | " | sunny | 12 |\n", 299 | " | hot | 14 |\n", 300 | " | hot | 11 |\n", 301 | " |________|__________|\n", 302 | " \n", 303 | " Merged Dataframe by inner method\n", 304 | " \n", 305 | " ________________________________\n", 306 | " | event | temp | wind-speed |\n", 307 | " |--------|----------|------------|\n", 308 | " | sunny | 40 | 12 | \n", 309 | " | hot | 30 | 14 |\n", 310 | " |________|__________|____________|\n", 311 | " \n", 312 | " You can see that last row is not merged here because there is no common element in the common columns.\n", 313 | " " 314 | ] 315 | }, 316 | { 317 | "cell_type": "markdown", 318 | "metadata": {}, 319 | "source": [ 320 | "### Outer join\n", 321 | "This is just like union of two dataframe.The value which dont exist will contain NaN. \n", 322 | " \n", 323 | "\n", 324 | " temp \n", 325 | " ___________________\n", 326 | " | event | temp |\n", 327 | " |--------|----------|\n", 328 | " | sunny | 40 |\n", 329 | " | hot | 30 |\n", 330 | " | rainy | 25 |\n", 331 | " |________|__________|\n", 332 | " \n", 333 | " wind-speed\n", 334 | " ___________________\n", 335 | " | event |wind-speed|\n", 336 | " |--------|----------|\n", 337 | " | sunny | 12 |\n", 338 | " | hot | 14 |\n", 339 | " | hot | 11 |\n", 340 | " |________|__________|\n", 341 | " \n", 342 | " Merged Dataframe by inner method\n", 343 | " \n", 344 | " ________________________________\n", 345 | " | event | temp | wind-speed |\n", 346 | " |--------|----------|------------|\n", 347 | " | sunny | 40 | 12 | \n", 348 | " | hot | 30 | 14 |\n", 349 | " | rainy | 25 | NaN |\n", 350 | " | hot | NaN | 11 |\n", 351 | " |________|__________|____________|\n", 352 | " \n", 353 | " You can see in the final dataframe two rows increased." 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": 8, 359 | "metadata": {}, 360 | "outputs": [ 361 | { 362 | "data": { 363 | "text/html": [ 364 | "
\n", 365 | "\n", 378 | "\n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | "
dateeventtemp
001-02-12sunny14
103-02-12cold16
204-02-12hot15
305-02-12sunny10
\n", 414 | "
" 415 | ], 416 | "text/plain": [ 417 | " date event temp\n", 418 | "0 01-02-12 sunny 14\n", 419 | "1 03-02-12 cold 16\n", 420 | "2 04-02-12 hot 15\n", 421 | "3 05-02-12 sunny 10" 422 | ] 423 | }, 424 | "execution_count": 8, 425 | "metadata": {}, 426 | "output_type": "execute_result" 427 | } 428 | ], 429 | "source": [ 430 | "temp1={\n", 431 | " \"date\":['01-02-12','03-02-12','04-02-12','05-02-12'],\n", 432 | " \"event\":['sunny','cold','hot','sunny'],\n", 433 | " \"temp\":[14,16,15,10]\n", 434 | "}\n", 435 | "temp=pd.DataFrame(temp1)\n", 436 | "temp" 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": 7, 442 | "metadata": {}, 443 | "outputs": [ 444 | { 445 | "data": { 446 | "text/html": [ 447 | "
\n", 448 | "\n", 461 | "\n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | "
dateeventwind-speed
001-02-12sunny12
103-02-12cold10
204-02-12cold9
305-02-12rainy14
\n", 497 | "
" 498 | ], 499 | "text/plain": [ 500 | " date event wind-speed\n", 501 | "0 01-02-12 sunny 12\n", 502 | "1 03-02-12 cold 10\n", 503 | "2 04-02-12 cold 9\n", 504 | "3 05-02-12 rainy 14" 505 | ] 506 | }, 507 | "execution_count": 7, 508 | "metadata": {}, 509 | "output_type": "execute_result" 510 | } 511 | ], 512 | "source": [ 513 | "ws={\n", 514 | " \"date\":['01-02-12','03-02-12','04-02-12','05-02-12'],\n", 515 | " \"event\":['sunny','cold','cold','rainy'],\n", 516 | " \"wind-speed\":[12,10,9,14],\n", 517 | "}\n", 518 | "wind_speed=pd.DataFrame(ws)\n", 519 | "wind_speed" 520 | ] 521 | }, 522 | { 523 | "cell_type": "markdown", 524 | "metadata": {}, 525 | "source": [ 526 | "##### One extra argument you need to pass in outer join i.e, how=\"outer\".By defaut it is inner." 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": 9, 532 | "metadata": {}, 533 | "outputs": [ 534 | { 535 | "data": { 536 | "text/html": [ 537 | "
\n", 538 | "\n", 551 | "\n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | "
dateeventtempwind-speed
001-02-12sunny14.012.0
103-02-12cold16.010.0
204-02-12hot15.0NaN
305-02-12sunny10.0NaN
404-02-12coldNaN9.0
505-02-12rainyNaN14.0
\n", 606 | "
" 607 | ], 608 | "text/plain": [ 609 | " date event temp wind-speed\n", 610 | "0 01-02-12 sunny 14.0 12.0\n", 611 | "1 03-02-12 cold 16.0 10.0\n", 612 | "2 04-02-12 hot 15.0 NaN\n", 613 | "3 05-02-12 sunny 10.0 NaN\n", 614 | "4 04-02-12 cold NaN 9.0\n", 615 | "5 05-02-12 rainy NaN 14.0" 616 | ] 617 | }, 618 | "execution_count": 9, 619 | "metadata": {}, 620 | "output_type": "execute_result" 621 | } 622 | ], 623 | "source": [ 624 | "df=pd.merge(temp,wind_speed,on=['date','event'],how='outer')\n", 625 | "df" 626 | ] 627 | }, 628 | { 629 | "cell_type": "code", 630 | "execution_count": null, 631 | "metadata": {}, 632 | "outputs": [], 633 | "source": [] 634 | } 635 | ], 636 | "metadata": { 637 | "kernelspec": { 638 | "display_name": "Python 3", 639 | "language": "python", 640 | "name": "python3" 641 | }, 642 | "language_info": { 643 | "codemirror_mode": { 644 | "name": "ipython", 645 | "version": 3 646 | }, 647 | "file_extension": ".py", 648 | "mimetype": "text/x-python", 649 | "name": "python", 650 | "nbconvert_exporter": "python", 651 | "pygments_lexer": "ipython3", 652 | "version": "3.6.4" 653 | } 654 | }, 655 | "nbformat": 4, 656 | "nbformat_minor": 2 657 | } 658 | --------------------------------------------------------------------------------