├── .flake8
├── .github
│   └── ISSUE_TEMPLATE
│       ├── config.yml
│       └── issue_template.md
├── .gitignore
├── .travis.yml
├── README.md
├── datasets
│   ├── ds1.jpg
│   ├── ds2.jpg
│   ├── mycsv.csv
│   ├── mycsv_few_columns.csv
│   ├── myexcel.xls
│   ├── season.csv
│   ├── titanic.xls
│   ├── travel.csv
│   ├── weather-dataset.zip
│   ├── weather.csv
│   └── weather_and_house.xls
├── pandas1.png
├── pandas_part1.ipynb
├── pandas_part10.ipynb
├── pandas_part2.ipynb
├── pandas_part3.ipynb
├── pandas_part4.ipynb
├── pandas_part5.ipynb
├── pandas_part6.ipynb
├── pandas_part7.ipynb
├── pandas_part8.ipynb
└── pandas_part9.ipynb
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 100
3 | ignore = E121,E123,E126,E221,E222,E225,E226,E242,E701,E702,E704,E731,W503,F405,F841
4 | exclude = tests
5 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: true
2 | contact_links:
3 | - name: GitHub Community Support
4 | url: https://github.community/
5 | about: Please ask and answer questions here.
6 | - name: GitHub Security Bug Bounty
7 | url: https://bounty.github.com/
8 | about: Please report security vulnerabilities here.
9 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/issue_template.md:
--------------------------------------------------------------------------------
1 |
2 | ### Issue type
3 | Put an 'x' (without quotes) in the boxes below that apply.
4 | - [ ] Code improvements
5 | - [ ] I want to add files
6 | - [ ] Suggestions
7 |
8 | ##### Briefly explain what you selected.
9 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language:
2 | - python
3 |
4 | python:
5 | - "3.6"
6 |
7 | install:
8 | - pip install pandas
9 | - pip install numpy
10 | - pip install flake8
11 | - pip install ipython
12 | - pip install matplotlib
13 | - pip install ipywidgets
14 | script:
15 | - python script.py
16 |
17 | after_success:
18 | - flake8 --max-line-length 100 --ignore=E121,E123,E126,E221,E222,E225,E226,E242,E701,E702,E704,E731,W503 .
19 |
20 | notifications:
21 | email: false
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | **Fig. 1**
2 |
3 |
4 | Welcome to the tutorial Data Analysis with pandas. This tutorial covers the main topics of pandas and explains each concept concisely in Jupyter notebooks, so that you can observe how every pandas method works from the ground up.
5 |
6 | First of all, let's understand: "What is data analysis, and why should we use pandas for it?"
7 |
8 | ### What is data analysis?
9 | Suppose you work at a company that generates a lot of customer data every day, and you are assigned the task of extracting useful information from it by a certain deadline. With limited time you cannot extract that information just by eyeballing the dataset, because the data is huge. So you ask a colleague for help, and they suggest reading about pandas for data analysis. You study pandas and find that it makes the job far easier than inspecting the dataset by hand.
10 | ### What does pandas actually do?
11 | The official documentation says:
12 | pandas is a Python package providing fast, flexible, and expressive data structures designed to make working with “relational” or “labeled” data both easy and intuitive. It aims to be the fundamental high-level building block for doing practical, real world data analysis in Python. Additionally, it has the broader goal of becoming the most powerful and flexible open source data analysis / manipulation tool available in any language. It is already well on its way toward this goal.
13 |
14 |
15 | pandas is well suited for many different kinds of data:
16 | * Tabular data with heterogeneously-typed columns, as in an SQL table or Excel spreadsheet.
17 | * Ordered and unordered (not necessarily fixed-frequency) time series data.
18 | * Arbitrary matrix data (homogeneously typed or heterogeneous) with row and column labels.
19 | * Any other form of observational / statistical data sets. The data actually need not be labeled at all to be placed into a pandas data structure.
20 |
21 | ### Dependencies-
22 | ```
23 | pip install pandas
24 | pip install numpy
25 | ```
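
To confirm both packages installed correctly, a quick sanity check (illustrative):
```
import pandas as pd
import numpy as np

# Print the installed versions of both dependencies
print(pd.__version__)
print(np.__version__)
```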
26 |
27 | ### Table of contents -
28 | * [Introduction to pandas](https://github.com/dshahid380/Data-analysis-with-pandas/blob/master/pandas_part1.ipynb)
29 | * [Dataframe Object](https://github.com/dshahid380/Data-analysis-with-pandas/blob/master/pandas_part2.ipynb)
30 | * [Reading, Writing CSV and EXCEL file](https://github.com/dshahid380/Data-analysis-with-pandas/blob/master/pandas_part3.ipynb)
31 | * [Handling Missing Data part-1](https://github.com/dshahid380/Data-analysis-with-pandas/blob/master/pandas_part4.ipynb)
32 | * [Handling Missing Data part-2](https://github.com/dshahid380/Data-analysis-with-pandas/blob/master/pandas_part5.ipynb)
33 | * [Groupby : Split, Combine and Apply](https://github.com/dshahid380/Data-analysis-with-pandas/blob/master/pandas_part6.ipynb)
34 | * [Concat Dataframes](https://github.com/dshahid380/Data-analysis-with-pandas/blob/master/pandas_part7.ipynb)
35 | * [Merging Dataframes](https://github.com/dshahid380/Data-analysis-with-pandas/blob/master/pandas_part8.ipynb)
36 | * [Pivot and Pivot table](https://github.com/dshahid380/Data-analysis-with-pandas/blob/master/pandas_part9.ipynb)
37 | * [Reshaping Dataframes](https://github.com/dshahid380/Data-analysis-with-pandas/blob/master/pandas_part10.ipynb)
38 |
39 |
40 | ### Introduction to pandas :
41 | **Fig. 2**
42 |
43 | Pandas is used as a data-cleaning tool in the field of data science. You can perform whatever operation you want on a dataset with this tool. Now the question arises: can we clean or change values in a dataset manually? Yes, we can, if the dataset is small. For a large dataset, though, doing it manually would take far too long. Pandas makes this part of data science fast and effective.
44 |
45 | To use pandas, first import the pandas module in your program:
46 | ```
47 | import pandas as pd
48 | ```
49 |
50 |
51 |
52 | #### Reading CSV and Excel sheets:
53 | **d=pd.read_csv("path"):**
54 | * pd.read_csv() is the function that reads a CSV (comma-separated values) file from your computer.
55 | * Pass the "path" of the CSV file as a quoted string.
56 | * Store the resulting dataframe in a variable; here it is stored in the variable "d".
57 | * read_csv() turns the CSV file into a dataframe, so you can access it much like a dictionary (see the sketch just below).
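
Since the dataframe can be accessed dictionary-style, a column can be pulled out by name; a minimal sketch using the datasets/mycsv.csv file from this repository:
```
import pandas as pd

# Load the CSV and select the 'temp' column by name, dictionary-style
d=pd.read_csv('datasets/mycsv.csv')
print(d['temp'])
```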
58 |
59 | **d=pd.read_excel("path"):**
60 | * It works just like read_csv(), but reads an Excel sheet or workbook. Below I am using the weather dataset, which holds the weather records. The weather.csv file lives in the datasets directory, so the path is its relative file name.
61 | ```
62 | d=pd.read_csv('datasets/weather.csv')
63 | print(d)
64 | ```
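
pd.read_excel() is used the same way; a minimal sketch, assuming the datasets/titanic.xls file shipped with this repository:
```
import pandas as pd

# Read an Excel workbook into a dataframe and show the first rows
df=pd.read_excel('datasets/titanic.xls')
print(df.head())
```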
65 |
66 |
67 | **For the rest of the tutorial, follow the links given in the Table of contents above or click this [link](https://github.com/dshahid380/Data-analysis-with-pandas)**
68 |
69 |
70 |
71 |
72 | ### References :
73 | * [Pandas Official documentation](https://pandas.pydata.org/pandas-docs/stable/tutorials.html)
74 | * [Tutorials points](https://www.tutorialspoint.com/python_pandas)
75 | * [Datacamp](https://www.datacamp.com/courses/pandas-foundations)
76 |
77 |
78 |
79 |
80 | [dshahid380](https://github.com/dshahid380)
81 | [Md Shahid](https://www.linkedin.com/in/dshahid380/)
82 |
--------------------------------------------------------------------------------
/datasets/ds1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dshahid380/Data-analysis-with-pandas/84a92bb63ab285160668dc3a7003a45269736ef2/datasets/ds1.jpg
--------------------------------------------------------------------------------
/datasets/ds2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dshahid380/Data-analysis-with-pandas/84a92bb63ab285160668dc3a7003a45269736ef2/datasets/ds2.jpg
--------------------------------------------------------------------------------
/datasets/mycsv.csv:
--------------------------------------------------------------------------------
1 | dates,day,temp,wind-speed
2 | 02-01-12,sunny,45,12
3 | 03-01-12,rainy,46,34
4 | 04-01-12,hot,47,45
5 | 05-01-12,sunny,48,56
6 | 06-01-12,hot,49,67
7 |
--------------------------------------------------------------------------------
/datasets/mycsv_few_columns.csv:
--------------------------------------------------------------------------------
1 | day,temp,wind-speed
2 | sunny,45,12
3 | rainy,46,34
4 | hot,47,45
5 | sunny,48,56
6 | hot,49,67
7 |
--------------------------------------------------------------------------------
/datasets/myexcel.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dshahid380/Data-analysis-with-pandas/84a92bb63ab285160668dc3a7003a45269736ef2/datasets/myexcel.xls
--------------------------------------------------------------------------------
/datasets/season.csv:
--------------------------------------------------------------------------------
1 | dates,day,temp,wind-speed
2 | 2/1/2012,sunny,45,12
3 | 3/1/2012,rainy,46,34
4 | 4/1/2012,hot,47,45
5 | 5/1/2012,NaN,NaN,56
6 | 6/1/2012,hot,49,Not available
7 | 7/1/2012,NaN,NaN,Not available
8 | 8/1/2012,hot,12,45
9 | 9/1/2012,rainy,23,41
10 | 10/1/2012,NaN,NaN,NaN
11 | 11/1/2012,NaN,NaN,NaN
12 |
--------------------------------------------------------------------------------
/datasets/titanic.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dshahid380/Data-analysis-with-pandas/84a92bb63ab285160668dc3a7003a45269736ef2/datasets/titanic.xls
--------------------------------------------------------------------------------
/datasets/travel.csv:
--------------------------------------------------------------------------------
1 | Age,Name,No_of_pkg,Package,travel_id
2 | 20yrs,Bikash Kumar,1 packages,$100 ,1
3 | 21yrs,Ashish Shaw,5 packages,$200 ,2
4 | 23years,Dipak Kumar,2pkgs,$100 ,3
5 | 20 Years,John Doe,3 pkgs,$100 ,4
6 | 2000,Elisha,5000,$400 ,5
7 | 5000,Md Shahid,10 packages,$200 ,6
8 | 21 yrs,Adrika Roy,7pkgs,$300 ,7
9 | 24 yrs,Shashi Kumar,2000,$500 ,8
10 |
--------------------------------------------------------------------------------
/datasets/weather-dataset.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dshahid380/Data-analysis-with-pandas/84a92bb63ab285160668dc3a7003a45269736ef2/datasets/weather-dataset.zip
--------------------------------------------------------------------------------
/datasets/weather_and_house.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dshahid380/Data-analysis-with-pandas/84a92bb63ab285160668dc3a7003a45269736ef2/datasets/weather_and_house.xls
--------------------------------------------------------------------------------
/pandas1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dshahid380/Data-analysis-with-pandas/84a92bb63ab285160668dc3a7003a45269736ef2/pandas1.png
--------------------------------------------------------------------------------
/pandas_part10.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Reshaping Dataframe \n",
8 | " In this technique we will discuss about reshape the dataframe with melt() method.\n",
9 | " You have to pass the dataframe and the column which want to keep the same and other argumets are optional for you\n",
10 | " "
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 1,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "import pandas as pd"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 5,
25 | "metadata": {},
26 | "outputs": [
27 | {
28 | "data": {
29 | "text/html": [
30 | "
\n",
31 | "\n",
44 | "
\n",
45 | " \n",
46 | " \n",
47 | " | \n",
48 | " Day | \n",
49 | " Kerala | \n",
50 | " Kolkata | \n",
51 | " Mumbai | \n",
52 | "
\n",
53 | " \n",
54 | " \n",
55 | " \n",
56 | " 0 | \n",
57 | " Mon | \n",
58 | " 23 | \n",
59 | " 10 | \n",
60 | " 10 | \n",
61 | "
\n",
62 | " \n",
63 | " 1 | \n",
64 | " Tue | \n",
65 | " 32 | \n",
66 | " 20 | \n",
67 | " 20 | \n",
68 | "
\n",
69 | " \n",
70 | " 2 | \n",
71 | " Wed | \n",
72 | " 13 | \n",
73 | " 30 | \n",
74 | " 30 | \n",
75 | "
\n",
76 | " \n",
77 | " 3 | \n",
78 | " Thu | \n",
79 | " 42 | \n",
80 | " 40 | \n",
81 | " 40 | \n",
82 | "
\n",
83 | " \n",
84 | " 4 | \n",
85 | " Fri | \n",
86 | " 13 | \n",
87 | " 32 | \n",
88 | " 32 | \n",
89 | "
\n",
90 | " \n",
91 | " 5 | \n",
92 | " Sat | \n",
93 | " 43 | \n",
94 | " 34 | \n",
95 | " 34 | \n",
96 | "
\n",
97 | " \n",
98 | " 6 | \n",
99 | " Sun | \n",
100 | " 23 | \n",
101 | " 23 | \n",
102 | " 23 | \n",
103 | "
\n",
104 | " \n",
105 | "
\n",
106 | "
"
107 | ],
108 | "text/plain": [
109 | " Day Kerala Kolkata Mumbai\n",
110 | "0 Mon 23 10 10\n",
111 | "1 Tue 32 20 20\n",
112 | "2 Wed 13 30 30\n",
113 | "3 Thu 42 40 40\n",
114 | "4 Fri 13 32 32\n",
115 | "5 Sat 43 34 34\n",
116 | "6 Sun 23 23 23"
117 | ]
118 | },
119 | "execution_count": 5,
120 | "metadata": {},
121 | "output_type": "execute_result"
122 | }
123 | ],
124 | "source": [
125 | "mydis={\n",
126 | " 'Day':['Mon','Tue','Wed','Thu','Fri','Sat','Sun'],\n",
127 | " 'Kolkata':[10,20,30,40,32,34,23],\n",
128 | " 'Kerala':[23,32,13,42,13,43,23],\n",
129 | " 'Mumbai':[10,20,30,40,32,34,23]\n",
130 | "}\n",
131 | "df=pd.DataFrame(mydis)\n",
132 | "df"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": 10,
138 | "metadata": {},
139 | "outputs": [
140 | {
141 | "data": {
142 | "text/html": [
143 | "\n",
144 | "\n",
157 | "
\n",
158 | " \n",
159 | " \n",
160 | " | \n",
161 | " Day | \n",
162 | " variable | \n",
163 | " value | \n",
164 | "
\n",
165 | " \n",
166 | " \n",
167 | " \n",
168 | " 0 | \n",
169 | " Mon | \n",
170 | " Kerala | \n",
171 | " 23 | \n",
172 | "
\n",
173 | " \n",
174 | " 1 | \n",
175 | " Tue | \n",
176 | " Kerala | \n",
177 | " 32 | \n",
178 | "
\n",
179 | " \n",
180 | " 2 | \n",
181 | " Wed | \n",
182 | " Kerala | \n",
183 | " 13 | \n",
184 | "
\n",
185 | " \n",
186 | " 3 | \n",
187 | " Thu | \n",
188 | " Kerala | \n",
189 | " 42 | \n",
190 | "
\n",
191 | " \n",
192 | " 4 | \n",
193 | " Fri | \n",
194 | " Kerala | \n",
195 | " 13 | \n",
196 | "
\n",
197 | " \n",
198 | " 5 | \n",
199 | " Sat | \n",
200 | " Kerala | \n",
201 | " 43 | \n",
202 | "
\n",
203 | " \n",
204 | " 6 | \n",
205 | " Sun | \n",
206 | " Kerala | \n",
207 | " 23 | \n",
208 | "
\n",
209 | " \n",
210 | " 7 | \n",
211 | " Mon | \n",
212 | " Kolkata | \n",
213 | " 10 | \n",
214 | "
\n",
215 | " \n",
216 | " 8 | \n",
217 | " Tue | \n",
218 | " Kolkata | \n",
219 | " 20 | \n",
220 | "
\n",
221 | " \n",
222 | " 9 | \n",
223 | " Wed | \n",
224 | " Kolkata | \n",
225 | " 30 | \n",
226 | "
\n",
227 | " \n",
228 | " 10 | \n",
229 | " Thu | \n",
230 | " Kolkata | \n",
231 | " 40 | \n",
232 | "
\n",
233 | " \n",
234 | " 11 | \n",
235 | " Fri | \n",
236 | " Kolkata | \n",
237 | " 32 | \n",
238 | "
\n",
239 | " \n",
240 | " 12 | \n",
241 | " Sat | \n",
242 | " Kolkata | \n",
243 | " 34 | \n",
244 | "
\n",
245 | " \n",
246 | " 13 | \n",
247 | " Sun | \n",
248 | " Kolkata | \n",
249 | " 23 | \n",
250 | "
\n",
251 | " \n",
252 | " 14 | \n",
253 | " Mon | \n",
254 | " Mumbai | \n",
255 | " 10 | \n",
256 | "
\n",
257 | " \n",
258 | " 15 | \n",
259 | " Tue | \n",
260 | " Mumbai | \n",
261 | " 20 | \n",
262 | "
\n",
263 | " \n",
264 | " 16 | \n",
265 | " Wed | \n",
266 | " Mumbai | \n",
267 | " 30 | \n",
268 | "
\n",
269 | " \n",
270 | " 17 | \n",
271 | " Thu | \n",
272 | " Mumbai | \n",
273 | " 40 | \n",
274 | "
\n",
275 | " \n",
276 | " 18 | \n",
277 | " Fri | \n",
278 | " Mumbai | \n",
279 | " 32 | \n",
280 | "
\n",
281 | " \n",
282 | " 19 | \n",
283 | " Sat | \n",
284 | " Mumbai | \n",
285 | " 34 | \n",
286 | "
\n",
287 | " \n",
288 | " 20 | \n",
289 | " Sun | \n",
290 | " Mumbai | \n",
291 | " 23 | \n",
292 | "
\n",
293 | " \n",
294 | "
\n",
295 | "
"
296 | ],
297 | "text/plain": [
298 | " Day variable value\n",
299 | "0 Mon Kerala 23\n",
300 | "1 Tue Kerala 32\n",
301 | "2 Wed Kerala 13\n",
302 | "3 Thu Kerala 42\n",
303 | "4 Fri Kerala 13\n",
304 | "5 Sat Kerala 43\n",
305 | "6 Sun Kerala 23\n",
306 | "7 Mon Kolkata 10\n",
307 | "8 Tue Kolkata 20\n",
308 | "9 Wed Kolkata 30\n",
309 | "10 Thu Kolkata 40\n",
310 | "11 Fri Kolkata 32\n",
311 | "12 Sat Kolkata 34\n",
312 | "13 Sun Kolkata 23\n",
313 | "14 Mon Mumbai 10\n",
314 | "15 Tue Mumbai 20\n",
315 | "16 Wed Mumbai 30\n",
316 | "17 Thu Mumbai 40\n",
317 | "18 Fri Mumbai 32\n",
318 | "19 Sat Mumbai 34\n",
319 | "20 Sun Mumbai 23"
320 | ]
321 | },
322 | "execution_count": 10,
323 | "metadata": {},
324 | "output_type": "execute_result"
325 | }
326 | ],
327 | "source": [
328 | "df2=pd.melt(df,id_vars=['Day'],var_name='City',value_name='Temp')\n",
329 | "df2"
330 | ]
331 | },
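{
"cell_type": "markdown",
"metadata": {},
"source": [
"melt() also takes a value_vars argument when you only want to reshape some of the columns; the cell below is a minimal sketch (illustrative, not executed here):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Keep 'Day' fixed and melt only the Kolkata and Mumbai columns (illustrative sketch)\n",
"df3=pd.melt(df,id_vars=['Day'],value_vars=['Kolkata','Mumbai'],var_name='City',value_name='Temp')\n",
"df3"
]
},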
332 | {
333 | "cell_type": "code",
334 | "execution_count": null,
335 | "metadata": {},
336 | "outputs": [],
337 | "source": []
338 | }
339 | ],
340 | "metadata": {
341 | "kernelspec": {
342 | "display_name": "Python 3",
343 | "language": "python",
344 | "name": "python3"
345 | },
346 | "language_info": {
347 | "codemirror_mode": {
348 | "name": "ipython",
349 | "version": 3
350 | },
351 | "file_extension": ".py",
352 | "mimetype": "text/x-python",
353 | "name": "python",
354 | "nbconvert_exporter": "python",
355 | "pygments_lexer": "ipython3",
356 | "version": "3.6.4"
357 | }
358 | },
359 | "nbformat": 4,
360 | "nbformat_minor": 2
361 | }
362 |
--------------------------------------------------------------------------------
/pandas_part3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Reading, writing CSV and Excel file
"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 16,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import pandas as pd"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": 17,
22 | "metadata": {
23 | "scrolled": true
24 | },
25 | "outputs": [
26 | {
27 | "data": {
28 | "text/html": [
29 | "\n",
30 | "\n",
43 | "
\n",
44 | " \n",
45 | " \n",
46 | " | \n",
47 | " Formatted Date | \n",
48 | " Summary | \n",
49 | " Precip Type | \n",
50 | " Temperature (C) | \n",
51 | " Apparent Temperature (C) | \n",
52 | " Humidity | \n",
53 | " Wind Speed (km/h) | \n",
54 | " Wind Bearing (degrees) | \n",
55 | " Visibility (km) | \n",
56 | " Loud Cover | \n",
57 | " Pressure (millibars) | \n",
58 | " Daily Summary | \n",
59 | "
\n",
60 | " \n",
61 | " \n",
62 | " \n",
63 | " 0 | \n",
64 | " 2006-04-01 00:00:00.000 +0200 | \n",
65 | " Partly Cloudy | \n",
66 | " rain | \n",
67 | " 9.472222 | \n",
68 | " 7.388889 | \n",
69 | " 0.89 | \n",
70 | " 14.1197 | \n",
71 | " 251.0 | \n",
72 | " 15.8263 | \n",
73 | " 0.0 | \n",
74 | " 1015.13 | \n",
75 | " Partly cloudy throughout the day. | \n",
76 | "
\n",
77 | " \n",
78 | " 1 | \n",
79 | " 2006-04-01 01:00:00.000 +0200 | \n",
80 | " Partly Cloudy | \n",
81 | " rain | \n",
82 | " 9.355556 | \n",
83 | " 7.227778 | \n",
84 | " 0.86 | \n",
85 | " 14.2646 | \n",
86 | " 259.0 | \n",
87 | " 15.8263 | \n",
88 | " 0.0 | \n",
89 | " 1015.63 | \n",
90 | " Partly cloudy throughout the day. | \n",
91 | "
\n",
92 | " \n",
93 | " 2 | \n",
94 | " 2006-04-01 02:00:00.000 +0200 | \n",
95 | " Mostly Cloudy | \n",
96 | " rain | \n",
97 | " 9.377778 | \n",
98 | " 9.377778 | \n",
99 | " 0.89 | \n",
100 | " 3.9284 | \n",
101 | " 204.0 | \n",
102 | " 14.9569 | \n",
103 | " 0.0 | \n",
104 | " 1015.94 | \n",
105 | " Partly cloudy throughout the day. | \n",
106 | "
\n",
107 | " \n",
108 | "
\n",
109 | "
"
110 | ],
111 | "text/plain": [
112 | " Formatted Date Summary Precip Type Temperature (C) \\\n",
113 | "0 2006-04-01 00:00:00.000 +0200 Partly Cloudy rain 9.472222 \n",
114 | "1 2006-04-01 01:00:00.000 +0200 Partly Cloudy rain 9.355556 \n",
115 | "2 2006-04-01 02:00:00.000 +0200 Mostly Cloudy rain 9.377778 \n",
116 | "\n",
117 | " Apparent Temperature (C) Humidity Wind Speed (km/h) \\\n",
118 | "0 7.388889 0.89 14.1197 \n",
119 | "1 7.227778 0.86 14.2646 \n",
120 | "2 9.377778 0.89 3.9284 \n",
121 | "\n",
122 | " Wind Bearing (degrees) Visibility (km) Loud Cover Pressure (millibars) \\\n",
123 | "0 251.0 15.8263 0.0 1015.13 \n",
124 | "1 259.0 15.8263 0.0 1015.63 \n",
125 | "2 204.0 14.9569 0.0 1015.94 \n",
126 | "\n",
127 | " Daily Summary \n",
128 | "0 Partly cloudy throughout the day. \n",
129 | "1 Partly cloudy throughout the day. \n",
130 | "2 Partly cloudy throughout the day. "
131 | ]
132 | },
133 | "execution_count": 17,
134 | "metadata": {},
135 | "output_type": "execute_result"
136 | }
137 | ],
138 | "source": [
139 | "#Reading CSV file\n",
140 | "d=pd.read_csv('datasets/weather.csv')\n",
141 | "d.head(3)"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": 18,
147 | "metadata": {},
148 | "outputs": [
149 | {
150 | "data": {
151 | "text/html": [
152 | "\n",
153 | "\n",
166 | "
\n",
167 | " \n",
168 | " \n",
169 | " | \n",
170 | " pclass | \n",
171 | " survived | \n",
172 | " name | \n",
173 | " sex | \n",
174 | " age | \n",
175 | " sibsp | \n",
176 | " parch | \n",
177 | " ticket | \n",
178 | " fare | \n",
179 | " cabin | \n",
180 | " embarked | \n",
181 | " boat | \n",
182 | " body | \n",
183 | " home.dest | \n",
184 | "
\n",
185 | " \n",
186 | " \n",
187 | " \n",
188 | " 0 | \n",
189 | " 1 | \n",
190 | " 1 | \n",
191 | " Allen, Miss. Elisabeth Walton | \n",
192 | " female | \n",
193 | " 29 | \n",
194 | " 0 | \n",
195 | " 0 | \n",
196 | " 24160 | \n",
197 | " 211.3375 | \n",
198 | " B5 | \n",
199 | " S | \n",
200 | " 2 | \n",
201 | " None | \n",
202 | " St Louis, MO | \n",
203 | "
\n",
204 | " \n",
205 | " 1 | \n",
206 | " 1 | \n",
207 | " 1 | \n",
208 | " Allison, Master. Hudson Trevor | \n",
209 | " male | \n",
210 | " 0.9167 | \n",
211 | " 1 | \n",
212 | " 2 | \n",
213 | " 113781 | \n",
214 | " 151.5500 | \n",
215 | " C22 C26 | \n",
216 | " S | \n",
217 | " 11 | \n",
218 | " None | \n",
219 | " Montreal, PQ / Chesterville, ON | \n",
220 | "
\n",
221 | " \n",
222 | " 2 | \n",
223 | " 1 | \n",
224 | " 0 | \n",
225 | " Allison, Miss. Helen Loraine | \n",
226 | " female | \n",
227 | " 2 | \n",
228 | " 1 | \n",
229 | " 2 | \n",
230 | " 113781 | \n",
231 | " 151.5500 | \n",
232 | " C22 C26 | \n",
233 | " S | \n",
234 | " None | \n",
235 | " None | \n",
236 | " Montreal, PQ / Chesterville, ON | \n",
237 | "
\n",
238 | " \n",
239 | "
\n",
240 | "
"
241 | ],
242 | "text/plain": [
243 | " pclass survived name sex age sibsp \\\n",
244 | "0 1 1 Allen, Miss. Elisabeth Walton female 29 0 \n",
245 | "1 1 1 Allison, Master. Hudson Trevor male 0.9167 1 \n",
246 | "2 1 0 Allison, Miss. Helen Loraine female 2 1 \n",
247 | "\n",
248 | " parch ticket fare cabin embarked boat body \\\n",
249 | "0 0 24160 211.3375 B5 S 2 None \n",
250 | "1 2 113781 151.5500 C22 C26 S 11 None \n",
251 | "2 2 113781 151.5500 C22 C26 S None None \n",
252 | "\n",
253 | " home.dest \n",
254 | "0 St Louis, MO \n",
255 | "1 Montreal, PQ / Chesterville, ON \n",
256 | "2 Montreal, PQ / Chesterville, ON "
257 | ]
258 | },
259 | "execution_count": 18,
260 | "metadata": {},
261 | "output_type": "execute_result"
262 | }
263 | ],
264 | "source": [
265 | "#Reading excel file\n",
266 | "df=pd.read_excel('datasets/titanic.xls')\n",
267 | "df.head(3)"
268 | ]
269 | },
270 | {
271 | "cell_type": "markdown",
272 | "metadata": {},
273 | "source": [
274 | "### If you want to read only few rows rather than all"
275 | ]
276 | },
277 | {
278 | "cell_type": "code",
279 | "execution_count": 19,
280 | "metadata": {},
281 | "outputs": [
282 | {
283 | "data": {
284 | "text/html": [
285 | "\n",
286 | "\n",
299 | "
\n",
300 | " \n",
301 | " \n",
302 | " | \n",
303 | " Formatted Date | \n",
304 | " Summary | \n",
305 | " Precip Type | \n",
306 | " Temperature (C) | \n",
307 | " Apparent Temperature (C) | \n",
308 | " Humidity | \n",
309 | " Wind Speed (km/h) | \n",
310 | " Wind Bearing (degrees) | \n",
311 | " Visibility (km) | \n",
312 | " Loud Cover | \n",
313 | " Pressure (millibars) | \n",
314 | " Daily Summary | \n",
315 | "
\n",
316 | " \n",
317 | " \n",
318 | " \n",
319 | " 0 | \n",
320 | " 2006-04-01 00:00:00.000 +0200 | \n",
321 | " Partly Cloudy | \n",
322 | " rain | \n",
323 | " 9.472222 | \n",
324 | " 7.388889 | \n",
325 | " 0.89 | \n",
326 | " 14.1197 | \n",
327 | " 251.0 | \n",
328 | " 15.8263 | \n",
329 | " 0.0 | \n",
330 | " 1015.13 | \n",
331 | " Partly cloudy throughout the day. | \n",
332 | "
\n",
333 | " \n",
334 | " 1 | \n",
335 | " 2006-04-01 01:00:00.000 +0200 | \n",
336 | " Partly Cloudy | \n",
337 | " rain | \n",
338 | " 9.355556 | \n",
339 | " 7.227778 | \n",
340 | " 0.86 | \n",
341 | " 14.2646 | \n",
342 | " 259.0 | \n",
343 | " 15.8263 | \n",
344 | " 0.0 | \n",
345 | " 1015.63 | \n",
346 | " Partly cloudy throughout the day. | \n",
347 | "
\n",
348 | " \n",
349 | " 2 | \n",
350 | " 2006-04-01 02:00:00.000 +0200 | \n",
351 | " Mostly Cloudy | \n",
352 | " rain | \n",
353 | " 9.377778 | \n",
354 | " 9.377778 | \n",
355 | " 0.89 | \n",
356 | " 3.9284 | \n",
357 | " 204.0 | \n",
358 | " 14.9569 | \n",
359 | " 0.0 | \n",
360 | " 1015.94 | \n",
361 | " Partly cloudy throughout the day. | \n",
362 | "
\n",
363 | " \n",
364 | " 3 | \n",
365 | " 2006-04-01 03:00:00.000 +0200 | \n",
366 | " Partly Cloudy | \n",
367 | " rain | \n",
368 | " 8.288889 | \n",
369 | " 5.944444 | \n",
370 | " 0.83 | \n",
371 | " 14.1036 | \n",
372 | " 269.0 | \n",
373 | " 15.8263 | \n",
374 | " 0.0 | \n",
375 | " 1016.41 | \n",
376 | " Partly cloudy throughout the day. | \n",
377 | "
\n",
378 | " \n",
379 | " 4 | \n",
380 | " 2006-04-01 04:00:00.000 +0200 | \n",
381 | " Mostly Cloudy | \n",
382 | " rain | \n",
383 | " 8.755556 | \n",
384 | " 6.977778 | \n",
385 | " 0.83 | \n",
386 | " 11.0446 | \n",
387 | " 259.0 | \n",
388 | " 15.8263 | \n",
389 | " 0.0 | \n",
390 | " 1016.51 | \n",
391 | " Partly cloudy throughout the day. | \n",
392 | "
\n",
393 | " \n",
394 | " 5 | \n",
395 | " 2006-04-01 05:00:00.000 +0200 | \n",
396 | " Partly Cloudy | \n",
397 | " rain | \n",
398 | " 9.222222 | \n",
399 | " 7.111111 | \n",
400 | " 0.85 | \n",
401 | " 13.9587 | \n",
402 | " 258.0 | \n",
403 | " 14.9569 | \n",
404 | " 0.0 | \n",
405 | " 1016.66 | \n",
406 | " Partly cloudy throughout the day. | \n",
407 | "
\n",
408 | " \n",
409 | " 6 | \n",
410 | " 2006-04-01 06:00:00.000 +0200 | \n",
411 | " Partly Cloudy | \n",
412 | " rain | \n",
413 | " 7.733333 | \n",
414 | " 5.522222 | \n",
415 | " 0.95 | \n",
416 | " 12.3648 | \n",
417 | " 259.0 | \n",
418 | " 9.9820 | \n",
419 | " 0.0 | \n",
420 | " 1016.72 | \n",
421 | " Partly cloudy throughout the day. | \n",
422 | "
\n",
423 | " \n",
424 | "
\n",
425 | "
"
426 | ],
427 | "text/plain": [
428 | " Formatted Date Summary Precip Type Temperature (C) \\\n",
429 | "0 2006-04-01 00:00:00.000 +0200 Partly Cloudy rain 9.472222 \n",
430 | "1 2006-04-01 01:00:00.000 +0200 Partly Cloudy rain 9.355556 \n",
431 | "2 2006-04-01 02:00:00.000 +0200 Mostly Cloudy rain 9.377778 \n",
432 | "3 2006-04-01 03:00:00.000 +0200 Partly Cloudy rain 8.288889 \n",
433 | "4 2006-04-01 04:00:00.000 +0200 Mostly Cloudy rain 8.755556 \n",
434 | "5 2006-04-01 05:00:00.000 +0200 Partly Cloudy rain 9.222222 \n",
435 | "6 2006-04-01 06:00:00.000 +0200 Partly Cloudy rain 7.733333 \n",
436 | "\n",
437 | " Apparent Temperature (C) Humidity Wind Speed (km/h) \\\n",
438 | "0 7.388889 0.89 14.1197 \n",
439 | "1 7.227778 0.86 14.2646 \n",
440 | "2 9.377778 0.89 3.9284 \n",
441 | "3 5.944444 0.83 14.1036 \n",
442 | "4 6.977778 0.83 11.0446 \n",
443 | "5 7.111111 0.85 13.9587 \n",
444 | "6 5.522222 0.95 12.3648 \n",
445 | "\n",
446 | " Wind Bearing (degrees) Visibility (km) Loud Cover Pressure (millibars) \\\n",
447 | "0 251.0 15.8263 0.0 1015.13 \n",
448 | "1 259.0 15.8263 0.0 1015.63 \n",
449 | "2 204.0 14.9569 0.0 1015.94 \n",
450 | "3 269.0 15.8263 0.0 1016.41 \n",
451 | "4 259.0 15.8263 0.0 1016.51 \n",
452 | "5 258.0 14.9569 0.0 1016.66 \n",
453 | "6 259.0 9.9820 0.0 1016.72 \n",
454 | "\n",
455 | " Daily Summary \n",
456 | "0 Partly cloudy throughout the day. \n",
457 | "1 Partly cloudy throughout the day. \n",
458 | "2 Partly cloudy throughout the day. \n",
459 | "3 Partly cloudy throughout the day. \n",
460 | "4 Partly cloudy throughout the day. \n",
461 | "5 Partly cloudy throughout the day. \n",
462 | "6 Partly cloudy throughout the day. "
463 | ]
464 | },
465 | "execution_count": 19,
466 | "metadata": {},
467 | "output_type": "execute_result"
468 | }
469 | ],
470 | "source": [
471 | "df=pd.read_csv('datasets/weather.csv',nrows=7)\n",
472 | "df"
473 | ]
474 | },
475 | {
476 | "cell_type": "markdown",
477 | "metadata": {},
478 | "source": [
479 | "### Changing specific values with NaN while reading\n",
480 | " * While reading the dataset you can change the specific value with NaN\n",
481 | " > df = pd.read_excel ( \" titanic.xls \" , na_values = [ list of element which you want to change to NaN ] ) "
482 | ]
483 | },
484 | {
485 | "cell_type": "code",
486 | "execution_count": 20,
487 | "metadata": {},
488 | "outputs": [
489 | {
490 | "data": {
491 | "text/html": [
492 | "\n",
493 | "\n",
506 | "
\n",
507 | " \n",
508 | " \n",
509 | " | \n",
510 | " pclass | \n",
511 | " survived | \n",
512 | " name | \n",
513 | " sex | \n",
514 | " age | \n",
515 | " sibsp | \n",
516 | " parch | \n",
517 | " ticket | \n",
518 | " fare | \n",
519 | " cabin | \n",
520 | " embarked | \n",
521 | " boat | \n",
522 | " body | \n",
523 | " home.dest | \n",
524 | "
\n",
525 | " \n",
526 | " \n",
527 | " \n",
528 | " 0 | \n",
529 | " 1 | \n",
530 | " 1 | \n",
531 | " Allen, Miss. Elisabeth Walton | \n",
532 | " female | \n",
533 | " 29.0000 | \n",
534 | " 0 | \n",
535 | " 0 | \n",
536 | " 24160 | \n",
537 | " 211.3375 | \n",
538 | " B5 | \n",
539 | " S | \n",
540 | " 2 | \n",
541 | " NaN | \n",
542 | " St Louis, MO | \n",
543 | "
\n",
544 | " \n",
545 | " 1 | \n",
546 | " 1 | \n",
547 | " 1 | \n",
548 | " Allison, Master. Hudson Trevor | \n",
549 | " male | \n",
550 | " 0.9167 | \n",
551 | " 1 | \n",
552 | " 2 | \n",
553 | " 113781 | \n",
554 | " 151.5500 | \n",
555 | " C22 C26 | \n",
556 | " S | \n",
557 | " 11 | \n",
558 | " NaN | \n",
559 | " Montreal, PQ / Chesterville, ON | \n",
560 | "
\n",
561 | " \n",
562 | " 2 | \n",
563 | " 1 | \n",
564 | " 0 | \n",
565 | " Allison, Miss. Helen Loraine | \n",
566 | " female | \n",
567 | " 2.0000 | \n",
568 | " 1 | \n",
569 | " 2 | \n",
570 | " 113781 | \n",
571 | " 151.5500 | \n",
572 | " C22 C26 | \n",
573 | " S | \n",
574 | " NaN | \n",
575 | " NaN | \n",
576 | " Montreal, PQ / Chesterville, ON | \n",
577 | "
\n",
578 | " \n",
579 | " 3 | \n",
580 | " 1 | \n",
581 | " 0 | \n",
582 | " Allison, Mr. Hudson Joshua Creighton | \n",
583 | " male | \n",
584 | " 30.0000 | \n",
585 | " 1 | \n",
586 | " 2 | \n",
587 | " 113781 | \n",
588 | " 151.5500 | \n",
589 | " C22 C26 | \n",
590 | " S | \n",
591 | " NaN | \n",
592 | " 135.0 | \n",
593 | " Montreal, PQ / Chesterville, ON | \n",
594 | "
\n",
595 | " \n",
596 | " 4 | \n",
597 | " 1 | \n",
598 | " 0 | \n",
599 | " Allison, Mrs. Hudson J C (Bessie Waldo Daniels) | \n",
600 | " female | \n",
601 | " 25.0000 | \n",
602 | " 1 | \n",
603 | " 2 | \n",
604 | " 113781 | \n",
605 | " 151.5500 | \n",
606 | " C22 C26 | \n",
607 | " S | \n",
608 | " NaN | \n",
609 | " NaN | \n",
610 | " Montreal, PQ / Chesterville, ON | \n",
611 | "
\n",
612 | " \n",
613 | "
\n",
614 | "
"
615 | ],
616 | "text/plain": [
617 | " pclass survived name sex \\\n",
618 | "0 1 1 Allen, Miss. Elisabeth Walton female \n",
619 | "1 1 1 Allison, Master. Hudson Trevor male \n",
620 | "2 1 0 Allison, Miss. Helen Loraine female \n",
621 | "3 1 0 Allison, Mr. Hudson Joshua Creighton male \n",
622 | "4 1 0 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) female \n",
623 | "\n",
624 | " age sibsp parch ticket fare cabin embarked boat body \\\n",
625 | "0 29.0000 0 0 24160 211.3375 B5 S 2 NaN \n",
626 | "1 0.9167 1 2 113781 151.5500 C22 C26 S 11 NaN \n",
627 | "2 2.0000 1 2 113781 151.5500 C22 C26 S NaN NaN \n",
628 | "3 30.0000 1 2 113781 151.5500 C22 C26 S NaN 135.0 \n",
629 | "4 25.0000 1 2 113781 151.5500 C22 C26 S NaN NaN \n",
630 | "\n",
631 | " home.dest \n",
632 | "0 St Louis, MO \n",
633 | "1 Montreal, PQ / Chesterville, ON \n",
634 | "2 Montreal, PQ / Chesterville, ON \n",
635 | "3 Montreal, PQ / Chesterville, ON \n",
636 | "4 Montreal, PQ / Chesterville, ON "
637 | ]
638 | },
639 | "execution_count": 20,
640 | "metadata": {},
641 | "output_type": "execute_result"
642 | }
643 | ],
644 | "source": [
645 | "df = pd.read_excel(\"datasets/titanic.xls\",na_values=[None]) \n",
646 | "df.head()"
647 | ]
648 | },
649 | {
650 | "cell_type": "markdown",
651 | "metadata": {},
652 | "source": [
653 | "### Changing specific values with NaN columns wise while reading"
654 | ]
655 | },
656 | {
657 | "cell_type": "code",
658 | "execution_count": 21,
659 | "metadata": {},
660 | "outputs": [
661 | {
662 | "data": {
663 | "text/html": [
664 | "\n",
665 | "\n",
678 | "
\n",
679 | " \n",
680 | " \n",
681 | " | \n",
682 | " pclass | \n",
683 | " survived | \n",
684 | " name | \n",
685 | " sex | \n",
686 | " age | \n",
687 | " sibsp | \n",
688 | " parch | \n",
689 | " ticket | \n",
690 | " fare | \n",
691 | " cabin | \n",
692 | " embarked | \n",
693 | " boat | \n",
694 | " body | \n",
695 | " home.dest | \n",
696 | "
\n",
697 | " \n",
698 | " \n",
699 | " \n",
700 | " 0 | \n",
701 | " 1 | \n",
702 | " 1 | \n",
703 | " Allen, Miss. Elisabeth Walton | \n",
704 | " female | \n",
705 | " 29 | \n",
706 | " 0 | \n",
707 | " 0 | \n",
708 | " 24160 | \n",
709 | " 211.3375 | \n",
710 | " B5 | \n",
711 | " S | \n",
712 | " 2 | \n",
713 | " None | \n",
714 | " St Louis, MO | \n",
715 | "
\n",
716 | " \n",
717 | " 1 | \n",
718 | " 1 | \n",
719 | " 1 | \n",
720 | " Allison, Master. Hudson Trevor | \n",
721 | " male | \n",
722 | " 0.9167 | \n",
723 | " 1 | \n",
724 | " 2 | \n",
725 | " 113781 | \n",
726 | " 151.5500 | \n",
727 | " C22 C26 | \n",
728 | " S | \n",
729 | " 11 | \n",
730 | " None | \n",
731 | " Montreal, PQ / Chesterville, ON | \n",
732 | "
\n",
733 | " \n",
734 | " 2 | \n",
735 | " 1 | \n",
736 | " 0 | \n",
737 | " Allison, Miss. Helen Loraine | \n",
738 | " female | \n",
739 | " 2 | \n",
740 | " 1 | \n",
741 | " 2 | \n",
742 | " 113781 | \n",
743 | " 151.5500 | \n",
744 | " C22 C26 | \n",
745 | " S | \n",
746 | " None | \n",
747 | " None | \n",
748 | " Montreal, PQ / Chesterville, ON | \n",
749 | "
\n",
750 | " \n",
751 | " 3 | \n",
752 | " 1 | \n",
753 | " 0 | \n",
754 | " Allison, Mr. Hudson Joshua Creighton | \n",
755 | " male | \n",
756 | " 30 | \n",
757 | " 1 | \n",
758 | " 2 | \n",
759 | " 113781 | \n",
760 | " 151.5500 | \n",
761 | " C22 C26 | \n",
762 | " S | \n",
763 | " None | \n",
764 | " 135 | \n",
765 | " Montreal, PQ / Chesterville, ON | \n",
766 | "
\n",
767 | " \n",
768 | " 4 | \n",
769 | " 1 | \n",
770 | " 0 | \n",
771 | " Allison, Mrs. Hudson J C (Bessie Waldo Daniels) | \n",
772 | " female | \n",
773 | " 25 | \n",
774 | " 1 | \n",
775 | " 2 | \n",
776 | " 113781 | \n",
777 | " 151.5500 | \n",
778 | " C22 C26 | \n",
779 | " S | \n",
780 | " None | \n",
781 | " None | \n",
782 | " Montreal, PQ / Chesterville, ON | \n",
783 | "
\n",
784 | " \n",
785 | "
\n",
786 | "
"
787 | ],
788 | "text/plain": [
789 | " pclass survived name sex \\\n",
790 | "0 1 1 Allen, Miss. Elisabeth Walton female \n",
791 | "1 1 1 Allison, Master. Hudson Trevor male \n",
792 | "2 1 0 Allison, Miss. Helen Loraine female \n",
793 | "3 1 0 Allison, Mr. Hudson Joshua Creighton male \n",
794 | "4 1 0 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) female \n",
795 | "\n",
796 | " age sibsp parch ticket fare cabin embarked boat body \\\n",
797 | "0 29 0 0 24160 211.3375 B5 S 2 None \n",
798 | "1 0.9167 1 2 113781 151.5500 C22 C26 S 11 None \n",
799 | "2 2 1 2 113781 151.5500 C22 C26 S None None \n",
800 | "3 30 1 2 113781 151.5500 C22 C26 S None 135 \n",
801 | "4 25 1 2 113781 151.5500 C22 C26 S None None \n",
802 | "\n",
803 | " home.dest \n",
804 | "0 St Louis, MO \n",
805 | "1 Montreal, PQ / Chesterville, ON \n",
806 | "2 Montreal, PQ / Chesterville, ON \n",
807 | "3 Montreal, PQ / Chesterville, ON \n",
808 | "4 Montreal, PQ / Chesterville, ON "
809 | ]
810 | },
811 | "execution_count": 21,
812 | "metadata": {},
813 | "output_type": "execute_result"
814 | }
815 | ],
816 | "source": [
817 | "df=pd.read_excel('datasets/titanic.xls')\n",
818 | "df.head()"
819 | ]
820 | },
821 | {
822 | "cell_type": "code",
823 | "execution_count": 22,
824 | "metadata": {},
825 | "outputs": [],
826 | "source": [
827 | "df=pd.read_excel('datasets/titanic.xls',na_values={'body':[None],'boat':[None],'parch':[2,0]})\n"
828 | ]
829 | },
830 | {
831 | "cell_type": "markdown",
832 | "metadata": {},
833 | "source": [
834 | " * na_values argument is used to replace all the values passes in the list with NaN.You can also replace some invalid values column wise as you can in the above code. If you run the above code you will a dataframe where 'None' in body column is replaced by NaN,again None in the boat column is replaced by NaN and 2 & 0 in parch column are replaced by NaN."
835 | ]
836 | },
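{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Quick check of the column-wise replacements described above (illustrative, not executed here)\n",
"df.head()"
]
},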
837 | {
838 | "cell_type": "markdown",
839 | "metadata": {},
840 | "source": [
841 | "### Writing into CSV or excel \n",
842 | " * You can convert any dataframe in a new CSV or excel file file"
843 | ]
844 | },
845 | {
846 | "cell_type": "code",
847 | "execution_count": 23,
848 | "metadata": {},
849 | "outputs": [
850 | {
851 | "data": {
852 | "text/html": [
853 | "\n",
854 | "\n",
867 | "
\n",
868 | " \n",
869 | " \n",
870 | " | \n",
871 | " dates | \n",
872 | " day | \n",
873 | " temp | \n",
874 | " wind-speed | \n",
875 | "
\n",
876 | " \n",
877 | " \n",
878 | " \n",
879 | " 0 | \n",
880 | " 02-01-12 | \n",
881 | " sunny | \n",
882 | " 45 | \n",
883 | " 12 | \n",
884 | "
\n",
885 | " \n",
886 | " 1 | \n",
887 | " 03-01-12 | \n",
888 | " rainy | \n",
889 | " 46 | \n",
890 | " 34 | \n",
891 | "
\n",
892 | " \n",
893 | " 2 | \n",
894 | " 04-01-12 | \n",
895 | " hot | \n",
896 | " 47 | \n",
897 | " 45 | \n",
898 | "
\n",
899 | " \n",
900 | " 3 | \n",
901 | " 05-01-12 | \n",
902 | " sunny | \n",
903 | " 48 | \n",
904 | " 56 | \n",
905 | "
\n",
906 | " \n",
907 | " 4 | \n",
908 | " 06-01-12 | \n",
909 | " hot | \n",
910 | " 49 | \n",
911 | " 67 | \n",
912 | "
\n",
913 | " \n",
914 | "
\n",
915 | "
"
916 | ],
917 | "text/plain": [
918 | " dates day temp wind-speed\n",
919 | "0 02-01-12 sunny 45 12\n",
920 | "1 03-01-12 rainy 46 34\n",
921 | "2 04-01-12 hot 47 45\n",
922 | "3 05-01-12 sunny 48 56\n",
923 | "4 06-01-12 hot 49 67"
924 | ]
925 | },
926 | "execution_count": 23,
927 | "metadata": {},
928 | "output_type": "execute_result"
929 | }
930 | ],
931 | "source": [
932 | "#Creatiing my own disctionary\n",
933 | "mydict={\n",
934 | " 'dates':['02-01-12','03-01-12','04-01-12','05-01-12','06-01-12'],\n",
935 | " 'day':['sunny','rainy','hot','sunny','hot'],\n",
936 | " 'wind-speed':[12,34,45,56,67],\n",
937 | " 'temp':[45,46,47,48,49]\n",
938 | " }\n",
939 | "#Converting disction to dataframe object\n",
940 | "df=pd.DataFrame(mydict)\n",
941 | "\n",
942 | "#Printing the dataframe\n",
943 | "df"
944 | ]
945 | },
946 | {
947 | "cell_type": "markdown",
948 | "metadata": {},
949 | "source": [
950 | "### Creating a new file\n"
951 | ]
952 | },
953 | {
954 | "cell_type": "markdown",
955 | "metadata": {},
956 | "source": [
957 | "#### Creating a new csv file mycsv.csv"
958 | ]
959 | },
960 | {
961 | "cell_type": "markdown",
962 | "metadata": {},
963 | "source": [
964 | "Suppose you have read a csv or excel file as dataframe object and you did some modification.Now you want to write a new csv or excel file that contains the modified dataframe. In that case you will need to_csv() or to_excel() function to create a new file.See the example below -"
965 | ]
966 | },
967 | {
968 | "cell_type": "code",
969 | "execution_count": 24,
970 | "metadata": {},
971 | "outputs": [],
972 | "source": [
973 | "df.to_csv('datasets/mycsv.csv',index=False)\n"
974 | ]
975 | },
976 | {
977 | "cell_type": "markdown",
978 | "metadata": {},
979 | "source": [
980 | "#### Creating a new excel file myexcel.xls"
981 | ]
982 | },
983 | {
984 | "cell_type": "code",
985 | "execution_count": 25,
986 | "metadata": {},
987 | "outputs": [],
988 | "source": [
989 | "df.to_excel('datasets/myexcel.xls',index=False)"
990 | ]
991 | },
992 | {
993 | "cell_type": "markdown",
994 | "metadata": {},
995 | "source": [
996 | "### Writing into csv or excel only selected rows\n",
997 | "
\n",
998 | "Suppose you modified the dataframe and you want to write it in another csv file with only selected columns.In the following example we have created the new csv file mycsv_few_columns.csv by \"df\" dataframe allowing only three columns which we want i.e, day, temp, wind-speed and i have ignored the date column."
999 | ]
1000 | },
1001 | {
1002 | "cell_type": "markdown",
1003 | "metadata": {},
1004 | "source": [
1005 | "#### how many columns in dataframe df ?"
1006 | ]
1007 | },
1008 | {
1009 | "cell_type": "code",
1010 | "execution_count": 26,
1011 | "metadata": {},
1012 | "outputs": [
1013 | {
1014 | "data": {
1015 | "text/plain": [
1016 | "Index(['dates', 'day', 'temp', 'wind-speed'], dtype='object')"
1017 | ]
1018 | },
1019 | "execution_count": 26,
1020 | "metadata": {},
1021 | "output_type": "execute_result"
1022 | }
1023 | ],
1024 | "source": [
1025 | "df.columns"
1026 | ]
1027 | },
1028 | {
1029 | "cell_type": "code",
1030 | "execution_count": 27,
1031 | "metadata": {},
1032 | "outputs": [
1033 | {
1034 | "data": {
1035 | "text/html": [
1036 | "\n",
1037 | "\n",
1050 | "
\n",
1051 | " \n",
1052 | " \n",
1053 | " | \n",
1054 | " day | \n",
1055 | " temp | \n",
1056 | " wind-speed | \n",
1057 | "
\n",
1058 | " \n",
1059 | " \n",
1060 | " \n",
1061 | " 0 | \n",
1062 | " sunny | \n",
1063 | " 45 | \n",
1064 | " 12 | \n",
1065 | "
\n",
1066 | " \n",
1067 | " 1 | \n",
1068 | " rainy | \n",
1069 | " 46 | \n",
1070 | " 34 | \n",
1071 | "
\n",
1072 | " \n",
1073 | " 2 | \n",
1074 | " hot | \n",
1075 | " 47 | \n",
1076 | " 45 | \n",
1077 | "
\n",
1078 | " \n",
1079 | " 3 | \n",
1080 | " sunny | \n",
1081 | " 48 | \n",
1082 | " 56 | \n",
1083 | "
\n",
1084 | " \n",
1085 | " 4 | \n",
1086 | " hot | \n",
1087 | " 49 | \n",
1088 | " 67 | \n",
1089 | "
\n",
1090 | " \n",
1091 | "
\n",
1092 | "
"
1093 | ],
1094 | "text/plain": [
1095 | " day temp wind-speed\n",
1096 | "0 sunny 45 12\n",
1097 | "1 rainy 46 34\n",
1098 | "2 hot 47 45\n",
1099 | "3 sunny 48 56\n",
1100 | "4 hot 49 67"
1101 | ]
1102 | },
1103 | "execution_count": 27,
1104 | "metadata": {},
1105 | "output_type": "execute_result"
1106 | }
1107 | ],
1108 | "source": [
1109 | "#Writing only three columns day,temp,wind-speed\n",
1110 | "df.to_csv('datasets/mycsv_few_columns.csv',columns=['day','temp','wind-speed'],index=False)\n",
1111 | "\n",
1112 | "#Again reading to see the result\n",
1113 | "d=pd.read_csv('datasets/mycsv_few_columns.csv')\n",
1114 | "d"
1115 | ]
1116 | },
1117 | {
1118 | "cell_type": "markdown",
1119 | "metadata": {},
1120 | "source": [
1121 | "### Converters :"
1122 | ]
1123 | },
1124 | {
1125 | "cell_type": "markdown",
1126 | "metadata": {},
1127 | "source": [
1128 | "In various cases you dont have clean dataset. Invalid values in the dataset leads to a lot of problem while predicting or extracting the meaningful information.To avoid this problem we use converters.\n",
1129 | "* Converters are basically functions which convert the specific value of a column in your desire value\n",
1130 | "* Converter function passed into the disctionary like in the na_values.\n"
1131 | ]
1132 | },
1133 | {
1134 | "cell_type": "markdown",
1135 | "metadata": {},
1136 | "source": [
1137 | "This is our dataset in which you can see that there are lots of invalid values are present.These values are noise in our dataset."
1138 | ]
1139 | },
1140 | {
1141 | "cell_type": "code",
1142 | "execution_count": 28,
1143 | "metadata": {},
1144 | "outputs": [
1145 | {
1146 | "data": {
1147 | "text/html": [
1148 | "\n",
1149 | "\n",
1162 | "
\n",
1163 | " \n",
1164 | " \n",
1165 | " | \n",
1166 | " dates | \n",
1167 | " day | \n",
1168 | " temp | \n",
1169 | " wind-speed | \n",
1170 | "
\n",
1171 | " \n",
1172 | " \n",
1173 | " \n",
1174 | " 0 | \n",
1175 | " 2/1/2012 | \n",
1176 | " sunny | \n",
1177 | " 45.0 | \n",
1178 | " 12 | \n",
1179 | "
\n",
1180 | " \n",
1181 | " 1 | \n",
1182 | " 3/1/2012 | \n",
1183 | " rainy | \n",
1184 | " 46.0 | \n",
1185 | " 34 | \n",
1186 | "
\n",
1187 | " \n",
1188 | " 2 | \n",
1189 | " 4/1/2012 | \n",
1190 | " hot | \n",
1191 | " 47.0 | \n",
1192 | " 45 | \n",
1193 | "
\n",
1194 | " \n",
1195 | " 3 | \n",
1196 | " 5/1/2012 | \n",
1197 | " NaN | \n",
1198 | " NaN | \n",
1199 | " 56 | \n",
1200 | "
\n",
1201 | " \n",
1202 | " 4 | \n",
1203 | " 6/1/2012 | \n",
1204 | " hot | \n",
1205 | " 49.0 | \n",
1206 | " Not available | \n",
1207 | "
\n",
1208 | " \n",
1209 | " 5 | \n",
1210 | " 7/1/2012 | \n",
1211 | " NaN | \n",
1212 | " NaN | \n",
1213 | " Not available | \n",
1214 | "
\n",
1215 | " \n",
1216 | " 6 | \n",
1217 | " 8/1/2012 | \n",
1218 | " hot | \n",
1219 | " 12.0 | \n",
1220 | " 45 | \n",
1221 | "
\n",
1222 | " \n",
1223 | " 7 | \n",
1224 | " 9/1/2012 | \n",
1225 | " rainy | \n",
1226 | " 23.0 | \n",
1227 | " 41 | \n",
1228 | "
\n",
1229 | " \n",
1230 | " 8 | \n",
1231 | " 10/1/2012 | \n",
1232 | " NaN | \n",
1233 | " NaN | \n",
1234 | " NaN | \n",
1235 | "
\n",
1236 | " \n",
1237 | " 9 | \n",
1238 | " 11/1/2012 | \n",
1239 | " NaN | \n",
1240 | " NaN | \n",
1241 | " NaN | \n",
1242 | "
\n",
1243 | " \n",
1244 | "
\n",
1245 | "
"
1246 | ],
1247 | "text/plain": [
1248 | " dates day temp wind-speed\n",
1249 | "0 2/1/2012 sunny 45.0 12\n",
1250 | "1 3/1/2012 rainy 46.0 34\n",
1251 | "2 4/1/2012 hot 47.0 45\n",
1252 | "3 5/1/2012 NaN NaN 56\n",
1253 | "4 6/1/2012 hot 49.0 Not available\n",
1254 | "5 7/1/2012 NaN NaN Not available\n",
1255 | "6 8/1/2012 hot 12.0 45\n",
1256 | "7 9/1/2012 rainy 23.0 41\n",
1257 | "8 10/1/2012 NaN NaN NaN\n",
1258 | "9 11/1/2012 NaN NaN NaN"
1259 | ]
1260 | },
1261 | "execution_count": 28,
1262 | "metadata": {},
1263 | "output_type": "execute_result"
1264 | }
1265 | ],
1266 | "source": [
1267 | "df=pd.read_csv('datasets/season.csv')\n",
1268 | "df"
1269 | ]
1270 | },
1271 | {
1272 | "cell_type": "markdown",
1273 | "metadata": {},
1274 | "source": [
1275 | "#### The following function will convert any cell of column having 'NaN' into 40, so here we will apply this function into 'temp' column and so for the 'day' and 'wind-speed' columns."
1276 | ]
1277 | },
1278 | {
1279 | "cell_type": "code",
1280 | "execution_count": 37,
1281 | "metadata": {},
1282 | "outputs": [],
1283 | "source": [
1284 | "def converter_for_temp(col):\n",
1285 | " if col=='NaN':\n",
1286 | " return 40\n",
1287 | " else:\n",
1288 | " return col\n",
1289 | "def converter_for_day(col):\n",
1290 | " if col=='NaN':\n",
1291 | " return 'sunny'\n",
1292 | " else:\n",
1293 | " return col\n",
1294 | "def converter_for_wind_speed(col):\n",
1295 | " if col=='Not available':\n",
1296 | " return 30\n",
1297 | " elif col==\"NaN\":\n",
1298 | " return 48\n",
1299 | " else:\n",
1300 | " return col"
1301 | ]
1302 | },
1303 | {
1304 | "cell_type": "code",
1305 | "execution_count": 38,
1306 | "metadata": {},
1307 | "outputs": [
1308 | {
1309 | "data": {
1310 | "text/html": [
1311 | "\n",
1312 | "\n",
1325 | "
\n",
1326 | " \n",
1327 | " \n",
1328 | " | \n",
1329 | " dates | \n",
1330 | " day | \n",
1331 | " temp | \n",
1332 | " wind-speed | \n",
1333 | "
\n",
1334 | " \n",
1335 | " \n",
1336 | " \n",
1337 | " 0 | \n",
1338 | " 2/1/2012 | \n",
1339 | " sunny | \n",
1340 | " 45 | \n",
1341 | " 12 | \n",
1342 | "
\n",
1343 | " \n",
1344 | " 1 | \n",
1345 | " 3/1/2012 | \n",
1346 | " rainy | \n",
1347 | " 46 | \n",
1348 | " 34 | \n",
1349 | "
\n",
1350 | " \n",
1351 | " 2 | \n",
1352 | " 4/1/2012 | \n",
1353 | " hot | \n",
1354 | " 47 | \n",
1355 | " 45 | \n",
1356 | "
\n",
1357 | " \n",
1358 | " 3 | \n",
1359 | " 5/1/2012 | \n",
1360 | " sunny | \n",
1361 | " 40 | \n",
1362 | " 56 | \n",
1363 | "
\n",
1364 | " \n",
1365 | " 4 | \n",
1366 | " 6/1/2012 | \n",
1367 | " hot | \n",
1368 | " 49 | \n",
1369 | " 30 | \n",
1370 | "
\n",
1371 | " \n",
1372 | " 5 | \n",
1373 | " 7/1/2012 | \n",
1374 | " sunny | \n",
1375 | " 40 | \n",
1376 | " 30 | \n",
1377 | "
\n",
1378 | " \n",
1379 | " 6 | \n",
1380 | " 8/1/2012 | \n",
1381 | " hot | \n",
1382 | " 12 | \n",
1383 | " 45 | \n",
1384 | "
\n",
1385 | " \n",
1386 | " 7 | \n",
1387 | " 9/1/2012 | \n",
1388 | " rainy | \n",
1389 | " 23 | \n",
1390 | " 41 | \n",
1391 | "
\n",
1392 | " \n",
1393 | " 8 | \n",
1394 | " 10/1/2012 | \n",
1395 | " sunny | \n",
1396 | " 40 | \n",
1397 | " 48 | \n",
1398 | "
\n",
1399 | " \n",
1400 | " 9 | \n",
1401 | " 11/1/2012 | \n",
1402 | " sunny | \n",
1403 | " 40 | \n",
1404 | " 48 | \n",
1405 | "
\n",
1406 | " \n",
1407 | "
\n",
1408 | "
"
1409 | ],
1410 | "text/plain": [
1411 | " dates day temp wind-speed\n",
1412 | "0 2/1/2012 sunny 45 12\n",
1413 | "1 3/1/2012 rainy 46 34\n",
1414 | "2 4/1/2012 hot 47 45\n",
1415 | "3 5/1/2012 sunny 40 56\n",
1416 | "4 6/1/2012 hot 49 30\n",
1417 | "5 7/1/2012 sunny 40 30\n",
1418 | "6 8/1/2012 hot 12 45\n",
1419 | "7 9/1/2012 rainy 23 41\n",
1420 | "8 10/1/2012 sunny 40 48\n",
1421 | "9 11/1/2012 sunny 40 48"
1422 | ]
1423 | },
1424 | "execution_count": 38,
1425 | "metadata": {},
1426 | "output_type": "execute_result"
1427 | }
1428 | ],
1429 | "source": [
1430 | "df=pd.read_csv('datasets/season.csv',converters={\n",
1431 | " 'day':converter_for_day,\n",
1432 | " 'temp':converter_for_temp,\n",
1433 | " 'wind-speed':converter_for_wind_speed\n",
1434 | " })\n",
1435 | "df"
1436 | ]
1437 | },
1438 | {
1439 | "cell_type": "markdown",
1440 | "metadata": {},
1441 | "source": [
1442 | "You can observe that previously the columns 'day', 'temp' and 'wind-speed' had some invalid data like 'NaN','NAN','Not available' but after applying the converter functions we got a cleaned dataset.Now we can apply some data analysis techniquw to predict something in our dataset."
1443 | ]
1444 | },
1445 | {
1446 | "cell_type": "markdown",
1447 | "metadata": {},
1448 | "source": [
1449 | "### Writing different dataframes into one file but different sheet names"
1450 | ]
1451 | },
1452 | {
1453 | "cell_type": "markdown",
1454 | "metadata": {},
1455 | "source": [
1456 | "Let's assume that you have two different dataframes and you want to write it in the same excel sheet but different sheet names. \n",
1457 | "
\n",
1458 | "\n",
1459 | "Let's take two disctionary one is \"weather1\" and another is \"house1\" and make it two dataframe \"weather\" & \"house\""
1460 | ]
1461 | },
1462 | {
1463 | "cell_type": "code",
1464 | "execution_count": 31,
1465 | "metadata": {},
1466 | "outputs": [],
1467 | "source": [
1468 | "#Creating weather disctionary\n",
1469 | "weather1={\n",
1470 | " 'dates':['02-01-12','03-01-12','04-01-12','05-01-12','06-01-12'],\n",
1471 | " 'day':['sunny','rainy','hot','sunny','hot'],\n",
1472 | " 'wind-speed':[12,34,45,56,67],\n",
1473 | " 'temp':[45,46,47,48,49]\n",
1474 | " }\n",
1475 | "#Converting disction to dataframe object\n",
1476 | "weather=pd.DataFrame(weather1)\n",
1477 | "\n",
1478 | "#Creating house disctionary\n",
1479 | "house1={\n",
1480 | " 'dates':['02-01-12','03-01-12','04-01-12','05-01-12','06-01-12'],\n",
1481 | " 'price':[20000,30000,40000,50000,60000],\n",
1482 | " 'bhk':[1,3,2,1,2],\n",
1483 | " 'how-old':[2,5,2,7,4]\n",
1484 | " }\n",
1485 | "\n",
1486 | "#converting house disctionary to dataframe object\n",
1487 | "\n",
1488 | "house=pd.DataFrame(house1)\n"
1489 | ]
1490 | },
1491 | {
1492 | "cell_type": "markdown",
1493 | "metadata": {},
1494 | "source": [
1495 | " Call the \"ExcelWriter\" and make a object \"writer\".Now call to_excel() function and pass three argument -\n",
1496 | " * 1. \"writer\" object :\n",
1497 | " * 2. sheet_name : \n",
1498 | " * 3. Index : This is optional for you\n",
1499 | " "
1500 | ]
1501 | },
1502 | {
1503 | "cell_type": "code",
1504 | "execution_count": 32,
1505 | "metadata": {},
1506 | "outputs": [],
1507 | "source": [
1508 | "with pd.ExcelWriter('datasets/weather_and_house.xls') as writer:\n",
1509 | " weather.to_excel(writer,sheet_name='Weather',index=False)\n",
1510 | " house.to_excel(writer,sheet_name='House',index=False)\n"
1511 | ]
1512 | },
1513 | {
1514 | "cell_type": "markdown",
1515 | "metadata": {},
1516 | "source": [
1517 | "#### Now access the sheets separately by passing its name while reading\n",
1518 | "See the example"
1519 | ]
1520 | },
1521 | {
1522 | "cell_type": "code",
1523 | "execution_count": 33,
1524 | "metadata": {},
1525 | "outputs": [
1526 | {
1527 | "data": {
1528 | "text/html": [
1529 | "\n",
1530 | "\n",
1543 | "
\n",
1544 | " \n",
1545 | " \n",
1546 | " | \n",
1547 | " dates | \n",
1548 | " day | \n",
1549 | " temp | \n",
1550 | " wind-speed | \n",
1551 | "
\n",
1552 | " \n",
1553 | " \n",
1554 | " \n",
1555 | " 0 | \n",
1556 | " 02-01-12 | \n",
1557 | " sunny | \n",
1558 | " 45 | \n",
1559 | " 12 | \n",
1560 | "
\n",
1561 | " \n",
1562 | " 1 | \n",
1563 | " 03-01-12 | \n",
1564 | " rainy | \n",
1565 | " 46 | \n",
1566 | " 34 | \n",
1567 | "
\n",
1568 | " \n",
1569 | " 2 | \n",
1570 | " 04-01-12 | \n",
1571 | " hot | \n",
1572 | " 47 | \n",
1573 | " 45 | \n",
1574 | "
\n",
1575 | " \n",
1576 | " 3 | \n",
1577 | " 05-01-12 | \n",
1578 | " sunny | \n",
1579 | " 48 | \n",
1580 | " 56 | \n",
1581 | "
\n",
1582 | " \n",
1583 | " 4 | \n",
1584 | " 06-01-12 | \n",
1585 | " hot | \n",
1586 | " 49 | \n",
1587 | " 67 | \n",
1588 | "
\n",
1589 | " \n",
1590 | "
\n",
1591 | "
"
1592 | ],
1593 | "text/plain": [
1594 | " dates day temp wind-speed\n",
1595 | "0 02-01-12 sunny 45 12\n",
1596 | "1 03-01-12 rainy 46 34\n",
1597 | "2 04-01-12 hot 47 45\n",
1598 | "3 05-01-12 sunny 48 56\n",
1599 | "4 06-01-12 hot 49 67"
1600 | ]
1601 | },
1602 | "execution_count": 33,
1603 | "metadata": {},
1604 | "output_type": "execute_result"
1605 | }
1606 | ],
1607 | "source": [
1608 | "d1=pd.read_excel('datasets/weather_and_house.xls','Weather')\n",
1609 | "d1"
1610 | ]
1611 | },
1612 | {
1613 | "cell_type": "code",
1614 | "execution_count": 34,
1615 | "metadata": {},
1616 | "outputs": [
1617 | {
1618 | "data": {
1619 | "text/html": [
1620 | "\n",
1621 | "\n",
1634 | "
\n",
1635 | " \n",
1636 | " \n",
1637 | " | \n",
1638 | " bhk | \n",
1639 | " dates | \n",
1640 | " how-old | \n",
1641 | " price | \n",
1642 | "
\n",
1643 | " \n",
1644 | " \n",
1645 | " \n",
1646 | " 0 | \n",
1647 | " 1 | \n",
1648 | " 02-01-12 | \n",
1649 | " 2 | \n",
1650 | " 20000 | \n",
1651 | "
\n",
1652 | " \n",
1653 | " 1 | \n",
1654 | " 3 | \n",
1655 | " 03-01-12 | \n",
1656 | " 5 | \n",
1657 | " 30000 | \n",
1658 | "
\n",
1659 | " \n",
1660 | " 2 | \n",
1661 | " 2 | \n",
1662 | " 04-01-12 | \n",
1663 | " 2 | \n",
1664 | " 40000 | \n",
1665 | "
\n",
1666 | " \n",
1667 | " 3 | \n",
1668 | " 1 | \n",
1669 | " 05-01-12 | \n",
1670 | " 7 | \n",
1671 | " 50000 | \n",
1672 | "
\n",
1673 | " \n",
1674 | " 4 | \n",
1675 | " 2 | \n",
1676 | " 06-01-12 | \n",
1677 | " 4 | \n",
1678 | " 60000 | \n",
1679 | "
\n",
1680 | " \n",
1681 | "
\n",
1682 | "
"
1683 | ],
1684 | "text/plain": [
1685 | " bhk dates how-old price\n",
1686 | "0 1 02-01-12 2 20000\n",
1687 | "1 3 03-01-12 5 30000\n",
1688 | "2 2 04-01-12 2 40000\n",
1689 | "3 1 05-01-12 7 50000\n",
1690 | "4 2 06-01-12 4 60000"
1691 | ]
1692 | },
1693 | "execution_count": 34,
1694 | "metadata": {},
1695 | "output_type": "execute_result"
1696 | }
1697 | ],
1698 | "source": [
1699 | "d2=pd.read_excel('datasets/weather_and_house.xls','House')\n",
1700 | "d2"
1701 | ]
1702 | },
1703 | {
1704 | "cell_type": "markdown",
1705 | "metadata": {},
1706 | "source": []
1707 | }
1708 | ],
1709 | "metadata": {
1710 | "kernelspec": {
1711 | "display_name": "Python 3",
1712 | "language": "python",
1713 | "name": "python3"
1714 | },
1715 | "language_info": {
1716 | "codemirror_mode": {
1717 | "name": "ipython",
1718 | "version": 3
1719 | },
1720 | "file_extension": ".py",
1721 | "mimetype": "text/x-python",
1722 | "name": "python",
1723 | "nbconvert_exporter": "python",
1724 | "pygments_lexer": "ipython3",
1725 | "version": "3.6.4"
1726 | }
1727 | },
1728 | "nbformat": 4,
1729 | "nbformat_minor": 2
1730 | }
1731 |
--------------------------------------------------------------------------------
/pandas_part5.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Handling Missing data part-2\n",
8 | "------------"
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": 7,
14 | "metadata": {},
15 | "outputs": [],
16 | "source": [
17 | "import pandas as pd\n",
18 | "import numpy as np"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 17,
24 | "metadata": {},
25 | "outputs": [
26 | {
27 | "data": {
28 | "text/html": [
29 | "\n",
30 | "\n",
43 | "
\n",
44 | " \n",
45 | " \n",
46 | " | \n",
47 | " Age | \n",
48 | " Name | \n",
49 | " No_of_pkg | \n",
50 | " Package | \n",
51 | " travel_id | \n",
52 | "
\n",
53 | " \n",
54 | " \n",
55 | " \n",
56 | " 0 | \n",
57 | " 20yrs | \n",
58 | " Bikash Kumar | \n",
59 | " 1 packages | \n",
60 | " $100 | \n",
61 | " 1 | \n",
62 | "
\n",
63 | " \n",
64 | " 1 | \n",
65 | " 21yrs | \n",
66 | " Ashish Shaw | \n",
67 | " 5 packages | \n",
68 | " $200 | \n",
69 | " 2 | \n",
70 | "
\n",
71 | " \n",
72 | " 2 | \n",
73 | " 23years | \n",
74 | " Dipak Kumar | \n",
75 | " 2pkgs | \n",
76 | " $100 | \n",
77 | " 3 | \n",
78 | "
\n",
79 | " \n",
80 | " 3 | \n",
81 | " 20 Years | \n",
82 | " John Doe | \n",
83 | " 3 pkgs | \n",
84 | " $100 | \n",
85 | " 4 | \n",
86 | "
\n",
87 | " \n",
88 | " 4 | \n",
89 | " 2000 | \n",
90 | " Elisha | \n",
91 | " 5000 | \n",
92 | " $400 | \n",
93 | " 5 | \n",
94 | "
\n",
95 | " \n",
96 | " 5 | \n",
97 | " 5000 | \n",
98 | " Md Shahid | \n",
99 | " 10 packages | \n",
100 | " $200 | \n",
101 | " 6 | \n",
102 | "
\n",
103 | " \n",
104 | " 6 | \n",
105 | " 21 yrs | \n",
106 | " Adrika Roy | \n",
107 | " 7pkgs | \n",
108 | " $300 | \n",
109 | " 7 | \n",
110 | "
\n",
111 | " \n",
112 | " 7 | \n",
113 | " 24 yrs | \n",
114 | " Shashi Kumar | \n",
115 | " 2000 | \n",
116 | " $500 | \n",
117 | " 8 | \n",
118 | "
\n",
119 | " \n",
120 | "
\n",
121 | "
"
122 | ],
123 | "text/plain": [
124 | " Age Name No_of_pkg Package travel_id\n",
125 | "0 20yrs Bikash Kumar 1 packages $100 1\n",
126 | "1 21yrs Ashish Shaw 5 packages $200 2\n",
127 | "2 23years Dipak Kumar 2pkgs $100 3\n",
128 | "3 20 Years John Doe 3 pkgs $100 4\n",
129 | "4 2000 Elisha 5000 $400 5\n",
130 | "5 5000 Md Shahid 10 packages $200 6\n",
131 | "6 21 yrs Adrika Roy 7pkgs $300 7\n",
132 | "7 24 yrs Shashi Kumar 2000 $500 8"
133 | ]
134 | },
135 | "execution_count": 17,
136 | "metadata": {},
137 | "output_type": "execute_result"
138 | }
139 | ],
140 | "source": [
141 | "df=pd.read_csv(\"datasets/travel.csv\")\n",
142 | "df"
143 | ]
144 | },
145 | {
146 | "cell_type": "markdown",
147 | "metadata": {},
148 | "source": [
149 | "### Replacing specific value some another value"
150 | ]
151 | },
152 | {
153 | "cell_type": "markdown",
154 | "metadata": {},
155 | "source": [
156 | " Suppose your dataframe contains some invalid values and you want to replace it with some other values like 0 or NaN.\n",
157 | " In this case special values are 5000 and 2000.You can see the following result where 2000 and 5000 is replaced by NaN\n",
158 | " value\n"
159 | ]
160 | },
161 | {
162 | "cell_type": "code",
163 | "execution_count": 19,
164 | "metadata": {},
165 | "outputs": [
166 | {
167 | "data": {
168 | "text/html": [
169 | "\n",
170 | "\n",
183 | "
\n",
184 | " \n",
185 | " \n",
186 | " | \n",
187 | " Age | \n",
188 | " Name | \n",
189 | " No_of_pkg | \n",
190 | " Package | \n",
191 | " travel_id | \n",
192 | "
\n",
193 | " \n",
194 | " \n",
195 | " \n",
196 | " 0 | \n",
197 | " 20yrs | \n",
198 | " Bikash Kumar | \n",
199 | " 1 packages | \n",
200 | " $100 | \n",
201 | " 1 | \n",
202 | "
\n",
203 | " \n",
204 | " 1 | \n",
205 | " 21yrs | \n",
206 | " Ashish Shaw | \n",
207 | " 5 packages | \n",
208 | " $200 | \n",
209 | " 2 | \n",
210 | "
\n",
211 | " \n",
212 | " 2 | \n",
213 | " 23years | \n",
214 | " Dipak Kumar | \n",
215 | " 2pkgs | \n",
216 | " $100 | \n",
217 | " 3 | \n",
218 | "
\n",
219 | " \n",
220 | " 3 | \n",
221 | " 20 Years | \n",
222 | " John Doe | \n",
223 | " 3 pkgs | \n",
224 | " $100 | \n",
225 | " 4 | \n",
226 | "
\n",
227 | " \n",
228 | " 4 | \n",
229 | " NaN | \n",
230 | " Elisha | \n",
231 | " NaN | \n",
232 | " $400 | \n",
233 | " 5 | \n",
234 | "
\n",
235 | " \n",
236 | " 5 | \n",
237 | " NaN | \n",
238 | " Md Shahid | \n",
239 | " 10 packages | \n",
240 | " $200 | \n",
241 | " 6 | \n",
242 | "
\n",
243 | " \n",
244 | " 6 | \n",
245 | " 21 yrs | \n",
246 | " Adrika Roy | \n",
247 | " 7pkgs | \n",
248 | " $300 | \n",
249 | " 7 | \n",
250 | "
\n",
251 | " \n",
252 | " 7 | \n",
253 | " 24 yrs | \n",
254 | " Shashi Kumar | \n",
255 | " NaN | \n",
256 | " $500 | \n",
257 | " 8 | \n",
258 | "
\n",
259 | " \n",
260 | "
\n",
261 | "
"
262 | ],
263 | "text/plain": [
264 | " Age Name No_of_pkg Package travel_id\n",
265 | "0 20yrs Bikash Kumar 1 packages $100 1\n",
266 | "1 21yrs Ashish Shaw 5 packages $200 2\n",
267 | "2 23years Dipak Kumar 2pkgs $100 3\n",
268 | "3 20 Years John Doe 3 pkgs $100 4\n",
269 | "4 NaN Elisha NaN $400 5\n",
270 | "5 NaN Md Shahid 10 packages $200 6\n",
271 | "6 21 yrs Adrika Roy 7pkgs $300 7\n",
272 | "7 24 yrs Shashi Kumar NaN $500 8"
273 | ]
274 | },
275 | "execution_count": 19,
276 | "metadata": {},
277 | "output_type": "execute_result"
278 | }
279 | ],
280 | "source": [
281 | "df2=df.replace([\"5000\",\"2000\"],np.NaN)\n",
282 | "df2"
283 | ]
284 | },
285 | {
286 | "cell_type": "markdown",
287 | "metadata": {},
288 | "source": [
289 | " There is one problem with this approach is it will replace all the values which you have passed in the list with\n",
290 | " you your value but in many other cases you dont want it like if you have 50000 in price column it is valid but if\n",
291 | " 50000 is in name column it is not valid in this case.So you only want to replace 50000 of name column with NaN but\n",
292 | " not of price column.\n",
293 | " In that case you need to pass the disctionary in the replace column.This disctionary will contain name of the column\n",
294 | " and the value you want to replace"
295 | ]
296 | },
297 | {
298 | "cell_type": "code",
299 | "execution_count": 21,
300 | "metadata": {},
301 | "outputs": [
302 | {
303 | "data": {
304 | "text/html": [
305 | "\n",
306 | "\n",
319 | "
\n",
320 | " \n",
321 | " \n",
322 | " | \n",
323 | " Age | \n",
324 | " Name | \n",
325 | " No_of_pkg | \n",
326 | " Package | \n",
327 | " travel_id | \n",
328 | "
\n",
329 | " \n",
330 | " \n",
331 | " \n",
332 | " 0 | \n",
333 | " 20yrs | \n",
334 | " Bikash Kumar | \n",
335 | " 1 packages | \n",
336 | " $100 | \n",
337 | " 1 | \n",
338 | "
\n",
339 | " \n",
340 | " 1 | \n",
341 | " 21yrs | \n",
342 | " Ashish Shaw | \n",
343 | " 5 packages | \n",
344 | " $200 | \n",
345 | " 2 | \n",
346 | "
\n",
347 | " \n",
348 | " 2 | \n",
349 | " 23years | \n",
350 | " Dipak Kumar | \n",
351 | " 2pkgs | \n",
352 | " $100 | \n",
353 | " 3 | \n",
354 | "
\n",
355 | " \n",
356 | " 3 | \n",
357 | " 20 Years | \n",
358 | " John Doe | \n",
359 | " 3 pkgs | \n",
360 | " $100 | \n",
361 | " 4 | \n",
362 | "
\n",
363 | " \n",
364 | " 4 | \n",
365 | " NaN | \n",
366 | " Elisha | \n",
367 | " NaN | \n",
368 | " $400 | \n",
369 | " 5 | \n",
370 | "
\n",
371 | " \n",
372 | " 5 | \n",
373 | " NaN | \n",
374 | " Md Shahid | \n",
375 | " 10 packages | \n",
376 | " $200 | \n",
377 | " 6 | \n",
378 | "
\n",
379 | " \n",
380 | " 6 | \n",
381 | " 21 yrs | \n",
382 | " Adrika Roy | \n",
383 | " 7pkgs | \n",
384 | " $300 | \n",
385 | " 7 | \n",
386 | "
\n",
387 | " \n",
388 | " 7 | \n",
389 | " 24 yrs | \n",
390 | " Shashi Kumar | \n",
391 | " NaN | \n",
392 | " $500 | \n",
393 | " 8 | \n",
394 | "
\n",
395 | " \n",
396 | "
\n",
397 | "
"
398 | ],
399 | "text/plain": [
400 | " Age Name No_of_pkg Package travel_id\n",
401 | "0 20yrs Bikash Kumar 1 packages $100 1\n",
402 | "1 21yrs Ashish Shaw 5 packages $200 2\n",
403 | "2 23years Dipak Kumar 2pkgs $100 3\n",
404 | "3 20 Years John Doe 3 pkgs $100 4\n",
405 | "4 NaN Elisha NaN $400 5\n",
406 | "5 NaN Md Shahid 10 packages $200 6\n",
407 | "6 21 yrs Adrika Roy 7pkgs $300 7\n",
408 | "7 24 yrs Shashi Kumar NaN $500 8"
409 | ]
410 | },
411 | "execution_count": 21,
412 | "metadata": {},
413 | "output_type": "execute_result"
414 | }
415 | ],
416 | "source": [
417 | "df2=df.replace({\n",
418 | " 'Age':[\"2000\",\"5000\"],\n",
419 | " 'No_of_pkg':[\"2000\",\"5000\"],\n",
420 | " \"travel_id\":[0]\n",
421 | "},np.NaN)\n",
422 | "df2"
423 | ]
424 | },
425 | {
426 | "cell_type": "markdown",
427 | "metadata": {},
428 | "source": [
429 | " If you want to replace some specific value like 5000 with any other value and so on.In that case you need \n",
430 | " to pass the disctionary with all keys which you want to replace and values which you want to replace with.\n",
431 | " Here 5000,2000,8 are values to replaced are the keys and np.NaN & 10 are values to be replaced with are value\n",
432 | " of the disctionary."
433 | ]
434 | },
435 | {
436 | "cell_type": "code",
437 | "execution_count": 25,
438 | "metadata": {},
439 | "outputs": [
440 | {
441 | "data": {
442 | "text/html": [
443 | "\n",
444 | "\n",
457 | "
\n",
458 | " \n",
459 | " \n",
460 | " | \n",
461 | " Age | \n",
462 | " Name | \n",
463 | " No_of_pkg | \n",
464 | " Package | \n",
465 | " travel_id | \n",
466 | "
\n",
467 | " \n",
468 | " \n",
469 | " \n",
470 | " 0 | \n",
471 | " 20yrs | \n",
472 | " Bikash Kumar | \n",
473 | " 1 packages | \n",
474 | " $100 | \n",
475 | " 1 | \n",
476 | "
\n",
477 | " \n",
478 | " 1 | \n",
479 | " 21yrs | \n",
480 | " Ashish Shaw | \n",
481 | " 5 packages | \n",
482 | " $200 | \n",
483 | " 2 | \n",
484 | "
\n",
485 | " \n",
486 | " 2 | \n",
487 | " 23years | \n",
488 | " Dipak Kumar | \n",
489 | " 2pkgs | \n",
490 | " $100 | \n",
491 | " 3 | \n",
492 | "
\n",
493 | " \n",
494 | " 3 | \n",
495 | " 20 Years | \n",
496 | " John Doe | \n",
497 | " 3 pkgs | \n",
498 | " $100 | \n",
499 | " 4 | \n",
500 | "
\n",
501 | " \n",
502 | " 4 | \n",
503 | " NaN | \n",
504 | " Elisha | \n",
505 | " NaN | \n",
506 | " $400 | \n",
507 | " 5 | \n",
508 | "
\n",
509 | " \n",
510 | " 5 | \n",
511 | " NaN | \n",
512 | " Md Shahid | \n",
513 | " 10 packages | \n",
514 | " $200 | \n",
515 | " 6 | \n",
516 | "
\n",
517 | " \n",
518 | " 6 | \n",
519 | " 21 yrs | \n",
520 | " Adrika Roy | \n",
521 | " 7pkgs | \n",
522 | " $300 | \n",
523 | " 7 | \n",
524 | "
\n",
525 | " \n",
526 | " 7 | \n",
527 | " 24 yrs | \n",
528 | " Shashi Kumar | \n",
529 | " NaN | \n",
530 | " $500 | \n",
531 | " 10 | \n",
532 | "
\n",
533 | " \n",
534 | "
\n",
535 | "
"
536 | ],
537 | "text/plain": [
538 | " Age Name No_of_pkg Package travel_id\n",
539 | "0 20yrs Bikash Kumar 1 packages $100 1\n",
540 | "1 21yrs Ashish Shaw 5 packages $200 2\n",
541 | "2 23years Dipak Kumar 2pkgs $100 3\n",
542 | "3 20 Years John Doe 3 pkgs $100 4\n",
543 | "4 NaN Elisha NaN $400 5\n",
544 | "5 NaN Md Shahid 10 packages $200 6\n",
545 | "6 21 yrs Adrika Roy 7pkgs $300 7\n",
546 | "7 24 yrs Shashi Kumar NaN $500 10"
547 | ]
548 | },
549 | "execution_count": 25,
550 | "metadata": {},
551 | "output_type": "execute_result"
552 | }
553 | ],
554 | "source": [
555 | "df2=df.replace({\n",
556 | " \"5000\":np.NaN,\n",
557 | " \"2000\":np.NaN,\n",
558 | " 8:10\n",
559 | "})\n",
560 | "df2"
561 | ]
562 | },
563 | {
564 | "cell_type": "markdown",
565 | "metadata": {},
566 | "source": [
567 | "**Note:** All the values in the dataframe that belongs to keys of disctionary will be replaced no matter what column it is."
568 | ]
569 | },
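570 | {
571 | "cell_type": "markdown",
572 | "metadata": {},
573 | "source": [
574 | "To scope such a mapping to a single column, replace() also accepts a nested dictionary of the form {column: {old value: new value}}. A minimal sketch (not in the original notebook) where 8 becomes 10 only in travel_id:"
575 | ]
576 | },
577 | {
578 | "cell_type": "code",
579 | "execution_count": null,
580 | "metadata": {},
581 | "outputs": [],
582 | "source": [
583 | "# Outer key = column name, inner dict = {value to replace: replacement}.\n",
584 | "# An 8 appearing in any other column would be left untouched.\n",
585 | "df.replace({'travel_id': {8: 10}})"
586 | ]
587 | },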
570 | {
571 | "cell_type": "markdown",
572 | "metadata": {},
573 | "source": [
574 | "### Removing unnecessary character from columns"
575 | ]
576 | },
577 | {
578 | "cell_type": "markdown",
579 | "metadata": {},
580 | "source": [
581 | " Suppose your dataframe contains unnecessary characters with your data values.Here years/yrs/Yrs/Years\n",
582 | " in Age column, same in the No_of_pkg & Package columns are unnecessary charactors which you dont want \n",
583 | " and these charactor will prevent you from applying any kind of operation in data analysis.So you want\n",
584 | " to get rid of it.In that case you have to pass regex as a value and column name as a key of the \n",
585 | " disctionary which you have passed in the replace() function as well as you also have to set regex=True\n",
586 | " and a pass an empty string\n",
587 | " \n",
588 | "* ** [A-Za-z]
** : This is the regex of all the character from A to Z and a to z.\n",
589 | "* \\$
: This is the regex for **$
** sign.\n",
590 | "* For futher information about regex go to this link https://medium.com/factory-mind/regex-tutorial-a-simple-cheatsheet-by-examples-649dc1c3f285\n"
591 | ]
592 | },
593 | {
594 | "cell_type": "code",
595 | "execution_count": 26,
596 | "metadata": {},
597 | "outputs": [
598 | {
599 | "data": {
600 | "text/html": [
601 | "\n",
602 | "\n",
615 | "
\n",
616 | " \n",
617 | " \n",
618 | " | \n",
619 | " Age | \n",
620 | " Name | \n",
621 | " No_of_pkg | \n",
622 | " Package | \n",
623 | " travel_id | \n",
624 | "
\n",
625 | " \n",
626 | " \n",
627 | " \n",
628 | " 0 | \n",
629 | " 20 | \n",
630 | " Bikash Kumar | \n",
631 | " 1 | \n",
632 | " 100 | \n",
633 | " 1 | \n",
634 | "
\n",
635 | " \n",
636 | " 1 | \n",
637 | " 21 | \n",
638 | " Ashish Shaw | \n",
639 | " 5 | \n",
640 | " 200 | \n",
641 | " 2 | \n",
642 | "
\n",
643 | " \n",
644 | " 2 | \n",
645 | " 23 | \n",
646 | " Dipak Kumar | \n",
647 | " 2 | \n",
648 | " 100 | \n",
649 | " 3 | \n",
650 | "
\n",
651 | " \n",
652 | " 3 | \n",
653 | " 20 | \n",
654 | " John Doe | \n",
655 | " 3 | \n",
656 | " 100 | \n",
657 | " 4 | \n",
658 | "
\n",
659 | " \n",
660 | " 4 | \n",
661 | " NaN | \n",
662 | " Elisha | \n",
663 | " NaN | \n",
664 | " 400 | \n",
665 | " 5 | \n",
666 | "
\n",
667 | " \n",
668 | " 5 | \n",
669 | " NaN | \n",
670 | " Md Shahid | \n",
671 | " 10 | \n",
672 | " 200 | \n",
673 | " 6 | \n",
674 | "
\n",
675 | " \n",
676 | " 6 | \n",
677 | " 21 | \n",
678 | " Adrika Roy | \n",
679 | " 7 | \n",
680 | " 300 | \n",
681 | " 7 | \n",
682 | "
\n",
683 | " \n",
684 | " 7 | \n",
685 | " 24 | \n",
686 | " Shashi Kumar | \n",
687 | " NaN | \n",
688 | " 500 | \n",
689 | " 10 | \n",
690 | "
\n",
691 | " \n",
692 | "
\n",
693 | "
"
694 | ],
695 | "text/plain": [
696 | " Age Name No_of_pkg Package travel_id\n",
697 | "0 20 Bikash Kumar 1 100 1\n",
698 | "1 21 Ashish Shaw 5 200 2\n",
699 | "2 23 Dipak Kumar 2 100 3\n",
700 | "3 20 John Doe 3 100 4\n",
701 | "4 NaN Elisha NaN 400 5\n",
702 | "5 NaN Md Shahid 10 200 6\n",
703 | "6 21 Adrika Roy 7 300 7\n",
704 | "7 24 Shashi Kumar NaN 500 10"
705 | ]
706 | },
707 | "execution_count": 26,
708 | "metadata": {},
709 | "output_type": "execute_result"
710 | }
711 | ],
712 | "source": [
713 | "df3=df2.replace({\n",
714 | " 'Age':'[A-Za-z]',\n",
715 | " 'No_of_pkg':'[A-Za-z]',\n",
716 | " 'Package':'\\$'\n",
717 | "},\"\",regex=True)\n",
718 | "df3"
719 | ]
720 | },
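721 | {
722 | "cell_type": "markdown",
723 | "metadata": {},
724 | "source": [
725 | "After stripping the characters, the Age, No_of_pkg and Package columns still hold strings. A follow-up sketch (not in the original notebook) converting them to numbers with pd.to_numeric(); errors='coerce' turns anything unparseable into NaN:"
726 | ]
727 | },
728 | {
729 | "cell_type": "code",
730 | "execution_count": null,
731 | "metadata": {},
732 | "outputs": [],
733 | "source": [
734 | "# Convert the cleaned string columns to numeric dtypes.\n",
735 | "for col in ['Age', 'No_of_pkg', 'Package']:\n",
736 | "    df3[col] = pd.to_numeric(df3[col], errors='coerce')\n",
737 | "df3.dtypes"
738 | ]
739 | },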
721 | {
722 | "cell_type": "markdown",
723 | "metadata": {},
724 | "source": [
725 | "### Mapping from one list to another list "
726 | ]
727 | },
728 | {
729 | "cell_type": "markdown",
730 | "metadata": {},
731 | "source": [
732 | " If your dataset contains data which is repeating more than once or you want to change some set of string in to\n",
733 | " number then you have apply list mapping."
734 | ]
735 | },
736 | {
737 | "cell_type": "code",
738 | "execution_count": 29,
739 | "metadata": {},
740 | "outputs": [
741 | {
742 | "data": {
743 | "text/html": [
744 | "\n",
745 | "\n",
758 | "
\n",
759 | " \n",
760 | " \n",
761 | " | \n",
762 | " grades | \n",
763 | " name | \n",
764 | "
\n",
765 | " \n",
766 | " \n",
767 | " \n",
768 | " 0 | \n",
769 | " poor | \n",
770 | " Shahid | \n",
771 | "
\n",
772 | " \n",
773 | " 1 | \n",
774 | " excellent | \n",
775 | " Adrika | \n",
776 | "
\n",
777 | " \n",
778 | " 2 | \n",
779 | " very good | \n",
780 | " Bikash | \n",
781 | "
\n",
782 | " \n",
783 | " 3 | \n",
784 | " average | \n",
785 | " Ashish | \n",
786 | "
\n",
787 | " \n",
788 | " 4 | \n",
789 | " good | \n",
790 | " Ganesh | \n",
791 | "
\n",
792 | " \n",
793 | " 5 | \n",
794 | " very good | \n",
795 | " Zahid | \n",
796 | "
\n",
797 | " \n",
798 | " 6 | \n",
799 | " outstanding | \n",
800 | " Mohan | \n",
801 | "
\n",
802 | " \n",
803 | " 7 | \n",
804 | " poor | \n",
805 | " Sohan | \n",
806 | "
\n",
807 | " \n",
808 | "
\n",
809 | "
"
810 | ],
811 | "text/plain": [
812 | " grades name\n",
813 | "0 poor Shahid\n",
814 | "1 excellent Adrika\n",
815 | "2 very good Bikash\n",
816 | "3 average Ashish\n",
817 | "4 good Ganesh\n",
818 | "5 very good Zahid\n",
819 | "6 outstanding Mohan\n",
820 | "7 poor Sohan"
821 | ]
822 | },
823 | "execution_count": 29,
824 | "metadata": {},
825 | "output_type": "execute_result"
826 | }
827 | ],
828 | "source": [
829 | "mydis={\n",
830 | " \"name\":[\"Shahid\",\"Adrika\",\"Bikash\",\"Ashish\",\"Ganesh\",\"Zahid\",\"Mohan\",\"Sohan\"],\n",
831 | " \"grades\":[\"poor\",\"excellent\",\"very good\",\"average\",\"good\",\"very good\",\"outstanding\",\"poor\"]\n",
832 | " }\n",
833 | "df=pd.DataFrame(mydis)\n",
834 | "df"
835 | ]
836 | },
837 | {
838 | "cell_type": "code",
839 | "execution_count": 30,
840 | "metadata": {},
841 | "outputs": [
842 | {
843 | "data": {
844 | "text/html": [
845 | "\n",
846 | "\n",
859 | "
\n",
860 | " \n",
861 | " \n",
862 | " | \n",
863 | " grades | \n",
864 | " name | \n",
865 | "
\n",
866 | " \n",
867 | " \n",
868 | " \n",
869 | " 0 | \n",
870 | " 5 | \n",
871 | " Shahid | \n",
872 | "
\n",
873 | " \n",
874 | " 1 | \n",
875 | " 9 | \n",
876 | " Adrika | \n",
877 | "
\n",
878 | " \n",
879 | " 2 | \n",
880 | " 8 | \n",
881 | " Bikash | \n",
882 | "
\n",
883 | " \n",
884 | " 3 | \n",
885 | " 6 | \n",
886 | " Ashish | \n",
887 | "
\n",
888 | " \n",
889 | " 4 | \n",
890 | " 7 | \n",
891 | " Ganesh | \n",
892 | "
\n",
893 | " \n",
894 | " 5 | \n",
895 | " 8 | \n",
896 | " Zahid | \n",
897 | "
\n",
898 | " \n",
899 | " 6 | \n",
900 | " 10 | \n",
901 | " Mohan | \n",
902 | "
\n",
903 | " \n",
904 | " 7 | \n",
905 | " 5 | \n",
906 | " Sohan | \n",
907 | "
\n",
908 | " \n",
909 | "
\n",
910 | "
"
911 | ],
912 | "text/plain": [
913 | " grades name\n",
914 | "0 5 Shahid\n",
915 | "1 9 Adrika\n",
916 | "2 8 Bikash\n",
917 | "3 6 Ashish\n",
918 | "4 7 Ganesh\n",
919 | "5 8 Zahid\n",
920 | "6 10 Mohan\n",
921 | "7 5 Sohan"
922 | ]
923 | },
924 | "execution_count": 30,
925 | "metadata": {},
926 | "output_type": "execute_result"
927 | }
928 | ],
929 | "source": [
930 | "df2=df.replace([\"poor\",\"average\",\"good\",\"very good\",\"excellent\",\"outstanding\"],[5,6,7,8,9,10])\n",
931 | "df2"
932 | ]
933 | },
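934 | {
935 | "cell_type": "markdown",
936 | "metadata": {},
937 | "source": [
938 | "An equivalent way to do this kind of single-column mapping (a sketch, not in the original notebook) is Series.map() with a dictionary; anything missing from the dictionary becomes NaN:"
939 | ]
940 | },
941 | {
942 | "cell_type": "code",
943 | "execution_count": null,
944 | "metadata": {},
945 | "outputs": [],
946 | "source": [
947 | "# map() looks every grade up in the dict and returns the mapped value.\n",
948 | "grade_points = {\"poor\":5,\"average\":6,\"good\":7,\"very good\":8,\"excellent\":9,\"outstanding\":10}\n",
949 | "df[\"grades\"].map(grade_points)"
950 | ]
951 | }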
934 | ],
935 | "metadata": {
936 | "kernelspec": {
937 | "display_name": "Python 3",
938 | "language": "python",
939 | "name": "python3"
940 | },
941 | "language_info": {
942 | "codemirror_mode": {
943 | "name": "ipython",
944 | "version": 3
945 | },
946 | "file_extension": ".py",
947 | "mimetype": "text/x-python",
948 | "name": "python",
949 | "nbconvert_exporter": "python",
950 | "pygments_lexer": "ipython3",
951 | "version": "3.6.4"
952 | }
953 | },
954 | "nbformat": 4,
955 | "nbformat_minor": 2
956 | }
957 |
--------------------------------------------------------------------------------
/pandas_part6.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Groupby : Split, Apply and Combine\n",
8 | "---------------\n",
9 | " Groupby is one of the important operations in data analysis.It includes three steps -
\n",
10 | " \n",
11 | " \n",
12 | " - Splitting
\n",
13 | " - Applying\n",
14 | "
\n",
15 | " - Aggregation
\n",
16 | " - Transformation
\n",
17 | " - Filterting
\n",
18 | "
\n",
19 | " \n",
20 | " - Combine
\n",
21 | "
\n",
22 | " \n",
23 | " Groupby property is grouped the data according the column supplied to the function.In the following example\n",
24 | " you can see the i have grouped the dataframe df by its team."
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 28,
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "import pandas as pd\n",
34 | "import numpy as np"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": 22,
40 | "metadata": {},
41 | "outputs": [
42 | {
43 | "data": {
44 | "text/html": [
45 | "\n",
46 | "\n",
59 | "
\n",
60 | " \n",
61 | " \n",
62 | " | \n",
63 | " Match | \n",
64 | " Run | \n",
65 | " Year | \n",
66 | " team | \n",
67 | "
\n",
68 | " \n",
69 | " \n",
70 | " \n",
71 | " 0 | \n",
72 | " 2 | \n",
73 | " 330 | \n",
74 | " 2012 | \n",
75 | " India | \n",
76 | "
\n",
77 | " \n",
78 | " 1 | \n",
79 | " 4 | \n",
80 | " 230 | \n",
81 | " 2012 | \n",
82 | " New zealand | \n",
83 | "
\n",
84 | " \n",
85 | " 2 | \n",
86 | " 2 | \n",
87 | " 300 | \n",
88 | " 2012 | \n",
89 | " Australia | \n",
90 | "
\n",
91 | " \n",
92 | " 3 | \n",
93 | " 1 | \n",
94 | " 180 | \n",
95 | " 2012 | \n",
96 | " India | \n",
97 | "
\n",
98 | " \n",
99 | " 4 | \n",
100 | " 5 | \n",
101 | " 200 | \n",
102 | " 2013 | \n",
103 | " India | \n",
104 | "
\n",
105 | " \n",
106 | " 5 | \n",
107 | " 6 | \n",
108 | " 250 | \n",
109 | " 2013 | \n",
110 | " New zealand | \n",
111 | "
\n",
112 | " \n",
113 | " 6 | \n",
114 | " 3 | \n",
115 | " 190 | \n",
116 | " 2013 | \n",
117 | " Australia | \n",
118 | "
\n",
119 | " \n",
120 | " 7 | \n",
121 | " 2 | \n",
122 | " 400 | \n",
123 | " 2013 | \n",
124 | " India | \n",
125 | "
\n",
126 | " \n",
127 | " 8 | \n",
128 | " 5 | \n",
129 | " 340 | \n",
130 | " 2014 | \n",
131 | " New zealand | \n",
132 | "
\n",
133 | " \n",
134 | " 9 | \n",
135 | " 3 | \n",
136 | " 290 | \n",
137 | " 2014 | \n",
138 | " Australia | \n",
139 | "
\n",
140 | " \n",
141 | " 10 | \n",
142 | " 1 | \n",
143 | " 390 | \n",
144 | " 2014 | \n",
145 | " New zealand | \n",
146 | "
\n",
147 | " \n",
148 | " 11 | \n",
149 | " 1 | \n",
150 | " 333 | \n",
151 | " 2014 | \n",
152 | " Australia | \n",
153 | "
\n",
154 | " \n",
155 | "
\n",
156 | "
"
157 | ],
158 | "text/plain": [
159 | " Match Run Year team\n",
160 | "0 2 330 2012 India\n",
161 | "1 4 230 2012 New zealand\n",
162 | "2 2 300 2012 Australia\n",
163 | "3 1 180 2012 India\n",
164 | "4 5 200 2013 India\n",
165 | "5 6 250 2013 New zealand\n",
166 | "6 3 190 2013 Australia\n",
167 | "7 2 400 2013 India\n",
168 | "8 5 340 2014 New zealand\n",
169 | "9 3 290 2014 Australia\n",
170 | "10 1 390 2014 New zealand\n",
171 | "11 1 333 2014 Australia"
172 | ]
173 | },
174 | "execution_count": 22,
175 | "metadata": {},
176 | "output_type": "execute_result"
177 | }
178 | ],
179 | "source": [
180 | "d={\n",
181 | " 'team':[\"India\",\"New zealand\",\"Australia\",\"India\",\n",
182 | " \"India\",\"New zealand\",\"Australia\",\"India\",\n",
183 | " \"New zealand\",\"Australia\",\"New zealand\",\"Australia\"],\n",
184 | " 'Run':[330,230,300,180,200,250,190,400,340,290,390,333],\n",
185 | " \"Match\":[2,4,2,1,5,6,3,2,5,3,1,1],\n",
186 | " \"Year\":['2012','2012','2012','2012','2013','2013','2013','2013','2014','2014','2014','2014']\n",
187 | " }\n",
188 | "df=pd.DataFrame(d)\n",
189 | "df"
190 | ]
191 | },
192 | {
193 | "cell_type": "markdown",
194 | "metadata": {},
195 | "source": [
196 | " groupby() function will return an object.we can imagine that every group is pointing to its dataframe.\n",
197 | " "
198 | ]
199 | },
200 | {
201 | "cell_type": "code",
202 | "execution_count": 4,
203 | "metadata": {},
204 | "outputs": [
205 | {
206 | "data": {
207 | "text/plain": [
208 | ""
209 | ]
210 | },
211 | "execution_count": 4,
212 | "metadata": {},
213 | "output_type": "execute_result"
214 | }
215 | ],
216 | "source": [
217 | "g=df.groupby('team')\n",
218 | "g"
219 | ]
220 | },
221 | {
222 | "cell_type": "code",
223 | "execution_count": 14,
224 | "metadata": {},
225 | "outputs": [
226 | {
227 | "name": "stdout",
228 | "output_type": "stream",
229 | "text": [
230 | "Australia\n",
231 | "----------\n",
232 | " Match Run Year team\n",
233 | "2 2 300 2012 Australia\n",
234 | "6 3 190 2013 Australia\n",
235 | "9 3 290 2014 Australia\n",
236 | "11 1 333 2014 Australia\n",
237 | "-----------------------------------\n",
238 | "India\n",
239 | "----------\n",
240 | " Match Run Year team\n",
241 | "0 2 330 2012 India\n",
242 | "3 1 180 2012 India\n",
243 | "4 5 200 2013 India\n",
244 | "7 2 400 2013 India\n",
245 | "-----------------------------------\n",
246 | "New zealand\n",
247 | "----------\n",
248 | " Match Run Year team\n",
249 | "1 4 230 2012 New zealand\n",
250 | "5 6 250 2013 New zealand\n",
251 | "8 5 340 2014 New zealand\n",
252 | "10 1 390 2014 New zealand\n",
253 | "-----------------------------------\n"
254 | ]
255 | }
256 | ],
257 | "source": [
258 | "for team,teamdata in g:\n",
259 | " print(team)\n",
260 | " print(\"-\"*10)\n",
261 | " print(teamdata)\n",
262 | " print(\"-\"*35)"
263 | ]
264 | },
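265 | {
266 | "cell_type": "markdown",
267 | "metadata": {},
268 | "source": [
269 | "A quick aside (not in the original notebook): size() shows how many rows fall into each group without iterating:"
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": null,
275 | "metadata": {},
276 | "outputs": [],
277 | "source": [
278 | "# Number of rows per team.\n",
279 | "g.size()"
280 | ]
281 | },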
265 | {
266 | "cell_type": "markdown",
267 | "metadata": {},
268 | "source": [
269 | " **get_group('group name') :**
\n",
270 | " It will return dataframe of particular group"
271 | ]
272 | },
273 | {
274 | "cell_type": "code",
275 | "execution_count": 16,
276 | "metadata": {},
277 | "outputs": [
278 | {
279 | "data": {
280 | "text/html": [
281 | "\n",
282 | "\n",
295 | "
\n",
296 | " \n",
297 | " \n",
298 | " | \n",
299 | " Match | \n",
300 | " Run | \n",
301 | " Year | \n",
302 | "
\n",
303 | " \n",
304 | " \n",
305 | " \n",
306 | " 0 | \n",
307 | " 2 | \n",
308 | " 330 | \n",
309 | " 2012 | \n",
310 | "
\n",
311 | " \n",
312 | " 3 | \n",
313 | " 1 | \n",
314 | " 180 | \n",
315 | " 2012 | \n",
316 | "
\n",
317 | " \n",
318 | " 4 | \n",
319 | " 5 | \n",
320 | " 200 | \n",
321 | " 2013 | \n",
322 | "
\n",
323 | " \n",
324 | " 7 | \n",
325 | " 2 | \n",
326 | " 400 | \n",
327 | " 2013 | \n",
328 | "
\n",
329 | " \n",
330 | "
\n",
331 | "
"
332 | ],
333 | "text/plain": [
334 | " Match Run Year\n",
335 | "0 2 330 2012\n",
336 | "3 1 180 2012\n",
337 | "4 5 200 2013\n",
338 | "7 2 400 2013"
339 | ]
340 | },
341 | "execution_count": 16,
342 | "metadata": {},
343 | "output_type": "execute_result"
344 | }
345 | ],
346 | "source": [
347 | "g.get_group('India')"
348 | ]
349 | },
350 | {
351 | "cell_type": "code",
352 | "execution_count": 17,
353 | "metadata": {},
354 | "outputs": [
355 | {
356 | "data": {
357 | "text/html": [
358 | "\n",
359 | "\n",
372 | "
\n",
373 | " \n",
374 | " \n",
375 | " | \n",
376 | " Match | \n",
377 | " Run | \n",
378 | " Year | \n",
379 | "
\n",
380 | " \n",
381 | " \n",
382 | " \n",
383 | " 2 | \n",
384 | " 2 | \n",
385 | " 300 | \n",
386 | " 2012 | \n",
387 | "
\n",
388 | " \n",
389 | " 6 | \n",
390 | " 3 | \n",
391 | " 190 | \n",
392 | " 2013 | \n",
393 | "
\n",
394 | " \n",
395 | " 9 | \n",
396 | " 3 | \n",
397 | " 290 | \n",
398 | " 2014 | \n",
399 | "
\n",
400 | " \n",
401 | " 11 | \n",
402 | " 1 | \n",
403 | " 333 | \n",
404 | " 2014 | \n",
405 | "
\n",
406 | " \n",
407 | "
\n",
408 | "
"
409 | ],
410 | "text/plain": [
411 | " Match Run Year\n",
412 | "2 2 300 2012\n",
413 | "6 3 190 2013\n",
414 | "9 3 290 2014\n",
415 | "11 1 333 2014"
416 | ]
417 | },
418 | "execution_count": 17,
419 | "metadata": {},
420 | "output_type": "execute_result"
421 | }
422 | ],
423 | "source": [
424 | "g.get_group('Australia')"
425 | ]
426 | },
427 | {
428 | "cell_type": "markdown",
429 | "metadata": {},
430 | "source": [
431 | " We can also perform the operations which we were applying in dataframe.This only difference here you get is \n",
432 | " your operation will be applied to all of group and return the result of all groups."
433 | ]
434 | },
435 | {
436 | "cell_type": "code",
437 | "execution_count": 18,
438 | "metadata": {},
439 | "outputs": [
440 | {
441 | "data": {
442 | "text/html": [
443 | "\n",
444 | "\n",
457 | "
\n",
458 | " \n",
459 | " \n",
460 | " | \n",
461 | " Match | \n",
462 | " Run | \n",
463 | " Year | \n",
464 | "
\n",
465 | " \n",
466 | " team | \n",
467 | " | \n",
468 | " | \n",
469 | " | \n",
470 | "
\n",
471 | " \n",
472 | " \n",
473 | " \n",
474 | " Australia | \n",
475 | " 1 | \n",
476 | " 190 | \n",
477 | " 2012 | \n",
478 | "
\n",
479 | " \n",
480 | " India | \n",
481 | " 1 | \n",
482 | " 180 | \n",
483 | " 2012 | \n",
484 | "
\n",
485 | " \n",
486 | " New zealand | \n",
487 | " 1 | \n",
488 | " 230 | \n",
489 | " 2012 | \n",
490 | "
\n",
491 | " \n",
492 | "
\n",
493 | "
"
494 | ],
495 | "text/plain": [
496 | " Match Run Year\n",
497 | "team \n",
498 | "Australia 1 190 2012\n",
499 | "India 1 180 2012\n",
500 | "New zealand 1 230 2012"
501 | ]
502 | },
503 | "execution_count": 18,
504 | "metadata": {},
505 | "output_type": "execute_result"
506 | }
507 | ],
508 | "source": [
509 | "g.min()"
510 | ]
511 | },
512 | {
513 | "cell_type": "code",
514 | "execution_count": 20,
515 | "metadata": {},
516 | "outputs": [
517 | {
518 | "data": {
519 | "text/html": [
520 | "\n",
521 | "\n",
534 | "
\n",
535 | " \n",
536 | " \n",
537 | " | \n",
538 | " Match | \n",
539 | " Run | \n",
540 | " Year | \n",
541 | "
\n",
542 | " \n",
543 | " team | \n",
544 | " | \n",
545 | " | \n",
546 | " | \n",
547 | "
\n",
548 | " \n",
549 | " \n",
550 | " \n",
551 | " Australia | \n",
552 | " 2.25 | \n",
553 | " 278.25 | \n",
554 | " 2013.25 | \n",
555 | "
\n",
556 | " \n",
557 | " India | \n",
558 | " 2.50 | \n",
559 | " 277.50 | \n",
560 | " 2012.50 | \n",
561 | "
\n",
562 | " \n",
563 | " New zealand | \n",
564 | " 4.00 | \n",
565 | " 302.50 | \n",
566 | " 2013.25 | \n",
567 | "
\n",
568 | " \n",
569 | "
\n",
570 | "
"
571 | ],
572 | "text/plain": [
573 | " Match Run Year\n",
574 | "team \n",
575 | "Australia 2.25 278.25 2013.25\n",
576 | "India 2.50 277.50 2012.50\n",
577 | "New zealand 4.00 302.50 2013.25"
578 | ]
579 | },
580 | "execution_count": 20,
581 | "metadata": {},
582 | "output_type": "execute_result"
583 | }
584 | ],
585 | "source": [
586 | "g.mean()"
587 | ]
588 | },
589 | {
590 | "cell_type": "code",
591 | "execution_count": 30,
592 | "metadata": {},
593 | "outputs": [
594 | {
595 | "data": {
596 | "text/html": [
597 | "\n",
598 | "\n",
611 | "
\n",
612 | " \n",
613 | " \n",
614 | " | \n",
615 | " amin | \n",
616 | " amax | \n",
617 | " mean | \n",
618 | " sum | \n",
619 | " std | \n",
620 | "
\n",
621 | " \n",
622 | " Year | \n",
623 | " | \n",
624 | " | \n",
625 | " | \n",
626 | " | \n",
627 | " | \n",
628 | "
\n",
629 | " \n",
630 | " \n",
631 | " \n",
632 | " 2012 | \n",
633 | " 180 | \n",
634 | " 330 | \n",
635 | " 260.00 | \n",
636 | " 1040 | \n",
637 | " 67.823300 | \n",
638 | "
\n",
639 | " \n",
640 | " 2013 | \n",
641 | " 190 | \n",
642 | " 400 | \n",
643 | " 260.00 | \n",
644 | " 1040 | \n",
645 | " 96.953597 | \n",
646 | "
\n",
647 | " \n",
648 | " 2014 | \n",
649 | " 290 | \n",
650 | " 390 | \n",
651 | " 338.25 | \n",
652 | " 1353 | \n",
653 | " 40.974586 | \n",
654 | "
\n",
655 | " \n",
656 | "
\n",
657 | "
"
658 | ],
659 | "text/plain": [
660 | " amin amax mean sum std\n",
661 | "Year \n",
662 | "2012 180 330 260.00 1040 67.823300\n",
663 | "2013 190 400 260.00 1040 96.953597\n",
664 | "2014 290 390 338.25 1353 40.974586"
665 | ]
666 | },
667 | "execution_count": 30,
668 | "metadata": {},
669 | "output_type": "execute_result"
670 | }
671 | ],
672 | "source": [
673 | "g_yr=df.groupby('Year')\n",
674 | "g_yr['Run'].agg([np.min,np.max,np.mean,np.sum,np.std])"
675 | ]
676 | },
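677 | {
678 | "cell_type": "markdown",
679 | "metadata": {},
680 | "source": [
681 | "The introduction listed transformation and filtering alongside aggregation, but the notebook stops at aggregation. A sketch of both (standard GroupBy APIs; not in the original notebook):"
682 | ]
683 | },
684 | {
685 | "cell_type": "code",
686 | "execution_count": null,
687 | "metadata": {},
688 | "outputs": [],
689 | "source": [
690 | "# Transformation: returns a result aligned with the original rows.\n",
691 | "# Every row gets its own team's average run count.\n",
692 | "df['team_avg_run'] = g['Run'].transform('mean')\n",
693 | "\n",
694 | "# Filtration: keep only the rows of groups that satisfy a condition -\n",
695 | "# here, teams whose average run count exceeds 280.\n",
696 | "g.filter(lambda grp: grp['Run'].mean() > 280)"
697 | ]
698 | }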
705 | ],
706 | "metadata": {
707 | "kernelspec": {
708 | "display_name": "Python 3",
709 | "language": "python",
710 | "name": "python3"
711 | },
712 | "language_info": {
713 | "codemirror_mode": {
714 | "name": "ipython",
715 | "version": 3
716 | },
717 | "file_extension": ".py",
718 | "mimetype": "text/x-python",
719 | "name": "python",
720 | "nbconvert_exporter": "python",
721 | "pygments_lexer": "ipython3",
722 | "version": "3.6.4"
723 | }
724 | },
725 | "nbformat": 4,
726 | "nbformat_minor": 2
727 | }
728 |
--------------------------------------------------------------------------------
/pandas_part7.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Concat DataFrame \n",
8 | "------------\n",
9 | " Concat is very useful method of pandas.You can concatanate two dataframes in two way -\n",
10 | " 1)Append by row\n",
11 | " 2)Append by col\n",
12 | " \n",
13 | "### 1) Append by row :\n",
14 | " Concat is the method of pandas in which you can join two and more dataframes provided its indices are same.\n",
15 | " Let's take an example -\n",
16 | " You have two dataframes of weathers of two cities Kolkata and Chennai,\n",
17 | " \n",
18 | " df=pd.concat([dataframe_1,dataframe_2,...,dataframe_n])
\n",
19 | " for n number of dataframes,"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 2,
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "import pandas as pd"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 3,
34 | "metadata": {},
35 | "outputs": [
36 | {
37 | "data": {
38 | "text/html": [
39 | "\n",
40 | "\n",
53 | "
\n",
54 | " \n",
55 | " \n",
56 | " | \n",
57 | " date | \n",
58 | " event | \n",
59 | " temp | \n",
60 | " wind-speed | \n",
61 | "
\n",
62 | " \n",
63 | " \n",
64 | " \n",
65 | " 0 | \n",
66 | " 01-02-12 | \n",
67 | " hot | \n",
68 | " 20 | \n",
69 | " 33 | \n",
70 | "
\n",
71 | " \n",
72 | " 1 | \n",
73 | " 03-02-12 | \n",
74 | " sunny | \n",
75 | " 21 | \n",
76 | " 23 | \n",
77 | "
\n",
78 | " \n",
79 | " 2 | \n",
80 | " 04-02-12 | \n",
81 | " rainy | \n",
82 | " 15 | \n",
83 | " 45 | \n",
84 | "
\n",
85 | " \n",
86 | " 3 | \n",
87 | " 05-02-12 | \n",
88 | " cold | \n",
89 | " 18 | \n",
90 | " 24 | \n",
91 | "
\n",
92 | " \n",
93 | "
\n",
94 | "
"
95 | ],
96 | "text/plain": [
97 | " date event temp wind-speed\n",
98 | "0 01-02-12 hot 20 33\n",
99 | "1 03-02-12 sunny 21 23\n",
100 | "2 04-02-12 rainy 15 45\n",
101 | "3 05-02-12 cold 18 24"
102 | ]
103 | },
104 | "execution_count": 3,
105 | "metadata": {},
106 | "output_type": "execute_result"
107 | }
108 | ],
109 | "source": [
110 | "chennai={\n",
111 | " \"date\":['01-02-12','03-02-12','04-02-12','05-02-12'],\n",
112 | " \"event\":['hot','sunny','rainy','cold'],\n",
113 | " \"wind-speed\":[33,23,45,24],\n",
114 | " \"temp\":[20,21,15,18]\n",
115 | "}\n",
116 | "chen=pd.DataFrame(chennai)\n",
117 | "chen"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": 5,
123 | "metadata": {},
124 | "outputs": [
125 | {
126 | "data": {
127 | "text/html": [
128 | "\n",
129 | "\n",
142 | "
\n",
143 | " \n",
144 | " \n",
145 | " | \n",
146 | " date | \n",
147 | " event | \n",
148 | " temp | \n",
149 | " wind-speed | \n",
150 | "
\n",
151 | " \n",
152 | " \n",
153 | " \n",
154 | " 0 | \n",
155 | " 01-02-12 | \n",
156 | " sunny | \n",
157 | " 14 | \n",
158 | " 12 | \n",
159 | "
\n",
160 | " \n",
161 | " 1 | \n",
162 | " 03-02-12 | \n",
163 | " cold | \n",
164 | " 16 | \n",
165 | " 10 | \n",
166 | "
\n",
167 | " \n",
168 | " 2 | \n",
169 | " 04-02-12 | \n",
170 | " cold | \n",
171 | " 15 | \n",
172 | " 9 | \n",
173 | "
\n",
174 | " \n",
175 | " 3 | \n",
176 | " 05-02-12 | \n",
177 | " rainy | \n",
178 | " 10 | \n",
179 | " 14 | \n",
180 | "
\n",
181 | " \n",
182 | "
\n",
183 | "
"
184 | ],
185 | "text/plain": [
186 | " date event temp wind-speed\n",
187 | "0 01-02-12 sunny 14 12\n",
188 | "1 03-02-12 cold 16 10\n",
189 | "2 04-02-12 cold 15 9\n",
190 | "3 05-02-12 rainy 10 14"
191 | ]
192 | },
193 | "execution_count": 5,
194 | "metadata": {},
195 | "output_type": "execute_result"
196 | }
197 | ],
198 | "source": [
199 | "kolkata={\n",
200 | " \"date\":['01-02-12','03-02-12','04-02-12','05-02-12'],\n",
201 | " \"event\":['sunny','cold','cold','rainy'],\n",
202 | " \"wind-speed\":[12,10,9,14],\n",
203 | " \"temp\":[14,16,15,10]\n",
204 | "}\n",
205 | "kol=pd.DataFrame(kolkata)\n",
206 | "kol"
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": 6,
212 | "metadata": {},
213 | "outputs": [
214 | {
215 | "data": {
216 | "text/html": [
217 | "\n",
218 | "\n",
231 | "
\n",
232 | " \n",
233 | " \n",
234 | " | \n",
235 | " date | \n",
236 | " event | \n",
237 | " temp | \n",
238 | " wind-speed | \n",
239 | "
\n",
240 | " \n",
241 | " \n",
242 | " \n",
243 | " 0 | \n",
244 | " 01-02-12 | \n",
245 | " sunny | \n",
246 | " 14 | \n",
247 | " 12 | \n",
248 | "
\n",
249 | " \n",
250 | " 1 | \n",
251 | " 03-02-12 | \n",
252 | " cold | \n",
253 | " 16 | \n",
254 | " 10 | \n",
255 | "
\n",
256 | " \n",
257 | " 2 | \n",
258 | " 04-02-12 | \n",
259 | " cold | \n",
260 | " 15 | \n",
261 | " 9 | \n",
262 | "
\n",
263 | " \n",
264 | " 3 | \n",
265 | " 05-02-12 | \n",
266 | " rainy | \n",
267 | " 10 | \n",
268 | " 14 | \n",
269 | "
\n",
270 | " \n",
271 | " 0 | \n",
272 | " 01-02-12 | \n",
273 | " hot | \n",
274 | " 20 | \n",
275 | " 33 | \n",
276 | "
\n",
277 | " \n",
278 | " 1 | \n",
279 | " 03-02-12 | \n",
280 | " sunny | \n",
281 | " 21 | \n",
282 | " 23 | \n",
283 | "
\n",
284 | " \n",
285 | " 2 | \n",
286 | " 04-02-12 | \n",
287 | " rainy | \n",
288 | " 15 | \n",
289 | " 45 | \n",
290 | "
\n",
291 | " \n",
292 | " 3 | \n",
293 | " 05-02-12 | \n",
294 | " cold | \n",
295 | " 18 | \n",
296 | " 24 | \n",
297 | "
\n",
298 | " \n",
299 | "
\n",
300 | "
"
301 | ],
302 | "text/plain": [
303 | " date event temp wind-speed\n",
304 | "0 01-02-12 sunny 14 12\n",
305 | "1 03-02-12 cold 16 10\n",
306 | "2 04-02-12 cold 15 9\n",
307 | "3 05-02-12 rainy 10 14\n",
308 | "0 01-02-12 hot 20 33\n",
309 | "1 03-02-12 sunny 21 23\n",
310 | "2 04-02-12 rainy 15 45\n",
311 | "3 05-02-12 cold 18 24"
312 | ]
313 | },
314 | "execution_count": 6,
315 | "metadata": {},
316 | "output_type": "execute_result"
317 | }
318 | ],
319 | "source": [
320 | "df=pd.concat([kol,chen])\n",
321 | "df"
322 | ]
323 | },
324 | {
325 | "cell_type": "markdown",
326 | "metadata": {},
327 | "source": [
328 | " **You can observe the index first 0 to 3 then again 0 to 3.To ignore this you have to pass an extra argument.**"
329 | ]
330 | },
331 | {
332 | "cell_type": "code",
333 | "execution_count": 8,
334 | "metadata": {},
335 | "outputs": [
336 | {
337 | "data": {
338 | "text/html": [
339 | "\n",
340 | "\n",
353 | "
\n",
354 | " \n",
355 | " \n",
356 | " | \n",
357 | " date | \n",
358 | " event | \n",
359 | " temp | \n",
360 | " wind-speed | \n",
361 | "
\n",
362 | " \n",
363 | " \n",
364 | " \n",
365 | " 0 | \n",
366 | " 01-02-12 | \n",
367 | " sunny | \n",
368 | " 14 | \n",
369 | " 12 | \n",
370 | "
\n",
371 | " \n",
372 | " 1 | \n",
373 | " 03-02-12 | \n",
374 | " cold | \n",
375 | " 16 | \n",
376 | " 10 | \n",
377 | "
\n",
378 | " \n",
379 | " 2 | \n",
380 | " 04-02-12 | \n",
381 | " cold | \n",
382 | " 15 | \n",
383 | " 9 | \n",
384 | "
\n",
385 | " \n",
386 | " 3 | \n",
387 | " 05-02-12 | \n",
388 | " rainy | \n",
389 | " 10 | \n",
390 | " 14 | \n",
391 | "
\n",
392 | " \n",
393 | " 4 | \n",
394 | " 01-02-12 | \n",
395 | " hot | \n",
396 | " 20 | \n",
397 | " 33 | \n",
398 | "
\n",
399 | " \n",
400 | " 5 | \n",
401 | " 03-02-12 | \n",
402 | " sunny | \n",
403 | " 21 | \n",
404 | " 23 | \n",
405 | "
\n",
406 | " \n",
407 | " 6 | \n",
408 | " 04-02-12 | \n",
409 | " rainy | \n",
410 | " 15 | \n",
411 | " 45 | \n",
412 | "
\n",
413 | " \n",
414 | " 7 | \n",
415 | " 05-02-12 | \n",
416 | " cold | \n",
417 | " 18 | \n",
418 | " 24 | \n",
419 | "
\n",
420 | " \n",
421 | "
\n",
422 | "
"
423 | ],
424 | "text/plain": [
425 | " date event temp wind-speed\n",
426 | "0 01-02-12 sunny 14 12\n",
427 | "1 03-02-12 cold 16 10\n",
428 | "2 04-02-12 cold 15 9\n",
429 | "3 05-02-12 rainy 10 14\n",
430 | "4 01-02-12 hot 20 33\n",
431 | "5 03-02-12 sunny 21 23\n",
432 | "6 04-02-12 rainy 15 45\n",
433 | "7 05-02-12 cold 18 24"
434 | ]
435 | },
436 | "execution_count": 8,
437 | "metadata": {},
438 | "output_type": "execute_result"
439 | }
440 | ],
441 | "source": [
442 | "df=pd.concat([kol,chen],ignore_index=True)\n",
443 | "df\n",
444 | "# Now see index column"
445 | ]
446 | },
447 | {
448 | "cell_type": "markdown",
449 | "metadata": {},
450 | "source": [
451 | "##### If you want to join dataframes with dataframe name as an index name"
452 | ]
453 | },
454 | {
455 | "cell_type": "code",
456 | "execution_count": 9,
457 | "metadata": {},
458 | "outputs": [
459 | {
460 | "data": {
461 | "text/html": [
462 | "\n",
463 | "\n",
476 | "
\n",
477 | " \n",
478 | " \n",
479 | " | \n",
480 | " | \n",
481 | " date | \n",
482 | " event | \n",
483 | " temp | \n",
484 | " wind-speed | \n",
485 | "
\n",
486 | " \n",
487 | " \n",
488 | " \n",
489 | " Kolkata | \n",
490 | " 0 | \n",
491 | " 01-02-12 | \n",
492 | " sunny | \n",
493 | " 14 | \n",
494 | " 12 | \n",
495 | "
\n",
496 | " \n",
497 | " 1 | \n",
498 | " 03-02-12 | \n",
499 | " cold | \n",
500 | " 16 | \n",
501 | " 10 | \n",
502 | "
\n",
503 | " \n",
504 | " 2 | \n",
505 | " 04-02-12 | \n",
506 | " cold | \n",
507 | " 15 | \n",
508 | " 9 | \n",
509 | "
\n",
510 | " \n",
511 | " 3 | \n",
512 | " 05-02-12 | \n",
513 | " rainy | \n",
514 | " 10 | \n",
515 | " 14 | \n",
516 | "
\n",
517 | " \n",
518 | " Chennai | \n",
519 | " 0 | \n",
520 | " 01-02-12 | \n",
521 | " hot | \n",
522 | " 20 | \n",
523 | " 33 | \n",
524 | "
\n",
525 | " \n",
526 | " 1 | \n",
527 | " 03-02-12 | \n",
528 | " sunny | \n",
529 | " 21 | \n",
530 | " 23 | \n",
531 | "
\n",
532 | " \n",
533 | " 2 | \n",
534 | " 04-02-12 | \n",
535 | " rainy | \n",
536 | " 15 | \n",
537 | " 45 | \n",
538 | "
\n",
539 | " \n",
540 | " 3 | \n",
541 | " 05-02-12 | \n",
542 | " cold | \n",
543 | " 18 | \n",
544 | " 24 | \n",
545 | "
\n",
546 | " \n",
547 | "
\n",
548 | "
"
549 | ],
550 | "text/plain": [
551 | " date event temp wind-speed\n",
552 | "Kolkata 0 01-02-12 sunny 14 12\n",
553 | " 1 03-02-12 cold 16 10\n",
554 | " 2 04-02-12 cold 15 9\n",
555 | " 3 05-02-12 rainy 10 14\n",
556 | "Chennai 0 01-02-12 hot 20 33\n",
557 | " 1 03-02-12 sunny 21 23\n",
558 | " 2 04-02-12 rainy 15 45\n",
559 | " 3 05-02-12 cold 18 24"
560 | ]
561 | },
562 | "execution_count": 9,
563 | "metadata": {},
564 | "output_type": "execute_result"
565 | }
566 | ],
567 | "source": [
568 | "df=pd.concat([kol,chen],keys=['Kolkata','Chennai'])\n",
569 | "df"
570 | ]
571 | },
572 | {
573 | "cell_type": "code",
574 | "execution_count": 11,
575 | "metadata": {},
576 | "outputs": [
577 | {
578 | "data": {
579 | "text/html": [
580 | "\n",
581 | "\n",
594 | "
\n",
595 | " \n",
596 | " \n",
597 | " | \n",
598 | " date | \n",
599 | " event | \n",
600 | " temp | \n",
601 | " wind-speed | \n",
602 | "
\n",
603 | " \n",
604 | " \n",
605 | " \n",
606 | " 0 | \n",
607 | " 01-02-12 | \n",
608 | " sunny | \n",
609 | " 14 | \n",
610 | " 12 | \n",
611 | "
\n",
612 | " \n",
613 | " 1 | \n",
614 | " 03-02-12 | \n",
615 | " cold | \n",
616 | " 16 | \n",
617 | " 10 | \n",
618 | "
\n",
619 | " \n",
620 | " 2 | \n",
621 | " 04-02-12 | \n",
622 | " cold | \n",
623 | " 15 | \n",
624 | " 9 | \n",
625 | "
\n",
626 | " \n",
627 | " 3 | \n",
628 | " 05-02-12 | \n",
629 | " rainy | \n",
630 | " 10 | \n",
631 | " 14 | \n",
632 | "
\n",
633 | " \n",
634 | "
\n",
635 | "
"
636 | ],
637 | "text/plain": [
638 | " date event temp wind-speed\n",
639 | "0 01-02-12 sunny 14 12\n",
640 | "1 03-02-12 cold 16 10\n",
641 | "2 04-02-12 cold 15 9\n",
642 | "3 05-02-12 rainy 10 14"
643 | ]
644 | },
645 | "execution_count": 11,
646 | "metadata": {},
647 | "output_type": "execute_result"
648 | }
649 | ],
650 | "source": [
651 | "df.loc['Kolkata']"
652 | ]
653 | },
654 | {
655 | "cell_type": "code",
656 | "execution_count": 12,
657 | "metadata": {},
658 | "outputs": [
659 | {
660 | "data": {
661 | "text/html": [
662 | "\n",
663 | "\n",
676 | "
\n",
677 | " \n",
678 | " \n",
679 | " | \n",
680 | " date | \n",
681 | " event | \n",
682 | " temp | \n",
683 | " wind-speed | \n",
684 | "
\n",
685 | " \n",
686 | " \n",
687 | " \n",
688 | " 0 | \n",
689 | " 01-02-12 | \n",
690 | " hot | \n",
691 | " 20 | \n",
692 | " 33 | \n",
693 | "
\n",
694 | " \n",
695 | " 1 | \n",
696 | " 03-02-12 | \n",
697 | " sunny | \n",
698 | " 21 | \n",
699 | " 23 | \n",
700 | "
\n",
701 | " \n",
702 | " 2 | \n",
703 | " 04-02-12 | \n",
704 | " rainy | \n",
705 | " 15 | \n",
706 | " 45 | \n",
707 | "
\n",
708 | " \n",
709 | " 3 | \n",
710 | " 05-02-12 | \n",
711 | " cold | \n",
712 | " 18 | \n",
713 | " 24 | \n",
714 | "
\n",
715 | " \n",
716 | "
\n",
717 | "
"
718 | ],
719 | "text/plain": [
720 | " date event temp wind-speed\n",
721 | "0 01-02-12 hot 20 33\n",
722 | "1 03-02-12 sunny 21 23\n",
723 | "2 04-02-12 rainy 15 45\n",
724 | "3 05-02-12 cold 18 24"
725 | ]
726 | },
727 | "execution_count": 12,
728 | "metadata": {},
729 | "output_type": "execute_result"
730 | }
731 | ],
732 | "source": [
733 | "df.loc['Chennai']"
734 | ]
735 | },
736 | {
737 | "cell_type": "markdown",
738 | "metadata": {},
739 | "source": [
740 | "### 2) Append by column :\n",
741 | " if you have two dataframes and you want to append column wise. For example - if you have two dataframes of weather\n",
742 | " first dataframe having columns date,event & temp and second dataframe having columns date,event & wind-speed when you\n",
743 | " join both you will get one dataframe having columns date, event, temp, date, event and wind-speed.\n",
744 | " To join two dataframe column wise\n",
745 | " you have to pass axis=1 in concat() method.\n",
746 | " \n",
747 | " df=pd.concat([dataframe_1,dataframe_2,...,dataframe_n],axis=1)
"
748 | ]
749 | },
750 | {
751 | "cell_type": "code",
752 | "execution_count": 3,
753 | "metadata": {},
754 | "outputs": [
755 | {
756 | "data": {
757 | "text/html": [
758 | "\n",
759 | "\n",
772 | "
\n",
773 | " \n",
774 | " \n",
775 | " | \n",
776 | " date | \n",
777 | " event | \n",
778 | " temp | \n",
779 | "
\n",
780 | " \n",
781 | " \n",
782 | " \n",
783 | " 0 | \n",
784 | " 01-02-12 | \n",
785 | " sunny | \n",
786 | " 14 | \n",
787 | "
\n",
788 | " \n",
789 | " 1 | \n",
790 | " 03-02-12 | \n",
791 | " cold | \n",
792 | " 16 | \n",
793 | "
\n",
794 | " \n",
795 | " 2 | \n",
796 | " 04-02-12 | \n",
797 | " cold | \n",
798 | " 15 | \n",
799 | "
\n",
800 | " \n",
801 | " 3 | \n",
802 | " 05-02-12 | \n",
803 | " rainy | \n",
804 | " 10 | \n",
805 | "
\n",
806 | " \n",
807 | "
\n",
808 | "
"
809 | ],
810 | "text/plain": [
811 | " date event temp\n",
812 | "0 01-02-12 sunny 14\n",
813 | "1 03-02-12 cold 16\n",
814 | "2 04-02-12 cold 15\n",
815 | "3 05-02-12 rainy 10"
816 | ]
817 | },
818 | "execution_count": 3,
819 | "metadata": {},
820 | "output_type": "execute_result"
821 | }
822 | ],
823 | "source": [
824 | "temp1={\n",
825 | " \"date\":['01-02-12','03-02-12','04-02-12','05-02-12'],\n",
826 | " \"event\":['sunny','cold','cold','rainy'],\n",
827 | " \"temp\":[14,16,15,10]\n",
828 | "}\n",
829 | "temp=pd.DataFrame(temp1)\n",
830 | "temp"
831 | ]
832 | },
833 | {
834 | "cell_type": "code",
835 | "execution_count": 7,
836 | "metadata": {},
837 | "outputs": [
838 | {
839 | "data": {
840 | "text/html": [
841 | "\n",
842 | "\n",
855 | "
\n",
856 | " \n",
857 | " \n",
858 | " | \n",
859 | " date | \n",
860 | " event | \n",
861 | " wind-speed | \n",
862 | "
\n",
863 | " \n",
864 | " \n",
865 | " \n",
866 | " 0 | \n",
867 | " 01-02-12 | \n",
868 | " sunny | \n",
869 | " 12 | \n",
870 | "
\n",
871 | " \n",
872 | " 1 | \n",
873 | " 03-02-12 | \n",
874 | " cold | \n",
875 | " 10 | \n",
876 | "
\n",
877 | " \n",
878 | " 2 | \n",
879 | " 04-02-12 | \n",
880 | " cold | \n",
881 | " 9 | \n",
882 | "
\n",
883 | " \n",
884 | " 3 | \n",
885 | " 05-02-12 | \n",
886 | " rainy | \n",
887 | " 14 | \n",
888 | "
\n",
889 | " \n",
890 | "
\n",
891 | "
"
892 | ],
893 | "text/plain": [
894 | " date event wind-speed\n",
895 | "0 01-02-12 sunny 12\n",
896 | "1 03-02-12 cold 10\n",
897 | "2 04-02-12 cold 9\n",
898 | "3 05-02-12 rainy 14"
899 | ]
900 | },
901 | "execution_count": 7,
902 | "metadata": {},
903 | "output_type": "execute_result"
904 | }
905 | ],
906 | "source": [
907 | "ws={\n",
908 | " \"date\":['01-02-12','03-02-12','04-02-12','05-02-12'],\n",
909 | " \"event\":['sunny','cold','cold','rainy'],\n",
910 | " \"wind-speed\":[12,10,9,14],\n",
911 | "}\n",
912 | "wind_speed=pd.DataFrame(ws)\n",
913 | "wind_speed"
914 | ]
915 | },
916 | {
917 | "cell_type": "code",
918 | "execution_count": 9,
919 | "metadata": {},
920 | "outputs": [
921 | {
922 | "data": {
923 | "text/html": [
924 | "\n",
925 | "\n",
938 | "
\n",
939 | " \n",
940 | " \n",
941 | " | \n",
942 | " date | \n",
943 | " event | \n",
944 | " temp | \n",
945 | " date | \n",
946 | " event | \n",
947 | " wind-speed | \n",
948 | "
\n",
949 | " \n",
950 | " \n",
951 | " \n",
952 | " 0 | \n",
953 | " 01-02-12 | \n",
954 | " sunny | \n",
955 | " 14 | \n",
956 | " 01-02-12 | \n",
957 | " sunny | \n",
958 | " 12 | \n",
959 | "
\n",
960 | " \n",
961 | " 1 | \n",
962 | " 03-02-12 | \n",
963 | " cold | \n",
964 | " 16 | \n",
965 | " 03-02-12 | \n",
966 | " cold | \n",
967 | " 10 | \n",
968 | "
\n",
969 | " \n",
970 | " 2 | \n",
971 | " 04-02-12 | \n",
972 | " cold | \n",
973 | " 15 | \n",
974 | " 04-02-12 | \n",
975 | " cold | \n",
976 | " 9 | \n",
977 | "
\n",
978 | " \n",
979 | " 3 | \n",
980 | " 05-02-12 | \n",
981 | " rainy | \n",
982 | " 10 | \n",
983 | " 05-02-12 | \n",
984 | " rainy | \n",
985 | " 14 | \n",
986 | "
\n",
987 | " \n",
988 | "
\n",
989 | "
"
990 | ],
991 | "text/plain": [
992 | " date event temp date event wind-speed\n",
993 | "0 01-02-12 sunny 14 01-02-12 sunny 12\n",
994 | "1 03-02-12 cold 16 03-02-12 cold 10\n",
995 | "2 04-02-12 cold 15 04-02-12 cold 9\n",
996 | "3 05-02-12 rainy 10 05-02-12 rainy 14"
997 | ]
998 | },
999 | "execution_count": 9,
1000 | "metadata": {},
1001 | "output_type": "execute_result"
1002 | }
1003 | ],
1004 | "source": [
1005 | "df=pd.concat([temp,wind_speed],axis=1)\n",
1006 | "df"
1007 | ]
1008 | },
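1009 | {
1010 | "cell_type": "markdown",
1011 | "metadata": {},
1012 | "source": [
1013 | "Note the duplicated date and event columns above. One way around this (a sketch, not in the original notebook) is to index both frames by date and concatenate only the new column, letting pandas align on the index:"
1014 | ]
1015 | },
1016 | {
1017 | "cell_type": "code",
1018 | "execution_count": null,
1019 | "metadata": {},
1020 | "outputs": [],
1021 | "source": [
1022 | "# Align on date instead of the positional index, so shared columns are not repeated.\n",
1023 | "pd.concat([temp.set_index('date'), wind_speed.set_index('date')[['wind-speed']]], axis=1)"
1024 | ]
1025 | },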
1009 | {
1010 | "cell_type": "markdown",
1011 | "metadata": {},
1012 | "source": []
1013 | }
1014 | ],
1015 | "metadata": {
1016 | "kernelspec": {
1017 | "display_name": "Python 3",
1018 | "language": "python",
1019 | "name": "python3"
1020 | },
1021 | "language_info": {
1022 | "codemirror_mode": {
1023 | "name": "ipython",
1024 | "version": 3
1025 | },
1026 | "file_extension": ".py",
1027 | "mimetype": "text/x-python",
1028 | "name": "python",
1029 | "nbconvert_exporter": "python",
1030 | "pygments_lexer": "ipython3",
1031 | "version": "3.6.4"
1032 | }
1033 | },
1034 | "nbformat": 4,
1035 | "nbformat_minor": 2
1036 | }
1037 |
--------------------------------------------------------------------------------
/pandas_part8.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Merging dataframes \n",
8 | "-------\n",
9 | " merge() is a method in pandas in which you can merge two dataframes withou repeating columns as we did in\n",
10 | " concat() method.\n",
11 | " There are two types of merging -\n",
12 | " 1) Inner join\n",
13 | " 2) outer join\n",
14 | " ### 1) Inner join\n",
15 | " In this method you will get the intersetion of two dataframes with merged column.It is the default merge method.\n",
16 | " In merge() method you have to pass the dataframes as arguments and list of columns on which you want to merge\n",
17 | " and dataframes as arguments."
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": 1,
23 | "metadata": {},
24 | "outputs": [],
25 | "source": [
26 | "import pandas as pd"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 2,
32 | "metadata": {},
33 | "outputs": [
34 | {
35 | "data": {
36 | "text/html": [
37 | "\n",
38 | "\n",
51 | "
\n",
52 | " \n",
53 | " \n",
54 | " | \n",
55 | " date | \n",
56 | " event | \n",
57 | " temp | \n",
58 | "
\n",
59 | " \n",
60 | " \n",
61 | " \n",
62 | " 0 | \n",
63 | " 01-02-12 | \n",
64 | " sunny | \n",
65 | " 14 | \n",
66 | "
\n",
67 | " \n",
68 | " 1 | \n",
69 | " 03-02-12 | \n",
70 | " cold | \n",
71 | " 16 | \n",
72 | "
\n",
73 | " \n",
74 | " 2 | \n",
75 | " 04-02-12 | \n",
76 | " cold | \n",
77 | " 15 | \n",
78 | "
\n",
79 | " \n",
80 | " 3 | \n",
81 | " 05-02-12 | \n",
82 | " rainy | \n",
83 | " 10 | \n",
84 | "
\n",
85 | " \n",
86 | "
\n",
87 | "
"
88 | ],
89 | "text/plain": [
90 | " date event temp\n",
91 | "0 01-02-12 sunny 14\n",
92 | "1 03-02-12 cold 16\n",
93 | "2 04-02-12 cold 15\n",
94 | "3 05-02-12 rainy 10"
95 | ]
96 | },
97 | "execution_count": 2,
98 | "metadata": {},
99 | "output_type": "execute_result"
100 | }
101 | ],
102 | "source": [
103 | "temp1={\n",
104 | " \"date\":['01-02-12','03-02-12','04-02-12','05-02-12'],\n",
105 | " \"event\":['sunny','cold','cold','rainy'],\n",
106 | " \"temp\":[14,16,15,10]\n",
107 | "}\n",
108 | "temp=pd.DataFrame(temp1)\n",
109 | "temp"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 3,
115 | "metadata": {},
116 | "outputs": [
117 | {
118 | "data": {
119 | "text/html": [
120 | "\n",
121 | "\n",
134 | "
\n",
135 | " \n",
136 | " \n",
137 | " | \n",
138 | " date | \n",
139 | " event | \n",
140 | " wind-speed | \n",
141 | "
\n",
142 | " \n",
143 | " \n",
144 | " \n",
145 | " 0 | \n",
146 | " 01-02-12 | \n",
147 | " sunny | \n",
148 | " 12 | \n",
149 | "
\n",
150 | " \n",
151 | " 1 | \n",
152 | " 03-02-12 | \n",
153 | " cold | \n",
154 | " 10 | \n",
155 | "
\n",
156 | " \n",
157 | " 2 | \n",
158 | " 04-02-12 | \n",
159 | " cold | \n",
160 | " 9 | \n",
161 | "
\n",
162 | " \n",
163 | " 3 | \n",
164 | " 05-02-12 | \n",
165 | " rainy | \n",
166 | " 14 | \n",
167 | "
\n",
168 | " \n",
169 | "
\n",
170 | "
"
171 | ],
172 | "text/plain": [
173 | " date event wind-speed\n",
174 | "0 01-02-12 sunny 12\n",
175 | "1 03-02-12 cold 10\n",
176 | "2 04-02-12 cold 9\n",
177 | "3 05-02-12 rainy 14"
178 | ]
179 | },
180 | "execution_count": 3,
181 | "metadata": {},
182 | "output_type": "execute_result"
183 | }
184 | ],
185 | "source": [
186 | "ws={\n",
187 | " \"date\":['01-02-12','03-02-12','04-02-12','05-02-12'],\n",
188 | " \"event\":['sunny','cold','cold','rainy'],\n",
189 | " \"wind-speed\":[12,10,9,14],\n",
190 | "}\n",
191 | "wind_speed=pd.DataFrame(ws)\n",
192 | "wind_speed"
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": 5,
198 | "metadata": {},
199 | "outputs": [
200 | {
201 | "data": {
202 | "text/html": [
203 | "\n",
204 | "\n",
217 | "
\n",
218 | " \n",
219 | " \n",
220 | " | \n",
221 | " date | \n",
222 | " event | \n",
223 | " temp | \n",
224 | " wind-speed | \n",
225 | "
\n",
226 | " \n",
227 | " \n",
228 | " \n",
229 | " 0 | \n",
230 | " 01-02-12 | \n",
231 | " sunny | \n",
232 | " 14 | \n",
233 | " 12 | \n",
234 | "
\n",
235 | " \n",
236 | " 1 | \n",
237 | " 03-02-12 | \n",
238 | " cold | \n",
239 | " 16 | \n",
240 | " 10 | \n",
241 | "
\n",
242 | " \n",
243 | " 2 | \n",
244 | " 04-02-12 | \n",
245 | " cold | \n",
246 | " 15 | \n",
247 | " 9 | \n",
248 | "
\n",
249 | " \n",
250 | " 3 | \n",
251 | " 05-02-12 | \n",
252 | " rainy | \n",
253 | " 10 | \n",
254 | " 14 | \n",
255 | "
\n",
256 | " \n",
257 | "
\n",
258 | "
"
259 | ],
260 | "text/plain": [
261 | " date event temp wind-speed\n",
262 | "0 01-02-12 sunny 14 12\n",
263 | "1 03-02-12 cold 16 10\n",
264 | "2 04-02-12 cold 15 9\n",
265 | "3 05-02-12 rainy 10 14"
266 | ]
267 | },
268 | "execution_count": 5,
269 | "metadata": {},
270 | "output_type": "execute_result"
271 | }
272 | ],
273 | "source": [
274 | "df=pd.merge(temp,wind_speed,on=['date','event'])\n",
275 | "df"
276 | ]
277 | },
278 | {
279 | "cell_type": "markdown",
280 | "metadata": {},
281 | "source": [
282 | " If you have different data in the common columns then you can not do inner merge in this case.\n",
283 | " Assume the following case:\n",
284 | " \n",
285 | " temp \n",
286 | " ___________________\n",
287 | " | event | temp |\n",
288 | " |--------|----------|\n",
289 | " | sunny | 40 |\n",
290 | " | hot | 30 |\n",
291 | " | rainy | 25 |\n",
292 | " |________|__________|\n",
293 | " \n",
294 | " wind-speed\n",
295 | " ___________________\n",
296 | " | event |wind-speed|\n",
297 | " |--------|----------|\n",
298 | " | sunny | 12 |\n",
299 | " | hot | 14 |\n",
300 | " | hot | 11 |\n",
301 | " |________|__________|\n",
302 | " \n",
303 | " Merged Dataframe by inner method\n",
304 | " \n",
305 | " ________________________________\n",
306 | " | event | temp | wind-speed |\n",
307 | " |--------|----------|------------|\n",
308 | " | sunny | 40 | 12 | \n",
309 | " | hot | 30 | 14 |\n",
310 | " |________|__________|____________|\n",
311 | " \n",
312 | " You can see that last row is not merged here because there is no common element in the common columns.\n",
313 | " "
314 | ]
315 | },
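The tables above can be reproduced with a short sketch (the names `t` and `w` are illustrative):

```python
import pandas as pd

t = pd.DataFrame({"event": ["sunny", "hot", "rainy"], "temp": [40, 30, 25]})
w = pd.DataFrame({"event": ["sunny", "hot", "cold"], "wind-speed": [12, 14, 11]})

# An inner join keeps only the events present in both frames;
# 'rainy' and 'cold' have no partner, so they are dropped.
print(pd.merge(t, w, on="event"))
#    event  temp  wind-speed
# 0  sunny    40          12
# 1    hot    30          14
```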
316 | {
317 | "cell_type": "markdown",
318 | "metadata": {},
319 | "source": [
320 | "### Outer join\n",
321 | "This is just like union of two dataframe.The value which dont exist will contain NaN. \n",
322 | " \n",
323 | "\n",
324 | " temp \n",
325 | " ___________________\n",
326 | " | event | temp |\n",
327 | " |--------|----------|\n",
328 | " | sunny | 40 |\n",
329 | " | hot | 30 |\n",
330 | " | rainy | 25 |\n",
331 | " |________|__________|\n",
332 | " \n",
333 | " wind-speed\n",
334 | " ___________________\n",
335 | " | event |wind-speed|\n",
336 | " |--------|----------|\n",
337 | " | sunny | 12 |\n",
338 | " | hot | 14 |\n",
339 | " | hot | 11 |\n",
340 | " |________|__________|\n",
341 | " \n",
342 | " Merged Dataframe by inner method\n",
343 | " \n",
344 | " ________________________________\n",
345 | " | event | temp | wind-speed |\n",
346 | " |--------|----------|------------|\n",
347 | " | sunny | 40 | 12 | \n",
348 | " | hot | 30 | 14 |\n",
349 | " | rainy | 25 | NaN |\n",
350 | " | hot | NaN | 11 |\n",
351 | " |________|__________|____________|\n",
352 | " \n",
353 | " You can see in the final dataframe two rows increased."
354 | ]
355 | },
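The same illustrative frames, merged with how="outer" this time (repeated here so the snippet runs on its own):

```python
import pandas as pd

t = pd.DataFrame({"event": ["sunny", "hot", "rainy"], "temp": [40, 30, 25]})
w = pd.DataFrame({"event": ["sunny", "hot", "cold"], "wind-speed": [12, 14, 11]})

# An outer join is the union: unmatched rows from either side are kept,
# with NaN filling the columns the other side could not supply.
print(pd.merge(t, w, on="event", how="outer"))
#    event  temp  wind-speed
# 0  sunny  40.0        12.0
# 1    hot  30.0        14.0
# 2  rainy  25.0         NaN
# 3   cold   NaN        11.0
```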
356 | {
357 | "cell_type": "code",
358 | "execution_count": 8,
359 | "metadata": {},
360 | "outputs": [
361 | {
362 | "data": {
363 | "text/html": [
364 | "\n",
365 | "\n",
378 | "
\n",
379 | " \n",
380 | " \n",
381 | " | \n",
382 | " date | \n",
383 | " event | \n",
384 | " temp | \n",
385 | "
\n",
386 | " \n",
387 | " \n",
388 | " \n",
389 | " 0 | \n",
390 | " 01-02-12 | \n",
391 | " sunny | \n",
392 | " 14 | \n",
393 | "
\n",
394 | " \n",
395 | " 1 | \n",
396 | " 03-02-12 | \n",
397 | " cold | \n",
398 | " 16 | \n",
399 | "
\n",
400 | " \n",
401 | " 2 | \n",
402 | " 04-02-12 | \n",
403 | " hot | \n",
404 | " 15 | \n",
405 | "
\n",
406 | " \n",
407 | " 3 | \n",
408 | " 05-02-12 | \n",
409 | " sunny | \n",
410 | " 10 | \n",
411 | "
\n",
412 | " \n",
413 | "
\n",
414 | "
"
415 | ],
416 | "text/plain": [
417 | " date event temp\n",
418 | "0 01-02-12 sunny 14\n",
419 | "1 03-02-12 cold 16\n",
420 | "2 04-02-12 hot 15\n",
421 | "3 05-02-12 sunny 10"
422 | ]
423 | },
424 | "execution_count": 8,
425 | "metadata": {},
426 | "output_type": "execute_result"
427 | }
428 | ],
429 | "source": [
430 | "temp1={\n",
431 | " \"date\":['01-02-12','03-02-12','04-02-12','05-02-12'],\n",
432 | " \"event\":['sunny','cold','hot','sunny'],\n",
433 | " \"temp\":[14,16,15,10]\n",
434 | "}\n",
435 | "temp=pd.DataFrame(temp1)\n",
436 | "temp"
437 | ]
438 | },
439 | {
440 | "cell_type": "code",
441 | "execution_count": 7,
442 | "metadata": {},
443 | "outputs": [
444 | {
445 | "data": {
446 | "text/html": [
447 | "\n",
448 | "\n",
461 | "
\n",
462 | " \n",
463 | " \n",
464 | " | \n",
465 | " date | \n",
466 | " event | \n",
467 | " wind-speed | \n",
468 | "
\n",
469 | " \n",
470 | " \n",
471 | " \n",
472 | " 0 | \n",
473 | " 01-02-12 | \n",
474 | " sunny | \n",
475 | " 12 | \n",
476 | "
\n",
477 | " \n",
478 | " 1 | \n",
479 | " 03-02-12 | \n",
480 | " cold | \n",
481 | " 10 | \n",
482 | "
\n",
483 | " \n",
484 | " 2 | \n",
485 | " 04-02-12 | \n",
486 | " cold | \n",
487 | " 9 | \n",
488 | "
\n",
489 | " \n",
490 | " 3 | \n",
491 | " 05-02-12 | \n",
492 | " rainy | \n",
493 | " 14 | \n",
494 | "
\n",
495 | " \n",
496 | "
\n",
497 | "
"
498 | ],
499 | "text/plain": [
500 | " date event wind-speed\n",
501 | "0 01-02-12 sunny 12\n",
502 | "1 03-02-12 cold 10\n",
503 | "2 04-02-12 cold 9\n",
504 | "3 05-02-12 rainy 14"
505 | ]
506 | },
507 | "execution_count": 7,
508 | "metadata": {},
509 | "output_type": "execute_result"
510 | }
511 | ],
512 | "source": [
513 | "ws={\n",
514 | " \"date\":['01-02-12','03-02-12','04-02-12','05-02-12'],\n",
515 | " \"event\":['sunny','cold','cold','rainy'],\n",
516 | " \"wind-speed\":[12,10,9,14],\n",
517 | "}\n",
518 | "wind_speed=pd.DataFrame(ws)\n",
519 | "wind_speed"
520 | ]
521 | },
522 | {
523 | "cell_type": "markdown",
524 | "metadata": {},
525 | "source": [
526 | "##### One extra argument you need to pass in outer join i.e, how=\"outer\".By defaut it is inner."
527 | ]
528 | },
529 | {
530 | "cell_type": "code",
531 | "execution_count": 9,
532 | "metadata": {},
533 | "outputs": [
534 | {
535 | "data": {
536 | "text/html": [
537 | "\n",
538 | "\n",
551 | "
\n",
552 | " \n",
553 | " \n",
554 | " | \n",
555 | " date | \n",
556 | " event | \n",
557 | " temp | \n",
558 | " wind-speed | \n",
559 | "
\n",
560 | " \n",
561 | " \n",
562 | " \n",
563 | " 0 | \n",
564 | " 01-02-12 | \n",
565 | " sunny | \n",
566 | " 14.0 | \n",
567 | " 12.0 | \n",
568 | "
\n",
569 | " \n",
570 | " 1 | \n",
571 | " 03-02-12 | \n",
572 | " cold | \n",
573 | " 16.0 | \n",
574 | " 10.0 | \n",
575 | "
\n",
576 | " \n",
577 | " 2 | \n",
578 | " 04-02-12 | \n",
579 | " hot | \n",
580 | " 15.0 | \n",
581 | " NaN | \n",
582 | "
\n",
583 | " \n",
584 | " 3 | \n",
585 | " 05-02-12 | \n",
586 | " sunny | \n",
587 | " 10.0 | \n",
588 | " NaN | \n",
589 | "
\n",
590 | " \n",
591 | " 4 | \n",
592 | " 04-02-12 | \n",
593 | " cold | \n",
594 | " NaN | \n",
595 | " 9.0 | \n",
596 | "
\n",
597 | " \n",
598 | " 5 | \n",
599 | " 05-02-12 | \n",
600 | " rainy | \n",
601 | " NaN | \n",
602 | " 14.0 | \n",
603 | "
\n",
604 | " \n",
605 | "
\n",
606 | "
"
607 | ],
608 | "text/plain": [
609 | " date event temp wind-speed\n",
610 | "0 01-02-12 sunny 14.0 12.0\n",
611 | "1 03-02-12 cold 16.0 10.0\n",
612 | "2 04-02-12 hot 15.0 NaN\n",
613 | "3 05-02-12 sunny 10.0 NaN\n",
614 | "4 04-02-12 cold NaN 9.0\n",
615 | "5 05-02-12 rainy NaN 14.0"
616 | ]
617 | },
618 | "execution_count": 9,
619 | "metadata": {},
620 | "output_type": "execute_result"
621 | }
622 | ],
623 | "source": [
624 | "df=pd.merge(temp,wind_speed,on=['date','event'],how='outer')\n",
625 | "df"
626 | ]
627 | },
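If you want to check where each merged row came from, merge() also accepts an indicator flag (an optional extra that this notebook does not use):

```python
# indicator=True appends a '_merge' column whose values are
# 'both', 'left_only', or 'right_only'.
df = pd.merge(temp, wind_speed, on=['date', 'event'], how='outer', indicator=True)
print(df[['date', 'event', '_merge']])
```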
628 | {
629 | "cell_type": "code",
630 | "execution_count": null,
631 | "metadata": {},
632 | "outputs": [],
633 | "source": []
634 | }
635 | ],
636 | "metadata": {
637 | "kernelspec": {
638 | "display_name": "Python 3",
639 | "language": "python",
640 | "name": "python3"
641 | },
642 | "language_info": {
643 | "codemirror_mode": {
644 | "name": "ipython",
645 | "version": 3
646 | },
647 | "file_extension": ".py",
648 | "mimetype": "text/x-python",
649 | "name": "python",
650 | "nbconvert_exporter": "python",
651 | "pygments_lexer": "ipython3",
652 | "version": "3.6.4"
653 | }
654 | },
655 | "nbformat": 4,
656 | "nbformat_minor": 2
657 | }
658 |
--------------------------------------------------------------------------------