├── img ├── csvfile.png ├── csvfile1.png ├── dataframe.png └── dnld_rep.png ├── 001_Python_Pandas_Methods ├── student_data.csv ├── img │ ├── dad.png │ ├── ddkf.png │ ├── dfat.png │ ├── dfri.png │ ├── midf.png │ ├── dcfsr.png │ ├── dfcsv.png │ ├── dfdict.png │ ├── dfhead.png │ ├── dfiat.png │ ├── dftail.png │ ├── micolh.png │ ├── dcolfmi.png │ ├── dcwaONA.png │ ├── dcwnoNA.png │ ├── ddfdcol.png │ ├── df2dict.png │ ├── dropscol.png │ ├── midfhead.png │ └── midftail.png ├── README.md ├── automobile_data.csv ├── 002_Python_Pandas_DataFrame_from_List.ipynb ├── 005_Python_Pandas_DataFrame_drop_duplicates.ipynb ├── 006_Python_Pandas_DataFrame_drop_columns_with_NA.ipynb ├── 008_Python_Pandas_DataFrame_to_Python_dictionary.ipynb ├── 003_Python_Pandas_DataFrame_head()_and_tail().ipynb ├── 010_Python_Pandas_DataFrame_reset_index.ipynb ├── 001_Python_Pandas_DataFrame_from_Dictionary.ipynb └── 009_Python_Pandas_DataFrame_set_index.ipynb ├── Pandas Cheat Sheet Data Wrangling in Python.pdf ├── Pandas Cheat Sheet for Data Science in Python.pdf ├── stockprice_data.csv ├── LICENSE ├── automobile_data.csv └── README.md /img/csvfile.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/milaan9/10_Python_Pandas_Module/HEAD/img/csvfile.png -------------------------------------------------------------------------------- /img/csvfile1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/milaan9/10_Python_Pandas_Module/HEAD/img/csvfile1.png -------------------------------------------------------------------------------- /img/dataframe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/milaan9/10_Python_Pandas_Module/HEAD/img/dataframe.png -------------------------------------------------------------------------------- /img/dnld_rep.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/milaan9/10_Python_Pandas_Module/HEAD/img/dnld_rep.png -------------------------------------------------------------------------------- /001_Python_Pandas_Methods/student_data.csv: -------------------------------------------------------------------------------- 1 | Name,Marks 2 | Nat,70.88 3 | Harry,85.9 4 | Joe,91.45 5 | -------------------------------------------------------------------------------- /001_Python_Pandas_Methods/img/dad.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/milaan9/10_Python_Pandas_Module/HEAD/001_Python_Pandas_Methods/img/dad.png -------------------------------------------------------------------------------- /001_Python_Pandas_Methods/img/ddkf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/milaan9/10_Python_Pandas_Module/HEAD/001_Python_Pandas_Methods/img/ddkf.png -------------------------------------------------------------------------------- /001_Python_Pandas_Methods/img/dfat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/milaan9/10_Python_Pandas_Module/HEAD/001_Python_Pandas_Methods/img/dfat.png -------------------------------------------------------------------------------- /001_Python_Pandas_Methods/img/dfri.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/milaan9/10_Python_Pandas_Module/HEAD/001_Python_Pandas_Methods/img/dfri.png -------------------------------------------------------------------------------- /001_Python_Pandas_Methods/img/midf.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/milaan9/10_Python_Pandas_Module/HEAD/001_Python_Pandas_Methods/img/midf.png -------------------------------------------------------------------------------- /001_Python_Pandas_Methods/img/dcfsr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/milaan9/10_Python_Pandas_Module/HEAD/001_Python_Pandas_Methods/img/dcfsr.png -------------------------------------------------------------------------------- /001_Python_Pandas_Methods/img/dfcsv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/milaan9/10_Python_Pandas_Module/HEAD/001_Python_Pandas_Methods/img/dfcsv.png -------------------------------------------------------------------------------- /001_Python_Pandas_Methods/img/dfdict.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/milaan9/10_Python_Pandas_Module/HEAD/001_Python_Pandas_Methods/img/dfdict.png -------------------------------------------------------------------------------- /001_Python_Pandas_Methods/img/dfhead.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/milaan9/10_Python_Pandas_Module/HEAD/001_Python_Pandas_Methods/img/dfhead.png -------------------------------------------------------------------------------- /001_Python_Pandas_Methods/img/dfiat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/milaan9/10_Python_Pandas_Module/HEAD/001_Python_Pandas_Methods/img/dfiat.png -------------------------------------------------------------------------------- /001_Python_Pandas_Methods/img/dftail.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/milaan9/10_Python_Pandas_Module/HEAD/001_Python_Pandas_Methods/img/dftail.png -------------------------------------------------------------------------------- /001_Python_Pandas_Methods/img/micolh.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/milaan9/10_Python_Pandas_Module/HEAD/001_Python_Pandas_Methods/img/micolh.png -------------------------------------------------------------------------------- /001_Python_Pandas_Methods/img/dcolfmi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/milaan9/10_Python_Pandas_Module/HEAD/001_Python_Pandas_Methods/img/dcolfmi.png -------------------------------------------------------------------------------- /001_Python_Pandas_Methods/img/dcwaONA.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/milaan9/10_Python_Pandas_Module/HEAD/001_Python_Pandas_Methods/img/dcwaONA.png -------------------------------------------------------------------------------- /001_Python_Pandas_Methods/img/dcwnoNA.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/milaan9/10_Python_Pandas_Module/HEAD/001_Python_Pandas_Methods/img/dcwnoNA.png -------------------------------------------------------------------------------- /001_Python_Pandas_Methods/img/ddfdcol.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/milaan9/10_Python_Pandas_Module/HEAD/001_Python_Pandas_Methods/img/ddfdcol.png -------------------------------------------------------------------------------- /001_Python_Pandas_Methods/img/df2dict.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/milaan9/10_Python_Pandas_Module/HEAD/001_Python_Pandas_Methods/img/df2dict.png -------------------------------------------------------------------------------- /001_Python_Pandas_Methods/img/dropscol.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/milaan9/10_Python_Pandas_Module/HEAD/001_Python_Pandas_Methods/img/dropscol.png -------------------------------------------------------------------------------- /001_Python_Pandas_Methods/img/midfhead.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/milaan9/10_Python_Pandas_Module/HEAD/001_Python_Pandas_Methods/img/midfhead.png -------------------------------------------------------------------------------- /001_Python_Pandas_Methods/img/midftail.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/milaan9/10_Python_Pandas_Module/HEAD/001_Python_Pandas_Methods/img/midftail.png -------------------------------------------------------------------------------- /Pandas Cheat Sheet Data Wrangling in Python.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/milaan9/10_Python_Pandas_Module/HEAD/Pandas Cheat Sheet Data Wrangling in Python.pdf -------------------------------------------------------------------------------- /Pandas Cheat Sheet for Data Science in Python.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/milaan9/10_Python_Pandas_Module/HEAD/Pandas Cheat Sheet for Data Science in Python.pdf -------------------------------------------------------------------------------- /stockprice_data.csv: -------------------------------------------------------------------------------- 1 | Date,Closing price,Return 2 | 1/1/2020,100,0.010000 3 | 
2/1/2020,120,0.200000 4 | 3/1/2020,130,0.083333 5 | 4/1/2020,98,-0.246154 6 | 5/1/2020,50,-0.489796 7 | 6/1/2020,102,1.040000 8 | 7/1/2020,104,0.019608 9 | 8/1/2020,150,0.442308 10 | 9/1/2020,160,0.066667 11 | 10/1/2020,109,-0.318750 12 | 11/1/2020,95,-0.128440 13 | -------------------------------------------------------------------------------- /001_Python_Pandas_Methods/README.md: -------------------------------------------------------------------------------- 1 |

2 | Last Commit 3 | 4 |

5 | 6 | 7 | 8 | # Python Pandas Methods 9 | 10 | In this class, you'll learn about python pandas methods. 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 milaan9 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /automobile_data.csv: -------------------------------------------------------------------------------- 1 | index,company,body-style,wheel-base,length,engine-type,num-of-cylinders,horsepower,average-mileage,price 2 | 0,alfa-romero,convertible,88.6,168.8,dohc,four,111,21,13495 3 | 1,alfa-romero,convertible,88.6,168.8,dohc,four,111,21,16500 4 | 2,alfa-romero,hatchback,94.5,171.2,ohcv,six,154,19,16500 5 | 3,audi,sedan,99.8,176.6,ohc,four,102,24,13950 6 | 4,audi,sedan,99.4,176.6,ohc,five,115,18,17450 7 | 5,audi,sedan,99.8,177.3,ohc,five,110,19,15250 8 | 6,audi,wagon,105.8,192.7,ohc,five,110,19,18920 9 | 9,bmw,sedan,101.2,176.8,ohc,four,101,23,16430 10 | 10,bmw,sedan,101.2,176.8,ohc,four,101,23,16925 11 | 11,bmw,sedan,101.2,176.8,ohc,six,121,21,20970 12 | 13,bmw,sedan,103.5,189,ohc,six,182,16,30760 13 | 14,bmw,sedan,103.5,193.8,ohc,six,182,16,41315 14 | 15,bmw,sedan,110,197,ohc,six,182,15,36880 15 | 16,chevrolet,hatchback,88.4,141.1,l,three,48,47,5151 16 | 17,chevrolet,hatchback,94.5,155.9,ohc,four,70,38,6295 17 | 18,chevrolet,sedan,94.5,158.8,ohc,four,70,38,6575 18 | 19,dodge,hatchback,93.7,157.3,ohc,four,68,31,6377 19 | 20,dodge,hatchback,93.7,157.3,ohc,four,68,31,6229 20 | 27,honda,wagon,96.5,157.1,ohc,four,76,30,7295 21 | 28,honda,sedan,96.5,175.4,ohc,four,101,24,12945 22 | 29,honda,sedan,96.5,169.1,ohc,four,100,25,10345 23 | 30,isuzu,sedan,94.3,170.7,ohc,four,78,24,6785 24 | 31,isuzu,sedan,94.5,155.9,ohc,four,70,38, 25 | 32,isuzu,sedan,94.5,155.9,ohc,four,70,38, 26 | 33,jaguar,sedan,113,199.6,dohc,six,176,15,32250 27 | 34,jaguar,sedan,113,199.6,dohc,six,176,15,35550 28 | 35,jaguar,sedan,102,191.7,ohcv,twelve,262,13,36000 29 | 36,mazda,hatchback,93.1,159.1,ohc,four,68,30,5195 30 | 37,mazda,hatchback,93.1,159.1,ohc,four,68,31,6095 31 | 38,mazda,hatchback,93.1,159.1,ohc,four,68,31,6795 32 | 39,mazda,hatchback,95.3,169,rotor,two,101,17,11845 33 | 
43,mazda,sedan,104.9,175,ohc,four,72,31,18344 34 | 44,mercedes-benz,sedan,110,190.9,ohc,five,123,22,25552 35 | 45,mercedes-benz,wagon,110,190.9,ohc,five,123,22,28248 36 | 46,mercedes-benz,sedan,120.9,208.1,ohcv,eight,184,14,40960 37 | 47,mercedes-benz,hardtop,112,199.2,ohcv,eight,184,14,45400 38 | 49,mitsubishi,hatchback,93.7,157.3,ohc,four,68,37,5389 39 | 50,mitsubishi,hatchback,93.7,157.3,ohc,four,68,31,6189 40 | 51,mitsubishi,sedan,96.3,172.4,ohc,four,88,25,6989 41 | 52,mitsubishi,sedan,96.3,172.4,ohc,four,88,25,8189 42 | 53,nissan,sedan,94.5,165.3,ohc,four,55,45,7099 43 | 54,nissan,sedan,94.5,165.3,ohc,four,69,31,6649 44 | 55,nissan,sedan,94.5,165.3,ohc,four,69,31,6849 45 | 56,nissan,wagon,94.5,170.2,ohc,four,69,31,7349 46 | 57,nissan,sedan,100.4,184.6,ohcv,six,152,19,13499 47 | 61,porsche,hardtop,89.5,168.9,ohcf,six,207,17,34028 48 | 62,porsche,convertible,89.5,168.9,ohcf,six,207,17,37028 49 | 63,porsche,hatchback,98.4,175.7,dohcv,eight,288,17, 50 | 66,toyota,hatchback,95.7,158.7,ohc,four,62,35,5348 51 | 67,toyota,hatchback,95.7,158.7,ohc,four,62,31,6338 52 | 68,toyota,hatchback,95.7,158.7,ohc,four,62,31,6488 53 | 69,toyota,wagon,95.7,169.7,ohc,four,62,31,6918 54 | 70,toyota,wagon,95.7,169.7,ohc,four,62,27,7898 55 | 71,toyota,wagon,95.7,169.7,ohc,four,62,27,8778 56 | 79,toyota,wagon,104.5,187.8,dohc,six,156,19,15750 57 | 80,volkswagen,sedan,97.3,171.7,ohc,four,52,37,7775 58 | 81,volkswagen,sedan,97.3,171.7,ohc,four,85,27,7975 59 | 82,volkswagen,sedan,97.3,171.7,ohc,four,52,37,7995 60 | 86,volkswagen,sedan,97.3,171.7,ohc,four,100,26,9995 61 | 87,volvo,sedan,104.3,188.8,ohc,four,114,23,12940 62 | 88,volvo,wagon,104.3,188.8,ohc,four,114,23,13415 63 | -------------------------------------------------------------------------------- /001_Python_Pandas_Methods/automobile_data.csv: -------------------------------------------------------------------------------- 1 | index,company,body-style,wheel-base,length,engine-type,num-of-cylinders,horsepower,average-mileage,price 
2 | 0,alfa-romero,convertible,88.6,168.8,dohc,four,111,21,13495 3 | 1,alfa-romero,convertible,88.6,168.8,dohc,four,111,21,16500 4 | 2,alfa-romero,hatchback,94.5,171.2,ohcv,six,154,19,16500 5 | 3,audi,sedan,99.8,176.6,ohc,four,102,24,13950 6 | 4,audi,sedan,99.4,176.6,ohc,five,115,18,17450 7 | 5,audi,sedan,99.8,177.3,ohc,five,110,19,15250 8 | 6,audi,wagon,105.8,192.7,ohc,five,110,19,18920 9 | 9,bmw,sedan,101.2,176.8,ohc,four,101,23,16430 10 | 10,bmw,sedan,101.2,176.8,ohc,four,101,23,16925 11 | 11,bmw,sedan,101.2,176.8,ohc,six,121,21,20970 12 | 13,bmw,sedan,103.5,189,ohc,six,182,16,30760 13 | 14,bmw,sedan,103.5,193.8,ohc,six,182,16,41315 14 | 15,bmw,sedan,110,197,ohc,six,182,15,36880 15 | 16,chevrolet,hatchback,88.4,141.1,l,three,48,47,5151 16 | 17,chevrolet,hatchback,94.5,155.9,ohc,four,70,38,6295 17 | 18,chevrolet,sedan,94.5,158.8,ohc,four,70,38,6575 18 | 19,dodge,hatchback,93.7,157.3,ohc,four,68,31,6377 19 | 20,dodge,hatchback,93.7,157.3,ohc,four,68,31,6229 20 | 27,honda,wagon,96.5,157.1,ohc,four,76,30,7295 21 | 28,honda,sedan,96.5,175.4,ohc,four,101,24,12945 22 | 29,honda,sedan,96.5,169.1,ohc,four,100,25,10345 23 | 30,isuzu,sedan,94.3,170.7,ohc,four,78,24,6785 24 | 31,isuzu,sedan,94.5,155.9,ohc,four,70,38, 25 | 32,isuzu,sedan,94.5,155.9,ohc,four,70,38, 26 | 33,jaguar,sedan,113,199.6,dohc,six,176,15,32250 27 | 34,jaguar,sedan,113,199.6,dohc,six,176,15,35550 28 | 35,jaguar,sedan,102,191.7,ohcv,twelve,262,13,36000 29 | 36,mazda,hatchback,93.1,159.1,ohc,four,68,30,5195 30 | 37,mazda,hatchback,93.1,159.1,ohc,four,68,31,6095 31 | 38,mazda,hatchback,93.1,159.1,ohc,four,68,31,6795 32 | 39,mazda,hatchback,95.3,169,rotor,two,101,17,11845 33 | 43,mazda,sedan,104.9,175,ohc,four,72,31,18344 34 | 44,mercedes-benz,sedan,110,190.9,ohc,five,123,22,25552 35 | 45,mercedes-benz,wagon,110,190.9,ohc,five,123,22,28248 36 | 46,mercedes-benz,sedan,120.9,208.1,ohcv,eight,184,14,40960 37 | 47,mercedes-benz,hardtop,112,199.2,ohcv,eight,184,14,45400 38 | 
49,mitsubishi,hatchback,93.7,157.3,ohc,four,68,37,5389 39 | 50,mitsubishi,hatchback,93.7,157.3,ohc,four,68,31,6189 40 | 51,mitsubishi,sedan,96.3,172.4,ohc,four,88,25,6989 41 | 52,mitsubishi,sedan,96.3,172.4,ohc,four,88,25,8189 42 | 53,nissan,sedan,94.5,165.3,ohc,four,55,45,7099 43 | 54,nissan,sedan,94.5,165.3,ohc,four,69,31,6649 44 | 55,nissan,sedan,94.5,165.3,ohc,four,69,31,6849 45 | 56,nissan,wagon,94.5,170.2,ohc,four,69,31,7349 46 | 57,nissan,sedan,100.4,184.6,ohcv,six,152,19,13499 47 | 61,porsche,hardtop,89.5,168.9,ohcf,six,207,17,34028 48 | 62,porsche,convertible,89.5,168.9,ohcf,six,207,17,37028 49 | 63,porsche,hatchback,98.4,175.7,dohcv,eight,288,17, 50 | 66,toyota,hatchback,95.7,158.7,ohc,four,62,35,5348 51 | 67,toyota,hatchback,95.7,158.7,ohc,four,62,31,6338 52 | 68,toyota,hatchback,95.7,158.7,ohc,four,62,31,6488 53 | 69,toyota,wagon,95.7,169.7,ohc,four,62,31,6918 54 | 70,toyota,wagon,95.7,169.7,ohc,four,62,27,7898 55 | 71,toyota,wagon,95.7,169.7,ohc,four,62,27,8778 56 | 79,toyota,wagon,104.5,187.8,dohc,six,156,19,15750 57 | 80,volkswagen,sedan,97.3,171.7,ohc,four,52,37,7775 58 | 81,volkswagen,sedan,97.3,171.7,ohc,four,85,27,7975 59 | 82,volkswagen,sedan,97.3,171.7,ohc,four,52,37,7995 60 | 86,volkswagen,sedan,97.3,171.7,ohc,four,100,26,9995 61 | 87,volvo,sedan,104.3,188.8,ohc,four,114,23,12940 62 | 88,volvo,wagon,104.3,188.8,ohc,four,114,23,13415 63 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | Last Commit 3 | 4 | 5 | 6 | 7 | Stars Badge 8 | Forks Badge 9 | Size 10 | Pull Requests Badge 11 | Issues Badge 12 | Language 13 | MIT License 14 |

15 | 16 | 17 |

18 | binder 19 | colab 20 |

21 | 22 | # 10_Python_Pandas_Module 23 | 24 | 25 | ## Introduction 👋 26 | 27 | ### What is Pandas in Python? 28 | [![Pandas](https://img.shields.io/badge/Pandas-2C2D72?style=flat&logo=pandas&logoColor=white)](https://pandas.pydata.org/) is the most famous python library providing fast, flexible, and expressive data structures designed to make working with "relational" or "labeled" data both easy and intuitive. It aims to be the fundamental high-level building block for doing practical, **real world** data analysis in Python. Additionally, it has the broader goal of becoming **the most powerful and flexible open source data analysis / manipulation tool available in any language**. It is already well on its way towards this goal. 29 | 30 | In Pandas, the data is usually utilized to support the statistical analysis in **SciPy**, plotting functions from **Matplotlib**, and machine learning algorithms in **Scikit-learn**. 31 | 32 | 33 | ## Main Features 34 | Here are just a few of the things that pandas does well: 35 | 36 | - Easy handling of [**missing data**][missing-data] (represented as `NaN`) in floating point as well as non-floating point data 37 | - Size mutability: columns can be [**inserted and deleted**][insertion-deletion] from DataFrame and higher dimensional objects 38 | - Automatic and explicit [**data alignment**][alignment]: objects can be explicitly aligned to a set of labels, or the user can simply 39 | ignore the labels and let `Series`, `DataFrame`, etc. 
automatically align the data for you in computations 40 | - Powerful, flexible [**group by**][groupby] functionality to perform split-apply-combine operations on data sets, for both aggregating 41 | and transforming data 42 | - Make it [**easy to convert**][conversion] ragged, differently-indexed data in other Python and NumPy data structures 43 | into DataFrame objects 44 | - Intelligent label-based [**slicing**][slicing], [**fancy indexing**][fancy-indexing], and [**subsetting**][subsetting] of 45 | large data sets 46 | - Intuitive [**merging**][merging] and [**joining**][joining] datasets 47 | - Flexible [**reshaping**][reshape] and [**pivoting**][pivot-table] of datasets 48 | - [**Hierarchical**][mi] labeling of axes (possible to have multiple labels per tick) 49 | - Robust IO tools for loading data from [**flat files**][flat-files] (CSV and delimited), [**Excel files**][excel], [**databases**][db], 50 | and saving/loading data from the ultrafast [**HDF5 format**][hdfstore] 51 | - [**Time series**][timeseries]-specific functionality: date range generation and frequency conversion, moving window statistics, 52 | moving window linear regressions, date shifting and lagging, etc. 
53 | 54 | 55 | [missing-data]: https://pandas.pydata.org/pandas-docs/stable/missing_data.html#working-with-missing-data 56 | [insertion-deletion]: https://pandas.pydata.org/pandas-docs/stable/dsintro.html#column-selection-addition-deletion 57 | [alignment]: https://pandas.pydata.org/pandas-docs/stable/dsintro.html?highlight=alignment#intro-to-data-structures 58 | [groupby]: https://pandas.pydata.org/pandas-docs/stable/groupby.html#group-by-split-apply-combine 59 | [conversion]: https://pandas.pydata.org/pandas-docs/stable/dsintro.html#dataframe 60 | [slicing]: https://pandas.pydata.org/pandas-docs/stable/indexing.html#slicing-ranges 61 | [fancy-indexing]: https://pandas.pydata.org/pandas-docs/stable/indexing.html#advanced-indexing-with-ix 62 | [subsetting]: https://pandas.pydata.org/pandas-docs/stable/indexing.html#boolean-indexing 63 | [merging]: https://pandas.pydata.org/pandas-docs/stable/merging.html#database-style-dataframe-joining-merging 64 | [joining]: https://pandas.pydata.org/pandas-docs/stable/merging.html#joining-on-index 65 | [reshape]: https://pandas.pydata.org/pandas-docs/stable/reshaping.html#reshaping-and-pivot-tables 66 | [pivot-table]: https://pandas.pydata.org/pandas-docs/stable/reshaping.html#pivot-tables-and-cross-tabulations 67 | [mi]: https://pandas.pydata.org/pandas-docs/stable/indexing.html#hierarchical-indexing-multiindex 68 | [flat-files]: https://pandas.pydata.org/pandas-docs/stable/io.html#csv-text-files 69 | [excel]: https://pandas.pydata.org/pandas-docs/stable/io.html#excel-files 70 | [db]: https://pandas.pydata.org/pandas-docs/stable/io.html#sql-queries 71 | [hdfstore]: https://pandas.pydata.org/pandas-docs/stable/io.html#hdf5-pytables 72 | [timeseries]: https://pandas.pydata.org/pandas-docs/stable/timeseries.html#time-series-date-functionality 73 | 74 | 75 | ### Core Components of Pandas Data Structure 76 | Pandas have two core data structure components, and all operations are based on those two objects. 
Organizing data in a particular way is known as a data structure. Here are the two pandas data structures: 77 | 78 | * **Series** 79 | * **DataFrame** 80 | 81 | --- 82 | 83 | ## Table of contents 📋 84 | 85 | | **No.** | **Name** | 86 | | ------- | -------- | 87 | | 01 | **[Python_Pandas_DataFrame](https://github.com/milaan9/10_Python_Pandas_Module/blob/main/001_Python_Pandas_DataFrame.ipynb)** | 88 | | | 1.1 **[001_Python_Pandas_DataFrame_from_Dictionary](https://github.com/milaan9/10_Python_Pandas_Module/blob/main/001_Python_Pandas_Methods/001_Python_Pandas_DataFrame_from_Dictionary.ipynb)** | 89 | | | 1.2 **[Python_Pandas_DataFrame_from_List](https://github.com/milaan9/10_Python_Pandas_Module/blob/main/001_Python_Pandas_Methods/002_Python_Pandas_DataFrame_from_List.ipynb)** | 90 | | | 1.3 **[Python_Pandas_DataFrame_head()_and_tail()](https://github.com/milaan9/10_Python_Pandas_Module/blob/main/001_Python_Pandas_Methods/003_Python_Pandas_DataFrame_head()_and_tail().ipynb)** | 91 | | | 1.4 **[004_Python_Pandas_DataFrame_drop_columns](https://github.com/milaan9/10_Python_Pandas_Module/blob/main/001_Python_Pandas_Methods/004_Python_Pandas_DataFrame_drop_columns.ipynb)** | 92 | | | 1.5 **[Python_Pandas_DataFrame_drop_duplicates](https://github.com/milaan9/10_Python_Pandas_Module/blob/main/001_Python_Pandas_Methods/005_Python_Pandas_DataFrame_drop_duplicates.ipynb)** | 93 | | | 1.6 **[Python_Pandas_DataFrame_drop_columns_with_NA](https://github.com/milaan9/10_Python_Pandas_Module/blob/main/001_Python_Pandas_Methods/006_Python_Pandas_DataFrame_drop_columns_with_NA.ipynb)** | 94 | | | 1.7 **[Python_Pandas_DataFrame_rename_columns](https://github.com/milaan9/10_Python_Pandas_Module/blob/main/001_Python_Pandas_Methods/007_Python_Pandas_DataFrame_rename_columns.ipynb)** | 95 | | | 1.8 
**[Python_Pandas_DataFrame_to_Python_dictionary](https://github.com/milaan9/10_Python_Pandas_Module/blob/main/001_Python_Pandas_Methods/008_Python_Pandas_DataFrame_to_Python_dictionary.ipynb)** | 96 | | | 1.9 **[Python_Pandas_DataFrame_set_index](https://github.com/milaan9/10_Python_Pandas_Module/blob/main/001_Python_Pandas_Methods/009_Python_Pandas_DataFrame_set_index.ipynb)** | 97 | | | 1.10 **[Python_Pandas_DataFrame_reset_index](https://github.com/milaan9/10_Python_Pandas_Module/blob/main/001_Python_Pandas_Methods/010_Python_Pandas_DataFrame_reset_index.ipynb)** | 98 | | 02 | **[Python_Pandas_Exercise_1](https://github.com/milaan9/10_Python_Pandas_Module/blob/main/002_Python_Pandas_Exercise_1.ipynb)** | 99 | | 03 | **[Python_Pandas_Exercise_2](https://github.com/milaan9/10_Python_Pandas_Module/blob/main/003_Python_Pandas_Exercise_2.ipynb)** | 100 | | | **[automobile_data.csv](https://github.com/milaan9/10_Python_Pandas_Module/blob/main/automobile_data.csv)** | 101 | | | **[pokemon_data.csv](https://github.com/milaan9/10_Python_Pandas_Module/blob/main/pokemon_data.csv)** | 102 | | 04 | **[Pandas Cheat Sheet Data Wrangling in Python.pdf](https://github.com/milaan9/10_Python_Pandas_Module/blob/main/Pandas%20Cheat%20Sheet%20Data%20Wrangling%20in%20Python.pdf)** | 103 | | 05 | **[Pandas Cheat Sheet for Data Science in Python.pdf](https://github.com/milaan9/10_Python_Pandas_Module/blob/main/Pandas%20Cheat%20Sheet%20for%20Data%20Science%20in%20Python.pdf)** | 104 | 105 | These are online **read-only** versions. 
However, you can **`Run ▶`** all the codes **online** by clicking here ➞ binder 106 | 107 | --- 108 | 109 | ## Install Pandas Module: 110 | 111 | Open your [![Anaconda](https://img.shields.io/badge/Anaconda-342B029.svg?&style=flate&logo=anaconda&logoColor=white)](https://www.anaconda.com/products/individual) Prompt, then type and run the following command (individually): 112 | 113 | - pip install pandas 114 | 115 | 116 | Once installed, we can import it inside our Python code. 117 | 118 | --- 119 | 120 | ## Frequently asked questions ❔ 121 | 122 | ### How can I thank you for writing and sharing this tutorial? 🌷 123 | 124 | You can Star Badge and Fork Badge. Starring and Forking is free for you, but it tells me and other people that it was helpful and you like this tutorial. 125 | 126 | Go [**`here`**](https://github.com/milaan9/10_Python_Pandas_Module) if you aren't here already and click ➞ **`✰ Star`** and **`ⵖ Fork`** button in the top right corner. You'll be asked to create a GitHub account if you don't already have one. 127 | 128 | --- 129 | 130 | ### How can I read this tutorial without an Internet connection? GIF 131 | 132 | 1. Go [**`here`**](https://github.com/milaan9/10_Python_Pandas_Module) and click the big green ➞ **`Code`** button in the top right of the page, then click ➞ [**`Download ZIP`**](https://github.com/milaan9/10_Python_Pandas_Module/archive/refs/heads/main.zip). 133 | 134 | ![Download ZIP](img/dnld_rep.png) 135 | 136 | 2. Extract the ZIP and open it. Unfortunately I don't have any more specific instructions because how exactly this is done depends on which operating system you run. 137 | 138 | 3. Launch ipython notebook from the folder which contains the notebooks. Open each one of them and select 139 | 140 | **`Kernel > Restart & Clear Output`** 141 | 142 | This will clear all the outputs and now you can understand each statement and learn interactively. 
143 | 144 | If you have git and you know how to use it, you can also clone the repository instead of downloading a zip and extracting it. An advantage of doing it this way is that you don't need to download the whole tutorial again to get the latest version of it; all you need to do is pull with git and run ipython notebook again. 145 | 146 | --- 147 | 148 | ## Authors ✍️ 149 | 150 | I'm Dr. Milaan Parmar and I have written this tutorial. If you think you can add/correct/edit and enhance this tutorial you are most welcome🙏 151 | 152 | See [github's contributors page](https://github.com/milaan9/10_Python_Pandas_Module/graphs/contributors) for details. 153 | 154 | If you have trouble with this tutorial, please tell me about it by [creating an issue on GitHub](https://github.com/milaan9/10_Python_Pandas_Module/issues/new), and I'll make this tutorial better. This is probably the best choice if you had trouble following the tutorial, and something in it should be explained better. You will be asked to create a GitHub account if you don't already have one. 155 | 156 | If you like this tutorial, please [give it a ⭐ star](https://github.com/milaan9/10_Python_Pandas_Module). 157 | 158 | --- 159 | 160 | ## Licence 📜 161 | 162 | You may use this tutorial freely at your own risk. See [LICENSE](./LICENSE). 163 | 164 | -------------------------------------------------------------------------------- /001_Python_Pandas_Methods/002_Python_Pandas_DataFrame_from_List.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\n", 8 | "All the IPython Notebooks in this lecture series by Dr. 
Milan Parmar are available @ **[GitHub](https://github.com/milaan9/10_Python_Pandas_Module/tree/main/001_Python_Pandas_Methods)**\n", 9 | "" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# Create Pandas DataFrame from Python List\n", 17 | "\n", 18 | "In this class, you will learn how to convert Python **[List](https://github.com/milaan9/02_Python_Datatypes/blob/main/003_Python_List.ipynb)** to a **[Pandas DataFrame](https://github.com/milaan9/10_Python_Pandas_Module/blob/main/001_Python_Pandas_DataFrame.ipynb)**. It covers creating a DataFrame from different types of lists, such as a single list, multiple lists, and nested lists. It creates a DataFrame from a list where a list can be added as a row or a column.\n", 19 | "\n", 20 | "The List is a simple data structure in Python that stores the values as a List. The List can have heterogeneous elements, i.e., it can have values of different types. To analyze such a List, we can convert it into the pandas DataFrame. Converting the List into a 2-dimensional structure makes it efficient to process.\n", 21 | "\n", 22 | "DataFrame can be created from List using DataFrame constructor. In this class, we will discuss all the cases of it in detail." 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "## Create DataFrame from list using constructor\n", 30 | "\n", 31 | "**DataFrame constructor** can create DataFrame from different data structures in python like **`dict`**, list, set, tuple, and **`ndarray`**.\n", 32 | "\n", 33 | "**Example:**\n", 34 | "\n", 35 | "Here we create a DataFrame object using a list of heterogeneous data. By default, each list element is added as a row in the DataFrame, and the row index is a range of numbers (starting at 0)." 
36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 1, 41 | "metadata": { 42 | "ExecuteTime": { 43 | "end_time": "2021-06-17T11:49:41.778527Z", 44 | "start_time": "2021-06-17T11:49:41.209203Z" 45 | }, 46 | "scrolled": true 47 | }, 48 | "outputs": [ 49 | { 50 | "name": "stdout", 51 | "output_type": "stream", 52 | "text": [ 53 | "['Apple', 10, 'Orange', 55.5]\n", 54 | " 0\n", 55 | "0 Apple\n", 56 | "1 10\n", 57 | "2 Orange\n", 58 | "3 55.5\n" 59 | ] 60 | } 61 | ], 62 | "source": [ 63 | "import pandas as pd\n", 64 | "\n", 65 | "# Create list\n", 66 | "fruits_list = ['Apple', 10, 'Orange', 55.50]\n", 67 | "print(fruits_list)\n", 68 | "\n", 69 | "# Create DataFrame from list\n", 70 | "fruits_df = pd.DataFrame(fruits_list)\n", 71 | "print(fruits_df)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "## Create DataFrame from list with a customized column name\n", 79 | "\n", 80 | "While creating a DataFrame from the list, we can give a customized column label in the resultant DataFrame. By default, it provides a range of integers as column labels, i.e., 0, 1, 2…n.\n", 81 | "\n", 82 | "We can specify column labels into the **`columns=[col_labels]`** parameter in the DataFrame constructor.\n", 83 | "\n", 84 | "**Example:**\n", 85 | "\n", 86 | "In the below example, we create DataFrame from a list of fruit names and provides a column label as **`Fruits`**." 
87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 2, 92 | "metadata": { 93 | "ExecuteTime": { 94 | "end_time": "2021-06-17T11:49:44.302917Z", 95 | "start_time": "2021-06-17T11:49:44.289242Z" 96 | } 97 | }, 98 | "outputs": [ 99 | { 100 | "name": "stdout", 101 | "output_type": "stream", 102 | "text": [ 103 | "['Apple', 'Banana', 'Orange', 'Mango']\n", 104 | " Fruits\n", 105 | "0 Apple\n", 106 | "1 Banana\n", 107 | "2 Orange\n", 108 | "3 Mango\n" 109 | ] 110 | } 111 | ], 112 | "source": [ 113 | "import pandas as pd\n", 114 | "\n", 115 | "# Create list\n", 116 | "fruits_list = ['Apple', 'Banana', 'Orange','Mango']\n", 117 | "print(fruits_list)\n", 118 | "\n", 119 | "# Create DataFrame from list\n", 120 | "fruits_df = pd.DataFrame(fruits_list, columns=['Fruits'])\n", 121 | "print(fruits_df)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "## Create DataFrame from list with a customized index\n", 129 | "\n", 130 | "As we just discussed the changing column label, we can even customize the row index as well. We can give a meaningful row index to identify each row uniquely. It becomes easier to access the rows using the index label.\n", 131 | "\n", 132 | "We can specify row index into the **`index=[row_index1, row_index2]`** parameter in the DataFrame constructor. By default, it gives a range of integers as row index i.e. 0, 1, 2…n.\n", 133 | "\n", 134 | "**Example:**\n", 135 | "\n", 136 | "Let’s see how we can provide the custom row index while creating DataFrame from the List." 
137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 3, 142 | "metadata": { 143 | "ExecuteTime": { 144 | "end_time": "2021-06-17T11:49:48.028455Z", 145 | "start_time": "2021-06-17T11:49:48.008928Z" 146 | } 147 | }, 148 | "outputs": [ 149 | { 150 | "name": "stdout", 151 | "output_type": "stream", 152 | "text": [ 153 | "['Apple', 'Banana', 'Orange', 'Mango']\n", 154 | " 0\n", 155 | "Fruit1 Apple\n", 156 | "Fruit2 Banana\n", 157 | "Fruit3 Orange\n", 158 | "Fruit4 Mango\n" 159 | ] 160 | } 161 | ], 162 | "source": [ 163 | "import pandas as pd\n", 164 | "\n", 165 | "# Create list\n", 166 | "fruits_list = ['Apple', 'Banana', 'Orange','Mango']\n", 167 | "print(fruits_list)\n", 168 | "\n", 169 | "# Create DataFrame from list\n", 170 | "fruits_df = pd.DataFrame(fruits_list, index=['Fruit1', 'Fruit2', 'Fruit3', 'Fruit4'])\n", 171 | "print(fruits_df)" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "## Create DataFrame from list by changing data type\n", 179 | "\n", 180 | "While converting a Python List to the DataFrame, we may need to change the values’ data type.\n", 181 | "\n", 182 | "We can change the data type of the list elements using the **`dtype`** parameter of the DataFrame constructor.\n", 183 | "\n", 184 | "**Example:**\n", 185 | "\n", 186 | "Suppose we have a list of fruit’s prices of type **object**. But, while creating DataFrame we need to correct its data type to **float64**. In such case we use **`dtype`** parameter as shown below example." 
187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 4, 192 | "metadata": { 193 | "ExecuteTime": { 194 | "end_time": "2021-06-17T11:49:51.098244Z", 195 | "start_time": "2021-06-17T11:49:51.073834Z" 196 | } 197 | }, 198 | "outputs": [ 199 | { 200 | "name": "stdout", 201 | "output_type": "stream", 202 | "text": [ 203 | "['50', '100', '60', '20']\n", 204 | "Data type before : 0 object\n", 205 | "dtype: object\n", 206 | "Data type after : 0 float64\n", 207 | "dtype: object\n", 208 | " 0\n", 209 | "0 50.0\n", 210 | "1 100.0\n", 211 | "2 60.0\n", 212 | "3 20.0\n" 213 | ] 214 | } 215 | ], 216 | "source": [ 217 | "import pandas as pd\n", 218 | "\n", 219 | "# Create list\n", 220 | "price_list = ['50', '100', '60', '20']\n", 221 | "print(price_list)\n", 222 | "\n", 223 | "# Create DataFrame from list\n", 224 | "price_df = pd.DataFrame(price_list)\n", 225 | "print(\"Data type before : \", price_df.dtypes)\n", 226 | "\n", 227 | "# Create DataFrame from list with type change\n", 228 | "price_df = pd.DataFrame(price_list, dtype='float64')\n", 229 | "print(\"Data type after : \", price_df.dtypes)\n", 230 | "print(price_df)" 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "metadata": {}, 236 | "source": [ 237 | "## Create DataFrame from hierarchical lists as rows\n", 238 | "\n", 239 | "It may be possible to have data scattered into multiple lists or in the list of lists, also called a **multi-dimensional list**. In such a case, We can pass such a list to the DataFrame constructor to convert it into the DataFrame. By default, it adds each list as a row in the resultant DataFrame.\n", 240 | "\n", 241 | "**Example:**\n", 242 | "\n", 243 | "In the below example, we have a list that has lists of fruit names and their prices. DataFrame constructor will add both the lists as a separate row in the resulting DataFrame." 
244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 5, 249 | "metadata": { 250 | "ExecuteTime": { 251 | "end_time": "2021-06-17T11:49:54.917049Z", 252 | "start_time": "2021-06-17T11:49:54.904842Z" 253 | } 254 | }, 255 | "outputs": [ 256 | { 257 | "name": "stdout", 258 | "output_type": "stream", 259 | "text": [ 260 | "[['Apple', 'Banana', 'Orange', 'Mango'], [120, 40, 80, 500]]\n", 261 | " 0 1 2 3\n", 262 | "0 Apple Banana Orange Mango\n", 263 | "1 120 40 80 500\n" 264 | ] 265 | } 266 | ], 267 | "source": [ 268 | "import pandas as pd\n", 269 | "\n", 270 | "# Create list\n", 271 | "fruits_list = [['Apple', 'Banana', 'Orange', 'Mango'],[120, 40, 80, 500]]\n", 272 | "print(fruits_list)\n", 273 | "\n", 274 | "# Create DataFrame from list\n", 275 | "fruits_df = pd.DataFrame(fruits_list)\n", 276 | "print(fruits_df)" 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": {}, 282 | "source": [ 283 | "## Create DataFrame from Hierarchical lists as columns\n", 284 | "\n", 285 | "As discussed in the above section, we have a multi-dimensional list, but we do not want them to add to the DataFrame as a row. Instead, we want to add each list as a separate column in the DataFrame. For that, we need to use the **`transpose()`** function.\n", 286 | "\n", 287 | "**Example:**\n", 288 | "\n", 289 | "Here we have a list of two lists, fruit names and another for the fruits’ price. And we want to add both the list as a separate column in the DataFrame." 
290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": 6, 295 | "metadata": { 296 | "ExecuteTime": { 297 | "end_time": "2021-06-17T11:49:56.176800Z", 298 | "start_time": "2021-06-17T11:49:56.161174Z" 299 | } 300 | }, 301 | "outputs": [ 302 | { 303 | "name": "stdout", 304 | "output_type": "stream", 305 | "text": [ 306 | "[['Apple', 'Banana', 'Orange', 'Mango'], [120, 40, 80, 500]]\n", 307 | " 0 1\n", 308 | "0 Apple 120\n", 309 | "1 Banana 40\n", 310 | "2 Orange 80\n", 311 | "3 Mango 500\n" 312 | ] 313 | } 314 | ], 315 | "source": [ 316 | "import pandas as pd\n", 317 | "\n", 318 | "# Create list\n", 319 | "fruits_list = [['Apple', 'Banana', 'Orange', 'Mango'],[120, 40, 80, 500]]\n", 320 | "print(fruits_list)\n", 321 | "\n", 322 | "# Create DataFrame from list\n", 323 | "fruits_df = pd.DataFrame(fruits_list).transpose()\n", 324 | "print(fruits_df)" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [ 331 | "## Create DataFrame from multiple lists\n", 332 | "\n", 333 | "It is the most common use case in the industry where you have multiple separate lists, and you need to add them as different columns in the DataFrame. This case can be resolved by using following two ways:\n", 334 | "\n", 335 | "1. **`zip(list1, list2...)`**\n", 336 | "2. **`dict { 'col1' : list1, 'col2' : list2}`**\n", 337 | "\n", 338 | "**Example:**\n", 339 | "\n", 340 | "The below example demonstrates the use of **`zip()`** function to combine multiple lists in one list and pass it to the DataFrame constructor." 
341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 7, 346 | "metadata": { 347 | "ExecuteTime": { 348 | "end_time": "2021-06-17T11:49:59.170416Z", 349 | "start_time": "2021-06-17T11:49:59.152839Z" 350 | }, 351 | "scrolled": true 352 | }, 353 | "outputs": [ 354 | { 355 | "name": "stdout", 356 | "output_type": "stream", 357 | "text": [ 358 | " Name Price\n", 359 | "0 Apple 120\n", 360 | "1 Banana 40\n", 361 | "2 Orange 80\n", 362 | "3 Mango 500\n" 363 | ] 364 | } 365 | ], 366 | "source": [ 367 | "import pandas as pd\n", 368 | "\n", 369 | "# Create multiple lists\n", 370 | "fruits_list = ['Apple', 'Banana', 'Orange', 'Mango']\n", 371 | "price_list = [120, 40, 80, 500]\n", 372 | "\n", 373 | "# Create DataFrame\n", 374 | "fruits_df = pd.DataFrame(list(zip(fruits_list, price_list )), columns = ['Name', 'Price'])\n", 375 | "print(fruits_df)" 376 | ] 377 | }, 378 | { 379 | "cell_type": "markdown", 380 | "metadata": {}, 381 | "source": [ 382 | "The below example demonstrates the use of Python dictionary data structure to solve the purpose. Here, column names are keys of the dict and, lists are the values of dict which need to be added in the DataFrame." 
383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": 8, 388 | "metadata": { 389 | "ExecuteTime": { 390 | "end_time": "2021-06-17T11:50:00.855453Z", 391 | "start_time": "2021-06-17T11:50:00.832994Z" 392 | }, 393 | "scrolled": true 394 | }, 395 | "outputs": [ 396 | { 397 | "name": "stdout", 398 | "output_type": "stream", 399 | "text": [ 400 | "{'Name': ['Apple', 'Banana', 'Orange', 'Mango'], 'Price': [120, 40, 80, 500]}\n", 401 | " Name Price\n", 402 | "0 Apple 120\n", 403 | "1 Banana 40\n", 404 | "2 Orange 80\n", 405 | "3 Mango 500\n" 406 | ] 407 | } 408 | ], 409 | "source": [ 410 | "import pandas as pd\n", 411 | "\n", 412 | "# Create multiple lists\n", 413 | "fruits_list = ['Apple', 'Banana', 'Orange', 'Mango']\n", 414 | "price_list = [120, 40, 80, 500]\n", 415 | "\n", 416 | "# Create dict\n", 417 | "fruits_dict = {'Name': fruits_list,\n", 418 | " 'Price': price_list}\n", 419 | "print(fruits_dict)\n", 420 | "\n", 421 | "# Create DataFrame from dict\n", 422 | "fruits_df = pd.DataFrame(fruits_dict)\n", 423 | "print(fruits_df)\n" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": null, 429 | "metadata": {}, 430 | "outputs": [], 431 | "source": [] 432 | } 433 | ], 434 | "metadata": { 435 | "hide_input": false, 436 | "kernelspec": { 437 | "display_name": "Python 3", 438 | "language": "python", 439 | "name": "python3" 440 | }, 441 | "language_info": { 442 | "codemirror_mode": { 443 | "name": "ipython", 444 | "version": 3 445 | }, 446 | "file_extension": ".py", 447 | "mimetype": "text/x-python", 448 | "name": "python", 449 | "nbconvert_exporter": "python", 450 | "pygments_lexer": "ipython3", 451 | "version": "3.8.8" 452 | }, 453 | "toc": { 454 | "base_numbering": 1, 455 | "nav_menu": {}, 456 | "number_sections": true, 457 | "sideBar": true, 458 | "skip_h1_title": false, 459 | "title_cell": "Table of Contents", 460 | "title_sidebar": "Contents", 461 | "toc_cell": false, 462 | "toc_position": {}, 463 | "toc_section_display": 
true, 464 | "toc_window_display": false 465 | }, 466 | "varInspector": { 467 | "cols": { 468 | "lenName": 16, 469 | "lenType": 16, 470 | "lenVar": 40 471 | }, 472 | "kernels_config": { 473 | "python": { 474 | "delete_cmd_postfix": "", 475 | "delete_cmd_prefix": "del ", 476 | "library": "var_list.py", 477 | "varRefreshCmd": "print(var_dic_list())" 478 | }, 479 | "r": { 480 | "delete_cmd_postfix": ") ", 481 | "delete_cmd_prefix": "rm(", 482 | "library": "var_list.r", 483 | "varRefreshCmd": "cat(var_dic_list()) " 484 | } 485 | }, 486 | "types_to_exclude": [ 487 | "module", 488 | "function", 489 | "builtin_function_or_method", 490 | "instance", 491 | "_Feature" 492 | ], 493 | "window_display": false 494 | } 495 | }, 496 | "nbformat": 4, 497 | "nbformat_minor": 2 498 | } 499 | -------------------------------------------------------------------------------- /001_Python_Pandas_Methods/005_Python_Pandas_DataFrame_drop_duplicates.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\n", 8 | "All the IPython Notebooks in this lecture series by Dr. Milan Parmar are available @ **[GitHub](https://github.com/milaan9/10_Python_Pandas_Module/tree/main/001_Python_Pandas_Methods)**\n", 9 | "" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# Drop duplicates in pandas DataFrame\n", 17 | "\n", 18 | "In this class, we learn to remove duplicates from the **[Pandas DataFrame](https://github.com/milaan9/10_Python_Pandas_Module/blob/main/001_Python_Pandas_DataFrame.ipynb)**.\n", 19 | "\n", 20 | "Data is gathered from various sources. It may not be in the proper form. It contains garbage values and duplicate data. Before analyzing a dataset, it must be clean and precise." 
21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "Also, See:\n", 28 | "\n", 29 | "* **[Drop columns in pandas DataFrame](https://github.com/milaan9/10_Python_Pandas_Module/blob/main/001_Python_Pandas_Methods/004_Python_Pandas_DataFrame_drop_columns.ipynb)**\n", 30 | "* **[Drop columns with NA in pandas DataFrame](https://github.com/milaan9/10_Python_Pandas_Module/blob/main/001_Python_Pandas_Methods/006_Python_Pandas_DataFrame_drop_columns_with_NA.ipynb)**" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": { 36 | "heading_collapsed": true 37 | }, 38 | "source": [ 39 | "## The `DataFrame.drop_duplicates()` function\n", 40 | "\n", 41 | "This function is used to remove the duplicate rows from a DataFrame.\n", 42 | "\n", 43 | "**Syntax:**\n", 44 | "```python\n", 45 | "DataFrame.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)\n", 46 | "```\n", 47 | "\n", 48 | "**Parameters:**\n", 49 | "\n", 50 | "1. **`subset`**: By default, if the rows have the same values in all the columns, they are considered duplicates. This parameter is used to specify the columns that only need to be considered for identifying duplicates.\n", 51 | "2. **`keep`**: Determines which duplicates (if any) to keep. It takes inputs as,\n", 52 | " * **first** – Drop duplicates except for the first occurrence. This is the default behavior.\n", 53 | " * **last** – Drop duplicates except for the last occurrence.\n", 54 | " * **False** – Drop all duplicates.\n", 55 | "3. **`inplace`**: It is used to specify whether to return a new DataFrame or update an existing one. It is a boolean flag with default False.\n", 56 | "4. **`ignore_index`**: It is a boolean flag to indicate if row index should be reset after dropping duplicate rows. **`False`**: It keeps the original row index. 
**`True`**: It resets the index, and the resulting rows will be labeled 0, 1, …, n – 1.\n", 57 | "\n", 58 | "**Returns:**\n", 59 | "\n", 60 | "* It returns the DataFrame with duplicate rows removed or None if **`inplace=True`**" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "## Drop duplicates but keep first\n", 68 | "\n", 69 | "When we have a DataFrame with many duplicate rows that we want to remove, we use **`DataFrame.drop_duplicates()`**.\n", 70 | "\n", 71 | "Rows that contain the same values in all the columns are identified as duplicates. If a row is duplicated, then by default **`DataFrame.drop_duplicates()`** keeps the first occurrence of that row and drops all other duplicates of it.\n", 72 | "\n", 73 | "**Example:**\n", 74 | "\n", 75 | "
\n", 76 | "\n", 77 | "
" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 1, 83 | "metadata": { 84 | "ExecuteTime": { 85 | "end_time": "2021-06-17T11:53:50.748771Z", 86 | "start_time": "2021-06-17T11:53:49.901569Z" 87 | } 88 | }, 89 | "outputs": [ 90 | { 91 | "name": "stdout", 92 | "output_type": "stream", 93 | "text": [ 94 | "Before dropping duplicates: \n", 95 | " name age marks\n", 96 | "0 Joe 20 85.10\n", 97 | "1 Nat 21 77.80\n", 98 | "2 Harry 19 91.54\n", 99 | "3 Joe 20 85.10\n", 100 | "4 Nat 21 77.80\n", 101 | "\n", 102 | "After dropping column: \n", 103 | " name age marks\n", 104 | "0 Joe 20 85.10\n", 105 | "1 Nat 21 77.80\n", 106 | "2 Harry 19 91.54\n" 107 | ] 108 | } 109 | ], 110 | "source": [ 111 | "import pandas as pd\n", 112 | "\n", 113 | "student_dict = {\"name\": [\"Joe\", \"Nat\", \"Harry\", \"Joe\", \"Nat\"], \"age\": [20, 21, 19, 20, 21],\n", 114 | " \"marks\": [85.10, 77.80, 91.54, 85.10, 77.80]}\n", 115 | "\n", 116 | "# Create DataFrame from dict\n", 117 | "student_df = pd.DataFrame(student_dict)\n", 118 | "print(\"Before dropping duplicates: \\n\", student_df)\n", 119 | "\n", 120 | "# drop duplicate rows\n", 121 | "student_df = student_df.drop_duplicates()\n", 122 | "\n", 123 | "print(\"\\nAfter dropping column: \\n\", student_df)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "## Drop duplicates from defined columns\n", 131 | "\n", 132 | "By default, **`DataFrame.drop_duplicate()`** removes rows with the same values in all the columns. 
But, we can modify this behavior using the **`subset`** parameter.\n", 133 | "\n", 134 | "For example, **`subset=[col1, col2]`** will treat rows as duplicates when they have the same values in the specified columns only, i.e., **`col1`** and **`col2`**.\n", 135 | "\n", 136 | "**Example:**\n", 137 | "\n", 138 | "In the below example, the row for **`Sam`** is removed as a duplicate of the row for **`Nat`** even though their names are different, because only the **`age`** and **`marks`** columns are considered when checking for duplicates. The first occurrence (**`Nat`**) is kept.\n", 139 | "\n", 140 | "
\n", 141 | "\n", 142 | "
" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 2, 148 | "metadata": { 149 | "ExecuteTime": { 150 | "end_time": "2021-06-17T11:53:50.858101Z", 151 | "start_time": "2021-06-17T11:53:50.804877Z" 152 | } 153 | }, 154 | "outputs": [ 155 | { 156 | "name": "stdout", 157 | "output_type": "stream", 158 | "text": [ 159 | "Before dropping duplicates: \n", 160 | " name age marks\n", 161 | "0 Joe 20 85.10\n", 162 | "1 Nat 21 77.80\n", 163 | "2 Harry 19 91.54\n", 164 | "3 Sam 21 77.80\n", 165 | "\n", 166 | "After dropping column: \n", 167 | " name age marks\n", 168 | "0 Joe 20 85.10\n", 169 | "1 Nat 21 77.80\n", 170 | "2 Harry 19 91.54\n" 171 | ] 172 | } 173 | ], 174 | "source": [ 175 | "import pandas as pd\n", 176 | "\n", 177 | "student_dict = {\"name\":[\"Joe\",\"Nat\",\"Harry\",\"Sam\" ], \"age\":[20,21,19,21], \"marks\":[85.10, 77.80, 91.54, 77.80]}\n", 178 | "\n", 179 | "# Create DataFrame from dict\n", 180 | "student_df = pd.DataFrame(student_dict)\n", 181 | "print(\"Before dropping duplicates: \\n\", student_df)\n", 182 | "\n", 183 | "# drop duplicate rows\n", 184 | "student_df = student_df.drop_duplicates(subset=['age','marks'])\n", 185 | "\n", 186 | "print(\"\\nAfter dropping column: \\n\", student_df)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "## Drop duplicates but keep last\n", 194 | "\n", 195 | "Let’s consider the case where we have a row that is duplicated multiple times in the DataSet. In such a case, To keep only one occurrence of the duplicate row, we can use the **`keep`** parameter of a **`DataFrame.drop_duplicate()`**, which takes the following inputs:\n", 196 | "\n", 197 | "* **first** – Drop duplicates except for the first occurrence of the duplicate row. 
This is the default behavior.\n", 198 | "* **last** – Drop duplicates except for the last occurrence of the duplicate row.\n", 199 | "* **False** – Drop all the rows which are duplicates.\n", 200 | "\n", 201 | "**Example:**\n", 202 | "\n", 203 | "In the below example, we keep only the last occurrence of each duplicated row and drop the earlier ones using **`keep='last'`**." 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 3, 209 | "metadata": { 210 | "ExecuteTime": { 211 | "end_time": "2021-06-17T11:53:53.505528Z", 212 | "start_time": "2021-06-17T11:53:53.476234Z" 213 | } 214 | }, 215 | "outputs": [ 216 | { 217 | "name": "stdout", 218 | "output_type": "stream", 219 | "text": [ 220 | "Before dropping duplicates: \n", 221 | " name age marks\n", 222 | "0 Joe 20 85.10\n", 223 | "1 Nat 21 77.80\n", 224 | "2 Harry 19 91.54\n", 225 | "3 Nat 21 77.80\n", 226 | "\n", 227 | "After dropping column: \n", 228 | " name age marks\n", 229 | "0 Joe 20 85.10\n", 230 | "2 Harry 19 91.54\n", 231 | "3 Nat 21 77.80\n" 232 | ] 233 | } 234 | ], 235 | "source": [ 236 | "import pandas as pd\n", 237 | "\n", 238 | "student_dict = {\"name\": [\"Joe\", \"Nat\", \"Harry\", \"Nat\"], \"age\": [20, 21, 19, 21], \"marks\": [85.10, 77.80, 91.54, 77.80]}\n", 239 | "\n", 240 | "# Create DataFrame from dict\n", 241 | "student_df = pd.DataFrame(student_dict)\n", 242 | "print(\"Before dropping duplicates: \\n\", student_df)\n", 243 | "\n", 244 | "# drop duplicate rows\n", 245 | "student_df = student_df.drop_duplicates(keep='last')\n", 246 | "\n", 247 | "print(\"\\nAfter dropping column: \\n\", student_df)" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": {}, 253 | "source": [ 254 | "## Drop all duplicates\n", 255 | "\n", 256 | "As explained in the above section, by default, **`DataFrame.drop_duplicates()`** keeps the duplicate row’s first occurrence and removes all others.\n", 257 | "\n", 258 | "If we need to drop all the duplicate rows, then it can be done by using 
keep=False, as shown below.\n", 259 | "\n", 260 | "
\n", 261 | "\n", 262 | "
" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 4, 268 | "metadata": { 269 | "ExecuteTime": { 270 | "end_time": "2021-06-17T11:53:56.012334Z", 271 | "start_time": "2021-06-17T11:53:55.978156Z" 272 | } 273 | }, 274 | "outputs": [ 275 | { 276 | "name": "stdout", 277 | "output_type": "stream", 278 | "text": [ 279 | "Before dropping duplicates: \n", 280 | " name age marks\n", 281 | "0 Joe 20 85.10\n", 282 | "1 Nat 21 77.80\n", 283 | "2 Harry 19 91.54\n", 284 | "3 Nat 21 77.80\n", 285 | "\n", 286 | "After dropping column: \n", 287 | " name age marks\n", 288 | "0 Joe 20 85.10\n", 289 | "2 Harry 19 91.54\n" 290 | ] 291 | } 292 | ], 293 | "source": [ 294 | "import pandas as pd\n", 295 | "\n", 296 | "student_dict = {\"name\": [\"Joe\", \"Nat\", \"Harry\", \"Nat\"], \"age\": [20, 21, 19, 21], \"marks\": [85.10, 77.80, 91.54, 77.80]}\n", 297 | "\n", 298 | "# Create DataFrame from dict\n", 299 | "student_df = pd.DataFrame(student_dict)\n", 300 | "print(\"Before dropping duplicates: \\n\", student_df)\n", 301 | "\n", 302 | "# drop all duplicate rows\n", 303 | "student_df = student_df.drop_duplicates(keep=False)\n", 304 | "\n", 305 | "print(\"\\nAfter dropping column: \\n\", student_df)" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "## Drop duplicates in place\n", 313 | "\n", 314 | "By default, **`DataFrame.drop_duplicates()`** removes the duplicates and returns the copy of the DataFrame.\n", 315 | "\n", 316 | "But, if we want to make changes in the existing DataFrame, then set the flag **`inplace=True`**. It can be used when the drop operation is part of the function chaining." 
317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 5, 322 | "metadata": { 323 | "ExecuteTime": { 324 | "end_time": "2021-06-17T11:53:57.820911Z", 325 | "start_time": "2021-06-17T11:53:57.795519Z" 326 | } 327 | }, 328 | "outputs": [ 329 | { 330 | "name": "stdout", 331 | "output_type": "stream", 332 | "text": [ 333 | "Before dropping duplicates: \n", 334 | " name age marks\n", 335 | "0 Joe 20 85.10\n", 336 | "1 Nat 21 77.80\n", 337 | "2 Harry 19 91.54\n", 338 | "3 Joe 20 85.10\n", 339 | "4 Nat 21 77.80\n", 340 | "\n", 341 | "After dropping column: \n", 342 | " name age marks\n", 343 | "0 Joe 20 85.10\n", 344 | "1 Nat 21 77.80\n", 345 | "2 Harry 19 91.54\n" 346 | ] 347 | } 348 | ], 349 | "source": [ 350 | "import pandas as pd\n", 351 | "\n", 352 | "student_dict = {\"name\": [\"Joe\", \"Nat\", \"Harry\", \"Joe\", \"Nat\"], \"age\": [20, 21, 19, 20, 21],\n", 353 | " \"marks\": [85.10, 77.80, 91.54, 85.10, 77.80]}\n", 354 | "\n", 355 | "# Create DataFrame from dict\n", 356 | "student_df = pd.DataFrame(student_dict)\n", 357 | "print(\"Before dropping duplicates: \\n\", student_df)\n", 358 | "\n", 359 | "# drop duplicate rows\n", 360 | "student_df.drop_duplicates(inplace=True)\n", 361 | "\n", 362 | "print(\"\\nAfter dropping column: \\n\", student_df)" 363 | ] 364 | }, 365 | { 366 | "cell_type": "markdown", 367 | "metadata": {}, 368 | "source": [ 369 | "## Drop duplicates and reset the index\n", 370 | "\n", 371 | "When we drop rows from a DataFrame, by default, it keeps the original row index as is. But, if we need to reset the index of the resultant DataFrame, we can do that using the **`ignore_index`** parameter of **`DataFrame.drop_duplicates()`**.\n", 372 | "\n", 373 | "* If **`ignore_index=True`**, it resets the row labels of the resultant DataFrame to 0, 1, …, n – 1.\n", 374 | "* If **`ignore_index=False`**, it does not change the original row index. By default, it is **False**." 
375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": 6, 380 | "metadata": { 381 | "ExecuteTime": { 382 | "end_time": "2021-06-17T11:53:59.475189Z", 383 | "start_time": "2021-06-17T11:53:59.452728Z" 384 | } 385 | }, 386 | "outputs": [ 387 | { 388 | "name": "stdout", 389 | "output_type": "stream", 390 | "text": [ 391 | "Before dropping duplicates: \n", 392 | " name age marks\n", 393 | "a Joe 20 85.10\n", 394 | "b Nat 21 77.80\n", 395 | "c Harry 19 91.54\n", 396 | "d Nat 21 77.80\n", 397 | "\n", 398 | "After dropping column: \n", 399 | " name age marks\n", 400 | "0 Joe 20 85.10\n", 401 | "1 Harry 19 91.54\n" 402 | ] 403 | } 404 | ], 405 | "source": [ 406 | "import pandas as pd\n", 407 | "\n", 408 | "student_dict = {\"name\": [\"Joe\", \"Nat\", \"Harry\", \"Nat\"], \"age\": [20, 21, 19, 21], \"marks\": [85.10, 77.80, 91.54, 77.80]}\n", 409 | "\n", 410 | "# Create DataFrame from dict\n", 411 | "student_df = pd.DataFrame(student_dict, index=['a', 'b', 'c', 'd'])\n", 412 | "print(\"Before dropping duplicates: \\n\", student_df)\n", 413 | "\n", 414 | "# drop duplicate rows\n", 415 | "student_df = student_df.drop_duplicates(keep=False, ignore_index=True)\n", 416 | "\n", 417 | "print(\"\\nAfter dropping column: \\n\", student_df)" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": null, 423 | "metadata": {}, 424 | "outputs": [], 425 | "source": [] 426 | } 427 | ], 428 | "metadata": { 429 | "hide_input": false, 430 | "kernelspec": { 431 | "display_name": "Python 3", 432 | "language": "python", 433 | "name": "python3" 434 | }, 435 | "language_info": { 436 | "codemirror_mode": { 437 | "name": "ipython", 438 | "version": 3 439 | }, 440 | "file_extension": ".py", 441 | "mimetype": "text/x-python", 442 | "name": "python", 443 | "nbconvert_exporter": "python", 444 | "pygments_lexer": "ipython3", 445 | "version": "3.8.8" 446 | }, 447 | "toc": { 448 | "base_numbering": 1, 449 | "nav_menu": {}, 450 | "number_sections": true, 451 | 
"sideBar": true, 452 | "skip_h1_title": false, 453 | "title_cell": "Table of Contents", 454 | "title_sidebar": "Contents", 455 | "toc_cell": false, 456 | "toc_position": {}, 457 | "toc_section_display": true, 458 | "toc_window_display": false 459 | }, 460 | "varInspector": { 461 | "cols": { 462 | "lenName": 16, 463 | "lenType": 16, 464 | "lenVar": 40 465 | }, 466 | "kernels_config": { 467 | "python": { 468 | "delete_cmd_postfix": "", 469 | "delete_cmd_prefix": "del ", 470 | "library": "var_list.py", 471 | "varRefreshCmd": "print(var_dic_list())" 472 | }, 473 | "r": { 474 | "delete_cmd_postfix": ") ", 475 | "delete_cmd_prefix": "rm(", 476 | "library": "var_list.r", 477 | "varRefreshCmd": "cat(var_dic_list()) " 478 | } 479 | }, 480 | "types_to_exclude": [ 481 | "module", 482 | "function", 483 | "builtin_function_or_method", 484 | "instance", 485 | "_Feature" 486 | ], 487 | "window_display": false 488 | } 489 | }, 490 | "nbformat": 4, 491 | "nbformat_minor": 2 492 | } 493 | -------------------------------------------------------------------------------- /001_Python_Pandas_Methods/006_Python_Pandas_DataFrame_drop_columns_with_NA.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\n", 8 | "All the IPython Notebooks in this lecture series by Dr. 
Milan Parmar are available @ **[GitHub](https://github.com/milaan9/10_Python_Pandas_Module/tree/main/001_Python_Pandas_Methods)**\n", 9 | "" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# Drop columns with NA in pandas DataFrame\n", 17 | "\n", 18 | "This article covers all the cases to remove columns from **[Pandas DataFrame](https://github.com/milaan9/10_Python_Pandas_Module/blob/main/001_Python_Pandas_DataFrame.ipynb)** that contains missing or NA values.\n", 19 | "\n", 20 | "For multiple reasons, it could happen that data in the Dataset is missing or not available. It is a very usual case where we need to clean the data before start analyzing it." 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "Also, See:\n", 28 | "\n", 29 | "* **[Drop columns in pandas DataFrame](https://github.com/milaan9/10_Python_Pandas_Module/blob/main/001_Python_Pandas_Methods/004_Python_Pandas_DataFrame_drop_columns.ipynb)**\n", 30 | "* **[Drop duplicates in pandas DataFrame](https://github.com/milaan9/10_Python_Pandas_Module/blob/main/001_Python_Pandas_Methods/005_Python_Pandas_DataFrame_drop_duplicates.ipynb)**" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": { 36 | "heading_collapsed": true 37 | }, 38 | "source": [ 39 | "## The `DataFrame.dropna()` function\n", 40 | "\n", 41 | "We can use this pandas function to remove columns from the DataFrame with values Not Available(NA).\n", 42 | "\n", 43 | "**Syntax:**\n", 44 | "```python\n", 45 | "DataFrame.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)\n", 46 | "```\n", 47 | "\n", 48 | "**Parameters:**\n", 49 | "\n", 50 | "1. **`axis`**: It determines the axis to remove. Set it to 1 or **`column`** to remove columns containing missing values. By default, it removes rows with NA from DataFrame.\n", 51 | "2. 
**`how`**: It takes the following inputs:\n", 52 | " * **`'any'`**: This is the default case to drop the column if it has at least one value missing.\n", 53 | " * **`'all'`**: Drop the column only if it has all the values as NA.\n", 54 | "3. **`thresh`**: It applies a condition to drop the columns only if it does not contain the required number of values. It takes an int as input.\n", 55 | "4. **`subset`**: While dropping columns, it is used to specify the list of rows to be considered to find NA.\n", 56 | "5. **`inplace`**: It is used to specify whether to return a new DataFrame or update an existing one. It is a boolean flag with default **`False`**.\n", 57 | "\n", 58 | "**Returns:**\n", 59 | "\n", 60 | "* It returns the DataFrame with dropped NA or None if **`inplace=True`**" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "## Drop column where at least one value is missing\n", 68 | "\n", 69 | "There is a case when we cannot process the dataset with missing values. If we need to drop such columns that contain NA, we can use the **`axis='columns'`** parameter of **`DataFrame.dropna()`** to specify deleting the columns.\n", 70 | "\n", 71 | "With the default **`how='any'`**, it removes every column where one or more values are missing.\n", 72 | "\n", 73 | "**Example:**\n", 74 | "\n", 75 | "In the below example, it drops column **`marks`** because it contains NaN.\n", 76 | "\n", 77 | "
\n", 78 | "\n", 79 | "
" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 1, 85 | "metadata": { 86 | "ExecuteTime": { 87 | "end_time": "2021-06-17T11:55:40.360245Z", 88 | "start_time": "2021-06-17T11:55:39.765526Z" 89 | } 90 | }, 91 | "outputs": [ 92 | { 93 | "name": "stdout", 94 | "output_type": "stream", 95 | "text": [ 96 | "Before dropping column NA: \n", 97 | " name age marks\n", 98 | "0 Joe 20 85.10\n", 99 | "1 Sam 21 NaN\n", 100 | "2 Harry 19 91.54\n", 101 | "\n", 102 | "After dropping column NA: \n", 103 | " name age\n", 104 | "0 Joe 20\n", 105 | "1 Sam 21\n", 106 | "2 Harry 19\n" 107 | ] 108 | } 109 | ], 110 | "source": [ 111 | "import pandas as pd\n", 112 | "import numpy as np\n", 113 | "\n", 114 | "student_dict = {\"name\": [\"Joe\", \"Sam\", \"Harry\"], \"age\": [20, 21, 19], \"marks\": [85.10, np.nan, 91.54]}\n", 115 | "\n", 116 | "# Create DataFrame from dict\n", 117 | "student_df = pd.DataFrame(student_dict)\n", 118 | "print(\"Before dropping column NA: \\n\", student_df)\n", 119 | "\n", 120 | "# drop column with NaN\n", 121 | "student_df = student_df.dropna(axis='columns')\n", 122 | "\n", 123 | "print(\"\\nAfter dropping column NA: \\n\", student_df)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "## Drop column where all values are missing\n", 131 | "\n", 132 | "We can drop an empty column from DataFrame using **`DataFrame.dropna()`**.\n", 133 | "\n", 134 | "We need to use how parameter as follows:\n", 135 | "\n", 136 | "* If **`how='all'`**, it drops the column where all the values are NA.\n", 137 | "* By default, **`how='any'`**, it removes the columns where one or more values are NA.\n", 138 | "\n", 139 | "**Example**\n", 140 | "\n", 141 | "The below example shows that it only drops the **`age`** column where all values are NaN. Other columns are not dropped even if it contains NaN.." 
142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 2, 147 | "metadata": { 148 | "ExecuteTime": { 149 | "end_time": "2021-06-17T11:55:45.641921Z", 150 | "start_time": "2021-06-17T11:55:45.620436Z" 151 | } 152 | }, 153 | "outputs": [ 154 | { 155 | "name": "stdout", 156 | "output_type": "stream", 157 | "text": [ 158 | "Before dropping column NA: \n", 159 | " name age marks\n", 160 | "0 Joe NaN 85.10\n", 161 | "1 Sam NaN NaN\n", 162 | "2 NaN NaN NaN\n", 163 | "3 Harry NaN 91.54\n", 164 | "\n", 165 | "After dropping column NA: \n", 166 | " name marks\n", 167 | "0 Joe 85.10\n", 168 | "1 Sam NaN\n", 169 | "2 NaN NaN\n", 170 | "3 Harry 91.54\n" 171 | ] 172 | } 173 | ], 174 | "source": [ 175 | "import pandas as pd\n", 176 | "import numpy as np\n", 177 | "\n", 178 | "student_dict = {\"name\": [\"Joe\", \"Sam\", np.nan, \"Harry\"], \"age\": [np.nan, np.nan, np.nan, np.nan],\n", 179 | " \"marks\": [85.10, np.nan, np.nan, 91.54]}\n", 180 | "\n", 181 | "# Create DataFrame from dict\n", 182 | "student_df = pd.DataFrame(student_dict)\n", 183 | "print(\"Before dropping column NA: \\n\", student_df)\n", 184 | "\n", 185 | "# drop column with NaN\n", 186 | "student_df = student_df.dropna(axis='columns', how='all')\n", 187 | "\n", 188 | "print(\"\\nAfter dropping column NA: \\n\", student_df)" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "## Drop column with the number of NA\n", 196 | "\n", 197 | "While cleaning the dataset, we can keep the columns that have at least a minimum number of values available and drop the rest.\n", 198 | "\n", 199 | "We need to use the parameter **`thresh=no_of_nonNA_values`** of **`DataFrame.dropna()`** to specify the number of values that must be available in the column. 
Otherwise, the column is dropped.\n", 200 | "\n", 201 | "**Example**\n", 202 | "\n", 203 | "In the below example, we keep the column where three or more values are available and drop the column if the condition is not met.\n", 204 | "\n", 205 | "
\n", 206 | "\n", 207 | "
" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 3, 213 | "metadata": { 214 | "ExecuteTime": { 215 | "end_time": "2021-06-17T11:55:48.275676Z", 216 | "start_time": "2021-06-17T11:55:48.244431Z" 217 | } 218 | }, 219 | "outputs": [ 220 | { 221 | "name": "stdout", 222 | "output_type": "stream", 223 | "text": [ 224 | "Before dropping column NA: \n", 225 | " name age marks\n", 226 | "0 Joe NaN 85.10\n", 227 | "1 Sam NaN NaN\n", 228 | "2 NaN NaN NaN\n", 229 | "3 Harry NaN 91.54\n", 230 | "\n", 231 | "After dropping column NA: \n", 232 | " name\n", 233 | "0 Joe\n", 234 | "1 Sam\n", 235 | "2 NaN\n", 236 | "3 Harry\n" 237 | ] 238 | } 239 | ], 240 | "source": [ 241 | "import pandas as pd\n", 242 | "import numpy as np\n", 243 | "\n", 244 | "student_dict = {\"name\": [\"Joe\", \"Sam\", np.nan, \"Harry\"], \"age\": [np.nan, np.nan, np.nan, np.nan],\n", 245 | " \"marks\": [85.10, np.nan, np.nan, 91.54]}\n", 246 | "\n", 247 | "# Create DataFrame from dict\n", 248 | "student_df = pd.DataFrame(student_dict)\n", 249 | "print(\"Before dropping column NA: \\n\", student_df)\n", 250 | "\n", 251 | "# keep column with 3 or more non-NA values\n", 252 | "student_df = student_df.dropna(axis='columns', thresh=3)\n", 253 | "\n", 254 | "print(\"\\nAfter dropping column NA: \\n\", student_df)" 255 | ] 256 | }, 257 | { 258 | "cell_type": "markdown", 259 | "metadata": {}, 260 | "source": [ 261 | "## Drop NA from defined rows\n", 262 | "\n", 263 | "Suppose we are interested in dropping the column only if it contains null values in some particular rows. 
For example, consider when we need to drop a column if it does not have data in its initial rows.\n", 264 | "\n", 265 | "In such a case, we can use **`subset=[row1, row2]`** of **`DataFrame.dropna()`** to specify the list of row indexes so that it drops the columns containing missing values in these rows only, i.e., row1 and row2 in this case.\n", 266 | "\n", 267 | "**Example:**\n", 268 | "\n", 269 | "Let’s see how to delete a column only if it contains the empty value in row 0 or 2, otherwise do not delete the column.\n", 270 | "\n", 271 | "
\n", 272 | "\n", 273 | "
" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 4, 279 | "metadata": { 280 | "ExecuteTime": { 281 | "end_time": "2021-06-17T11:55:52.698476Z", 282 | "start_time": "2021-06-17T11:55:52.652093Z" 283 | } 284 | }, 285 | "outputs": [ 286 | { 287 | "name": "stdout", 288 | "output_type": "stream", 289 | "text": [ 290 | "Before dropping column NA: \n", 291 | " name age marks\n", 292 | "0 Joe NaN 85.10\n", 293 | "1 Sam NaN NaN\n", 294 | "2 Harry NaN 91.54\n", 295 | "\n", 296 | "After dropping column NA: \n", 297 | " name marks\n", 298 | "0 Joe 85.10\n", 299 | "1 Sam NaN\n", 300 | "2 Harry 91.54\n" 301 | ] 302 | } 303 | ], 304 | "source": [ 305 | "import pandas as pd\n", 306 | "import numpy as np\n", 307 | "\n", 308 | "student_dict = {\"name\": [\"Joe\", \"Sam\", \"Harry\"], \"age\": [np.nan, np.nan, np.nan], \"marks\": [85.10, np.nan, 91.54]}\n", 309 | "\n", 310 | "# Create DataFrame from dict\n", 311 | "student_df = pd.DataFrame(student_dict)\n", 312 | "print(\"Before dropping column NA: \\n\", student_df)\n", 313 | "\n", 314 | "# drop columns with NaN in row 0 or row 2 (age is dropped; marks is kept because its NaN is in row 1)\n", 315 | "student_df = student_df.dropna(axis='columns', subset=[0, 2])\n", 316 | "\n", 317 | "print(\"\\nAfter dropping column NA: \\n\", student_df)" 318 | ] 319 | }, 320 | { 321 | "cell_type": "markdown", 322 | "metadata": {}, 323 | "source": [ 324 | "## Drop column with missing values in place\n", 325 | "\n", 326 | "We can drop columns from the existing DataFrame or by creating a copy of it. For that, we can use a flag **`inplace`** of **`DataFrame.dropna()`**.\n", 327 | "\n", 328 | "* If **`inplace=True`**, then it updates the DataFrame and returns None.\n", 329 | "* If **`inplace=False`**, it returns the updated copy of the DataFrame.\n", 330 | "\n", 331 | "**Example:**\n", 332 | "\n", 333 | "As shown in the below example, we are dropping NA from the existing DataFrame without reassigning it to a new variable. Note that the example uses the default **`axis=0`**, so it removes the row containing NaN; pass **`axis='columns'`** as well to drop columns in place." 
334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 5, 339 | "metadata": { 340 | "ExecuteTime": { 341 | "end_time": "2021-06-17T11:55:55.612016Z", 342 | "start_time": "2021-06-17T11:55:55.589558Z" 343 | } 344 | }, 345 | "outputs": [ 346 | { 347 | "name": "stdout", 348 | "output_type": "stream", 349 | "text": [ 350 | "Before dropping row NA: \n", 351 | " name age marks\n", 352 | "0 Joe 20 85.10\n", 353 | "1 Sam 21 NaN\n", 354 | "2 Harry 19 91.54\n", 355 | "\n", 356 | "After dropping row NA: \n", 357 | " name age marks\n", 358 | "0 Joe 20 85.10\n", 359 | "2 Harry 19 91.54\n" 360 | ] 361 | } 362 | ], 363 | "source": [ 364 | "import pandas as pd\n", 365 | "import numpy as np\n", 366 | "\n", 367 | "student_dict = {\"name\": [\"Joe\", \"Sam\", \"Harry\"], \"age\": [20, 21, 19], \"marks\": [85.10, np.nan, 91.54]}\n", 368 | "\n", 369 | "# Create DataFrame from dict\n", 370 | "student_df = pd.DataFrame(student_dict)\n", 371 | "print(\"Before dropping row NA: \\n\", student_df)\n", 372 | "\n", 373 | "# drop marks row with NaN\n", 374 | "student_df.dropna(inplace=True)\n", 375 | "\n", 376 | "print(\"\\nAfter dropping row NA: \\n\", student_df)" 377 | ] 378 | }, 379 | { 380 | "cell_type": "markdown", 381 | "metadata": {}, 382 | "source": [ 383 | "## Drop duplicates and reset the index\n", 384 | "\n", 385 | "When we drop the rows from DataFrame, by default, it keeps the original row index as is. But, if we need to reset the index of the resultant DataFrame, we can do that using the **`ignore_index`** parameter of **`DataFrame.drop_duplicates()`**.\n", 386 | "\n", 387 | "* If **`ignore_index=True`**, it resets the row labels of resultant DataFrame to 0, 1, …, n – 1.\n", 388 | "* If **`ignore_index=False`** it does not change the original row index. By default, it is **False**." 
389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": 6, 394 | "metadata": { 395 | "ExecuteTime": { 396 | "end_time": "2021-06-17T11:55:57.125673Z", 397 | "start_time": "2021-06-17T11:55:57.095400Z" 398 | } 399 | }, 400 | "outputs": [ 401 | { 402 | "name": "stdout", 403 | "output_type": "stream", 404 | "text": [ 405 | "Before dropping duplicates: \n", 406 | " name age marks\n", 407 | "a Joe 20 85.10\n", 408 | "b Nat 21 77.80\n", 409 | "c Harry 19 91.54\n", 410 | "d Nat 21 77.80\n", 411 | "\n", 412 | "After dropping column: \n", 413 | " name age marks\n", 414 | "0 Joe 20 85.10\n", 415 | "1 Harry 19 91.54\n" 416 | ] 417 | } 418 | ], 419 | "source": [ 420 | "import pandas as pd\n", 421 | "\n", 422 | "student_dict = {\"name\": [\"Joe\", \"Nat\", \"Harry\", \"Nat\"], \"age\": [20, 21, 19, 21], \"marks\": [85.10, 77.80, 91.54, 77.80]}\n", 423 | "\n", 424 | "# Create DataFrame from dict\n", 425 | "student_df = pd.DataFrame(student_dict, index=['a', 'b', 'c', 'd'])\n", 426 | "print(\"Before dropping duplicates: \\n\", student_df)\n", 427 | "\n", 428 | "# drop duplicate rows\n", 429 | "student_df = student_df.drop_duplicates(keep=False, ignore_index=True)\n", 430 | "\n", 431 | "print(\"\\nAfter dropping column: \\n\", student_df)" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": null, 437 | "metadata": {}, 438 | "outputs": [], 439 | "source": [] 440 | } 441 | ], 442 | "metadata": { 443 | "hide_input": false, 444 | "kernelspec": { 445 | "display_name": "Python 3", 446 | "language": "python", 447 | "name": "python3" 448 | }, 449 | "language_info": { 450 | "codemirror_mode": { 451 | "name": "ipython", 452 | "version": 3 453 | }, 454 | "file_extension": ".py", 455 | "mimetype": "text/x-python", 456 | "name": "python", 457 | "nbconvert_exporter": "python", 458 | "pygments_lexer": "ipython3", 459 | "version": "3.8.8" 460 | }, 461 | "toc": { 462 | "base_numbering": 1, 463 | "nav_menu": {}, 464 | "number_sections": true, 465 | 
"sideBar": true, 466 | "skip_h1_title": false, 467 | "title_cell": "Table of Contents", 468 | "title_sidebar": "Contents", 469 | "toc_cell": false, 470 | "toc_position": {}, 471 | "toc_section_display": true, 472 | "toc_window_display": false 473 | }, 474 | "varInspector": { 475 | "cols": { 476 | "lenName": 16, 477 | "lenType": 16, 478 | "lenVar": 40 479 | }, 480 | "kernels_config": { 481 | "python": { 482 | "delete_cmd_postfix": "", 483 | "delete_cmd_prefix": "del ", 484 | "library": "var_list.py", 485 | "varRefreshCmd": "print(var_dic_list())" 486 | }, 487 | "r": { 488 | "delete_cmd_postfix": ") ", 489 | "delete_cmd_prefix": "rm(", 490 | "library": "var_list.r", 491 | "varRefreshCmd": "cat(var_dic_list()) " 492 | } 493 | }, 494 | "types_to_exclude": [ 495 | "module", 496 | "function", 497 | "builtin_function_or_method", 498 | "instance", 499 | "_Feature" 500 | ], 501 | "window_display": false 502 | } 503 | }, 504 | "nbformat": 4, 505 | "nbformat_minor": 2 506 | } 507 | -------------------------------------------------------------------------------- /001_Python_Pandas_Methods/008_Python_Pandas_DataFrame_to_Python_dictionary.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\n", 8 | "All the IPython Notebooks in this lecture series by Dr. Milan Parmar are available @ **[GitHub](https://github.com/milaan9/10_Python_Pandas_Module/tree/main/001_Python_Pandas_Methods)**\n", 9 | "" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# Convert Pandas DataFrame to Python dictionary\n", 17 | "\n", 18 | "In this class, you will learn how to convert **[Pandas DataFrame](https://github.com/milaan9/10_Python_Pandas_Module/blob/main/001_Python_Pandas_DataFrame.ipynb)** into a Python dictionary. 
It explains creating different kinds of dictionaries from pandas DataFrame.\n", 19 | "\n", 20 | "Data Analyst needs to collect the data from heterogeneous sources like CSV files or SQL tables or Python data structures like a dictionary, list, etc. Such data is converted into pandas DataFrame.\n", 21 | "\n", 22 | "After analyzing the data, we need to convert the resultant DataFrame back to its original format like CSV files or a dictionary. Or sometimes, we need to convert it into some other form." 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "## The `DataFrame.to_dict()` function\n", 30 | "\n", 31 | "Pandas has a **`DataFrame.to_dict()`** function to create a Python dict object from DataFrame.\n", 32 | "\n", 33 | "**Syntax:**\n", 34 | "```python\n", 35 | "DataFrame.to_dict(orient='dict', into=<class 'dict'>)\n", 36 | "```\n", 37 | "\n", 38 | "**Parameters:**\n", 39 | "\n", 40 | "1. **`into`**: It is used to define the type of resultant **`dict`**. We can give an actual class or an empty instance.\n", 41 | "2. **`orient`**: It defines the structure of key-value pairs in the resultant **`dict`**. The below table shows the input parameter, the format in which it creates the **`dict`** and key-value of the resultant **`dict`**.\n", 42 | "\n", 43 | ">**Note:** Abbreviations are allowed in older pandas releases but are deprecated (pandas emits a `FutureWarning` for them), so prefer the full names. 
**`s`** indicates series, **`sp`** indicates split, **`r`** indicates records, and so on.\n", 44 | "\n", 45 | "| Parameter | Dict format | Key | Value |\n", 46 | "|:---- |:---- |:---- | :---- |\n", 47 | "| **`'dict'` (Default)** | **`{column_label : {row_index : data}}`** | **column label** | **dict of row index and data** |\n", 48 | "| **`'list'`** | **`{column_label : [data]}`** | **column label** | **list of data** |\n", 49 | "| **`'series'`** | **`{column_label : Series(data)}`** | **column label** | **series of data** |\n", 50 | "| **`'split'`** | **`{'index' : [index], 'columns' : [columns], 'data' : [data]}`** | **index, columns, data** | **list of row index, list of column labels, list of data** |\n", 51 | "| **`'records'`** | **`[{column_label : data}, … , {column_label : data}]`** | **column label** | **data** |\n", 52 | "| **`'index'`** | **`{row_index : {column_label : data}}`** | **row index** | **dict of column label and data** |" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "
\n", 60 | "\n", 61 | "
" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "### Example to convert pandas DataFrame to dict\n", 69 | "\n", 70 | "In the below example, we read the input from the **[student_data.csv](https://github.com/milaan9/10_Python_Pandas_Module/blob/main/001_Python_Pandas_Methods/student_data.csv)** file and create a DataFrame object. It is then converted into the Python dictionary object.\n", 71 | "\n", 72 | "Input CSV file contains a simple dataset of student data with two columns, **`Name`** and **`Marks`**.\n", 73 | "\n", 74 | "
\n", 75 | "\n", 76 | "
\n", 77 | "\n", 78 | "DataFrame is converted into **`dict`** using the default **`'dict'`** parameter." 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 1, 84 | "metadata": { 85 | "ExecuteTime": { 86 | "end_time": "2021-06-17T11:59:02.096128Z", 87 | "start_time": "2021-06-17T11:59:01.500924Z" 88 | } 89 | }, 90 | "outputs": [ 91 | { 92 | "name": "stdout", 93 | "output_type": "stream", 94 | "text": [ 95 | " Name Marks\n", 96 | "0 Nat 70.88\n", 97 | "1 Harry 85.90\n", 98 | "2 Joe 91.45\n", 99 | "\n", 100 | "Result dict: \n", 101 | " {'Name': {0: 'Nat', 1: 'Harry', 2: 'Joe'}, 'Marks': {0: 70.88, 1: 85.9, 2: 91.45}}\n" 102 | ] 103 | } 104 | ], 105 | "source": [ 106 | "import pandas as pd\n", 107 | "\n", 108 | "# create dataframe from csv\n", 109 | "studentDf = pd.read_csv(\"student_data.csv\")\n", 110 | "print(studentDf)\n", 111 | "\n", 112 | "# create dict from dataframe\n", 113 | "studentDict = studentDf.to_dict()\n", 114 | "print(\"\\nResult dict: \\n\", studentDict)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "## DataFrame to dict with a list of values\n", 122 | "\n", 123 | "Consider the case where a DataFrame needs to be converted into a dictionary object such that the column labels are the keys in the dictionary, and each column’s data is added into the resultant dict as a list of values against its key.\n", 124 | "\n", 125 | "In that case, we can use the **`'list'`** parameter of the **`DataFrame.to_dict()`** function.\n", 126 | "\n", 127 | "**Syntax:**\n", 128 | "```python\n", 129 | "{column_label : [data]}\n", 130 | "```\n", 131 | "\n", 132 | "**Example:**\n", 133 | "\n", 134 | "Let’s see how we can use the **`'list'`** parameter to create a **`dict`** with a list of values." 
135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 2, 140 | "metadata": { 141 | "ExecuteTime": { 142 | "end_time": "2021-06-17T11:59:12.969535Z", 143 | "start_time": "2021-06-17T11:59:12.951960Z" 144 | } 145 | }, 146 | "outputs": [ 147 | { 148 | "name": "stdout", 149 | "output_type": "stream", 150 | "text": [ 151 | " Name Marks\n", 152 | "0 Nat 70.88\n", 153 | "1 Harry 85.90\n", 154 | "2 Joe 91.45\n", 155 | "\n", 156 | "Result dict: \n", 157 | " {'Name': ['Nat', 'Harry', 'Joe'], 'Marks': [70.88, 85.9, 91.45]}\n" 158 | ] 159 | } 160 | ], 161 | "source": [ 162 | "import pandas as pd\n", 163 | "\n", 164 | "# create dataframe from csv\n", 165 | "studentDf = pd.read_csv(\"student_data.csv\")\n", 166 | "print(studentDf)\n", 167 | "\n", 168 | "# create dict from dataframe\n", 169 | "studentDict = studentDf.to_dict('list')\n", 170 | "print(\"\\nResult dict: \\n\", studentDict)" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "## DataFrame to dict with pandas series of values\n", 178 | "\n", 179 | "When we need to convert the DataFrame into **`dict`** whereas column name as a key of the **`dict`**. And row index and data as a value in the **`dict`** for the respective keys.\n", 180 | "\n", 181 | "**Syntax:**\n", 182 | "```python\n", 183 | "{column_label : Series(row_index data)}\n", 184 | "```\n", 185 | "\n", 186 | "In that case, we can use the **`'series'`** parameter of **`DataFrame.to_dict()`** function.\n", 187 | "\n", 188 | "**Example:**\n", 189 | "\n", 190 | "In the below example, **`dict`** is created with two entries, one for **`Name`** column and the other for the **`Marks`** column of the DataFrame." 
191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 3, 196 | "metadata": { 197 | "ExecuteTime": { 198 | "end_time": "2021-06-17T11:59:21.696971Z", 199 | "start_time": "2021-06-17T11:59:21.668657Z" 200 | } 201 | }, 202 | "outputs": [ 203 | { 204 | "name": "stdout", 205 | "output_type": "stream", 206 | "text": [ 207 | " Name Marks\n", 208 | "0 Nat 70.88\n", 209 | "1 Harry 85.90\n", 210 | "2 Joe 91.45\n", 211 | "\n", 212 | "Result dict: \n", 213 | " {'Name': 0 Nat\n", 214 | "1 Harry\n", 215 | "2 Joe\n", 216 | "Name: Name, dtype: object, 'Marks': 0 70.88\n", 217 | "1 85.90\n", 218 | "2 91.45\n", 219 | "Name: Marks, dtype: float64}\n" 220 | ] 221 | } 222 | ], 223 | "source": [ 224 | "import pandas as pd\n", 225 | "\n", 226 | "# create dataframe from csv\n", 227 | "studentDf = pd.read_csv(\"student_data.csv\")\n", 228 | "print(studentDf)\n", 229 | "\n", 230 | "# create dict from dataframe\n", 231 | "studentDict = studentDf.to_dict('series')\n", 232 | "print(\"\\nResult dict: \\n\", studentDict)" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "## DataFrame to dict without header and index\n", 240 | "\n", 241 | "When we want to collect the data from DataFrame without the column headers or we need to separate the row index and header from the data, we can use the **`'split'`** parameter of **`DataFrame.to_dict()`** function. It splits the input DataFrame into three parts, i.e., row index, column labels, and actual data.\n", 242 | "\n", 243 | "**Syntax:**\n", 244 | "```python\n", 245 | "{'index' : [index], 'columns' : [columns], 'data' : [data]}\n", 246 | "```\n", 247 | "\n", 248 | "**Example:**\n", 249 | "\n", 250 | "We can get the data without index or header from the resultant **`dict`** using the key **`'data'`** as shown below." 
251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 4, 256 | "metadata": { 257 | "ExecuteTime": { 258 | "end_time": "2021-06-17T11:59:31.353108Z", 259 | "start_time": "2021-06-17T11:59:31.327723Z" 260 | } 261 | }, 262 | "outputs": [ 263 | { 264 | "name": "stdout", 265 | "output_type": "stream", 266 | "text": [ 267 | " Name Marks\n", 268 | "0 Nat 70.88\n", 269 | "1 Harry 85.90\n", 270 | "2 Joe 91.45\n", 271 | "\n", 272 | " {'index': [0, 1, 2], 'columns': ['Name', 'Marks'], 'data': [['Nat', 70.88], ['Harry', 85.9], ['Joe', 91.45]]}\n", 273 | "\n", 274 | "List of values from DF without index and header: \n", 275 | " [['Nat', 70.88], ['Harry', 85.9], ['Joe', 91.45]]\n" 276 | ] 277 | } 278 | ], 279 | "source": [ 280 | "import pandas as pd\n", 281 | "\n", 282 | "# create dataframe from csv\n", 283 | "studentDf = pd.read_csv(\"student_data.csv\")\n", 284 | "print(studentDf)\n", 285 | "\n", 286 | "studentDict = studentDf.to_dict('split')\n", 287 | "print(\"\\n\", studentDict)\n", 288 | "\n", 289 | "# print only data\n", 290 | "print(\"\\nList of values from DF without index and header: \\n\", studentDict['data'])" 291 | ] 292 | }, 293 | { 294 | "cell_type": "markdown", 295 | "metadata": {}, 296 | "source": [ 297 | "## DataFrame to dict by row\n", 298 | "\n", 299 | "When we have a DataFrame where **each row** contains data that needs to be stored in a **separate dictionary** object, i.e., we need the data row-wise, we can use the **`'records'`** parameter of the **`DataFrame.to_dict()`** function.\n", 300 | "\n", 301 | "It returns a list of dictionary objects: one **`dict`** for each row, where the key is a column label, and the value is column data.\n", 302 | "\n", 303 | "**Syntax:**\n", 304 | "```python\n", 305 | "[{column_label : data}, … , {column_label : data}]\n", 306 | "```\n", 307 | "**Example:**\n", 308 | "\n", 309 | "In the below example, we create a list with one dictionary for each student's data." 
310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 5, 315 | "metadata": { 316 | "ExecuteTime": { 317 | "end_time": "2021-06-17T11:59:40.514133Z", 318 | "start_time": "2021-06-17T11:59:40.488748Z" 319 | } 320 | }, 321 | "outputs": [ 322 | { 323 | "name": "stdout", 324 | "output_type": "stream", 325 | "text": [ 326 | " Name Marks\n", 327 | "0 Nat 70.88\n", 328 | "1 Harry 85.90\n", 329 | "2 Joe 91.45\n", 330 | "[{'Name': 'Nat', 'Marks': 70.88}, {'Name': 'Harry', 'Marks': 85.9}, {'Name': 'Joe', 'Marks': 91.45}]\n" 331 | ] 332 | }, 333 | { 334 | "name": "stderr", 335 | "output_type": "stream", 336 | "text": [ 337 | "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\pandas\\core\\frame.py:1549: FutureWarning: Using short name for 'orient' is deprecated. Only the options: ('dict', list, 'series', 'split', 'records', 'index') will be used in a future version. Use one of the above to silence this warning.\n", 338 | " warnings.warn(\n" 339 | ] 340 | } 341 | ], 342 | "source": [ 343 | "# import pandas library\n", 344 | "import pandas as pd\n", 345 | "\n", 346 | "# create dataframe from csv\n", 347 | "studentDf = pd.read_csv(\"student_data.csv\")\n", 348 | "print(studentDf)\n", 349 | "\n", 350 | "# create dict from dataframe\n", 351 | "studentDict = studentDf.to_dict('record')\n", 352 | "print(studentDict)" 353 | ] 354 | }, 355 | { 356 | "cell_type": "markdown", 357 | "metadata": {}, 358 | "source": [ 359 | "## DataFrame to dict by row index\n", 360 | "\n", 361 | "When we have a DataFrame with row indexes and if we need to convert the data of each row from DataFrame to **`dict`**, we can use the **`'index'`** parameter of the **`DataFrame.to_dict()`** function.\n", 362 | "\n", 363 | "It returns a dictionary of dictionary objects. A **`dict`** is created for each row. 
Where the key is a row index, and the value is **`dict`** of column label and data.\n", 364 | "\n", 365 | "**Syntax:**\n", 366 | "```python\n", 367 | "{row_index : {column_label : data}}\n", 368 | "```\n", 369 | "\n", 370 | "**Example:**\n", 371 | "\n", 372 | "In the below example **`dict`** object is created for each row of student data." 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": 6, 378 | "metadata": { 379 | "ExecuteTime": { 380 | "end_time": "2021-06-17T11:59:47.728408Z", 381 | "start_time": "2021-06-17T11:59:47.714736Z" 382 | } 383 | }, 384 | "outputs": [ 385 | { 386 | "name": "stdout", 387 | "output_type": "stream", 388 | "text": [ 389 | " Name Marks\n", 390 | "0 Nat 70.88\n", 391 | "1 Harry 85.90\n", 392 | "2 Joe 91.45\n", 393 | "{0: {'Name': 'Nat', 'Marks': 70.88}, 1: {'Name': 'Harry', 'Marks': 85.9}, 2: {'Name': 'Joe', 'Marks': 91.45}}\n" 394 | ] 395 | } 396 | ], 397 | "source": [ 398 | "import pandas as pd\n", 399 | "\n", 400 | "# create dataframe from csv\n", 401 | "studentDf = pd.read_csv(\"student_data.csv\")\n", 402 | "print(studentDf)\n", 403 | "\n", 404 | "# create dict from dataframe\n", 405 | "studentDict = studentDf.to_dict('index')\n", 406 | "print(studentDict)" 407 | ] 408 | }, 409 | { 410 | "cell_type": "markdown", 411 | "metadata": {}, 412 | "source": [ 413 | "## DataFrame to dict with one column as the key\n", 414 | "\n", 415 | "In this section, we target the use case when we need to create a **`dict`** from DataFrame where one column as a key of **`dict`** and other columns as the value of the **`dict`**.\n", 416 | "\n", 417 | "Suppose we have student DataFrame with two columns, student’s **Name**, and student’s **Marks**. 
And we need to store each student’s data in the **`dict`** where the student name is the Key and their marks as a Value of the **`dict`**.\n", 418 | "\n", 419 | "We can do it in various ways, as shown below:\n", 420 | "\n", 421 | "* Using **`df.set_index('Col1').to_dict()['Col2']`**\n", 422 | "* Using **`dict(zip(df.Col1, df.Col2))`**\n", 423 | "* Using **`df.set_index('Col1').T.to_dict('list')`**\n", 424 | "\n", 425 | "**Example:**\n", 426 | "\n", 427 | "The example below uses **`df.set_index('Col1').to_dict()['Col2']`** to get the expected output." 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": 7, 433 | "metadata": { 434 | "ExecuteTime": { 435 | "end_time": "2021-06-17T11:59:55.380171Z", 436 | "start_time": "2021-06-17T11:59:55.351851Z" 437 | } 438 | }, 439 | "outputs": [ 440 | { 441 | "name": "stdout", 442 | "output_type": "stream", 443 | "text": [ 444 | " Name Marks\n", 445 | "0 Nat 70.88\n", 446 | "1 Harry 85.90\n", 447 | "2 Joe 91.45\n", 448 | "{'Nat': 70.88, 'Harry': 85.9, 'Joe': 91.45}\n" 449 | ] 450 | } 451 | ], 452 | "source": [ 453 | "import pandas as pd\n", 454 | "\n", 455 | "# create dataframe from csv\n", 456 | "studentDf = pd.read_csv(\"student_data.csv\")\n", 457 | "print(studentDf)\n", 458 | "\n", 459 | "# create dict with Name as key and marks as value\n", 460 | "studentDict = studentDf.set_index('Name').to_dict()['Marks']\n", 461 | "\n", 462 | "print(studentDict)" 463 | ] 464 | }, 465 | { 466 | "cell_type": "markdown", 467 | "metadata": {}, 468 | "source": [ 469 | "We can also achieve the same result using the **`zip()`** function." 
470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": 8, 475 | "metadata": { 476 | "ExecuteTime": { 477 | "end_time": "2021-06-17T11:59:59.387932Z", 478 | "start_time": "2021-06-17T11:59:59.372311Z" 479 | } 480 | }, 481 | "outputs": [], 482 | "source": [ 483 | "# create dict with Name as key and marks as value\n", 484 | "studentDict = dict(zip(studentDf.Name, studentDf.Marks))" 485 | ] 486 | }, 487 | { 488 | "cell_type": "markdown", 489 | "metadata": {}, 490 | "source": [ 491 | "If we want to collect the **column data into the list**, it can be done by applying **transpose** operation on the DataFrame and then converting it into dict." 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": 9, 497 | "metadata": { 498 | "ExecuteTime": { 499 | "end_time": "2021-06-17T12:00:12.894121Z", 500 | "start_time": "2021-06-17T12:00:12.878010Z" 501 | } 502 | }, 503 | "outputs": [ 504 | { 505 | "name": "stdout", 506 | "output_type": "stream", 507 | "text": [ 508 | " Name Marks\n", 509 | "0 Nat 70.88\n", 510 | "1 Harry 85.90\n", 511 | "2 Joe 91.45\n", 512 | "{'Nat': [70.88], 'Harry': [85.9], 'Joe': [91.45]}\n" 513 | ] 514 | } 515 | ], 516 | "source": [ 517 | "import pandas as pd\n", 518 | "\n", 519 | "# create dataframe from csv\n", 520 | "studentDf = pd.read_csv(\"student_data.csv\")\n", 521 | "print(studentDf)\n", 522 | "\n", 523 | "# create dict with Name as key and marks as value\n", 524 | "studentDict = studentDf.set_index('Name').T.to_dict('list')\n", 525 | "print(studentDict)" 526 | ] 527 | }, 528 | { 529 | "cell_type": "markdown", 530 | "metadata": {}, 531 | "source": [ 532 | "## DataFrame to dict using `into` parameter\n", 533 | "\n", 534 | "While converting a DataFrame to **`dict`** if we need output **`dict`** to be of a particular type, we can use the parameter into of **`DataFrame.to_dict()`** function. 
We can specify the class name or the instance of the class for the resultant **`dict`**.\n", 535 | "\n", 536 | "**Example:**\n", 537 | "\n", 538 | "In the below example, we converted DataFrame to the **`dict`** of type **OrderedDict**." 539 | ] 540 | }, 541 | { 542 | "cell_type": "code", 543 | "execution_count": 10, 544 | "metadata": { 545 | "ExecuteTime": { 546 | "end_time": "2021-06-17T12:00:19.780268Z", 547 | "start_time": "2021-06-17T12:00:19.759763Z" 548 | } 549 | }, 550 | "outputs": [ 551 | { 552 | "name": "stdout", 553 | "output_type": "stream", 554 | "text": [ 555 | " Name Marks\n", 556 | "0 Nat 70.88\n", 557 | "1 Harry 85.90\n", 558 | "2 Joe 91.45\n", 559 | "OrderedDict([('Name', OrderedDict([(0, 'Nat'), (1, 'Harry'), (2, 'Joe')])), ('Marks', OrderedDict([(0, 70.88), (1, 85.9), (2, 91.45)]))])\n" 560 | ] 561 | } 562 | ], 563 | "source": [ 564 | "# import pandas library\n", 565 | "from collections import OrderedDict\n", 566 | "import pandas as pd\n", 567 | "\n", 568 | "# create dataframe from csv\n", 569 | "studentDf = pd.read_csv(\"student_data.csv\")\n", 570 | "print(studentDf)\n", 571 | "\n", 572 | "# convert dataframe to ordered dict\n", 573 | "studentDict = studentDf.to_dict(into=OrderedDict)\n", 574 | "print(studentDict)" 575 | ] 576 | }, 577 | { 578 | "cell_type": "code", 579 | "execution_count": null, 580 | "metadata": {}, 581 | "outputs": [], 582 | "source": [] 583 | } 584 | ], 585 | "metadata": { 586 | "hide_input": false, 587 | "kernelspec": { 588 | "display_name": "Python 3", 589 | "language": "python", 590 | "name": "python3" 591 | }, 592 | "language_info": { 593 | "codemirror_mode": { 594 | "name": "ipython", 595 | "version": 3 596 | }, 597 | "file_extension": ".py", 598 | "mimetype": "text/x-python", 599 | "name": "python", 600 | "nbconvert_exporter": "python", 601 | "pygments_lexer": "ipython3", 602 | "version": "3.8.8" 603 | }, 604 | "toc": { 605 | "base_numbering": 1, 606 | "nav_menu": {}, 607 | "number_sections": true, 608 | "sideBar": 
true, 609 | "skip_h1_title": false, 610 | "title_cell": "Table of Contents", 611 | "title_sidebar": "Contents", 612 | "toc_cell": false, 613 | "toc_position": {}, 614 | "toc_section_display": true, 615 | "toc_window_display": false 616 | }, 617 | "varInspector": { 618 | "cols": { 619 | "lenName": 16, 620 | "lenType": 16, 621 | "lenVar": 40 622 | }, 623 | "kernels_config": { 624 | "python": { 625 | "delete_cmd_postfix": "", 626 | "delete_cmd_prefix": "del ", 627 | "library": "var_list.py", 628 | "varRefreshCmd": "print(var_dic_list())" 629 | }, 630 | "r": { 631 | "delete_cmd_postfix": ") ", 632 | "delete_cmd_prefix": "rm(", 633 | "library": "var_list.r", 634 | "varRefreshCmd": "cat(var_dic_list()) " 635 | } 636 | }, 637 | "types_to_exclude": [ 638 | "module", 639 | "function", 640 | "builtin_function_or_method", 641 | "instance", 642 | "_Feature" 643 | ], 644 | "window_display": false 645 | } 646 | }, 647 | "nbformat": 4, 648 | "nbformat_minor": 2 649 | } 650 | -------------------------------------------------------------------------------- /001_Python_Pandas_Methods/003_Python_Pandas_DataFrame_head()_and_tail().ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\n", 8 | "All the IPython Notebooks in this lecture series by Dr. Milan Parmar are available @ **[GitHub](https://github.com/milaan9/10_Python_Pandas_Module/tree/main/001_Python_Pandas_Methods)**\n", 9 | "" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# Pandas DataFrame head, tail, at, iat\n", 17 | "\n", 18 | "In this class, we learn how to use **`DataFrame.head()`** and **`DataFrame.tail()`** functions to select top and bottom rows of the **[Pandas DataFrame](https://github.com/milaan9/10_Python_Pandas_Module/blob/main/001_Python_Pandas_DataFrame.ipynb)** respectively. 
We also learn how to use the **`DataFrame.at`** and **`DataFrame.iat`** properties to access a specific value in the DataFrame." 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "## How to use `DataFrame.head()` function\n", 26 | "\n", 27 | "This function is used to see the first n rows in the DataFrame. It is beneficial when we have massive datasets, and it is not possible to see the entire dataset at once.\n", 28 | "\n", 29 | "It takes input as the number of rows to be displayed from the top. The default value is 5.\n", 30 | "\n", 31 | "**Syntax:**\n", 32 | "```python\n", 33 | "DataFrame.head(n=5)\n", 34 | "```\n", 35 | "\n", 36 | "**Example:**\n", 37 | "\n", 38 | "The Student DataFrame below has the columns Name, Age, and Marks. If we apply the **`DataFrame.head()`** function, we can see that only the first five rows are displayed.\n", 39 | "\n", 40 | "
\n", 41 | "\n", 42 | "
" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 1, 48 | "metadata": { 49 | "ExecuteTime": { 50 | "end_time": "2021-06-17T11:50:53.184426Z", 51 | "start_time": "2021-06-17T11:50:52.603379Z" 52 | } 53 | }, 54 | "outputs": [ 55 | { 56 | "name": "stdout", 57 | "output_type": "stream", 58 | "text": [ 59 | " Name Age Marks\n", 60 | "0 Joe 20 85.10\n", 61 | "1 Nat 21 77.80\n", 62 | "2 Harry 19 91.54\n", 63 | "3 Jack 17 72.00\n", 64 | "4 Jose 18 87.90\n" 65 | ] 66 | } 67 | ], 68 | "source": [ 69 | "import pandas as pd\n", 70 | "\n", 71 | "student_dict = {'Name': ['Joe', 'Nat', 'Harry','Jack','Jose','Jill','Rose'],\n", 72 | " 'Age': [20, 21, 19,17,18,19,17],\n", 73 | " 'Marks': [85.10, 77.80, 91.54,72,87.9,90,72]}\n", 74 | "\n", 75 | "# create DataFrame from dict\n", 76 | "student_df = pd.DataFrame(student_dict)\n", 77 | "\n", 78 | "# display first 5 rows\n", 79 | "topRows = student_df.head()\n", 80 | "print(topRows)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "## Select top n rows in pandas DataFrame\n", 88 | "\n", 89 | "When we want to see a smaller section of data, we can use the function **`DataFrame.head()`** and pass a parameter as the number of rows to display from the top.\n", 90 | "\n", 91 | "**Example:**\n", 92 | "\n", 93 | "In the below example, after we apply the **`DataFrame.head(3)`** function, only the first three rows of the DataFrame are displayed." 
94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 2, 99 | "metadata": { 100 | "ExecuteTime": { 101 | "end_time": "2021-06-17T11:50:55.942203Z", 102 | "start_time": "2021-06-17T11:50:55.922678Z" 103 | } 104 | }, 105 | "outputs": [ 106 | { 107 | "name": "stdout", 108 | "output_type": "stream", 109 | "text": [ 110 | " Name Age Marks\n", 111 | "0 Joe 20 85.10\n", 112 | "1 Nat 21 77.80\n", 113 | "2 Harry 19 91.54\n" 114 | ] 115 | } 116 | ], 117 | "source": [ 118 | "# display first 3 rows\n", 119 | "topRows = student_df.head(3)\n", 120 | "print(topRows)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "## Select top rows except for last n rows\n", 128 | "\n", 129 | "When we have a vast DataFrame, and we want to see all the rows except for the last n rows, we can pass the negative value as a parameter to **`DataFrame.head()`**.\n", 130 | "\n", 131 | "**Example:**\n", 132 | "\n", 133 | "In the below example, if we want to display all the rows except the bottom two rows, we can use **`DataFrame.head(-2)`** function." 
134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 3, 139 | "metadata": { 140 | "ExecuteTime": { 141 | "end_time": "2021-06-17T11:50:59.510526Z", 142 | "start_time": "2021-06-17T11:50:59.482722Z" 143 | } 144 | }, 145 | "outputs": [ 146 | { 147 | "name": "stdout", 148 | "output_type": "stream", 149 | "text": [ 150 | " Name Age Marks\n", 151 | "0 Joe 20 85.10\n", 152 | "1 Nat 21 77.80\n", 153 | "2 Harry 19 91.54\n", 154 | "3 Jack 17 72.00\n", 155 | "4 Jose 18 87.90\n" 156 | ] 157 | } 158 | ], 159 | "source": [ 160 | "import pandas as pd\n", 161 | "\n", 162 | "student_dict = {'Name': ['Joe', 'Nat', 'Harry','Jack','Jose',\"Jill\",\"Rose\"],\n", 163 | " 'Age': [20, 21, 19,17,18,19,17],\n", 164 | " 'Marks': [85.10, 77.80, 91.54,72,87.9,90,72]}\n", 165 | "\n", 166 | "# create DataFrame from dict\n", 167 | "student_df = pd.DataFrame(student_dict)\n", 168 | "\n", 169 | "# display rows except bottom 2 rows\n", 170 | "topRows = student_df.head(-2)\n", 171 | "print(topRows)" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "## Select top rows from multi-index DataFrames\n", 179 | "\n", 180 | "When Python pandas DataFrame has multiple row index or column headers, then are called multi-level or hierarchical DataFrame. As we have discussed in the above section, we can use the **`DataFrame.head()`** function on multi-index DataFrames to display the top rows.\n", 181 | "\n", 182 | "The below diagram shows hierarchical DataFrame of Student data with two-column headers where column labels **`Name`** and **`Marks`** are at level 0 and **`Surname`** and **`Percentage`** at level 1. Similarly, two-row indexes are index ‘Standard‘ at level 0 and **`Class`** at level 1 of the DataFrame.\n", 183 | "\n", 184 | "**Example:**\n", 185 | "\n", 186 | "The below example shows how to create such DataFrame and display top rows rather than the whole DataFrame.\n", 187 | "\n", 188 | "
\n", 189 | "\n", 190 | "
" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 4, 196 | "metadata": { 197 | "ExecuteTime": { 198 | "end_time": "2021-06-17T11:51:01.441166Z", 199 | "start_time": "2021-06-17T11:51:01.410896Z" 200 | } 201 | }, 202 | "outputs": [ 203 | { 204 | "name": "stdout", 205 | "output_type": "stream", 206 | "text": [ 207 | " Name Marks\n", 208 | " Surname Percentage\n", 209 | "Standard Class \n", 210 | "Standard 1 Class A Joe 91.56\n", 211 | " Class B Nat 87.90\n", 212 | "Standard 2 Class A Harry 70.10\n", 213 | " Class B Sam 65.48\n", 214 | "Standard 3 Class A Jill 72.00\n" 215 | ] 216 | } 217 | ], 218 | "source": [ 219 | "import pandas as pd\n", 220 | "\n", 221 | "index = pd.MultiIndex.from_tuples([('Standard 1', 'Class A'), ('Standard 1', 'Class B'),\n", 222 | " ('Standard 2', 'Class A'), ('Standard 2', 'Class B'),\n", 223 | " ('Standard 3', 'Class A'), ('Standard 3', 'Class B')],\n", 224 | " names=['Standard', 'Class'])\n", 225 | "\n", 226 | "columns = pd.MultiIndex.from_tuples([('Name', 'Surname'),\n", 227 | "('Marks', 'Percentage')])\n", 228 | "\n", 229 | "# create multi-index dataframe\n", 230 | "student_df = pd.DataFrame([('Joe', 91.56), ('Nat', 87.90),('Harry', 70.10), \n", 231 | " ('Sam', 65.48), (\"Jill\", 72), (\"Jane\", 80)],\n", 232 | " index=index, columns=columns)\n", 233 | "\n", 234 | "topRows = student_df.head()\n", 235 | "print(topRows)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "## How to use `DataFrame.tail()` function\n", 243 | "\n", 244 | "We can use the **`DataFrame.tail()`** function to display the last n rows of the DataFrame. Like the head function, this function is used when we want to view a smaller section of the entire DataFrame.\n", 245 | "\n", 246 | "It takes input as the number of rows to be displayed from the bottom. 
The default value is 5.\n", 247 | "\n", 248 | "**Syntax:**\n", 249 | "```python\n", 250 | "DataFrame.tail(n=5)\n", 251 | "```\n", 252 | "\n", 253 | "**Example:**\n", 254 | "\n", 255 | "In the below Student DataFrame with columns like **`Name`**, **`Age`**, and **`Marks`**. If we apply **`DataFrame.tail()`** function, we can see that only the bottom five rows are displayed in the output.\n", 256 | "\n", 257 | "
\n", 258 | "\n", 259 | "
" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 5, 265 | "metadata": { 266 | "ExecuteTime": { 267 | "end_time": "2021-06-17T11:51:04.075411Z", 268 | "start_time": "2021-06-17T11:51:04.050999Z" 269 | } 270 | }, 271 | "outputs": [ 272 | { 273 | "name": "stdout", 274 | "output_type": "stream", 275 | "text": [ 276 | " Name Age Marks\n", 277 | "2 Harry 19 91.54\n", 278 | "3 Jack 17 72.00\n", 279 | "4 Jose 18 87.90\n", 280 | "5 Jill 19 90.00\n", 281 | "6 Rose 17 72.00\n" 282 | ] 283 | } 284 | ], 285 | "source": [ 286 | "import pandas as pd\n", 287 | "\n", 288 | "student_dict = {'Name': ['Joe', 'Nat', 'Harry','Jack','Jose',\"Jill\",\"Rose\"],\n", 289 | " 'Age': [20, 21, 19,17,18,19,17],\n", 290 | " 'Marks': [85.10, 77.80, 91.54,72,87.9,90,72]}\n", 291 | "\n", 292 | "# create DataFrame from dict\n", 293 | "student_df = pd.DataFrame(student_dict)\n", 294 | "\n", 295 | "# display the bottom 5 rows\n", 296 | "bottomRows = student_df.tail()\n", 297 | "print(bottomRows)" 298 | ] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "metadata": {}, 303 | "source": [ 304 | "## Select bottom n rows in pandas DataFrame\n", 305 | "\n", 306 | "When we want to see a smaller section of data from the bottom of the DataFrame, we can use the function **`DataFrame.tail()`** and pass a parameter as the number of rows to display from the bottom.\n", 307 | "\n", 308 | "**Example:**\n", 309 | "\n", 310 | "In the below example, after we apply the **`DataFrame.tail(3)`** function, we see that only the last 3 rows of the DataFrame are displayed." 
311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 6, 316 | "metadata": { 317 | "ExecuteTime": { 318 | "end_time": "2021-06-17T11:51:06.989926Z", 319 | "start_time": "2021-06-17T11:51:06.971376Z" 320 | } 321 | }, 322 | "outputs": [ 323 | { 324 | "name": "stdout", 325 | "output_type": "stream", 326 | "text": [ 327 | " Name Age Marks\n", 328 | "4 Jose 18 87.9\n", 329 | "5 Jill 19 90.0\n", 330 | "6 Rose 17 72.0\n" 331 | ] 332 | } 333 | ], 334 | "source": [ 335 | "bottomRows = student_df.tail(3)\n", 336 | "print(bottomRows)" 337 | ] 338 | }, 339 | { 340 | "cell_type": "markdown", 341 | "metadata": {}, 342 | "source": [ 343 | "## Select bottom rows except for first n rows\n", 344 | "\n", 345 | "When we want to see our entire dataset except for the first few rows, we can use the **`DataFrame.tail()`** function and pass a negative value as a parameter to it.\n", 346 | "\n", 347 | "**Example:**\n", 348 | "\n", 349 | "In the below example, we display all the rows except the top 2 rows using **`DataFrame.tail(-2)`**."
350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": 7, 355 | "metadata": { 356 | "ExecuteTime": { 357 | "end_time": "2021-06-17T11:51:10.120263Z", 358 | "start_time": "2021-06-17T11:51:10.098782Z" 359 | } 360 | }, 361 | "outputs": [ 362 | { 363 | "name": "stdout", 364 | "output_type": "stream", 365 | "text": [ 366 | " Name Age Marks\n", 367 | "2 Harry 19 91.54\n", 368 | "3 Jack 17 72.00\n", 369 | "4 Jose 18 87.90\n", 370 | "5 Jill 19 90.00\n", 371 | "6 Rose 17 72.00\n" 372 | ] 373 | } 374 | ], 375 | "source": [ 376 | "import pandas as pd\n", 377 | "\n", 378 | "student_dict = {'Name': ['Joe', 'Nat', 'Harry','Jack','Jose',\"Jill\",\"Rose\"],\n", 379 | " 'Age': [20, 21, 19,17,18,19,17],\n", 380 | " 'Marks': [85.10, 77.80, 91.54,72,87.9,90,72]}\n", 381 | "\n", 382 | "# create DataFrame from dict\n", 383 | "student_df = pd.DataFrame(student_dict)\n", 384 | "\n", 385 | "# display all rows except the first 2 rows\n", 386 | "bottomRows = student_df.tail(-2)\n", 387 | "print(bottomRows)" 388 | ] 389 | }, 390 | { 391 | "cell_type": "markdown", 392 | "metadata": {}, 393 | "source": [ 394 | "## Select bottom rows from the multi-index DataFrame\n", 395 | "\n", 396 | "We can apply the **`DataFrame.tail()`** function on multi-index DataFrames as well. It works in the same way as normal DataFrames.\n", 397 | "\n", 398 | "**Example:**\n", 399 | "\n", 400 | "In our example, after we have applied the **`DataFrame.tail()`** function, only the bottom 5 rows are displayed.\n", 401 | "\n", 402 | "
\n", 403 | "\n", 404 | "
" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": 8, 410 | "metadata": { 411 | "ExecuteTime": { 412 | "end_time": "2021-06-17T11:51:14.265723Z", 413 | "start_time": "2021-06-17T11:51:14.244238Z" 414 | } 415 | }, 416 | "outputs": [ 417 | { 418 | "name": "stdout", 419 | "output_type": "stream", 420 | "text": [ 421 | " Name Marks\n", 422 | " Surname Percentage\n", 423 | "Standard Class \n", 424 | "Standard 1 Class B Nat 87.90\n", 425 | "Standard 2 Class A Harry 70.10\n", 426 | " Class B Sam 65.48\n", 427 | "Standard 3 Class A Jill 72.00\n", 428 | " Class B Jane 80.00\n" 429 | ] 430 | } 431 | ], 432 | "source": [ 433 | "import pandas as pd\n", 434 | "\n", 435 | "index = pd.MultiIndex.from_tuples([('Standard 1', 'Class A'), ('Standard 1', 'Class B'),\n", 436 | " ('Standard 2', 'Class A'), ('Standard 2', 'Class B'),\n", 437 | " ('Standard 3', 'Class A'), ('Standard 3', 'Class B')],\n", 438 | " names=['Standard', 'Class'])\n", 439 | "\n", 440 | "columns = pd.MultiIndex.from_tuples([('Name', 'Surname'),\n", 441 | "('Marks', 'Percentage')])\n", 442 | "\n", 443 | "# create multi-index dataframe\n", 444 | "student_df = pd.DataFrame([('Joe', 91.56), ('Nat', 87.90), ('Harry', 70.10), \n", 445 | " ('Sam', 65.48), (\"Jill\", 72), (\"Jane\", 80)],\n", 446 | " index=index, columns=columns)\n", 447 | "\n", 448 | "bottomRows = student_df.tail()\n", 449 | "print(bottomRows)" 450 | ] 451 | }, 452 | { 453 | "cell_type": "markdown", 454 | "metadata": {}, 455 | "source": [ 456 | "## Select value using row and column labels using `DataFrame.at`\n", 457 | "\n", 458 | "There are cases in the field of Data Science that we need to access a specific element of the DataFrame using its column label and row index. In such cases, we can use the **`DataFrame.at`** property and pass the row index and column labels of the value to access as parameters. 
This property can be used with Multi-index DataFrame as well.\n", 459 | "\n", 460 | ">**Note:** It raises **`KeyError`** if the label does not exist in DataFrame.\n", 461 | "\n", 462 | "**Example:**\n", 463 | "\n", 464 | "In the below example, when we access **`DataFrame.at[2,\"Age\"]`**, we get 19 as the output because 19 is the value present at row 2 and column Age.\n", 465 | "\n", 466 | "
\n", 467 | "\n", 468 | "
" 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": 9, 474 | "metadata": { 475 | "ExecuteTime": { 476 | "end_time": "2021-06-17T11:51:18.069877Z", 477 | "start_time": "2021-06-17T11:51:18.059136Z" 478 | } 479 | }, 480 | "outputs": [ 481 | { 482 | "name": "stdout", 483 | "output_type": "stream", 484 | "text": [ 485 | "19\n" 486 | ] 487 | } 488 | ], 489 | "source": [ 490 | "import pandas as pd\n", 491 | "\n", 492 | "student_dict = {'Name': ['Joe', 'Nat', 'Harry',], 'Age': [20, 21, 19], \n", 493 | " 'Marks': [85.10, 77.80, 91.54]}\n", 494 | "\n", 495 | "# create DataFrame from dict\n", 496 | "student_df = pd.DataFrame(student_dict)\n", 497 | "\n", 498 | "value = student_df.at[2,\"Age\"]\n", 499 | "print(value) # --> Output: 19" 500 | ] 501 | }, 502 | { 503 | "cell_type": "markdown", 504 | "metadata": {}, 505 | "source": [ 506 | "## Set specific value in pandas DataFrame\n", 507 | "\n", 508 | "When we want to update the value of the particular element from DataFrame based on its column label and row index, we can use DataFrame.at property.\n", 509 | "\n", 510 | "**Example:**\n", 511 | "\n", 512 | "In the below example, after we have applied **`DataFrame.at[2,\"Age\"]=50`** on our DataFrame, the value at that position changed from 19 to 50." 
513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": 10, 518 | "metadata": { 519 | "ExecuteTime": { 520 | "end_time": "2021-06-17T11:51:21.789558Z", 521 | "start_time": "2021-06-17T11:51:21.779795Z" 522 | } 523 | }, 524 | "outputs": [ 525 | { 526 | "name": "stdout", 527 | "output_type": "stream", 528 | "text": [ 529 | "19\n", 530 | "50\n" 531 | ] 532 | } 533 | ], 534 | "source": [ 535 | "import pandas as pd\n", 536 | "\n", 537 | "student_dict = {'Name': ['Joe', 'Nat', 'Harry',], 'Age': [20, 21, 19],\n", 538 | " 'Marks': [85.10, 77.80, 91.54]}\n", 539 | "\n", 540 | "# create DataFrame from dict\n", 541 | "student_df = pd.DataFrame(student_dict)\n", 542 | "print(student_df.at[2,\"Age\"])\n", 543 | "\n", 544 | "# change the value\n", 545 | "student_df.at[2,\"Age\"] = 50\n", 546 | "print(student_df.at[2,\"Age\"])" 547 | ] 548 | }, 549 | { 550 | "cell_type": "markdown", 551 | "metadata": {}, 552 | "source": [ 553 | "## Select value using row and column position using `DataFrame.iat`\n", 554 | "\n", 555 | "Sometimes we want to access a specific element from a very large DataFrame, but we do not know its column label or row index. We can still access such an element using its column and row positions. For that, we can use the **`DataFrame.iat`** property of pandas. Unlike **`DataFrame.at`**, it works with the integer positions of the row and column rather than their labels.\n", 556 | "\n", 557 | ">**Note:** Index positions start at 0.\n", 558 | "\n", 559 | "**Example:**\n", 560 | "\n", 561 | "In the below example, we access the value at the second row and the third column using **`DataFrame.iat[1,2]`**.\n", 562 | "\n", 563 | "
\n", 564 | "\n", 565 | "
" 566 | ] 567 | }, 568 | { 569 | "cell_type": "code", 570 | "execution_count": 11, 571 | "metadata": { 572 | "ExecuteTime": { 573 | "end_time": "2021-06-17T11:51:27.351991Z", 574 | "start_time": "2021-06-17T11:51:27.331487Z" 575 | } 576 | }, 577 | "outputs": [ 578 | { 579 | "name": "stdout", 580 | "output_type": "stream", 581 | "text": [ 582 | "77.8\n" 583 | ] 584 | } 585 | ], 586 | "source": [ 587 | "import pandas as pd\n", 588 | "\n", 589 | "student_dict = {'Name': ['Joe', 'Nat', 'Harry',], 'Age': [20, 21, 19],\n", 590 | " 'Marks': [85.10, 77.80, 91.54]}\n", 591 | "\n", 592 | "# create DataFrame from dict\n", 593 | "student_df = pd.DataFrame(student_dict)\n", 594 | "\n", 595 | "value = student_df.iat[1,2]\n", 596 | "print(value) # --> Output: 77.80" 597 | ] 598 | }, 599 | { 600 | "cell_type": "markdown", 601 | "metadata": {}, 602 | "source": [ 603 | "## Set specific value in pandas DataFrame\n", 604 | "\n", 605 | "When we want to update the value of the particular element from DataFrame based on its column and row position, we can use **`DataFrame.iat`** property.\n", 606 | "\n", 607 | "**Example:**\n", 608 | "\n", 609 | "In the below example, we change the value at the second row and the third column using **`DataFrame.iat[1,2]=90`**." 
610 | ] 611 | }, 612 | { 613 | "cell_type": "code", 614 | "execution_count": 12, 615 | "metadata": { 616 | "ExecuteTime": { 617 | "end_time": "2021-06-17T11:51:31.000877Z", 618 | "start_time": "2021-06-17T11:51:30.988184Z" 619 | } 620 | }, 621 | "outputs": [ 622 | { 623 | "name": "stdout", 624 | "output_type": "stream", 625 | "text": [ 626 | "77.8\n", 627 | "90.0\n" 628 | ] 629 | } 630 | ], 631 | "source": [ 632 | "import pandas as pd\n", 633 | "\n", 634 | "student_dict = {'Name': ['Joe', 'Nat', 'Harry'], 'Age': [20, 21, 19], \n", 635 | " 'Marks': [85.10, 77.80, 91.54]}\n", 636 | "\n", 637 | "# create DataFrame from dict\n", 638 | "student_df = pd.DataFrame(student_dict)\n", 639 | "print(student_df.iat[1,2])\n", 640 | "\n", 641 | "# change value\n", 642 | "student_df.iat[1,2]=90\n", 643 | "print(student_df.iat[1,2])" 644 | ] 645 | }, 646 | { 647 | "cell_type": "code", 648 | "execution_count": null, 649 | "metadata": {}, 650 | "outputs": [], 651 | "source": [] 652 | } 653 | ], 654 | "metadata": { 655 | "hide_input": false, 656 | "kernelspec": { 657 | "display_name": "Python 3", 658 | "language": "python", 659 | "name": "python3" 660 | }, 661 | "language_info": { 662 | "codemirror_mode": { 663 | "name": "ipython", 664 | "version": 3 665 | }, 666 | "file_extension": ".py", 667 | "mimetype": "text/x-python", 668 | "name": "python", 669 | "nbconvert_exporter": "python", 670 | "pygments_lexer": "ipython3", 671 | "version": "3.8.8" 672 | }, 673 | "toc": { 674 | "base_numbering": 1, 675 | "nav_menu": {}, 676 | "number_sections": true, 677 | "sideBar": true, 678 | "skip_h1_title": false, 679 | "title_cell": "Table of Contents", 680 | "title_sidebar": "Contents", 681 | "toc_cell": false, 682 | "toc_position": {}, 683 | "toc_section_display": true, 684 | "toc_window_display": false 685 | }, 686 | "varInspector": { 687 | "cols": { 688 | "lenName": 16, 689 | "lenType": 16, 690 | "lenVar": 40 691 | }, 692 | "kernels_config": { 693 | "python": { 694 | "delete_cmd_postfix": "", 
695 | "delete_cmd_prefix": "del ", 696 | "library": "var_list.py", 697 | "varRefreshCmd": "print(var_dic_list())" 698 | }, 699 | "r": { 700 | "delete_cmd_postfix": ") ", 701 | "delete_cmd_prefix": "rm(", 702 | "library": "var_list.r", 703 | "varRefreshCmd": "cat(var_dic_list()) " 704 | } 705 | }, 706 | "types_to_exclude": [ 707 | "module", 708 | "function", 709 | "builtin_function_or_method", 710 | "instance", 711 | "_Feature" 712 | ], 713 | "window_display": false 714 | } 715 | }, 716 | "nbformat": 4, 717 | "nbformat_minor": 2 718 | } 719 | -------------------------------------------------------------------------------- /001_Python_Pandas_Methods/010_Python_Pandas_DataFrame_reset_index.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\n", 8 | "All the IPython Notebooks in this lecture series by Dr. Milan Parmar are available @ **[GitHub](https://github.com/milaan9/10_Python_Pandas_Module/tree/main/001_Python_Pandas_Methods)**\n", 9 | "" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# Reset index in pandas DataFrame\n", 17 | "\n", 18 | "In this class, we learn how to reset the index of the **[Pandas DataFrame](https://github.com/milaan9/10_Python_Pandas_Module/blob/main/001_Python_Pandas_DataFrame.ipynb)** We discuss all the cases of resetting the row index of a simple and multi-level DataFrame.\n", 19 | "\n", 20 | "DataFrame is the tabular structure in the Python pandas library. It represents each row and column by the label. Row label is called an **index**, whereas column label is called column index/header.\n", 21 | "\n", 22 | "After performing manipulations and filtering on the large dataset, we finally get the precise DataFrame as required. But, it carries the index of the original dataset. In such a case, we need to reset the index of the DataFrame." 
23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": { 28 | "heading_collapsed": true 29 | }, 30 | "source": [ 31 | "## The `DataFrame.reset_index()` function\n", 32 | "\n", 33 | "After dropping and filtering the rows, this function is used to reset the index of the resultant Python DataFrame. Let’s discuss how to use **`DataFrame.reset_index()`** function in detail.\n", 34 | "\n", 35 | "**Syntax:**\n", 36 | "```python\n", 37 | "DataFrame.reset_index(level=None, drop=False, inplace=False, col_level=0, col_fill='')\n", 38 | "```\n", 39 | "\n", 40 | "**Parameters:**\n", 41 | "\n", 42 | "1. **`level`**: In multi-level DataFrame, it takes a **level name or a position** of **Row index** that needs to be reset. By default, it resets all levels in a row index.\n", 43 | "2. **`drop`**: It is a boolean flag,\n", 44 | " * **`True`** – It does not add the current row index as a new column in DataFrame.\n", 45 | " * **`False`** (Default) – It adds the current row index as a new column in DataFrame.\n", 46 | "3. **`inplace`**: It is used to specify whether to return a new DataFrame or update an existing one. It is a boolean flag with default **`False`**.\n", 47 | "4. **`col_level`**: In multi-level DataFrame, determines which column header level the current row index is inserted into. By default, it is inserted into the first level.\n", 48 | "5. **`col_fill`**: In multi-level DataFrame, if the column headers have multiple levels, it determines how the other levels are named. **For example**, if we have a DataFrame with the two-column headers at levels 0 and 1, and if we add the current index as column header at level 0, we can specify the column header at level 1.\n", 49 | " \n", 50 | "**Return:**\n", 51 | "\n", 52 | "DataFrame with the new index or None if **`inplace=True`**."
53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "## Reset index to start at 0\n", 60 | "\n", 61 | "How to reset index in pandas DataFrame?\n", 62 | "\n", 63 | "1. **Create pandas DataFrame**\n", 64 | " * We can create a DataFrame from a CSV file or **`dict`**.\n", 65 | "\n", 66 | "\n", 67 | "2. **Manipulate the DataFrame**\n", 68 | " * When we manipulate the DataFrame, for example by **[dropping duplicates](https://github.com/milaan9/10_Python_Pandas_Module/blob/main/001_Python_Pandas_Methods/005_Python_Pandas_DataFrame_drop_duplicates.ipynb)** or sorting values, we get a new DataFrame, but it carries the original row index.\n", 69 | "**`df = df.drop_duplicates()`**\n", 70 | "\n", 71 | "\n", 72 | "3. **Use `DataFrame.reset_index()` function**\n", 73 | " * We can use **`DataFrame.reset_index()`** to reset the index of the updated DataFrame. By default, it adds the current row index as a new column called **index** in DataFrame, and it will create a new row index as a range of numbers starting at 0.\n", 74 | "**`df = df.reset_index()`**\n", 75 | "\n", 76 | "\n", 77 | "4. **Reset index without adding new column**\n", 78 | " * By default, **`DataFrame.reset_index()`** adds the current row index as a new column in DataFrame. If we do not want to add the new column, we can use the **`drop`** parameter.\n", 79 | "**`df = df.reset_index(drop=True)`**\n", 80 | "\n", 81 | "\n", 82 | "5. **Reset index in place**\n", 83 | " * We can use the parameter **`inplace`** to reset the index in the existing DataFrame rather than create a new copy.\n", 84 | "**`df.reset_index(inplace=True)`**\n", 85 | "\n", 86 | "
\n", 87 | "\n", 88 | "
\n", 89 | "\n", 90 | "**Example:**\n", 91 | "\n", 92 | "We have a student DataFrame with a row index ‘s1’, ‘s2’.. likewise. It contains a row with missing values that we want to remove. After we removed it using **`DataFrame.dropna()`** function, its row index is still the same. But now, the index is not in the sequence.\n", 93 | "\n", 94 | "In such a case, let’s see how to reset the index to the sequence of numbers using **`DataFrame.reset_index()`**." 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 2, 100 | "metadata": { 101 | "ExecuteTime": { 102 | "end_time": "2021-06-17T12:08:21.578262Z", 103 | "start_time": "2021-06-17T12:08:21.532367Z" 104 | }, 105 | "scrolled": true 106 | }, 107 | "outputs": [ 108 | { 109 | "name": "stdout", 110 | "output_type": "stream", 111 | "text": [ 112 | "Original DataFrame: \n", 113 | " Name Age Marks\n", 114 | "s1 Joe 20.0 85.10\n", 115 | "s2 Nat 21.0 77.80\n", 116 | "s3 NaN NaN NaN\n", 117 | "s4 Harry 19.0 91.54\n", 118 | "\n", 119 | "DataFrame after dropping N/A: \n", 120 | " Name Age Marks\n", 121 | "s1 Joe 20.0 85.10\n", 122 | "s2 Nat 21.0 77.80\n", 123 | "s4 Harry 19.0 91.54\n", 124 | "\n", 125 | "DataFrame after resetting index: \n", 126 | " index Name Age Marks\n", 127 | "0 s1 Joe 20.0 85.10\n", 128 | "1 s2 Nat 21.0 77.80\n", 129 | "2 s4 Harry 19.0 91.54\n" 130 | ] 131 | } 132 | ], 133 | "source": [ 134 | "import numpy as np\n", 135 | "\n", 136 | "student_dict = {'Name': ['Joe', 'Nat', np.NaN, 'Harry'], 'Age': [20, 21, np.NaN, 19],\n", 137 | " 'Marks': [85.10, 77.80, np.NaN, 91.54]}\n", 138 | "\n", 139 | "# create DataFrame from dict\n", 140 | "student_df = pd.DataFrame(student_dict, index=['s1', 's2', 's3', 's4'])\n", 141 | "print(\"Original DataFrame: \\n\", student_df)\n", 142 | "\n", 143 | "# drop NA\n", 144 | "student_df = student_df.dropna()\n", 145 | "print(\"\\nDataFrame after dropping N/A: \\n\", student_df)\n", 146 | "\n", 147 | "# reset index\n", 148 | "student_df = 
student_df.reset_index()\n", 149 | "print(\"\\nDataFrame after resetting index: \\n\", student_df)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "## Reset index without new column\n", 157 | "\n", 158 | "By default, **`DataFrame.reset_index()`** adds the current row index as a new **index** column in DataFrame. If we do not want to add the new column, we can use the drop parameter.\n", 159 | "\n", 160 | "* If **`drop=True`** then it does not add the new column of the current row index in the DataFrame.\n", 161 | "* If **`drop=False`**, is the default behavior where it adds the new column of the current row index in the DataFrame.\n", 162 | "\n", 163 | "**Example:**\n", 164 | "\n", 165 | "Let’s see how we can reset the index without adding new column." 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 3, 171 | "metadata": { 172 | "ExecuteTime": { 173 | "end_time": "2021-06-17T12:08:29.713422Z", 174 | "start_time": "2021-06-17T12:08:29.686081Z" 175 | } 176 | }, 177 | "outputs": [ 178 | { 179 | "name": "stdout", 180 | "output_type": "stream", 181 | "text": [ 182 | "Before reset index: \n", 183 | " Name Age Marks\n", 184 | "s1 Joe 20 85.10\n", 185 | "s2 Nat 21 77.80\n", 186 | "s3 Harry 19 91.54\n", 187 | "\n", 188 | "After reset index: \n", 189 | " Name Age Marks\n", 190 | "0 Joe 20 85.10\n", 191 | "1 Nat 21 77.80\n", 192 | "2 Harry 19 91.54\n" 193 | ] 194 | } 195 | ], 196 | "source": [ 197 | "import pandas as pd\n", 198 | "\n", 199 | "student_dict = {'Name': ['Joe', 'Nat', 'Harry'], 'Age': [20, 21, 19], 'Marks': [85.10, 77.80, 91.54]}\n", 200 | "\n", 201 | "# create DataFrame from dict\n", 202 | "student_df = pd.DataFrame(student_dict, index=['s1', 's2', 's3'])\n", 203 | "print(\"Before reset index: \\n\", student_df)\n", 204 | "\n", 205 | "# reset index without new column\n", 206 | "student_df = student_df.reset_index(drop=True)\n", 207 | "print(\"\\nAfter reset index: \\n\", 
student_df)" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "## Reset index in place\n", 215 | "\n", 216 | "In the above examples, whenever we executed reset index operation, pandas created a new copy of DataFrame because the modification is not-in place.\n", 217 | "\n", 218 | "Specify **`inplace=True`** to reset index in the existing DataFrame rather than creating a copy of it.\n", 219 | "\n", 220 | "* If the **`inplace=True`** then it updates the existing DataFrame and does not return anything.\n", 221 | "* If the **`inplace=False`** then it creates a new DataFrame with an updated index and returns it.\n", 222 | "\n", 223 | ">**Note:** You don’t need to assign the result back to a variable as we are performing modifications in place.\n", 224 | "\n", 225 | "**Example:**" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": 4, 231 | "metadata": { 232 | "ExecuteTime": { 233 | "end_time": "2021-06-17T12:08:36.681602Z", 234 | "start_time": "2021-06-17T12:08:36.663049Z" 235 | } 236 | }, 237 | "outputs": [ 238 | { 239 | "name": "stdout", 240 | "output_type": "stream", 241 | "text": [ 242 | "Before reset index: \n", 243 | " Name Age Marks\n", 244 | "s1 Joe 20 85.10\n", 245 | "s2 Nat 21 77.80\n", 246 | "s3 Harry 19 91.54\n", 247 | "\n", 248 | "After reset index: \n", 249 | " index Name Age Marks\n", 250 | "0 s1 Joe 20 85.10\n", 251 | "1 s2 Nat 21 77.80\n", 252 | "2 s3 Harry 19 91.54\n" 253 | ] 254 | } 255 | ], 256 | "source": [ 257 | "import pandas as pd\n", 258 | "\n", 259 | "student_dict = {'Name': ['Joe', 'Nat', 'Harry'], 'Age': [20, 21, 19], 'Marks': [85.10, 77.80, 91.54]}\n", 260 | "\n", 261 | "# create DataFrame from dict\n", 262 | "student_df = pd.DataFrame(student_dict, index=['s1', 's2', 's3'])\n", 263 | "print(\"Before reset index: \\n\", student_df)\n", 264 | "\n", 265 | "# reset index in place\n", 266 | "student_df.reset_index(inplace=True)\n", 267 | "print(\"\\nAfter reset index: 
\\n\", student_df)" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "## Reset index starts from 1\n", 275 | "\n", 276 | "Suppose we have a huge dataset which we need to filter. After filtering the DataFrame, it still carries the original index. When we want to reset the index of the DataFrame such that the new index should start with 1, we can do that in two steps,\n", 277 | "\n", 278 | "* Use **`DataFrame.reset_index()`** to reset the row index to start at 0.\n", 279 | "* Use the **`index`** attribute of the DataFrame to re-assign the index by adding 1 to each row index of the resultant DataFrame.\n", 280 | "\n", 281 | "**Example:**\n", 282 | "\n", 283 | "In the below example, we first reset the index to the sequence of numbers and then added 1 to each index." 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 5, 289 | "metadata": { 290 | "ExecuteTime": { 291 | "end_time": "2021-06-17T12:08:38.115669Z", 292 | "start_time": "2021-06-17T12:08:38.087350Z" 293 | } 294 | }, 295 | "outputs": [ 296 | { 297 | "name": "stdout", 298 | "output_type": "stream", 299 | "text": [ 300 | "Before reset index: \n", 301 | " Name Age Marks\n", 302 | "s1 Joe 20 85.10\n", 303 | "s2 Nat 21 77.80\n", 304 | "s3 Harry 19 91.54\n", 305 | "\n", 306 | "After reset index: \n", 307 | " index Name Age Marks\n", 308 | "1 s1 Joe 20 85.10\n", 309 | "2 s2 Nat 21 77.80\n", 310 | "3 s3 Harry 19 91.54\n" 311 | ] 312 | } 313 | ], 314 | "source": [ 315 | "import pandas as pd\n", 316 | "\n", 317 | "student_dict = {'Name': ['Joe', 'Nat', 'Harry'], 'Age': [20, 21, 19], 'Marks': [85.10, 77.80, 91.54]}\n", 318 | "\n", 319 | "# create DataFrame from dict\n", 320 | "student_df = pd.DataFrame(student_dict, index=['s1', 's2', 's3'])\n", 321 | "print(\"Before reset index: \\n\", student_df)\n", 322 | "\n", 323 | "# reset index\n", 324 | "student_df = student_df.reset_index()\n", 325 | "\n", 326 | "# add 1 to each index\n", 327 | 
"student_df.index = student_df.index + 1\n", 328 | "\n", 329 | "print(\"\\nAfter reset index: \\n\", student_df)" 330 | ] 331 | }, 332 | { 333 | "cell_type": "markdown", 334 | "metadata": {}, 335 | "source": [ 336 | "## Reset index to the range of numbers\n", 337 | "\n", 338 | "In our student DataFrame, suppose we want to assign the Identity number to each student starting from 101. We can use the index parameter of DataFrame to change the **`index`** as a range of numbers that begins at a specific number.\n", 339 | "\n", 340 | "First, we need to generate the range of numbers and then assign it to the **`DataFrame.index`** to reset the original index.\n", 341 | "\n", 342 | "**Example:**\n", 343 | "\n", 344 | "In the below example, **[pd.RangeIndex()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.RangeIndex.html)** function is used to generate the range of numbers which starts at 101 till the last row i.e. **`len(df)`**. Assign this range to the **`df.index`**." 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": 6, 350 | "metadata": { 351 | "ExecuteTime": { 352 | "end_time": "2021-06-17T12:08:39.175226Z", 353 | "start_time": "2021-06-17T12:08:39.158626Z" 354 | } 355 | }, 356 | "outputs": [ 357 | { 358 | "name": "stdout", 359 | "output_type": "stream", 360 | "text": [ 361 | "Before reset index: \n", 362 | " Name Age Marks\n", 363 | "s1 Joe 20 85.10\n", 364 | "s2 Nat 21 77.80\n", 365 | "s3 Harry 19 91.54\n", 366 | "\n", 367 | "After reset index: \n", 368 | " Name Age Marks\n", 369 | "101 Joe 20 85.10\n", 370 | "102 Nat 21 77.80\n", 371 | "103 Harry 19 91.54\n" 372 | ] 373 | } 374 | ], 375 | "source": [ 376 | "import pandas as pd\n", 377 | "\n", 378 | "student_dict = {'Name':['Joe','Nat','Harry'], 'Age':[20,21,19], 'Marks':[85.10, 77.80, 91.54]}\n", 379 | "\n", 380 | "# create DataFrame from dict\n", 381 | "student_df = pd.DataFrame(student_dict, index=['s1','s2','s3'])\n", 382 | "print(\"Before reset index: \\n\", 
student_df)\n", 383 | "\n", 384 | "# assign new index from 1 to n\n", 385 | "student_df.index = pd.RangeIndex(start=101, stop=101+len(student_df), step=1)\n", 386 | "\n", 387 | "print(\"\\nAfter reset index: \\n\", student_df)" 388 | ] 389 | }, 390 | { 391 | "cell_type": "markdown", 392 | "metadata": {}, 393 | "source": [ 394 | "## Reset index and change column name\n", 395 | "\n", 396 | "As we have already discussed, **`DataFrame.reset_index()`** adds the current index as a new column with the name **index** in the DataFrame. If we want to give a name to such a newly added column, then we need to use **`DataFrame.rename()`** function with **`DataFrame.reset_index()`**.\n", 397 | "\n", 398 | "**Example:**\n", 399 | "\n", 400 | "Let’s see how to do the method chaining of **`DataFrame.reset_index()`** and **`DataFrame.rename()`** functions to rename a new **index** column to **ID**." 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": 7, 406 | "metadata": { 407 | "ExecuteTime": { 408 | "end_time": "2021-06-17T12:08:39.890059Z", 409 | "start_time": "2021-06-17T12:08:39.862718Z" 410 | } 411 | }, 412 | "outputs": [ 413 | { 414 | "name": "stdout", 415 | "output_type": "stream", 416 | "text": [ 417 | "Before reset index: \n", 418 | " Name Age Marks\n", 419 | "s1 Joe 20 85.10\n", 420 | "s2 Nat 21 77.80\n", 421 | "s3 Harry 19 91.54\n", 422 | "\n", 423 | "After reset index: \n", 424 | " ID Name Age Marks\n", 425 | "0 s1 Joe 20 85.10\n", 426 | "1 s2 Nat 21 77.80\n", 427 | "2 s3 Harry 19 91.54\n" 428 | ] 429 | } 430 | ], 431 | "source": [ 432 | "import pandas as pd\n", 433 | "\n", 434 | "student_dict = {'Name': ['Joe', 'Nat', 'Harry'], 'Age': [20, 21, 19], 'Marks': [85.10, 77.80, 91.54]}\n", 435 | "\n", 436 | "# create DataFrame from dict\n", 437 | "student_df = pd.DataFrame(student_dict, index=['s1', 's2', 's3'])\n", 438 | "print(\"Before reset index: \\n\", student_df)\n", 439 | "\n", 440 | "# reset index and rename\n", 441 | "student_df = 
student_df.reset_index().rename(columns={'index': 'ID'})\n", 442 | "print(\"\\nAfter reset index: \\n\", student_df)" 443 | ] 444 | }, 445 | { 446 | "cell_type": "markdown", 447 | "metadata": {}, 448 | "source": [ 449 | "## Reset multi-level index\n", 450 | "\n", 451 | "When a pandas DataFrame has multiple levels of row index or column headers, it is called a **multi-level** or **hierarchical** DataFrame. We can apply **`DataFrame.reset_index()`** on such a multi-index DataFrame.\n", 452 | "\n", 453 | "The below diagram shows a hierarchical DataFrame of student data with two levels of column headers, where column labels **`Name`** and **`Marks`** are at level 0 and **`Surname`** and **`Percentage`** are at level 1. Similarly, it has two levels of row index, where index **`Standard`** is at level 0 and **`Class`** is at level 1 of the DataFrame.\n", 454 | "\n", 455 | "
\n", 456 | "\n", 457 | "
\n", 458 | "\n", 459 | "**Example:**\n", 460 | "\n", 461 | "The below example shows how to create such a DataFrame." 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": 8, 467 | "metadata": { 468 | "ExecuteTime": { 469 | "end_time": "2021-06-17T12:08:40.561440Z", 470 | "start_time": "2021-06-17T12:08:40.541911Z" 471 | }, 472 | "scrolled": false 473 | }, 474 | "outputs": [ 475 | { 476 | "name": "stdout", 477 | "output_type": "stream", 478 | "text": [ 479 | "\n", 480 | "After reset index: \n", 481 | " Name Marks\n", 482 | " Surname Percentage\n", 483 | "Standard Class \n", 484 | "Standard 1 Class A Joe 91.56\n", 485 | " Class B Nat 87.90\n", 486 | "Standard 2 Class A Harry 70.10\n", 487 | " Class B Sam 65.48\n" 488 | ] 489 | } 490 | ], 491 | "source": [ 492 | "import pandas as pd\n", 493 | "\n", 494 | "index = pd.MultiIndex.from_tuples([('Standard 1', 'Class A'),\n", 495 | " ('Standard 1', 'Class B'),\n", 496 | " ('Standard 2', 'Class A'),\n", 497 | " ('Standard 2', 'Class B')],\n", 498 | " names=['Standard', 'Class'])\n", 499 | "columns = pd.MultiIndex.from_tuples([('Name', 'Surname'),\n", 500 | " ('Marks', 'Percentage')])\n", 501 | "\n", 502 | "# create multi-index dataframe\n", 503 | "student_df = pd.DataFrame([('Joe', 91.56),\n", 504 | " ('Nat', 87.90),\n", 505 | " ('Harry', 70.10),\n", 506 | " ('Sam', 65.48)],\n", 507 | " index=index, columns=columns)\n", 508 | "print(\"\\nAfter reset index: \\n\", student_df)" 509 | ] 510 | }, 511 | { 512 | "cell_type": "markdown", 513 | "metadata": {}, 514 | "source": [ 515 | "Now we see how to reset the index of the multi-level DataFrame using **`DataFrame.reset_index()`**. By default, it resets the index of all the levels and adds the new range of indexes in the DataFrame."
516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": 9, 521 | "metadata": { 522 | "ExecuteTime": { 523 | "end_time": "2021-06-17T12:08:42.041400Z", 524 | "start_time": "2021-06-17T12:08:42.014062Z" 525 | } 526 | }, 527 | "outputs": [ 528 | { 529 | "name": "stdout", 530 | "output_type": "stream", 531 | "text": [ 532 | " Standard Class Name Marks\n", 533 | " Surname Percentage\n", 534 | "0 Standard 1 Class A Joe 91.56\n", 535 | "1 Standard 1 Class B Nat 87.90\n", 536 | "2 Standard 2 Class A Harry 70.10\n", 537 | "3 Standard 2 Class B Sam 65.48\n" 538 | ] 539 | } 540 | ], 541 | "source": [ 542 | "# reset multi-level index\n", 543 | "student_df = student_df.reset_index()\n", 544 | "print(student_df)" 545 | ] 546 | }, 547 | { 548 | "cell_type": "markdown", 549 | "metadata": {}, 550 | "source": [ 551 | "### Reset index by level\n", 552 | "\n", 553 | "As we have seen, in the case of a multi-level index, by default **`DataFrame.reset_index()`** applies to the index of all the levels. If we want to reset the index of the specific level only then, we can use the **`level`** parameter of the **`DataFrame.reset_index()`** function.\n", 554 | "\n", 555 | "It takes a **level position** or **level name** as input to reset that particular index only.\n", 556 | "\n", 557 | "**Example:**\n", 558 | "\n", 559 | "In the below example, we reset the index of the **`'Standard'`** level only." 
560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": 10, 565 | "metadata": { 566 | "ExecuteTime": { 567 | "end_time": "2021-06-17T12:08:43.536499Z", 568 | "start_time": "2021-06-17T12:08:43.510137Z" 569 | } 570 | }, 571 | "outputs": [ 572 | { 573 | "name": "stdout", 574 | "output_type": "stream", 575 | "text": [ 576 | " Standard Name Marks\n", 577 | " Surname Percentage\n", 578 | "Class \n", 579 | "Class A Standard 1 Joe 91.56\n", 580 | "Class B Standard 1 Nat 87.90\n", 581 | "Class A Standard 2 Harry 70.10\n", 582 | "Class B Standard 2 Sam 65.48\n" 583 | ] 584 | } 585 | ], 586 | "source": [ 587 | "student_df = pd.DataFrame([('Joe', 91.56),\n", 588 | " ('Nat', 87.90),\n", 589 | " ('Harry', 70.10),\n", 590 | " ('Sam', 65.48)],\n", 591 | " index=index, columns=columns)\n", 592 | "\n", 593 | "# reset multi-level index by level\n", 594 | "student_df = student_df.reset_index(level='Standard')\n", 595 | "print(student_df)" 596 | ] 597 | }, 598 | { 599 | "cell_type": "markdown", 600 | "metadata": {}, 601 | "source": [ 602 | "### Reset index and creates new column in level\n", 603 | "\n", 604 | "As we have observed in the above section, by default, **`DataFrame.reset_index()`** adds the new column at the first level, i.e., level 0. If we want to add the new index column to other levels, we can use the **`col_level`** parameter.\n", 605 | "\n", 606 | "It takes the level name or level position as an input if the columns have multiple levels, so it determines which level the labels are inserted into.\n", 607 | "\n", 608 | "**Example:**\n", 609 | "\n", 610 | "In the below example, it resets the index of level **`Standard`** only and adds it as a new column at level 1."
611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": 11, 616 | "metadata": { 617 | "ExecuteTime": { 618 | "end_time": "2021-06-17T12:08:44.861195Z", 619 | "start_time": "2021-06-17T12:08:44.839221Z" 620 | } 621 | }, 622 | "outputs": [ 623 | { 624 | "name": "stdout", 625 | "output_type": "stream", 626 | "text": [ 627 | " Name Marks\n", 628 | " Standard Surname Percentage\n", 629 | "Class \n", 630 | "Class A Standard 1 Joe 91.56\n", 631 | "Class B Standard 1 Nat 87.90\n", 632 | "Class A Standard 2 Harry 70.10\n", 633 | "Class B Standard 2 Sam 65.48\n" 634 | ] 635 | } 636 | ], 637 | "source": [ 638 | "student_df = pd.DataFrame([('Joe', 91.56),\n", 639 | " ('Nat', 87.90),\n", 640 | " ('Harry', 70.10),\n", 641 | " ('Sam', 65.48)],\n", 642 | " index=index, columns=columns)\n", 643 | "\n", 644 | "# reset multi-level index and add at level 1\n", 645 | "student_df = student_df.reset_index(level='Standard', col_level=1)\n", 646 | "print(student_df)" 647 | ] 648 | }, 649 | { 650 | "cell_type": "markdown", 651 | "metadata": {}, 652 | "source": [ 653 | "### Reset index and name other level\n", 654 | "\n", 655 | "As we see in the above section, in multi-level DataFrame, we have added the **`Standard`** index at level 1. If there is a case when we need to rename the other level, we need to use the **`col_fill`** parameter of DataFrame.\n", 656 | "\n", 657 | "We can specify any existing column label under which the new column will be assigned. If we specify the new label, then it will create one.\n", 658 | "\n", 659 | "**Example:**\n", 660 | "\n", 661 | "In the below example, we create a new column from the index **`Standard`** at level 1 and assign a new column label **`New_Header`** at level 0 of this new column." 
662 | ] 663 | }, 664 | { 665 | "cell_type": "code", 666 | "execution_count": 12, 667 | "metadata": { 668 | "ExecuteTime": { 669 | "end_time": "2021-06-17T12:08:46.574064Z", 670 | "start_time": "2021-06-17T12:08:46.550629Z" 671 | } 672 | }, 673 | "outputs": [ 674 | { 675 | "name": "stdout", 676 | "output_type": "stream", 677 | "text": [ 678 | " New_Header Name Marks\n", 679 | " Standard Surname Percentage\n", 680 | "Class \n", 681 | "Class A Standard 1 Joe 91.56\n", 682 | "Class B Standard 1 Nat 87.90\n", 683 | "Class A Standard 2 Harry 70.10\n", 684 | "Class B Standard 2 Sam 65.48\n" 685 | ] 686 | } 687 | ], 688 | "source": [ 689 | "student_df = pd.DataFrame([('Joe', 91.56),\n", 690 | " ('Nat', 87.90),\n", 691 | " ('Harry', 70.10),\n", 692 | " ('Sam', 65.48)],\n", 693 | " index=index, columns=columns)\n", 694 | "\n", 695 | "# reset multi-level index\n", 696 | "student_df = student_df.reset_index(level='Standard', col_level=1, col_fill='New_Header')\n", 697 | "print(student_df)" 698 | ] 699 | }, 700 | { 701 | "cell_type": "code", 702 | "execution_count": null, 703 | "metadata": {}, 704 | "outputs": [], 705 | "source": [] 706 | } 707 | ], 708 | "metadata": { 709 | "hide_input": false, 710 | "kernelspec": { 711 | "display_name": "Python 3", 712 | "language": "python", 713 | "name": "python3" 714 | }, 715 | "language_info": { 716 | "codemirror_mode": { 717 | "name": "ipython", 718 | "version": 3 719 | }, 720 | "file_extension": ".py", 721 | "mimetype": "text/x-python", 722 | "name": "python", 723 | "nbconvert_exporter": "python", 724 | "pygments_lexer": "ipython3", 725 | "version": "3.8.8" 726 | }, 727 | "toc": { 728 | "base_numbering": 1, 729 | "nav_menu": {}, 730 | "number_sections": true, 731 | "sideBar": true, 732 | "skip_h1_title": false, 733 | "title_cell": "Table of Contents", 734 | "title_sidebar": "Contents", 735 | "toc_cell": false, 736 | "toc_position": {}, 737 | "toc_section_display": true, 738 | "toc_window_display": false 739 | }, 740 | "varInspector": { 
741 | "cols": { 742 | "lenName": 16, 743 | "lenType": 16, 744 | "lenVar": 40 745 | }, 746 | "kernels_config": { 747 | "python": { 748 | "delete_cmd_postfix": "", 749 | "delete_cmd_prefix": "del ", 750 | "library": "var_list.py", 751 | "varRefreshCmd": "print(var_dic_list())" 752 | }, 753 | "r": { 754 | "delete_cmd_postfix": ") ", 755 | "delete_cmd_prefix": "rm(", 756 | "library": "var_list.r", 757 | "varRefreshCmd": "cat(var_dic_list()) " 758 | } 759 | }, 760 | "types_to_exclude": [ 761 | "module", 762 | "function", 763 | "builtin_function_or_method", 764 | "instance", 765 | "_Feature" 766 | ], 767 | "window_display": false 768 | } 769 | }, 770 | "nbformat": 4, 771 | "nbformat_minor": 2 772 | } 773 | -------------------------------------------------------------------------------- /001_Python_Pandas_Methods/001_Python_Pandas_DataFrame_from_Dictionary.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\n", 8 | "All the IPython Notebooks in this lecture series by Dr. Milan Parmar are available @ **[GitHub](https://github.com/milaan9/10_Python_Pandas_Module/tree/main/001_Python_Pandas_Methods)**\n", 9 | "" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# Create Pandas DataFrame from Python Dictionary\n", 17 | "\n", 18 | "In this class shows how to convert a **[Python dictionary](https://github.com/milaan9/02_Python_Datatypes/blob/main/005_Python_Dictionary.ipynb)** to **[Pandas DataFrame](https://github.com/milaan9/10_Python_Pandas_Module/blob/main/001_Python_Pandas_DataFrame.ipynb)**. It covers the creating DataFrame from all types of dictionaries using the DataFrame constructor and **`from_dict()`** method.\n", 19 | "\n", 20 | "And at the end of this class, we summarize the usage of both ways with the comparison. So please stay till the end." 
21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "A Python dictionary is a data structure that stores data in key-value pairs. Converting data from dictionary format to a DataFrame makes it well suited for analysis using the functions of DataFrame.\n", 28 | "\n", 29 | "There are multiple ways to convert a Python dictionary object into a Pandas DataFrame. The most commonly used ways are,\n", 30 | "\n", 31 | "1. DataFrame constructor\n", 32 | "2. **`from_dict()`**" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "## Create DataFrame from dict using constructor\n", 40 | "\n", 41 | "The **DataFrame constructor** can be used to create a DataFrame from different data structures in python like dict, list, set, tuple, and ndarray.\n", 42 | "\n", 43 | "**Example:** \n", 44 | "\n", 45 | "We create a DataFrame object using a dictionary object containing student data.\n", 46 | "\n", 47 | "
\n", 48 | "\n", 49 | "
" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 1, 55 | "metadata": { 56 | "ExecuteTime": { 57 | "end_time": "2021-06-17T11:43:12.573656Z", 58 | "start_time": "2021-06-17T11:43:11.829037Z" 59 | }, 60 | "scrolled": true 61 | }, 62 | "outputs": [ 63 | { 64 | "name": "stdout", 65 | "output_type": "stream", 66 | "text": [ 67 | "{'name': ['Joe', 'Nat', 'Harry'], 'age': [20, 21, 19], 'marks': [85.1, 77.8, 91.54]}\n", 68 | " name age marks\n", 69 | "0 Joe 20 85.10\n", 70 | "1 Nat 21 77.80\n", 71 | "2 Harry 19 91.54\n" 72 | ] 73 | } 74 | ], 75 | "source": [ 76 | "import pandas as pd\n", 77 | "\n", 78 | "# Create dict object\n", 79 | "student_dict = {\"name\": [\"Joe\", \"Nat\", \"Harry\"], \"age\": [20, 21, 19], \"marks\": [85.10, 77.80, 91.54]}\n", 80 | "print(student_dict)\n", 81 | "\n", 82 | "# Create DataFrame from dict\n", 83 | "student_df = pd.DataFrame(student_dict)\n", 84 | "print(student_df)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | ">**Note:** When you convert a **`dict`** to DataFrame by default, all the keys of the **`dict`** object becomes columns, and the range of numbers 0, 1, 2,…,n is assigned as a row index." 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "## DataFrame from dict with required columns only\n", 99 | "\n", 100 | "While converting the whole **`dict`** to DataFrame, we may need only some of the columns to be included in the resulting DataFrame.\n", 101 | "\n", 102 | "We can select only required columns by passing list column labels to **`columns=['col1', 'col2']`** parameter in the constructor.\n", 103 | "\n", 104 | "**Example:**\n", 105 | "\n", 106 | "In the case of student DataFrame for analyzing the annual score, we need only student **`name`** and **`marks`** whereas the **`age`** column is not required. We can select only required columns, as shown in the below example." 
107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 2, 112 | "metadata": { 113 | "ExecuteTime": { 114 | "end_time": "2021-06-17T11:43:14.927142Z", 115 | "start_time": "2021-06-17T11:43:14.899802Z" 116 | } 117 | }, 118 | "outputs": [ 119 | { 120 | "name": "stdout", 121 | "output_type": "stream", 122 | "text": [ 123 | "{'name': ['Joe', 'Nat', 'Harry'], 'age': [20, 21, 19], 'marks': [85.1, 77.8, 91.54]}\n", 124 | " name marks\n", 125 | "0 Joe 85.10\n", 126 | "1 Nat 77.80\n", 127 | "2 Harry 91.54\n" 128 | ] 129 | } 130 | ], 131 | "source": [ 132 | "import pandas as pd\n", 133 | "\n", 134 | "# Create dict object\n", 135 | "student_dict = {\"name\": [\"Joe\", \"Nat\", \"Harry\"], \"age\": [20, 21, 19], \"marks\": [85.10, 77.80, 91.54]}\n", 136 | "print(student_dict)\n", 137 | "\n", 138 | "# Create DataFrame from dict\n", 139 | "student_df = pd.DataFrame(student_dict, columns=[\"name\", \"marks\"])\n", 140 | "print(student_df)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "## DataFrame from dict with user-defined indexes\n", 148 | "\n", 149 | "In pandas DataFrame, each row has an index that is used to identify each row. In some cases, we need to provide a customized index for each row. We can do that while creating the DataFrame from **`dict`** using the **`index`** parameter of the DataFrame constructor.\n", 150 | "\n", 151 | "The default index is a range of integers starting from 0 up to the number of rows minus one. We can pass a list of the row indexes as **`index=['index1','index2']`** to the DataFrame constructor.\n", 152 | "\n", 153 | "**Example:**\n", 154 | "\n", 155 | "In the below example, we have given a custom index for each student, making it more readable and easy to access the row using it."
156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 3, 161 | "metadata": { 162 | "ExecuteTime": { 163 | "end_time": "2021-06-17T11:43:17.231317Z", 164 | "start_time": "2021-06-17T11:43:17.218623Z" 165 | } 166 | }, 167 | "outputs": [ 168 | { 169 | "name": "stdout", 170 | "output_type": "stream", 171 | "text": [ 172 | "{'name': ['Joe', 'Nat', 'Harry'], 'age': [20, 21, 19], 'marks': [85.1, 77.8, 91.54]}\n", 173 | " name age marks\n", 174 | "stud1 Joe 20 85.10\n", 175 | "stud2 Nat 21 77.80\n", 176 | "stud3 Harry 19 91.54\n" 177 | ] 178 | } 179 | ], 180 | "source": [ 181 | "# import pandas library\n", 182 | "import pandas as pd\n", 183 | "\n", 184 | "# Create dict object\n", 185 | "student_dict = {\"name\": [\"Joe\", \"Nat\", \"Harry\"], \"age\": [20, 21, 19], \"marks\": [85.10, 77.80, 91.54]}\n", 186 | "print(student_dict)\n", 187 | "\n", 188 | "# Create DataFrame from dict\n", 189 | "student_df = pd.DataFrame(student_dict, index=[\"stud1\", \"stud2\", \"stud3\"])\n", 190 | "print(student_df)" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": {}, 196 | "source": [ 197 | "## DataFrame from dict by changing the column data type\n", 198 | "\n", 199 | "By default, while creating a DataFrame from **`dict`** using constructor, it keeps the original data type of the values in dict. But, if we need to change the data type of the data in the resulting DataFrame, we can use the **`dtype`** parameter in the constructor.\n", 200 | "\n", 201 | "Only one data type is allowed to specify as **`dtype='data_type'`** which will be applicable for all the data in the resultant DataFrame. If we do not force such a data type, it internally infers from the Data.\n", 202 | "\n", 203 | "**Example:**\n", 204 | "\n", 205 | "As you can see below example, we are trying to change the data type to **float64** for all the columns. 
But, it changes the data type of **`age`** and **`marks`** columns only to **float64** even though the **`marks`** column type was **`object`**. But, the **`name`** column type is not changed because string values in that column cannot be converted to **float64**." 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 4, 211 | "metadata": { 212 | "ExecuteTime": { 213 | "end_time": "2021-06-17T11:43:18.680516Z", 214 | "start_time": "2021-06-17T11:43:18.659036Z" 215 | } 216 | }, 217 | "outputs": [ 218 | { 219 | "name": "stdout", 220 | "output_type": "stream", 221 | "text": [ 222 | "DataFrame with inferred data type : \n", 223 | " name object\n", 224 | "age int64\n", 225 | "marks object\n", 226 | "dtype: object\n", 227 | "DataFrame with changed data type : \n", 228 | " name object\n", 229 | "age float64\n", 230 | "marks float64\n", 231 | "dtype: object\n", 232 | " name age marks\n", 233 | "0 Joe 20.0 85.00\n", 234 | "1 Nat 21.0 77.00\n", 235 | "2 Harry 19.0 91.54\n" 236 | ] 237 | } 238 | ], 239 | "source": [ 240 | "# import pandas library\n", 241 | "import pandas as pd\n", 242 | "\n", 243 | "# Create dict object\n", 244 | "student_dict = {\"name\": [\"Joe\", \"Nat\", \"Harry\"], \"age\": [20, 21, 19], \"marks\": [\"85\", \"77\", \"91.54\"]}\n", 245 | "\n", 246 | "# Create DataFrame from dict\n", 247 | "student_df = pd.DataFrame(student_dict)\n", 248 | "print(\"DataFrame with inferred data type : \\n\", student_df.dtypes)\n", 249 | "\n", 250 | "student_df = pd.DataFrame(student_dict, dtype=\"float64\")\n", 251 | "print(\"DataFrame with changed data type : \\n\", student_df.dtypes)\n", 252 | "\n", 253 | "print(student_df)" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": { 259 | "ExecuteTime": { 260 | "end_time": "2021-05-29T14:10:13.245353Z", 261 | "start_time": "2021-05-29T14:10:13.213129Z" 262 | } 263 | }, 264 | "source": [ 265 | ">**Note:** It changes the data type only if it is compatible with the new data type. 
Otherwise, it keeps the original data type." 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 5, 271 | "metadata": { 272 | "ExecuteTime": { 273 | "end_time": "2021-06-17T11:43:22.590141Z", 274 | "start_time": "2021-06-17T11:43:22.554986Z" 275 | } 276 | }, 277 | "outputs": [ 278 | { 279 | "name": "stdout", 280 | "output_type": "stream", 281 | "text": [ 282 | " index company body-style wheel-base length engine-type \\\n", 283 | "0 0 alfa-romero convertible 88.6 168.8 dohc \n", 284 | "1 1 alfa-romero convertible 88.6 168.8 dohc \n", 285 | "2 2 alfa-romero hatchback 94.5 171.2 ohcv \n", 286 | "3 3 audi sedan 99.8 176.6 ohc \n", 287 | "4 4 audi sedan 99.4 176.6 ohc \n", 288 | ".. ... ... ... ... ... ... \n", 289 | "56 81 volkswagen sedan 97.3 171.7 ohc \n", 290 | "57 82 volkswagen sedan 97.3 171.7 ohc \n", 291 | "58 86 volkswagen sedan 97.3 171.7 ohc \n", 292 | "59 87 volvo sedan 104.3 188.8 ohc \n", 293 | "60 88 volvo wagon 104.3 188.8 ohc \n", 294 | "\n", 295 | " num-of-cylinders horsepower average-mileage price \n", 296 | "0 four 111 21 13495.0 \n", 297 | "1 four 111 21 16500.0 \n", 298 | "2 six 154 19 16500.0 \n", 299 | "3 four 102 24 13950.0 \n", 300 | "4 five 115 18 17450.0 \n", 301 | ".. ... ... ... ... 
\n", 302 | "56 four 85 27 7975.0 \n", 303 | "57 four 52 37 7995.0 \n", 304 | "58 four 100 26 9995.0 \n", 305 | "59 four 114 23 12940.0 \n", 306 | "60 four 114 23 13415.0 \n", 307 | "\n", 308 | "[61 rows x 10 columns]\n" 309 | ] 310 | } 311 | ], 312 | "source": [ 313 | "cars = pd.read_csv(\"automobile_data.csv\")\n", 314 | "print(cars)" 315 | ] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "metadata": {}, 320 | "source": [ 321 | "## DataFrame from dict with a single value\n", 322 | "\n", 323 | "If we have a **`dict`** with only single values for each key and need to convert such **`dict`** to the DataFrame, we can use the DataFrame constructor.\n", 324 | "\n", 325 | "In such a case, it converts the **`dict`** to DataFrame as we have seen before, like keys of the **`dict`** will be column labels and values will be the column data. But, we must provide the index parameter to give the row index. Else it throws an error,\n", 326 | "\n", 327 | "```python\n", 328 | "ValueError: If using all scalar values, you must pass an index\n", 329 | "```\n", 330 | "**Example:**\n", 331 | "\n", 332 | "In the below example, we have provided the customized **`index=['stud1']`** to the DataFrame." 
333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 6, 338 | "metadata": { 339 | "ExecuteTime": { 340 | "end_time": "2021-06-17T11:43:29.882536Z", 341 | "start_time": "2021-06-17T11:43:29.866912Z" 342 | } 343 | }, 344 | "outputs": [ 345 | { 346 | "name": "stdout", 347 | "output_type": "stream", 348 | "text": [ 349 | "{'name': 'Smith', 'age': 22, 'marks': 88.9}\n", 350 | " name age marks\n", 351 | "stud1 Smith 22 88.9\n" 352 | ] 353 | } 354 | ], 355 | "source": [ 356 | "# import pandas library\n", 357 | "import pandas as pd\n", 358 | "\n", 359 | "# Create dict object\n", 360 | "student_dict = {'name': 'Smith', 'age': 22, 'marks': 88.90}\n", 361 | "print(student_dict)\n", 362 | "\n", 363 | "student_df = pd.DataFrame(student_dict, index=['stud1'])\n", 364 | "print(student_df)" 365 | ] 366 | }, 367 | { 368 | "cell_type": "markdown", 369 | "metadata": {}, 370 | "source": [ 371 | "## DataFrame from dict with key and value as a column\n", 372 | "\n", 373 | "Suppose we have a dictionary object where the key is the **student’s name**, and the value is the student’s marks. And we want the keys in one column and all the values in another column of the DataFrame.\n", 374 | "\n", 375 | "For that, rather than passing a whole **`dict`** object, we need to pass each key-value pair in the dictionary to the DataFrame constructor to create a new DataFrame.\n", 376 | "\n", 377 | "We can get the entry of key-value pair using **`dict.items()`** and pass that function to the constructor.\n", 378 | "\n", 379 | "**Example:**\n", 380 | "\n", 381 | "As shown in the below example, we need to pass an entry of key-value to the constructor and give **`column`** labels using columns parameter." 
382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": 7, 387 | "metadata": { 388 | "ExecuteTime": { 389 | "end_time": "2021-06-17T11:43:50.961390Z", 390 | "start_time": "2021-06-17T11:43:50.924284Z" 391 | } 392 | }, 393 | "outputs": [ 394 | { 395 | "name": "stdout", 396 | "output_type": "stream", 397 | "text": [ 398 | "{'Joe': 85.1, 'Nat': 75.83, 'Harry': 69.7}\n", 399 | " name marks\n", 400 | "0 Joe 85.10\n", 401 | "1 Nat 75.83\n", 402 | "2 Harry 69.70\n" 403 | ] 404 | } 405 | ], 406 | "source": [ 407 | "import pandas as pd\n", 408 | "\n", 409 | "# Create dict object\n", 410 | "student_dict = {\"Joe\": 85.10, \"Nat\": 75.83, \"Harry\": 69.70}\n", 411 | "print(student_dict)\n", 412 | "\n", 413 | "# Create DataFrame from dict\n", 414 | "student_df = pd.DataFrame(student_dict.items(), columns=[\"name\", \"marks\"])\n", 415 | "print(student_df)" 416 | ] 417 | }, 418 | { 419 | "cell_type": "markdown", 420 | "metadata": {}, 421 | "source": [ 422 | "## Create DataFrame from list of dict\n", 423 | "\n", 424 | "For the sake of our understanding, consider the case where each school stores data of students in a dictionary data structure. Each school stores different information about students. For example, some schools store a student’s hobby, whereas other schools store only academic information. If we want to analyze data of all the students from the city, we need to gather all this information into the DataFrame.\n", 425 | "\n", 426 | "Such a list of **`dict`** from different schools can be converted to a single DataFrame using either the **`DataFrame.from_dict()`** function or the DataFrame constructor.\n", 427 | "\n", 428 | "By default, keys of all the different dictionary objects are converted into columns of resultant DataFrame. 
It handles the missing keys by adding NaN where the values for the column are missing.\n", 429 | "\n", 430 | "**Example:**\n", 431 | "\n", 432 | "Let’s see how we can use a constructor to create DataFrame from different dictionary objects." 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": 8, 438 | "metadata": { 439 | "ExecuteTime": { 440 | "end_time": "2021-06-17T11:43:55.527743Z", 441 | "start_time": "2021-06-17T11:43:55.510653Z" 442 | } 443 | }, 444 | "outputs": [ 445 | { 446 | "name": "stdout", 447 | "output_type": "stream", 448 | "text": [ 449 | "[{'name': 'Joe', 'age': 20, 'marks': '85.58', 'hobby': 'smimming'}, {'name': 'Nat', 'age': 21, 'marks': '77.21'}, {'name': 'Harry', 'age': 19, 'marks': '91.54'}]\n", 450 | " name age marks hobby\n", 451 | "0 Joe 20 85.58 smimming\n", 452 | "1 Nat 21 77.21 NaN\n", 453 | "2 Harry 19 91.54 NaN\n" 454 | ] 455 | } 456 | ], 457 | "source": [ 458 | "import pandas as pd\n", 459 | "\n", 460 | "# Create dict object\n", 461 | "student_dict = [{\"name\": \"Joe\", \"age\": 20, \"marks\": \"85.58\", \"hobby\": \"smimming\"},\n", 462 | " {\"name\": \"Nat\", \"age\": 21, \"marks\": \"77.21\", },\n", 463 | " {\"name\": \"Harry\", \"age\": 19, \"marks\": \"91.54\"}]\n", 464 | "print(student_dict)\n", 465 | "\n", 466 | "# Create DataFrame object\n", 467 | "student_df = pd.DataFrame(student_dict)\n", 468 | "print(student_df)" 469 | ] 470 | }, 471 | { 472 | "cell_type": "markdown", 473 | "metadata": {}, 474 | "source": [ 475 | "## The `from_dict()` function\n", 476 | "\n", 477 | "This is another way of creating DataFrame from a Python dictionary using **`DataFrame.from_dict() `** method.\n", 478 | "\n", 479 | ">**Note:** This method is useful for the cases when you need to **transpose** the DataFrame i.e. when we need the keys in the dictionary object as rows in the resultant DataFrame. 
In all the other cases, the DataFrame constructor should be preferred.\n", 480 | "\n", 481 | "**Syntax:**\n", 482 | "\n", 483 | "```python\n", 484 | "DataFrame.from_dict(data, orient='columns', dtype=None, columns=None)\n", 485 | "```\n", 486 | "\n", 487 | "**Parameters:**\n", 488 | "\n", 489 | "1. **`data`**: It takes **`dict`**, **`list`**, **`set`**, **`ndarray`**, **`Iterable`**, or **`DataFrame`** as input. An empty DataFrame will be created if it is not provided. The resultant column order follows the insertion order.\n", 490 | "\n", 491 | "\n", 492 | "2. **`orient`**: (Optional) If the keys of the **`dict`** should be the rows of the DataFrame, then set **`orient='index'`**; else set it to **`orient='columns'`** (the default) if the keys should be columns of the resultant DataFrame.\n", 493 | "\n", 494 | "\n", 495 | "3. **`dtype`**: (Optional) data type to force on resulting DataFrame. Only a single data type is allowed. If not given, then it’s inferred from the data.\n", 496 | "\n", 497 | "\n", 498 | "4. **`columns`**: (Optional) **Can only be used in case of** **`orient=\"index\"`** to specify column labels in the resulting DataFrame. Default column labels are a range of integers, i.e., 0,1,2…n. \n", 499 | ">**Note:** If we use the **`columns`** parameter with **`orient='columns'`** it throws a **`ValueError: cannot use columns parameter with orient='columns'`**" 500 | ] 501 | }, 502 | { 503 | "cell_type": "markdown", 504 | "metadata": {}, 505 | "source": [ 506 | "## DataFrame from dict with dict keys as a row\n", 507 | "\n", 508 | "It is used to transpose the DataFrame, i.e., when keys in the dictionary should be the rows in the resultant DataFrame. We can change the orientation of the DataFrame using a parameter **`orient=\"index\"`** in **`DataFrame.from_dict()`**.\n", 509 | "\n", 510 | "**Example:**\n", 511 | "\n", 512 | "In the below example, keys **`name`**, **`age`**, and **`marks`** become row indexes in the DataFrame, and values are added in respective rows. 
New column labels are provided using **`columns`** parameter." 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": 9, 518 | "metadata": { 519 | "ExecuteTime": { 520 | "end_time": "2021-06-17T11:44:17.327291Z", 521 | "start_time": "2021-06-17T11:44:17.309714Z" 522 | } 523 | }, 524 | "outputs": [ 525 | { 526 | "name": "stdout", 527 | "output_type": "stream", 528 | "text": [ 529 | "{'name': ['Joe', 'Nat', 'Harry'], 'age': [20, 21, 19], 'marks': [85.1, 77.8, 91.54]}\n", 530 | " stud1 stud2 stud3\n", 531 | "name Joe Nat Harry\n", 532 | "age 20 21 19\n", 533 | "marks 85.1 77.8 91.54\n" 534 | ] 535 | } 536 | ], 537 | "source": [ 538 | "import pandas as pd\n", 539 | "\n", 540 | "# Create dict object\n", 541 | "student_dict = {\"name\": [\"Joe\", \"Nat\", \"Harry\"], \"age\": [20, 21, 19], \"marks\": [85.10, 77.80, 91.54]}\n", 542 | "print(student_dict)\n", 543 | "\n", 544 | "# Create DataFrame from dict\n", 545 | "student_df = pd.DataFrame.from_dict(student_dict, orient=\"index\", columns=[\"stud1\", \"stud2\", \"stud3\"])\n", 546 | "print(student_df)" 547 | ] 548 | }, 549 | { 550 | "cell_type": "markdown", 551 | "metadata": {}, 552 | "source": [ 553 | "## DataFrame from dict where values are variable-length lists\n", 554 | "\n", 555 | "It is a widespread use case in the IT industry where data is stored in the dictionary with different values against each key.\n", 556 | "\n", 557 | "If such a dictionary object needs to be converted into the DataFrame such that keys and values will be added as columns in DataFrame. Then it can be done using chaining of **`DataFrame.from_dict()`**, **`stack()`**, and **`reset_index()`** functions.\n", 558 | "\n", 559 | "**Example:**\n", 560 | "\n", 561 | "Here, we have **`dict`** with values are of different sizes and still we need to add all the key-values into a DataFrame." 
562 | ] 563 | }, 564 | { 565 | "cell_type": "code", 566 | "execution_count": 10, 567 | "metadata": { 568 | "ExecuteTime": { 569 | "end_time": "2021-06-17T11:44:25.307177Z", 570 | "start_time": "2021-06-17T11:44:25.279837Z" 571 | } 572 | }, 573 | "outputs": [ 574 | { 575 | "name": "stdout", 576 | "output_type": "stream", 577 | "text": [ 578 | "{'Grade A': ['Joe', 'Harry'], 'Grade B': ['Nat']}\n", 579 | " level_0 0\n", 580 | "0 Grade A Joe\n", 581 | "1 Grade A Harry\n", 582 | "0 Grade B Nat\n" 583 | ] 584 | } 585 | ], 586 | "source": [ 587 | "import pandas as pd\n", 588 | "\n", 589 | "# Create dict object\n", 590 | "student_dict = {\"Grade A\": [\"Joe\", \"Harry\"], \"Grade B\": [\"Nat\"]}\n", 591 | "print(student_dict)\n", 592 | "\n", 593 | "student_df = pd.DataFrame.from_dict(student_dict, 'index').stack().reset_index(level=0)\n", 594 | "print(student_df)" 595 | ] 596 | }, 597 | { 598 | "cell_type": "markdown", 599 | "metadata": {}, 600 | "source": [ 601 | "## DataFrame from dict nested dict\n", 602 | "\n", 603 | "In this section, we cover the complex structure of the dictionary object where we have a hierarchical structure of the dictionary, i.e., one dictionary object nested inside another dictionary object.\n", 604 | "\n", 605 | "**Example:**\n", 606 | "\n", 607 | "We have a student dictionary object where student data is categorized by grade and further divided by class. Such a dictionary object is converted into the multi-index DataFrame using **`DataFrame.from_dict()`** by iterating over each key and its values and passing the parameter **`orient='index'`**." 
608 | ] 609 | }, 610 | { 611 | "cell_type": "code", 612 | "execution_count": 11, 613 | "metadata": { 614 | "ExecuteTime": { 615 | "end_time": "2021-06-17T11:44:33.987255Z", 616 | "start_time": "2021-06-17T11:44:33.965772Z" 617 | } 618 | }, 619 | "outputs": [ 620 | { 621 | "name": "stdout", 622 | "output_type": "stream", 623 | "text": [ 624 | "{'Grade A': {'Class A': {'name': 'Joe', 'marks': 91.56}, 'Class B': {'name': 'Harry', 'marks': 87.9}}, 'Grade B': {'Class A': {'name': 'Sam', 'marks': 70}, 'Class B': {'name': 'Alan', 'marks': 65.48}}}\n", 625 | " name marks\n", 626 | "Grade A Class A Joe 91.56\n", 627 | " Class B Harry 87.90\n", 628 | "Grade B Class A Sam 70.00\n", 629 | " Class B Alan 65.48\n" 630 | ] 631 | } 632 | ], 633 | "source": [ 634 | "import pandas as pd\n", 635 | "\n", 636 | "# Create hierarchical dict\n", 637 | "student_dict = {\"Grade A\": {'Class A': {'name': 'Joe', 'marks': 91.56},\n", 638 | " 'Class B': {'name': 'Harry', 'marks': 87.90}},\n", 639 | " \"Grade B\": {'Class A': {'name': 'Sam', 'marks': 70},\n", 640 | " 'Class B': {'name': 'Alan', 'marks': 65.48}}}\n", 641 | "print(student_dict)\n", 642 | "\n", 643 | "# Create multi-index DataFrame\n", 644 | "student_df = pd.DataFrame.from_dict({(i, j): student_dict[i][j]\n", 645 | " for i in student_dict.keys()\n", 646 | " for j in student_dict[i].keys()},\n", 647 | " orient='index')\n", 648 | "print(student_df)\n" 649 | ] 650 | }, 651 | { 652 | "cell_type": "markdown", 653 | "metadata": {}, 654 | "source": [ 655 | "## DataFrame constructor vs `from_dict()`\n", 656 | "\n", 657 | "The below table summarizes all the cases of converting **`dict`** to the DataFrame that we have already discussed in this article. 
It shows the comparison of using the DataFrame constructor and **`DataFrame.from_dict()`** method.\n", 658 | "\n", 659 | "It will help you to choose the correct function for converting the **`dict`** to the DataFrame.\n", 660 | "\n", 661 | "| Use Case | DataFrame constructor | **`from_dict()`** method |\n", 662 | "|:---- |:---- |:---- |\n", 663 | "| **`Custom column names`** | **Yes** | **No** | \n", 664 | "| **`custom index`** | **Yes** | **No** | \n", 665 | "| **`dict with a single value`** | **Yes** | **No** | \n", 666 | "| **`list of dict`** | **Yes** | **Yes** | \n", 667 | "| **`handle missing keys`** | **Yes** | **Yes** | \n", 668 | "| **`keys and values as columns`** | **Yes** | **Yes** | \n", 669 | "| **`change datatype`** | **Yes** | **Yes** | \n", 670 | "| **`Orient=column(Keys as columns)`** | **Yes** | **Yes** | \n", 671 | "| **`Orient=index(Keys as rows)`** | **No** | **Yes** | \n", 672 | "| **`Multi-index DataFrame`** | **No** | **Yes** | " 673 | ] 674 | }, 675 | { 676 | "cell_type": "code", 677 | "execution_count": null, 678 | "metadata": {}, 679 | "outputs": [], 680 | "source": [] 681 | } 682 | ], 683 | "metadata": { 684 | "hide_input": false, 685 | "kernelspec": { 686 | "display_name": "Python 3", 687 | "language": "python", 688 | "name": "python3" 689 | }, 690 | "language_info": { 691 | "codemirror_mode": { 692 | "name": "ipython", 693 | "version": 3 694 | }, 695 | "file_extension": ".py", 696 | "mimetype": "text/x-python", 697 | "name": "python", 698 | "nbconvert_exporter": "python", 699 | "pygments_lexer": "ipython3", 700 | "version": "3.8.8" 701 | }, 702 | "toc": { 703 | "base_numbering": 1, 704 | "nav_menu": {}, 705 | "number_sections": true, 706 | "sideBar": true, 707 | "skip_h1_title": false, 708 | "title_cell": "Table of Contents", 709 | "title_sidebar": "Contents", 710 | "toc_cell": false, 711 | "toc_position": {}, 712 | "toc_section_display": true, 713 | "toc_window_display": false 714 | }, 715 | "varInspector": { 716 | "cols": { 717 | 
"lenName": 16, 718 | "lenType": 16, 719 | "lenVar": 40 720 | }, 721 | "kernels_config": { 722 | "python": { 723 | "delete_cmd_postfix": "", 724 | "delete_cmd_prefix": "del ", 725 | "library": "var_list.py", 726 | "varRefreshCmd": "print(var_dic_list())" 727 | }, 728 | "r": { 729 | "delete_cmd_postfix": ") ", 730 | "delete_cmd_prefix": "rm(", 731 | "library": "var_list.r", 732 | "varRefreshCmd": "cat(var_dic_list()) " 733 | } 734 | }, 735 | "types_to_exclude": [ 736 | "module", 737 | "function", 738 | "builtin_function_or_method", 739 | "instance", 740 | "_Feature" 741 | ], 742 | "window_display": false 743 | } 744 | }, 745 | "nbformat": 4, 746 | "nbformat_minor": 2 747 | } 748 | -------------------------------------------------------------------------------- /001_Python_Pandas_Methods/009_Python_Pandas_DataFrame_set_index.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\n", 8 | "All the IPython Notebooks in this lecture series by Dr. Milan Parmar are available @ **[GitHub](https://github.com/milaan9/10_Python_Pandas_Module/tree/main/001_Python_Pandas_Methods)**\n", 9 | "" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# Set index in pandas DataFrame\n", 17 | "\n", 18 | "In this class, we learn how to set the index of the **[Pandas DataFrame](https://github.com/milaan9/10_Python_Pandas_Module/blob/main/001_Python_Pandas_DataFrame.ipynb)** using existing columns or a list of labels. We cover all the cases of replacing the existing row labels or assign new ones.\n", 19 | "\n", 20 | "DataFrame is the tabular structure in the Python pandas library. It represents each row and column by the label. 
Row label is called an **index**, whereas column label is called column index/header.\n", 21 | "\n", 22 | "By default, while creating a DataFrame, pandas assigns a range of numbers (starting at 0) as a row index. Row indexes are used to identify each row. We can set a new row index or replace the existing ones using **`DataFrame.set_index()`** function, which we discuss further in more detail." 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": { 28 | "heading_collapsed": true 29 | }, 30 | "source": [ 31 | "## The `DataFrame.set_index()` function\n", 32 | "\n", 33 | "This function is used to re-assign a row label using the existing column of the DataFrame. It can assign one or multiple columns as a row index. Let’s see how to use the **`DataFrame.set_index()`** function to set a row index or replace the existing one.\n", 34 | "\n", 35 | "**Syntax:**\n", 36 | "```python\n", 37 | "DataFrame.set_index(keys, drop=True, append=False, inplace=False, verify_integrity=False)\n", 38 | "```\n", 39 | "\n", 40 | "**Parameters:**\n", 41 | "\n", 42 | "1. **`keys`**: It takes a single column label or a list of column labels to set as an index. It also takes an array of new labels (for example, a pandas **`Index`** or **`Series`**) as input.\n", 43 | "2. **`drop`**: It is a flag to specify if columns to be used as the new index should be deleted from the DataFrame or not. By default, the value is **`True`**, i.e., assign the column as an index and **delete** it.\n", 44 | "3. **`append`**: It is a flag to specify whether to append columns to the existing index. By default, it is **`False`**, i.e., it replaces the current index rather than appending.\n", 45 | "4. **`inplace`**: It is used to specify whether to return a new DataFrame or update an existing one. It is a boolean flag with default **`False`**.\n", 46 | "5. 
**`verify_integrity`**: It is a boolean flag,\n", 47 | " * If **`True`**, then it checks the new index for duplicates and throws **`ValueError`** if any are found.\n", 48 | " * If **`False`**, then it defers the check until necessary.\n", 49 | " \n", 50 | ">**Note:** Setting to **`False`** will improve the performance of this method.\n", 51 | "\n", 52 | "**Return:**\n", 53 | "\n", 54 | "DataFrame with the changed row labels or None if **`inplace=True`**." 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "## Set index using a column\n", 62 | "\n", 63 | "How to set index in pandas DataFrame?\n", 64 | "\n", 65 | "1. **Create pandas DataFrame**\n", 66 | " * We can create a DataFrame from a CSV file or **`dict`**.\n", 67 | "\n", 68 | "\n", 69 | "2. **Identify the columns to set as index**\n", 70 | " * We can set a specific column or multiple columns as an index in pandas DataFrame. Create a list of column labels to be used to set an index.\n", 71 | "**`['col_label1', 'col_label2'...]`**\n", 72 | "\n", 73 | "\n", 74 | "3. **Use the** **`DataFrame.set_index()`** **function**\n", 75 | " * We need to pass the column or list of column labels as input to the DataFrame.set_index() function to set it as an index of DataFrame. **By default, these new index columns are deleted from the DataFrame**.\n", 76 | "**`df = df.set_index(['col_label1', 'col_label2'…])`**\n", 77 | "\n", 78 | "\n", 79 | "4. **Set the index in place**\n", 80 | " * We can use the parameter **`inplace`** to set the index in the existing DataFrame rather than create a new copy.\n", 81 | "**`df.set_index(['col_label1', 'col_label2'…], inplace=True)`**\n", 82 | "\n", 83 | "**Example:**\n", 84 | "\n", 85 | "Let’s see how we can set a specific column as an index in the DataFrame. In the below example, the default index (a range of numbers) is replaced by an index set from the first column **`Name`** of the student DataFrame." 
86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 1, 91 | "metadata": { 92 | "ExecuteTime": { 93 | "end_time": "2021-06-17T12:00:33.128743Z", 94 | "start_time": "2021-06-17T12:00:32.551117Z" 95 | } 96 | }, 97 | "outputs": [ 98 | { 99 | "name": "stdout", 100 | "output_type": "stream", 101 | "text": [ 102 | "Before set index: \n", 103 | " Name Age Marks\n", 104 | "0 Joe 20 85.10\n", 105 | "1 Nat 21 77.80\n", 106 | "2 Harry 19 91.54\n", 107 | "\n", 108 | "After set index: \n", 109 | " Age Marks\n", 110 | "Name \n", 111 | "Joe 20 85.10\n", 112 | "Nat 21 77.80\n", 113 | "Harry 19 91.54\n" 114 | ] 115 | } 116 | ], 117 | "source": [ 118 | "import pandas as pd\n", 119 | "\n", 120 | "student_dict = {'Name': ['Joe', 'Nat', 'Harry'], 'Age': [20, 21, 19], 'Marks': [85.10, 77.80, 91.54]}\n", 121 | "\n", 122 | "# create DataFrame from dict\n", 123 | "student_df = pd.DataFrame(student_dict)\n", 124 | "print(\"Before set index: \\n\", student_df)\n", 125 | "\n", 126 | "# set index using column\n", 127 | "student_df = student_df.set_index('Name')\n", 128 | "print(\"\\nAfter set index: \\n\", student_df)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "## Set index using a list\n", 136 | "\n", 137 | "As we have seen, we can pass column labels of the DataFrame to assign it as an index of the DataFrame. 
We can also give a list of labels, which can be strings or numbers, to the **`DataFrame.set_index()`** function to set a new index in the DataFrame.\n", 138 | "\n", 139 | "First, we create a pandas **`Index`** object from a list of labels and then pass it as input to the **`DataFrame.set_index()`** function.\n", 140 | "\n", 141 | "**Example:**" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 2, 147 | "metadata": { 148 | "ExecuteTime": { 149 | "end_time": "2021-06-17T12:00:44.674022Z", 150 | "start_time": "2021-06-17T12:00:44.654495Z" 151 | } 152 | }, 153 | "outputs": [ 154 | { 155 | "name": "stdout", 156 | "output_type": "stream", 157 | "text": [ 158 | "Before set index: \n", 159 | " Name Age Marks\n", 160 | "0 Joe 20 85.10\n", 161 | "1 Nat 21 77.80\n", 162 | "2 Harry 19 91.54\n", 163 | "\n", 164 | "After set index: \n", 165 | " Name Age Marks\n", 166 | "s1 Joe 20 85.10\n", 167 | "s2 Nat 21 77.80\n", 168 | "s3 Harry 19 91.54\n" 169 | ] 170 | } 171 | ], 172 | "source": [ 173 | "import pandas as pd\n", 174 | "\n", 175 | "student_dict = {'Name': ['Joe', 'Nat', 'Harry'], 'Age': [20, 21, 19], 'Marks': [85.10, 77.80, 91.54]}\n", 176 | "\n", 177 | "# create DataFrame from dict\n", 178 | "student_df = pd.DataFrame(student_dict)\n", 179 | "print(\"Before set index: \\n\", student_df)\n", 180 | "\n", 181 | "index = pd.Index(['s1', 's2', 's3'])\n", 182 | "student_df = student_df.set_index(index)\n", 183 | "print(\"\\nAfter set index: \\n\", student_df)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "## Set index using multiple columns\n", 191 | "\n", 192 | "pandas supports DataFrames that use multiple columns or rows as an index; these are called **multi-index** DataFrames. 
If we want to set multiple columns as row labels, we can use **`DataFrame.set_index()`** function.\n", 193 | "\n", 194 | "**Example:**\n", 195 | "\n", 196 | "In the below example, we pass a list of existing column labels **`Name`** and **`Marks`** to set a **multi-level** index in the student DataFrame.\n", 197 | "\n", 198 | ">**Note:** It throws **`KeyError`** for unknown column labels." 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 3, 204 | "metadata": { 205 | "ExecuteTime": { 206 | "end_time": "2021-06-17T12:00:55.218336Z", 207 | "start_time": "2021-06-17T12:00:55.199781Z" 208 | } 209 | }, 210 | "outputs": [ 211 | { 212 | "name": "stdout", 213 | "output_type": "stream", 214 | "text": [ 215 | "Before set index: \n", 216 | " Name Age Marks\n", 217 | "0 Joe 20 85.10\n", 218 | "1 Nat 21 77.80\n", 219 | "2 Harry 19 91.54\n", 220 | "\n", 221 | "After set index: \n", 222 | " Age\n", 223 | "Name Marks \n", 224 | "Joe 85.10 20\n", 225 | "Nat 77.80 21\n", 226 | "Harry 91.54 19\n" 227 | ] 228 | } 229 | ], 230 | "source": [ 231 | "import pandas as pd\n", 232 | "\n", 233 | "student_dict = {'Name': ['Joe', 'Nat', 'Harry'], 'Age': [20, 21, 19], 'Marks': [85.10, 77.80, 91.54]}\n", 234 | "\n", 235 | "# create DataFrame from dict\n", 236 | "student_df = pd.DataFrame(student_dict)\n", 237 | "print(\"Before set index: \\n\", student_df)\n", 238 | "\n", 239 | "# set multi-index\n", 240 | "student_df = student_df.set_index(['Name', 'Marks'])\n", 241 | "print(\"\\nAfter set index: \\n\", student_df)" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "## Set multi-index using a list and column\n", 249 | "\n", 250 | "If there is a case where we want to create a two-level row index of the DataFrame, where one level is the new list of labels and another level is created from the existing column.\n", 251 | "\n", 252 | "We can use **`DataFrame.set_index()`** to set the multi-level index of pandas DataFrame 
using a combination of a new list and the existing column.\n", 253 | "\n", 254 | "We need to create a Python **`Index`** object from a list of new labels and pass that **`Index`** object and an existing column label as input to the **`DataFrame.set_index()`** function to create a two-level index.\n", 255 | "\n", 256 | "**Example:**\n", 257 | "\n", 258 | "Here, we are passing two parameters to the DataFrame.set_index() function. The first parameter is the **Python Index** created using multiple strings of size matches to the length of DataFrame. The second parameter is the existing column label **`Name`** of student DataFrame." 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 4, 264 | "metadata": { 265 | "ExecuteTime": { 266 | "end_time": "2021-06-17T12:01:03.478492Z", 267 | "start_time": "2021-06-17T12:01:03.448219Z" 268 | } 269 | }, 270 | "outputs": [ 271 | { 272 | "name": "stdout", 273 | "output_type": "stream", 274 | "text": [ 275 | "Before set index: \n", 276 | " Name Age Marks\n", 277 | "0 Joe 20 85.10\n", 278 | "1 Nat 21 77.80\n", 279 | "2 Harry 19 91.54\n", 280 | "\n", 281 | "After set index: \n", 282 | " Age Marks\n", 283 | " Name \n", 284 | "s1 Joe 20 85.10\n", 285 | "s2 Nat 21 77.80\n", 286 | "s3 Harry 19 91.54\n" 287 | ] 288 | } 289 | ], 290 | "source": [ 291 | "import pandas as pd\n", 292 | "\n", 293 | "student_dict = {'Name': ['Joe', 'Nat', 'Harry'], 'Age': [20, 21, 19], 'Marks': [85.10, 77.80, 91.54]}\n", 294 | "\n", 295 | "# create DataFrame from dict\n", 296 | "student_df = pd.DataFrame(student_dict)\n", 297 | "print(\"Before set index: \\n\", student_df)\n", 298 | "\n", 299 | "index = pd.Index(['s1', 's2', 's3'])\n", 300 | "student_df = student_df.set_index([index, 'Name'])\n", 301 | "print(\"\\nAfter set index: \\n\", student_df)" 302 | ] 303 | }, 304 | { 305 | "cell_type": "markdown", 306 | "metadata": {}, 307 | "source": [ 308 | "## Set multi-index using two Python series\n", 309 | "\n", 310 | "When we want to replace 
the existing index with the multiple new series rather than the existing columns, we can create such a multi-index DataFrame by assigning new series using **`DataFrame.set_index()`** function.\n", 311 | "\n", 312 | "**Example:**\n", 313 | "\n", 314 | "Let’s see how we can pass two Python **`series`** of numbers as a first and second-level index of the DataFrame." 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 5, 320 | "metadata": { 321 | "ExecuteTime": { 322 | "end_time": "2021-06-17T12:01:08.660074Z", 323 | "start_time": "2021-06-17T12:01:08.574657Z" 324 | } 325 | }, 326 | "outputs": [ 327 | { 328 | "name": "stdout", 329 | "output_type": "stream", 330 | "text": [ 331 | "Before set index: \n", 332 | " Name Age Marks\n", 333 | "0 Joe 20 85.10\n", 334 | "1 Nat 21 77.80\n", 335 | "2 Harry 19 91.54\n", 336 | "\n", 337 | "After set index: \n", 338 | " Name Age Marks\n", 339 | "1 1 Joe 20 85.10\n", 340 | "2 4 Nat 21 77.80\n", 341 | "3 9 Harry 19 91.54\n" 342 | ] 343 | } 344 | ], 345 | "source": [ 346 | "import pandas as pd\n", 347 | "\n", 348 | "student_dict = {'Name': ['Joe', 'Nat', 'Harry'], 'Age': [20, 21, 19], 'Marks': [85.10, 77.80, 91.54]}\n", 349 | "\n", 350 | "# create DataFrame from dict\n", 351 | "student_df = pd.DataFrame(student_dict)\n", 352 | "print(\"Before set index: \\n\", student_df)\n", 353 | "\n", 354 | "# set multi-index\n", 355 | "s = pd.Series([1, 2, 3])\n", 356 | "student_df = student_df.set_index([s, s ** 2])\n", 357 | "print(\"\\nAfter set index: \\n\", student_df)" 358 | ] 359 | }, 360 | { 361 | "cell_type": "markdown", 362 | "metadata": {}, 363 | "source": [ 364 | "## Set index using a Python range\n", 365 | "\n", 366 | "Suppose we need to set a sequence of numbers as an index of the DataFrame such that it should start at any number. 
For example, we want to assign a roll number to the student DataFrame beginning from 1.\n", 367 | "\n", 368 | "It is not feasible to pass all the numbers as a list to the **`DataFrame.set_index()`** function. In such a case, we can use the **[Python range()](https://github.com/milaan9/04_Python_Functions/blob/main/002_Python_Functions_Built_in/053_Python_range().ipynb)** function.\n", 369 | "\n", 370 | "We can create pandas Index using range() function and pass it to the **`DataFrame.set_index()`** function.\n", 371 | "\n", 372 | "**Example:**\n", 373 | "\n", 374 | "Let’s see how we can use **[Python range()](https://github.com/milaan9/04_Python_Functions/blob/main/002_Python_Functions_Built_in/053_Python_range%28%29.ipynb)** function with **`DataFrame.set_index()`** to assign a sequential index to the DataFrame." 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": 6, 380 | "metadata": { 381 | "ExecuteTime": { 382 | "end_time": "2021-06-17T12:01:15.517414Z", 383 | "start_time": "2021-06-17T12:01:15.489099Z" 384 | } 385 | }, 386 | "outputs": [ 387 | { 388 | "name": "stdout", 389 | "output_type": "stream", 390 | "text": [ 391 | "Before set index: \n", 392 | " Name Age Marks\n", 393 | "s1 Joe 20 85.10\n", 394 | "s2 Nat 21 77.80\n", 395 | "s3 Harry 19 91.54\n", 396 | "\n", 397 | "After set index: \n", 398 | " Name Age Marks\n", 399 | "1 Joe 20 85.10\n", 400 | "2 Nat 21 77.80\n", 401 | "3 Harry 19 91.54\n" 402 | ] 403 | } 404 | ], 405 | "source": [ 406 | "import pandas as pd\n", 407 | "\n", 408 | "student_dict = {'Name': ['Joe', 'Nat', 'Harry'], 'Age': [20, 21, 19], 'Marks': [85.10, 77.80, 91.54]}\n", 409 | "\n", 410 | "# create DataFrame from dict\n", 411 | "student_df = pd.DataFrame(student_dict, index=['s1', 's2', 's3'])\n", 412 | "print(\"Before set index: \\n\", student_df)\n", 413 | "\n", 414 | "# set index\n", 415 | "index = pd.Index(range(1, 4, 1))\n", 416 | "student_df = student_df.set_index(index)\n", 417 | "print(\"\\nAfter set index: 
\\n\", student_df)" 418 | ] 419 | }, 420 | { 421 | "cell_type": "markdown", 422 | "metadata": {}, 423 | "source": [ 424 | "## Set index but keep column\n", 425 | "\n", 426 | "By default, **`DataFrame.set_index()`** function takes column name as input which should be used as an index of the DataFrame. After setting the new index, it deletes the column which is used.\n", 427 | "\n", 428 | "If we do not want to delete such a column from DataFrame, then we need to use the drop parameter of **`DataFrame.set_index()`**. It is a boolean flag such that,\n", 429 | "\n", 430 | "* If **`drop=True`** (default case), it deletes the column and uses it as an index.\n", 431 | "* If **`drop=False`**, it does not delete the column and uses it as an index.\n", 432 | "\n", 433 | "**Example:**\n", 434 | "\n", 435 | "In the below example, we use the **`Name`** column as an index of the student DataFrame without deleting it." 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": 7, 441 | "metadata": { 442 | "ExecuteTime": { 443 | "end_time": "2021-06-17T12:01:23.013423Z", 444 | "start_time": "2021-06-17T12:01:22.990963Z" 445 | }, 446 | "scrolled": true 447 | }, 448 | "outputs": [ 449 | { 450 | "name": "stdout", 451 | "output_type": "stream", 452 | "text": [ 453 | "Before set index: \n", 454 | " Name Age Marks\n", 455 | "0 Joe 20 85.10\n", 456 | "1 Nat 21 77.80\n", 457 | "2 Harry 19 91.54\n", 458 | "\n", 459 | "After set index: \n", 460 | " Name Age Marks\n", 461 | "Name \n", 462 | "Joe Joe 20 85.10\n", 463 | "Nat Nat 21 77.80\n", 464 | "Harry Harry 19 91.54\n" 465 | ] 466 | } 467 | ], 468 | "source": [ 469 | "import pandas as pd\n", 470 | "\n", 471 | "student_dict = {'Name': ['Joe', 'Nat', 'Harry'], 'Age': [20, 21, 19], 'Marks': [85.10, 77.80, 91.54]}\n", 472 | "\n", 473 | "# create DataFrame from dict\n", 474 | "student_df = pd.DataFrame(student_dict)\n", 475 | "print(\"Before set index: \\n\", student_df)\n", 476 | "\n", 477 | "# set index, keep column\n", 478 | 
"student_df = student_df.set_index('Name', drop=False)\n", 479 | "print(\"\\nAfter set index: \\n\", student_df)" 480 | ] 481 | }, 482 | { 483 | "cell_type": "markdown", 484 | "metadata": {}, 485 | "source": [ 486 | "## Set index by keeping old index\n", 487 | "\n", 488 | "**`DataFrame.set_index()`** is used to set a new index to the DataFrame. It is also used to extend the existing DataFrame, i.e., we can update the index by append to the existing index.\n", 489 | "\n", 490 | "We need to use the **`append`** parameter of the **`DataFrame.set_index()`** function to append the new index to the existing one. By default, the value of **`append`** is **`False`**.\n", 491 | "\n", 492 | "**Example:**\n", 493 | "\n", 494 | "In the below example, each row has an index like s1, s2, and so on, but we update the index by appending values in the **`Name`** column." 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": 8, 500 | "metadata": { 501 | "ExecuteTime": { 502 | "end_time": "2021-06-17T12:01:29.680824Z", 503 | "start_time": "2021-06-17T12:01:29.656902Z" 504 | } 505 | }, 506 | "outputs": [ 507 | { 508 | "name": "stdout", 509 | "output_type": "stream", 510 | "text": [ 511 | "Before set index: \n", 512 | " Name Age Marks\n", 513 | "s1 Joe 20 85.10\n", 514 | "s2 Nat 21 77.80\n", 515 | "s3 Harry 19 91.54\n", 516 | "\n", 517 | "After set index: \n", 518 | " Age Marks\n", 519 | " Name \n", 520 | "s1 Joe 20 85.10\n", 521 | "s2 Nat 21 77.80\n", 522 | "s3 Harry 19 91.54\n" 523 | ] 524 | } 525 | ], 526 | "source": [ 527 | "import pandas as pd\n", 528 | "\n", 529 | "student_dict = {'Name':['Joe','Nat','Harry'], 'Age':[20,21,19], 'Marks':[85.10, 77.80, 91.54]}\n", 530 | "\n", 531 | "# create DataFrame from dict\n", 532 | "student_df = pd.DataFrame(student_dict, index = ['s1','s2','s3'])\n", 533 | "print(\"Before set index: \\n\", student_df)\n", 534 | "\n", 535 | "# set index by append\n", 536 | "student_df = student_df.set_index('Name', append=True)\n", 537 | 
"print(\"\\nAfter set index: \\n\", student_df)" 538 | ] 539 | }, 540 | { 541 | "cell_type": "markdown", 542 | "metadata": {}, 543 | "source": [ 544 | "## Set index in place\n", 545 | "\n", 546 | "In the above examples, whenever we executed the **`DataFrame.set_index()`** operation, pandas created a new copy of the DataFrame because the modification was not in place.\n", 547 | "\n", 548 | "Specify **`inplace=True`** to set index in the existing DataFrame rather than creating a copy of it.\n", 549 | "\n", 550 | "If **`inplace=True`** then it updates the existing DataFrame and does not return anything.\n", 551 | "If **`inplace=False`** then it creates a new DataFrame with updated changes and returns it.\n", 552 | "\n", 553 | ">**Note:** You don’t need to assign the result back to a variable as we are performing modifications in place.\n", 554 | "\n", 555 | "**Example:**" 556 | ] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "execution_count": 9, 561 | "metadata": { 562 | "ExecuteTime": { 563 | "end_time": "2021-06-17T12:01:34.047473Z", 564 | "start_time": "2021-06-17T12:01:34.028918Z" 565 | } 566 | }, 567 | "outputs": [ 568 | { 569 | "name": "stdout", 570 | "output_type": "stream", 571 | "text": [ 572 | "Before set index: \n", 573 | " Name Age Marks\n", 574 | "0 Joe 20 85.10\n", 575 | "1 Nat 21 77.80\n", 576 | "2 Harry 19 91.54\n", 577 | "\n", 578 | "After set index: \n", 579 | " Age Marks\n", 580 | "Name \n", 581 | "Joe 20 85.10\n", 582 | "Nat 21 77.80\n", 583 | "Harry 19 91.54\n" 584 | ] 585 | } 586 | ], 587 | "source": [ 588 | "import pandas as pd\n", 589 | "\n", 590 | "student_dict = {'Name': ['Joe', 'Nat', 'Harry'], 'Age': [20, 21, 19], 'Marks': [85.10, 77.80, 91.54]}\n", 591 | "\n", 592 | "# create DataFrame from dict\n", 593 | "student_df = pd.DataFrame(student_dict)\n", 594 | "print(\"Before set index: \\n\", student_df)\n", 595 | "\n", 596 | "# set index in place\n", 597 | "student_df.set_index('Name', inplace=True)\n", 598 | "print(\"\\nAfter set index: \\n\", 
student_df)" 599 | ] 600 | }, 601 | { 602 | "cell_type": "markdown", 603 | "metadata": {}, 604 | "source": [ 605 | "## Set index using a column with duplicates\n", 606 | "\n", 607 | "As we have discussed, we need to pass a column name that needs to be used to set row index in the DataFrame.\n", 608 | "\n", 609 | "But, the column may contain duplicate values. By default, **`DataFrame.set_index()`** allows duplicate values in the index. If we want to change this behavior, then we can use the **`verify_integrity`** parameter of **`DataFrame.set_index()`**.\n", 610 | "\n", 611 | "* If **`verify_integrity=True`**, then it checks the new index for duplicates and raises **`ValueError`** if any are found.\n", 612 | "* If **`verify_integrity=False`**, then it defers the check until necessary.\n", 613 | "\n", 614 | ">**Note:** Setting to **`False`** will improve the performance of this method.\n", 615 | "\n", 616 | "**Example:**\n", 617 | "\n", 618 | "In the below example, we set **`verify_integrity=True`** and use the **`Name`** column to set an index that contains duplicate values." 
619 | ] 620 | }, 621 | { 622 | "cell_type": "code", 623 | "execution_count": 10, 624 | "metadata": { 625 | "ExecuteTime": { 626 | "end_time": "2021-06-17T12:01:40.485869Z", 627 | "start_time": "2021-06-17T12:01:40.048379Z" 628 | } 629 | }, 630 | "outputs": [ 631 | { 632 | "name": "stdout", 633 | "output_type": "stream", 634 | "text": [ 635 | "Before set index: \n", 636 | " Name Age Marks\n", 637 | "0 Joe 20 85.10\n", 638 | "1 Nat 21 77.80\n", 639 | "2 Joe 19 91.54\n" 640 | ] 641 | }, 642 | { 643 | "ename": "ValueError", 644 | "evalue": "Index has duplicate keys: Index(['Joe'], dtype='object', name='Name')", 645 | "output_type": "error", 646 | "traceback": [ 647 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 648 | "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", 649 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 9\u001b[0m \u001b[1;31m# set index error case\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 10\u001b[1;33m \u001b[0mstudent_df\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mstudent_df\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mset_index\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'Name'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mverify_integrity\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 11\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"\\nAfter set index: \\n\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstudent_df\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 650 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\pandas\\core\\frame.py\u001b[0m in \u001b[0;36mset_index\u001b[1;34m(self, keys, drop, append, inplace, verify_integrity)\u001b[0m\n\u001b[0;32m 4777\u001b[0m 
\u001b[1;32mif\u001b[0m \u001b[0mverify_integrity\u001b[0m \u001b[1;32mand\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mis_unique\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4778\u001b[0m \u001b[0mduplicates\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mindex\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mduplicated\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0munique\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 4779\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34mf\"Index has duplicate keys: {duplicates}\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 4780\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4781\u001b[0m \u001b[1;31m# use set to handle duplicate column names gracefully in case of drop\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 651 | "\u001b[1;31mValueError\u001b[0m: Index has duplicate keys: Index(['Joe'], dtype='object', name='Name')" 652 | ] 653 | } 654 | ], 655 | "source": [ 656 | "import pandas as pd\n", 657 | "\n", 658 | "student_dict = {'Name':['Joe','Nat','Joe'], 'Age':[20,21,19], 'Marks':[85.10, 77.80, 91.54]}\n", 659 | "\n", 660 | "# create DataFrame from dict\n", 661 | "student_df = pd.DataFrame(student_dict)\n", 662 | "print(\"Before set index: \\n\", student_df)\n", 663 | "\n", 664 | "# set index error case\n", 665 | "student_df = student_df.set_index('Name', verify_integrity=True)\n", 666 | "print(\"\\nAfter set index: \\n\", student_df)" 667 | ] 668 | }, 669 | { 670 | "cell_type": "markdown", 671 | "metadata": {}, 672 | "source": [ 673 | "## Set index by column number\n", 674 | "\n", 675 | "If we need to set single or 
multiple columns as an index of the DataFrame, but we do not know the column labels to pass to the **`DataFrame.set_index()`** function, we can use the **`columns`** attribute of the DataFrame to look up the column labels by their position.\n", 676 | "\n", 677 | "We need to create a list of columns using column position **`df.columns[[0,1]]`** and pass it to the **`DataFrame.set_index()`** function.\n", 678 | "\n", 679 | "**Example:**\n", 680 | "\n", 681 | "In the below example, we set columns 0 and 2 as the index of the student DataFrame." 682 | ] 683 | }, 684 | { 685 | "cell_type": "code", 686 | "execution_count": 11, 687 | "metadata": { 688 | "ExecuteTime": { 689 | "end_time": "2021-06-17T12:01:46.411582Z", 690 | "start_time": "2021-06-17T12:01:46.385215Z" 691 | } 692 | }, 693 | "outputs": [ 694 | { 695 | "name": "stdout", 696 | "output_type": "stream", 697 | "text": [ 698 | "Before set index: \n", 699 | " Name Age Marks\n", 700 | "0 Joe 20 85.10\n", 701 | "1 Nat 21 77.80\n", 702 | "2 Harry 19 91.54\n", 703 | "\n", 704 | "After set index: \n", 705 | " Age\n", 706 | "Name Marks \n", 707 | "Joe 85.10 20\n", 708 | "Nat 77.80 21\n", 709 | "Harry 91.54 19\n" 710 | ] 711 | } 712 | ], 713 | "source": [ 714 | "import pandas as pd\n", 715 | "\n", 716 | "student_dict = {'Name':['Joe','Nat','Harry'], 'Age':[20,21,19], 'Marks':[85.10, 77.80, 91.54]}\n", 717 | "\n", 718 | "# create DataFrame from dict\n", 719 | "student_df = pd.DataFrame(student_dict)\n", 720 | "print(\"Before set index: \\n\", student_df)\n", 721 | "\n", 722 | "# set index\n", 723 | "cols = list(student_df.columns[[0,2]])\n", 724 | "student_df = student_df.set_index(cols)\n", 725 | "print(\"\\nAfter set index: \\n\", student_df)" 726 | ] 727 | }, 728 | { 729 | "cell_type": "code", 730 | "execution_count": null, 731 | "metadata": {}, 732 | "outputs": [], 733 | "source": [] 734 | } 735 | ], 736 | "metadata": { 737 | "hide_input": false, 738 | "kernelspec": { 739 | "display_name": "Python 3", 740 | "language": 
"python", 741 | "name": "python3" 742 | }, 743 | "language_info": { 744 | "codemirror_mode": { 745 | "name": "ipython", 746 | "version": 3 747 | }, 748 | "file_extension": ".py", 749 | "mimetype": "text/x-python", 750 | "name": "python", 751 | "nbconvert_exporter": "python", 752 | "pygments_lexer": "ipython3", 753 | "version": "3.8.8" 754 | }, 755 | "toc": { 756 | "base_numbering": 1, 757 | "nav_menu": {}, 758 | "number_sections": true, 759 | "sideBar": true, 760 | "skip_h1_title": false, 761 | "title_cell": "Table of Contents", 762 | "title_sidebar": "Contents", 763 | "toc_cell": false, 764 | "toc_position": {}, 765 | "toc_section_display": true, 766 | "toc_window_display": false 767 | }, 768 | "varInspector": { 769 | "cols": { 770 | "lenName": 16, 771 | "lenType": 16, 772 | "lenVar": 40 773 | }, 774 | "kernels_config": { 775 | "python": { 776 | "delete_cmd_postfix": "", 777 | "delete_cmd_prefix": "del ", 778 | "library": "var_list.py", 779 | "varRefreshCmd": "print(var_dic_list())" 780 | }, 781 | "r": { 782 | "delete_cmd_postfix": ") ", 783 | "delete_cmd_prefix": "rm(", 784 | "library": "var_list.r", 785 | "varRefreshCmd": "cat(var_dic_list()) " 786 | } 787 | }, 788 | "types_to_exclude": [ 789 | "module", 790 | "function", 791 | "builtin_function_or_method", 792 | "instance", 793 | "_Feature" 794 | ], 795 | "window_display": false 796 | } 797 | }, 798 | "nbformat": 4, 799 | "nbformat_minor": 2 800 | } 801 | --------------------------------------------------------------------------------