├── .gitignore
├── LICENSE
├── README.md
├── data
│   ├── bios.csv
│   ├── noc_regions.csv
│   ├── olympics-data.xlsx
│   ├── results.csv
│   ├── results.feather
│   └── results.parquet
├── images
│   └── thumbnail.jpg
├── pandas-cheat-sheet.md.html
├── requirements.txt
├── tutorial.ipynb
└── warmup-data
    └── coffee.csv
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 |
162 | *.DS_Store
163 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Keith Galli
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Complete Pandas Tutorial
2 | A comprehensive tutorial on the Python Pandas library, updated to be consistent with best practices and features available in 2024.
3 |
4 |
5 |
6 | The tutorial can be watched [here](https://youtu.be/2uvysYbKdjM?si=8UnGt0bwLwo-eEQL).
7 |
8 | The code walked through in the tutorial is in [tutorial.ipynb](./tutorial.ipynb).
9 |
10 | # Getting Started with Pandas Locally
11 |
12 | To get started with Pandas locally, follow these steps to set up your environment and clone this repository.
13 |
14 | ## Setting Up Your Local Environment
15 |
16 | ### Step 1: Install Python
17 |
18 | First, ensure you have Python installed on your system. You can download Python from the [official website](https://www.python.org/).
19 |
20 | ### Step 2: Fork the Repository
21 |
22 | Fork the repository to your own GitHub account by visiting [complete-pandas-tutorial](https://github.com/KeithGalli/complete-pandas-tutorial) and clicking the "Fork" button in the top-right corner.
23 |
24 | ### Step 3: Clone the Forked Repository
25 |
26 | Clone your forked repository to your local machine. Open a terminal or command prompt and run:
27 |
28 | ```sh
29 | git clone https://github.com/yourusername/complete-pandas-tutorial.git
30 | cd complete-pandas-tutorial
31 | ```
32 |
33 | Replace `yourusername` with your actual GitHub username.
34 |
35 | ### Step 4: Create a Virtual Environment (optional)
36 |
37 | Creating a virtual environment is good practice for managing per-project dependencies. Run the following command:
38 |
39 | ```sh
40 | python -m venv myenv
41 | ```
42 |
43 | Activate the virtual environment:
44 |
45 | - On Windows:
46 | ```sh
47 | myenv\Scripts\activate
48 | ```
49 | - On macOS/Linux:
50 | ```sh
51 | source myenv/bin/activate
52 | ```
53 |
54 | To deactivate the virtual environment, run:
55 |
56 | - On Windows:
57 | ```sh
58 | myenv\Scripts\deactivate.bat
59 | ```
60 | - On macOS/Linux:
61 | ```sh
62 | deactivate
63 | ```
64 |
65 | ### Step 5: Install Required Libraries
66 |
67 | With the virtual environment activated, install the necessary libraries from the `requirements.txt` file:
68 |
69 | ```sh
70 | pip install -r requirements.txt
71 | ```
72 |
73 | ### Step 6: Open Your Code Editor
74 |
75 | Open the cloned repository folder in your favorite code editor, such as Visual Studio Code or PyCharm.
76 |
77 | ### Step 7: Create a Jupyter Notebook
78 |
79 | Create a new Jupyter Notebook file in your code editor:
80 |
81 | - In Visual Studio Code, click on the "New File" icon or press `Ctrl+N`, then save the file with a `.ipynb` extension.
82 | - In PyCharm, right-click on the project folder, select "New", and then "Jupyter Notebook".
83 | - **Otherwise**, if these options don't work or your editor doesn't support Jupyter Notebooks, run the following command in your terminal:
84 | ```sh
85 | jupyter notebook
86 | ```
87 | This will open Jupyter Notebook in your web browser.
88 |
89 | ## Using Google Colab
90 |
91 | If you prefer not to set things up locally, you can use Google Colab, which lets you run Python code in your browser without any setup.
92 |
93 | Go to [Google Colab](https://colab.research.google.com/) and start a new notebook. You can upload your dataset and start coding with Pandas immediately.
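You can also load the tutorial data straight from GitHub instead of uploading files. A minimal sketch (assuming the repository's default branch is `master`; adjust the branch name if yours differs):

```python
import pandas as pd

# Read the warmup dataset directly from the raw GitHub URL, no upload needed
url = "https://raw.githubusercontent.com/KeithGalli/complete-pandas-tutorial/master/warmup-data/coffee.csv"
coffee = pd.read_csv(url)
coffee.head()
```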
94 |
95 | # Python Pandas Cheat Sheet
96 |
97 | This cheat sheet is a companion to the "Complete Python Pandas Data Science Tutorial."
98 |
99 | ## Creating DataFrames
100 |
101 | Creating DataFrames is the foundation of using Pandas. Here’s how to create a simple DataFrame and display its content.
102 |
103 | ```python
104 | import pandas as pd
105 | import numpy as np
106 |
107 | # Create a simple DataFrame
108 | df = pd.DataFrame(
109 | [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]],
110 | columns=["A", "B", "C"],
111 | index=["x", "y", "z", "zz"]
112 | )
113 |
114 | # Display the first few rows
115 | df.head()
116 |
117 | # Display the last two rows
118 | df.tail(2)
119 | ```
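A DataFrame can also be built from a dictionary of columns, which is often more readable for hand-written data. A small sketch (the column names here are made up for illustration):

```python
# Each key becomes a column; all lists must have the same length
menu = pd.DataFrame({
    "drink": ["Espresso", "Latte", "Cappuccino"],
    "price": [3.99, 5.99, 4.99],
})
menu.head()
```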
120 |
121 | ## Loading Data
122 | Loading data into DataFrames from various file formats is crucial for real-world data analysis.
123 |
124 | ```python
125 | # Load data from CSV
126 | coffee = pd.read_csv('./warmup-data/coffee.csv')
127 | bios = pd.read_csv('./data/bios.csv')  # athlete bios, used in the filtering/merging examples below
128 | # Load data from Parquet
129 | results = pd.read_parquet('./data/results.parquet')
130 |
131 | # Load data from Excel
132 | olympics_data = pd.read_excel('./data/olympics-data.xlsx', sheet_name="results")
133 | ```
134 |
135 | ## Accessing Data
136 | Accessing different parts of the DataFrame allows for flexible data manipulation and inspection.
137 |
138 | ```python
139 | # Access columns
140 | df.columns
141 |
142 | # Access index
143 | df.index.tolist()
144 |
145 | # General info about the DataFrame
146 | df.info()
147 |
148 | # Statistical summary
149 | df.describe()
150 |
151 | # Number of unique values in each column
152 | df.nunique()
153 |
154 | # Access unique values in a column
155 | df['A'].unique()
156 |
157 | # Shape and size of DataFrame
158 | df.shape
159 | df.size
160 | ```
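To pull out specific rows and cells rather than summaries, `.loc` (label-based) and `.iloc` (position-based) are the standard accessors. A short sketch using the `df` created above:

```python
# Label-based: row "x", column "A"
df.loc["x", "A"]

# Position-based: first row, all columns
df.iloc[0]

# Slice rows by position, then pick columns by label
df.iloc[0:2][["A", "C"]]

# Fast scalar access to a single cell
df.at["x", "A"]
df.iat[0, 0]
```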
161 | ## Filtering Data
162 | Filtering data is essential for extracting relevant subsets based on conditions.
163 |
164 | ```python
165 | # Filter rows based on conditions
166 | bios.loc[bios["height_cm"] > 215]
167 |
168 | # Multiple conditions
169 | bios[(bios['height_cm'] > 215) & (bios['born_country']=='USA')]
170 |
171 | # Filter by string conditions
172 | bios[bios['name'].str.contains("keith", case=False)]
173 |
174 | # Regex filters
175 | bios[bios['name'].str.contains(r'^[AEIOUaeiou]', na=False)]
176 | ```
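`DataFrame.query()` expresses the same filters as a string, which some find more readable; a sketch equivalent to the multiple-conditions example above:

```python
# Same result as the boolean-mask version
bios.query('height_cm > 215 and born_country == "USA"')
```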
177 | ## Adding/Removing Columns
178 | Adding and removing columns is important for maintaining and analyzing relevant data.
179 |
180 | ```python
181 | # Add a new column
182 | coffee['price'] = 4.99
183 |
184 | # Conditional column
185 | coffee['new_price'] = np.where(coffee['Coffee Type']=='Espresso', 3.99, 5.99)
186 |
187 | # Remove a column
188 | coffee.drop(columns=['price'], inplace=True)
189 |
190 | # Rename columns
191 | coffee.rename(columns={'new_price': 'price'}, inplace=True)
192 |
193 | # Create new columns from existing ones
194 | coffee['revenue'] = coffee['Units Sold'] * coffee['price']
195 | ```
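For derivations too involved for `np.where`, a function can be applied element-wise. A small sketch (the threshold and category names are illustrative, not from the tutorial):

```python
# Derive a label per row with apply
def demand_level(units):
    return "High" if units >= 35 else "Low"

coffee["demand"] = coffee["Units Sold"].apply(demand_level)
```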
196 | ## Merging and Concatenating Data
197 | Merging and concatenating DataFrames is useful for combining different datasets for comprehensive analysis.
198 |
199 | ```python
200 | # Merge DataFrames
201 | nocs = pd.read_csv('./data/noc_regions.csv')
202 | bios_new = pd.merge(bios, nocs, left_on='born_country', right_on='NOC', how='left')
203 |
204 | # Concatenate DataFrames
205 | usa = bios[bios['born_country']=='USA'].copy()
206 | gbr = bios[bios['born_country']=='GBR'].copy()
207 | new_df = pd.concat([usa, gbr])
208 | ```
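When both frames share column names, `merge` disambiguates them with `_x`/`_y` suffixes by default; you can choose clearer ones (the labels below are illustrative):

```python
# Explicit suffixes for any overlapping column names
bios_new = pd.merge(bios, nocs, left_on='born_country', right_on='NOC',
                    how='left', suffixes=('_bio', '_noc'))
```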
209 | ## Handling Null Values
210 | Handling null values is essential to ensure the integrity of data analysis.
211 |
212 | ```python
213 | # Fill NaNs with a specific value (plain assignment; chained inplace calls are deprecated)
214 | coffee['Units Sold'] = coffee['Units Sold'].fillna(0)
215 |
216 | # Interpolate missing values
217 | coffee['Units Sold'] = coffee['Units Sold'].interpolate()
218 |
219 | # Drop rows with NaNs
220 | coffee.dropna(subset=['Units Sold'], inplace=True)
221 | ```
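Before filling or dropping, it helps to see how much is actually missing:

```python
# Count missing values per column
coffee.isna().sum()

# Rows that contain any missing value
coffee[coffee.isna().any(axis=1)]
```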
222 | ## Aggregating Data
223 | Aggregation functions like value counts and group by help in summarizing data efficiently.
224 |
225 | ```python
226 | # Value counts
227 | bios['born_city'].value_counts()
228 |
229 | # Group by and aggregation
230 | coffee.groupby(['Coffee Type'])['Units Sold'].sum()
231 | coffee.groupby(['Coffee Type'])['Units Sold'].mean()
232 |
233 | # Pivot table
234 | pivot = coffee.pivot(columns='Coffee Type', index='Day', values='revenue')
235 | ```
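Note that `pivot()` only reshapes and raises an error if an index/column pair occurs more than once; `pivot_table()` aggregates duplicates instead, and `.agg()` applies several functions at once:

```python
# Aggregating pivot: duplicate Day/Coffee Type pairs are summed
pivot = coffee.pivot_table(index='Day', columns='Coffee Type',
                           values='revenue', aggfunc='sum')

# Multiple aggregations in one call
coffee.groupby('Coffee Type')['Units Sold'].agg(['sum', 'mean'])
```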
236 | ## Advanced Functionality
237 | Advanced functionalities such as rolling calculations, rankings, and shifts can provide deeper insights.
238 |
239 | ```python
240 | # Cumulative sum
241 | coffee['cumsum'] = coffee['Units Sold'].cumsum()
242 |
243 | # Rolling window
244 | latte = coffee[coffee['Coffee Type']=="Latte"].copy()
245 | latte['3day'] = latte['Units Sold'].rolling(3).sum()
246 |
247 | # Rank
248 | bios['height_rank'] = bios['height_cm'].rank(ascending=False)
249 |
250 | # Shift
251 | coffee['yesterday_revenue'] = coffee['revenue'].shift(1)
252 | ```
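`shift()` pairs naturally with arithmetic for day-over-day deltas; `pct_change()` covers the common fractional case in one call:

```python
# Day-over-day change from the shifted column
coffee['revenue_change'] = coffee['revenue'] - coffee['yesterday_revenue']

# Equivalent fractional change
coffee['revenue_pct'] = coffee['revenue'].pct_change()
```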
253 | ## New Functionality
254 | The PyArrow backend offers optimized performance for certain operations, particularly string operations.
255 |
256 | ```python
257 | # PyArrow backend
258 | results_arrow = pd.read_csv('./data/results.csv', engine='pyarrow', dtype_backend='pyarrow')
259 | results_arrow.info()
260 | ```
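The same backend is available from other readers, and an existing NumPy-backed frame can be converted. A sketch using the `results.feather` file shipped in `data/` (the `dtype_backend` parameter requires pandas 2.0+):

```python
# Feather reader with Arrow-backed dtypes
results_fe = pd.read_feather('./data/results.feather', dtype_backend='pyarrow')

# Convert an existing frame to Arrow-backed dtypes
bios_arrow = bios.convert_dtypes(dtype_backend='pyarrow')
bios_arrow.dtypes
```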
261 |
--------------------------------------------------------------------------------
/data/noc_regions.csv:
--------------------------------------------------------------------------------
1 | NOC,region,notes
2 | AFG,Afghanistan,
3 | AHO,Curacao,Netherlands Antilles
4 | ALB,Albania,
5 | ALG,Algeria,
6 | AND,Andorra,
7 | ANG,Angola,
8 | ANT,Antigua,Antigua and Barbuda
9 | ANZ,Australia,Australasia
10 | ARG,Argentina,
11 | ARM,Armenia,
12 | ARU,Aruba,
13 | ASA,American Samoa,
14 | AUS,Australia,
15 | AUT,Austria,
16 | AZE,Azerbaijan,
17 | BAH,Bahamas,
18 | BAN,Bangladesh,
19 | BAR,Barbados,
20 | BDI,Burundi,
21 | BEL,Belgium,
22 | BEN,Benin,
23 | BER,Bermuda,
24 | BHU,Bhutan,
25 | BIH,Bosnia and Herzegovina,
26 | BIZ,Belize,
27 | BLR,Belarus,
28 | BOH,Czech Republic,Bohemia
29 | BOL,Boliva,
30 | BOT,Botswana,
31 | BRA,Brazil,
32 | BRN,Bahrain,
33 | BRU,Brunei,
34 | BUL,Bulgaria,
35 | BUR,Burkina Faso,
36 | CAF,Central African Republic,
37 | CAM,Cambodia,
38 | CAN,Canada,
39 | CAY,Cayman Islands,
40 | CGO,Republic of Congo,
41 | CHA,Chad,
42 | CHI,Chile,
43 | CHN,China,
44 | CIV,Ivory Coast,
45 | CMR,Cameroon,
46 | COD,Democratic Republic of the Congo,
47 | COK,Cook Islands,
48 | COL,Colombia,
49 | COM,Comoros,
50 | CPV,Cape Verde,
51 | CRC,Costa Rica,
52 | CRO,Croatia,
53 | CRT,Greece,Crete
54 | CUB,Cuba,
55 | CYP,Cyprus,
56 | CZE,Czech Republic,
57 | DEN,Denmark,
58 | DJI,Djibouti,
59 | DMA,Dominica,
60 | DOM,Dominican Republic,
61 | ECU,Ecuador,
62 | EGY,Egypt,
63 | ERI,Eritrea,
64 | ESA,El Salvador,
65 | ESP,Spain,
66 | EST,Estonia,
67 | ETH,Ethiopia,
68 | EUN,Russia,
69 | FIJ,Fiji,
70 | FIN,Finland,
71 | FRA,France,
72 | FRG,Germany,
73 | FSM,Micronesia,
74 | GAB,Gabon,
75 | GAM,Gambia,
76 | GBR,UK,
77 | GBS,Guinea-Bissau,
78 | GDR,Germany,
79 | GEO,Georgia,
80 | GEQ,Equatorial Guinea,
81 | GER,Germany,
82 | GHA,Ghana,
83 | GRE,Greece,
84 | GRN,Grenada,
85 | GUA,Guatemala,
86 | GUI,Guinea,
87 | GUM,Guam,
88 | GUY,Guyana,
89 | HAI,Haiti,
90 | HKG,China,Hong Kong
91 | HON,Honduras,
92 | HUN,Hungary,
93 | INA,Indonesia,
94 | IND,India,
95 | IOA,Individual Olympic Athletes,Individual Olympic Athletes
96 | IRI,Iran,
97 | IRL,Ireland,
98 | IRQ,Iraq,
99 | ISL,Iceland,
100 | ISR,Israel,
101 | ISV,"Virgin Islands, US",Virgin Islands
102 | ITA,Italy,
103 | IVB,"Virgin Islands, British",
104 | JAM,Jamaica,
105 | JOR,Jordan,
106 | JPN,Japan,
107 | KAZ,Kazakhstan,
108 | KEN,Kenya,
109 | KGZ,Kyrgyzstan,
110 | KIR,Kiribati,
111 | KOR,South Korea,
112 | KOS,Kosovo,
113 | KSA,Saudi Arabia,
114 | KUW,Kuwait,
115 | LAO,Laos,
116 | LAT,Latvia,
117 | LBA,Libya,
118 | LBR,Liberia,
119 | LCA,Saint Lucia,
120 | LES,Lesotho,
121 | LIB,Lebanon,
122 | LIE,Liechtenstein,
123 | LTU,Lithuania,
124 | LUX,Luxembourg,
125 | MAD,Madagascar,
126 | MAL,Malaysia,
127 | MAR,Morocco,
128 | MAS,Malaysia,
129 | MAW,Malawi,
130 | MDA,Moldova,
131 | MDV,Maldives,
132 | MEX,Mexico,
133 | MGL,Mongolia,
134 | MHL,Marshall Islands,
135 | MKD,Macedonia,
136 | MLI,Mali,
137 | MLT,Malta,
138 | MNE,Montenegro,
139 | MON,Monaco,
140 | MOZ,Mozambique,
141 | MRI,Mauritius,
142 | MTN,Mauritania,
143 | MYA,Myanmar,
144 | NAM,Namibia,
145 | NBO,Malaysia,North Borneo
146 | NCA,Nicaragua,
147 | NED,Netherlands,
148 | NEP,Nepal,
149 | NFL,Canada,Newfoundland
150 | NGR,Nigeria,
151 | NIG,Niger,
152 | NOR,Norway,
153 | NRU,Nauru,
154 | NZL,New Zealand,
155 | OMA,Oman,
156 | PAK,Pakistan,
157 | PAN,Panama,
158 | PAR,Paraguay,
159 | PER,Peru,
160 | PHI,Philippines,
161 | PLE,Palestine,
162 | PLW,Palau,
163 | PNG,Papua New Guinea,
164 | POL,Poland,
165 | POR,Portugal,
166 | PRK,North Korea,
167 | PUR,Puerto Rico,
168 | QAT,Qatar,
169 | RHO,Zimbabwe,
170 | ROT,NA,Refugee Olympic Team
171 | ROU,Romania,
172 | RSA,South Africa,
173 | RUS,Russia,
174 | RWA,Rwanda,
175 | SAA,Germany,
176 | SAM,Samoa,
177 | SCG,Serbia,Serbia and Montenegro
178 | SEN,Senegal,
179 | SEY,Seychelles,
180 | SIN,Singapore,
181 | SKN,Saint Kitts,Turks and Caicos Islands
182 | SLE,Sierra Leone,
183 | SLO,Slovenia,
184 | SMR,San Marino,
185 | SOL,Solomon Islands,
186 | SOM,Somalia,
187 | SRB,Serbia,
188 | SRI,Sri Lanka,
189 | SSD,South Sudan,
190 | STP,Sao Tome and Principe,
191 | SUD,Sudan,
192 | SUI,Switzerland,
193 | SUR,Suriname,
194 | SVK,Slovakia,
195 | SWE,Sweden,
196 | SWZ,Swaziland,
197 | SYR,Syria,
198 | TAN,Tanzania,
199 | TCH,Czech Republic,
200 | TGA,Tonga,
201 | THA,Thailand,
202 | TJK,Tajikistan,
203 | TKM,Turkmenistan,
204 | TLS,Timor-Leste,
205 | TOG,Togo,
206 | TPE,Taiwan,
207 | TTO,Trinidad,Trinidad and Tobago
208 | TUN,Tunisia,
209 | TUR,Turkey,
210 | TUV,NA,Tuvalu
211 | UAE,United Arab Emirates,
212 | UAR,Syria,United Arab Republic
213 | UGA,Uganda,
214 | UKR,Ukraine,
215 | UNK,NA,Unknown
216 | URS,Russia,
217 | URU,Uruguay,
218 | USA,USA,
219 | UZB,Uzbekistan,
220 | VAN,Vanuatu,
221 | VEN,Venezuela,
222 | VIE,Vietnam,
223 | VIN,Saint Vincent,
224 | VNM,Vietnam,
225 | WIF,Trinidad,West Indies Federation
226 | YAR,Yemen,North Yemen
227 | YEM,Yemen,
228 | YMD,Yemen,South Yemen
229 | YUG,Serbia,Yugoslavia
230 | ZAM,Zambia,
231 | ZIM,Zimbabwe,
--------------------------------------------------------------------------------
/data/olympics-data.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KeithGalli/complete-pandas-tutorial/a45fad77f8791a639f0de52a7b169cb65a1482a2/data/olympics-data.xlsx
--------------------------------------------------------------------------------
/data/results.feather:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KeithGalli/complete-pandas-tutorial/a45fad77f8791a639f0de52a7b169cb65a1482a2/data/results.feather
--------------------------------------------------------------------------------
/data/results.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KeithGalli/complete-pandas-tutorial/a45fad77f8791a639f0de52a7b169cb65a1482a2/data/results.parquet
--------------------------------------------------------------------------------
/images/thumbnail.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KeithGalli/complete-pandas-tutorial/a45fad77f8791a639f0de52a7b169cb65a1482a2/images/thumbnail.jpg
--------------------------------------------------------------------------------
/pandas-cheat-sheet.md.html:
--------------------------------------------------------------------------------
(Rendered HTML version of the "Python Pandas Cheat Sheet" section of README.md. The prose and code examples are identical to the cheat sheet above; the page adds only a link to the tutorial video.)
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | openpyxl
3 | pandas
4 | pyarrow
5 | pyjanitor
6 | ipykernel
--------------------------------------------------------------------------------
/warmup-data/coffee.csv:
--------------------------------------------------------------------------------
1 | Day,Coffee Type,Units Sold
2 | Monday,Espresso,25
3 | Monday,Latte,15
4 | Tuesday,Espresso,30
5 | Tuesday,Latte,20
6 | Wednesday,Espresso,35
7 | Wednesday,Latte,25
8 | Thursday,Espresso,40
9 | Thursday,Latte,30
10 | Friday,Espresso,45
11 | Friday,Latte,35
12 | Saturday,Espresso,45
13 | Saturday,Latte,35
14 | Sunday,Espresso,45
15 | Sunday,Latte,35
16 |
--------------------------------------------------------------------------------