├── .gitignore
├── Lectures
│   ├── 01 Introduction to Data Processing
│   │   ├── 01 What is Data Processing.ipynb
│   │   ├── 02 Introduction to NumPy and Pandas.ipynb
│   │   └── images
│   │       ├── array-list.png
│   │       ├── banner.png
│   │       ├── csv.png
│   │       ├── csv.webp
│   │       ├── data-processing-core.png
│   │       ├── diamond.png
│   │       ├── diamond.webp
│   │       ├── json.png
│   │       ├── numpy-pandas.png
│   │       └── types-of-data.png
│   ├── 03 Numpy Fundamentals
│   │   ├── 01 Introduction to NumPy.ipynb
│   │   ├── 02 Creating Numpy Arrays.ipynb
│   │   ├── 03 Indexing and Slicing Arrays.ipynb
│   │   ├── 04 NumPy Data Types.ipynb
│   │   ├── 05 Broadcasting.ipynb
│   │   ├── 06 Vectorization.ipynb
│   │   ├── 07 Mathematical Operations with NumPy.ipynb
│   │   ├── 08 Array Manipulation.ipynb
│   │   ├── 09 NumPy Sorting and Searching.ipynb
│   │   ├── 10 Saving and Loading Numpy Objects.ipynb
│   │   ├── 11 Advanced Numpy Topics.ipynb
│   │   └── images
│   │       ├── 1darray.png
│   │       ├── 2darray.png
│   │       ├── array-list.png
│   │       ├── array-reshaping.png
│   │       ├── banner.png
│   │       ├── boolean-indexing.png
│   │       ├── broadcasting-error.jpg
│   │       ├── broadcasting-error.png
│   │       ├── broadcasting-multiply.png
│   │       ├── broadcasting.png
│   │       ├── flattening-array.png
│   │       ├── indexing.png
│   │       ├── matrix-multiply-numbers.png
│   │       ├── matrix-multiply.png
│   │       ├── nd-array.png
│   │       ├── ndarray.png
│   │       ├── np-dtypes.webp
│   │       ├── np-int-range.png
│   │       ├── numpy-array-data-structure.png
│   │       ├── numpy-arrays.png
│   │       ├── numpy-arrays.webp
│   │       ├── numpy-indexing.png
│   │       ├── numpy-slicing.png
│   │       ├── ravel.png
│   │       ├── reshaping-array.png
│   │       ├── rule-1.png
│   │       ├── rule-2.png
│   │       ├── rule-3.png
│   │       ├── saving-numpy-array.png
│   │       ├── scalar-vector-matrix-tensor.png
│   │       ├── set-operations.png
│   │       ├── sorting-array.png
│   │       ├── splitting-array.png
│   │       ├── stacking-array.png
│   │       ├── stacking.png
│   │       ├── tiling-array.png
│   │       ├── titanic.png
│   │       ├── vectorization.png
│   │       └── view-copy.png
│   ├── 04 Pandas Fundamentals
│   │   ├── 01 Introduction to Pandas.ipynb
│   │   ├── 02 Pandas Series.ipynb
│   │   ├── 03 Pandas Dataframes.ipynb
│   │   ├── 04 Basic Indexing and Selecting Data.ipynb
│   │   ├── 05 More on Indexing.ipynb
│   │   ├── 06 Multi-indexing.ipynb
│   │   ├── 07 Database Fundamentals.ipynb
│   │   ├── 08 Applying Functions to Series and DataFrames.ipynb
│   │   ├── 09 String Manipulation.ipynb
│   │   ├── 10 Date and Time in Pandas.ipynb
│   │   ├── 11 Categorical Data.ipynb
│   │   ├── 12 Handling Duplicate Data.ipynb
│   │   ├── 13 Handling Missing Values.ipynb
│   │   ├── 14 Merging, Joining, and Concatenating DataFrames.ipynb
│   │   ├── 15 Grouping and Aggregating Data.ipynb
│   │   ├── 16 Reshaping Data: Pivoting, Melting, Stacking, and Unstacking.ipynb
│   │   ├── 17 Styling DataFrames.ipynb
│   │   ├── 18 Plotting with Pandas.ipynb
│   │   └── images
│   │       ├── apply-function.png
│   │       ├── banner.png
│   │       ├── categorical.png
│   │       ├── concat.png
│   │       ├── concatenation.png
│   │       ├── data-query.webp
│   │       ├── data-types.png
│   │       ├── database-tables.png
│   │       ├── database.png
│   │       ├── dataframe-properties.png
│   │       ├── dataframe.png
│   │       ├── duplicates.png
│   │       ├── foreign-key.png
│   │       ├── many-to-many.png
│   │       ├── melt.png
│   │       ├── merge-left.svg
│   │       ├── merge.png
│   │       ├── missing-values.png
│   │       ├── one-to-many.png
│   │       ├── one-to-one.png
│   │       ├── pandas-data-structures.png
│   │       ├── pandas-data-types.png
│   │       ├── pandas-grouping.png
│   │       ├── pivot.png
│   │       ├── primary-key.png
│   │       ├── series-properties.png
│   │       ├── series.png
│   │       ├── sql-joins.png
│   │       ├── sql-query-execution.png
│   │       ├── stack.png
│   │       ├── subset-pandas.png
│   │       ├── table-row-column.png
│   │       ├── time-series.png
│   │       ├── types-of-databases.png
│   │       ├── unstack.png
│   │       └── wide-vs-long.png
│   ├── 06 Exploratory Data Analysis (EDA)
│   │   ├── 01 Introduction to Exploratory Data Analysis.ipynb
│   │   ├── 02 Examining Data Types and Structures.ipynb
│   │   ├── 02 Handling Missing Data.ipynb
│   │   ├── 03 Normalization and Standardization.ipynb
│   │   ├── 04 Understanding Data Irregularities (Outliars and others).ipynb
│   │   ├── 05 Encoding Categorical Variables.ipynb
│   │   ├── 06 Detecting and Removing Duplicates.ipynb
│   │   ├── 07 Feature Engineering - Creation and Transformation.ipynb
│   │   ├── 08 Feature Engineering - Extraction.ipynb
│   │   ├── 09 Feature Engineering - Selection.ipynb
│   │   └── images
│   │       ├── anomaly-vs-outlier.png
│   │       ├── autoencoder-image.png
│   │       ├── autoencoder.png
│   │       ├── banner.png
│   │       ├── data-concept-drift.png
│   │       ├── data-drift-2.png
│   │       ├── data-drift-3.png
│   │       ├── data-drift-vs-outliar.png
│   │       ├── data-drift.png
│   │       ├── deduplication.jpg
│   │       ├── duplicate-detection.webp
│   │       ├── feat-extraction.png
│   │       ├── feature-engineering.jpg
│   │       ├── feature-engineering.webp
│   │       ├── feature-extraction-2.webp
│   │       ├── feature-extraction.webp
│   │       ├── feature-selection.jpg
│   │       ├── feature-selection.ppm
│   │       ├── feature-transformation-2.png
│   │       ├── feature-transformation.png
│   │       ├── lda.png
│   │       ├── lda.webp
│   │       ├── normalization-standardization.avif
│   │       ├── novelty.png
│   │       ├── outliar.jpg
│   │       ├── outliar.png
│   │       ├── pca.jpg
│   │       ├── resume.pdf
│   │       ├── tsne-algorithm.webp
│   │       └── wrapper-based.webp
│   ├── 07 Preparing Data for Machine Learning
│   │   ├── 01 Train-Test Split.ipynb
│   │   ├── 02 Cross-Validation Setup.ipynb
│   │   └── 03 Data Pipeline Creation.ipynb
│   ├── 08 Course Projects
│   │   ├── 01 Applying Data Processing Techniques to Real Datasets.ipynb
│   │   ├── 02 Iris Dataset.ipynb
│   │   ├── 03 Titanic Dataset.ipynb
│   │   └── 04 Twitter Sentiment Analysis Dataset.ipynb
│   └── images
│       └── exercise-banner.gif
├── README.md
└── images
    ├── banner.png
    └── pytopia-course.png
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 |
162 | *.npy
163 | *.npz
164 | *.txt
165 | *.csv
166 | test.ipynb
167 |
--------------------------------------------------------------------------------
/Lectures/01 Introduction to Data Processing/01 What is Data Processing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "vscode": {
7 | "languageId": "plaintext"
8 | }
9 | },
10 | "source": [
11 | ""
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "# Introduction to Data Processing"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "In today's fast-paced, data-driven world, information is being generated at an unprecedented rate. From social media interactions and e-commerce transactions to sensor data and scientific experiments, data has become a crucial asset for organizations across all industries. The ability to harness the power of this data is what sets successful companies apart from their competitors.\n"
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {},
31 | "source": [
32 | "Data, in its raw form, is often unstructured, noisy, and difficult to interpret. This is where data processing comes into play. Data processing is the foundation upon which insights are generated, decisions are made, and value is created. It involves transforming raw data into a clean, structured, and meaningful format that can be easily analyzed and understood.\n"
33 | ]
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "metadata": {},
38 | "source": [
39 | "Imagine you are a retailer with millions of customer transactions. Without proper data processing, this information would be nothing more than a massive collection of numbers and text. However, by applying data processing techniques, you can:\n",
40 | "\n",
41 | "- Identify purchasing patterns and trends\n",
42 | "- Segment customers based on their behavior\n",
43 | "- Optimize inventory management\n",
44 | "- Personalize marketing campaigns\n",
45 | "- Detect fraudulent activities\n"
46 | ]
47 | },
48 | {
49 | "cell_type": "markdown",
50 | "metadata": {},
51 | "source": [
52 | "These insights enable you to make data-driven decisions that improve customer satisfaction, increase operational efficiency, and drive business growth.\n"
53 | ]
54 | },
55 | {
56 | "cell_type": "markdown",
57 | "metadata": {},
58 | "source": [
59 | "Data processing is not limited to business applications. In scientific research, data processing is essential for making groundbreaking discoveries. By analyzing vast amounts of data from experiments and simulations, scientists can uncover patterns, test hypotheses, and develop new theories.\n"
60 | ]
61 | },
62 | {
63 | "cell_type": "markdown",
64 | "metadata": {},
65 | "source": [
66 | "As machine learning and artificial intelligence continue to advance, the importance of data processing becomes even more evident. Machine learning algorithms rely on high-quality, preprocessed data to learn patterns and make accurate predictions. Without proper data processing, even the most sophisticated algorithms will fail to deliver reliable results.\n"
67 | ]
68 | },
69 | {
70 | "cell_type": "markdown",
71 | "metadata": {},
72 | "source": [
73 | "In this course, we will explore the fundamental concepts and techniques of data processing, with a focus on their application in machine learning. By the end of this course, you will have a solid understanding of how to transform raw data into a valuable asset that drives insights and powers intelligent systems."
74 | ]
75 | },
76 | {
77 | "cell_type": "markdown",
78 | "metadata": {},
79 | "source": [
80 | "**Table of contents** \n",
81 | "- [Definition of Data Processing](#toc1_) \n",
82 | "- [Types of Data](#toc2_) \n",
83 | " - [Structured Data](#toc2_1_) \n",
84 | " - [Semi-structured Data](#toc2_2_) \n",
85 | " - [Unstructured Data](#toc2_3_) \n",
86 | "- [Data Processing in Machine Learning](#toc3_) \n",
87 | " - [Preparing Data for Machine Learning Algorithms](#toc3_1_) \n",
88 | " - [Significance of Data Processing in Machine Learning](#toc3_2_) \n",
89 | " - [Impact of Data Quality on Model Performance](#toc3_3_) \n",
90 | "- [Stages of Data Processing](#toc4_) \n",
91 | " - [Data Collection](#toc4_1_) \n",
92 | " - [Data Cleaning](#toc4_2_) \n",
93 | " - [Data Transformation](#toc4_3_) \n",
94 | " - [Data Enrichment](#toc4_4_) \n",
95 | " - [Store Data](#toc4_5_) \n",
96 | "- [Challenges in Data Processing](#toc5_) \n",
97 | "- [Conclusion](#toc6_) \n",
98 | "\n",
99 | "\n",
106 | ""
107 | ]
108 | },
109 | {
110 | "cell_type": "markdown",
111 | "metadata": {},
112 | "source": [
113 | "## [Definition of Data Processing](#toc0_)"
114 | ]
115 | },
116 | {
117 | "cell_type": "markdown",
118 | "metadata": {},
119 | "source": [
120 | "Data processing is the act of converting raw data into a format that is more meaningful, insightful, and useful for further analysis and decision-making. It involves a series of steps and techniques that transform data from its original form into a structured, cleaned, and enriched state.\n"
121 | ]
122 | },
123 | {
124 | "cell_type": "markdown",
125 | "metadata": {},
126 | "source": [
127 | "Consider raw data as a rough diamond. In its unpolished form, a diamond may not appear particularly valuable or attractive. However, through the process of cutting, shaping, and polishing, a skilled craftsman can transform the raw diamond into a brilliant, valuable gem. Similarly, data processing takes raw data and refines it into a valuable asset that can drive business value and scientific discoveries.\n"
128 | ]
129 | },
130 | {
131 | "cell_type": "markdown",
132 | "metadata": {},
133 | "source": [
134 | ""
135 | ]
136 | },
137 | {
138 | "cell_type": "markdown",
139 | "metadata": {},
140 | "source": [
141 | "At its core, data processing aims to:\n",
142 | "\n",
143 | "1. **Extract**: Identify and extract relevant data from various sources, such as databases, files, or APIs.\n",
144 | "\n",
145 | "2. **Clean**: Remove inconsistencies, errors, and missing values from the data. This step ensures that the data is accurate and reliable.\n",
146 | "\n",
147 | "3. **Transform**: Convert the data into a standardized format that is suitable for analysis. This may involve tasks such as normalizing values, aggregating data points, or creating new features.\n",
148 | "\n",
149 | "4. **Enrich**: Combine the data with additional information from external sources to provide more context and depth.\n",
150 | "\n",
151 | "5. **Store**: Save the processed data in a structured format, such as a database or file, for easy access and retrieval.\n"
152 | ]
153 | },
154 | {
155 | "cell_type": "markdown",
156 | "metadata": {},
157 | "source": [
158 | ""
159 | ]
160 | },
161 | {
162 | "cell_type": "markdown",
163 | "metadata": {},
164 | "source": [
165 | "By applying these steps, data processing transforms raw data into a more meaningful and useful format. For example, consider a dataset containing customer purchase histories. In its raw form, this data may include irrelevant information, missing values, and inconsistent formats. Through data processing, you can:\n"
166 | ]
167 | },
168 | {
169 | "cell_type": "markdown",
170 | "metadata": {},
171 | "source": [
172 | "- Remove irrelevant columns and rows\n",
173 | "- Handle missing values by imputing them or removing the corresponding records\n",
174 | "- Convert the data into a standardized format (e.g., converting all dates to a specific format)\n",
175 | "- Enrich the data by adding customer demographics or product categories\n",
176 | "- Store the processed data in a structured database for efficient querying and analysis\n"
177 | ]
178 | },
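179 | {
180 | "cell_type": "markdown",
181 | "metadata": {},
182 | "source": [
183 | "For instance, a minimal sketch of these steps in pandas might look like the following (the column names, values, and imputation choice are hypothetical, purely for illustration):\n"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": null,
189 | "metadata": {},
190 | "outputs": [],
191 | "source": [
192 | "import pandas as pd\n",
193 | "\n",
194 | "# Hypothetical raw purchase history\n",
195 | "df = pd.DataFrame({\n",
196 | "    \"customer_id\": [1, 2, 2, 3],\n",
197 | "    \"purchase_date\": [\"2023-01-05\", \"2023-02-05\", \"2023-02-05\", \"2023-03-10\"],\n",
198 | "    \"amount\": [120.0, None, None, 85.5],\n",
199 | "    \"internal_note\": [\"a\", \"b\", \"b\", \"c\"],  # irrelevant for analysis\n",
200 | "})\n",
201 | "\n",
202 | "df = df.drop(columns=[\"internal_note\"])  # remove irrelevant columns\n",
203 | "df = df.drop_duplicates()  # remove duplicate records\n",
204 | "df[\"amount\"] = df[\"amount\"].fillna(df[\"amount\"].median())  # impute missing values\n",
205 | "df[\"purchase_date\"] = pd.to_datetime(df[\"purchase_date\"])  # standardize dates\n",
206 | "\n",
207 | "# Enrich with demographics from another (hypothetical) table\n",
208 | "demographics = pd.DataFrame({\"customer_id\": [1, 2, 3], \"segment\": [\"A\", \"B\", \"A\"]})\n",
209 | "df = df.merge(demographics, on=\"customer_id\", how=\"left\")\n",
210 | "df"
211 | ]
212 | },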
179 | {
180 | "cell_type": "markdown",
181 | "metadata": {},
182 | "source": [
183 | "The processed data is now ready to be analyzed, visualized, and used for decision-making. By transforming the raw data into a more meaningful format, data processing enables organizations to extract valuable insights that would otherwise remain hidden in the vast amounts of unstructured data.\n"
184 | ]
185 | },
186 | {
187 | "cell_type": "markdown",
188 | "metadata": {},
189 | "source": [
190 | "In the context of machine learning, data processing is a critical step in preparing data for training models. Machine learning algorithms require clean, consistent, and relevant data to learn patterns and make accurate predictions. By applying data processing techniques, you can ensure that the input data meets the requirements of the chosen algorithm and improves the overall performance of the model."
191 | ]
192 | },
193 | {
194 | "cell_type": "markdown",
195 | "metadata": {},
196 | "source": [
197 | "## [Types of Data](#toc0_)"
198 | ]
199 | },
200 | {
201 | "cell_type": "markdown",
202 | "metadata": {},
203 | "source": [
204 | "Data comes in various forms and structures. Understanding the different types of data is crucial for determining the appropriate processing techniques and analysis methods. In this section, we will explore three main categories of data: structured, semi-structured, and unstructured.\n"
205 | ]
206 | },
207 | {
208 | "cell_type": "markdown",
209 | "metadata": {},
210 | "source": [
211 | ""
212 | ]
213 | },
214 | {
215 | "cell_type": "markdown",
216 | "metadata": {},
217 | "source": [
218 | "### [Structured Data](#toc0_)"
219 | ]
220 | },
221 | {
222 | "cell_type": "markdown",
223 | "metadata": {},
224 | "source": [
225 | "Structured data is data that follows a predefined schema and can be easily organized into rows and columns. It has a fixed format and conforms to a specific data model. Structured data is typically stored in relational databases and can be efficiently queried using SQL (Structured Query Language).\n"
226 | ]
227 | },
228 | {
229 | "cell_type": "markdown",
230 | "metadata": {},
231 | "source": [
232 | "Examples of structured data include:\n",
233 | "- Customer information in a CRM (Customer Relationship Management) system\n",
234 | "- Financial transactions in a banking database\n",
235 | "- Inventory records in an ERP (Enterprise Resource Planning) system\n"
236 | ]
237 | },
238 | {
239 | "cell_type": "markdown",
240 | "metadata": {},
241 | "source": [
242 | "Characteristics of structured data:\n",
243 | "- Follows a predefined schema\n",
244 | "- Has a fixed format and structure\n",
245 | "- Can be easily searched and sorted\n",
246 | "- Suitable for complex queries and aggregations\n",
247 | "- Efficiently stored in relational databases\n"
248 | ]
249 | },
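250 | {
251 | "cell_type": "markdown",
252 | "metadata": {},
253 | "source": [
254 | "A tiny sketch of what this looks like in practice, using Python's built-in sqlite3 module and a made-up table:\n"
255 | ]
256 | },
257 | {
258 | "cell_type": "code",
259 | "execution_count": null,
260 | "metadata": {},
261 | "outputs": [],
262 | "source": [
263 | "import sqlite3\n",
264 | "\n",
265 | "conn = sqlite3.connect(\":memory:\")  # throwaway in-memory database\n",
266 | "conn.execute(\"CREATE TABLE customers (id INTEGER PRIMARY KEY, name TEXT, country TEXT)\")\n",
267 | "conn.executemany(\n",
268 | "    \"INSERT INTO customers (name, country) VALUES (?, ?)\",\n",
269 | "    [(\"Alice\", \"DE\"), (\"Bob\", \"US\"), (\"Carol\", \"DE\")],\n",
270 | ")\n",
271 | "\n",
272 | "# The fixed schema makes searching and aggregating straightforward\n",
273 | "for row in conn.execute(\"SELECT country, COUNT(*) FROM customers GROUP BY country\"):\n",
274 | "    print(row)"
275 | ]
276 | },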
250 | {
251 | "cell_type": "markdown",
252 | "metadata": {},
253 | "source": [
254 | ""
255 | ]
256 | },
257 | {
258 | "cell_type": "markdown",
259 | "metadata": {},
260 | "source": [
261 | "### [Semi-structured Data](#toc0_)"
262 | ]
263 | },
264 | {
265 | "cell_type": "markdown",
266 | "metadata": {},
267 | "source": [
268 | "Semi-structured data has a partially defined structure but does not conform to a rigid schema. It contains tags or metadata that describe the data, but the structure may vary across different records. Semi-structured data is often stored in formats like XML (eXtensible Markup Language) or JSON (JavaScript Object Notation).\n"
269 | ]
270 | },
271 | {
272 | "cell_type": "markdown",
273 | "metadata": {},
274 | "source": [
275 | "Examples of semi-structured data include:\n",
276 | "- XML files used for data exchange between systems\n",
277 | "- JSON data returned by web APIs\n",
278 | "- Log files generated by applications\n"
279 | ]
280 | },
281 | {
282 | "cell_type": "markdown",
283 | "metadata": {},
284 | "source": [
285 | "Characteristics of semi-structured data:\n",
286 | "- Has a partially defined structure\n",
287 | "- Contains tags or metadata to describe the data\n",
288 | "- Allows for flexibility in the structure across records\n",
289 | "- Can be parsed and processed using specialized tools\n",
290 | "- Suitable for hierarchical or nested data representations\n"
291 | ]
292 | },
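293 | {
294 | "cell_type": "markdown",
295 | "metadata": {},
296 | "source": [
297 | "For example, two records in a JSON dataset can share some fields while differing in others, and code that processes them has to tolerate that flexibility (a hypothetical sketch):\n"
298 | ]
299 | },
300 | {
301 | "cell_type": "code",
302 | "execution_count": null,
303 | "metadata": {},
304 | "outputs": [],
305 | "source": [
306 | "import json\n",
307 | "\n",
308 | "# Two hypothetical records with partially shared structure: the second\n",
309 | "# carries a nested \"address\" object that the first lacks\n",
310 | "records = '''\n",
311 | "[\n",
312 | "  {\"id\": 1, \"name\": \"Alice\"},\n",
313 | "  {\"id\": 2, \"name\": \"Bob\", \"address\": {\"city\": \"Berlin\", \"zip\": \"10115\"}}\n",
314 | "]\n",
315 | "'''\n",
316 | "\n",
317 | "for record in json.loads(records):\n",
318 | "    # Use .get() because the structure may vary across records\n",
319 | "    city = record.get(\"address\", {}).get(\"city\", \"unknown\")\n",
320 | "    print(record[\"name\"], city)"
321 | ]
322 | },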
293 | {
294 | "cell_type": "markdown",
295 | "metadata": {},
296 | "source": [
297 | ""
298 | ]
299 | },
300 | {
301 | "cell_type": "markdown",
302 | "metadata": {},
303 | "source": [
304 | "### [Unstructured Data](#toc0_)"
305 | ]
306 | },
307 | {
308 | "cell_type": "markdown",
309 | "metadata": {},
310 | "source": [
311 | "Unstructured data does not follow a predefined schema or structure. It is often free-form text, images, audio, or video data that cannot be easily organized into rows and columns. Unstructured data requires specialized processing techniques and tools to extract meaningful insights.\n"
312 | ]
313 | },
314 | {
315 | "cell_type": "markdown",
316 | "metadata": {},
317 | "source": [
318 | "Examples of unstructured data include:\n",
319 | "- Social media posts and comments\n",
320 | "- Customer reviews and feedback\n",
321 | "- Images and videos shared on platforms like Instagram or YouTube\n",
322 | "- Audio recordings of customer support calls\n"
323 | ]
324 | },
325 | {
326 | "cell_type": "markdown",
327 | "metadata": {},
328 | "source": [
329 | "Characteristics of unstructured data:\n",
330 | "- Lacks a predefined schema or structure\n",
331 | "- Often in the form of text, images, audio, or video\n",
332 | "- Requires advanced processing techniques like natural language processing (NLP) or computer vision\n",
333 | "- Can be challenging to store and query efficiently\n",
334 | "- Offers rich opportunities for insights and analysis\n"
335 | ]
336 | },
337 | {
338 | "cell_type": "markdown",
339 | "metadata": {},
340 | "source": [
341 | "Understanding the type of data you are working with is essential for selecting the appropriate data processing techniques. Structured data can be easily processed using traditional data processing tools and stored in relational databases. Semi-structured data requires parsing and transformation techniques to extract relevant information. Unstructured data demands advanced processing methods, such as text mining, sentiment analysis, or image recognition, to derive meaningful insights.\n"
342 | ]
343 | },
344 | {
345 | "cell_type": "markdown",
346 | "metadata": {},
347 | "source": [
348 | "In the context of machine learning, structured and semi-structured data are commonly used for training models, while unstructured data often requires preprocessing steps to convert it into a structured format suitable for machine learning algorithms.\n"
349 | ]
350 | },
351 | {
352 | "cell_type": "markdown",
353 | "metadata": {},
354 | "source": [
355 | "By recognizing the characteristics and challenges associated with each type of data, you can develop effective strategies for data processing and analysis, enabling you to unlock the full potential of your data assets."
356 | ]
357 | },
358 | {
359 | "cell_type": "markdown",
360 | "metadata": {},
361 | "source": [
362 | "## [Data Processing in Machine Learning](#toc0_)"
363 | ]
364 | },
365 | {
366 | "cell_type": "markdown",
367 | "metadata": {},
368 | "source": [
369 | "Data processing plays a pivotal role in machine learning, serving as the foundation upon which models are built and trained. Machine learning algorithms rely on high-quality, well-structured data to learn patterns, make accurate predictions, and generate valuable insights. In this section, we will explore the significance of data processing in machine learning, discuss techniques for preparing data, and highlight the impact of data quality on model performance.\n"
370 | ]
371 | },
372 | {
373 | "cell_type": "markdown",
374 | "metadata": {},
375 | "source": [
376 | "### [Preparing Data for Machine Learning Algorithms](#toc0_)"
377 | ]
378 | },
379 | {
380 | "cell_type": "markdown",
381 | "metadata": {},
382 | "source": [
383 | "Preparing data for machine learning algorithms involves several key steps:\n",
384 | "\n",
385 | "1. **Data Collection**: Gather relevant data from various sources, such as databases, APIs, or file systems. Ensure that the collected data aligns with the problem statement and contains sufficient information for training the models.\n",
386 | "\n",
387 | "2. **Data Exploration**: Perform exploratory data analysis (EDA) to gain insights into the data. Visualize the data using plots and summary statistics to identify patterns, distributions, and relationships. EDA helps in understanding the characteristics of the data and guides the subsequent preprocessing steps.\n",
388 | "\n",
389 | "3. **Data Cleaning**: Handle missing values, remove duplicates, and address inconsistencies in the data. Techniques such as imputation, filtering, and data transformation can be applied to clean the data and ensure its quality.\n",
390 | "\n",
391 | "4. **Feature Selection and Extraction**: Select relevant features that have a strong correlation with the target variable. Remove irrelevant or redundant features to reduce dimensionality and improve model efficiency. Apply feature extraction techniques, such as principal component analysis (PCA) or t-SNE, to create new informative features.\n",
392 | "\n",
393 | "5. **Data Splitting**: Split the preprocessed data into training, validation, and testing sets. The training set is used to train the machine learning model, the validation set is used for hyperparameter tuning and model selection, and the testing set is used to evaluate the final model's performance on unseen data.\n",
394 | "\n",
395 | "6. **Data Transformation**: Apply data transformation techniques, such as scaling, normalization, or encoding categorical variables, to ensure that the data is in a suitable format for the chosen machine learning algorithm. Different algorithms may have specific requirements for data representation.\n"
396 | ]
397 | },
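398 | {
399 | "cell_type": "markdown",
400 | "metadata": {},
401 | "source": [
402 | "As a rough sketch of the splitting and transformation steps with scikit-learn (the data here is synthetic; a validation set could be carved out of the training portion in the same way):\n"
403 | ]
404 | },
405 | {
406 | "cell_type": "code",
407 | "execution_count": null,
408 | "metadata": {},
409 | "outputs": [],
410 | "source": [
411 | "import numpy as np\n",
412 | "from sklearn.model_selection import train_test_split\n",
413 | "from sklearn.preprocessing import StandardScaler\n",
414 | "\n",
415 | "# Synthetic feature matrix and binary target, for illustration only\n",
416 | "rng = np.random.default_rng(42)\n",
417 | "X = rng.normal(size=(100, 3))\n",
418 | "y = rng.integers(0, 2, size=100)\n",
419 | "\n",
420 | "# Split into training and testing sets\n",
421 | "X_train, X_test, y_train, y_test = train_test_split(\n",
422 | "    X, y, test_size=0.2, random_state=42\n",
423 | ")\n",
424 | "\n",
425 | "# Fit the scaler on the training data only, then apply it to both\n",
426 | "# splits to avoid leaking information from the test set\n",
427 | "scaler = StandardScaler()\n",
428 | "X_train_scaled = scaler.fit_transform(X_train)\n",
429 | "X_test_scaled = scaler.transform(X_test)"
430 | ]
431 | },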
398 | {
399 | "cell_type": "markdown",
400 | "metadata": {},
401 | "source": [
402 | "### [Significance of Data Processing in Machine Learning](#toc0_)"
403 | ]
404 | },
405 | {
406 | "cell_type": "markdown",
407 | "metadata": {},
408 | "source": [
409 | "Data processing is a critical step in the machine learning pipeline. It involves transforming raw data into a format that is suitable for training machine learning models. The quality and structure of the input data directly influence the performance and accuracy of the resulting models.\n"
410 | ]
411 | },
412 | {
413 | "cell_type": "markdown",
414 | "metadata": {},
415 | "source": [
416 | "Here are some key reasons why data processing is significant in machine learning:\n",
417 | "\n",
418 | "1. **Feature Engineering**: Data processing allows for the creation of new features or the transformation of existing features to capture relevant information. Feature engineering helps in extracting meaningful patterns and relationships from the data, which can improve the predictive power of machine learning models.\n",
419 | "\n",
420 | "2. **Data Cleaning**: Real-world data often contains noise, inconsistencies, and missing values. Data processing techniques are used to clean the data by handling missing values, removing outliers, and resolving inconsistencies. Clean data ensures that the machine learning algorithms can learn from reliable and accurate information.\n",
421 | "\n",
422 | "3. **Data Integration**: Machine learning projects often involve data from multiple sources. Data processing techniques are used to integrate and merge data from different sources, ensuring consistency and compatibility. Integrated data provides a comprehensive view of the problem domain and enables more robust model training.\n",
423 | "\n",
424 | "4. **Data Scaling and Normalization**: Many machine learning algorithms are sensitive to the scale and distribution of the input features. Data processing techniques, such as scaling and normalization, are applied to bring the features into a consistent range and distribution. This helps in improving the convergence and performance of the learning algorithms.\n"
425 | ]
426 | },
427 | {
428 | "cell_type": "markdown",
429 | "metadata": {},
430 | "source": [
431 | "### [Impact of Data Quality on Model Performance](#toc0_)"
432 | ]
433 | },
434 | {
435 | "cell_type": "markdown",
436 | "metadata": {},
437 | "source": [
438 | "The quality of the input data has a significant impact on the performance of machine learning models. Poor data quality can lead to inaccurate predictions, biased models, and suboptimal decision-making. Here are some ways in which data quality affects model performance:\n",
439 | "\n",
440 | "1. **Noise and Outliers**: Noisy data or the presence of outliers can mislead the learning algorithms and result in overfitting or underfitting. Outliers can distort the patterns and relationships in the data, leading to incorrect generalizations.\n",
441 | "\n",
442 | "2. **Missing Values**: Missing values in the data can introduce bias and reduce the effectiveness of machine learning models. Incomplete data can lead to inaccurate predictions and limit the model's ability to capture the true underlying patterns.\n",
443 | "\n",
444 | "3. **Imbalanced Data**: Imbalanced datasets, where one class significantly outnumbers the other, can bias the model towards the majority class. This can result in poor performance on the minority class and limit the model's ability to make accurate predictions for underrepresented instances.\n",
445 | "\n",
446 | "4. **Inconsistent Data**: Inconsistencies in the data, such as conflicting values or inconsistent formatting, can confuse the learning algorithms and lead to incorrect patterns being learned. Consistent and standardized data is essential for accurate model training.\n",
447 | "\n",
448 | "5. **Insufficient Data**: Insufficient training data can limit the model's ability to learn complex patterns and generalize well to unseen instances. Models trained on small datasets may suffer from high variance and overfitting, resulting in poor performance on new data.\n"
449 | ]
450 | },
451 | {
452 | "cell_type": "markdown",
453 | "metadata": {},
454 | "source": [
455 | "To mitigate the impact of data quality issues, it is crucial to invest time and effort in data preprocessing and quality assurance. Techniques such as data validation, anomaly detection, and data augmentation can help in identifying and addressing data quality problems. Regular monitoring and maintenance of data pipelines ensure that the data used for training and inference remains reliable and consistent.\n"
456 | ]
457 | },
458 | {
459 | "cell_type": "markdown",
460 | "metadata": {},
461 | "source": [
462 | "In summary, data processing is a critical component of machine learning. It enables the transformation of raw data into a format suitable for training models, handles data quality issues, and ensures that the input data is reliable and informative. By paying close attention to data processing and quality, machine learning practitioners can build robust and accurate models that drive valuable insights and decision-making."
463 | ]
464 | },
465 | {
466 | "cell_type": "markdown",
467 | "metadata": {},
468 | "source": [
469 | "## [Stages of Data Processing](#toc0_)"
470 | ]
471 | },
472 | {
473 | "cell_type": "markdown",
474 | "metadata": {},
475 | "source": [
476 | "Data processing is a multi-stage process that involves transforming raw data into meaningful insights and actionable information. Each stage plays a crucial role in ensuring the accuracy, reliability, and usability of the data. In this section, we will explore the five key stages of data processing: data collection, data cleaning, data transformation, data analysis, and data modeling.\n"
477 | ]
478 | },
479 | {
480 | "cell_type": "markdown",
481 | "metadata": {},
482 | "source": [
483 | ""
484 | ]
485 | },
486 | {
487 | "cell_type": "markdown",
488 | "metadata": {},
489 | "source": [
490 | "### [Data Collection](#toc0_)"
491 | ]
492 | },
493 | {
494 | "cell_type": "markdown",
495 | "metadata": {},
496 | "source": [
497 | "Data collection is the first stage of the data processing pipeline. It involves gathering raw data from various sources, such as databases, APIs, sensors, or user inputs. The goal is to acquire a comprehensive and representative dataset that aligns with the objectives of the data processing task.\n"
498 | ]
499 | },
500 | {
501 | "cell_type": "markdown",
502 | "metadata": {},
503 | "source": [
504 | "Key considerations during data collection include:\n",
505 | "- Identifying relevant data sources\n",
506 | "- Determining the required data format and structure\n",
507 | "- Establishing data collection mechanisms and protocols\n",
508 | "- Ensuring data privacy and security during the collection process\n",
509 | "- Validating the accuracy and completeness of the collected data\n"
510 | ]
511 | },
512 | {
513 | "cell_type": "markdown",
514 | "metadata": {},
515 | "source": [
516 | "### [Data Cleaning](#toc0_)"
517 | ]
518 | },
519 | {
520 | "cell_type": "markdown",
521 | "metadata": {},
522 | "source": [
523 | "Once the data is collected, the next stage is data cleaning. Real-world data often contains errors, inconsistencies, and missing values that can impact the quality of the analysis. Data cleaning involves identifying and correcting these issues to ensure the integrity and reliability of the dataset.\n"
524 | ]
525 | },
526 | {
527 | "cell_type": "markdown",
528 | "metadata": {},
529 | "source": [
530 | "Common data cleaning tasks include:\n",
531 | "- Handling missing or incomplete data\n",
532 | "- Removing duplicates and irrelevant records\n",
533 | "- Correcting data inconsistencies and formatting issues\n",
534 | "- Dealing with outliers and anomalies\n",
535 | "- Standardizing data representation and encoding\n"
536 | ]
537 | },
538 | {
539 | "cell_type": "markdown",
540 | "metadata": {},
541 | "source": [
542 | "Data cleaning is a critical step in preparing the data for subsequent processing stages and analysis.\n"
543 | ]
544 | },
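545 | {
546 | "cell_type": "markdown",
547 | "metadata": {},
548 | "source": [
549 | "A small illustration of a few of these tasks in pandas (the data is made up, and the IQR rule below is just one common convention for flagging outliers):\n"
550 | ]
551 | },
552 | {
553 | "cell_type": "code",
554 | "execution_count": null,
555 | "metadata": {},
556 | "outputs": [],
557 | "source": [
558 | "import pandas as pd\n",
559 | "\n",
560 | "s = pd.Series([10.0, 12.0, 11.0, None, 300.0, 9.0])  # 300.0 looks suspicious\n",
561 | "\n",
562 | "s = s.fillna(s.median())  # handle missing data with median imputation\n",
563 | "\n",
564 | "# Flag outliers with the interquartile range (IQR) rule\n",
565 | "q1, q3 = s.quantile(0.25), s.quantile(0.75)\n",
566 | "iqr = q3 - q1\n",
567 | "is_outlier = (s < q1 - 1.5 * iqr) | (s > q3 + 1.5 * iqr)\n",
568 | "cleaned = s[~is_outlier]\n",
569 | "cleaned"
570 | ]
571 | },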
545 | {
546 | "cell_type": "markdown",
547 | "metadata": {},
548 | "source": [
549 | "### [Data Transformation](#toc0_)"
550 | ]
551 | },
552 | {
553 | "cell_type": "markdown",
554 | "metadata": {},
555 | "source": [
556 | "Data transformation involves converting the cleaned data into a format suitable for analysis and modeling. This stage aims to structure and organize the data in a way that facilitates efficient processing and extraction of meaningful insights.\n"
557 | ]
558 | },
559 | {
560 | "cell_type": "markdown",
561 | "metadata": {},
562 | "source": [
563 | "Data transformation tasks may include:\n",
564 | "- Aggregating data from multiple sources\n",
565 | "- Merging and joining datasets based on common attributes\n",
566 | "- Reshaping data into a desired format (e.g., wide to long, or vice versa)\n",
567 | "- Creating new features or variables through calculations or derivations\n",
568 | "- Scaling and normalizing numerical data\n",
569 | "- Encoding categorical variables\n"
570 | ]
571 | },
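572 | {
573 | "cell_type": "markdown",
574 | "metadata": {},
575 | "source": [
576 | "Two of these transformations sketched in pandas, one-hot encoding a categorical column and reshaping from wide to long (the dataset is hypothetical):\n"
577 | ]
578 | },
579 | {
580 | "cell_type": "code",
581 | "execution_count": null,
582 | "metadata": {},
583 | "outputs": [],
584 | "source": [
585 | "import pandas as pd\n",
586 | "\n",
587 | "wide = pd.DataFrame({\n",
588 | "    \"store\": [\"A\", \"B\"],\n",
589 | "    \"region\": [\"north\", \"south\"],  # categorical variable\n",
590 | "    \"sales_q1\": [100, 80],\n",
591 | "    \"sales_q2\": [120, 90],\n",
592 | "})\n",
593 | "\n",
594 | "# Encode the categorical column as indicator (one-hot) variables\n",
595 | "encoded = pd.get_dummies(wide, columns=[\"region\"])\n",
596 | "\n",
597 | "# Reshape the quarterly sales columns from wide to long format\n",
598 | "long = wide.melt(\n",
599 | "    id_vars=[\"store\", \"region\"],\n",
600 | "    value_vars=[\"sales_q1\", \"sales_q2\"],\n",
601 | "    var_name=\"quarter\",\n",
602 | "    value_name=\"sales\",\n",
603 | ")\n",
604 | "long"
605 | ]
606 | },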
572 | {
573 | "cell_type": "markdown",
574 | "metadata": {},
575 | "source": [
576 | "The transformed data is often stored in a structured format, such as a relational database or a file format optimized for analysis (e.g., CSV, Parquet).\n"
577 | ]
578 | },
579 | {
580 | "cell_type": "markdown",
581 | "metadata": {},
582 | "source": [
583 | "### [Data Enrichment](#toc0_)\n",
584 | "\n",
585 | "The data enrichment process is the process of enhancing the data with additional information to improve the quality of the data. This can be done by adding more data to the existing data or by adding more attributes to the existing data. The data enrichment process can be done in many ways, such as:\n",
586 | "\n",
587 | "- Adding more data to the existing data\n",
588 | "- Adding more attributes to the existing data\n",
589 | "- Adding more information to the existing data\n",
590 | "- Adding more context to the existing data\n",
591 | "- Adding more metadata to the existing data\n"
592 | ]
593 | },
594 | {
595 | "cell_type": "markdown",
596 | "metadata": {},
597 | "source": [
598 | "### [Store Data](#toc0_)\n",
599 | "\n",
600 | "The Store Data process is responsible saving the processed data in a structured format, such as a database or file, for easy access and retrieval. This process is important for data management and organization, as it allows for efficient storage and retrieval of data for future analysis and decision-making. The stored data can be used for various purposes, such as generating reports, conducting analysis, and making informed decisions based on the data."
601 | ]
602 | },
603 | {
604 | "cell_type": "markdown",
605 | "metadata": {},
606 | "source": [
607 | "By following these stages of data processing, organizations can transform raw data into valuable insights and make data-driven decisions that drive innovation and growth."
608 | ]
609 | },
610 | {
611 | "cell_type": "markdown",
612 | "metadata": {},
613 | "source": [
614 | "## [Challenges in Data Processing](#toc0_)"
615 | ]
616 | },
617 | {
618 | "cell_type": "markdown",
619 | "metadata": {},
620 | "source": [
621 | "While data processing offers numerous benefits, it also presents several challenges that organizations must navigate. In this section, we will discuss four common challenges in data processing: data volume and variety, data quality and inconsistency, data security and privacy, and integration of data from multiple sources.\n"
622 | ]
623 | },
624 | {
625 | "cell_type": "markdown",
626 | "metadata": {},
627 | "source": [
628 | "1. Data Volume and Variety\n",
629 | " - The rapid growth of data volume and variety poses significant challenges for data processing. Organizations are dealing with massive amounts of structured, semi-structured, and unstructured data from various sources, such as social media, IoT devices, and transactional systems. Processing and analyzing this data requires scalable infrastructure, advanced technologies, and efficient algorithms to handle the ever-increasing data volume and complexity.\n",
630 | "\n",
631 | "2. Data Quality and Inconsistency\n",
632 | " - Ensuring data quality and consistency is a critical challenge in data processing. Data often contains errors, duplicates, missing values, and inconsistencies that can lead to inaccurate insights and flawed decision-making. Identifying and resolving these issues requires robust data validation, cleansing, and standardization techniques. Maintaining data quality becomes even more challenging when dealing with data from multiple sources and in different formats.\n",
633 | "\n",
634 | "3. Data Security and Privacy\n",
635 | " - Data security and privacy are paramount concerns in data processing. With the increasing amount of sensitive and personal information being collected and processed, organizations must implement stringent measures to protect data from unauthorized access, breaches, and misuse. Compliance with data protection regulations, such as GDPR and CCPA, adds another layer of complexity to data processing. Balancing the need for data utilization with the responsibility of safeguarding individuals' privacy is a delicate task.\n",
636 | "\n",
637 | "4. Integration of Data from Multiple Sources\n",
638 | " - Integrating data from multiple sources is a significant challenge in data processing. Organizations often have data stored in various systems, databases, and formats, making it difficult to consolidate and harmonize the information. Integrating data requires establishing common data models, resolving schema discrepancies, and ensuring data compatibility. The lack of standardization and the presence of legacy systems further complicate the data integration process.\n"
639 | ]
640 | },
641 | {
642 | "cell_type": "markdown",
643 | "metadata": {},
644 | "source": [
645 | "To address these challenges, organizations need to adopt robust data processing strategies and leverage advanced technologies. This includes investing in scalable data storage and processing infrastructure, implementing data governance frameworks, and employing data quality management practices. Additionally, organizations must prioritize data security and privacy, implementing strict access controls, encryption, and anonymization techniques to protect sensitive information.\n"
646 | ]
647 | },
648 | {
649 | "cell_type": "markdown",
650 | "metadata": {},
651 | "source": [
652 | "Collaboration between data teams, business stakeholders, and IT departments is crucial to overcoming data processing challenges. By fostering a data-driven culture, establishing clear data policies, and continuously monitoring and optimizing data processing workflows, organizations can effectively tackle these challenges and harness the full potential of their data assets."
653 | ]
654 | },
655 | {
656 | "cell_type": "markdown",
657 | "metadata": {},
658 | "source": [
659 | "## [Conclusion](#toc0_)"
660 | ]
661 | },
662 | {
663 | "cell_type": "markdown",
664 | "metadata": {},
665 | "source": [
666 | "In this lecture, we have explored the fundamental concepts of data processing and its significance in today's data-driven world. Let's recap the key points we covered:\n",
667 | "\n",
668 | "1. Data processing is the transformation of raw data into a meaningful and useful format.\n",
669 | "2. There are three main types of data: structured, semi-structured, and unstructured, each with its own characteristics and processing requirements.\n",
670 | "3. Data processing plays a crucial role in machine learning by preparing data for algorithms and ensuring high-quality input for accurate predictions.\n",
671 | "4. The stages of data processing include data collection, cleaning, transformation, analysis, and modeling.\n",
672 | "5. Effective data processing leads to improved data quality, enhanced decision-making, increased efficiency, and better customer insights.\n",
673 | "6. Challenges in data processing include handling large volumes and variety of data, ensuring data quality and consistency, maintaining security and privacy, and integrating data from multiple sources.\n"
674 | ]
675 | },
676 | {
677 | "cell_type": "markdown",
678 | "metadata": {},
679 | "source": [
680 | "As aspiring machine learning practitioners, it is essential to have a strong understanding of data processing. The quality and relevance of the data you feed into your machine learning models directly impact their performance and reliability. By mastering data processing techniques, you can ensure that your models are built on a solid foundation of clean, structured, and meaningful data.\n"
681 | ]
682 | },
683 | {
684 | "cell_type": "markdown",
685 | "metadata": {},
686 | "source": [
687 | "Moreover, data processing skills are not limited to machine learning. They are valuable across various domains, including business intelligence, scientific research, and data-driven decision-making. By developing expertise in data processing, you open up a wide range of career opportunities and become a valuable asset to any data-centric organization.\n"
688 | ]
689 | },
690 | {
691 | "cell_type": "markdown",
692 | "metadata": {},
693 | "source": [
694 | "As we progress through this course, we will dive deeper into the practical aspects of data processing using powerful tools like NumPy and Pandas. You will learn how to load, manipulate, transform, and analyze data efficiently, enabling you to tackle real-world data challenges with confidence.\n"
695 | ]
696 | },
697 | {
698 | "cell_type": "markdown",
699 | "metadata": {},
700 | "source": [
701 | "Remember, data processing is an iterative process that requires continuous learning and adaptation. As new technologies emerge and data landscapes evolve, it is crucial to stay updated with the latest techniques and best practices. By combining your understanding of data processing fundamentals with hands-on experience and a commitment to ongoing learning, you will be well-equipped to excel in the exciting field of machine learning and data science."
702 | ]
703 | }
704 | ],
705 | "metadata": {
706 | "language_info": {
707 | "name": "python"
708 | }
709 | },
710 | "nbformat": 4,
711 | "nbformat_minor": 2
712 | }
713 |
--------------------------------------------------------------------------------
/Lectures/01 Introduction to Data Processing/images/array-list.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/01 Introduction to Data Processing/images/array-list.png
--------------------------------------------------------------------------------
/Lectures/01 Introduction to Data Processing/images/banner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/01 Introduction to Data Processing/images/banner.png
--------------------------------------------------------------------------------
/Lectures/01 Introduction to Data Processing/images/csv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/01 Introduction to Data Processing/images/csv.png
--------------------------------------------------------------------------------
/Lectures/01 Introduction to Data Processing/images/csv.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/01 Introduction to Data Processing/images/csv.webp
--------------------------------------------------------------------------------
/Lectures/01 Introduction to Data Processing/images/data-processing-core.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/01 Introduction to Data Processing/images/data-processing-core.png
--------------------------------------------------------------------------------
/Lectures/01 Introduction to Data Processing/images/diamond.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/01 Introduction to Data Processing/images/diamond.png
--------------------------------------------------------------------------------
/Lectures/01 Introduction to Data Processing/images/diamond.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/01 Introduction to Data Processing/images/diamond.webp
--------------------------------------------------------------------------------
/Lectures/01 Introduction to Data Processing/images/json.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/01 Introduction to Data Processing/images/json.png
--------------------------------------------------------------------------------
/Lectures/01 Introduction to Data Processing/images/numpy-pandas.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/01 Introduction to Data Processing/images/numpy-pandas.png
--------------------------------------------------------------------------------
/Lectures/01 Introduction to Data Processing/images/types-of-data.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/01 Introduction to Data Processing/images/types-of-data.png
--------------------------------------------------------------------------------
/Lectures/03 Numpy Fundamentals/06 Vectorization.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | ""
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "# Vectorization"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "Vectorization is a fundamental concept in NumPy that allows you to perform operations on arrays efficiently without the need for explicit loops. It is a powerful technique that can significantly improve the performance and readability of your code when working with large datasets.\n"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "**Vectorization** refers to the process of converting scalar operations into vector operations. In other words, it involves performing operations on entire arrays or vectors instead of individual elements.\n"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {},
34 | "source": [
35 | "Consider the following example:\n"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 1,
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "import numpy as np\n",
45 | "\n",
46 | "a = np.array([1, 5, 3])\n",
47 | "b = np.array([2, 3, 1])\n",
48 | "\n",
49 | "c = a + b"
50 | ]
51 | },
52 | {
53 | "cell_type": "markdown",
54 | "metadata": {},
55 | "source": [
56 | ""
57 | ]
58 | },
59 | {
60 | "cell_type": "markdown",
61 | "metadata": {},
62 | "source": [
63 | "In this example, we have two NumPy arrays, `a` and `b`, and we perform element-wise addition using the `+` operator. The resulting array `c` contains the sum of corresponding elements from `a` and `b`.\n"
64 | ]
65 | },
66 | {
67 | "cell_type": "markdown",
68 | "metadata": {},
69 | "source": [
70 | "This is an example of vectorization. Instead of using a loop to iterate over each element and perform the addition, NumPy automatically applies the operation to the entire arrays, resulting in a concise and efficient solution.\n"
71 | ]
72 | },
73 | {
74 | "cell_type": "markdown",
75 | "metadata": {},
76 | "source": [
77 | "Vectorization is a key feature of NumPy and plays a crucial role in achieving high performance and efficient computations. Here are some reasons why vectorization is important:\n",
78 | "\n",
79 | "1. **Performance**: Vectorized operations in NumPy are implemented in optimized C code, which executes much faster than pure Python code. By leveraging vectorization, you can take advantage of the underlying hardware optimizations and achieve significant speedups, especially when working with large arrays.\n",
80 | "\n",
81 | "2. **Readability and Conciseness**: Vectorized code is often more concise and easier to read compared to explicit loops. It allows you to express operations on entire arrays using a single line of code, making your code more readable and maintainable.\n",
82 | "\n",
83 | "3. **Memory Efficiency**: Vectorized operations in NumPy are memory-efficient. They minimize the need for temporary variables and intermediate results, reducing memory overhead and improving overall memory usage.\n",
84 | "\n",
85 | "4. **Parallelization**: Vectorized operations can take advantage of parallel processing capabilities of modern hardware, such as multi-core CPUs and GPUs. NumPy can automatically distribute the workload across multiple cores or utilize specialized hardware instructions, enabling faster execution of large-scale computations.\n",
86 | "\n",
87 | "5. **Integration with Other Libraries**: Many scientific Python libraries, such as SciPy, Pandas, and Matplotlib, are built on top of NumPy and leverage its vectorization capabilities. By using vectorized operations in NumPy, you can seamlessly integrate with these libraries and benefit from their optimized performance.\n"
88 | ]
89 | },
90 | {
91 | "cell_type": "markdown",
92 | "metadata": {},
93 | "source": [
94 | "*Vectorization is a fundamental concept in NumPy that enables efficient and high-performance computations on arrays. By embracing vectorization, you can write concise, readable, and performant code, making it an essential skill for anyone working with numerical data in Python.*\n"
95 | ]
96 | },
97 | {
98 | "cell_type": "markdown",
99 | "metadata": {},
100 | "source": [
101 | "In the following sections, we will explore vectorized operations, techniques, and best practices to help you maximize the benefits of vectorization in your NumPy code."
102 | ]
103 | },
104 | {
105 | "cell_type": "markdown",
106 | "metadata": {},
107 | "source": [
108 | "**Table of contents** \n",
109 | "- [How Vectorization Works](#toc1_) \n",
110 | " - [Operating on Arrays vs. Individual Elements](#toc1_1_) \n",
111 | " - [Under the Hood: NumPy's Optimized C Implementation](#toc1_2_) \n",
112 | "- [Vectorization vs. Python Loops](#toc2_) \n",
113 | " - [Performance Comparison](#toc2_1_) \n",
114 | " - [Readability and Conciseness](#toc2_2_) \n",
115 | "- [Writing Vectorized Code](#toc3_) \n",
116 | " - [Tips and Best Practices](#toc3_1_) \n",
117 | " - [Example: Vectorizing a Function](#toc3_2_) \n",
118 | " - [Example: Vectorize Computations with `np.vectorize()`](#toc3_3_) \n",
119 | "- [Pitfalls and Limitations of Vectorization](#toc4_) \n",
120 | " - [When Vectorization May Not Be Appropriate](#toc4_1_) \n",
121 | " - [Memory Considerations](#toc4_2_) \n",
122 | "- [Summary and Conclusion](#toc5_) \n",
123 | "\n",
124 | "\n",
131 | ""
132 | ]
133 | },
134 | {
135 | "cell_type": "markdown",
136 | "metadata": {},
137 | "source": [
138 | "## [How Vectorization Works](#toc0_)"
139 | ]
140 | },
141 | {
142 | "cell_type": "markdown",
143 | "metadata": {},
144 | "source": [
145 | "Vectorization is a powerful technique in NumPy that allows you to perform operations on entire arrays efficiently. Let's dive deeper into how vectorization works and explore the underlying mechanisms that make it so effective.\n"
146 | ]
147 | },
148 | {
149 | "cell_type": "markdown",
150 | "metadata": {},
151 | "source": [
152 | "### [Operating on Arrays vs. Individual Elements](#toc0_)\n"
153 | ]
154 | },
155 | {
156 | "cell_type": "markdown",
157 | "metadata": {},
158 | "source": [
159 | "In traditional programming, when you want to perform an operation on a collection of elements, you typically use a loop to iterate over each element and apply the operation individually. For example, consider the following code that squares each element of a list:\n"
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": 2,
165 | "metadata": {},
166 | "outputs": [],
167 | "source": [
168 | "numbers = [1, 2, 3, 4, 5]\n",
169 | "squared = []\n",
170 | "\n",
171 | "for num in numbers:\n",
172 | " squared.append(num ** 2)"
173 | ]
174 | },
175 | {
176 | "cell_type": "markdown",
177 | "metadata": {},
178 | "source": [
179 | "In this approach, you explicitly loop over each element, square it, and append the result to a new list. While this works, it can be inefficient, especially for large datasets.\n"
180 | ]
181 | },
182 | {
183 | "cell_type": "markdown",
184 | "metadata": {},
185 | "source": [
186 | "In contrast, vectorization in NumPy allows you to perform operations on entire arrays without the need for explicit loops. You can apply an operation to all elements of an array simultaneously, like this:\n"
187 | ]
188 | },
189 | {
190 | "cell_type": "code",
191 | "execution_count": 3,
192 | "metadata": {},
193 | "outputs": [
194 | {
195 | "data": {
196 | "text/plain": [
197 | "array([ 1, 4, 9, 16, 25])"
198 | ]
199 | },
200 | "execution_count": 3,
201 | "metadata": {},
202 | "output_type": "execute_result"
203 | }
204 | ],
205 | "source": [
206 | "numbers = np.array([1, 2, 3, 4, 5])\n",
207 | "squared = numbers ** 2\n",
208 | "squared"
209 | ]
210 | },
211 | {
212 | "cell_type": "markdown",
213 | "metadata": {},
214 | "source": [
215 | "Here, the `**` operator is applied element-wise to the entire `numbers` array, resulting in a new array `squared` with each element squared. NumPy handles the looping internally, making the code more concise and efficient.\n"
216 | ]
217 | },
218 | {
219 | "cell_type": "markdown",
220 | "metadata": {},
221 | "source": [
222 | "### [Under the Hood: NumPy's Optimized C Implementation](#toc0_)\n"
223 | ]
224 | },
225 | {
226 | "cell_type": "markdown",
227 | "metadata": {},
228 | "source": [
229 | "The secret behind NumPy's efficient vectorization lies in its optimized C implementation. When you perform a vectorized operation in NumPy, the actual computation is carried out by pre-compiled C code that is highly optimized for performance.\n"
230 | ]
231 | },
232 | {
233 | "cell_type": "markdown",
234 | "metadata": {},
235 | "source": [
236 | "NumPy's C implementation takes advantage of several low-level optimizations:\n",
237 | "\n",
238 | "1. **Contiguous Memory Layout**: NumPy arrays are stored in contiguous memory blocks, allowing for efficient memory access and cache utilization. This contiguous layout enables faster read and write operations compared to non-contiguous data structures like Python lists.\n",
239 | "\n",
240 | "2. **SIMD Instructions**: Modern CPUs support Single Instruction, Multiple Data (SIMD) instructions, which allow multiple data elements to be processed simultaneously. NumPy's C implementation leverages SIMD instructions to perform operations on multiple array elements in parallel, significantly speeding up computations.\n",
241 | "\n",
242 | "3. **Loop Unrolling**: Loop unrolling is a technique where the compiler optimizes loops by duplicating the loop body multiple times, reducing the overhead of loop control statements. NumPy's C implementation employs loop unrolling to minimize the overhead of iterating over array elements.\n",
243 | "\n",
244 | "4. **Caching and Memory Management**: NumPy's C implementation is designed to efficiently utilize CPU caches and minimize memory transfers. It employs techniques like data locality optimization and cache-friendly algorithms to ensure optimal performance.\n"
245 | ]
246 | },
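{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick illustration of the first point, you can inspect an array's `flags` and `strides` attributes to see its contiguous layout (a minimal sketch; the exact stride values depend on the dtype):\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"arr = np.arange(12, dtype=np.int64)\n",
"\n",
"# C_CONTIGUOUS is True: the elements occupy one contiguous block of memory\n",
"print(arr.flags['C_CONTIGUOUS'])\n",
"\n",
"# strides is the step in bytes between consecutive elements: (8,) for int64\n",
"print(arr.strides)"
]
},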
247 | {
248 | "cell_type": "markdown",
249 | "metadata": {},
250 | "source": [
251 | "By leveraging these low-level optimizations, NumPy's vectorized operations can achieve significant speedups compared to pure Python implementations. The C code is pre-compiled and optimized for the specific hardware architecture, allowing NumPy to take full advantage of the available computing resources.\n"
252 | ]
253 | },
254 | {
255 | "cell_type": "markdown",
256 | "metadata": {},
257 | "source": [
258 | "*It's important to note that while vectorization is highly efficient, it may not always be the best approach for every problem. In some cases, especially when dealing with complex or non-vectorizable operations, explicit loops or other techniques may be more appropriate. However, for most common numerical computations, vectorization provides a powerful and efficient solution.*\n"
259 | ]
260 | },
261 | {
262 | "cell_type": "markdown",
263 | "metadata": {},
264 | "source": [
265 | "Understanding how vectorization works under the hood gives you a deeper appreciation for the performance benefits it offers. By leveraging NumPy's optimized C implementation and vectorized operations, you can write efficient and high-performance code for numerical computations in Python."
266 | ]
267 | },
268 | {
269 | "cell_type": "markdown",
270 | "metadata": {},
271 | "source": [
272 | "## [Vectorization vs. Python Loops](#toc0_)"
273 | ]
274 | },
275 | {
276 | "cell_type": "markdown",
277 | "metadata": {},
278 | "source": [
279 | "When it comes to performing operations on arrays or collections of data, you have two main approaches in Python: vectorization using NumPy and traditional Python loops. Let's compare these approaches in terms of performance, readability, and conciseness.\n"
280 | ]
281 | },
282 | {
283 | "cell_type": "markdown",
284 | "metadata": {},
285 | "source": [
286 | "### [Performance Comparison](#toc0_)\n"
287 | ]
288 | },
289 | {
290 | "cell_type": "markdown",
291 | "metadata": {},
292 | "source": [
293 | "One of the primary advantages of vectorization over Python loops is performance. Vectorized operations in NumPy are significantly faster than their loop-based counterparts. Let's consider an example to illustrate the performance difference:\n"
294 | ]
295 | },
296 | {
297 | "cell_type": "code",
298 | "execution_count": 4,
299 | "metadata": {},
300 | "outputs": [],
301 | "source": [
302 | "import time"
303 | ]
304 | },
305 | {
306 | "cell_type": "code",
307 | "execution_count": 5,
308 | "metadata": {},
309 | "outputs": [],
310 | "source": [
311 | "# Python loop\n",
312 | "def square_loop(numbers):\n",
313 | " squared = []\n",
314 | " for num in numbers:\n",
315 | " squared.append(num ** 2)\n",
316 | " return squared\n"
317 | ]
318 | },
319 | {
320 | "cell_type": "code",
321 | "execution_count": 6,
322 | "metadata": {},
323 | "outputs": [],
324 | "source": [
325 | "# Vectorized operation\n",
326 | "def square_vectorized(numbers):\n",
327 | " return numbers ** 2\n"
328 | ]
329 | },
330 | {
331 | "cell_type": "code",
332 | "execution_count": 7,
333 | "metadata": {},
334 | "outputs": [],
335 | "source": [
336 | "# Generate a large array of numbers\n",
337 | "numbers = np.random.rand(10000000)\n"
338 | ]
339 | },
340 | {
341 | "cell_type": "code",
342 | "execution_count": 8,
343 | "metadata": {},
344 | "outputs": [],
345 | "source": [
346 | "# Measure execution time for Python loop\n",
347 | "start_time = time.time()\n",
348 | "squared_loop = square_loop(numbers)\n",
349 | "end_time = time.time()\n",
350 | "loop_time = end_time - start_time\n"
351 | ]
352 | },
353 | {
354 | "cell_type": "code",
355 | "execution_count": 9,
356 | "metadata": {},
357 | "outputs": [],
358 | "source": [
359 | "# Measure execution time for vectorized operation\n",
360 | "start_time = time.time()\n",
361 | "squared_vectorized = square_vectorized(numbers)\n",
362 | "end_time = time.time()\n",
363 | "vectorized_time = end_time - start_time\n"
364 | ]
365 | },
366 | {
367 | "cell_type": "markdown",
368 | "metadata": {},
369 | "source": [
370 | "In this example, we define two functions: `square_loop()` uses a Python loop to square each element of the input array, while `square_vectorized()` uses a vectorized operation (`**`) to square the entire array at once.\n"
371 | ]
372 | },
373 | {
374 | "cell_type": "markdown",
375 | "metadata": {},
376 | "source": [
377 | "When we run this code with a large array of numbers, the performance difference becomes evident:\n"
378 | ]
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": 10,
383 | "metadata": {},
384 | "outputs": [
385 | {
386 | "name": "stdout",
387 | "output_type": "stream",
388 | "text": [
389 | "Python loop time: 1.140 seconds\n",
390 | "Vectorized time: 0.010 seconds\n"
391 | ]
392 | }
393 | ],
394 | "source": [
395 | "print(f\"Python loop time: {loop_time:.3f} seconds\")\n",
396 | "print(f\"Vectorized time: {vectorized_time:.3f} seconds\")"
397 | ]
398 | },
399 | {
400 | "cell_type": "markdown",
401 | "metadata": {},
402 | "source": [
403 | "The vectorized operation is orders of magnitude faster than the Python loop. This performance gap becomes even more significant as the size of the array increases.\n"
404 | ]
405 | },
406 | {
407 | "cell_type": "markdown",
408 | "metadata": {},
409 | "source": [
410 | "The reason for this performance difference lies in the optimized C implementation of NumPy's vectorized operations. NumPy's C code is pre-compiled and optimized for the specific hardware architecture, taking advantage of low-level optimizations such as SIMD instructions and efficient memory management.\n"
411 | ]
412 | },
413 | {
414 | "cell_type": "markdown",
415 | "metadata": {},
416 | "source": [
417 | "### [Readability and Conciseness](#toc0_)\n"
418 | ]
419 | },
420 | {
421 | "cell_type": "markdown",
422 | "metadata": {},
423 | "source": [
424 | "In addition to the performance benefits, vectorization often leads to more readable and concise code compared to Python loops. Let's revisit the previous example:\n"
425 | ]
426 | },
427 | {
428 | "cell_type": "code",
429 | "execution_count": 11,
430 | "metadata": {},
431 | "outputs": [],
432 | "source": [
433 | "# Python loop\n",
434 | "squared_loop = []\n",
435 | "for num in numbers:\n",
436 | " squared_loop.append(num ** 2)\n",
437 | "\n",
438 | "# Vectorized operation\n",
439 | "squared_vectorized = numbers ** 2"
440 | ]
441 | },
442 | {
443 | "cell_type": "markdown",
444 | "metadata": {},
445 | "source": [
446 | "The vectorized operation expresses the intent of squaring each element of the array in a single line of code. It is clear, concise, and easier to understand at a glance.\n"
447 | ]
448 | },
449 | {
450 | "cell_type": "markdown",
451 | "metadata": {},
452 | "source": [
453 | "On the other hand, the Python loop requires multiple lines of code to achieve the same result. It involves initializing an empty list, iterating over each element, performing the operation, and appending the result to the list. While this approach is more explicit, it can become verbose and harder to read, especially for complex operations or nested loops.\n"
454 | ]
455 | },
456 | {
457 | "cell_type": "markdown",
458 | "metadata": {},
459 | "source": [
460 | "Vectorization allows you to express operations on entire arrays using a more declarative and intuitive syntax. It abstracts away the low-level details of looping and element-wise operations, making your code more expressive and maintainable.\n"
461 | ]
462 | },
463 | {
464 | "cell_type": "markdown",
465 | "metadata": {},
466 | "source": [
467 | "*However, it's important to note that not all operations can be easily vectorized. In some cases, especially when dealing with complex or non-vectorizable operations, Python loops may be necessary. It's essential to choose the appropriate approach based on the specific requirements of your problem and the readability and maintainability of your code.*\n"
468 | ]
469 | },
470 | {
471 | "cell_type": "markdown",
472 | "metadata": {},
473 | "source": [
474 | "In summary, vectorization offers significant performance advantages over Python loops, thanks to NumPy's optimized C implementation. It also leads to more readable and concise code, making your intentions clearer and reducing the chances of errors. By leveraging vectorization whenever possible, you can write efficient and expressive code for numerical computations in Python."
475 | ]
476 | },
477 | {
478 | "cell_type": "markdown",
479 | "metadata": {},
480 | "source": [
481 | "## [Writing Vectorized Code](#toc0_)"
482 | ]
483 | },
484 | {
485 | "cell_type": "markdown",
486 | "metadata": {},
487 | "source": [
488 | "When working with NumPy, writing vectorized code is essential for achieving optimal performance and readability. In this section, we'll explore some tips and best practices for writing vectorized code and walk through examples of vectorizing a function and using `np.vectorize()`.\n"
489 | ]
490 | },
491 | {
492 | "cell_type": "markdown",
493 | "metadata": {},
494 | "source": [
495 | "### [Tips and Best Practices](#toc0_)\n"
496 | ]
497 | },
498 | {
499 | "cell_type": "markdown",
500 | "metadata": {},
501 | "source": [
502 | "Here are some tips and best practices to keep in mind when writing vectorized code in NumPy:\n",
503 | "\n",
504 | "1. **Think in terms of arrays**: Approach problems from an array-oriented perspective. Consider how you can express operations on entire arrays rather than individual elements.\n",
505 | "\n",
506 | "2. **Leverage NumPy functions and operators**: NumPy provides a wide range of functions and operators that operate on arrays. Familiarize yourself with these functions and use them instead of writing explicit loops.\n",
507 | "\n",
508 | "3. **Avoid Python loops**: Whenever possible, try to replace Python loops with vectorized operations. Vectorized code is more concise and performs better than loop-based code.\n",
509 | "\n",
510 | "4. **Use broadcasting**: Broadcasting allows arrays with different shapes to be used in arithmetic operations. Leverage broadcasting to perform operations between arrays of different sizes without the need for explicit reshaping.\n",
511 | "\n",
512 | "5. **Vectorize computations with `np.vectorize()`**: If you have a custom function that operates on scalar values, you can use `np.vectorize()` to create a vectorized version of the function that can be applied to arrays.\n",
513 | "\n",
514 | "6. **Profile and optimize**: When working with large datasets or complex operations, profile your code to identify performance bottlenecks. Use profiling tools like `%timeit` in Jupyter Notebook or `cProfile` to measure the execution time of different parts of your code and optimize accordingly.\n",
515 | "\n",
516 | "7. **Continuously learn and explore**: NumPy is a vast library with many features and optimizations. Keep learning and exploring new techniques and functions to improve your vectorized code.\n"
517 | ]
518 | },
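{
"cell_type": "markdown",
"metadata": {},
"source": [
"To illustrate tip 4, here is a minimal broadcasting sketch: a 1-D array is added to every row of a 2-D array, with no explicit loop or reshaping:\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"matrix = np.ones((3, 4))  # shape (3, 4)\n",
"row = np.arange(4)        # shape (4,)\n",
"\n",
"# The row is broadcast across all 3 rows; each row becomes [1., 2., 3., 4.]\n",
"matrix + row"
]
},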
519 | {
520 | "cell_type": "markdown",
521 | "metadata": {},
522 | "source": [
523 | "By following these tips and best practices, you can write efficient and readable vectorized code in NumPy.\n"
524 | ]
525 | },
526 | {
527 | "cell_type": "markdown",
528 | "metadata": {},
529 | "source": [
530 | "### [Example: Vectorizing a Function](#toc0_)\n"
531 | ]
532 | },
533 | {
534 | "cell_type": "markdown",
535 | "metadata": {},
536 | "source": [
537 | "Let's walk through an example of vectorizing a function to illustrate the process. Consider a scenario where you have a function that calculates the Euclidean distance between two points:\n"
538 | ]
539 | },
540 | {
541 | "cell_type": "code",
542 | "execution_count": 12,
543 | "metadata": {},
544 | "outputs": [],
545 | "source": [
546 | "def euclidean_distance(point1, point2):\n",
547 | " return np.sqrt(np.sum((point1 - point2) ** 2))"
548 | ]
549 | },
550 | {
551 | "cell_type": "markdown",
552 | "metadata": {},
553 | "source": [
554 | "This function takes two points as input and calculates the Euclidean distance between them. However, it only works with individual points. If you have arrays of points and want to calculate the distances between corresponding points, you would need to use a loop:\n"
555 | ]
556 | },
557 | {
558 | "cell_type": "code",
559 | "execution_count": 13,
560 | "metadata": {},
561 | "outputs": [
562 | {
563 | "data": {
564 | "text/plain": [
565 | "[1.4142135623730951, 1.4142135623730951, 1.4142135623730951]"
566 | ]
567 | },
568 | "execution_count": 13,
569 | "metadata": {},
570 | "output_type": "execute_result"
571 | }
572 | ],
573 | "source": [
574 | "points1 = np.array([[1, 2], [3, 4], [5, 6]])\n",
575 | "points2 = np.array([[2, 3], [4, 5], [6, 7]])\n",
576 | "\n",
577 | "distances = []\n",
578 | "for i in range(len(points1)):\n",
579 | " distance = euclidean_distance(points1[i], points2[i])\n",
580 | " distances.append(distance)\n",
581 | "\n",
582 | "distances"
583 | ]
584 | },
585 | {
586 | "cell_type": "code",
587 | "execution_count": 14,
588 | "metadata": {},
589 | "outputs": [
590 | {
591 | "data": {
592 | "text/plain": [
593 | "2.449489742783178"
594 | ]
595 | },
596 | "execution_count": 14,
597 | "metadata": {},
598 | "output_type": "execute_result"
599 | }
600 | ],
601 | "source": [
602 | "# Doesn't work with array of points\n",
603 | "euclidean_distance(points1, points2)"
604 | ]
605 | },
606 | {
607 | "cell_type": "markdown",
608 | "metadata": {},
609 | "source": [
610 | "To vectorize this operation and eliminate the loop, you can modify the `euclidean_distance()` function to work with arrays:\n"
611 | ]
612 | },
613 | {
614 | "cell_type": "code",
615 | "execution_count": 15,
616 | "metadata": {},
617 | "outputs": [],
618 | "source": [
619 | "def euclidean_distance_vectorized(points1, points2):\n",
620 | " return np.sqrt(np.sum((points1 - points2) ** 2, axis=1))"
621 | ]
622 | },
623 | {
624 | "cell_type": "code",
625 | "execution_count": 16,
626 | "metadata": {},
627 | "outputs": [
628 | {
629 | "data": {
630 | "text/plain": [
631 | "array([1.41421356, 1.41421356, 1.41421356])"
632 | ]
633 | },
634 | "execution_count": 16,
635 | "metadata": {},
636 | "output_type": "execute_result"
637 | }
638 | ],
639 | "source": [
640 | "euclidean_distance_vectorized(points1, points2)"
641 | ]
642 | },
643 | {
644 | "cell_type": "markdown",
645 | "metadata": {},
646 | "source": [
647 | "In the vectorized version, we assume that `points1` and `points2` are arrays of points, where each row represents a point. We subtract the arrays element-wise, square the differences, sum along the second axis (axis=1), and then calculate the square root.\n"
648 | ]
649 | },
650 | {
651 | "cell_type": "markdown",
652 | "metadata": {},
653 | "source": [
654 | "Now you can calculate the distances between corresponding points using a single function call:\n"
655 | ]
656 | },
657 | {
658 | "cell_type": "code",
659 | "execution_count": 17,
660 | "metadata": {},
661 | "outputs": [],
662 | "source": [
663 | "distances = euclidean_distance_vectorized(points1, points2)"
664 | ]
665 | },
666 | {
667 | "cell_type": "markdown",
668 | "metadata": {},
669 | "source": [
670 | "The resulting `distances` array will contain the Euclidean distances between the corresponding points in `points1` and `points2`.\n"
671 | ]
672 | },
673 | {
674 | "cell_type": "markdown",
675 | "metadata": {},
676 | "source": [
677 | "By vectorizing the function, you can perform the distance calculation on entire arrays of points efficiently, without the need for explicit loops.\n"
678 | ]
679 | },
680 | {
681 | "cell_type": "markdown",
682 | "metadata": {},
683 | "source": [
684 | "### [Example: Vectorize Computations with `np.vectorize()`](#toc0_)\n"
685 | ]
686 | },
687 | {
688 | "cell_type": "markdown",
689 | "metadata": {},
690 | "source": [
691 | "In addition to manually vectorizing functions, NumPy provides the `np.vectorize()` function that can be used to automatically vectorize a scalar function. Let's consider an example where we have a function that calculates the square of a number if it's even, and the cube of a number if it's odd:\n"
692 | ]
693 | },
694 | {
695 | "cell_type": "code",
696 | "execution_count": 18,
697 | "metadata": {},
698 | "outputs": [],
699 | "source": [
700 | "def square_or_cube(x):\n",
701 | " if x % 2 == 0:\n",
702 | " return x ** 2\n",
703 | " else:\n",
704 | " return x ** 3"
705 | ]
706 | },
707 | {
708 | "cell_type": "markdown",
709 | "metadata": {},
710 | "source": [
711 | "This function works on a single scalar value. To apply it to an array of values, you can use `np.vectorize()`:\n"
712 | ]
713 | },
714 | {
715 | "cell_type": "code",
716 | "execution_count": 19,
717 | "metadata": {},
718 | "outputs": [],
719 | "source": [
720 | "square_or_cube_vectorized = np.vectorize(square_or_cube)\n",
721 | "\n",
722 | "numbers = np.array([1, 2, 3, 4, 5])\n",
723 | "result = square_or_cube_vectorized(numbers)"
724 | ]
725 | },
726 | {
727 | "cell_type": "markdown",
728 | "metadata": {},
729 | "source": [
730 | "In this example, `np.vectorize()` creates a vectorized version of the `square_or_cube()` function. The resulting `square_or_cube_vectorized` function can be applied directly to an array of numbers, and it will return an array with the square or cube of each number based on the condition.\n"
731 | ]
732 | },
733 | {
734 | "cell_type": "markdown",
735 | "metadata": {},
736 | "source": [
737 | "Using `np.vectorize()` is a convenient way to vectorize scalar functions without explicitly modifying the function code. However, it's important to note that `np.vectorize()` is essentially a loop in disguise and does not provide the same performance benefits as manually vectorizing the function using NumPy operations. The vectorize function is provided primarily for convenience, not for performance. The implementation is essentially a for loop.\n"
738 | ]
739 | },
740 | {
741 | "cell_type": "markdown",
742 | "metadata": {},
743 | "source": [
744 | "*Vectorizing functions is a common technique in NumPy to improve performance and readability. Whenever you encounter a function that operates on individual elements or requires a loop, consider whether it can be vectorized to work with arrays directly, either by manually modifying the function or using `np.vectorize()`.*\n"
745 | ]
746 | },
747 | {
748 | "cell_type": "markdown",
749 | "metadata": {},
750 | "source": [
751 | "Remember, vectorization is a powerful tool in NumPy, but it may not always be applicable to every situation. In some cases, especially when dealing with complex or non-vectorizable operations, explicit loops or other approaches may be necessary. However, whenever possible, strive to write vectorized code to leverage the performance and expressiveness of NumPy."
752 | ]
753 | },
754 | {
755 | "cell_type": "markdown",
756 | "metadata": {},
757 | "source": [
758 | "## [Pitfalls and Limitations of Vectorization](#toc0_)"
759 | ]
760 | },
761 | {
762 | "cell_type": "markdown",
763 | "metadata": {},
764 | "source": [
765 | "While vectorization is a powerful technique in NumPy that can greatly improve performance and readability, it's important to be aware of its pitfalls and limitations. In this section, we'll discuss situations where vectorization may not be appropriate and consider memory implications when working with large arrays.\n"
766 | ]
767 | },
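{
"cell_type": "markdown",
"metadata": {},
"source": [
"### [When Vectorization May Not Be Appropriate](#toc0_)\n"
]
},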
768 | {
769 | "cell_type": "markdown",
770 | "metadata": {},
771 | "source": [
772 | "Vectorization is not always the best approach for every problem. Here are some situations where vectorization may not be suitable:\n",
773 | "\n",
774 | "1. **Complex or non-vectorizable operations**: Some operations or algorithms may not be easily expressible using NumPy's vectorized operations. If your problem involves complex logic, multiple conditions, or sequential dependencies, vectorization may not be straightforward or even possible.\n",
775 | "\n",
776 | "2. **Small datasets**: If you are working with small datasets or arrays, the overhead of vectorization may outweigh its benefits. In such cases, using explicit loops or other approaches may be more efficient.\n",
777 | "\n",
778 | "3. **Non-numeric data**: Vectorization is primarily designed for numerical computations. If your data involves non-numeric types, such as strings or custom objects, vectorization may not be directly applicable. NumPy does provide some limited support for vectorized string operations, but it may not be as extensive as numeric operations.\n",
779 | "\n",
780 | "4. **Readability and maintainability**: While vectorization can lead to more concise and expressive code, it may sometimes sacrifice readability and maintainability. If the vectorized code becomes too complex or difficult to understand, it may be better to opt for a more explicit and readable implementation, even if it involves loops.\n",
781 | "\n",
782 | "5. **Debugging and troubleshooting**: Debugging vectorized code can be more challenging compared to explicit loops. When errors occur in vectorized operations, it may be harder to pinpoint the exact location and cause of the issue.\n"
783 | ]
784 | },
785 | {
786 | "cell_type": "markdown",
787 | "metadata": {},
788 | "source": [
789 | "It's important to assess the specific requirements and characteristics of your problem to determine whether vectorization is appropriate. In some cases, a combination of vectorization and explicit loops or other techniques may be the best approach.\n"
790 | ]
791 | },
792 | {
793 | "cell_type": "markdown",
794 | "metadata": {},
795 | "source": [
796 | "*Vectorization is a powerful technique in NumPy, but it's crucial to be aware of its limitations and consider memory implications, especially when dealing with large datasets. By understanding when vectorization may not be appropriate and being mindful of memory usage, you can make informed decisions and optimize your code for performance and efficiency.*"
797 | ]
798 | },
799 | {
800 | "cell_type": "markdown",
801 | "metadata": {},
802 | "source": [
803 | "## [Summary and Conclusion](#toc0_)"
804 | ]
805 | },
806 | {
807 | "cell_type": "markdown",
808 | "metadata": {},
809 | "source": [
810 | "In this lecture, we explored the concept of vectorization in NumPy and its significance in writing efficient and readable code. Let's summarize the key points and highlight the importance of vectorization in NumPy.\n"
811 | ]
812 | },
813 | {
814 | "cell_type": "markdown",
815 | "metadata": {},
816 | "source": [
817 | "Vectorization is a fundamental concept in NumPy and plays a crucial role in scientific computing and data analysis with Python. Its significance lies in the following aspects:\n",
818 | "\n",
819 | "1. **Performance**: Vectorization enables NumPy to perform computations efficiently, taking advantage of the underlying hardware optimizations. It allows you to process large datasets quickly and efficiently, making it suitable for demanding computational tasks.\n",
820 | "\n",
821 | "2. **Readability and Maintainability**: Vectorized code is more concise and expressive, making it easier to understand and maintain. It reduces the chances of errors and bugs that may arise from explicit loops and element-wise operations.\n",
822 | "\n",
823 | "3. **Integration with the Scientific Python Ecosystem**: NumPy serves as the foundation for many scientific Python libraries, such as SciPy, Pandas, and Matplotlib. These libraries leverage NumPy's vectorization capabilities to provide high-level functionality for data manipulation, analysis, and visualization.\n",
824 | "\n",
825 | "4. **Scalability**: Vectorization allows you to scale your computations to larger datasets and more complex problems. By leveraging vectorization, you can process and analyze massive amounts of data efficiently, enabling you to tackle real-world challenges.\n",
826 | "\n",
827 | "5. **Interoperability**: NumPy's vectorization is compatible with other programming languages and environments, such as C, Fortran, and MATLAB. This interoperability enables seamless integration with existing codebases and facilitates the exchange of data and algorithms between different systems.\n"
828 | ]
829 | },
830 | {
831 | "cell_type": "markdown",
832 | "metadata": {},
833 | "source": [
834 | "*Vectorization is a key skill for anyone working with numerical computations and data analysis in Python. By mastering vectorization in NumPy, you can write efficient, readable, and scalable code that leverages the full potential of modern hardware and the scientific Python ecosystem.*\n"
835 | ]
836 | },
837 | {
838 | "cell_type": "markdown",
839 | "metadata": {},
840 | "source": [
841 | "As you continue your journey with NumPy and scientific computing, remember to embrace vectorization whenever possible, while being mindful of its limitations and memory considerations. With practice and experience, you'll develop the intuition and expertise to apply vectorization effectively and tackle complex computational problems with ease."
842 | ]
843 | }
844 | ],
845 | "metadata": {
846 | "kernelspec": {
847 | "display_name": "py310",
848 | "language": "python",
849 | "name": "python3"
850 | },
851 | "language_info": {
852 | "codemirror_mode": {
853 | "name": "ipython",
854 | "version": 3
855 | },
856 | "file_extension": ".py",
857 | "mimetype": "text/x-python",
858 | "name": "python",
859 | "nbconvert_exporter": "python",
860 | "pygments_lexer": "ipython3",
861 | "version": "3.10.12"
862 | }
863 | },
864 | "nbformat": 4,
865 | "nbformat_minor": 2
866 | }
867 |
--------------------------------------------------------------------------------
/Lectures/03 Numpy Fundamentals/11 Advanced Numpy Topics.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "
"
8 | ]
9 | }
10 | ],
11 | "metadata": {
12 | "language_info": {
13 | "name": "python"
14 | }
15 | },
16 | "nbformat": 4,
17 | "nbformat_minor": 2
18 | }
19 |
--------------------------------------------------------------------------------
/Lectures/03 Numpy Fundamentals/images/1darray.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/03 Numpy Fundamentals/images/1darray.png
--------------------------------------------------------------------------------
/Lectures/03 Numpy Fundamentals/images/2darray.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/03 Numpy Fundamentals/images/2darray.png
--------------------------------------------------------------------------------
/Lectures/03 Numpy Fundamentals/images/array-list.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/03 Numpy Fundamentals/images/array-list.png
--------------------------------------------------------------------------------
/Lectures/03 Numpy Fundamentals/images/array-reshaping.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/03 Numpy Fundamentals/images/array-reshaping.png
--------------------------------------------------------------------------------
/Lectures/03 Numpy Fundamentals/images/banner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/03 Numpy Fundamentals/images/banner.png
--------------------------------------------------------------------------------
/Lectures/03 Numpy Fundamentals/images/boolean-indexing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/03 Numpy Fundamentals/images/boolean-indexing.png
--------------------------------------------------------------------------------
/Lectures/03 Numpy Fundamentals/images/broadcasting-error.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/03 Numpy Fundamentals/images/broadcasting-error.jpg
--------------------------------------------------------------------------------
/Lectures/03 Numpy Fundamentals/images/broadcasting-error.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/03 Numpy Fundamentals/images/broadcasting-error.png
--------------------------------------------------------------------------------
/Lectures/03 Numpy Fundamentals/images/broadcasting-multiply.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/03 Numpy Fundamentals/images/broadcasting-multiply.png
--------------------------------------------------------------------------------
/Lectures/03 Numpy Fundamentals/images/broadcasting.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/03 Numpy Fundamentals/images/broadcasting.png
--------------------------------------------------------------------------------
/Lectures/03 Numpy Fundamentals/images/flattening-array.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/03 Numpy Fundamentals/images/flattening-array.png
--------------------------------------------------------------------------------
/Lectures/03 Numpy Fundamentals/images/indexing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/03 Numpy Fundamentals/images/indexing.png
--------------------------------------------------------------------------------
/Lectures/03 Numpy Fundamentals/images/matrix-multiply-numbers.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/03 Numpy Fundamentals/images/matrix-multiply-numbers.png
--------------------------------------------------------------------------------
/Lectures/03 Numpy Fundamentals/images/matrix-multiply.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/03 Numpy Fundamentals/images/matrix-multiply.png
--------------------------------------------------------------------------------
/Lectures/03 Numpy Fundamentals/images/nd-array.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/03 Numpy Fundamentals/images/nd-array.png
--------------------------------------------------------------------------------
/Lectures/03 Numpy Fundamentals/images/ndarray.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/03 Numpy Fundamentals/images/ndarray.png
--------------------------------------------------------------------------------
/Lectures/03 Numpy Fundamentals/images/np-dtypes.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/03 Numpy Fundamentals/images/np-dtypes.webp
--------------------------------------------------------------------------------
/Lectures/03 Numpy Fundamentals/images/np-int-range.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/03 Numpy Fundamentals/images/np-int-range.png
--------------------------------------------------------------------------------
/Lectures/03 Numpy Fundamentals/images/numpy-array-data-structure.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/03 Numpy Fundamentals/images/numpy-array-data-structure.png
--------------------------------------------------------------------------------
/Lectures/03 Numpy Fundamentals/images/numpy-arrays.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/03 Numpy Fundamentals/images/numpy-arrays.png
--------------------------------------------------------------------------------
/Lectures/03 Numpy Fundamentals/images/numpy-arrays.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/03 Numpy Fundamentals/images/numpy-arrays.webp
--------------------------------------------------------------------------------
/Lectures/03 Numpy Fundamentals/images/numpy-indexing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/03 Numpy Fundamentals/images/numpy-indexing.png
--------------------------------------------------------------------------------
/Lectures/03 Numpy Fundamentals/images/numpy-slicing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/03 Numpy Fundamentals/images/numpy-slicing.png
--------------------------------------------------------------------------------
/Lectures/03 Numpy Fundamentals/images/ravel.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/03 Numpy Fundamentals/images/ravel.png
--------------------------------------------------------------------------------
/Lectures/03 Numpy Fundamentals/images/reshaping-array.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/03 Numpy Fundamentals/images/reshaping-array.png
--------------------------------------------------------------------------------
/Lectures/03 Numpy Fundamentals/images/rule-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/03 Numpy Fundamentals/images/rule-1.png
--------------------------------------------------------------------------------
/Lectures/03 Numpy Fundamentals/images/rule-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/03 Numpy Fundamentals/images/rule-2.png
--------------------------------------------------------------------------------
/Lectures/03 Numpy Fundamentals/images/rule-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/03 Numpy Fundamentals/images/rule-3.png
--------------------------------------------------------------------------------
/Lectures/03 Numpy Fundamentals/images/saving-numpy-array.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/03 Numpy Fundamentals/images/saving-numpy-array.png
--------------------------------------------------------------------------------
/Lectures/03 Numpy Fundamentals/images/scalar-vector-matrix-tensor.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/03 Numpy Fundamentals/images/scalar-vector-matrix-tensor.png
--------------------------------------------------------------------------------
/Lectures/03 Numpy Fundamentals/images/set-operations.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/03 Numpy Fundamentals/images/set-operations.png
--------------------------------------------------------------------------------
/Lectures/03 Numpy Fundamentals/images/sorting-array.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/03 Numpy Fundamentals/images/sorting-array.png
--------------------------------------------------------------------------------
/Lectures/03 Numpy Fundamentals/images/splitting-array.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/03 Numpy Fundamentals/images/splitting-array.png
--------------------------------------------------------------------------------
/Lectures/03 Numpy Fundamentals/images/stacking-array.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/03 Numpy Fundamentals/images/stacking-array.png
--------------------------------------------------------------------------------
/Lectures/03 Numpy Fundamentals/images/stacking.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/03 Numpy Fundamentals/images/stacking.png
--------------------------------------------------------------------------------
/Lectures/03 Numpy Fundamentals/images/tiling-array.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/03 Numpy Fundamentals/images/tiling-array.png
--------------------------------------------------------------------------------
/Lectures/03 Numpy Fundamentals/images/titanic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/03 Numpy Fundamentals/images/titanic.png
--------------------------------------------------------------------------------
/Lectures/03 Numpy Fundamentals/images/vectorization.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/03 Numpy Fundamentals/images/vectorization.png
--------------------------------------------------------------------------------
/Lectures/03 Numpy Fundamentals/images/view-copy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytopia/Data-Processing-for-ML/865b99cd8fbe4c01b5d0c06352648480a0746ca3/Lectures/03 Numpy Fundamentals/images/view-copy.png
--------------------------------------------------------------------------------
/Lectures/04 Pandas Fundamentals/01 Introduction to Pandas.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "
"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "# Introduction to Pandas"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "Pandas is a powerful open-source Python library designed for efficient and intuitive data manipulation and analysis. It provides data structures and functions that make working with structured data simple and expressive. The name \"Pandas\" is derived from the term \"Panel Data,\" which refers to multidimensional structured datasets.\n"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "With Pandas, you can easily handle and manipulate large datasets, clean and preprocess data, merge and join multiple datasets, handle missing data, and perform a wide range of data transformations. Pandas is particularly well-suited for working with tabular data, such as CSV files, Excel spreadsheets, and SQL databases.\n"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {},
34 | "source": [
35 | "Pandas was created by Wes McKinney in 2008 while he was working at AQR Capital Management. McKinney needed a tool to perform quantitative analysis on financial data, and he found the existing tools in Python to be inadequate. As a result, he developed Pandas to provide a more efficient and user-friendly way to work with data in Python.\n"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "Since its initial release, Pandas has grown to become one of the most popular and widely-used libraries in the Python data science ecosystem. It has a large and active community of developers and users who contribute to its development and provide support through forums, mailing lists, and social media.\n"
43 | ]
44 | },
45 | {
46 | "cell_type": "markdown",
47 | "metadata": {},
48 | "source": [
49 | "Pandas is built on top of NumPy, which is a library for working with large, multi-dimensional arrays and matrices. Pandas extends the capabilities of NumPy by providing a more flexible and feature-rich data structure called the DataFrame, which allows for the efficient manipulation and analysis of tabular data.\n"
50 | ]
51 | },
52 | {
53 | "cell_type": "markdown",
54 | "metadata": {},
55 | "source": [
56 | "In addition to NumPy, Pandas integrates well with other popular data science libraries in Python, such as:\n",
57 | "\n",
58 | "- **Matplotlib**: A plotting library for creating static, animated, and interactive visualizations.\n",
59 | "- **Seaborn**: A statistical data visualization library based on Matplotlib.\n",
60 | "- **Scikit-learn**: A machine learning library that provides tools for data preprocessing, modeling, and evaluation.\n",
61 | "- **Statsmodels**: A library for statistical modeling and econometrics.\n"
62 | ]
63 | },
64 | {
65 | "cell_type": "markdown",
66 | "metadata": {},
67 | "source": [
68 | "Pandas also provides seamless integration with other tools in the data science workflow, such as Jupyter Notebook for interactive data exploration and analysis, and libraries like SQLAlchemy for working with databases.\n"
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "metadata": {},
74 | "source": [
75 | "By leveraging the power of Pandas and its integration with other libraries, data scientists and analysts can efficiently perform a wide range of data processing, analysis, and visualization tasks in Python."
76 | ]
77 | },
78 | {
79 | "cell_type": "markdown",
80 | "metadata": {},
81 | "source": [
82 | "**Table of contents** \n",
83 | "- [Key Features and Benefits](#toc1_) \n",
84 | " - [Data Structures for Efficient Data Manipulation](#toc1_1_) \n",
85 | " - [Handling Missing Data](#toc1_2_) \n",
86 | " - [Merging, Joining Datasets and Concatenation](#toc1_3_) \n",
87 | " - [String, Datetime, and Categorical Data Functionality](#toc1_4_) \n",
88 | " - [Integration with Other Libraries and Tools](#toc1_5_) \n",
89 | "- [Pandas Data Structures](#toc2_) \n",
90 | " - [Overview of Series and DataFrame](#toc2_1_) \n",
91 | " - [Series](#toc2_2_) \n",
92 | " - [DataFrame](#toc2_3_) \n",
93 | "- [Getting Started with Pandas](#toc3_) \n",
94 | "- [Installation](#toc4_) \n",
95 | "- [Importing Pandas](#toc5_) \n",
96 | "- [Basic Usage Examples](#toc6_) \n",
97 | " - [Creating a Series](#toc6_1_) \n",
98 | " - [Creating a DataFrame](#toc6_2_) \n",
99 | " - [Accessing Data](#toc6_3_) \n",
100 | "- [Pandas Datatypes](#toc7_) \n",
101 | "\n",
102 | "\n",
109 | ""
110 | ]
111 | },
112 | {
113 | "cell_type": "markdown",
114 | "metadata": {},
115 | "source": [
116 | "## [Key Features and Benefits](#toc0_)"
117 | ]
118 | },
119 | {
120 | "cell_type": "markdown",
121 | "metadata": {},
122 | "source": [
123 | "Pandas offers a wide range of features and benefits that make it an essential tool for data manipulation and analysis in Python. Let's explore some of the key features and benefits in more detail.\n"
124 | ]
125 | },
126 | {
127 | "cell_type": "markdown",
128 | "metadata": {},
129 | "source": [
130 | "### [Data Structures for Efficient Data Manipulation](#toc0_)\n"
131 | ]
132 | },
133 | {
134 | "cell_type": "markdown",
135 | "metadata": {},
136 | "source": [
137 | "Pandas provides two primary data structures: Series and DataFrame, which allow for efficient and flexible data manipulation.\n",
138 | "\n",
139 | "- **Series**: A one-dimensional labeled array that can hold any data type, similar to a column in a spreadsheet or a SQL table.\n",
140 | "- **DataFrame**: A two-dimensional labeled data structure with columns of potentially different data types, similar to a table in a spreadsheet or a SQL database.\n"
141 | ]
142 | },
143 | {
144 | "cell_type": "markdown",
145 | "metadata": {},
146 | "source": [
147 | "
"
148 | ]
149 | },
150 | {
151 | "cell_type": "markdown",
152 | "metadata": {},
153 | "source": [
154 | "These data structures are optimized for performance and memory usage, enabling you to work with large datasets efficiently.\n"
155 | ]
156 | },
157 | {
158 | "cell_type": "markdown",
159 | "metadata": {},
160 | "source": [
161 | "### [Handling Missing Data](#toc0_)\n"
162 | ]
163 | },
164 | {
165 | "cell_type": "markdown",
166 | "metadata": {},
167 | "source": [
168 | "In real-world datasets, missing data is a common occurrence. Pandas provides built-in functions and methods to easily handle missing data, such as:\n",
169 | "\n",
170 | "- Detecting missing data using `isnull()` and `notnull()` functions.\n",
171 | "- Filling missing data with a specified value or using forward/backward filling methods.\n",
172 | "- Dropping rows or columns with missing data using `dropna()` function.\n"
173 | ]
174 | },
175 | {
176 | "cell_type": "markdown",
177 | "metadata": {},
178 | "source": [
179 | "
"
180 | ]
181 | },
182 | {
183 | "cell_type": "markdown",
184 | "metadata": {},
185 | "source": [
186 | "These functions allow you to clean and preprocess your data effectively, ensuring data integrity and reliability.\n"
187 | ]
188 | },
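{
"cell_type": "markdown",
"metadata": {},
"source": [
"For example, here is a minimal sketch of these methods in action (the values are invented for illustration):\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"s = pd.Series([1.0, np.nan, 3.0, np.nan])\n",
"\n",
"print(s.isnull())     # boolean mask marking the missing entries\n",
"print(s.fillna(0.0))  # replace missing values with 0.0\n",
"print(s.dropna())     # drop the missing entries"
]
},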
189 | {
190 | "cell_type": "markdown",
191 | "metadata": {},
192 | "source": [
193 | "### [Merging, Joining Datasets and Concatenation](#toc0_)\n"
194 | ]
195 | },
196 | {
197 | "cell_type": "markdown",
198 | "metadata": {},
199 | "source": [
200 | "Pandas provides a range of functions for combining multiple datasets based on common columns or indexes, similar to SQL join operations. Some of the key functions include:\n",
201 | "\n",
202 | "- `concat()`: Concatenate pandas objects along a particular axis.\n",
203 | "- `merge()`: Merge DataFrame or named Series objects with a database-style join.\n",
204 | "- `join()`: Join columns of another DataFrame.\n"
205 | ]
206 | },
207 | {
208 | "cell_type": "markdown",
209 | "metadata": {},
210 | "source": [
211 | "
"
212 | ]
213 | },
214 | {
215 | "cell_type": "markdown",
216 | "metadata": {},
217 | "source": [
218 | "These functions allow you to easily combine data from different sources and perform complex data transformations.\n"
219 | ]
220 | },
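{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a small illustration (the tables and column names are invented), `merge()` performs a database-style join on a shared key:\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"customers = pd.DataFrame({'id': [1, 2, 3], 'name': ['Ann', 'Bob', 'Cy']})\n",
"orders = pd.DataFrame({'id': [1, 1, 3], 'amount': [250, 75, 100]})\n",
"\n",
"# Inner join on the shared 'id' column; Bob (id=2) has no orders and is dropped\n",
"customers.merge(orders, on='id')"
]
},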
221 | {
222 | "cell_type": "markdown",
223 | "metadata": {},
224 | "source": [
225 | "### [String, Datetime, and Categorical Data Functionality](#toc0_)\n"
226 | ]
227 | },
228 | {
229 | "cell_type": "markdown",
230 | "metadata": {},
231 | "source": [
232 | "Pandas has extensive support for working with string and datetime data, as well as categorical data functionality. Some of the key features include:\n",
233 | "\n",
234 | "- String methods for string manipulation, such as `split()`, `replace()`, `lower()`, and `upper()`.\n",
235 | "- Datetime parsing and formatting using `to_datetime()` function and datetime accessor methods.\n",
236 | "- Time series functionality, such as date range generation, frequency conversion, and resampling.\n",
237 | "- Categorical data types for efficient storage and manipulation of categorical variables."
238 | ]
239 | },
240 | {
241 | "cell_type": "markdown",
242 | "metadata": {},
243 | "source": [
244 | "
"
245 | ]
246 | },
247 | {
248 | "cell_type": "markdown",
249 | "metadata": {},
250 | "source": [
251 | "These features make it easy to work with text and time-based data, which are common in many real-world datasets.\n"
252 | ]
253 | },
254 | {
255 | "cell_type": "markdown",
256 | "metadata": {},
257 | "source": [
258 | "### [Integration with Other Libraries and Tools](#toc0_)\n"
259 | ]
260 | },
261 | {
262 | "cell_type": "markdown",
263 | "metadata": {},
264 | "source": [
265 | "Pandas integrates seamlessly with other popular libraries and tools in the data science ecosystem, such as:\n",
266 | "\n",
267 | "- Plotting libraries like Matplotlib and Seaborn for data visualization.\n",
268 | "- Machine learning libraries like Scikit-learn for data preprocessing and model training.\n",
269 | "- Statistical modeling libraries like Statsmodels for advanced statistical analysis.\n",
270 | "- Jupyter Notebook for interactive data exploration and analysis.\n",
271 | "- Databases like SQLite and PostgreSQL using SQLAlchemy for data storage and retrieval.\n"
272 | ]
273 | },
274 | {
275 | "cell_type": "markdown",
276 | "metadata": {},
277 | "source": [
278 | "This integration allows you to leverage the power of multiple libraries and tools to perform end-to-end data science tasks efficiently.\n"
279 | ]
280 | },
281 | {
282 | "cell_type": "markdown",
283 | "metadata": {},
284 | "source": [
285 | "By providing these key features and benefits, Pandas empowers data scientists and analysts to efficiently manipulate, analyze, and gain insights from their data, making it an indispensable tool in the Python data science stack."
286 | ]
287 | },
288 | {
289 | "cell_type": "markdown",
290 | "metadata": {},
291 | "source": [
292 | "## [Pandas Data Structures](#toc0_)"
293 | ]
294 | },
295 | {
296 | "cell_type": "markdown",
297 | "metadata": {},
298 | "source": [
299 | "Pandas provides two primary data structures: Series and DataFrame, which are designed to make data manipulation and analysis intuitive and efficient. These data structures are built on top of NumPy arrays, leveraging their performance benefits while providing additional functionality and flexibility.\n"
300 | ]
301 | },
302 | {
303 | "cell_type": "markdown",
304 | "metadata": {},
305 | "source": [
306 | "### [Overview of Series and DataFrame](#toc0_)\n"
307 | ]
308 | },
309 | {
310 | "cell_type": "markdown",
311 | "metadata": {},
312 | "source": [
313 | "- **Series**: A one-dimensional labeled array that can hold any data type, similar to a column in a spreadsheet or a SQL table. It consists of an index and a data column.\n",
314 | "\n",
315 | "- **DataFrame**: A two-dimensional labeled data structure with columns of potentially different data types, similar to a table in a spreadsheet or a SQL database. It consists of an index and multiple columns, where each column is a Series.\n"
316 | ]
317 | },
318 | {
319 | "cell_type": "markdown",
320 | "metadata": {},
321 | "source": [
322 | "Both Series and DataFrame are designed to handle heterogeneous data types, meaning they can contain a mix of integers, floats, strings, and other data types. They also provide a wide range of methods and functions for data manipulation, analysis, and visualization.\n"
323 | ]
324 | },
325 | {
326 | "cell_type": "markdown",
327 | "metadata": {},
328 | "source": [
329 | "### [Series](#toc0_)\n"
330 | ]
331 | },
332 | {
333 | "cell_type": "markdown",
334 | "metadata": {},
335 | "source": [
336 | "A Series is a one-dimensional array-like object that contains a sequence of values and an associated array of labels called an index. The index provides a way to access and manipulate the data in the Series.\n"
337 | ]
338 | },
339 | {
340 | "cell_type": "markdown",
341 | "metadata": {},
342 | "source": [
343 | "Some key characteristics of a Series include:\n",
344 | "\n",
345 | "- Homogeneous data: All elements in a Series must be of the same data type.\n",
346 | "- Immutable size: The size of a Series cannot be changed once it is created.\n",
347 | "- Labeled index: Each element in a Series is associated with a unique label in the index.\n"
348 | ]
349 | },
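{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of these characteristics, using made-up labels and values:\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"s = pd.Series([10, 20, 30], index=[\"a\", \"b\", \"c\"])\n",
"\n",
"print(s.dtype)   # int64 -- a single dtype for the whole Series\n",
"print(s[\"b\"])    # 20 -- elements are reached through their index labels\n",
"\n",
"s[\"b\"] = 25      # values can be modified in place...\n",
"# ...but length-changing operations such as pd.concat return a new Series"
]
},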
350 | {
351 | "cell_type": "markdown",
352 | "metadata": {},
353 | "source": [
354 | "
"
355 | ]
356 | },
357 | {
358 | "cell_type": "markdown",
359 | "metadata": {},
360 | "source": [
361 | "Series are useful for representing a single column of data, such as a list of prices, names, or temperatures. They provide methods for data manipulation, selection, and computation, making it easy to perform operations on the data.\n"
362 | ]
363 | },
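{
"cell_type": "markdown",
"metadata": {},
"source": [
"For example, arithmetic on a Series is vectorized, and common summary statistics are built in. The prices and the 8% markup below are arbitrary illustration values:\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"prices = pd.Series([19.99, 5.49, 3.75])\n",
"\n",
"print(prices * 1.08)    # vectorized arithmetic (an assumed 8% markup)\n",
"print(prices.mean())    # built-in summary statistic"
]
},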
364 | {
365 | "cell_type": "markdown",
366 | "metadata": {},
367 | "source": [
368 | "### [DataFrame](#toc0_)\n"
369 | ]
370 | },
371 | {
372 | "cell_type": "markdown",
373 | "metadata": {},
374 | "source": [
375 | "A DataFrame is a two-dimensional table-like data structure that consists of an ordered collection of columns, each of which can be a different data type. It is similar to a spreadsheet or a SQL table, with rows and columns labeled with an index and column names, respectively.\n"
376 | ]
377 | },
378 | {
379 | "cell_type": "markdown",
380 | "metadata": {},
381 | "source": [
382 | "Some key characteristics of a DataFrame include:\n",
383 | "\n",
384 | "- Heterogeneous data: Each column in a DataFrame can contain a different data type.\n",
385 | "- Mutable size: The size of a DataFrame can be changed by adding or removing rows and columns.\n",
386 | "- Labeled axes: Both the rows and columns of a DataFrame have labels, which can be used for data selection and manipulation.\n"
387 | ]
388 | },
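{
"cell_type": "markdown",
"metadata": {},
"source": [
"The sketch below illustrates these three characteristics with invented data: each column keeps its own dtype, and columns can be added or dropped after creation:\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"df = pd.DataFrame({\n",
"    \"name\": [\"Alice\", \"Bob\"],    # object (string) column\n",
"    \"age\": [25, 30],             # int64 column\n",
"    \"height\": [1.68, 1.82],      # float64 column\n",
"})\n",
"print(df.dtypes)                 # one dtype per column\n",
"\n",
"df[\"city\"] = [\"NYC\", \"LA\"]       # size is mutable: add a column...\n",
"df = df.drop(columns=\"height\")   # ...or drop one\n",
"print(df.columns.tolist())       # ['name', 'age', 'city']"
]
},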
389 | {
390 | "cell_type": "markdown",
391 | "metadata": {},
392 | "source": [
393 | "
"
394 | ]
395 | },
396 | {
397 | "cell_type": "markdown",
398 | "metadata": {},
399 | "source": [
400 | "DataFrames are suitable for representing and manipulating structured data, such as a CSV file or a database table. They provide a wide range of functions and methods for data cleaning, preprocessing, merging, grouping, and aggregation, making it easy to perform complex data transformations and analysis.\n"
401 | ]
402 | },
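{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a first taste of what is covered in later lectures, here is a tiny grouping-and-aggregation example on invented sales data:\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"sales = pd.DataFrame({\n",
"    \"region\": [\"East\", \"East\", \"West\", \"West\"],\n",
"    \"revenue\": [100, 150, 200, 50],\n",
"})\n",
"\n",
"# Total revenue per region\n",
"print(sales.groupby(\"region\")[\"revenue\"].sum())"
]
},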
403 | {
404 | "cell_type": "markdown",
405 | "metadata": {},
406 | "source": [
407 | "The combination of Series and DataFrame in Pandas provides a powerful and flexible toolkit for working with structured data in Python. They allow you to efficiently load, manipulate, and analyze data from various sources, such as CSV files, Excel spreadsheets, SQL databases, and more.\n"
408 | ]
409 | },
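{
"cell_type": "markdown",
"metadata": {},
"source": [
"Loading data typically takes a single call to one of the `pd.read_*` functions. The file names below are placeholders rather than files shipped with this lecture, so substitute your own paths before running:\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# Placeholder paths -- replace with real files before running\n",
"df = pd.read_csv(\"data.csv\")\n",
"# df = pd.read_excel(\"data.xlsx\")   # Excel support needs openpyxl\n",
"# df = pd.read_sql(\"SELECT * FROM my_table\", connection)  # needs SQLAlchemy"
]
},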
415 | {
416 | "cell_type": "markdown",
417 | "metadata": {},
418 | "source": [
419 | "## [Getting Started with Pandas](#toc0_)"
420 | ]
421 | },
422 | {
423 | "cell_type": "markdown",
424 | "metadata": {},
425 | "source": [
426 | "To start using Pandas in your Python projects, you need to install the library and import it into your Python environment. In this section, we will cover the installation process, importing Pandas, and some basic usage examples to help you get started.\n"
427 | ]
428 | },
429 | {
430 | "cell_type": "markdown",
431 | "metadata": {},
432 | "source": [
433 | "## [Installation](#toc0_)\n"
434 | ]
435 | },
436 | {
437 | "cell_type": "markdown",
438 | "metadata": {},
439 | "source": [
440 | "Pandas can be installed using pip, the package installer for Python. To install Pandas, open a terminal or command prompt and run the following command:\n"
441 | ]
442 | },
443 | {
444 | "cell_type": "markdown",
445 | "metadata": {},
446 | "source": [
447 | "```\n",
448 | "pip install pandas\n",
449 | "```\n"
450 | ]
451 | },
452 | {
453 | "cell_type": "markdown",
454 | "metadata": {},
455 | "source": [
456 | "If you are using Anaconda or Miniconda, you can install Pandas using the conda package manager:\n"
457 | ]
458 | },
459 | {
460 | "cell_type": "markdown",
461 | "metadata": {},
462 | "source": [
463 | "```\n",
464 | "conda install pandas\n",
465 | "```\n"
466 | ]
467 | },
468 | {
469 | "cell_type": "markdown",
470 | "metadata": {},
471 | "source": [
472 | "Once the installation is complete, you can verify that Pandas is installed correctly by running the following command in your Python environment:\n"
473 | ]
474 | },
475 | {
476 | "cell_type": "code",
477 | "execution_count": 1,
478 | "metadata": {},
479 | "outputs": [
480 | {
481 | "name": "stdout",
482 | "output_type": "stream",
483 | "text": [
484 | "2.0.3\n"
485 | ]
486 | }
487 | ],
488 | "source": [
489 | "import pandas as pd\n",
490 | "print(pd.__version__)"
491 | ]
492 | },
493 | {
494 | "cell_type": "markdown",
495 | "metadata": {},
496 | "source": [
497 | "If Pandas is installed correctly, this command will print the version number of the installed Pandas library.\n"
498 | ]
499 | },
500 | {
501 | "cell_type": "markdown",
502 | "metadata": {},
503 | "source": [
504 | "## [Importing Pandas](#toc0_)\n"
505 | ]
506 | },
507 | {
508 | "cell_type": "markdown",
509 | "metadata": {},
510 | "source": [
511 | "To use Pandas in your Python scripts or Jupyter Notebooks, you need to import the library. The convention is to import Pandas using the alias `pd`:\n"
512 | ]
513 | },
514 | {
515 | "cell_type": "markdown",
516 | "metadata": {},
517 | "source": [
518 | "```python\n",
519 | "import pandas as pd\n",
520 | "```\n"
521 | ]
522 | },
523 | {
524 | "cell_type": "markdown",
525 | "metadata": {},
526 | "source": [
527 | "By importing Pandas with the alias `pd`, you can access all the functions and classes provided by the library using the `pd.` prefix.\n"
528 | ]
529 | },
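{
"cell_type": "markdown",
"metadata": {},
"source": [
"For instance, once Pandas is imported as `pd`, every public class and function is reached through that prefix:\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"ts = pd.to_datetime(\"2024-01-15\")   # a top-level function\n",
"print(ts.day_name())                # Monday\n",
"\n",
"s = pd.Series([1, 2, 3])            # a top-level class"
]
},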
530 | {
531 | "cell_type": "markdown",
532 | "metadata": {},
533 | "source": [
534 | "## [Basic Usage Examples](#toc0_)\n"
535 | ]
536 | },
537 | {
538 | "cell_type": "markdown",
539 | "metadata": {},
540 | "source": [
541 | "Let's explore some basic usage examples to demonstrate how to create and manipulate Pandas data structures.\n"
542 | ]
543 | },
544 | {
545 | "cell_type": "markdown",
546 | "metadata": {},
547 | "source": [
548 | "### [Creating a Series](#toc0_)\n"
549 | ]
550 | },
551 | {
552 | "cell_type": "markdown",
553 | "metadata": {},
554 | "source": [
555 | "To create a Pandas Series, you can pass a list of values to the `pd.Series()` function:\n"
556 | ]
557 | },
558 | {
559 | "cell_type": "code",
560 | "execution_count": 2,
561 | "metadata": {},
562 | "outputs": [
563 | {
564 | "data": {
565 | "text/plain": [
566 | "0 1\n",
567 | "1 2\n",
568 | "2 3\n",
569 | "3 4\n",
570 | "4 5\n",
571 | "dtype: int64"
572 | ]
573 | },
574 | "execution_count": 2,
575 | "metadata": {},
576 | "output_type": "execute_result"
577 | }
578 | ],
579 | "source": [
580 | "s = pd.Series([1, 2, 3, 4, 5])\n",
581 | "s"
582 | ]
583 | },
584 | {
585 | "cell_type": "markdown",
586 | "metadata": {},
587 | "source": [
588 | "By default, Pandas assigns an integer index to each element in the Series, starting from 0.\n"
589 | ]
590 | },
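{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you want different labels, you can pass an explicit `index=` argument instead of relying on the default. The day labels below are chosen just for illustration:\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# An explicit index replaces the default RangeIndex\n",
"temps = pd.Series([21.5, 19.0, 23.1], index=[\"Mon\", \"Tue\", \"Wed\"])\n",
"print(temps[\"Tue\"])   # 19.0"
]
},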
591 | {
592 | "cell_type": "markdown",
593 | "metadata": {},
594 | "source": [
595 | "### [Creating a DataFrame](#toc0_)\n"
596 | ]
597 | },
598 | {
599 | "cell_type": "markdown",
600 | "metadata": {},
601 | "source": [
602 | "To create a Pandas DataFrame, you can pass a dictionary of lists or a list of dictionaries to the `pd.DataFrame()` function:\n"
603 | ]
604 | },
605 | {
606 | "cell_type": "code",
607 | "execution_count": 3,
608 | "metadata": {},
609 | "outputs": [
610 | {
611 | "data": {
612 | "text/html": [
613 | "
\n", 631 | " | name | \n", 632 | "age | \n", 633 | "city | \n", 634 | "
---|---|---|---|
0 | \n", 639 | "Alice | \n", 640 | "25 | \n", 641 | "New York | \n", 642 | "
1 | \n", 645 | "Bob | \n", 646 | "30 | \n", 647 | "London | \n", 648 | "
2 | \n", 651 | "Charlie | \n", 652 | "35 | \n", 653 | "Paris | \n", 654 | "