├── LICENSE ├── README.md ├── chapter01 ├── 1.batch.py ├── 2.real_time_streaming.py ├── 3.semi_real_time.py ├── 4.work_with_queue.py ├── 5.sql_databases.py ├── 6.no_sql_databases.py └── 7.api.py ├── chapter02 ├── accuracy.py ├── average_timeliness.py ├── completeness.py ├── consistency.py ├── data_compliance.py ├── data_usage.py ├── duplication.py ├── timeliness.py └── uniqueness.py ├── chapter03 ├── great_expectations │ ├── code │ │ ├── 1.data_set_up.py │ │ ├── 2.mock_test_dataset.py │ │ └── 3.with_pandas_profiler.py │ └── great_expectations │ │ ├── checkpoints │ │ └── expect_iris_ckpnt.yml │ │ ├── expectations │ │ └── expect_iris.json │ │ ├── great_expectations.yml │ │ ├── plugins │ │ └── custom_data_docs │ │ │ └── styles │ │ │ └── data_docs_custom_styles.css │ │ └── uncommitted │ │ └── data_docs │ │ └── local_site │ │ ├── expectations │ │ └── expect_iris.html │ │ ├── index.html │ │ └── static │ │ ├── fonts │ │ └── HKGrotesk │ │ │ ├── HKGrotesk-Italic.otf │ │ │ ├── HKGrotesk-LightItalic.otf │ │ │ ├── HKGrotesk-MediumItalic.otf │ │ │ └── HKGrotesk-SemiBoldItalic.otf │ │ ├── images │ │ ├── favicon.ico │ │ ├── glossary_scroller.gif │ │ ├── iterative-dev-loop.png │ │ ├── logo-long-vector.svg │ │ ├── logo-long.png │ │ ├── short-logo-vector.svg │ │ ├── short-logo.png │ │ └── validation_failed_unexpected_values.gif │ │ └── styles │ │ ├── data_docs_custom_styles_template.css │ │ └── data_docs_default_styles.css ├── intoduction │ └── identify_trends.py └── pandas_profiling │ ├── data_profile_report.html │ ├── pandas_profiler.ipynb │ └── pandas_profiler.json ├── chapter04 ├── 1.descriptive_stats.py ├── 2.rename_columns.py ├── 3.dropping_columns.py ├── 4.data_types.py ├── 5.date_time.py ├── 6.format_date.py ├── 7.extract_datetime_components.py ├── 8.time_deltas.py └── 9.time_zones.py ├── chapter05 ├── 1.use_case.py ├── 2.inner_join.py ├── 3.outer_merge.py ├── 4.right_merge.py ├── 5.left_merge.py ├── 6a.manage_duplicates.py ├── 6b.manage_duplicates_validate.py ├── 6c.merge_and_aggregate.py ├── 6d.dmanage_duplicates_concatenation.py ├── 7a.managed_duplicated_columns.py ├── 7b.drop_columns_merge.py ├── 7c.use_keys_merge.py ├── 8a.perfomance_benchmark_set_index.py ├── 8b.performance_benchmark_sort_indexes.py ├── 8c.performance_benchmark_memory.py ├── 9a.concatenate_row_wise.py ├── 9b.reset_index.py └── 9c.concatenate_column_wise.py ├── chapter06 ├── 1.use_case.py ├── 2.groupby_full_example.py ├── 3.apply_axis0.py ├── 4.apply_axis1.py ├── 5.simple_filtering.py └── 6.advanced_filtering.py ├── chapter07 ├── 1.postgressql.py ├── 2.pymongo.py ├── 3.pymongo_expand.py ├── 4a.kafka_producer.py ├── 4b.kafka_consumer.py ├── 5.time_based_partitioning.py ├── 6.geo_partitioning.py ├── 7.hybrid_partitioning.py ├── __pycache__ │ └── pymongo.cpython-312.pyc ├── setup │ ├── cleanup_script.sh │ ├── docker-compose.yml │ └── setup_postgres.sh ├── template_aws_s3.py └── template_bigquery.py ├── chapter08 ├── 1.detect_missing_data.py ├── 10.winsorizing.py ├── 11.data_transformation.py ├── 12.mahalanobis_distance.py ├── 13.clustering.py ├── 14.multivariate_trimming.py ├── 2.delete_missing_data.py ├── 3.mean_imputation.py ├── 4.median_imputation.py ├── 5.indicator_imputation.py ├── 6.outliers_visualisation.py ├── 7.identify_univariate_outliers.py ├── 8.handle_univariate_outliers_deletions.py └── 9.trimming.py ├── chapter09 ├── min_max_scaling.py ├── robust_scaler.py └── zscaler.py ├── chapter10 ├── 1a.label_encoding.py ├── 1b.label_encoding_forced.py ├── 2.one_hot_encoding.py ├── 3.target_encoding.py ├── 
4.frequency_encoding.py └── 5.binary_encoding.py ├── chapter11 ├── 1.decomposing_time_series │ ├── noise.py │ ├── seasonality.py │ └── trend.py ├── 2.types │ ├── multivariate.py │ └── univariate.py ├── 3.missing_values │ ├── 1.identify_missing_values.py │ ├── 2.remove_missing_values.py │ ├── 3.back_forward_fill.py │ └── 4.interpolation.py ├── 4.analisis │ └── autocorrelation.py ├── 5.outliers │ ├── 1.seasonal_decomposition.py │ ├── 2.autocorrelation.py │ ├── 3.arima.py │ └── 4.moving_average.py └── 6.feature_engineering │ ├── 1.lags.py │ └── 2.seasonal_differencing.py ├── chapter12 ├── 1.text_cleaning.py ├── 10.word_tokenisation.py ├── 11.bpe_tokeniser.py ├── 12.tokenisation_wordpiece.py ├── 13.specialised_tokenisers.py ├── 14.embedding_bert.py ├── 15.embedding_bge.py ├── 16.embedding_gte.py ├── 2.punctuation.py ├── 3.pii_detection.py ├── 4.rare_words.py ├── 5.spelling_checker.py ├── 6.fuzzy_matching.py ├── 7.fixed_chunking.py ├── 8.paragraph_chunking.py ├── 9.semantic_chunking.py └── 9.semantic_similarity.py └── chapter13 ├── 1.image_prerpocessing.py ├── 2.ocr.py ├── 3.ocr_with_llms.py ├── 4.image_captioning.py ├── 5.whisper.py ├── 6.emotion_detection.py ├── 7.write_highlights.py ├── audio └── 3.chain orchestrator.mp3 └── images ├── 1.png ├── 10.png ├── 11.png ├── 12.png ├── 13.png ├── 14.png ├── 15.png ├── 16.png ├── 17.png ├── 18.png ├── 19.png ├── 2.png ├── 20.png ├── 21.png ├── 22.png ├── 23.png ├── 24.png ├── 25.png ├── 3.png ├── 4.png ├── 5.png ├── 6.png ├── 7.png ├── 8.png └── 9.png /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /chapter01/1.batch.py: -------------------------------------------------------------------------------- 1 | import time 2 | import random 3 | 4 | # Step 1: Generate Mock Data 5 | def generate_mock_data(num_records): 6 | data = [] 7 | for _ in range(num_records): 8 | record = { 9 | 'id': random.randint(1, 1000), 10 | 'value': random.random() * 100 11 | } 12 | data.append(record) 13 | return data 14 | 15 | # Step 2: Batch Processing 16 | def process_in_batches(data, batch_size): 17 | for i in range(0, len(data), batch_size): 18 | yield data[i:i + batch_size] 19 | 20 | # Step 3: Transform Data 21 | def transform_data(batch): 22 | transformed_batch = [] 23 | for record in batch: 24 | transformed_record = { 25 | 'id': record['id'], 26 | 'value': record['value'], 27 | 'transformed_value': record['value'] * 1.1 # Example transformation 28 | } 29 | transformed_batch.append(transformed_record) 30 | return transformed_batch 31 | 32 | # Step 4: Load Data 33 | def load_data(batch): 34 | for record in batch: 35 | # Simulate loading data into a database 36 | print(f"Loading record into database: {record}") 37 | 38 | # Main Function 39 | def main(): 40 | # Parameters 41 | num_records = 100 # Total number of records to generate 42 | batch_size = 10 # Number of records per batch 43 | 44 | # Generate data 45 | data = generate_mock_data(num_records) 46 | print("Original data:",data) 47 | 48 | # Process and load data in batches 49 | for batch in process_in_batches(data, batch_size): 50 | transformed_batch = transform_data(batch) 51 | print("Batch before loading:") 52 | for record in transformed_batch: 53 | print(record) 54 | load_data(transformed_batch) 55 | time.sleep(1) # Simulate time delay between batches 56 | 57 | if __name__ == "__main__": 58 | main() 59 | -------------------------------------------------------------------------------- /chapter01/2.real_time_streaming.py: -------------------------------------------------------------------------------- 1 | import time 2 | import random 3 | 4 | # Step 1: Generate Mock Data Continuously 5 | def generate_mock_data(): 6 | while True: 7 | record = { 8 | 'id': random.randint(1, 1000), 9 | 'value': random.random() * 100 10 | } 11 | yield record 12 | time.sleep(0.5) # Simulate data arriving every 0.5 seconds 13 | 14 | # Step 2: Stream Processing with a time limit 15 | def process_stream(run_time_seconds=10): 16 | start_time = time.time() 17 | for record in generate_mock_data(): 18 | transformed_record = transform_data(record) 19 | load_data(transformed_record) 20 | 21 | # Check if the run time has exceeded the limit 22 | if time.time() - start_time > run_time_seconds: 23 | print("Time limit reached. 
Terminating the stream processing.") 24 | break 25 | 26 | # Step 3: Transform Data 27 | def transform_data(record): 28 | transformed_record = { 29 | 'id': record['id'], 30 | 'value': record['value'], 31 | 'transformed_value': record['value'] * 1.1 # Example transformation 32 | } 33 | return transformed_record 34 | 35 | # Step 4: Load Data 36 | def load_data(record): 37 | # Simulate loading data into a database 38 | print(f"Loading record into database: {record}") 39 | 40 | # Main Function 41 | def main(): 42 | process_stream(run_time_seconds=10) # Run the stream for 10 seconds 43 | 44 | if __name__ == "__main__": 45 | main() 46 | -------------------------------------------------------------------------------- /chapter01/3.semi_real_time.py: -------------------------------------------------------------------------------- 1 | import time 2 | import random 3 | from collections import deque 4 | 5 | # Step 1: Generate Mock Data Continuously 6 | def generate_mock_data(): 7 | while True: 8 | record = { 9 | 'id': random.randint(1, 1000), 10 | 'value': random.random() * 100 11 | } 12 | yield record 13 | time.sleep(0.1) # Simulate data arriving every 0.1 seconds 14 | 15 | # Step 2: Process Semi-Real-Time 16 | def process_semi_real_time(batch_size, interval): 17 | buffer = deque() 18 | start_time = time.time() 19 | 20 | for record in generate_mock_data(): 21 | buffer.append(record) 22 | 23 | # Check if interval has elapsed or buffer size reached 24 | if (time.time() - start_time) >= interval or len(buffer) >= batch_size: 25 | # Process and clear the buffer 26 | transformed_batch = transform_data(list(buffer)) # Convert deque to list 27 | print(f"Batch of {len(transformed_batch)} records before loading:") 28 | for rec in transformed_batch: 29 | print(rec) 30 | load_data(transformed_batch) 31 | buffer.clear() 32 | start_time = time.time() # Reset start time 33 | 34 | # Step 3: Transform Data 35 | def transform_data(batch): 36 | transformed_batch = [] 37 | for record in batch: 38 | transformed_record = { 39 | 'id': record['id'], 40 | 'value': record['value'], 41 | 'transformed_value': record['value'] * 1.1 # Example transformation 42 | } 43 | transformed_batch.append(transformed_record) 44 | return transformed_batch 45 | 46 | # Step 4: Load Data 47 | def load_data(batch): 48 | for record in batch: 49 | # Simulate loading data into a database 50 | print(f"Loading record into database: {record}") 51 | 52 | # Main Function 53 | def main(): 54 | batch_size = 5 # Number of records to process per batch 55 | interval = 3.0 # Maximum time interval (in seconds) to process a batch 56 | 57 | process_semi_real_time(batch_size, interval) 58 | 59 | if __name__ == "__main__": 60 | main() 61 | -------------------------------------------------------------------------------- /chapter01/4.work_with_queue.py: -------------------------------------------------------------------------------- 1 | from queue import Queue 2 | 3 | def read_message_queue(): 4 | q = Queue() 5 | 6 | # Adding messages to the queue 7 | for i in range(10): # Mocking messages 8 | q.put(f"message {i}") 9 | 10 | # Reading and processing messages from the queue 11 | while not q.empty(): 12 | message = q.get() 13 | process_message(message) 14 | q.task_done() # Signal that the task is done 15 | 16 | def process_message(message): 17 | print(f"Processing message: {message}") 18 | 19 | # Example usage 20 | read_message_queue() 21 | -------------------------------------------------------------------------------- /chapter01/5.sql_databases.py: 
-------------------------------------------------------------------------------- 1 | def read_sql(): 2 | # Simulating a SQL table with a dictionary 3 | sql_table = [ 4 | {"id": 1, "name": "Alice", "age": 30}, 5 | {"id": 2, "name": "Bob", "age": 24}, 6 | ] 7 | for row in sql_table: 8 | process_row(row) 9 | return sql_table 10 | def process_row(row): 11 | print(f"Processing row: id={row['id']}, name={row['name']}, age={row['age']}") 12 | 13 | # Example usage 14 | sql_table = read_sql() # capture the returned table so the summary print below can use it 15 | 16 | print(f"{'id':<5} {'name':<10} {'age':<3}") 17 | print("-" * 20) 18 | # Print each row 19 | for row in sql_table: 20 | print(f"{row['id']:<5} {row['name']:<10} {row['age']:<3}") 21 | -------------------------------------------------------------------------------- /chapter01/6.no_sql_databases.py: -------------------------------------------------------------------------------- 1 | def read_nosql(): 2 | data_store = { 3 | "1": {"name": "Alice", "age": 30}, 4 | "2": {"name": "Bob", "age": 24}, 5 | } 6 | for key, value in data_store.items(): 7 | process_entry(key, value) 8 | 9 | def process_entry(key, value): 10 | print(f"Processing key: {key} with value: {value}") 11 | 12 | # Example usage 13 | read_nosql() 14 | -------------------------------------------------------------------------------- /chapter01/7.api.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import pandas as pd 3 | 4 | # Define the API endpoint URL 5 | url = "https://www.thecocktaildb.com/api/json/v1/1/search.php?s=margarita" 6 | 7 | # Make the API request 8 | response = requests.get(url) 9 | 10 | # Check if the request was successful (status code 200) 11 | if response.status_code == 200: 12 | # Extract the response JSON data 13 | data = response.json() 14 | 15 | # Check if the API response contains cocktails data 16 | if 'drinks' in data: 17 | # Create DataFrame from drinks data 18 | df = pd.DataFrame(data['drinks']) 19 | 20 | # Print the resulting DataFrame 21 | print(df.head()) 22 | else: 23 | print("No drinks found.") 24 | else: 25 | print(f"Failed to retrieve data from API. Status code: {response.status_code}") 26 | -------------------------------------------------------------------------------- /chapter02/accuracy.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Sample dataset 4 | data = { 5 | 'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'], 6 | 'Age': [25, 30, 28, 28, 22], 7 | 'Gender': ['Female', 'Male', 'Male', 'Male', 'Female'], 8 | 'City': ['New York', 'Los Angeles', 'Chicago', 'New York', 'San Francisco'] 9 | } 10 | 11 | # Reference dataset for accuracy comparison 12 | reference_data = { 13 | 'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'], 14 | 'Age': [25, 30, 29, 28, 22], 15 | 'Gender': ['Female', 'Male', 'Male', 'Male', 'Female'], 16 | 'City': ['New York', 'Los Angeles', 'Chicago', 'New York', 'San Francisco'] 17 | } 18 | 19 | df = pd.DataFrame(data) 20 | reference_df = pd.DataFrame(reference_data) 21 | 22 | # Step 1: Import necessary libraries 23 | # We import the pandas library to work with the dataset. 24 | 25 | # Step 2: Create a sample dataset and a reference dataset 26 | # We create a sample dataset and a reference dataset with the same structure.
27 | 28 | # Step 3: Create DataFrames 29 | df = pd.DataFrame(data) 30 | reference_df = pd.DataFrame(reference_data) 31 | 32 | # Step 4: Compare data to the reference 33 | accuracy_check = df == reference_df 34 | 35 | # Step 5: Calculate accuracy percentage 36 | accuracy_percentage = accuracy_check.mean() * 100 37 | # We calculate the accuracy percentage by taking the mean of the accuracy check for each column and multiplying by 100. 38 | 39 | # Step 6: Display the accuracy results 40 | print("Accuracy Check:") 41 | print(accuracy_check) 42 | print("\nAccuracy Percentage:") 43 | print(accuracy_percentage) 44 | -------------------------------------------------------------------------------- /chapter02/average_timeliness.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from datetime import datetime, timedelta 4 | 5 | # Generate a random dataset with timestamps 6 | np.random.seed(0) # For reproducibility 7 | n_samples = 100 8 | start_time = datetime(2023, 10, 25, 9, 0, 0) 9 | end_time = datetime(2023, 10, 25, 16, 0, 0) 10 | 11 | timestamps = [start_time + timedelta(minutes=np.random.randint(0, (end_time - start_time).total_seconds() / 60)) for _ in range(n_samples)] 12 | values = np.random.randint(50, 101, n_samples) 13 | 14 | df = pd.DataFrame({'Timestamp': timestamps, 'Value': values}) 15 | 16 | # Reference timestamp (current time for this example) 17 | reference_timestamp = datetime(2023, 10, 25, 12, 0, 0) 18 | 19 | # Define a timeliness threshold (in minutes) 20 | timeliness_threshold = 30 21 | 22 | # Calculate timeliness 23 | df['Timeliness'] = (reference_timestamp - df['Timestamp']).dt.total_seconds() / 60 24 | df['Timely'] = df['Timeliness'] <= timeliness_threshold 25 | 26 | # Calculate the average timeliness 27 | average_timeliness = df['Timeliness'].mean() 28 | 29 | # Display results 30 | print("Dataset with Timestamps:") 31 | print(df.head()) 32 | 33 | print("\nAverage Timeliness (in minutes):", average_timeliness) 34 | print("Percentage of Timely Records:", (df['Timely'].sum() / n_samples) * 100, "%") 35 | -------------------------------------------------------------------------------- /chapter02/completeness.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Sample dataset 4 | data = { 5 | 'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'], 6 | 'Age': [25, 30, None, 28, 22], 7 | 'Gender': ['Female', 'Male', 'Male', 'Male', 'Female'], 8 | 'City': ['New York', 'Los Angeles', 'Chicago', None, 'San Francisco'] 9 | } 10 | 11 | df = pd.DataFrame(data) 12 | 13 | # Step 1: Import necessary libraries 14 | # We import the pandas library to work with the dataset. 15 | 16 | # Step 2: Create a sample dataset 17 | # We create a simple dataset with columns 'Name', 'Age', 'Gender', and 'City'. Some values are intentionally missing (represented as 'None'). 18 | 19 | # Step 3: Create a DataFrame 20 | df = pd.DataFrame(data) 21 | # We create a DataFrame using the sample data. 22 | 23 | # Step 4: Check completeness 24 | completeness = df.isnull().sum() 25 | # The .isnull() method checks for missing values in the DataFrame, and .sum() counts the missing values for each column. 
26 | 27 | # Step 5: Calculate completeness percentage 28 | total_records = len(df) 29 | completeness_percentage = (1- completeness / total_records) * 100 30 | # We calculate the completeness percentage by dividing the count of missing values by the total number of records and then multiplying by 100. 31 | 32 | # Step 6: Display the completeness results 33 | print("Completeness Check:") 34 | print(completeness) 35 | print("\nCompleteness Percentage:") 36 | print(completeness_percentage) 37 | -------------------------------------------------------------------------------- /chapter02/consistency.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Create a sample dataset 4 | data = { 5 | 'ProductID': [1, 2, 3, 4, 5], 6 | 'ProductName': ['PROD001', 'PROD002', 'Product003', 'PROD004', 'PROD005'], 7 | } 8 | 9 | df = pd.DataFrame(data) 10 | 11 | # Define the expected prefix 12 | expected_prefix = "PROD" 13 | 14 | # Check consistency and create a boolean mask for inconsistent names 15 | inconsistent_mask = ~df['ProductName'].str.startswith(expected_prefix) 16 | 17 | # Create a new column to indicate consistency 18 | df['Consistency'] = ~inconsistent_mask 19 | 20 | # Calculate the percentage of consistent rows 21 | consistent_percentage = (df['Consistency'].sum() / len(df)) * 100 22 | 23 | # Display the dataset with the consistency check results 24 | print("Dataset with Consistency Check:") 25 | print(df) 26 | 27 | # Display the percentage of consistent rows 28 | print(f"Percentage of Consistent Rows: {consistent_percentage:.2f}%") -------------------------------------------------------------------------------- /chapter02/data_compliance.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | # Simulate a dataset with compliance checks 4 | def simulate_data_compliance(num_records): 5 | data_records = [] 6 | compliant_count = 0 # Counter for compliant records 7 | 8 | for _ in range(num_records): 9 | # Generate a random record (e.g., containing age and consent fields) 10 | age = random.randint(18, 100) 11 | consent_given = random.choice([True, False]) 12 | 13 | # Define compliance rules 14 | age_rule = age >= 18 15 | consent_rule = age >= 18 and consent_given 16 | 17 | # Check compliance with specific regulations 18 | age_compliant = "Age Compliant" if age_rule else "Age Non-Compliant" 19 | consent_compliant = "Consent Compliant" if consent_rule else "Consent Non-Compliant" 20 | 21 | # Define overall compliance status 22 | compliance_status = "Compliant" if age_rule and consent_rule else "Non-Compliant" 23 | 24 | # Count compliant records 25 | if compliance_status == "Compliant": 26 | compliant_count += 1 27 | 28 | data_records.append({ 29 | "Age": age, 30 | "Consent Given": consent_given, 31 | "Age Compliance": age_compliant, 32 | "Consent Compliance": consent_compliant, 33 | "Overall Compliance Status": compliance_status 34 | }) 35 | 36 | # Calculate the percentage of compliant records 37 | percentage_compliant = (compliant_count / num_records) * 100 38 | 39 | return data_records, percentage_compliant 40 | 41 | # Define the number of data records to simulate 42 | num_records = 100 43 | 44 | # Simulate data compliance checks 45 | data_records, percentage_compliant = simulate_data_compliance(num_records) 46 | 47 | # Display the results for a sample of data records and the percentage of compliance 48 | sample_size = 10 49 | for record in data_records[:sample_size]: 50 | 
print(record) 51 | 52 | print(f"\nPercentage of Compliant Records: {percentage_compliant:.2f}%") 53 | -------------------------------------------------------------------------------- /chapter02/data_usage.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | # Simulated data usage metrics 4 | def simulate_data_usage(): 5 | # Simulate the number of users in the organization 6 | num_users = 500 7 | 8 | # Simulate data utilization rates for each user (percentage) 9 | data_utilization_rates = [random.uniform(20, 90) for _ in range(num_users)] 10 | 11 | # Simulate the number of data requests or queries made by each user 12 | data_requests = [random.randint(1, 100) for _ in range(num_users)] 13 | 14 | # Calculate the overall data utilization rate for the organization 15 | organization_data_utilization_rate = sum(data_utilization_rates) / num_users 16 | 17 | # Calculate the total number of data requests or queries 18 | total_data_requests = sum(data_requests) 19 | 20 | # Simulate user satisfaction surveys (on a scale of 1 to 5) 21 | user_satisfaction_scores = [random.randint(1, 5) for _ in range(num_users)] 22 | 23 | # Calculate average user satisfaction score 24 | avg_user_satisfaction_score = sum(user_satisfaction_scores) / num_users 25 | 26 | return { 27 | "data_utilization_rates": data_utilization_rates, 28 | "organization_data_utilization_rate": organization_data_utilization_rate, 29 | "data_requests": data_requests, 30 | "total_data_requests": total_data_requests, 31 | "user_satisfaction_scores": user_satisfaction_scores, 32 | "avg_user_satisfaction_score": avg_user_satisfaction_score, 33 | } 34 | 35 | # Run the simulation 36 | data_usage_metrics = simulate_data_usage() 37 | 38 | # Display the results 39 | print("\nOrganization Data Utilization Rate:") 40 | print(f"{data_usage_metrics['organization_data_utilization_rate']:.2f}%") 41 | print("\nTotal Number of Data Requests or Queries:") 42 | print(data_usage_metrics["total_data_requests"]) 43 | print("\nAverage User Satisfaction Score:") 44 | print(f"{data_usage_metrics['avg_user_satisfaction_score']:.2f}") 45 | -------------------------------------------------------------------------------- /chapter02/duplication.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Create a sample dataset with duplicate records 4 | data = { 5 | 'EmployeeID': [101, 102, 103, 101, 104, 105, 102], 6 | 'FirstName': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Bob'], 7 | 'LastName': ['Smith', 'Johnson', 'Brown', 'Davis', 'Lee', 'White', 'Johnson'], 8 | } 9 | 10 | df = pd.DataFrame(data) 11 | 12 | # Check for duplicate records based on the 'EmployeeID' column 13 | duplicated_mask = df.duplicated(subset='EmployeeID', keep='first') 14 | 15 | # Create a new column to indicate duplicate records 16 | df['IsDuplicate'] = duplicated_mask 17 | 18 | # Calculate the percentage of duplicate records 19 | duplicate_percentage = (df['IsDuplicate'].sum() / len(df)) * 100 20 | 21 | # Display the dataset with the duplicate records marked 22 | print("Dataset with Duplicate Records:") 23 | print(df) 24 | 25 | # Display the percentage of duplicate records 26 | print(f"Percentage of Duplicate Records: {duplicate_percentage:.2f}%") 27 | -------------------------------------------------------------------------------- /chapter02/timeliness.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from 
datetime import datetime 3 | 4 | # Sample dataset with timestamps 5 | data = { 6 | 'Timestamp': ['2023-10-25 10:00:00', '2023-10-25 11:00:00', '2023-10-25 12:00:00'], 7 | 'Value': [50, 55, 60] 8 | } 9 | 10 | # Convert the 'Timestamp' column to datetime objects 11 | df = pd.DataFrame(data) 12 | df['Timestamp'] = pd.to_datetime(df['Timestamp']) 13 | 14 | # Reference timestamp (current time for this example) 15 | reference_timestamp = datetime(2023, 10, 25, 12, 30, 0) 16 | 17 | # Step 1: Import necessary libraries and create the dataset 18 | # We import Pandas and the datetime module and create a sample dataset with timestamps. 19 | 20 | # Step 2: Convert timestamps to datetime objects 21 | # We convert the 'Timestamp' column to datetime objects to work with timestamps effectively. 22 | 23 | # Step 3: Define the reference timestamp 24 | # In this example, we set a reference timestamp, which represents the current time. 25 | 26 | # Step 4: Calculate timeliness 27 | timeliness_check = df['Timestamp'] < reference_timestamp 28 | 29 | # Step 5: Display timeliness results 30 | print("Timeliness Check:") 31 | print(timeliness_check) 32 | -------------------------------------------------------------------------------- /chapter02/uniqueness.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Create a sample dataset 4 | data = { 5 | 'Email': ['john.doe@example.com', 'jane.smith@example.com', 'james.doe@example.com', 'susan.brown@example.com'], 6 | } 7 | 8 | df = pd.DataFrame(data) 9 | 10 | # Check uniqueness and create a boolean mask for duplicated email addresses 11 | duplicated_mask = df['Email'].duplicated(keep='first') 12 | 13 | # Create a new column to indicate uniqueness 14 | df['Uniqueness'] = ~duplicated_mask 15 | 16 | # Calculate the percentage of unique records 17 | unique_percentage = (df['Uniqueness'].sum() / len(df)) * 100 18 | 19 | # Display the dataset with the uniqueness check results 20 | print("Dataset with Uniqueness Check:") 21 | print(df) 22 | 23 | # Display the percentage of unique records 24 | print(f"Percentage of Unique Records: {unique_percentage:.2f}%") 25 | -------------------------------------------------------------------------------- /chapter03/great_expectations/code/1.data_set_up.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | # Load the 'iris' dataset from seaborn library 5 | iris_data = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv') 6 | 7 | iris_data.to_csv('../data/iris_data.csv', index=False) 8 | print("File written! :)") 9 | -------------------------------------------------------------------------------- /chapter03/great_expectations/code/2.mock_test_dataset.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | # Load the 'iris' dataset from seaborn library 5 | iris_data = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv') 6 | 7 | # do some transformtions that will fail the expectations 8 | #update values 9 | iris_data['sepal_length'] = 60 10 | 11 | #rename columns 12 | iris_data.rename(columns={'petal_width': 'petal_w'}, inplace=True) 13 | 14 | #write dataframe 15 | iris_data.to_csv('../data/iris_data_test.csv', index=False) 16 | print("File written! 
:)") 17 | -------------------------------------------------------------------------------- /chapter03/great_expectations/code/3.with_pandas_profiler.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from ydata_profiling import ProfileReport 3 | 4 | # Load the 'iris' dataset from seaborn library 5 | iris_data = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv') 6 | 7 | # Then run Pandas Profiling 8 | profile = ProfileReport(iris_data, title="Pandas Profiling Report", explorative=True) 9 | 10 | # And obtain an Expectation Suite from the profile report 11 | suite = profile.to_expectation_suite(suite_name="my_pandas_profiling_suite") 12 | -------------------------------------------------------------------------------- /chapter03/great_expectations/great_expectations/checkpoints/expect_iris_ckpnt.yml: -------------------------------------------------------------------------------- 1 | name: expect_iris_ckpnt 2 | config_version: 1.0 3 | template_name: 4 | module_name: great_expectations.checkpoint 5 | class_name: SimpleCheckpoint 6 | run_name_template: '%Y%m%d-%H%M%S-my-run-name-template' 7 | expectation_suite_name: 8 | batch_request: {} 9 | action_list: 10 | - name: store_validation_result 11 | action: 12 | class_name: StoreValidationResultAction 13 | - name: store_evaluation_params 14 | action: 15 | class_name: StoreEvaluationParametersAction 16 | - name: update_data_docs 17 | action: 18 | class_name: UpdateDataDocsAction 19 | evaluation_parameters: {} 20 | runtime_configuration: {} 21 | validations: 22 | - batch_request: 23 | datasource_name: iris_data.csv 24 | data_connector_name: default_inferred_data_connector_name 25 | data_asset_name: iris_data_test.csv 26 | data_connector_query: 27 | index: -1 28 | expectation_suite_name: expect_iris 29 | profilers: [] 30 | ge_cloud_id: 31 | expectation_suite_ge_cloud_id: 32 | -------------------------------------------------------------------------------- /chapter03/great_expectations/great_expectations/plugins/custom_data_docs/styles/data_docs_custom_styles.css: -------------------------------------------------------------------------------- 1 | /*index page*/ 2 | .ge-index-page-site-name-title {} 3 | .ge-index-page-table-container {} 4 | .ge-index-page-table {} 5 | .ge-index-page-table-profiling-links-header {} 6 | .ge-index-page-table-expectations-links-header {} 7 | .ge-index-page-table-validations-links-header {} 8 | .ge-index-page-table-profiling-links-list {} 9 | .ge-index-page-table-profiling-links-item {} 10 | .ge-index-page-table-expectation-suite-link {} 11 | .ge-index-page-table-validation-links-list {} 12 | .ge-index-page-table-validation-links-item {} 13 | 14 | /*breadcrumbs*/ 15 | .ge-breadcrumbs {} 16 | .ge-breadcrumbs-item {} 17 | 18 | /*navigation sidebar*/ 19 | .ge-navigation-sidebar-container {} 20 | .ge-navigation-sidebar-content {} 21 | .ge-navigation-sidebar-title {} 22 | .ge-navigation-sidebar-link {} 23 | -------------------------------------------------------------------------------- /chapter03/great_expectations/great_expectations/uncommitted/data_docs/local_site/static/fonts/HKGrotesk/HKGrotesk-Italic.otf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter03/great_expectations/great_expectations/uncommitted/data_docs/local_site/static/fonts/HKGrotesk/HKGrotesk-Italic.otf -------------------------------------------------------------------------------- /chapter03/great_expectations/great_expectations/uncommitted/data_docs/local_site/static/fonts/HKGrotesk/HKGrotesk-LightItalic.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter03/great_expectations/great_expectations/uncommitted/data_docs/local_site/static/fonts/HKGrotesk/HKGrotesk-LightItalic.otf -------------------------------------------------------------------------------- /chapter03/great_expectations/great_expectations/uncommitted/data_docs/local_site/static/fonts/HKGrotesk/HKGrotesk-MediumItalic.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter03/great_expectations/great_expectations/uncommitted/data_docs/local_site/static/fonts/HKGrotesk/HKGrotesk-MediumItalic.otf -------------------------------------------------------------------------------- /chapter03/great_expectations/great_expectations/uncommitted/data_docs/local_site/static/fonts/HKGrotesk/HKGrotesk-SemiBoldItalic.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter03/great_expectations/great_expectations/uncommitted/data_docs/local_site/static/fonts/HKGrotesk/HKGrotesk-SemiBoldItalic.otf -------------------------------------------------------------------------------- /chapter03/great_expectations/great_expectations/uncommitted/data_docs/local_site/static/images/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter03/great_expectations/great_expectations/uncommitted/data_docs/local_site/static/images/favicon.ico -------------------------------------------------------------------------------- /chapter03/great_expectations/great_expectations/uncommitted/data_docs/local_site/static/images/glossary_scroller.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter03/great_expectations/great_expectations/uncommitted/data_docs/local_site/static/images/glossary_scroller.gif -------------------------------------------------------------------------------- /chapter03/great_expectations/great_expectations/uncommitted/data_docs/local_site/static/images/iterative-dev-loop.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter03/great_expectations/great_expectations/uncommitted/data_docs/local_site/static/images/iterative-dev-loop.png -------------------------------------------------------------------------------- /chapter03/great_expectations/great_expectations/uncommitted/data_docs/local_site/static/images/logo-long.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter03/great_expectations/great_expectations/uncommitted/data_docs/local_site/static/images/logo-long.png -------------------------------------------------------------------------------- /chapter03/great_expectations/great_expectations/uncommitted/data_docs/local_site/static/images/short-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter03/great_expectations/great_expectations/uncommitted/data_docs/local_site/static/images/short-logo.png -------------------------------------------------------------------------------- /chapter03/great_expectations/great_expectations/uncommitted/data_docs/local_site/static/images/validation_failed_unexpected_values.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter03/great_expectations/great_expectations/uncommitted/data_docs/local_site/static/images/validation_failed_unexpected_values.gif -------------------------------------------------------------------------------- /chapter03/great_expectations/great_expectations/uncommitted/data_docs/local_site/static/styles/data_docs_custom_styles_template.css: -------------------------------------------------------------------------------- 1 | /*index page*/ 2 | .ge-index-page-site-name-title {} 3 | .ge-index-page-table-container {} 4 | .ge-index-page-table {} 5 | .ge-index-page-table-profiling-links-header {} 6 | .ge-index-page-table-expectations-links-header {} 7 | .ge-index-page-table-validations-links-header {} 8 | .ge-index-page-table-profiling-links-list {} 9 | .ge-index-page-table-profiling-links-item {} 10 | .ge-index-page-table-expectation-suite-link {} 11 | .ge-index-page-table-validation-links-list {} 12 | .ge-index-page-table-validation-links-item {} 13 | 14 | /*breadcrumbs*/ 15 | .ge-breadcrumbs {} 16 | .ge-breadcrumbs-item {} 17 | 18 | /*navigation sidebar*/ 19 | .ge-navigation-sidebar-container {} 20 | .ge-navigation-sidebar-content {} 21 | .ge-navigation-sidebar-title {} 22 | .ge-navigation-sidebar-link {} 23 | -------------------------------------------------------------------------------- /chapter03/intoduction/identify_trends.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import seaborn as sns 3 | import numpy as np 4 | 5 | # Generate hypothetical sales data 6 | np.random.seed(42) 7 | sales_data = np.random.normal(loc=1000, scale=300, size=1000) 8 | 9 | # Plotting the distribution 10 | plt.figure(figsize=(10, 6)) 11 | sns.histplot(sales_data, bins=30, kde=True, color='skyblue') 12 | 
plt.title('Distribution of Daily Sales Revenue') 13 | plt.xlabel('Sales Revenue') 14 | plt.ylabel('Frequency') 15 | plt.show() 16 | -------------------------------------------------------------------------------- /chapter04/1.descriptive_stats.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | 6 | # Create the initial and expanded e-commerce dataset 7 | data = { 8 | 'CustomerID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 9 | 'ProductName': ['Product_A', 'Product_B', 'Product_C', 'Product_A', 'Product_B', 'Product_A', 'Product_B', 'Product_C', 'Product_A', 'Product_B', 'Product_C'], 10 | 'PurchaseAmount': [50, 75, 120, 60, 80, 55, 90, 110, 70, 85, 130], 11 | 'PaymentMethod': ['Card', 'PayPal', 'Cash', 'Card', 'Bank Transfer', 'Card', 'PayPal', 'Cash', 'Card', 'Bank Transfer', 'Card'], 12 | 'Timestamp': ['2022-01-01 08:30:45', '2022-01-02 14:20:30', '2022-01-03 20:15:10', '2022-01-04 12:45:30', '2022-01-05 18:10:55', '2022-01-06 09:30:15', '2022-01-07 15:40:20', '2022-01-08 22:25:50', '2022-01-09 14:55:45', '2022-01-10 19:30:10', '2022-01-11 08:45:30'] 13 | } 14 | 15 | df = pd.DataFrame(data) 16 | 17 | # Convert 'Timestamp' to datetime 18 | df['Timestamp'] = pd.to_datetime(df['Timestamp']) 19 | 20 | # Display the initial and expanded dataset 21 | print("Initial and Expanded Dataset:") 22 | print(df) 23 | 24 | # Remove irrelevant column 'CustomerID' 25 | df = df.drop(columns=['CustomerID']) 26 | 27 | # Descriptive statistics 28 | desc_stats = df.describe() 29 | print("\nDescriptive Statistics:") 30 | print(desc_stats) 31 | 32 | # Visualize distributions 33 | plt.figure(figsize=(15, 8)) 34 | 35 | # Distribution of Purchase Amount 36 | plt.subplot(2, 2, 1) 37 | sns.histplot(df['PurchaseAmount'], kde=True, color='skyblue') 38 | plt.title('Distribution of Purchase Amount') 39 | 40 | # Distribution of Payment Methods 41 | plt.subplot(2, 2, 2) 42 | sns.countplot(x='PaymentMethod', data=df, palette='Set2') 43 | plt.title('Distribution of Payment Methods') 44 | 45 | # Distribution of Product Names 46 | plt.subplot(2, 1, 2) 47 | sns.countplot(x='ProductName', data=df, palette='Set2') 48 | plt.title('Distribution of Product Names') 49 | 50 | plt.tight_layout() 51 | plt.show() 52 | -------------------------------------------------------------------------------- /chapter04/2.rename_columns.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | # Create the initial and expanded e-commerce dataset 5 | data = { 6 | 'CustomerID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 7 | 'ProductName': ['Product_A', 'Product_B', 'Product_C', 'Product_A', 'Product_B', 'Product_A', 'Product_B', 'Product_C', 'Product_A', 'Product_B', 'Product_C'], 8 | 'PurchaseAmount': [50, 75, 120, 60, 80, 55, 90, 110, 70, 85, 130], 9 | 'PaymentMethod': ['Card', 'PayPal', 'Cash', 'Card', 'Bank Transfer', 'Card', 'PayPal', 'Cash', 'Card', 'Bank Transfer', 'Card'], 10 | 'Timestamp': ['2022-01-01 08:30:45', '2022-01-02 14:20:30', '2022-01-03 20:15:10', '2022-01-04 12:45:30', '2022-01-05 18:10:55', '2022-01-06 09:30:15', '2022-01-07 15:40:20', '2022-01-08 22:25:50', '2022-01-09 14:55:45', '2022-01-10 19:30:10', '2022-01-11 08:45:30'] 11 | } 12 | 13 | df = pd.DataFrame(data) 14 | 15 | # Convert 'Timestamp' to datetime 16 | df['Timestamp'] = pd.to_datetime(df['Timestamp']) 17 | 18 | # Display the initial and expanded dataset 19 | print("Initial 
and Expanded Dataset:") 20 | print(df) 21 | 22 | # Scenario: Renaming Columns with Error Handling 23 | 24 | try: 25 | # Attempt to rename a single column 26 | df.rename(columns={'ProductName': 'OldProductName'}, inplace=True) 27 | except ValueError as ve: 28 | print(f"Error: {ve}") 29 | 30 | # Check if the column exists before renaming 31 | if 'OldProductName' in df.columns: 32 | try: 33 | # Attempt to rename multiple columns 34 | df.rename(columns={'OldProductName': 'NewProductName', 'PurchaseAmount': 'NewPurchaseAmount'}, inplace=True) 35 | except ValueError as ve: 36 | print(f"Error: {ve}") 37 | else: 38 | print("Error: Column 'OldProductName' does not exist in the DataFrame.") 39 | 40 | # Display the dataset after renaming (if successful) 41 | print("\nDataset after Renaming (if successful):") 42 | print(df) 43 | -------------------------------------------------------------------------------- /chapter04/3.dropping_columns.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Create the initial e-commerce dataset 4 | data = { 5 | 'CustomerID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 6 | 'NewProductName': ['Product_A', 'Product_B', 'Product_C', 'Product_A', 'Product_B', 'Product_A', 'Product_B', 'Product_C', 'Product_A', 'Product_B', 'Product_C'], 7 | 'NewPurchaseAmount': [50, 75, 120, 60, 80, 55, 90, 110, 70, 85, 130], 8 | 'PaymentMethod': ['Card', 'PayPal', 'Cash', 'Card', 'Bank Transfer', 'Card', 'PayPal', 'Cash', 'Card', 'Bank Transfer', 'Card'], 9 | 'Timestamp': ['2022-01-01 08:30:45', '2022-01-02 14:20:30', '2022-01-03 20:15:10', '2022-01-04 12:45:30', '2022-01-05 18:10:55', '2022-01-06 09:30:15', '2022-01-07 15:40:20', '2022-01-08 22:25:50', '2022-01-09 14:55:45', '2022-01-10 19:30:10', '2022-01-11 08:45:30'] 10 | } 11 | 12 | df = pd.DataFrame(data) 13 | 14 | # Display the initial e-commerce dataset 15 | print("Initial E-commerce Dataset:") 16 | print(df) 17 | 18 | # Display the initial memory usage 19 | print("Initial Memory Usage:") 20 | print(df.memory_usage().sum() / (1024 ** 2), "MB") # Convert bytes to megabytes 21 | 22 | # Save a copy of the DataFrame before dropping columns for comparison 23 | df_before_drop = df.copy() 24 | 25 | # Scenario: Dropping Irrelevant Columns 26 | columns_to_drop = ['CustomerID', 'Timestamp'] # Replace with the names of the columns you want to drop 27 | 28 | try: 29 | # Drop columns considered irrelevant for the current analysis 30 | df.drop(columns=columns_to_drop, inplace=True) 31 | except KeyError as ke: 32 | print(f"Error: {ke}") 33 | 34 | # Display the DataFrame after dropping columns 35 | print("\nDataFrame after Dropping Irrelevant Columns:") 36 | print(df.columns) 37 | 38 | # Display the DataFrame before dropping columns for comparison 39 | print("\nDataFrame Before Dropping Columns:") 40 | print(df_before_drop.columns) 41 | 42 | # Display the memory usage after dropping columns 43 | print("\nMemory Usage After Dropping Columns:") 44 | print(df.memory_usage().sum() / (1024 ** 2), "MB") # Convert bytes to megabytes 45 | 46 | -------------------------------------------------------------------------------- /chapter04/4.data_types.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Create the initial e-commerce dataset 4 | data = { 5 | 'CustomerID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 6 | 'ProductName': ['Product_A', 'Product_B', 'Product_C', 'Product_A', 'Product_B', 'Product_A', 'Product_B', 'Product_C', 
'Product_A', 'Product_B', 'Product_C'], 7 | 'PurchaseAmount': [50, 75, 120, 60, 80, 55, 90, 110, 70, 85, 130], 8 | 'PaymentMethod': ['Card', 'PayPal', 'Cash', 'Card', 'Bank Transfer', 'Card', 'PayPal', 'Cash', 'Card', 'Bank Transfer', 'Card'], 9 | 'Timestamp': ['2022-01-01 08:30:45', '2022-01-02 14:20:30', '2022-01-03 20:15:10', '2022-01-04 12:45:30', '2022-01-05 18:10:55', '2022-01-06 09:30:15', '2022-01-07 15:40:20', '2022-01-08 22:25:50', '2022-01-09 14:55:45', '2022-01-10 19:30:10', '2022-01-11 08:45:30'] 10 | } 11 | 12 | df = pd.DataFrame(data) 13 | 14 | # Display the initial e-commerce dataset 15 | print("Initial E-commerce Dataset:") 16 | print(df) 17 | 18 | # Inspect data types of columns 19 | print("\nData Types of Columns:") 20 | print(df.dtypes) 21 | 22 | # Convert 'PurchaseAmount' to numeric 23 | df['PurchaseAmount'] = pd.to_numeric(df['PurchaseAmount'], errors='coerce') 24 | 25 | # Convert 'ProductName' to string 26 | df['ProductName'] = df['ProductName'].astype('str') 27 | 28 | # Convert 'PaymentMethod' to categorical 29 | df['PaymentMethod'] = df['PaymentMethod'].astype('category') 30 | 31 | # Convert 'CustomerID' to numeric 32 | df['CustomerID'] = pd.to_numeric(df['CustomerID'], errors='coerce') 33 | 34 | # Add a new boolean column 'HasDive' flagging products whose name contains 'Dive' 35 | df['HasDive'] = df['ProductName'].str.contains('Dive', case=False) 36 | df['HasDive'] = df['HasDive'].astype('bool') 37 | 38 | # Display the dataset after type transformations and adding 'HasDive' 39 | print("\nE-commerce Dataset After Type Transformations and Adding 'HasDive':") 40 | print(df) 41 | 42 | # Inspect data types of columns after transformations 43 | print("\nData Types of Columns After Transformations:") 44 | print(df.dtypes) 45 | -------------------------------------------------------------------------------- /chapter04/5.date_time.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from datetime import datetime 3 | from dateutil import parser 4 | 5 | # Sample dataset 6 | data = { 7 | 'CustomerID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 8 | 'ProductName': ['Product_A', 'Product_B', 'Product_C', 'Product_A', 'Product_B', 'Product_A', 'Product_B', 'Product_C', 'Product_A', 'Product_B', 'Product_C'], 9 | 'PurchaseAmount': [50, 75, 120, 60, 80, 55, 90, 110, 70, 85, 130], 10 | 'PaymentMethod': ['Card', 'PayPal', 'Cash', 'Card', 'Bank Transfer', 'Card', 'PayPal', 'Cash', 'Card', 'Bank Transfer', 'Card'], 11 | 'Timestamp': ['2022-01-01 08:30:45', '2022-01-02 14:20:30', '2022-01-03 20:15:10', '2022-01-04 12:45:30', '2022-01-05 18:10:55', '2022-01-06 09:30:15', '2022-01-07 15:40:20', '2022-01-08 22:25:50', '2022-01-09 14:55:45', '2022-01-10 19:30:10', '2022-01-11 08:45:30'] 12 | } 13 | 14 | df = pd.DataFrame(data) 15 | print(df) 16 | 17 | # Method 1: Using strptime 18 | # Comment: Explicitly define the timestamp format for parsing 19 | df['Timestamp1'] = df['Timestamp'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S')) 20 | 21 | # Method 2: Using dateutil.parser.parse() 22 | # Comment: Automatically detect the timestamp format for parsing 23 | df['Timestamp2'] = df['Timestamp'].apply(parser.parse) 24 | 25 | # Method 3: Using pd.to_datetime() 26 | # Comment: A concise method for parsing timestamps in a pandas DataFrame 27 | df['Timestamp3'] = pd.to_datetime(df['Timestamp'], format='%Y-%m-%d %H:%M:%S') 28 | 29 | # Display the DataFrame after parsing 30 | print("\nData Types of Columns:") 31 | print(df.dtypes) 32 | print(df) 33 | 34 |
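# A brief companion sketch, not part of the repository file above: it shows how
# pd.to_datetime can be made tolerant of malformed timestamps. With errors='coerce',
# strings that do not match the expected format become NaT instead of raising, a common
# defensive choice when sources are inconsistent. The 'mixed' series is hypothetical
# example data; pandas is already imported as pd at the top of this script.
mixed = pd.Series(['2022-01-01 08:30:45', 'not-a-timestamp'])
parsed = pd.to_datetime(mixed, format='%Y-%m-%d %H:%M:%S', errors='coerce')
print(parsed)         # the malformed entry becomes NaT
print(parsed.isna())  # boolean mask of rows that failed to parse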
-------------------------------------------------------------------------------- /chapter04/6.format_date.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Sample dataset 4 | data = { 5 | 'CustomerID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 6 | 'ProductName': ['Product_A', 'Product_B', 'Product_C', 'Product_A', 'Product_B', 'Product_A', 'Product_B', 'Product_C', 'Product_A', 'Product_B', 'Product_C'], 7 | 'PurchaseAmount': [50, 75, 120, 60, 80, 55, 90, 110, 70, 85, 130], 8 | 'PaymentMethod': ['Card', 'PayPal', 'Cash', 'Card', 'Bank Transfer', 'Card', 'PayPal', 'Cash', 'Card', 'Bank Transfer', 'Card'], 9 | 'Timestamp': ['2022-01-01 08:30:45', '2022-01-02 14:20:30', '2022-01-03 20:15:10', '2022-01-04 12:45:30', '2022-01-05 18:10:55', '2022-01-06 09:30:15', '2022-01-07 15:40:20', '2022-01-08 22:25:50', '2022-01-09 14:55:45', '2022-01-10 19:30:10', '2022-01-11 08:45:30'] 10 | } 11 | 12 | df = pd.DataFrame(data) 13 | print(df) 14 | 15 | # Ensure 'Timestamp' column is of type datetime 16 | df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%Y-%m-%d %H:%M:%S') 17 | 18 | # Display the DataFrame after parsing 19 | print("\nDataFrame After Parsing:") 20 | print(df) 21 | 22 | # Method 4: Using strftime for custom formatting 23 | # Comment: The strftime method is used to customize the display format of datetime objects 24 | df['FormattedTimestamp'] = df['Timestamp'].dt.strftime('%b %d, %Y %I:%M %p') 25 | 26 | # Display the DataFrame with the formatted timestamp 27 | print("\nDataFrame with Formatted Timestamp:") 28 | print(df[['Timestamp', 'FormattedTimestamp']]) 29 | 30 | # Display data types of columns 31 | print("\nData Types of Columns After Transformations:") 32 | print(df.dtypes) 33 | -------------------------------------------------------------------------------- /chapter04/7.extract_datetime_components.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Sample dataset 4 | data = { 5 | 'CustomerID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 6 | 'ProductName': ['Product_A', 'Product_B', 'Product_C', 'Product_A', 'Product_B', 'Product_A', 'Product_B', 'Product_C', 'Product_A', 'Product_B', 'Product_C'], 7 | 'PurchaseAmount': [50, 75, 120, 60, 80, 55, 90, 110, 70, 85, 130], 8 | 'PaymentMethod': ['Card', 'PayPal', 'Cash', 'Card', 'Bank Transfer', 'Card', 'PayPal', 'Cash', 'Card', 'Bank Transfer', 'Card'], 9 | 'Timestamp': ['2022-01-01 08:30:45', '2022-01-02 14:20:30', '2022-01-03 20:15:10', '2022-01-04 12:45:30', '2022-01-05 18:10:55', '2022-01-06 09:30:15', '2022-01-07 15:40:20', '2022-01-08 22:25:50', '2022-01-09 14:55:45', '2022-01-10 19:30:10', '2022-01-11 08:45:30'] 10 | } 11 | 12 | df = pd.DataFrame(data) 13 | print(df) 14 | 15 | # Ensure 'Timestamp' column is of type datetime 16 | df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%Y-%m-%d %H:%M:%S') 17 | 18 | # Display the DataFrame after parsing 19 | print("DataFrame After Parsing:") 20 | print(df) 21 | 22 | # Extracting Components: Day, Month, Year 23 | df['Day'] = df['Timestamp'].dt.day 24 | df['Month'] = df['Timestamp'].dt.month 25 | df['Year'] = df['Timestamp'].dt.year 26 | 27 | # Display the DataFrame with extracted components 28 | print("\nDataFrame with Extracted Components:") 29 | print(df[['Timestamp', 'Day', 'Month', 'Year']]) 30 | 31 | -------------------------------------------------------------------------------- /chapter04/8.time_deltas.py: 
-------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Sample dataset 4 | data = { 5 | 'CustomerID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 6 | 'ProductName': ['Product_A', 'Product_B', 'Product_C', 'Product_A', 'Product_B', 'Product_A', 'Product_B', 'Product_C', 'Product_A', 'Product_B', 'Product_C'], 7 | 'PurchaseAmount': [50, 75, 120, 60, 80, 55, 90, 110, 70, 85, 130], 8 | 'PaymentMethod': ['Card', 'PayPal', 'Cash', 'Card', 'Bank Transfer', 'Card', 'PayPal', 'Cash', 'Card', 'Bank Transfer', 'Card'], 9 | 'Timestamp': ['2022-01-01 08:30:45', '2022-01-02 14:20:30', '2022-01-03 20:15:10', '2022-01-04 12:45:30', '2022-01-05 18:10:55', '2022-01-06 09:30:15', '2022-01-07 15:40:20', '2022-01-08 22:25:50', '2022-01-09 14:55:45', '2022-01-10 19:30:10', '2022-01-11 08:45:30'] 10 | } 11 | 12 | df = pd.DataFrame(data) 13 | 14 | # Convert 'Timestamp' to datetime 15 | df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%Y-%m-%d %H:%M:%S') 16 | 17 | # Sort DataFrame by 'Timestamp' 18 | df.sort_values(by='Timestamp', inplace=True) 19 | 20 | # Calculate time differences and add to DataFrame 21 | df['TimeSincePreviousPurchase'] = df['Timestamp'].diff() 22 | df['TimeUntilNextPurchase'] = -df['Timestamp'].diff(-1) 23 | 24 | # Display the DataFrame with timedelta columns 25 | print("DataFrame with Time Differences:") 26 | print(df[['Timestamp', 'TimeSincePreviousPurchase', 'TimeUntilNextPurchase']]) 27 | 28 | # Create diff with longer periods 29 | df['TimeDifference2periods'] = df['Timestamp'].diff(periods=2) 30 | 31 | print("DataFrame with Time Differences:") 32 | print(df[['Timestamp', 'TimeSincePreviousPurchase', "TimeDifference2periods"]]) 33 | 34 | # Fill missing values on diff 35 | df['TimeDiff2periods_nonulls'] = df['Timestamp'].diff(periods=2).fillna(0) 36 | print("DataFrame with Time Differences:") 37 | print(df[['Timestamp', 'TimeDiff2periods_nonulls', "TimeDifference2periods"]]) 38 | 39 | -------------------------------------------------------------------------------- /chapter04/9.time_zones.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Sample dataset 4 | data = { 5 | 'CustomerID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 6 | 'ProductName': ['Product_A', 'Product_B', 'Product_C', 'Product_A', 'Product_B', 'Product_A', 'Product_B', 'Product_C', 'Product_A', 'Product_B', 'Product_C'], 7 | 'PurchaseAmount': [50, 75, 120, 60, 80, 55, 90, 110, 70, 85, 130], 8 | 'PaymentMethod': ['Card', 'PayPal', 'Cash', 'Card', 'Bank Transfer', 'Card', 'PayPal', 'Cash', 'Card', 'Bank Transfer', 'Card'], 9 | 'Timestamp': ['2022-01-01 08:30:45', '2022-01-02 14:20:30', '2022-01-03 20:15:10', '2022-01-04 12:45:30', '2022-01-05 18:10:55', '2022-01-06 09:30:15', '2022-01-07 15:40:20', '2022-01-08 22:25:50', '2022-01-09 14:55:45', '2022-01-10 19:30:10', '2022-01-11 08:45:30'] 10 | } 11 | 12 | df = pd.DataFrame(data) 13 | 14 | # Convert 'Timestamp' to datetime 15 | df['Timestamp'] = pd.to_datetime(df['Timestamp']) 16 | 17 | # Localize timestamps to a specific time zone (e.g., 'UTC') 18 | df['Timestamp_UTC'] = df['Timestamp'].dt.tz_localize('UTC') 19 | 20 | # Convert localized timestamps to a different time zone (e.g., 'America/New_York') 21 | df['Timestamp_NY'] = df['Timestamp_UTC'].dt.tz_convert('America/New_York') 22 | 23 | # Display the DataFrame with time zone-handled timestamps 24 | print(df[['Timestamp', 'Timestamp_UTC', 'Timestamp_NY']]) 25 | 26 | 
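# A brief companion sketch, not part of the repository file above: once the timestamps
# are timezone-aware, further conversions and zone-aware formatting follow the same
# pattern as Timestamp_NY. 'Europe/London' is an arbitrary example zone chosen purely
# for illustration; df and its columns come from the script above.
df['Timestamp_London'] = df['Timestamp_UTC'].dt.tz_convert('Europe/London')
print(df['Timestamp_NY'].dt.strftime('%Y-%m-%d %H:%M %Z'))  # zone abbreviation, e.g. EST
print(df[['Timestamp_UTC', 'Timestamp_London']].head())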
-------------------------------------------------------------------------------- /chapter05/1.use_case.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Sample employee data 4 | employee_data = pd.DataFrame({ 5 | 'employee_id': [1, 2, 3, 4, 5], 6 | 'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'], 7 | 'department': ['HR', 'IT', 'Marketing', 'Finance', 'IT'] 8 | }) 9 | 10 | # Sample project assignment data 11 | project_data = pd.DataFrame({ 12 | 'employee_id': [2, 3, 4, 5, 6], 13 | 'project_name': ['ProjectA', 'ProjectB', 'ProjectC', 'ProjectD', 'ProjectE'] 14 | }) 15 | 16 | 17 | # Displaying the results 18 | print("employee_data Result:") 19 | print(employee_data) 20 | 21 | print("project_data Result:") 22 | print(project_data) 23 | 24 | -------------------------------------------------------------------------------- /chapter05/2.inner_join.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Sample employee data 4 | employee_data = pd.DataFrame({ 5 | 'employee_id': [1, 2, 3, 4, 5], 6 | 'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'], 7 | 'department': ['HR', 'IT', 'Marketing', 'Finance', 'IT'] 8 | }) 9 | 10 | # Sample project assignment data 11 | project_data = pd.DataFrame({ 12 | 'employee_id': [2, 3, 4, 5, 6], 13 | 'project_name': ['ProjectA', 'ProjectB', 'ProjectC', 'ProjectD', 'ProjectE'] 14 | }) 15 | 16 | # Performing an inner join 17 | merged_data = pd.merge(employee_data, project_data, on='employee_id', how='inner') 18 | 19 | # Displaying the results 20 | print("Merged Data Result:") 21 | print(merged_data) -------------------------------------------------------------------------------- /chapter05/3.outer_merge.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Sample employee data 4 | employee_data = pd.DataFrame({ 5 | 'employee_id': [1, 2, 3, 4, 5], 6 | 'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'], 7 | 'department': ['HR', 'IT', 'Marketing', 'Finance', 'IT'] 8 | }) 9 | 10 | # Sample project assignment data 11 | project_data = pd.DataFrame({ 12 | 'employee_id': [2, 3, 4, 5, 6], 13 | 'project_name': ['ProjectA', 'ProjectB', 'ProjectC', 'ProjectD', 'ProjectE'] 14 | }) 15 | 16 | # Performing a full outer merge 17 | full_outer_merged_data = pd.merge(employee_data, project_data, on='employee_id', how='outer') 18 | 19 | # Displaying the results 20 | print("Full Outer Merged Data Result:") 21 | print(full_outer_merged_data) -------------------------------------------------------------------------------- /chapter05/4.right_merge.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Sample employee data 4 | employee_data = pd.DataFrame({ 5 | 'employee_id': [1, 2, 3, 4, 5], 6 | 'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'], 7 | 'department': ['HR', 'IT', 'Marketing', 'Finance', 'IT'] 8 | }) 9 | 10 | # Sample project assignment data 11 | project_data = pd.DataFrame({ 12 | 'employee_id': [2, 3, 4, 5, 6], 13 | 'project_name': ['ProjectA', 'ProjectB', 'ProjectC', 'ProjectD', 'ProjectE'] 14 | }) 15 | 16 | # Performing a right merge 17 | right_merged_data = pd.merge(employee_data, project_data, on='employee_id', how='right') 18 | 19 | # Displaying the results 20 | print("Right Merged Data Result:") 21 | print(right_merged_data) -------------------------------------------------------------------------------- 
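The inner, outer, and right merges above differ only in which unmatched employee_ids survive. One way to audit where an unmatched row came from is pandas' indicator=True flag, which adds a _merge column; a minimal sketch reusing the same sample frames:

import pandas as pd

employee_data = pd.DataFrame({
    'employee_id': [1, 2, 3, 4, 5],
    'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    'department': ['HR', 'IT', 'Marketing', 'Finance', 'IT']
})
project_data = pd.DataFrame({
    'employee_id': [2, 3, 4, 5, 6],
    'project_name': ['ProjectA', 'ProjectB', 'ProjectC', 'ProjectD', 'ProjectE']
})

# indicator=True adds a '_merge' column with values 'left_only', 'right_only', or 'both'
audited = pd.merge(employee_data, project_data, on='employee_id', how='outer', indicator=True)
print(audited[audited['_merge'] != 'both'])

Rows flagged left_only are employees with no project assignment (employee 1), while right_only rows are assignments whose employee_id has no employee record (employee 6).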
/chapter05/5.left_merge.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Sample employee data 4 | employee_data = pd.DataFrame({ 5 | 'employee_id': [1, 2, 3, 4, 5], 6 | 'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'], 7 | 'department': ['HR', 'IT', 'Marketing', 'Finance', 'IT'] 8 | }) 9 | 10 | # Sample project assignment data 11 | project_data = pd.DataFrame({ 12 | 'employee_id': [2, 3, 4, 5, 6], 13 | 'project_name': ['ProjectA', 'ProjectB', 'ProjectC', 'ProjectD', 'ProjectE'] 14 | }) 15 | 16 | # Performing a left merge 17 | left_merged_data = pd.merge(employee_data, project_data, on='employee_id', how='left') 18 | 19 | # Displaying the results 20 | print("Left Merged Data Result:") 21 | print(left_merged_data) -------------------------------------------------------------------------------- /chapter05/6a.manage_duplicates.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Sample employee data with potential duplicate keys 4 | employee_data = pd.DataFrame({ 5 | 'employee_id': [1, 2, 2, 3, 4, 5, 5], 6 | 'name': ['Alice', 'Bob', 'Bob', 'Charlie', 'David', 'Eva', 'Eva'], 7 | 'department': ['HR', 'IT', 'IT', 'Marketing', 'Finance', 'IT', 'IT'] 8 | }) 9 | 10 | # Sample project assignment data with potential duplicate keys 11 | project_data = pd.DataFrame({ 12 | 'employee_id': [2, 3, 4, 5, 5, 6], 13 | 'project_name': ['ProjectA', 'ProjectB', 'ProjectC', 'ProjectD', 'ProjectD', 'ProjectE'] 14 | }) 15 | 16 | # Handling duplicates 17 | ## Drop duplicates 18 | employee_data = employee_data.drop_duplicates(subset='employee_id', keep='first') 19 | project_data = project_data.drop_duplicates(subset='employee_id', keep='first') 20 | 21 | # Performing a merge 22 | merged_data = pd.merge(employee_data, project_data, on='employee_id', how='inner') 23 | 24 | # Displaying the results 25 | print("Merged Data Result after handling duplicates:") 26 | print(merged_data) -------------------------------------------------------------------------------- /chapter05/6b.manage_duplicates_validate.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Sample employee data with potential duplicate keys 4 | employee_data = pd.DataFrame({ 5 | 'employee_id': [1, 2, 2, 3, 4, 5, 5], 6 | 'name': ['Alice', 'Bob', 'Bob', 'Charlie', 'David', 'Eva', 'Eva'], 7 | 'department': ['HR', 'IT', 'IT', 'Marketing', 'Finance', 'IT', 'IT'] 8 | }) 9 | 10 | # Sample project assignment data with potential duplicate keys 11 | project_data = pd.DataFrame({ 12 | 'employee_id': [2, 3, 4, 5, 5, 6], 13 | 'project_name': ['ProjectA', 'ProjectB', 'ProjectC', 'ProjectD', 'ProjectD', 'ProjectE'] 14 | }) 15 | 16 | # Performing a merge with validation to ensure no duplicates in the key column of the left DataFrame 17 | try: 18 | merged_data = pd.merge(employee_data, project_data, on='employee_id', how='inner', validate='one_to_many') 19 | print("Merged Data Result:") 20 | print(merged_data) 21 | except ValueError as e: 22 | print("Merge failed:", e) 23 | -------------------------------------------------------------------------------- /chapter05/6c.merge_and_aggregate.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Sample employee data with potential duplicate keys 4 | employee_data = pd.DataFrame({ 5 | 'employee_id': [1, 2, 2, 3, 4, 5, 5], 6 | 'name': ['Alice', 'Bob', 'Bob', 'Charlie', 
'David', 'Eva', 'Eva'], 7 | 'department': ['HR', 'IT', 'IT', 'Marketing', 'Finance', 'IT', 'IT'], 8 | 'salary': [50000, 60000, 60000, 55000, 65000, 70000, 70000] # Added salary for aggregation 9 | }) 10 | 11 | # Sample project assignment data with no duplicate keys 12 | project_data = pd.DataFrame({ 13 | 'employee_id': [2, 3, 4, 5, 7, 6], 14 | 'project_name': ['ProjectA', 'ProjectB', 'ProjectC', 'ProjectD', 'ProjectD', 'ProjectE'] 15 | }) 16 | 17 | # Aggregating duplicate entries in employee_data 18 | aggregated_employee_data = employee_data.groupby('employee_id').agg({ 19 | 'name': 'first', # Keep the first name encountered 20 | 'department': 'first', # Keep the first department encountered 21 | 'salary': 'sum' # Sum the salaries in case of duplicates 22 | }).reset_index() 23 | 24 | # Performing a merge 25 | merged_data = pd.merge(aggregated_employee_data, project_data, on='employee_id', how='inner') 26 | 27 | # Displaying the results 28 | print("Merged Data Result after aggregation:") 29 | print(merged_data) 30 | -------------------------------------------------------------------------------- /chapter05/6d.dmanage_duplicates_concatenation.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Creating a sample DataFrame with potential duplicate keys 4 | employee_data = pd.DataFrame({ 5 | 'employee_id': [1, 2, 2, 3, 4, 5, 5], 6 | 'name': ['Alice', 'Bob', 'Bob', 'Charlie', 'David', 'Eva', 'Eva'], 7 | 'department': ['HR', 'IT', 'Marketing', 'Marketing', 'Finance', 'IT', 'HR'] 8 | }) 9 | 10 | # Displaying the original DataFrame 11 | print("Original Employee Data:") 12 | print(employee_data) 13 | 14 | # Concatenating department names for each employee_id 15 | employee_data['department'] = employee_data.groupby('employee_id')['department'].transform(lambda x: ', '.join(x)) 16 | # Removing duplicate entries based on employee_id 17 | employee_data = employee_data.drop_duplicates('employee_id') 18 | 19 | # Displaying the modified DataFrame 20 | print("\nModified Employee Data after Concatenation and Removing Duplicates:") 21 | print(employee_data) 22 | -------------------------------------------------------------------------------- /chapter05/7a.managed_duplicated_columns.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Creating two sample DataFrames with the same column names 4 | employee_data_1 = pd.DataFrame({ 5 | 'employee_id': [1, 2, 3, 4, 5], 6 | 'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'], 7 | 'department': ['HR', 'IT', 'Marketing', 'Finance', 'IT'] 8 | }) 9 | 10 | employee_data_2 = pd.DataFrame({ 11 | 'employee_id': [6, 7, 8, 9, 10], 12 | 'name': ['Frank', 'Grace', 'Hannah', 'Ian', 'Jill'], 13 | 'department': ['Logistics', 'Marketing', 'IT', 'Marketing', 'Finance'] 14 | }) 15 | 16 | # Merging the two DataFrames with suffixes to differentiate identical columns 17 | merged_data = pd.merge(employee_data_1, employee_data_2, on='employee_id', how='outer', suffixes=('_1', '_2')) 18 | 19 | # Displaying the merged DataFrame 20 | print("Merged Employee Data with Suffixes:") 21 | print(merged_data) -------------------------------------------------------------------------------- /chapter05/7b.drop_columns_merge.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Creating two sample DataFrames with some identical columns 4 | employee_data_1 = pd.DataFrame({ 5 | 'employee_id': [1, 2, 3, 4, 5], 6 
| 'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'], 7 | 'department': ['HR', 'IT', 'Marketing', 'Finance', 'IT'] # More reliable department information 8 | }) 9 | 10 | employee_data_2 = pd.DataFrame({ 11 | 'employee_id': [1, 2, 3, 4, 5], 12 | 'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'], 13 | 'department': ['Human Resources', 'Information Technology', 'Sales', 'Financial', 'Technical'] # Less reliable, drop this 14 | }) 15 | 16 | # Dropping the less reliable 'department' column from the second DataFrame before merging 17 | employee_data_2.drop(columns=['department'], inplace=True) 18 | 19 | # Merging the two DataFrames on 'employee_id' and 'name' which are the reliable keys 20 | merged_data = pd.merge(employee_data_1, employee_data_2, on=['employee_id', 'name'], how='inner') 21 | 22 | # Displaying the merged DataFrame 23 | print("Merged Employee Data with More Reliable Department Information:") 24 | print(merged_data) -------------------------------------------------------------------------------- /chapter05/7c.use_keys_merge.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Creating two sample DataFrames with identical keys and some identical columns 4 | employee_data_1 = pd.DataFrame({ 5 | 'employee_id': [1, 2, 3, 4, 5], 6 | 'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'], 7 | 'department': ['HR', 'IT', 'Marketing', 'Finance', 'IT'], 8 | 'salary': [50000, 60000, 70000, 80000, 90000] 9 | }) 10 | 11 | employee_data_2 = pd.DataFrame({ 12 | 'employee_id': [1, 2, 3, 4, 5], # Identical keys 13 | 'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'], # Identical column 14 | 'department': ['HR', 'IT', 'Sales', 'Finance', 'Operations'], 15 | 'bonus': [3000, 4000, 5000, 6000, 7000] 16 | }) 17 | 18 | # Merging the two DataFrames with suffixes to differentiate identical columns 19 | merged_data = pd.merge(employee_data_1, employee_data_2, on=['employee_id', 'name'], how='inner', suffixes=('_1', '_2')) 20 | 21 | # Displaying the merged DataFrame 22 | print("Merged Employee Data with Identical Keys and Columns:") 23 | print(merged_data) -------------------------------------------------------------------------------- /chapter05/8a.perfomance_benchmark_set_index.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from time import time 4 | 5 | # Number of rows for the benchmarking example 6 | num_rows = 5 7 | 8 | # Creating two sample DataFrames with identical keys and some identical columns 9 | employee_data_1 = pd.DataFrame({ 10 | 'employee_id': np.arange(num_rows), 11 | 'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'], 12 | 'department': ['HR', 'IT', 'Marketing', 'Finance', 'IT'], 13 | 'salary': [50000, 60000, 70000, 80000, 90000] 14 | }) 15 | 16 | employee_data_2 = pd.DataFrame({ 17 | 'employee_id': np.arange(num_rows), 18 | 'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'], 19 | 'department': ['HR', 'IT', 'Sales', 'Finance', 'Operations'], 20 | 'bonus': [3000, 4000, 5000, 6000, 7000] 21 | }) 22 | 23 | # Merge operation 24 | start_time = time() 25 | merged_data = pd.merge(employee_data_1, employee_data_2, on=['employee_id', 'name'], how='inner', suffixes=('_1', '_2')) 26 | end_time = time() 27 | merge_time = end_time - start_time 28 | 29 | # Displaying the merged DataFrame 30 | print("Merged Employee Data:") 31 | print(merged_data) 32 | print(f"Merge operation took: {merge_time:.5f} seconds") 33 | 34 | # Utilizing indexes 35 | 
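# Note: with num_rows = 5 both timed merges mostly measure fixed pandas overhead, so the two
# timings can land in either order from run to run. The advantage of merging on a set (and
# later sorted) index generally only becomes visible once the frames hold many thousands of
# rows, which would also mean generating that many names rather than the five literals above.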
employee_data_1.set_index('employee_id', inplace=True) 36 | employee_data_2.set_index('employee_id', inplace=True) 37 | 38 | # Repeating the merge operation after reducing memory usage 39 | start_time = time() 40 | merged_data_reduced = pd.merge(employee_data_1, employee_data_2, left_index=True, right_index=True, suffixes=('_1', '_2')) 41 | end_time = time() 42 | merge_reduced_time = end_time - start_time 43 | print("Optimised Merged Employee Data:") 44 | print(merged_data_reduced) 45 | print(f"Merge operation with after optimisation took: {merge_reduced_time:.5f} seconds") -------------------------------------------------------------------------------- /chapter05/8b.performance_benchmark_sort_indexes.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from time import time 4 | 5 | # Number of rows for the benchmarking example 6 | num_rows = 5 7 | 8 | # Creating two sample DataFrames with identical keys and some identical columns 9 | employee_data_1 = pd.DataFrame({ 10 | 'employee_id': np.arange(num_rows), 11 | 'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'], 12 | 'department': ['HR', 'IT', 'Marketing', 'Finance', 'IT'], 13 | 'salary': [50000, 60000, 70000, 80000, 90000] 14 | }) 15 | 16 | employee_data_2 = pd.DataFrame({ 17 | 'employee_id': np.arange(num_rows), 18 | 'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'], 19 | 'department': ['HR', 'IT', 'Sales', 'Finance', 'Operations'], 20 | 'bonus': [3000, 4000, 5000, 6000, 7000] 21 | }) 22 | 23 | # Utilizing indexes 24 | employee_data_1.set_index('employee_id', inplace=True) 25 | employee_data_2.set_index('employee_id', inplace=True) 26 | 27 | 28 | # Merge operation 29 | start_time = time() 30 | merged_data = pd.merge(employee_data_1, employee_data_2, left_index=True, right_index=True, suffixes=('_1', '_2')) 31 | end_time = time() 32 | merge_time = end_time - start_time 33 | 34 | # Displaying the merged DataFrame 35 | print("Merged Employee Data:") 36 | print(merged_data) 37 | print(f"Merge operation took: {merge_time:.5f} seconds") 38 | 39 | # Sort indexes 40 | employee_data_1.sort_index(inplace=True) 41 | employee_data_2.sort_index(inplace=True) 42 | 43 | # Repeating the merge operation after reducing memory usage 44 | start_time = time() 45 | merged_data_reduced = pd.merge(employee_data_1, employee_data_2, left_index=True, right_index=True, suffixes=('_1', '_2')) 46 | end_time = time() 47 | merge_reduced_time = end_time - start_time 48 | 49 | print(f"Merge operation after optimisation took: {merge_reduced_time:.5f} seconds") -------------------------------------------------------------------------------- /chapter05/8c.performance_benchmark_memory.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from time import time 4 | 5 | # Number of rows for the benchmarking example 6 | num_rows = 5 7 | 8 | # Creating two sample DataFrames with identical keys and some identical columns 9 | employee_data_1 = pd.DataFrame({ 10 | 'employee_id': np.arange(num_rows), 11 | 'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'], 12 | 'department': ['HR', 'IT', 'Marketing', 'Finance', 'IT'], 13 | 'salary': [50000, 60000, 70000, 80000, 90000] 14 | }) 15 | 16 | employee_data_2 = pd.DataFrame({ 17 | 'employee_id': np.arange(num_rows), 18 | 'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'], 19 | 'department': ['HR', 'IT', 'Sales', 'Finance', 'Operations'], 20 | 'bonus': [3000, 4000, 5000, 
6000, 7000] 21 | }) 22 | 23 | # Utilizing indexes 24 | employee_data_1.set_index('employee_id', inplace=True) 25 | employee_data_2.set_index('employee_id', inplace=True) 26 | 27 | # Sort indexes 28 | employee_data_1.sort_index(inplace=True) 29 | employee_data_2.sort_index(inplace=True) 30 | 31 | # Merge operation 32 | start_time = time() 33 | merged_data = pd.merge(employee_data_1, employee_data_2, left_index=True, right_index=True, suffixes=('_1', '_2')) 34 | end_time = time() 35 | merge_time = end_time - start_time 36 | 37 | # Displaying the merged DataFrame 38 | print("Merged Employee Data:") 39 | print(merged_data) 40 | print(f"Merge operation took: {merge_time:.5f} seconds") 41 | 42 | # Reduce memory usage by downcasting numerical columns 43 | employee_data_1['salary'] = pd.to_numeric(employee_data_1['salary'], downcast='integer') 44 | employee_data_2['bonus'] = pd.to_numeric(employee_data_2['bonus'], downcast='integer') 45 | 46 | # Repeating the merge operation after reducing memory usage 47 | start_time = time() 48 | merged_data_reduced = pd.merge(employee_data_1, employee_data_2, left_index=True, right_index=True, suffixes=('_1', '_2')) 49 | end_time = time() 50 | merge_reduced_time = end_time - start_time 51 | 52 | print(f"Merge operation after optimisation took: {merge_reduced_time:.5f} seconds") -------------------------------------------------------------------------------- /chapter05/9a.concatenate_row_wise.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | # Creating two sample DataFrames with some identical columns 5 | employee_data_1 = pd.DataFrame({ 6 | 'employee_id': np.arange(1, 6), 7 | 'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'], 8 | 'department': ['HR', 'IT', 'Marketing', 'Finance', 'IT'] 9 | }) 10 | 11 | employee_data_2 = pd.DataFrame({ 12 | 'employee_id': np.arange(6, 11), 13 | 'name': ['Frank', 'Grace', 'Hannah', 'Ian', 'Jill'], 14 | 'department': ['Logistics', 'HR', 'IT', 'Marketing', 'Finance'] 15 | }) 16 | 17 | # Concatenating the two DataFrames row-wise 18 | concatenated_data = pd.concat([employee_data_1, employee_data_2], axis=0) 19 | 20 | # Displaying the concatenated DataFrame 21 | print("Concatenated Employee Data:") 22 | print(concatenated_data) -------------------------------------------------------------------------------- /chapter05/9b.reset_index.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | # Creating two sample DataFrames with some identical columns 5 | employee_data_1 = pd.DataFrame({ 6 | 'employee_id': np.arange(1, 6), 7 | 'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'], 8 | 'department': ['HR', 'IT', 'Marketing', 'Finance', 'IT'] 9 | }) 10 | 11 | employee_data_2 = pd.DataFrame({ 12 | 'employee_id': np.arange(6, 11), 13 | 'name': ['Frank', 'Grace', 'Hannah', 'Ian', 'Jill'], 14 | 'department': ['Logistics', 'HR', 'IT', 'Marketing', 'Finance'] 15 | }) 16 | 17 | # Concatenating the two DataFrames row-wise 18 | concatenated_data = pd.concat([employee_data_1, employee_data_2], axis=0) 19 | 20 | # Displaying the concatenated DataFrame before resetting the index 21 | print("Concatenated Employee Data (Before Resetting Index):") 22 | print(concatenated_data) 23 | 24 | # Resetting the index 25 | concatenated_data_reset = concatenated_data.reset_index(drop=True) 26 | 27 | # Displaying the concatenated DataFrame after resetting the index 28 | print("\nConcatenated 
Employee Data (After Resetting Index):") 29 | print(concatenated_data_reset) 30 | -------------------------------------------------------------------------------- /chapter05/9c.concatenate_column_wise.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | # Creating two sample DataFrames with some identical columns 5 | employee_data_1 = pd.DataFrame({ 6 | 'employee_id': np.arange(1, 6), 7 | 'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'], 8 | 'department': ['HR', 'IT', 'Marketing', 'Finance', 'IT'] 9 | }) 10 | 11 | # Creating additional data that could represent a different aspect of employee information 12 | employee_performance = pd.DataFrame({ 13 | 'employee_id': np.arange(1, 6), 14 | 'performance_rating': [3, 4, 5, 3, 4] 15 | }) 16 | 17 | # Concatenating the two DataFrames column-wise 18 | concatenated_data = pd.concat([employee_data_1, employee_performance], axis=1) 19 | 20 | # Displaying the concatenated DataFrame 21 | print("Concatenated Employee Data (Column-wise):") 22 | print(concatenated_data) 23 | -------------------------------------------------------------------------------- /chapter06/1.use_case.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Sample sales data 4 | data = { 5 | 'Category': ['Electronics', 'Electronics', 'Furniture', 'Furniture', 'Clothing', 'Clothing'], 6 | 'Sub-Category': ['Mobile', 'Laptop', 'Chair', 'Table', 'Men', 'Women'], 7 | 'Region': ['North', 'South', 'East', 'West', 'North', 'South'], 8 | 'Sales': [200, 300, 150, 350, 100, 250], 9 | 'Date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05', '2023-01-06'] 10 | } 11 | 12 | df = pd.DataFrame(data) 13 | print(df) 14 | -------------------------------------------------------------------------------- /chapter06/2.groupby_full_example.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Extended sample sales data 4 | data = { 5 | 'Category': [ 6 | 'Electronics', 'Electronics', 'Electronics', 'Electronics', 7 | 'Furniture', 'Furniture', 'Furniture', 'Furniture', 8 | 'Clothing', 'Clothing', 'Clothing', 'Clothing', 9 | 'Electronics', 'Furniture', 'Clothing' 10 | ], 11 | 'Sub-Category': [ 12 | 'Mobile', 'Laptop', 'Tablet', 'Laptop', 13 | 'Chair', 'Table', 'Desk', 'Table', 14 | 'Men', 'Women', 'Kids', 'Men', 15 | 'Mobile', 'Chair', 'Women' 16 | ], 17 | 'Region': [ 18 | 'North', 'South', 'East', 'West', 19 | 'North', 'South', 'East', 'West', 20 | 'North', 'South', 'East', 'West', 21 | 'North', 'West', 'East' 22 | ], 23 | 'Sales': [ 24 | 200, 300, 250, 400, 25 | 150, 350, 200, 400, 26 | 100, 250, 150, 300, 27 | 220, 170, 270 28 | ], 29 | 'Date': [ 30 | '2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', 31 | '2023-01-05', '2023-01-06', '2023-01-07', '2023-01-08', 32 | '2023-01-09', '2023-01-10', '2023-01-11', '2023-01-12', 33 | '2023-01-13', '2023-01-14', '2023-01-15' 34 | ] 35 | } 36 | 37 | df = pd.DataFrame(data) 38 | print("_____________") 39 | print("Sample df is shown below") 40 | print(df) 41 | 42 | # Group by 'Category' and aggregate the 'Sales' column 43 | category_sales = df.groupby('Category')['Sales'].sum().reset_index() 44 | print("_____________") 45 | print("Sales per Category are shown below:") 46 | print(category_sales) 47 | 48 | # Group by 'Category' and 'Region' and aggregate the 'Sales' column 49 | category_region_sales = df.groupby(['Category', 
'Region'])['Sales'].sum().reset_index() 50 | print("_____________") 51 | print("Sales per Category and Region are shown below:") 52 | print(category_region_sales) 53 | 54 | # Group by 'Category' and 'Region' and apply multiple aggregation functions 55 | print("_____________") 56 | print("Total and Mean Sales per Category and Region are shown below:") 57 | category_region_sales_agg = df.groupby(['Category', 'Region'])['Sales'].agg(['sum', 'mean']).reset_index() 58 | print(category_region_sales_agg) 59 | 60 | # Multiple column aggregations 61 | print("_____________") 62 | print("Multiple column aggregations:") 63 | advanced_agg = df.groupby(['Category', 'Region']).agg({ 64 | 'Sales': ['sum', 'mean', 'count'], 65 | 'Sub-Category': 'nunique' # Unique count of Sub-Category 66 | }).reset_index() 67 | print(advanced_agg) 68 | 69 | # ____________________________________________________________________ 70 | # Define custom aggregation functions 71 | print("_____________") 72 | print("Custom Aggregations:") 73 | def range_sales(series): 74 | return series.max() - series.min() 75 | 76 | def coefficient_of_variation(series): 77 | return series.std() / series.mean() 78 | 79 | # Group by 'Category', 'Region', and apply multiple aggregations including custom functions 80 | advanced_agg_custom = df.groupby('Region').agg({ 81 | 'Sales': ['sum', 'mean', 'count', range_sales, coefficient_of_variation], 82 | 'Sub-Category': 'nunique' 83 | }).reset_index() 84 | 85 | # Rename columns for clarity 86 | advanced_agg_custom.columns = [ 87 | 'Region', 'Total Sales', 'Average Sales', 'Number of Transactions', 88 | 'Sales Range', 'Coefficient of Variation', 'Unique Sub-Categories' 89 | ] 90 | 91 | print(advanced_agg_custom) 92 | print(# Displaying only the specified columns 93 | print(advanced_agg_custom[['Region', 'Total Sales', 'Sales Range', 'Coefficient of Variation', 'Unique Sub-Categories']])) 94 | 95 | 96 | 97 | -------------------------------------------------------------------------------- /chapter06/3.apply_axis0.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | # Sample sales data with additional columns 5 | data = { 6 | 'Category': ['Electronics', 'Electronics', 'Furniture', 'Furniture', 'Clothing', 'Clothing'], 7 | 'Sub-Category': ['Mobile', 'Laptop', 'Chair', 'Table', 'Men', 'Women'], 8 | 'Sales': [100, 200, 150, 300, 120, 180], 9 | 'Quantity': [10, 5, 8, 3, 15, 12], 10 | 'Date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05', '2023-01-06'] 11 | } 12 | df = pd.DataFrame(data) 13 | 14 | # Convert 'Date' column to datetime format 15 | df['Date'] = pd.to_datetime(df['Date']) 16 | 17 | # Define a custom function to compute multiple statistics for 'Sales' and 'Quantity' 18 | def compute_statistics(series): 19 | sum_sales = series['Sales'].sum() 20 | mean_sales = series['Sales'].mean() 21 | std_sales = series['Sales'].std() 22 | cv_sales = std_sales / mean_sales 23 | 24 | sum_quantity = series['Quantity'].sum() 25 | mean_quantity = series['Quantity'].mean() 26 | std_quantity = series['Quantity'].std() 27 | cv_quantity = std_quantity / mean_quantity 28 | 29 | return pd.Series([sum_sales, mean_sales, std_sales, cv_sales, sum_quantity, mean_quantity, std_quantity, cv_quantity], 30 | index=['Sum_Sales', 'Mean_Sales', 'Std_Sales', 'CV_Sales', 31 | 'Sum_Quantity', 'Mean_Quantity', 'Std_Quantity', 'CV_Quantity']) 32 | 33 | # Group by 'Category' and apply custom function to compute statistics of 'Sales' and 
'Quantity' 34 | result_complex = df.groupby('Category').apply(compute_statistics).reset_index() 35 | 36 | print("Using apply() for complex function (multiple statistics calculation for 'Sales' and 'Quantity'):") 37 | print(result_complex) -------------------------------------------------------------------------------- /chapter06/4.apply_axis1.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | # Sample sales data with additional columns 5 | data = { 6 | 'Category': ['Electronics', 'Electronics', 'Furniture', 'Furniture', 'Clothing', 'Clothing'], 7 | 'Sub-Category': ['Mobile', 'Laptop', 'Chair', 'Table', 'Men', 'Women'], 8 | 'Sales': [100, 200, 150, 300, 120, 180], 9 | 'Quantity': [10, 5, 8, 3, 15, 12], 10 | 'Date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05', '2023-01-06'] 11 | } 12 | df = pd.DataFrame(data) 13 | 14 | # Convert 'Date' column to datetime format 15 | df['Date'] = pd.to_datetime(df['Date']) 16 | 17 | # Define a function to compute Total_Sales_Quantity and Sales_Quantity_Ratio 18 | def compute_metrics(row): 19 | total_sales_quantity = row['Sales'] + row['Quantity'] 20 | sales_quantity_ratio = row['Sales'] / row['Quantity'] if row['Quantity'] != 0 else np.nan 21 | return pd.Series([total_sales_quantity, sales_quantity_ratio], index=['Total_Sales_Quantity', 'Sales_Quantity_Ratio']) 22 | 23 | # Apply the function row-wise (axis=1) to calculate new metrics 24 | df[['Total_Sales_Quantity', 'Sales_Quantity_Ratio']] = df.apply(compute_metrics, axis=1) 25 | 26 | # Group by 'Category' to calculate metrics per category 27 | category_metrics = df.groupby('Category')[['Total_Sales_Quantity', 'Sales_Quantity_Ratio']].mean().reset_index() 28 | 29 | print("DataFrame with Total_Sales_Quantity and Sales_Quantity_Ratio per Category:") 30 | print(category_metrics) 31 | -------------------------------------------------------------------------------- /chapter06/5.simple_filtering.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Sample sales data 4 | data = { 5 | 'Category': ['Electronics', 'Electronics', 'Furniture', 'Furniture', 'Clothing', 'Clothing'], 6 | 'Sub-Category': ['Mobile', 'Laptop', 'Chair', 'Table', 'Men', 'Women'], 7 | 'Sales': [100, 200, 150, 300, 120, 180], 8 | 'Quantity': [10, 5, 8, 3, 15, 12], 9 | 'Date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05', '2023-01-06'] 10 | } 11 | df = pd.DataFrame(data) 12 | 13 | # Filter to show products with quantity > 10 14 | filtered_data = df[df['Quantity'] > 10] 15 | 16 | print("Filtered Data:") 17 | print(filtered_data) 18 | -------------------------------------------------------------------------------- /chapter06/6.advanced_filtering.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Sample sales data with additional columns 4 | data = { 5 | 'Category': ['Electronics', 'Electronics', 'Electronics', 'Electronics', 'Electronics', 'Electronics'], 6 | 'Sub-Category': ['Mobile', 'Laptop', 'Tablet', 'Headphones', 'Smartwatch', 'Printer'], 7 | 'Sales': [1000, 1500, 800, 300, 400, 600], 8 | 'Quantity': [50, 25, 40, 15, 20, 30], 9 | 'Date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05', '2023-01-06'] 10 | } 11 | df = pd.DataFrame(data) 12 | 13 | # Convert 'Date' column to datetime format 14 | df['Date'] = pd.to_datetime(df['Date']) 15 | 16 | # Filter 
criteria: Sales greater than 1000 and Quantity less than 30 17 | filtered_data = df[(df['Sales'] > 1000) & (df['Quantity'] < 30)] 18 | 19 | print("Filtered Data based on Multiple Criteria:") 20 | print(filtered_data) 21 | -------------------------------------------------------------------------------- /chapter07/1.postgressql.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import psycopg2 3 | from psycopg2 import sql 4 | 5 | # Function to check if a table exists in the database 6 | def table_exists(cursor, table_name): 7 | cursor.execute( 8 | sql.SQL("SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE table_name = %s)"), 9 | [table_name] 10 | ) 11 | return cursor.fetchone()[0] 12 | 13 | # Function to create a table in the database 14 | def create_table(cursor, table_name): 15 | cursor.execute( 16 | sql.SQL(""" 17 | CREATE TABLE {} ( 18 | id SERIAL PRIMARY KEY, 19 | name VARCHAR(255), 20 | age INT 21 | ) 22 | """).format(sql.Identifier(table_name)) 23 | ) 24 | 25 | # Function to insert data into the table 26 | def insert_data(cursor, table_name, data): 27 | cursor.executemany( 28 | sql.SQL("INSERT INTO {} (name, age) VALUES (%s, %s)").format(sql.Identifier(table_name)), 29 | data 30 | ) 31 | 32 | # Function to fetch and print data from the table 33 | def print_table_data(cursor, table_name): 34 | cursor.execute( 35 | sql.SQL("SELECT * FROM {}").format(sql.Identifier(table_name)) 36 | ) 37 | rows = cursor.fetchall() 38 | for row in rows: 39 | print(row) 40 | 41 | # Mock DataFrame 42 | data = { 43 | 'name': ['Alice', 'Bob', 'Charlie'], 44 | 'age': [25, 30, 22] 45 | } 46 | 47 | df = pd.DataFrame(data) 48 | 49 | # PostgreSQL connection parameters 50 | db_params = { 51 | 'dbname': 'learn_sql', 52 | 'user': 'the_great_coder', 53 | 'password': 'the_great_coder_again', 54 | 'host': 'localhost', 55 | 'port': '5432' 56 | } 57 | 58 | # Connect to PostgreSQL 59 | conn = psycopg2.connect(**db_params) 60 | cursor = conn.cursor() 61 | 62 | # Specify the table name 63 | table_name = 'example_table' 64 | 65 | # Check if the table exists, and create it if it doesn't 66 | if not table_exists(cursor, table_name): 67 | create_table(cursor, table_name) 68 | 69 | # Insert data into the table 70 | insert_data(cursor, table_name, df.values.tolist()) 71 | 72 | # Commit the changes 73 | conn.commit() 74 | 75 | # Print the data from the table 76 | print_table_data(cursor, table_name) 77 | 78 | # Close the connection 79 | cursor.close() 80 | conn.close() 81 | -------------------------------------------------------------------------------- /chapter07/2.pymongo.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient 2 | 3 | # MongoDB connection parameters 4 | mongo_params = { 5 | 'host': 'localhost', 6 | 'port': 27017, 7 | } 8 | 9 | # Function to check if a collection exists in the database 10 | def collection_exists(db, collection_name): 11 | return collection_name in db.list_collection_names() 12 | 13 | # Function to create a collection in the database 14 | def create_collection(db, collection_name): 15 | db.create_collection(collection_name) 16 | 17 | # Function to insert data into a collection 18 | def insert_data(collection, data): 19 | collection.insert_many(data) 20 | 21 | # Mock document data 22 | documents = [ 23 | {'name': 'Alice', 'age': 25}, 24 | {'name': 'Bob', 'age': 30}, 25 | {'name': 'Charlie', 'age': 22} 26 | ] 27 | 28 | # MongoDB database and collection names 29 | 
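# Note: insert_many() assigns an _id ObjectId to each dict in `documents` in place, and
# re-running the script inserts the same three people again (nothing here upserts), so
# duplicates accumulate in the collection. A unique index on 'name', or
# update_one(..., upsert=True), would be one possible way to make re-runs idempotent.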
db_name = 'no_sql_db' 30 | collection_name = 'best_collection_ever' 31 | 32 | # Connect to MongoDB 33 | client = MongoClient(**mongo_params) 34 | db = client[db_name] 35 | 36 | # Check if the collection exists, and create it if it doesn't 37 | if not collection_exists(db, collection_name): 38 | create_collection(db, collection_name) 39 | 40 | # Get the collection 41 | collection = db[collection_name] 42 | 43 | # Insert data into the collection 44 | insert_data(collection, documents) 45 | 46 | # Query data from the collection 47 | result = collection.find() 48 | for document in result: 49 | print(document) 50 | 51 | # Close the MongoDB connection 52 | client.close() 53 | -------------------------------------------------------------------------------- /chapter07/3.pymongo_expand.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient 2 | 3 | # MongoDB connection parameters 4 | mongo_params = { 5 | 'host': 'localhost', 6 | 'port': 27017, 7 | } 8 | 9 | # Function to check if a collection exists in the database 10 | def collection_exists(db, collection_name): 11 | return collection_name in db.list_collection_names() 12 | 13 | # Function to create a collection in the database 14 | def create_collection(db, collection_name): 15 | db.create_collection(collection_name) 16 | 17 | # Function to insert data into a collection 18 | def insert_data(collection, data): 19 | collection.insert_many(data) 20 | 21 | # Mock document data with different structures 22 | documents = [ 23 | {'name': 'Alice', 'age': 25, 'email': 'alice@example.com'}, 24 | {'name': 'Bob', 'age': 30, 'address': '123 Main St'}, 25 | {'name': 'Charlie', 'age': 22, 'hobbies': ['reading', 'gaming']}, 26 | {'name': 'David', 'age': 40, 'email': 'david@example.com', 'address': '456 Elm St', 'active': True}, 27 | {'name': 'Eve', 'age': 35, 'email': 'eve@example.com', 'phone': '555-1234'} 28 | ] 29 | 30 | # MongoDB database and collection names 31 | db_name = 'no_sql_db' 32 | collection_name = 'best_collection_ever' 33 | 34 | # Connect to MongoDB 35 | client = MongoClient(**mongo_params) 36 | db = client[db_name] 37 | 38 | # Check if the collection exists, and create it if it doesn't 39 | if not collection_exists(db, collection_name): 40 | create_collection(db, collection_name) 41 | 42 | # Get the collection 43 | collection = db[collection_name] 44 | 45 | # Insert data into the collection 46 | insert_data(collection, documents) 47 | 48 | # Query data from the collection 49 | result = collection.find() 50 | for document in result: 51 | print(document) 52 | 53 | # Close the MongoDB connection 54 | client.close() 55 | -------------------------------------------------------------------------------- /chapter07/4a.kafka_producer.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient 2 | from confluent_kafka import Producer 3 | import json 4 | 5 | # MongoDB connection 6 | mongo_client = MongoClient('mongodb://localhost:27017') 7 | db = mongo_client['no_sql_db'] 8 | collection = db['best_collection_ever'] 9 | 10 | # Kafka producer configuration 11 | kafka_config = { 12 | 'bootstrap.servers': 'localhost:9092' 13 | } 14 | producer = Producer(kafka_config) 15 | 16 | def delivery_report(err, msg): 17 | if err is not None: 18 | print(f'Message delivery failed: {err}') 19 | else: 20 | print(f'Message delivered to {msg.topic()} [{msg.partition()}]') 21 | 22 | # Read from MongoDB and produce to Kafka 23 | for document in 
collection.find(): 24 | # Convert MongoDB document to JSON string 25 | message = json.dumps(document, default=str) 26 | 27 | # Produce message to Kafka 28 | producer.produce('mongodb_topic', value=message.encode('utf-8'), callback=delivery_report) 29 | producer.poll(0) 30 | 31 | producer.flush() -------------------------------------------------------------------------------- /chapter07/4b.kafka_consumer.py: -------------------------------------------------------------------------------- 1 | from confluent_kafka import Consumer, KafkaError 2 | import json 3 | import time 4 | 5 | # Kafka consumer configuration 6 | consumer_config = { 7 | 'bootstrap.servers': 'localhost:9092', 8 | 'group.id': 'mongodb_consumer_group', 9 | 'auto.offset.reset': 'earliest' 10 | } 11 | 12 | consumer = Consumer(consumer_config) 13 | consumer.subscribe(['mongodb_topic']) 14 | 15 | # Set the duration for which the consumer should run (in seconds) 16 | run_duration = 10 # For example, 10 seconds 17 | start_time = time.time() 18 | 19 | print("Starting consumer...") 20 | 21 | while True: 22 | # Check if the specified duration has passed 23 | if time.time() - start_time > run_duration: 24 | print("Time limit reached, shutting down consumer.") 25 | break 26 | 27 | msg = consumer.poll(1.0) 28 | 29 | if msg is None: 30 | continue 31 | if msg.error(): 32 | if msg.error().code() == KafkaError._PARTITION_EOF: 33 | print('Reached end of partition') 34 | else: 35 | print(f'Error: {msg.error()}') 36 | else: 37 | # Process the message 38 | document = json.loads(msg.value().decode('utf-8')) 39 | print(f'Received document: {document}') 40 | # Add your processing logic here 41 | 42 | consumer.close() 43 | print("Consumer closed.") -------------------------------------------------------------------------------- /chapter07/5.time_based_partitioning.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import pyarrow as pa 4 | import pyarrow.parquet as pq 5 | from datetime import datetime 6 | 7 | # Sample data 8 | data = {"timestamp": ["2022-01-01", "2022-01-01", "2022-01-02"], 9 | "value": [10, 15, 12]} 10 | 11 | # Create a Pandas DataFrame 12 | df = pd.DataFrame(data) 13 | 14 | # Convert the timestamp column to a datetime type 15 | df["timestamp"] = pd.to_datetime(df["timestamp"]) 16 | 17 | # Time-based partitioning 18 | base_path = "path_to_write_data" 19 | for timestamp, group in df.groupby(df["timestamp"].dt.date): 20 | # Create the directory if it doesn't exist 21 | os.makedirs(base_path, exist_ok=True) 22 | 23 | partition_path = os.path.join(base_path, str(timestamp)) 24 | 25 | table = pa.Table.from_pandas(group) 26 | pq.write_table(table, partition_path) 27 | 28 | # To read data from a specific partition 29 | specific_partition_path = "/Users/maria.zervou/projects/python_best_practices/data_sinks/data/2022-01-01" 30 | partitioned_data = pq.read_table(specific_partition_path).to_pandas() 31 | -------------------------------------------------------------------------------- /chapter07/6.geo_partitioning.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import pyarrow as pa 4 | import pyarrow.parquet as pq 5 | from datetime import datetime 6 | 7 | # Create a base directory for storing partitioned data 8 | base_directory = "/Users/maria.zervou/projects/python_best_practices/data_sinks/geo_data" 9 | os.makedirs(base_directory, exist_ok=True) 10 | 11 | # Geographic partitioning 12 | 
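# Note: the groupby loop below writes one Parquet file per region by hand. PyArrow can also
# build the partition layout itself; a possible alternative (a sketch, assuming the same
# geo_df and base_directory defined in this script) is:
#   table = pa.Table.from_pandas(geo_df)
#   pq.write_to_dataset(table, root_path=base_directory, partition_cols=["region"])
# which produces Hive-style directories such as region=North/ under base_directory.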
geo_data = {"region": ["North", "South", "East"], 13 | "value": [10, 15, 12]} 14 | geo_df = pd.DataFrame(geo_data) 15 | 16 | for region, group in geo_df.groupby("region"): 17 | # Create a directory for each region within the base directory 18 | region_path = os.path.join(base_directory, region) 19 | 20 | # Convert the group to a PyArrow Table and write it to the partition path 21 | table = pa.Table.from_pandas(group) 22 | pq.write_table(table, region_path) 23 | -------------------------------------------------------------------------------- /chapter07/7.hybrid_partitioning.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import pyarrow as pa 4 | import pyarrow.parquet as pq 5 | from datetime import datetime 6 | 7 | # Create a base directory for storing partitioned data 8 | base_directory = "/Users/maria.zervou/projects/python_best_practices/data_sinks/hybrid_data" 9 | 10 | # Hybrid partitioning 11 | hybrid_data = {"timestamp": ["2022-01-01", "2022-01-01", "2022-01-02"], 12 | "region": ["North", "South", "East"], 13 | "value": [10, 15, 12]} 14 | hybrid_df = pd.DataFrame(hybrid_data) 15 | 16 | for (timestamp, region), group in hybrid_df.groupby(["timestamp", "region"]): 17 | # Create a directory for each timestamp and region combination within the base directory 18 | timestamp_path = os.path.join(base_directory, str(timestamp)) 19 | os.makedirs(timestamp_path, exist_ok=True) 20 | timestamp_region_path = os.path.join(base_directory, str(timestamp), str(region)) 21 | 22 | # Convert the group to a PyArrow Table and write it to the partition path 23 | table = pa.Table.from_pandas(group) 24 | pq.write_table(table, timestamp_region_path) 25 | 26 | -------------------------------------------------------------------------------- /chapter07/__pycache__/pymongo.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter07/__pycache__/pymongo.cpython-312.pyc -------------------------------------------------------------------------------- /chapter07/setup/cleanup_script.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Function to print section headers 4 | print_header() { 5 | echo "========================================" 6 | echo "$1" 7 | echo "========================================" 8 | } 9 | 10 | # Stop and remove Docker containers 11 | print_header "Stopping and removing Docker containers" 12 | docker-compose down -v 13 | docker rm -f $(docker ps -aq) 14 | 15 | # Remove Kafka data 16 | print_header "Removing Kafka data" 17 | rm -rf /tmp/kafka-logs /tmp/zookeeper 18 | 19 | # MongoDB cleanup 20 | print_header "Cleaning up MongoDB" 21 | mongo < chi2_threshold] 47 | 48 | # Drop outliers from the dataframe 49 | df_no_outliers = df[df['Mahalanobis_Distance'] <= chi2_threshold] 50 | 51 | # Visualize the data and outliers in 3D space 52 | fig = plt.figure(figsize=(10, 8)) 53 | ax = fig.add_subplot(111, projection='3d') 54 | 55 | # Plot all data points in blue 56 | ax.scatter(df_no_outliers['X1'], df_no_outliers['X2'], df_no_outliers['Mahalanobis_Distance'], color='blue', label='Data Points') 57 | 58 | # Plot outliers in red 59 | ax.scatter(outliers['X1'], outliers['X2'], outliers['Mahalanobis_Distance'], color='red', label='Outliers') 60 | 61 | ax.set_xlabel('X1') 62 | 
ax.set_ylabel('X2') 63 | ax.set_zlabel('Mahalanobis Distance') 64 | ax.set_title('Outlier Detection using Mahalanobis Distance') 65 | 66 | plt.legend() 67 | plt.show() 68 | 69 | # Describe changes in the dataset 70 | print("\nOriginal Dataset Statistics:") 71 | print(df.describe()) 72 | 73 | print("\nDataset Statistics after Removing Outliers:") 74 | print(df_no_outliers.describe()) 75 | -------------------------------------------------------------------------------- /chapter08/13.clustering.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from sklearn.cluster import DBSCAN 5 | from sklearn.preprocessing import StandardScaler 6 | 7 | # Generate example data 8 | np.random.seed(42) 9 | data = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0.5], [0.5, 1]], size=100) 10 | outliers = np.random.multivariate_normal(mean=[8, 8], cov=[[1, 0], [0, 1]], size=10) 11 | data_with_outliers = np.vstack([data, outliers]) 12 | 13 | # Create a DataFrame 14 | df = pd.DataFrame(data_with_outliers, columns=['Feature1', 'Feature2']) 15 | 16 | # Visualize the data 17 | plt.scatter(df['Feature1'], df['Feature2'], color='blue', label='Inliers') 18 | plt.scatter(outliers[:, 0], outliers[:, 1], color='red', marker='x', label='Outliers') # Use 'x' as the marker for outliers 19 | plt.title('Original Data with Outliers') 20 | plt.xlabel('Feature1') 21 | plt.ylabel('Feature2') 22 | plt.legend() 23 | plt.show() 24 | 25 | # Standardize the data 26 | scaler = StandardScaler() 27 | data_scaled = scaler.fit_transform(df) 28 | 29 | # Apply DBSCAN for outlier detection 30 | dbscan = DBSCAN(eps=0.4, min_samples=5) 31 | df['Outlier'] = dbscan.fit_predict(data_scaled) 32 | 33 | # Visualize the results 34 | plt.scatter(df['Feature1'][df['Outlier'] == -1], df['Feature2'][df['Outlier'] == -1], color='red', marker='x', label='Outliers') 35 | plt.scatter(df['Feature1'][df['Outlier'] != -1], df['Feature2'][df['Outlier'] != -1], color='blue', label='Inliers') 36 | plt.title('Outlier Detection with DBSCAN') 37 | plt.xlabel('Feature1') 38 | plt.ylabel('Feature2') 39 | plt.legend() 40 | plt.show() 41 | -------------------------------------------------------------------------------- /chapter08/14.multivariate_trimming.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | from scipy.stats import chi2 6 | from mpl_toolkits.mplot3d import Axes3D 7 | 8 | # Generate multivariate student data 9 | np.random.seed(42) 10 | data = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0.5], [0.5, 1]], size=100) 11 | 12 | # Introduce outliers 13 | outliers = np.array([[8, 8], [9, 9]]) 14 | data = np.concatenate([data, outliers]) 15 | 16 | df = pd.DataFrame(data, columns=['X1', 'X2']) 17 | 18 | def mahalanobis_distance(x, mean, inv_cov_matrix): 19 | # Center the data 20 | centered_data = x - mean 21 | 22 | # Calculate Mahalanobis Distance 23 | mahalanobis_dist = np.sqrt(np.dot(centered_data, np.dot(inv_cov_matrix, centered_data))) 24 | 25 | return mahalanobis_dist 26 | 27 | # Ensure data is of type float 28 | df[['X1', 'X2']] = df[['X1', 'X2']].astype(float) 29 | 30 | # Center the data 31 | mean = np.mean(df[['X1', 'X2']], axis=0) 32 | 33 | # Calculate the covariance matrix 34 | cov_matrix = np.cov(df[['X1', 'X2']], rowvar=False) 35 | 36 | # Calculate the inverse of the covariance matrix 
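# Note: np.linalg.inv assumes the covariance matrix is well conditioned; with strongly
# correlated features or very few rows it can be singular. A common fallback (a sketch,
# not used in this example) is the pseudo-inverse:
#   inv_cov_matrix = np.linalg.pinv(cov_matrix)
# scipy.spatial.distance.mahalanobis computes the same distance given this inverse.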
37 | inv_cov_matrix = np.linalg.inv(cov_matrix) 38 | 39 | # Calculate Mahalanobis Distance for each data point 40 | df['Mahalanobis_Distance'] = df.apply(lambda row: mahalanobis_distance(row[['X1', 'X2']], mean, inv_cov_matrix), axis=1) 41 | 42 | # Set a significance level for outlier detection 43 | alpha = 0.1 44 | chi2_threshold = chi2.ppf(1 - alpha, df=2) # df is the degrees of freedom, which is the number of features 45 | 46 | # Identify outliers 47 | outliers = df[df['Mahalanobis_Distance'] > chi2_threshold] 48 | 49 | # Drop outliers from the dataframe 50 | df_no_outliers = df[df['Mahalanobis_Distance'] <= chi2_threshold] 51 | 52 | # Visualize the distribution plots before and after removing outliers 53 | plt.figure(figsize=(12, 5)) 54 | 55 | plt.subplot(1, 2, 1) 56 | plt.title("Distribution of 'X1' Before Outlier Handling") 57 | sns.histplot(df['X1'], bins=20, color='blue', kde=True) 58 | plt.xlabel('X1') 59 | plt.ylabel('Frequency') 60 | 61 | plt.subplot(1, 2, 2) 62 | plt.title("Distribution of 'X2' Before Outlier Handling") 63 | sns.histplot(df['X2'], bins=20, color='orange', kde=True) 64 | plt.xlabel('X2') 65 | plt.ylabel('Frequency') 66 | 67 | plt.tight_layout() 68 | plt.show() 69 | 70 | plt.figure(figsize=(12, 5)) 71 | 72 | plt.subplot(1, 2, 1) 73 | plt.title("Distribution of 'X1' After Outlier Handling") 74 | sns.histplot(df_no_outliers['X1'], bins=20, color='blue', kde=True) 75 | plt.xlabel('X1') 76 | plt.ylabel('Frequency') 77 | 78 | plt.subplot(1, 2, 2) 79 | plt.title("Distribution of 'X2' After Outlier Handling") 80 | sns.histplot(df_no_outliers['X2'], bins=20, color='orange', kde=True) 81 | plt.xlabel('X2') 82 | plt.ylabel('Frequency') 83 | 84 | plt.tight_layout() 85 | plt.show() 86 | 87 | # Visualize the data and outliers in 3D space 88 | fig = plt.figure(figsize=(10, 8)) 89 | ax = fig.add_subplot(111, projection='3d') 90 | 91 | # Plot all data points in blue 92 | ax.scatter(df_no_outliers['X1'], df_no_outliers['X2'], df_no_outliers['Mahalanobis_Distance'], color='blue', label='Data Points') 93 | 94 | # Plot outliers with a different symbol (e.g., 'x') in red 95 | ax.scatter(outliers['X1'], outliers['X2'], outliers['Mahalanobis_Distance'], color='red', marker='x', label='Outliers') 96 | 97 | ax.set_xlabel('X1') 98 | ax.set_ylabel('X2') 99 | ax.set_zlabel('Mahalanobis Distance') 100 | ax.set_title('Outlier Detection using Mahalanobis Distance') 101 | 102 | plt.legend() 103 | plt.show() 104 | 105 | # Describe changes in the dataset 106 | print("\nOriginal Dataset Statistics:") 107 | print(df.describe()) 108 | 109 | print("\nDataset Statistics after Removing Outliers:") 110 | print(df_no_outliers.describe()) 111 | -------------------------------------------------------------------------------- /chapter08/2.delete_missing_data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | # Generate student data with missing ages and test scores 6 | data = {'Age': [18, 20, None, 22, 21, 19, None, 23, 18, 24, 40, 41, 45, None, 34, None, 25, 30, 32, 24, 35, 38, 76, 90], 7 | 'Test_Score': [85, None, 90, 92, None, 88, 94, 91, None, 87, 75, 78, 80, None, 74, 20, 50, 68, None, 58, 48, 59, 10, 5]} 8 | 9 | df = pd.DataFrame(data) 10 | 11 | # Display the original dataset statistics 12 | print("Original Dataset Statistics:") 13 | print(df.describe()) 14 | 15 | # Plot the distributions before deletion 16 | plt.figure(figsize=(12, 5)) 17 | 18 | plt.subplot(1, 2, 1) 19 | 
plt.title("Distribution of 'Age' Before Deletion") 20 | plt.hist(df['Age'].dropna(), bins=10, color='blue', alpha=0.7, label='Original') 21 | plt.legend() 22 | 23 | plt.subplot(1, 2, 2) 24 | plt.title("Distribution of 'Test_Score' Before Deletion") 25 | plt.hist(df['Test_Score'].dropna(), bins=10, color='orange', alpha=0.7, label='Original') 26 | plt.legend() 27 | 28 | plt.tight_layout() 29 | plt.show() 30 | 31 | # Delete rows with any missing values 32 | df_no_missing = df.dropna() 33 | 34 | # Display the dataset after deletion 35 | print("\nDataset after Deleting Rows with Missing Values:") 36 | print(df_no_missing) 37 | 38 | # Display the dataset statistics after deletion 39 | print("\nDataset Statistics after Deleting Rows with Missing Values:") 40 | print(df_no_missing.describe()) 41 | 42 | # Plot the distributions after deletion 43 | plt.figure(figsize=(12, 5)) 44 | 45 | plt.subplot(1, 2, 1) 46 | plt.title("Distribution of 'Age' After Deletion") 47 | plt.hist(df_no_missing['Age'], bins=10, color='blue', alpha=0.7, label='After Deletion') 48 | plt.legend() 49 | 50 | plt.subplot(1, 2, 2) 51 | plt.title("Distribution of 'Test_Score' After Deletion") 52 | plt.hist(df_no_missing['Test_Score'], bins=10, color='orange', alpha=0.7, label='After Deletion') 53 | plt.legend() 54 | 55 | plt.tight_layout() 56 | plt.show() 57 | 58 | # Explain the changes and size drop 59 | print("\nExplanation:") 60 | print("The rows containing missing values were removed, resulting in a smaller dataset.") 61 | -------------------------------------------------------------------------------- /chapter08/3.mean_imputation.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | # Create a sample dataset with missing values 6 | np.random.seed(42) 7 | 8 | # Generate student data with missing ages and test scores 9 | data = {'Age': [18, 20, None, 22, 21, 19, None, 23, 18, 24, 40, 41, 45, None, 34, None, 25, 30, 32, 24, 35, 38, 76, 90], 10 | 'Test_Score': [85, None, 90, 92, None, 88, 94, 91, None, 87, 75, 78, 80, None, 74, 20, 50, 68, None, 58, 48, 59, 10, 5]} 11 | 12 | df = pd.DataFrame(data) 13 | print(df) 14 | 15 | # Display the original dataset statistics 16 | print("Original Dataset Statistics:") 17 | print(df.describe()) 18 | 19 | # Plot the distributions before mean imputation 20 | plt.figure(figsize=(12, 5)) 21 | 22 | plt.subplot(1, 2, 1) 23 | plt.title("Distribution of 'Age' Before Mean Imputation") 24 | plt.hist(df['Age'].dropna(), bins=10, color='blue', alpha=0.7, label='Original') 25 | plt.legend() 26 | 27 | plt.subplot(1, 2, 2) 28 | plt.title("Distribution of 'Test_Score' Before Mean Imputation") 29 | plt.hist(df['Test_Score'].dropna(), bins=10, color='orange', alpha=0.7, label='Original') 30 | plt.legend() 31 | 32 | plt.tight_layout() 33 | plt.show() 34 | 35 | # Mean imputation for missing ages and test scores with rounded mean for 'Age' 36 | df_mean_imputed = df.copy() 37 | df_mean_imputed['Age'].fillna(round(df['Age'].mean()), inplace=True) 38 | df_mean_imputed['Test_Score'].fillna(df['Test_Score'].mean(), inplace=True) 39 | 40 | # Display the dataset after mean imputation 41 | print("\nDataset after Mean Imputation:") 42 | print(df_mean_imputed) 43 | 44 | # Display the dataset statistics after mean imputation 45 | print("\nDataset Statistics after Mean Imputation:") 46 | print(df_mean_imputed.describe()) 47 | 48 | # Plot the distributions after mean imputation 49 | 
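# Note: because the 'Age' column includes large values such as 76 and 90, the column mean is
# pulled upward, so every imputed age inherits that pull and the overall variance shrinks.
# The median-based version in 4.median_imputation.py is less sensitive to those extremes.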
plt.figure(figsize=(12, 5)) 50 | 51 | plt.subplot(1, 2, 1) 52 | plt.title("Distribution of 'Age' After Mean Imputation") 53 | plt.hist(df_mean_imputed['Age'], bins=10, color='blue', alpha=0.7, label='Imputed') 54 | plt.legend() 55 | 56 | plt.subplot(1, 2, 2) 57 | plt.title("Distribution of 'Test_Score' After Mean Imputation") 58 | plt.hist(df_mean_imputed['Test_Score'], bins=10, color='orange', alpha=0.7, label='Imputed') 59 | plt.legend() 60 | 61 | plt.tight_layout() 62 | plt.show() 63 | -------------------------------------------------------------------------------- /chapter08/4.median_imputation.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | # Generate student data with missing ages and test scores 7 | data = {'Age': [18, 20, None, 22, 21, 19, None, 23, 18, 24, 40, 41, 45, None, 34, None, 25, 30, 32, 24, 35, 38, 76, 90], 8 | 'Test_Score': [85, None, 90, 92, None, 88, 94, 91, None, 87, 75, 78, 80, None, 74, 20, 50, 68, None, 58, 48, 59, 10, 5]} 9 | 10 | df = pd.DataFrame(data) 11 | 12 | # Display the original dataset statistics 13 | print("Original Dataset Statistics:") 14 | print(df.describe()) 15 | 16 | # Plot the distributions before median imputation 17 | plt.figure(figsize=(12, 5)) 18 | 19 | plt.subplot(1, 2, 1) 20 | plt.title("Distribution of 'Age' Before Median Imputation") 21 | plt.hist(df['Age'].dropna(), bins=10, color='blue', alpha=0.7, label='Original') 22 | plt.legend() 23 | 24 | plt.subplot(1, 2, 2) 25 | plt.title("Distribution of 'Test_Score' Before Median Imputation") 26 | plt.hist(df['Test_Score'].dropna(), bins=10, color='orange', alpha=0.7, label='Original') 27 | plt.legend() 28 | 29 | plt.tight_layout() 30 | plt.show() 31 | 32 | # Median imputation for missing ages and test scores 33 | df_median_imputed = df.copy() 34 | df_median_imputed['Age'].fillna(df['Age'].median(), inplace=True) 35 | df_median_imputed['Test_Score'].fillna(df['Test_Score'].median(), inplace=True) 36 | 37 | # Display the dataset after median imputation 38 | print("\nDataset after Median Imputation:") 39 | print(df_median_imputed) 40 | 41 | # Display the dataset statistics after median imputation 42 | print("\nDataset Statistics after Median Imputation:") 43 | print(df_median_imputed.describe()) 44 | 45 | # Plot the distributions after median imputation 46 | plt.figure(figsize=(12, 5)) 47 | 48 | plt.subplot(1, 2, 1) 49 | plt.title("Distribution of 'Age' After Median Imputation") 50 | plt.hist(df_median_imputed['Age'], bins=10, color='blue', alpha=0.7, label='Imputed') 51 | plt.legend() 52 | 53 | plt.subplot(1, 2, 2) 54 | plt.title("Distribution of 'Test_Score' After Median Imputation") 55 | plt.hist(df_median_imputed['Test_Score'], bins=10, color='orange', alpha=0.7, label='Imputed') 56 | plt.legend() 57 | 58 | plt.tight_layout() 59 | plt.show() 60 | 61 | 62 | print(df['Age'].median()) -------------------------------------------------------------------------------- /chapter08/5.indicator_imputation.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | # Create a sample dataset with missing values 6 | np.random.seed(42) 7 | 8 | data = {'Age': [18, 20, None, 22, 21, 19, None, 23, 18, 24, 40, 41, 45, None, 34, None, 25, 30, 32, 24, 35, 38, 76, 90], 9 | 'Test_Score': [85, None, 90, 92, None, 88, 94, 91, None, 87, 75, 78, 80, None, 74, 20, 50, 68, None, 
58, 48, 59, 10, 5]} 10 | 11 | df = pd.DataFrame(data) 12 | 13 | # Create indicator variables for missing values 14 | df['Age_missing'] = df['Age'].isnull().astype(int) 15 | df['Test_Score_missing'] = df['Test_Score'].isnull().astype(int) 16 | 17 | # Display the original dataset 18 | print("Original Dataset:") 19 | print(df) 20 | 21 | # Impute missing values with a placeholder (e.g., mean or median) 22 | df_imputed = df.copy() 23 | df_imputed['Age'].fillna(df_imputed['Age'].mean(), inplace=True) 24 | df_imputed['Test_Score'].fillna(df_imputed['Test_Score'].mean(), inplace=True) 25 | 26 | # Display the dataset after imputation 27 | print("\nDataset after Indicator Variable Imputation:") 28 | print(df_imputed) 29 | 30 | # Plot distribution charts for indicator variables 31 | plt.figure(figsize=(12, 5)) 32 | 33 | plt.subplot(1, 2, 1) 34 | plt.title("Distribution of Age_missing") 35 | df['Age_missing'].value_counts().plot(kind='bar', color=['blue', 'orange']) 36 | plt.xlabel("Missing (1) / Not Missing (0)") 37 | plt.ylabel("Count") 38 | 39 | plt.subplot(1, 2, 2) 40 | plt.title("Distribution of Test_Score_missing") 41 | df['Test_Score_missing'].value_counts().plot(kind='bar', color=['blue', 'orange']) 42 | plt.xlabel("Missing (1) / Not Missing (0)") 43 | plt.ylabel("Count") 44 | 45 | plt.tight_layout() 46 | plt.show() 47 | 48 | import seaborn as sns 49 | 50 | plt.figure(figsize=(12, 5)) 51 | 52 | plt.subplot(1, 2, 1) 53 | sns.boxplot(x='Age_missing', y='Test_Score', data=df_imputed) 54 | plt.title("Boxplot of Test_Score by Age_missing") 55 | 56 | plt.subplot(1, 2, 2) 57 | sns.boxplot(x='Test_Score_missing', y='Age', data=df_imputed) 58 | plt.title("Boxplot of Age by Test_Score_missing") 59 | 60 | plt.tight_layout() 61 | plt.show() 62 | 63 | -------------------------------------------------------------------------------- /chapter08/6.outliers_visualisation.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from scipy import stats 5 | 6 | # Original dataset 7 | data = {'Age': [18, 20, None, 22, 21, 19, None, 23, 18, 24, 40, 41, 45, None, 34, None, 25, 30, 32, 24, 35, 38, 76, 90], 8 | 'Test_Score': [85, None, 90, 92, None, 88, 94, 91, None, 87, 75, 78, 80, None, 74, 20, 50, 68, None, 58, 48, 59, 10, 5]} 9 | 10 | df = pd.DataFrame(data) 11 | 12 | # Calculate Z-Scores for each column 13 | z_scores_age = np.abs(stats.zscore(df['Age'].dropna())) 14 | z_scores_test_score = np.abs(stats.zscore(df['Test_Score'].dropna())) 15 | 16 | # Set Z-Score threshold 17 | z_threshold = 3 18 | 19 | # Identify outliers 20 | outliers_age = np.where(z_scores_age > z_threshold)[0] 21 | outliers_test_score = np.where(z_scores_test_score > z_threshold)[0] 22 | 23 | # Output identified outliers (the positions refer to the non-null series used for the Z-Scores) 24 | print("Outliers in 'Age':", df['Age'].dropna().iloc[outliers_age].to_list()) 25 | print("Outliers in 'Test_Score':", df['Test_Score'].dropna().iloc[outliers_test_score].to_list()) 26 | 27 | plt.figure(figsize=(12, 5)) 28 | 29 | plt.subplot(1, 2, 1) 30 | plt.title("Violin Plot for 'Age'") 31 | plt.violinplot(df['Age'].dropna(), vert=False) 32 | 33 | plt.subplot(1, 2, 2) 34 | plt.title("Violin Plot for 'Test_Score'") 35 | plt.violinplot(df['Test_Score'].dropna(), vert=False) 36 | 37 | plt.tight_layout() 38 | plt.show() 39 | 40 | 41 | plt.figure(figsize=(12, 5)) 42 | 43 | plt.subplot(1, 2, 1) 44 | plt.title("Box Plot for 'Age'") 45 | plt.boxplot(df['Age'].dropna(), vert=False) 46 | 47 | plt.subplot(1, 2, 2) 48 | plt.title("Box Plot
for 'Test_Score'") 49 | plt.boxplot(df['Test_Score'].dropna(), vert=False) 50 | 51 | plt.tight_layout() 52 | plt.show() 53 | -------------------------------------------------------------------------------- /chapter08/7.identify_univariate_outliers.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from scipy import stats 5 | 6 | # Original dataset 7 | data = {'Age': [18, 20, None, 22, 21, 19, None, 23, 18, 24, 40, 41, 45, None, 34, None, 25, 30, 32, 24, 35, 38, 76, 90], 8 | 'Test_Score': [85, None, 90, 92, None, 88, 94, 91, None, 87, 75, 78, 80, None, 74, 20, 50, 68, None, 58, 48, 59, 10, 5]} 9 | 10 | df = pd.DataFrame(data) 11 | 12 | # Calculate Z-Scores for each column 13 | z_scores_age = np.abs(stats.zscore(df['Age'].dropna())) 14 | z_scores_test_score = np.abs(stats.zscore(df['Test_Score'].dropna())) 15 | 16 | # Set Z-Score threshold 17 | z_threshold = 3 18 | 19 | # Identify outliers 20 | outliers_age = np.where(z_scores_age > z_threshold)[0] 21 | outliers_test_score = np.where(z_scores_test_score > z_threshold)[0] 22 | 23 | # Plot Z-Scores 24 | plt.figure(figsize=(12, 5)) 25 | 26 | plt.subplot(1, 2, 1) 27 | plt.title("Z-Scores for 'Age'") 28 | plt.scatter(range(len(z_scores_age)), z_scores_age, color='blue', label='Z-Scores') 29 | plt.axhline(y=z_threshold, color='red', linestyle='--', label='Threshold') 30 | plt.legend() 31 | 32 | plt.subplot(1, 2, 2) 33 | plt.title("Z-Scores for 'Test_Score'") 34 | plt.scatter(range(len(z_scores_test_score)), z_scores_test_score, color='orange', label='Z-Scores') 35 | plt.axhline(y=z_threshold, color='red', linestyle='--', label='Threshold') 36 | plt.legend() 37 | 38 | plt.tight_layout() 39 | plt.show() 40 | 41 | # Function to identify outliers using IQR 42 | def identify_outliers(column): 43 | Q1 = df[column].quantile(0.25) 44 | Q3 = df[column].quantile(0.75) 45 | IQR = Q3 - Q1 46 | lower_bound = Q1 - 1.5 * IQR 47 | upper_bound = Q3 + 1.5 * IQR 48 | outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)] 49 | return outliers 50 | 51 | # Identify and print outliers for 'Age' 52 | age_outliers = identify_outliers('Age') 53 | print("Outliers in 'Age':") 54 | print(age_outliers) 55 | 56 | # Identify and print outliers for 'Test_Score' 57 | test_score_outliers = identify_outliers('Test_Score') 58 | print("\nOutliers in 'Test_Score':") 59 | print(test_score_outliers) 60 | 61 | # Visualize the distribution of 'Age' and 'Test_Score' using box plots 62 | plt.figure(figsize=(12, 5)) 63 | 64 | plt.subplot(1, 2, 1) 65 | plt.title("Box Plot of 'Age'") 66 | plt.boxplot(df['Age'].dropna()) 67 | plt.xticks([1], ['Age']) 68 | 69 | plt.subplot(1, 2, 2) 70 | plt.title("Box Plot of 'Test_Score'") 71 | plt.boxplot(df['Test_Score'].dropna()) 72 | plt.xticks([1], ['Test_Score']) 73 | 74 | plt.tight_layout() 75 | plt.show() 76 | -------------------------------------------------------------------------------- /chapter08/8.handle_univariate_outliers_deletions.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | # Generate student data with missing ages and test scores 6 | data = {'Age': [18, 20, None, 22, 21, 19, None, 23, 18, 24, 40, 41, 45, None, 34, None, 25, 30, 32, 24, 35, 38, 76, 90], 7 | 'Test_Score': [85, None, 90, 92, None, 88, 94, 91, None, 87, 75, 78, 80, None, 74, 20, 50, 68, None, 58, 48, 59, 10, 5]} 8 | 9 
| df = pd.DataFrame(data) 10 | 11 | # Fill NaN values with the mean of each column 12 | df.fillna(df.mean(), inplace=True) 13 | 14 | # Display the original dataset statistics 15 | print("Original Dataset Statistics:") 16 | print(df.describe()) 17 | 18 | # Plot the distributions before outlier handling 19 | plt.figure(figsize=(12, 5)) 20 | 21 | plt.subplot(1, 2, 1) 22 | plt.title("Distribution of 'Age' Before Outlier Handling") 23 | plt.hist(df['Age'], bins=10, color='blue', alpha=0.7, label='Original') 24 | plt.legend() 25 | 26 | plt.subplot(1, 2, 2) 27 | plt.title("Distribution of 'Test_Score' Before Outlier Handling") 28 | plt.hist(df['Test_Score'], bins=10, color='orange', alpha=0.7, label='Original') 29 | plt.legend() 30 | 31 | plt.tight_layout() 32 | plt.show() 33 | 34 | # Identify and handle outliers using interquartile range (IQR) 35 | Q1 = df['Test_Score'].quantile(0.25) 36 | Q3 = df['Test_Score'].quantile(0.75) 37 | IQR = Q3 - Q1 38 | 39 | outlier_threshold = 1.5 40 | lower_bound = Q1 - outlier_threshold * IQR 41 | upper_bound = Q3 + outlier_threshold * IQR 42 | 43 | df_no_outliers = df[(df['Test_Score'] >= lower_bound) & (df['Test_Score'] <= upper_bound)].copy() 44 | 45 | # Display the dataset after outlier handling 46 | print("\nDataset after Outlier Handling:") 47 | print(df_no_outliers) 48 | 49 | # Display the dataset statistics after outlier handling 50 | print("\nDataset Statistics after Outlier Handling:") 51 | print(df_no_outliers.describe()) 52 | 53 | # Plot the distributions after outlier handling 54 | plt.figure(figsize=(12, 5)) 55 | 56 | plt.subplot(1, 2, 1) 57 | plt.title("Distribution of 'Age' After Outlier Handling") 58 | plt.hist(df_no_outliers['Age'], bins=10, color='blue', alpha=0.7, label='Cleaned') 59 | plt.legend() 60 | 61 | plt.subplot(1, 2, 2) 62 | plt.title("Distribution of 'Test_Score' After Outlier Handling") 63 | plt.hist(df_no_outliers['Test_Score'], bins=10, color='orange', alpha=0.7, label='Cleaned') 64 | plt.legend() 65 | 66 | plt.tight_layout() 67 | plt.show() 68 | -------------------------------------------------------------------------------- /chapter08/9.trimming.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | # Generate student data with missing ages and test scores 6 | data = {'Age': [18, 20, None, 22, 21, 19, None, 23, 18, 24, 40, 41, 45, None, 34, None, 25, 30, 32, 24, 35, 38, 76, 90], 7 | 'Test_Score': [85, None, 90, 92, None, 88, 94, 91, None, 87, 75, 78, 80, None, 74, 20, 50, 68, None, 58, 48, 59, 10, 5]} 8 | 9 | df = pd.DataFrame(data) 10 | 11 | # Fill NaN values with the mean of each column 12 | df.fillna(df.mean(), inplace=True) 13 | 14 | # Display the original dataset statistics 15 | print("Original Dataset Statistics:") 16 | print(df.describe()) 17 | 18 | # Plot the distributions before outlier handling 19 | plt.figure(figsize=(12, 5)) 20 | 21 | plt.subplot(1, 2, 1) 22 | plt.title("Distribution of 'Age' Before Outlier Handling") 23 | plt.hist(df['Age'], bins=10, color='blue', alpha=0.7, label='Original') 24 | plt.legend() 25 | 26 | plt.subplot(1, 2, 2) 27 | plt.title("Distribution of 'Test_Score' Before Outlier Handling") 28 | plt.hist(df['Test_Score'], bins=10, color='orange', alpha=0.7, label='Original') 29 | plt.legend() 30 | 31 | plt.tight_layout() 32 | plt.show() 33 | 34 | # Drop the 10% of values on each side of the distribution for 'Age' column 35 | df_trimmed = df[(df['Age'] >= 
df['Age'].quantile(0.1)) & (df['Age'] <= df['Age'].quantile(0.9))] 36 | 37 | # Calculate trimmed mean for each column 38 | df_trimmed_mean = df_trimmed.mean() 39 | 40 | # Display the trimmed dataset statistics 41 | print("\nTrimmed Dataset Statistics:") 42 | print(df_trimmed.describe()) 43 | 44 | # Display the trimmed mean for each column 45 | print("\nTrimmed Mean:") 46 | print(df_trimmed_mean) 47 | 48 | # Plot the distributions after trimming 49 | plt.figure(figsize=(12, 5)) 50 | 51 | plt.subplot(1, 2, 1) 52 | plt.title("Distribution of 'Age' After Trimming") 53 | plt.hist(df_trimmed['Age'], bins=10, color='blue', alpha=0.7, label='Trimmed') 54 | plt.legend() 55 | 56 | plt.subplot(1, 2, 2) 57 | plt.title("Distribution of 'Test_Score' After Trimming") 58 | plt.hist(df_trimmed['Test_Score'], bins=10, color='orange', alpha=0.7, label='Trimmed') 59 | plt.legend() 60 | 61 | plt.tight_layout() 62 | plt.show() 63 | -------------------------------------------------------------------------------- /chapter09/min_max_scaling.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from sklearn.preprocessing import MinMaxScaler 5 | 6 | # Create a dataset with features related to housing prices 7 | np.random.seed(42) 8 | num_samples = 100 9 | 10 | # Square footage in square feet 11 | square_footage = np.random.uniform(500, 5000, num_samples) 12 | 13 | # Distance to the nearest school in miles 14 | distance_to_school = np.random.uniform(0.1, 5, num_samples) 15 | 16 | # Commute distance to work in miles 17 | commute_distance = np.random.exponential(5, num_samples) 18 | 19 | # Traffic density (skewed feature) 20 | traffic_density = np.random.exponential(2, num_samples) 21 | 22 | # Create a DataFrame 23 | data = pd.DataFrame({ 24 | 'Square_Footage': square_footage, 25 | 'Distance_to_School': distance_to_school, 26 | 'Commute_Distance': commute_distance, 27 | 'Traffic_Density': traffic_density 28 | }) 29 | 30 | 31 | # Display original dataset statistics 32 | print("Original Dataset Statistics:") 33 | print(data.describe()) 34 | 35 | # Plot the distributions before scaling 36 | plt.figure(figsize=(12, 8)) 37 | 38 | for i, column in enumerate(data.columns): 39 | plt.subplot(2, 2, i+1) 40 | plt.title(f"Distribution of '{column}' Before Scaling") 41 | plt.hist(data[column], bins=20, color='blue', alpha=0.7) 42 | plt.xlabel(column) 43 | 44 | plt.tight_layout() 45 | plt.show() 46 | 47 | # Apply Min-Max scaling 48 | scaler = MinMaxScaler() 49 | data_scaled = pd.DataFrame(scaler.fit_transform(data), columns=data.columns) 50 | 51 | # Display dataset statistics after scaling 52 | print("\nDataset Statistics After Scaling:") 53 | print(data_scaled.describe()) 54 | 55 | # Plot the distributions after scaling 56 | plt.figure(figsize=(12, 8)) 57 | 58 | for i, column in enumerate(data_scaled.columns): 59 | plt.subplot(2, 2, i+1) 60 | plt.title(f"Distribution of '{column}' After Scaling") 61 | plt.hist(data_scaled[column], bins=20, color='green', alpha=0.7) 62 | plt.xlabel(column) 63 | 64 | plt.tight_layout() 65 | plt.show() 66 | -------------------------------------------------------------------------------- /chapter09/robust_scaler.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.preprocessing import RobustScaler 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | 6 | 7 | # Create a dataset with features related to 
housing prices 8 | np.random.seed(42) 9 | num_samples = 100 10 | 11 | # Square footage in square feet 12 | square_footage = np.random.uniform(500, 5000, num_samples) 13 | 14 | # Distance to the nearest school in miles 15 | distance_to_school = np.random.uniform(0.1, 5, num_samples) 16 | 17 | # Commute distance to work in miles 18 | commute_distance = np.random.exponential(5, num_samples) 19 | 20 | # Traffic density (skewed feature) 21 | traffic_density = np.random.exponential(2, num_samples) 22 | 23 | # Create a DataFrame 24 | data = pd.DataFrame({ 25 | 'Square_Footage': square_footage, 26 | 'Distance_to_School': distance_to_school, 27 | 'Commute_Distance': commute_distance, 28 | 'Traffic_Density': traffic_density 29 | }) 30 | 31 | 32 | # Display original dataset statistics 33 | print("Original Dataset Statistics:") 34 | print(data.describe()) 35 | 36 | # Plot the distributions before scaling 37 | plt.figure(figsize=(12, 8)) 38 | 39 | for i, column in enumerate(data.columns): 40 | plt.subplot(2, 2, i+1) 41 | plt.title(f"Distribution of '{column}' Before Scaling") 42 | plt.hist(data[column], bins=20, color='blue', alpha=0.7) 43 | plt.xlabel(column) 44 | 45 | plt.tight_layout() 46 | plt.show() 47 | 48 | # Applying RobustScaler 49 | robust_scaler = RobustScaler() 50 | data_scaled = robust_scaler.fit_transform(data) 51 | 52 | # Converting the scaled data back to a DataFrame 53 | data_scaled = pd.DataFrame(data_scaled, columns=data.columns) 54 | 55 | # Displaying the dataset after scaling 56 | print("\nDataset after Robust Scaling:") 57 | print(data_scaled.describe()) 58 | 59 | # Plotting the distributions after scaling 60 | plt.figure(figsize=(12, 8)) 61 | 62 | for i, col in enumerate(data_scaled.columns, 1): 63 | plt.subplot(2, 2, i) 64 | plt.title(f"Distribution of {col} After Robust Scaling") 65 | plt.hist(data_scaled[col], bins=20, color='orange', alpha=0.7) 66 | 67 | plt.tight_layout() 68 | plt.show() 69 | -------------------------------------------------------------------------------- /chapter09/zscaler.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | np.random.seed(42) 6 | num_samples = 100 7 | 8 | # Square footage in square feet 9 | square_footage = np.random.uniform(500, 5000, num_samples) 10 | 11 | # Distance to the nearest school in miles 12 | distance_to_school = np.random.uniform(0.1, 5, num_samples) 13 | 14 | # Commute distance to work in miles 15 | commute_distance = np.random.exponential(5, num_samples) 16 | 17 | # Traffic density (skewed feature) 18 | traffic_density = np.random.exponential(2, num_samples) 19 | # Create a DataFrame with all features 20 | data = pd.DataFrame({ 21 | 'Square_Footage': square_footage, 22 | 'Distance_to_School': distance_to_school, 23 | 'Commute_Distance': commute_distance, 24 | 'Traffic_Density': traffic_density 25 | }) 26 | 27 | # Print original dataset statistics 28 | print("Original Dataset Statistics:") 29 | print(data.describe()) 30 | 31 | # Visualize the original distributions 32 | data.hist(figsize=(12, 10), bins=20, color='blue', alpha=0.7) 33 | plt.suptitle('Original Data Distributions') 34 | plt.show() 35 | 36 | # Z-score scaling 37 | data_zscore = (data - data.mean()) / data.std() 38 | 39 | # Print dataset statistics after Z-score scaling 40 | print("\nDataset Statistics after Z-score Scaling:") 41 | print(data_zscore.describe()) 42 | 43 | # Visualize the distributions after Z-score scaling 44 | 
data_zscore.hist(figsize=(12, 10), bins=20, color='green', alpha=0.7) 45 | plt.suptitle('Data Distributions after Z-score Scaling') 46 | plt.show() 47 | -------------------------------------------------------------------------------- /chapter10/1a.label_encoding.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.preprocessing import LabelEncoder 3 | import seaborn as sns 4 | import matplotlib.pyplot as plt 5 | 6 | # Sample dataset 7 | data = { 8 | 'Employee Rating': ['Poor', 'Good', 'Satisfactory', 'Excellent', 'Good'], 9 | 'Salary': [35000, 50000, 42000, 60000, 52000], 10 | 'Years of Experience': [2, 5, 3, 8, 6], 11 | 'Department': ['HR', 'IT', 'Finance', 'IT', 'Marketing'] 12 | } 13 | 14 | df = pd.DataFrame(data) 15 | print("Original DataFrame:") 16 | print(df) 17 | 18 | # Initialize the LabelEncoder 19 | label_encoder = LabelEncoder() 20 | 21 | # Apply label encoding to the 'Employee Rating' column 22 | df['Employee Rating (Encoded)'] = label_encoder.fit_transform(df['Employee Rating']) 23 | 24 | print("\nDataFrame after Label Encoding:") 25 | print(df) 26 | 27 | # Plot the distribution of the 'Employee Rating' column before encoding 28 | plt.figure(figsize=(14, 6)) 29 | 30 | plt.subplot(1, 2, 1) 31 | sns.countplot(x='Employee Rating', data=df, order=df['Employee Rating'].value_counts().index) 32 | plt.title('Distribution of Employee Rating (Before Encoding)') 33 | plt.xlabel('Employee Rating') 34 | plt.ylabel('Count') 35 | 36 | # Plot the distribution of the 'Employee Rating (Encoded)' column after encoding 37 | plt.subplot(1, 2, 2) 38 | sns.countplot(x='Employee Rating (Encoded)', data=df, order=df['Employee Rating (Encoded)'].value_counts().index) 39 | plt.title('Distribution of Employee Rating (After Encoding)') 40 | plt.xlabel('Employee Rating (Encoded)') 41 | plt.ylabel('Count') 42 | 43 | plt.tight_layout() 44 | plt.show() 45 | -------------------------------------------------------------------------------- /chapter10/1b.label_encoding_forced.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.preprocessing import LabelEncoder 3 | 4 | # Sample dataset 5 | data = { 6 | 'Employee Rating': ['Poor', 'Good', 'Satisfactory', 'Excellent', 'Good'], 7 | 'Salary': [35000, 50000, 42000, 60000, 52000], 8 | 'Years of Experience': [2, 5, 3, 8, 6], 9 | 'Department': ['HR', 'IT', 'Finance', 'IT', 'Marketing'] 10 | } 11 | 12 | df = pd.DataFrame(data) 13 | print("Original DataFrame:") 14 | print(df) 15 | 16 | # Define the correct order of categories with prefixes 17 | ordered_categories = { 18 | 'Poor': '1.Poor', 19 | 'Satisfactory': '2.Satisfactory', 20 | 'Good': '3.Good', 21 | 'Excellent': '4.Excellent' 22 | } 23 | 24 | # Map the 'Employee Rating' column to the prefixed categories 25 | df['Employee Rating Ordered'] = df['Employee Rating'].map(ordered_categories) 26 | 27 | # Initialize the LabelEncoder 28 | label_encoder = LabelEncoder() 29 | 30 | # Apply label encoding to the 'Employee Rating Ordered' column 31 | df['Employee Rating (Encoded)'] = label_encoder.fit_transform(df['Employee Rating Ordered']) 32 | 33 | # Reverse the mapping for clarity in the DataFrame (optional) 34 | reverse_mapping = {v: k for k, v in ordered_categories.items()} 35 | df['Employee Rating Ordered'] = df['Employee Rating Ordered'].map(reverse_mapping) 36 | 37 | print("\nDataFrame after Label Encoding with Correct Order:") 38 | print(df[['Employee Rating Ordered','Employee 
Rating (Encoded)']]) 39 | -------------------------------------------------------------------------------- /chapter10/2.one_hot_encoding.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import seaborn as sns 3 | import matplotlib.pyplot as plt 4 | from sklearn.preprocessing import OneHotEncoder 5 | 6 | # Sample dataset 7 | data = { 8 | 'Customer ID': [1, 2, 3, 4, 5], 9 | 'Contract Type': ['Month-to-Month', 'One Year', 'Month-to-Month', 'Two Year', 'One Year'], 10 | 'Internet Service': ['DSL', 'Fiber Optic', 'DSL', 'Fiber Optic', 'No Internet Service'], 11 | 'Payment Method': ['Electronic Check', 'Mailed Check', 'Bank Transfer', 'Credit Card', 'Electronic Check'], 12 | } 13 | 14 | df = pd.DataFrame(data) 15 | 16 | # Plot distribution of original 'Contract Type' column 17 | plt.figure(figsize=(8, 6)) 18 | sns.countplot(x='Contract Type', data=df).set_title('Contract Type Distribution') 19 | plt.show() 20 | 21 | # Initialize the OneHotEncoder for 'Contract Type' without dropping any category 22 | one_hot_encoder = OneHotEncoder(sparse_output=False) 23 | 24 | # Fit and transform the 'Contract Type' column 25 | encoded_columns = one_hot_encoder.fit_transform(df[['Contract Type']]) 26 | 27 | # Create a new DataFrame with the one-hot encoded columns for 'Contract Type' 28 | encoded_df = pd.DataFrame(encoded_columns, columns=one_hot_encoder.get_feature_names_out(['Contract Type'])) 29 | 30 | # Concatenate the one-hot encoded DataFrame with the original DataFrame 31 | df_encoded = pd.concat([df, encoded_df], axis=1) 32 | 33 | # Dropping the original 'Contract Type' column as it is now encoded 34 | df_encoded = df_encoded.drop(['Contract Type'], axis=1) 35 | 36 | print(df_encoded) 37 | 38 | # Plot distribution of encoded 'Contract Type' columns 39 | encoded_cols = encoded_df.columns 40 | 41 | fig, axes = plt.subplots(1, len(encoded_cols), figsize=(6 * len(encoded_cols), 5)) 42 | for i, col in enumerate(encoded_cols): 43 | sns.countplot(ax=axes[i], x=encoded_df[col]).set_title(f'{col} Distribution') 44 | plt.tight_layout() 45 | plt.show() 46 | -------------------------------------------------------------------------------- /chapter10/4.frequency_encoding.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import seaborn as sns 3 | import matplotlib.pyplot as plt 4 | from sklearn.model_selection import train_test_split 5 | from category_encoders import CountEncoder # Ensure you have this library installed 6 | 7 | # Create a sample dataset 8 | data = { 9 | 'Customer ID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 10 | 'Product Category': ['Electronics', 'Clothing', 'Electronics', 'Books', 'Books', 'Clothing', 'Electronics', 'Books', 'Clothing', 'Books'], 11 | 'Total Purchases': [5, 2, 3, 8, 7, 4, 2, 5, 1, 6] 12 | } 13 | 14 | df = pd.DataFrame(data) 15 | 16 | # Display the sample dataset 17 | print("Sample Dataset:") 18 | print(df) 19 | 20 | # Define the features 21 | X = df[['Customer ID', 'Product Category', 'Total Purchases']] 22 | 23 | # Split the data into training and testing sets 24 | X_train, X_test = train_test_split(X, test_size=0.2, random_state=42) 25 | 26 | # Initialize the CountEncoder for 'Product Category' 27 | count_encoder = CountEncoder(cols=['Product Category']) 28 | 29 | # Fit and transform the training data 30 | X_train_encoded = count_encoder.fit_transform(X_train) 31 | 32 | # Transform the test data using the same encoder 33 | X_test_encoded = 
count_encoder.transform(X_test) 34 | 35 | # Plot the distribution of the original and encoded 'Product Category' in the training set 36 | fig, axes = plt.subplots(1, 2, figsize=(16, 6)) 37 | 38 | # Original 'Product Category' distribution 39 | sns.countplot(ax=axes[0], x='Product Category', data=X_train).set_title('Original Product Category Distribution (Training Set)') 40 | 41 | # Encoded 'Product Category' distribution 42 | sns.countplot(ax=axes[1], x='Product Category', data=X_train_encoded).set_title('Encoded Product Category Distribution (Training Set)') 43 | 44 | plt.tight_layout() 45 | plt.show() 46 | 47 | # Display the encoded training dataset 48 | print("\nEncoded Training Dataset:") 49 | print(X_train_encoded.head()) 50 | 51 | # Display the encoded testing dataset 52 | print("\nEncoded Testing Dataset:") 53 | print(X_test_encoded.head()) 54 | -------------------------------------------------------------------------------- /chapter10/5.binary_encoding.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import seaborn as sns 3 | import matplotlib.pyplot as plt 4 | from category_encoders import BinaryEncoder 5 | 6 | # Sample data 7 | data = { 8 | 'Country': ['USA', 'Canada', 'USA', 'Canada', 'Mexico', 'USA', 'Mexico', 'Canada'], 9 | 'Age': [25, 30, 35, 40, 45, 50, 55, 60], 10 | 'Income': [50000, 60000, 70000, 80000, 90000, 100000, 110000, 120000], 11 | 'Subscription': [1, 0, 1, 0, 1, 0, 1, 0] 12 | } 13 | 14 | df = pd.DataFrame(data) 15 | 16 | # Plot the distribution of the 'Country' feature before encoding 17 | plt.figure(figsize=(10, 6)) 18 | sns.countplot(x='Country', data=df) 19 | plt.title('Distribution of Country Feature Before Encoding') 20 | plt.show() 21 | 22 | # Apply binary encoding to the 'Country' feature 23 | encoder = BinaryEncoder(cols=['Country']) 24 | df_encoded = encoder.fit_transform(df) 25 | 26 | # Display the encoded dataframe 27 | print(df_encoded) 28 | 29 | # Plot the distribution of the binary encoded features 30 | encoded_cols = [col for col in df_encoded.columns if 'Country' in col] 31 | n_cols = len(encoded_cols) 32 | 33 | fig, axes = plt.subplots(1, n_cols, figsize=(5*n_cols, 5)) 34 | fig.suptitle('Distribution of Country Feature After Binary Encoding') 35 | 36 | for i, col in enumerate(encoded_cols): 37 | sns.histplot(df_encoded[col], kde=True, ax=axes[i], bins=2) 38 | axes[i].set_title(col) 39 | axes[i].set_xlabel('Encoded Value') 40 | axes[i].set_ylabel('Count') 41 | 42 | plt.tight_layout() 43 | plt.show() -------------------------------------------------------------------------------- /chapter11/1.decomposing_time_series/noise.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | 5 | # Generate example data with noise 6 | date_rng = pd.date_range(start='2010-01-01', end='2020-12-31', freq='M') 7 | np.random.seed(42) 8 | noise_data = pd.Series(np.random.normal(0, 2, len(date_rng)), index=date_rng) 9 | 10 | # Plotting the time series data with noise 11 | plt.figure(figsize=(10, 5)) 12 | plt.plot(noise_data, label='Temperature Fluctuations') 13 | plt.title('Time Series Data with Noise') 14 | plt.xlabel('Time') 15 | plt.ylabel('Temperature') 16 | plt.legend() 17 | plt.show() 18 | -------------------------------------------------------------------------------- /chapter11/1.decomposing_time_series/seasonality.py: 
-------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | 4 | # Generate example data with seasonality 5 | date_rng = pd.date_range(start='2010-01-01', end='2020-12-31', freq='M') 6 | seasonal_data = pd.Series([10, 12, 15, 22, 30, 35, 40, 38, 30, 22, 15, 12] * 11, index=date_rng) 7 | 8 | # Plotting the time series data with seasonality 9 | plt.figure(figsize=(10, 5)) 10 | plt.plot(seasonal_data, label='Ice Cream Sales') 11 | plt.title('Time Series Data with Seasonality') 12 | plt.xlabel('Time') 13 | plt.ylabel('Sales') 14 | plt.legend() 15 | plt.show() 16 | -------------------------------------------------------------------------------- /chapter11/1.decomposing_time_series/trend.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | 4 | # Generate example data 5 | date_rng = pd.date_range(start='2010-01-01', end='2020-12-31', freq='M') 6 | sales_data = pd.Series(range(1, len(date_rng) + 1), index=date_rng) 7 | 8 | # Plotting the time series data with a trend 9 | plt.figure(figsize=(10, 5)) 10 | plt.plot(sales_data, label='Sales Data') 11 | plt.title('Time Series Data with Trend') 12 | plt.xlabel('Time') 13 | plt.ylabel('Sales') 14 | plt.legend() 15 | plt.show() 16 | -------------------------------------------------------------------------------- /chapter11/2.types/multivariate.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | 5 | # Generate example multivariate time series data 6 | date_rng = pd.date_range(start='2010-01-01', end='2020-12-31', freq='M') 7 | temperature_data = pd.Series(np.random.normal(20, 5, len(date_rng)), index=date_rng) 8 | rainfall_data = pd.Series(np.random.normal(50, 20, len(date_rng)), index=date_rng) 9 | 10 | # Create a DataFrame with both temperature and rainfall data 11 | multivariate_data = pd.DataFrame({'Temperature': temperature_data, 'Rainfall': rainfall_data}) 12 | 13 | # Plotting the multivariate time series data 14 | plt.figure(figsize=(12, 6)) 15 | 16 | plt.subplot(2, 1, 1) 17 | plt.plot(multivariate_data['Temperature'], label='Temperature Data', color='blue') 18 | plt.title('Multivariate Time Series Data') 19 | plt.xlabel('Time') 20 | plt.ylabel('Temperature (°C)') 21 | plt.legend() 22 | 23 | plt.subplot(2, 1, 2) 24 | plt.plot(multivariate_data['Rainfall'], label='Rainfall Data', color='green') 25 | plt.xlabel('Time') 26 | plt.ylabel('Rainfall (mm)') 27 | plt.legend() 28 | 29 | plt.tight_layout() 30 | plt.show() 31 | -------------------------------------------------------------------------------- /chapter11/2.types/univariate.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | 5 | # Generate example univariate time series data 6 | date_rng = pd.date_range(start='2010-01-01', end='2020-12-31', freq='M') 7 | temperature_data = pd.Series(np.random.normal(20, 5, len(date_rng)), index=date_rng) 8 | 9 | # Plotting the univariate time series data 10 | plt.figure(figsize=(10, 5)) 11 | plt.plot(temperature_data, label='Temperature Data') 12 | plt.title('Univariate Time Series Data') 13 | plt.xlabel('Time') 14 | plt.ylabel('Temperature (°C)') 15 | plt.legend() 16 | plt.show() 17 | 
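Note: the chapter11 scripts above generate trend, seasonality, and noise as separate synthetic series and plot each one on its own. The following is a minimal companion sketch (not one of the repository files) showing how those three components can be combined into a single monthly series and then recovered with statsmodels' seasonal_decompose; the variable names and the period=12 setting are assumptions chosen to match the monthly frequency used above.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose

# Build one synthetic monthly series = trend + seasonality + noise
date_rng = pd.date_range(start='2010-01-01', end='2020-12-31', freq='M')
trend = pd.Series(range(1, len(date_rng) + 1), index=date_rng)
seasonality = pd.Series([10, 12, 15, 22, 30, 35, 40, 38, 30, 22, 15, 12] * 11, index=date_rng)
np.random.seed(42)
noise = pd.Series(np.random.normal(0, 2, len(date_rng)), index=date_rng)
combined = trend + seasonality + noise

# Recover the components; period=12 because the data is monthly
result = seasonal_decompose(combined, model='additive', period=12)
result.plot()
plt.show()

If statsmodels is installed, the trend, seasonal, and residual panels of the resulting plot should roughly match the three standalone series produced by the scripts above.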
-------------------------------------------------------------------------------- /chapter11/3.missing_values/1.identify_missing_values.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | # Set seed for reproducibility 6 | np.random.seed(42) 7 | 8 | # Generate a date range 9 | date_range = pd.date_range(start='2020-01-01', end='2023-12-31', freq='B') # Business days 10 | 11 | # Generate random stock prices 12 | n = len(date_range) 13 | data = { 14 | 'open': np.random.uniform(100, 200, n), 15 | 'high': np.random.uniform(200, 300, n), 16 | 'low': np.random.uniform(50, 100, n), 17 | 'close': np.random.uniform(100, 200, n) 18 | } 19 | 20 | # Create DataFrame 21 | df = pd.DataFrame(data, index=date_range) 22 | 23 | # Introduce random NaN values 24 | nan_indices = np.random.choice(df.index, size=100, replace=False) 25 | df.loc[nan_indices] = np.nan 26 | 27 | # Drop random dates to simulate missing timestamps 28 | missing_dates = np.random.choice(df.index, size=50, replace=False) 29 | df = df.drop(missing_dates) 30 | 31 | # Display the first few rows of the DataFrame 32 | print("Initial DataFrame with Missing Values and Timestamps:\n", df.head()) 33 | 34 | # Step 1: Checking for NaNs or Null Values in columns 35 | missing_values = df.isnull().sum() 36 | print("\nMissing Values in Each Column:\n", missing_values) 37 | 38 | # Step 2: Identifying Missing Timestamps 39 | complete_index = pd.date_range(start=df.index.min(), end=df.index.max(), freq='B') # 'B' is for business days 40 | df_reindexed = df.reindex(complete_index) 41 | missing_timestamps = df_reindexed[df_reindexed.isnull().all(axis=1)] 42 | 43 | # Calculate percentage of missing timestamps 44 | total_timestamps = len(complete_index) 45 | missing_timestamps_count = missing_timestamps.shape[0] 46 | missing_timestamps_percentage = (missing_timestamps_count / total_timestamps) * 100 47 | 48 | print("\nMissing Timestamps:\n", missing_timestamps) 49 | print(f"\nPercentage of Missing Timestamps: {missing_timestamps_percentage:.2f}%") 50 | 51 | # Plotting 52 | plt.figure(figsize=(14, 7)) 53 | 54 | # Plot the closing prices 55 | plt.plot(df.index, df['close'], marker='o', linestyle='-', label='Closing Price', color='blue') 56 | 57 | # Mark missing timestamps with vertical lines 58 | for date in missing_dates: 59 | plt.axvline(x=date, color='red', linestyle='--', linewidth=1) 60 | 61 | # Highlight points with NaN values 62 | nan_dates = df.index[df['close'].isnull()] 63 | plt.scatter(nan_dates, [df['close'].mean()] * len(nan_dates), color='orange', label='NaN Values in Close', zorder=5) 64 | 65 | plt.title('Daily Closing Prices with Missing Timestamps and NaN Values Highlighted') 66 | plt.xlabel('Date') 67 | plt.ylabel('Closing Price') 68 | plt.legend() 69 | plt.grid(True) 70 | plt.show() 71 | 72 | # Summary of Identifying Missing Values 73 | print("\nNaN values were introduced randomly in the dataset and are highlighted in orange on the plot.\n" 74 | "Red dashed lines indicate missing timestamps where no data is available for the dates in the index.\n" 75 | "Blue line shows the closing prices with missing values removed.") 76 | 77 | -------------------------------------------------------------------------------- /chapter11/3.missing_values/2.remove_missing_values.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as 
plt 4 | 5 | # Set seed for reproducibility 6 | np.random.seed(42) 7 | 8 | # Generate date range and random stock prices 9 | date_range = pd.date_range(start='2020-01-01', end='2023-12-31', freq='B') 10 | n = len(date_range) 11 | data = { 12 | 'open': np.random.uniform(100, 200, n), 13 | 'high': np.random.uniform(200, 300, n), 14 | 'low': np.random.uniform(50, 100, n), 15 | 'close': np.random.uniform(100, 200, n) 16 | } 17 | df = pd.DataFrame(data, index=date_range) 18 | 19 | # Introduce random NaN values in 'close' and 'open' columns 20 | nan_indices_close = np.random.choice(df.index, size=50, replace=False) 21 | nan_indices_open = np.random.choice(df.index, size=50, replace=False) 22 | df.loc[nan_indices_close, 'close'] = np.nan 23 | df.loc[nan_indices_open, 'open'] = np.nan 24 | 25 | # Display the first few rows of the DataFrame 26 | print("Initial DataFrame with Missing Values:\n", df.head()) 27 | 28 | # Step 1: Checking for NaNs or Null Values in columns 29 | missing_values = df.isnull().sum() 30 | print("\nMissing Values in Each Column:\n", missing_values) 31 | 32 | # Print percentage of missing values in each column 33 | missing_percentage = (missing_values / len(df)) * 100 34 | print("\nPercentage of Missing Values in Each Column:\n", missing_percentage) 35 | 36 | # Print the number of rows before dropping NaN values 37 | print(f"\nNumber of rows before dropping NaN values: {len(df)}") 38 | 39 | # Step 2: Drop rows with NaN values 40 | df_cleaned = df.dropna() 41 | 42 | # Print the number of rows after dropping NaN values 43 | print(f"\nNumber of rows after dropping NaN values: {len(df_cleaned)}") 44 | 45 | # Print percentage of missing values after dropping NaN values 46 | cleaned_missing_values = df_cleaned.isnull().sum() 47 | cleaned_missing_percentage = (cleaned_missing_values / len(df_cleaned)) * 100 48 | print("\nPercentage of Missing Values After Dropping Rows:\n", cleaned_missing_percentage) 49 | 50 | # Plotting original data with NaN values 51 | plt.figure(figsize=(14, 7)) 52 | plt.plot(df.index, df['close'], marker='o', linestyle='-', label='Original Closing Price', color='blue', alpha=0.5) 53 | 54 | # Highlight points with NaN values in the original dataset 55 | nan_dates_close = df.index[df['close'].isnull()] 56 | nan_dates_open = df.index[df['open'].isnull()] 57 | 58 | # Use 'x' marker for the points to be dropped 59 | plt.scatter(nan_dates_close, [df['close'].mean()] * len(nan_dates_close), color='orange', label='NaN Values in Close (To be Dropped)', marker='x', zorder=5) 60 | plt.scatter(nan_dates_open, [df['close'].mean()] * len(nan_dates_open), color='red', label='NaN Values in Open (To be Dropped)', marker='x', zorder=5) 61 | 62 | plt.title('Original Daily Closing Prices with NaN Values Highlighted') 63 | plt.xlabel('Date') 64 | plt.ylabel('Closing Price') 65 | plt.legend() 66 | plt.grid(True) 67 | plt.show() 68 | 69 | # Plotting cleaned data after dropping rows with NaN values 70 | plt.figure(figsize=(14, 7)) 71 | plt.plot(df_cleaned.index, df_cleaned['close'], marker='o', linestyle='-', label='Cleaned Closing Price', color='green') 72 | 73 | plt.title('Cleaned Daily Closing Prices After Dropping NaN Values') 74 | plt.xlabel('Date') 75 | plt.ylabel('Closing Price') 76 | plt.legend() 77 | plt.grid(True) 78 | plt.show() 79 | -------------------------------------------------------------------------------- /chapter11/3.missing_values/3.back_forward_fill.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | # Set seed for reproducibility 5 | np.random.seed(42) 6 | 7 | # Generate date
range and random stock prices 8 | date_range = pd.date_range(start='2020-01-01', end='2023-12-31', freq='B') 9 | n = len(date_range) 10 | data = { 11 | 'open': np.random.uniform(100, 200, n), 12 | 'high': np.random.uniform(200, 300, n), 13 | 'low': np.random.uniform(50, 100, n), 14 | 'close': np.random.uniform(100, 200, n) 15 | } 16 | df = pd.DataFrame(data, index=date_range) 17 | 18 | # Introduce random NaN values in 'close' and 'open' columns 19 | nan_indices_close = np.random.choice(df.index, size=50, replace=False) 20 | nan_indices_open = np.random.choice(df.index, size=50, replace=False) 21 | df.loc[nan_indices_close, 'close'] = np.nan 22 | df.loc[nan_indices_open, 'open'] = np.nan 23 | 24 | # Fill NaN values using forward fill and backward fill 25 | df['close_ffill'] = df['close'].ffill() # Forward Fill 26 | df['close_bfill'] = df['close'].bfill() # Backward Fill 27 | 28 | # Display the entire DataFrame including original and filled values 29 | print("Complete DataFrame with Original and Filled Values:\n") 30 | print(df[['open', 'close', 'close_ffill', 'close_bfill']].head(20)) # Show first 20 rows 31 | -------------------------------------------------------------------------------- /chapter11/3.missing_values/4.interpolation.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | # Set seed for reproducibility 6 | np.random.seed(42) 7 | 8 | # Generate date range and random stock prices 9 | date_range = pd.date_range(start='2020-01-01', end='2023-12-31', freq='B') 10 | n = len(date_range) 11 | data = { 12 | 'open': np.random.uniform(100, 200, n), 13 | 'high': np.random.uniform(200, 300, n), 14 | 'low': np.random.uniform(50, 100, n), 15 | 'close': np.random.uniform(100, 200, n) 16 | } 17 | df = pd.DataFrame(data, index=date_range) 18 | 19 | # Introduce random NaN values in 'close' and 'open' columns 20 | nan_indices_close = np.random.choice(df.index, size=50, replace=False) 21 | nan_indices_open = np.random.choice(df.index, size=50, replace=False) 22 | df.loc[nan_indices_close, 'close'] = np.nan 23 | df.loc[nan_indices_open, 'open'] = np.nan 24 | 25 | # Interpolation 26 | # Linear Interpolation 27 | df['close_linear'] = df['close'].interpolate(method='linear') 28 | 29 | # Polynomial Interpolation 30 | df['close_poly'] = df['close'].interpolate(method='polynomial', order=3) 31 | 32 | # Spline Interpolation 33 | df['close_spline'] = df['close'].interpolate(method='spline', order=3) 34 | 35 | print(df.head(30)) 36 | 37 | # Function to plot and highlight filled values 38 | def plot_filled(ax, original, filled, label, color): 39 | ax.plot(filled, label=label, linestyle='-', color=color) 40 | filled_values = filled[original.isna()] 41 | ax.plot(filled_values.index, filled_values, 'o', color=color, markersize=5) 42 | ax.legend() 43 | 44 | # Plot the results in separate subplots 45 | fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(14, 18), sharex=True) 46 | 47 | 48 | # Linear Interpolation 49 | plot_filled(axes[0], df['close'], df['close_linear'], 'Linear Interpolation', 'purple') 50 | axes[0].set_title('Linear Interpolation') 51 | 52 | # Polynomial Interpolation 53 | plot_filled(axes[1], df['close'], df['close_poly'], 'Polynomial Interpolation', 'orange') 54 | axes[1].set_title('Polynomial Interpolation') 55 | 56 | # Spline Interpolation 57 | plot_filled(axes[2], df['close'], df['close_spline'], 'Spline Interpolation', 'brown') 58 | axes[2].set_title('Spline 
Interpolation') 59 | 60 | # Set common labels 61 | plt.xlabel('Date') 62 | fig.supylabel('Stock Price (Close)') 63 | 64 | plt.tight_layout() 65 | plt.show() 66 | -------------------------------------------------------------------------------- /chapter11/4.analisis/autocorrelation.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from statsmodels.tsa.arima.model import ARIMA 5 | from statsmodels.graphics.tsaplots import plot_acf, plot_pacf 6 | 7 | # Set seed for reproducibility 8 | np.random.seed(42) 9 | 10 | # Generate date range and random stock prices 11 | date_range = pd.date_range(start='2020-01-01', end='2023-12-31', freq='B') 12 | n = len(date_range) 13 | data = { 14 | 'open': np.random.uniform(100, 200, n), 15 | 'high': np.random.uniform(200, 300, n), 16 | 'low': np.random.uniform(50, 100, n), 17 | 'close': np.random.uniform(100, 200, n) 18 | } 19 | df = pd.DataFrame(data, index=date_range) 20 | 21 | # Plot ACF and PACF 22 | plt.figure(figsize=(14, 6)) 23 | 24 | # ACF plot 25 | plt.subplot(1, 2, 1) 26 | plot_acf(df['close'].dropna(), lags=40, ax=plt.gca()) 27 | plt.title('Autocorrelation Function (ACF)') 28 | 29 | # PACF plot 30 | plt.subplot(1, 2, 2) 31 | plot_pacf(df['close'].dropna(), lags=40, ax=plt.gca()) 32 | plt.title('Partial Autocorrelation Function (PACF)') 33 | 34 | plt.tight_layout() 35 | plt.show() 36 | -------------------------------------------------------------------------------- /chapter11/5.outliers/1.seasonal_decomposition.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from statsmodels.tsa.seasonal import seasonal_decompose 5 | from scipy.stats import zscore 6 | 7 | # Set seed for reproducibility 8 | np.random.seed(42) 9 | 10 | # Generate date range and random stock prices 11 | date_range = pd.date_range(start='2020-01-01', end='2023-12-31', freq='B') 12 | n = len(date_range) 13 | data = { 14 | 'open': np.random.uniform(100, 200, n), 15 | 'high': np.random.uniform(200, 300, n), 16 | 'low': np.random.uniform(50, 100, n), 17 | 'close': np.random.uniform(100, 200, n) 18 | } 19 | df = pd.DataFrame(data, index=date_range) 20 | 21 | 22 | # Introduce more aggressive outliers in the 'close' column 23 | outlier_indices = np.random.choice(df.index, size=10, replace=False) 24 | df.loc[outlier_indices[:5], 'close'] = df['close'] * 1.5 # Increase by 50% 25 | df.loc[outlier_indices[5:], 'close'] = df['close'] * 0.5 # Decrease by 50% 26 | 27 | # Decompose the series 28 | result = seasonal_decompose(df['close'], model='additive', period=252, extrapolate_trend='freq') 29 | 30 | # Add decomposed components to DataFrame 31 | df['trend'] = result.trend 32 | df['seasonal'] = result.seasonal 33 | df['residual'] = result.resid 34 | 35 | # Calculate Z-scores of residuals to identify outliers 36 | df['resid_z'] = zscore(df['residual'].dropna()) 37 | 38 | # Identify outliers (Z-score threshold set to 3) 39 | outliers = df[np.abs(df['resid_z']) > 3] 40 | 41 | # Handling outliers by replacing them with the median of the residuals 42 | median_resid = df['residual'].median() 43 | df.loc[outliers.index, 'close'] = df['close'].median() 44 | 45 | # Print the DataFrame to understand the numbers 46 | print(df[['close', 'close', 'trend', 'seasonal', 'residual', 'resid_z']].head(20)) 47 | 48 | # Plot the decomposed components 49 | fig, axes = plt.subplots(4, 
1, figsize=(14, 18), sharex=True) 50 | 51 | result.observed.plot(ax=axes[0], title='Observed', color='blue') 52 | result.trend.plot(ax=axes[1], title='Trend', color='orange') 53 | result.seasonal.plot(ax=axes[2], title='Seasonal', color='green') 54 | result.resid.plot(ax=axes[3], title='Residual', color='red') 55 | 56 | plt.tight_layout() 57 | plt.show() 58 | -------------------------------------------------------------------------------- /chapter11/5.outliers/2.autocorrelation.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from statsmodels.tsa.arima.model import ARIMA 5 | from statsmodels.graphics.tsaplots import plot_acf, plot_pacf 6 | 7 | # Set seed for reproducibility 8 | np.random.seed(42) 9 | 10 | # Generate date range and random stock prices 11 | date_range = pd.date_range(start='2020-01-01', end='2023-12-31', freq='B') 12 | n = len(date_range) 13 | data = { 14 | 'open': np.random.uniform(100, 200, n), 15 | 'high': np.random.uniform(200, 300, n), 16 | 'low': np.random.uniform(50, 100, n), 17 | 'close': np.random.uniform(100, 200, n) 18 | } 19 | df = pd.DataFrame(data, index=date_range) 20 | 21 | # Introduce more aggressive outliers in the 'close' column 22 | outlier_indices = np.random.choice(df.index, size=10, replace=False) 23 | df.loc[outlier_indices[:5], 'close'] = df['close'] * 1.5 # Increase by 50% 24 | df.loc[outlier_indices[5:], 'close'] = df['close'] * 0.5 # Decrease by 50% 25 | 26 | 27 | # Plot ACF and PACF 28 | plt.figure(figsize=(14, 6)) 29 | 30 | # ACF plot 31 | plt.subplot(1, 2, 1) 32 | plot_acf(df['close'].dropna(), lags=40, ax=plt.gca()) 33 | plt.title('Autocorrelation Function (ACF)') 34 | 35 | # PACF plot 36 | plt.subplot(1, 2, 2) 37 | plot_pacf(df['close'].dropna(), lags=40, ax=plt.gca()) 38 | plt.title('Partial Autocorrelation Function (PACF)') 39 | 40 | plt.tight_layout() 41 | plt.show() 42 | -------------------------------------------------------------------------------- /chapter11/5.outliers/3.arima.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from statsmodels.tsa.arima.model import ARIMA 5 | from scipy.stats import zscore 6 | 7 | # Set seed for reproducibility 8 | np.random.seed(42) 9 | 10 | # Generate date range and random stock prices 11 | date_range = pd.date_range(start='2020-01-01', end='2023-12-31', freq='B') 12 | n = len(date_range) 13 | data = { 14 | 'open': np.random.uniform(100, 200, n), 15 | 'high': np.random.uniform(200, 300, n), 16 | 'low': np.random.uniform(50, 100, n), 17 | 'close': np.random.uniform(100, 200, n) 18 | } 19 | df = pd.DataFrame(data, index=date_range) 20 | 21 | 22 | # Introduce more aggressive outliers in the 'close' column 23 | outlier_indices = np.random.choice(df.index, size=10, replace=False) 24 | df.loc[outlier_indices[:5], 'close'] = df['close'] * 1.5 # Increase by 50% 25 | df.loc[outlier_indices[5:], 'close'] = df['close'] * 0.5 # Decrease by 50% 26 | 27 | 28 | # Fit ARIMA model to close_filled series 29 | model = ARIMA(df['close'], order=(2, 1, 1)) 30 | results = model.fit() 31 | 32 | # Calculate residuals and Z-scores 33 | df['residuals'] = results.resid 34 | df['residuals_z'] = zscore(df['residuals'].dropna()) 35 | 36 | # Identify outliers based on Z-score threshold (e.g., ±3) 37 | outliers_arima = df[np.abs(df['residuals_z']) > 3] 38 | 39 | # Generate smoothed series from 
ARIMA model 40 | df['arima_smooth'] = results.fittedvalues 41 | 42 | # Plotting the original close_filled and ARIMA smoothed series 43 | plt.figure(figsize=(14, 8)) 44 | plt.plot(df['close'], label='Original Close', color='blue') 45 | plt.plot(df['arima_smooth'], label='ARIMA Smoothed', color='red') 46 | plt.scatter(outliers_arima.index, df.loc[outliers_arima.index, 'close'], color='orange', label='Outliers') 47 | plt.title('ARIMA Smoothing and Outlier Detection') 48 | plt.legend() 49 | plt.show() 50 | 51 | # Print the summary of the model 52 | print(results.summary()) 53 | 54 | # Plot the diagnostics to check model fit 55 | results.plot_diagnostics(figsize=(14, 8)) 56 | plt.show() 57 | -------------------------------------------------------------------------------- /chapter11/5.outliers/4.moving_average.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from sklearn.metrics import mean_absolute_error, mean_squared_error 5 | 6 | # Set seed for reproducibility 7 | np.random.seed(42) 8 | 9 | # Generate date range and random stock prices 10 | date_range = pd.date_range(start='2020-01-01', end='2023-12-31', freq='B') 11 | n = len(date_range) 12 | data = { 13 | 'open': np.random.uniform(100, 200, n), 14 | 'high': np.random.uniform(200, 300, n), 15 | 'low': np.random.uniform(50, 100, n), 16 | 'close': np.random.uniform(100, 200, n) 17 | } 18 | df = pd.DataFrame(data, index=date_range) 19 | 20 | # Introduce more aggressive outliers in the 'close' column 21 | outlier_indices = np.random.choice(df.index, size=10, replace=False) 22 | df.loc[outlier_indices[:5], 'close'] = df['close'] * 1.5 # Increase by 50% 23 | df.loc[outlier_indices[5:], 'close'] = df['close'] * 0.5 # Decrease by 50% 24 | 25 | # Define window size for SMA and span for EMA 26 | window_size = 20 27 | span = 20 28 | 29 | # Calculate Simple Moving Average (SMA) 30 | df['SMA'] = df['close'].rolling(window=window_size, min_periods=1).mean() 31 | 32 | # Calculate Exponential Moving Average (EMA) 33 | df['EMA'] = df['close'].ewm(span=span, adjust=False).mean() 34 | 35 | # Calculate residuals for SMA and EMA 36 | df['SMA_residuals'] = df['close'] - df['SMA'] 37 | df['EMA_residuals'] = df['close'] - df['EMA'] 38 | 39 | # Performance Metrics Calculation 40 | sma_mae = mean_absolute_error(df['close'], df['SMA']) 41 | sma_mse = mean_squared_error(df['close'], df['SMA']) 42 | sma_rmse = np.sqrt(sma_mse) 43 | 44 | ema_mae = mean_absolute_error(df['close'], df['EMA']) 45 | ema_mse = mean_squared_error(df['close'], df['EMA']) 46 | ema_rmse = np.sqrt(ema_mse) 47 | 48 | # Plotting original 'close', SMA, and EMA 49 | plt.figure(figsize=(14, 7)) 50 | plt.plot(df.index, df['close'], label='Original Close Price', marker='o', linestyle='-', color='b') 51 | plt.plot(df.index, df['SMA'], label=f'Simple Moving Average (window={window_size})', linestyle='--', color='r') 52 | plt.plot(df.index, df['EMA'], label=f'Exponential Moving Average (span={span})', linestyle='-.', color='g') 53 | plt.title('Simple vs. 
Exponential Moving Average') 54 | plt.xlabel('Date') 55 | plt.ylabel('Price') 56 | plt.legend() 57 | plt.show() 58 | 59 | # Plotting Performance Metrics 60 | metrics = ['MAE', 'MSE', 'RMSE'] 61 | sma_values = [sma_mae, sma_mse, sma_rmse] 62 | ema_values = [ema_mae, ema_mse, ema_rmse] 63 | 64 | plt.figure(figsize=(10, 6)) 65 | bar_width = 0.35 66 | index = np.arange(len(metrics)) 67 | 68 | plt.bar(index, sma_values, bar_width, label='Simple Moving Average (SMA)', color='b') 69 | plt.bar(index + bar_width, ema_values, bar_width, label='Exponential Moving Average (EMA)', color='g') 70 | 71 | plt.xlabel('Metrics') 72 | plt.ylabel('Value') 73 | plt.title('Performance Metrics: SMA vs. EMA') 74 | plt.xticks(index + bar_width / 2, metrics) 75 | plt.legend() 76 | plt.tight_layout() 77 | plt.show() 78 | -------------------------------------------------------------------------------- /chapter11/6.feature_engineering/1.lags.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from sklearn.metrics import mean_absolute_error, mean_squared_error 5 | 6 | # Set seed for reproducibility 7 | np.random.seed(42) 8 | 9 | # Generate date range and random stock prices 10 | date_range = pd.date_range(start='2020-01-01', end='2023-12-31', freq='B') 11 | n = len(date_range) 12 | data = { 13 | 'open': np.random.uniform(100, 200, n), 14 | 'high': np.random.uniform(200, 300, n), 15 | 'low': np.random.uniform(50, 100, n), 16 | 'close': np.random.uniform(100, 200, n) 17 | } 18 | df = pd.DataFrame(data, index=date_range) 19 | 20 | # Introduce more aggressive outliers in the 'close' column 21 | outlier_indices = np.random.choice(df.index, size=10, replace=False) 22 | df.loc[outlier_indices[:5], 'close'] = df['close'] * 1.5 # Increase by 50% 23 | df.loc[outlier_indices[5:], 'close'] = df['close'] * 0.5 # Decrease by 50% 24 | 25 | # Function to create lagged features 26 | def create_lagged_features(df, column, lags): 27 | for lag in lags: 28 | df[f'{column}_lag_{lag}'] = df[column].shift(lag) 29 | return df 30 | 31 | # Define the lags to create 32 | lags = [5, 10, 20] 33 | 34 | # Create lagged features for 'close' column 35 | df = create_lagged_features(df, 'close', lags) 36 | 37 | # Plotting original 'close' and lagged features in separate subplots 38 | plt.figure(figsize=(14, 10)) 39 | 40 | # First subplot for the original 'close' price 41 | plt.subplot(len(lags) + 1, 1, 1) 42 | plt.plot(df.index, df['close'], label='Original Close Price', linestyle='-', color='b') 43 | plt.title('Original Close Price') 44 | plt.xlabel('Date') 45 | plt.ylabel('Price') 46 | plt.legend() 47 | 48 | # Create additional subplots for each lagged feature 49 | for i, lag in enumerate(lags): 50 | plt.subplot(len(lags) + 1, 1, i + 2) 51 | plt.plot(df.index, df[f'close_lag_{lag}'], label=f'Lag {lag}', linestyle='-', color='b') 52 | plt.title(f'Lag {lag} Feature') 53 | plt.xlabel('Date') 54 | plt.ylabel('Price') 55 | plt.legend() 56 | 57 | plt.tight_layout() # Adjust spacing between plots 58 | plt.show() 59 | 60 | # Explanation of significance of lagged features 61 | print("Explanation of Lagged Features:") 62 | print("- Lagged features, such as Lag 1, Lag 5, Lag 10, and Lag 20, represent historical values of the 'close' price.") 63 | print("- They capture temporal dependencies and autocorrelation present in the data.") 64 | print("- Lagged features are important for predicting future movements based on past behavior.") 65 | print("- 
They help in identifying trends, cycles, and seasonality in time series data.") 66 | -------------------------------------------------------------------------------- /chapter11/6.feature_engineering/2.seasonal_differencing.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from statsmodels.tsa.stattools import adfuller 5 | 6 | # Set seed for reproducibility 7 | np.random.seed(42) 8 | 9 | # Generate date range 10 | date_range = pd.date_range(start='2020-01-01', end='2023-12-31', freq='B') 11 | n = len(date_range) 12 | 13 | # Generate a seasonal pattern 14 | seasonal_pattern = np.sin(np.linspace(0, 3 * np.pi, n)) * 20 15 | 16 | # Generate random stock prices with added seasonal component 17 | data = { 18 | 'open': np.random.uniform(100, 200, n) + seasonal_pattern, 19 | 'high': np.random.uniform(200, 300, n) + seasonal_pattern, 20 | 'low': np.random.uniform(50, 100, n) + seasonal_pattern, 21 | 'close': np.random.uniform(100, 200, n) + seasonal_pattern 22 | } 23 | df = pd.DataFrame(data, index=date_range) 24 | 25 | # Introduce more aggressive outliers in the 'close' column 26 | outlier_indices = np.random.choice(df.index, size=10, replace=False) 27 | df.loc[outlier_indices[:5], 'close'] = df['close'] * 1.5 # Increase by 50% 28 | df.loc[outlier_indices[5:], 'close'] = df['close'] * 0.5 # Decrease by 50% 29 | 30 | 31 | # First Differencing 32 | df['First Difference'] = df['close'].diff() 33 | 34 | # Second Differencing 35 | df['Second Difference'] = df['First Difference'].diff() 36 | 37 | # Seasonal Differencing (weekly seasonality) 38 | df['Seasonal Difference'] = df['close'].diff(5) 39 | 40 | # Plotting the original series and differenced series 41 | plt.figure(figsize=(14, 10)) 42 | 43 | plt.subplot(4, 1, 1) 44 | plt.plot(df.index, df['close'], label='Original Series with Seasonality', color='blue') 45 | plt.title('Original Series with Seasonality') 46 | plt.legend(loc='upper right') 47 | 48 | plt.subplot(4, 1, 2) 49 | plt.plot(df.index, df['First Difference'], label='First Difference', color='orange') 50 | plt.title('First Differencing') 51 | plt.legend(loc='upper right') 52 | 53 | plt.subplot(4, 1, 3) 54 | plt.plot(df.index, df['Second Difference'], label='Second Difference', color='green') 55 | plt.title('Second Differencing') 56 | plt.legend(loc='upper right') 57 | 58 | plt.subplot(4, 1, 4) 59 | plt.plot(df.index, df['Seasonal Difference'], label='Seasonal Differencing (Weekly)', color='red') 60 | plt.title('Seasonal Differencing') 61 | plt.legend(loc='upper right') 62 | 63 | plt.tight_layout() 64 | plt.show() 65 | 66 | # Augmented Dickey-Fuller Test 67 | def adf_test(series, title=''): 68 | result = adfuller(series.dropna(), autolag='AIC') 69 | print(f'Augmented Dickey-Fuller Test: {title}') 70 | print(f'ADF Statistic: {result[0]}') 71 | print(f'p-value: {result[1]}') 72 | for key, value in result[4].items(): 73 | print(f' {key}: {value}') 74 | print('\n') 75 | 76 | # Perform ADF test on original, first differenced, second differenced, and seasonal differenced series 77 | adf_test(df['close'], title='Original Series') 78 | adf_test(df['close'].diff(), title='First Difference') 79 | adf_test(df['close'].diff().diff(), title='Second Difference') 80 | adf_test(df['Seasonal Difference'], title='Seasonal Differencing') 81 | -------------------------------------------------------------------------------- /chapter12/1.text_cleaning.py: 
-------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | from transformers import BertTokenizer 3 | 4 | # Sample user reviews 5 | reviews = [ 6 | "This product is amazing!", 7 | "The product is good, but it could be better!!!", 8 | "I've never seen such a terrible product. 0/10", 9 | "The product is AWESOME!!! Highly recommended!", 10 | ] 11 | 12 | # a. Removing HTML tags and Special Characters 13 | def clean_html_tags(text): 14 | soup = BeautifulSoup(text, "html.parser") 15 | return soup.get_text() 16 | 17 | # b. Handling Capitalization and Letter Case 18 | def standardize_case(text): 19 | return text.lower() 20 | 21 | # c. Dealing with Numerical Values and Symbols 22 | def remove_numbers_and_symbols(text): 23 | return ''.join(e for e in text if e.isalpha() or e.isspace()) 24 | 25 | # d. Addressing Whitespace and Formatting Issues 26 | def remove_extra_whitespace(text): 27 | return ' '.join(text.split()) 28 | 29 | 30 | # Applying the text preprocessing pipeline 31 | def preprocess_text(text): 32 | text = clean_html_tags(text) 33 | text = standardize_case(text) 34 | text = remove_numbers_and_symbols(text) 35 | text = remove_extra_whitespace(text) 36 | return text 37 | 38 | # Preprocess all reviews 39 | preprocessed_reviews = [preprocess_text(review) for review in reviews] 40 | 41 | print("Original Reviews:") 42 | for review in reviews: 43 | print(f"- {review}") 44 | 45 | print("\nPreprocessed Reviews:") 46 | for preprocessed_review in preprocessed_reviews: 47 | print(f"- {preprocessed_review}") 48 | 49 | -------------------------------------------------------------------------------- /chapter12/10.word_tokenisation.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize import word_tokenize 3 | 4 | # Download the necessary NLTK data (run this once) 5 | nltk.download('punkt') 6 | 7 | # Sample text 8 | text = "The quick brown fox jumps over the lazy dog. It's unaffordable!" 9 | 10 | # Perform word tokenization 11 | word_tokens = word_tokenize(text) 12 | 13 | print("Word tokens:") 14 | print(word_tokens) -------------------------------------------------------------------------------- /chapter12/11.bpe_tokeniser.py: -------------------------------------------------------------------------------- 1 | from tokenizers import Tokenizer 2 | 3 | # Load the pre-trained GPT-2 BPE tokenizer 4 | tokenizer = Tokenizer.from_pretrained("gpt2") 5 | 6 | # Sample text 7 | text = "Tokenization in medical texts can include words like hyperlipidemia.." 8 | 9 | # Tokenize the text 10 | encoding = tokenizer.encode(text) 11 | 12 | # Print the tokens 13 | print("Tokens:", encoding.tokens) 14 | 15 | # Print the token IDs 16 | print("Token IDs:", encoding.ids) 17 | 18 | # Decode the token IDs back to text 19 | decoded_text = tokenizer.decode(encoding.ids) 20 | print("Decoded Text:", decoded_text) -------------------------------------------------------------------------------- /chapter12/12.tokenisation_wordpiece.py: -------------------------------------------------------------------------------- 1 | from transformers import BertTokenizer 2 | 3 | # Load the pre-trained tokenizer 4 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 5 | 6 | # Sample text 7 | text = "Tokenization in medical texts can include words like hyperlipidemia." 
8 | 9 | 10 | # Tokenize the text 11 | tokens = tokenizer.tokenize(text) 12 | print("Tokens:", tokens) 13 | 14 | # Convert tokens to input IDs 15 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 16 | print("Input IDs:", input_ids) -------------------------------------------------------------------------------- /chapter12/13.specialised_tokenisers.py: -------------------------------------------------------------------------------- 1 | import stanza 2 | from transformers import GPT2Tokenizer, GPT2LMHeadModel 3 | from collections import Counter 4 | import numpy as np 5 | import torch 6 | 7 | # Initialize Stanza for biomedical text 8 | stanza.download('en', package='mimic', processors='tokenize') 9 | nlp = stanza.Pipeline('en', package='mimic', processors='tokenize') 10 | 11 | # Initialize standard GPT-2 tokenizer 12 | standard_tokenizer = GPT2Tokenizer.from_pretrained("gpt2") 13 | standard_tokenizer.pad_token = standard_tokenizer.eos_token # Set pad_token to eos_token 14 | model = GPT2LMHeadModel.from_pretrained("gpt2") 15 | model.config.pad_token_id = model.config.eos_token_id # Set pad_token_id for the model 16 | 17 | # Sample medical corpus 18 | corpus = [ 19 | "The patient suffered a myocardial infarction.", 20 | "Early detection of heart attack is crucial.", 21 | "Treatment for myocardial infarction includes medication.", 22 | "Patients with heart conditions require regular check-ups.", 23 | "Myocardial infarction can lead to severe complications." 24 | ] 25 | 26 | def stanza_tokenize(text): 27 | doc = nlp(text) 28 | tokens = [word.text for sent in doc.sentences for word in sent.words] 29 | return tokens 30 | 31 | def calculate_oov_and_compression(corpus, tokenizer): 32 | oov_count = 0 33 | total_tokens = 0 34 | all_tokens = [] 35 | 36 | for sentence in corpus: 37 | tokens = tokenizer.tokenize(sentence) if hasattr(tokenizer, 'tokenize') else stanza_tokenize(sentence) 38 | all_tokens.extend(tokens) 39 | total_tokens += len(tokens) 40 | oov_count += tokens.count(tokenizer.oov_token) if hasattr(tokenizer, 'oov_token') else 0 41 | 42 | oov_rate = (oov_count / total_tokens) * 100 if total_tokens > 0 else 0 43 | avg_tokens_per_sentence = total_tokens / len(corpus) 44 | 45 | return oov_rate, avg_tokens_per_sentence, all_tokens 46 | 47 | def analyze_token_utilization(tokens): 48 | token_counts = Counter(tokens) 49 | total_tokens = len(tokens) 50 | utilization = {token: count / total_tokens for token, count in token_counts.items()} 51 | return utilization 52 | 53 | def calculate_perplexity(tokenizer, model, text): 54 | inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True) 55 | with torch.no_grad(): 56 | outputs = model(**inputs, labels=inputs["input_ids"]) 57 | return torch.exp(outputs.loss).item() 58 | 59 | # Evaluation 60 | for tokenizer_name, tokenizer in [("Standard GPT-2", standard_tokenizer), ("Stanza Medical", stanza_tokenize)]: 61 | oov_rate, avg_tokens, all_tokens = calculate_oov_and_compression(corpus, tokenizer) 62 | utilization = analyze_token_utilization(all_tokens) 63 | 64 | print(f"\n{tokenizer_name} Tokenizer:") 65 | print(f"OOV Rate: {oov_rate:.2f}%") 66 | print(f"Average Tokens per Sentence: {avg_tokens:.2f}") 67 | print("Top 5 Most Used Tokens:") 68 | for token, freq in sorted(utilization.items(), key=lambda x: x[1], reverse=True)[:5]: 69 | print(f" {token}: {freq:.2%}") 70 | 71 | 72 | # Example output for "myocardial infarction" 73 | term = "myocardial infarction" 74 | print(f"\nTokenizing '{term}':") 75 | print(f"Standard GPT-2: 
{standard_tokenizer.tokenize(term)}") 76 | print(f"Stanza Medical: {stanza_tokenize(term)}") -------------------------------------------------------------------------------- /chapter12/14.embedding_bert.py: -------------------------------------------------------------------------------- 1 | # Import necessary libraries 2 | from transformers import BertTokenizer, BertModel 3 | import torch 4 | 5 | # Load pre-trained BERT tokenizer and model 6 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 7 | model = BertModel.from_pretrained('bert-base-uncased') 8 | 9 | # Input sentence 10 | sentence = "BERT embeddings are very useful for natural language processing tasks." 11 | 12 | # Tokenize the input sentence 13 | inputs = tokenizer(sentence, return_tensors='pt') 14 | 15 | # Generate embeddings 16 | with torch.no_grad(): 17 | outputs = model(**inputs) 18 | 19 | # Extract the last hidden states (embeddings) 20 | last_hidden_states = outputs.last_hidden_state 21 | 22 | # Print the shape of the embeddings tensor 23 | print("Shape of the embeddings tensor:", last_hidden_states.shape) 24 | 25 | # Print the embeddings for the first token (CLS token) 26 | cls_embedding = last_hidden_states[0, 0, :].numpy() 27 | print("CLS token embedding:", cls_embedding) 28 | 29 | # Print the embeddings for the first word 30 | first_word_embedding = last_hidden_states[0, 1, :].numpy() 31 | print("First word embedding:", first_word_embedding) 32 | -------------------------------------------------------------------------------- /chapter12/15.embedding_bge.py: -------------------------------------------------------------------------------- 1 | from langchain_community.embeddings import HuggingFaceBgeEmbeddings 2 | 3 | # Define the model name and parameters 4 | model_name = "BAAI/bge-small-en" 5 | model_kwargs = {"device": "cpu"} 6 | encode_kwargs = {"normalize_embeddings": True} 7 | 8 | # Initialize the embeddings model 9 | bge_embeddings = HuggingFaceBgeEmbeddings( 10 | model_name=model_name, 11 | model_kwargs=model_kwargs, 12 | encode_kwargs=encode_kwargs 13 | ) 14 | 15 | # Sample sentences to embed 16 | sentences = [ 17 | "The quick brown fox jumps over the lazy dog.", 18 | "I love machine learning and natural language processing." 19 | ] 20 | 21 | # Generate embeddings for each sentence 22 | embeddings = [bge_embeddings.embed_query(sentence) for sentence in sentences] 23 | 24 | # Print the embeddings 25 | for i, embedding in enumerate(embeddings): 26 | print(f"Embedding for sentence {i+1}: {embedding[:5]}...") # Print the first 5 values for brevity 27 | print(f"Length of embedding: {len(embedding)}") 28 | -------------------------------------------------------------------------------- /chapter12/16.embedding_gte.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers import SentenceTransformer 2 | 3 | # Load the GTE-base model 4 | model = SentenceTransformer('thenlper/gte-base') 5 | 6 | # Sample texts to embed 7 | texts = [ 8 | "The quick brown fox jumps over the lazy dog.", 9 | "I love machine learning and natural language processing.", 10 | "Embeddings are useful for many NLP tasks." 
11 | ] 12 | 13 | # Generate embeddings 14 | embeddings = model.encode(texts) 15 | 16 | # Print the shape of the embeddings 17 | print(f"Shape of embeddings: {embeddings.shape}") 18 | 19 | # Print the first few values of the first embedding 20 | print(f"First few values of the first embedding: {embeddings[0][:5]}") 21 | -------------------------------------------------------------------------------- /chapter12/2.punctuation.py: -------------------------------------------------------------------------------- 1 | import string 2 | 3 | # Sample text 4 | text = "I love this product!!! It's amazing!!!" 5 | 6 | 7 | # Option 1: Replace symbols and punctuation 8 | replaced_text = text.translate(str.maketrans(string.punctuation, " " * len(string.punctuation))) 9 | print("Replaced Text:", replaced_text) 10 | 11 | # Option 2: Remove symbols and punctuation 12 | removed_text = "".join(char for char in text if char.isalnum() or char.isspace()) 13 | print("Removed Text:", removed_text) 14 | -------------------------------------------------------------------------------- /chapter12/3.pii_detection.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from presidio_analyzer import AnalyzerEngine 3 | from presidio_anonymizer import AnonymizerEngine 4 | from presidio_anonymizer.entities import OperatorConfig 5 | 6 | # Sample DataFrame 7 | data = { 8 | 'text': [ 9 | "Hello, my name is John Doe. My email is john.doe@example.com", 10 | "Contact Jane Smith at jane.smith@work.com", 11 | "Call her at 987-654-3210.", 12 | "This is a test message without PII." 13 | ] 14 | } 15 | 16 | df = pd.DataFrame(data) 17 | 18 | # Initialize the analyzer and anonymizer engines 19 | analyzer = AnalyzerEngine() 20 | anonymizer = AnonymizerEngine() 21 | 22 | def anonymize_text(text): 23 | """ Anonymize PII entities in text """ 24 | # Analyze the text to detect PII entities 25 | analyzer_results = analyzer.analyze(text=text, entities=["PERSON", "EMAIL_ADDRESS", "PHONE_NUMBER"], language="en") 26 | 27 | # Define the anonymization configuration 28 | operators = { 29 | "PERSON": OperatorConfig("mask", {"masking_char": "*", "chars_to_mask": 4, "from_end": True}), 30 | "EMAIL_ADDRESS": OperatorConfig("mask", {"masking_char": "*", "chars_to_mask": 5, "from_end": True}), 31 | "PHONE_NUMBER": OperatorConfig("mask", {"masking_char": "*", "chars_to_mask": 6, "from_end": True}) 32 | } 33 | 34 | # Anonymize the detected PII entities 35 | anonymized_result = anonymizer.anonymize(text=text, analyzer_results=analyzer_results, operators=operators) 36 | 37 | return anonymized_result.text 38 | 39 | # Apply the anonymization function to the DataFrame 40 | df['anonymized_text'] = df['text'].apply(anonymize_text) 41 | 42 | # Display the DataFrame 43 | print(df['anonymized_text']) -------------------------------------------------------------------------------- /chapter12/4.rare_words.py: -------------------------------------------------------------------------------- 1 | from transformers import GPT2LMHeadModel, GPT2Tokenizer 2 | 3 | # Initialize the GPT-2 tokenizer and model 4 | tokenizer = GPT2Tokenizer.from_pretrained("gpt2") 5 | model = GPT2LMHeadModel.from_pretrained("gpt2") 6 | 7 | # Define a text prompt with a rare word 8 | text = "The quokka, a rare marsupial," 9 | 10 | # Encode the input text to tensor 11 | indexed_tokens = tokenizer.encode(text, return_tensors='pt') 12 | 13 | # Generate text until the output length reaches 50 tokens 14 | output_text = model.generate(indexed_tokens, 
max_length=50, num_beams=5, no_repeat_ngram_size=2, early_stopping=True) 15 | 16 | # Decode the output text 17 | output_text_decoded = tokenizer.decode(output_text[0], skip_special_tokens=True) 18 | print(output_text_decoded) -------------------------------------------------------------------------------- /chapter12/5.spelling_checker.py: -------------------------------------------------------------------------------- 1 | from transformers import pipeline 2 | 3 | def fix_spelling(text): 4 | # Initialize the spelling correction pipeline 5 | spell_check = pipeline("text2text-generation", model="oliverguhr/spelling-correction-english-base") 6 | 7 | # Generate the corrected text 8 | corrected = spell_check(text, max_length=2048)[0]['generated_text'] 9 | 10 | return corrected 11 | 12 | # Test the function with some sample text containing spelling mistakes 13 | sample_text = "y name si from Grece." 14 | corrected_text = fix_spelling(sample_text) 15 | 16 | print("Original text:", sample_text) 17 | print("Corrected text:", corrected_text) -------------------------------------------------------------------------------- /chapter12/6.fuzzy_matching.py: -------------------------------------------------------------------------------- 1 | from transformers import pipeline 2 | from thefuzz import process, fuzz 3 | 4 | def fix_spelling(text, threshold=80): 5 | # Initialize the spelling correction pipeline 6 | spell_check = pipeline("text2text-generation", model="oliverguhr/spelling-correction-english-base") 7 | 8 | # Generate the corrected text 9 | corrected = spell_check(text, max_length=2048)[0]['generated_text'] 10 | 11 | # Split the original and corrected texts into words 12 | original_words = text.split() 13 | corrected_words = corrected.split() 14 | 15 | # Create a dictionary of common English words (you can expand this list) 16 | common_words = set(['the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have', 'I', 'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you', 'do', 'at']) 17 | 18 | # Fuzzy match each word 19 | final_words = [] 20 | for orig, corr in zip(original_words, corrected_words): 21 | if orig.lower() in common_words: 22 | final_words.append(orig) # Keep common words as they are 23 | else: 24 | # Use fuzzy matching to find the best match 25 | matches = process.extractOne(orig, [corr], scorer=fuzz.ratio) 26 | if matches[1] >= threshold: 27 | final_words.append(matches[0]) 28 | else: 29 | final_words.append(orig) # Keep the original word if no good match found 30 | 31 | return ' '.join(final_words) 32 | 33 | # Test the function with some sample text containing spelling mistakes 34 | sample_text = "Lets do a copmarsion of speling mistaks in this sentense." 35 | corrected_text = fix_spelling(sample_text) 36 | 37 | print("Original text:", sample_text) 38 | print("Corrected text:", corrected_text) -------------------------------------------------------------------------------- /chapter12/7.fixed_chunking.py: -------------------------------------------------------------------------------- 1 | # Step 1: Load Example Data 2 | reviews = [ 3 | "This smartphone has an excellent camera. The photos are sharp and the colors are vibrant. Overall, very satisfied with my purchase.", 4 | "I was disappointed with the laptop's performance. It frequently lags and the battery life is shorter than expected.", 5 | "The blender works great for making smoothies. It's powerful and easy to clean. Definitely worth the price.", 6 | "Customer support was unresponsive. 
I had to wait a long time for a reply, and my issue was not resolved satisfactorily.", 7 | "The book is a fascinating read. The storyline is engaging and the characters are well-developed. Highly recommend to all readers." 8 | ] 9 | 10 | # Step 2: Create the TokenTextSplitter 11 | from langchain_text_splitters import TokenTextSplitter 12 | 13 | # Initialize the TokenTextSplitter with a chunk size of 50 tokens and no overlap 14 | text_splitter = TokenTextSplitter(chunk_size=50, chunk_overlap=0) 15 | 16 | # Step 3: Join Reviews and Split Text 17 | # Combine the reviews into a single text block for chunking 18 | text_block = " ".join(reviews) 19 | 20 | # Split the text into token-based chunks 21 | chunks = text_splitter.split_text(text_block) 22 | 23 | # Print the chunks 24 | print("Chunks with 50 tokens each:") 25 | for i, chunk in enumerate(chunks): 26 | print(f"Chunk {i + 1}:") 27 | print(chunk) 28 | print("\n") 29 | 30 | # Step 4: Experiment with Different Chunk Sizes 31 | chunk_sizes = [20, 70, 150] 32 | 33 | for size in chunk_sizes: 34 | print(f"Chunk Size: {size}") 35 | text_splitter = TokenTextSplitter(chunk_size=size, chunk_overlap=0) 36 | chunks = text_splitter.split_text(text_block) 37 | 38 | for i, chunk in enumerate(chunks): 39 | print(f"Chunk {i + 1}:") 40 | print(chunk) 41 | print("\n") 42 | -------------------------------------------------------------------------------- /chapter12/8.paragraph_chunking.py: -------------------------------------------------------------------------------- 1 | from langchain.text_splitter import RecursiveCharacterTextSplitter 2 | 3 | reviews = [ 4 | "This smartphone has an excellent camera. The photos are sharp and the colors are vibrant. Overall, very satisfied with my purchase.", 5 | "I was disappointed with the laptop's performance. It frequently lags and the battery life is shorter than expected.", 6 | "The blender works great for making smoothies. It's powerful and easy to clean. Definitely worth the price.", 7 | "Customer support was unresponsive. I had to wait a long time for a reply, and my issue was not resolved satisfactorily.", 8 | "The book is a fascinating read. The storyline is engaging and the characters are well-developed. Highly recommend to all readers." 9 | ] 10 | 11 | # Combine the reviews into a single text block for chunking 12 | text_block = " ".join(reviews) 13 | 14 | # Create a RecursiveCharacterTextSplitter 15 | text_splitter = RecursiveCharacterTextSplitter( 16 | separators=["\n\n", "\n", " ", ""], 17 | chunk_size=200, 18 | chunk_overlap=0, 19 | length_function=len 20 | ) 21 | 22 | # Split the text into chunks 23 | chunks = text_splitter.split_text(text_block) 24 | 25 | # Print the chunks 26 | for i, chunk in enumerate(chunks, 1): 27 | print(f"Chunk {i}:") 28 | print(chunk.strip()) 29 | print("-" * 50) -------------------------------------------------------------------------------- /chapter12/9.semantic_chunking.py: -------------------------------------------------------------------------------- 1 | from langchain_experimental.text_splitter import SemanticChunker 2 | from langchain_huggingface import HuggingFaceEmbeddings 3 | import os 4 | 5 | reviews = [ 6 | "This smartphone has an excellent camera. The photos are sharp and the colors are vibrant. Overall, very satisfied with my purchase.", 7 | "I was disappointed with the laptop's performance. It frequently lags and the battery life is shorter than expected.", 8 | "The blender works great for making smoothies. It's powerful and easy to clean. 
Definitely worth the price.", 9 | "Customer support was unresponsive. I had to wait a long time for a reply, and my issue was not resolved satisfactorily.", 10 | "The book is a fascinating read. The storyline is engaging and the characters are well-developed. Highly recommend to all readers." 11 | ] 12 | # Combine the reviews into a single text block for chunking 13 | text_block = " ".join(reviews) 14 | 15 | text_splitter = SemanticChunker(HuggingFaceEmbeddings()) 16 | 17 | docs = text_splitter.create_documents([text_block]) 18 | 19 | for i, doc in enumerate(docs): 20 | print(f"Chunk {i + 1}:") 21 | print(doc.page_content) 22 | print("\n") -------------------------------------------------------------------------------- /chapter12/9.semantic_similarity.py: -------------------------------------------------------------------------------- 1 | from langchain_experimental.text_splitter import SemanticChunker 2 | from langchain_huggingface import HuggingFaceEmbeddings 3 | 4 | # Load the HuggingFace embedding model 5 | embedding_model = HuggingFaceEmbeddings(model_name='roberta-base') 6 | 7 | # Create a SemanticChunker with the correct parameters 8 | text_splitter = SemanticChunker( 9 | embeddings=embedding_model, 10 | buffer_size=10, # Hypothetical buffer size 11 | add_start_index=True, # Whether to add start index for chunks 12 | breakpoint_threshold_type='interquartile', # Type of breakpoint threshold 13 | breakpoint_threshold_amount=0.7, # Amount for breakpoint threshold 14 | number_of_chunks=5 # Target number of chunks, optional 15 | ) 16 | 17 | # Example data 18 | reviews = [ 19 | "This smartphone has an excellent camera. The photos are sharp and the colors are vibrant. Overall, very satisfied with my purchase.", 20 | "I was disappointed with the laptop's performance. It frequently lags and the battery life is shorter than expected.", 21 | "The blender works great for making smoothies. It's powerful and easy to clean. Definitely worth the price.", 22 | "Customer support was unresponsive. I had to wait a long time for a reply, and my issue was not resolved satisfactorily.", 23 | "The book is a fascinating read. The storyline is engaging and the characters are well-developed. Highly recommend to all readers." 
24 | ] 25 | 26 | # Combine the reviews into a single text block for chunking 27 | text_block = " ".join(reviews) 28 | 29 | # Split the text into semantic chunks 30 | chunks = text_splitter.split_text(text_block) 31 | 32 | # Print the chunks 33 | for i, chunk in enumerate(chunks, 1): 34 | print(f"Chunk {i}:") 35 | print(chunk.strip()) 36 | print("-" * 50) 37 | -------------------------------------------------------------------------------- /chapter13/2.ocr.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | from PIL import Image 4 | from paddleocr import PaddleOCR, draw_ocr 5 | import matplotlib.pyplot as plt 6 | 7 | # Initialize PaddleOCR 8 | ocr = PaddleOCR(use_angle_cls=True, lang='en') 9 | 10 | # Define the folder containing images 11 | folder_path = 'chapter13/images' 12 | 13 | # Supported image extensions 14 | supported_extensions = ('.png', '.jpg', '.jpeg') 15 | 16 | # Get all images in the folder 17 | image_paths = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.lower().endswith(supported_extensions)] 18 | 19 | # Create an empty DataFrame to store results 20 | df = pd.DataFrame(columns=['Image Path', 'Extracted Text']) 21 | 22 | # Check if there are any images found 23 | if not image_paths: 24 | print("No images found in the specified folder.") 25 | else: 26 | # Function to process images and extract text 27 | def process_image(image_path): 28 | # Perform OCR on the image 29 | result = ocr.ocr(image_path, cls=True) 30 | 31 | # Extracting and printing the text 32 | extracted_text = "" 33 | for line in result[0]: 34 | extracted_text += line[1][0] + " " 35 | print(f"Extracted Text from {os.path.basename(image_path)}:\n{extracted_text}\n") 36 | 37 | # Append results to DataFrame 38 | df.loc[len(df)] = [image_path, extracted_text] 39 | 40 | # Process each image in the folder 41 | for image_path in image_paths: 42 | process_image(image_path) 43 | 44 | # Display the DataFrame 45 | print(df) 46 | 47 | # Optionally, save the DataFrame to a CSV file 48 | df.to_csv('extracted_texts.csv', index=False) 49 | -------------------------------------------------------------------------------- /chapter13/3.ocr_with_llms.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json 3 | import re 4 | from langchain import PromptTemplate, LLMChain 5 | from langchain.llms import HuggingFaceHub 6 | 7 | # Read the CSV file 8 | df = pd.read_csv('extracted_texts.csv') 9 | 10 | # Initialize the Hugging Face model 11 | model_name = "mistralai/Mistral-Nemo-Instruct-2407" # Using Mistral for instruction-following 12 | 13 | # Your Hugging Face API token 14 | api_token = "add_your_token" # Replace with your actual API token 15 | 16 | # LangChain setup with few-shot examples 17 | prompt_template = PromptTemplate( 18 | input_variables=["text"], 19 | template='''Correct the following text for spelling errors and return only the corrected text in lowercase. 
Respond using JSON format, strictly according to the following schema: 20 | {{"corrected_text": "corrected text in lowercase"}} 21 | 22 | Examples: 23 | Input: "Open vs Proprietry LLMs" 24 | Output: {{"corrected_text": "open vs proprietary llms"}} 25 | 26 | Input: "HOW TO MITIGATE SaCURITY RISKS IN AI AND ML SYSTEM VECTOR LAB" 27 | Output: {{"corrected_text": "how to mitigate security risks in ai and ml system vector lab"}} 28 | 29 | Input: "BUILDING DBRX-CLASS CUSTOM LLMS WITH MOSAIC A1 TRAINING VECTOR LAB" 30 | Output: {{"corrected_text": "building dbrx-class custom llms with mosaic a1 training vector lab"}} 31 | 32 | Text to correct: 33 | {text} 34 | Output (JSON format only): 35 | ''' 36 | ) 37 | 38 | huggingface_llm = HuggingFaceHub(repo_id=model_name, huggingfacehub_api_token=api_token, model_kwargs={"task": "text-generation"}) 39 | llm_chain = LLMChain(prompt=prompt_template, llm=huggingface_llm) 40 | 41 | def correct_text(text): 42 | # Use the LLMChain to generate a response 43 | response = llm_chain.run(text) 44 | print(f"Raw Response: {response}") # Debugging line to see the raw response 45 | 46 | # Use regex to extract the JSON part that follows "Output (JSON format only):" 47 | json_match = re.search(r'Output \(JSON format only\):\s*(\{.*\})', response) 48 | if json_match: 49 | json_str = json_match.group(1) 50 | try: 51 | response_json = json.loads(json_str) 52 | corrected_text = response_json.get('corrected_text', '') 53 | return corrected_text 54 | except json.JSONDecodeError as json_error: 55 | print(f"JSON Decode Error: {json_error}") 56 | return "error" 57 | else: 58 | print("No valid JSON object found in the response") 59 | return "error" 60 | 61 | # Apply text correction to the 'Extracted Text' column 62 | df['Corrected Text'] = df['Extracted Text'].apply(correct_text) 63 | 64 | # Display the DataFrame 65 | print(df) 66 | 67 | # Optionally, save the updated DataFrame to a new CSV file 68 | df.to_csv('cleaned_texts.csv', index=False) 69 | 70 | # Print examples of corrections 71 | for _, row in df.iterrows(): 72 | print("Original:", row['Extracted Text']) 73 | print("Corrected:", row['Corrected Text']) 74 | print() 75 | -------------------------------------------------------------------------------- /chapter13/4.image_captioning.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | from PIL import Image 4 | import matplotlib.pyplot as plt 5 | from transformers import BlipProcessor, BlipForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM 6 | from langchain import PromptTemplate, LLMChain 7 | from langchain.llms import HuggingFaceHub 8 | 9 | # Define the folder containing images 10 | folder_path = 'chapter13/images' 11 | 12 | # Supported image extensions 13 | supported_extensions = ('.png', '.jpg', '.jpeg') 14 | 15 | # Get all images in the folder 16 | image_paths = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.lower().endswith(supported_extensions)] 17 | 18 | # Create an empty DataFrame to store results 19 | df = pd.DataFrame(columns=['Image Path', 'Generated Caption', 'Refined Caption']) 20 | 21 | # Initialize the BLIP model and processor for image captioning 22 | blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base") 23 | blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base") 24 | 25 | # Initialize the LLM for text refinement 26 | llm_model_name = "google/flan-t5-small" # You can 
choose other models as well 27 | tokenizer = AutoTokenizer.from_pretrained(llm_model_name) 28 | model = AutoModelForSeq2SeqLM.from_pretrained(llm_model_name) 29 | 30 | # LangChain setup 31 | api_token = "" 32 | prompt_template = PromptTemplate(input_variables=["text"], template="Refine and correct the following caption: {text}") 33 | huggingface_llm = HuggingFaceHub(repo_id=llm_model_name, huggingfacehub_api_token=api_token) 34 | llm_chain = LLMChain(prompt=prompt_template, llm=huggingface_llm) 35 | 36 | def refine_caption(caption): 37 | # Run the LLMChain on the raw caption; the chain applies prompt_template itself, 38 | # so formatting the prompt manually here would wrap the instruction twice 39 | refined_caption = llm_chain.run(caption) 40 | return refined_caption 41 | 42 | def generate_caption(image_path): 43 | image = Image.open(image_path).convert("RGB") 44 | inputs = blip_processor(images=image, return_tensors="pt") 45 | outputs = blip_model.generate(**inputs) 46 | caption = blip_processor.decode(outputs[0], skip_special_tokens=True) 47 | return caption 48 | 49 | # Process each image in the folder 50 | if not image_paths: 51 | print("No images found in the specified folder.") 52 | else: 53 | for image_path in image_paths: 54 | # Generate image caption 55 | caption = generate_caption(image_path) 56 | print(f"Generated Caption for {os.path.basename(image_path)}:\n{caption}\n") 57 | 58 | # Refine the caption 59 | refined_caption = refine_caption(caption) 60 | print(f"Refined Caption:\n{refined_caption}\n") 61 | 62 | # Append results to DataFrame 63 | df.loc[len(df)] = [image_path, caption, refined_caption] 64 | 65 | # Display the DataFrame 66 | print(df) 67 | 68 | # Optionally, save the DataFrame to a CSV file 69 | df.to_csv('captions.csv', index=False) 70 | -------------------------------------------------------------------------------- /chapter13/5.whisper.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import WhisperProcessor, WhisperForConditionalGeneration 3 | import librosa 4 | 5 | # Load the Whisper processor and model from Hugging Face 6 | processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2") 7 | model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2") 8 | 9 | # Define the path to your audio file 10 | audio_path = "chapter13/audio/3.chain orchestrator.mp3" # Replace with your actual audio file path 11 | 12 | # Load the audio file 13 | audio, rate = librosa.load(audio_path, sr=16000) 14 | 15 | # Preprocess the audio file for the Whisper model 16 | input_features = processor(audio, sampling_rate=rate, return_tensors="pt").input_features 17 | 18 | # Generate the transcription 19 | with torch.no_grad(): 20 | predicted_ids = model.generate(input_features) 21 | 22 | # Decode the generated transcription 23 | transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0] 24 | 25 | # Print the transcribed text 26 | print("Transcribed Text:") 27 | print(transcription) 28 | -------------------------------------------------------------------------------- /chapter13/6.emotion_detection.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pandas as pd 3 | from transformers import WhisperProcessor, WhisperForConditionalGeneration, AutoModelForSequenceClassification, AutoTokenizer 4 | import librosa 5 | import numpy as np 6 | 7 | # Load the Whisper processor and model from Hugging Face 8 | whisper_processor = 
WhisperProcessor.from_pretrained("openai/whisper-large-v2") 9 | whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2") 10 | 11 | # Load the emotion detection processor and model from Hugging Face 12 | emotion_model_name = "j-hartmann/emotion-english-distilroberta-base" 13 | emotion_tokenizer = AutoTokenizer.from_pretrained(emotion_model_name) 14 | emotion_model = AutoModelForSequenceClassification.from_pretrained(emotion_model_name) 15 | 16 | # Define the path to your audio file 17 | audio_path = "chapter13/audio/3.chain orchestrator.mp3" # Replace with your actual audio file path 18 | 19 | # Load the audio file 20 | audio, rate = librosa.load(audio_path, sr=16000) 21 | 22 | # Function to split audio into chunks 23 | def split_audio(audio, rate, chunk_duration=30): 24 | chunk_length = int(rate * chunk_duration) 25 | num_chunks = int(np.ceil(len(audio) / chunk_length)) 26 | return [audio[i*chunk_length:(i+1)*chunk_length] for i in range(num_chunks)] 27 | 28 | # Function to transcribe audio to text using Whisper 29 | def transcribe_audio(audio_chunk, rate): 30 | # Preprocess the audio file for the Whisper model 31 | input_features = whisper_processor(audio_chunk, sampling_rate=rate, return_tensors="pt").input_features 32 | 33 | # Generate the transcription 34 | with torch.no_grad(): 35 | predicted_ids = whisper_model.generate(input_features) 36 | 37 | # Decode the generated transcription 38 | transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0] 39 | return transcription 40 | 41 | # Function to detect emotions from text using the emotion detection model 42 | def detect_emotion(text): 43 | inputs = emotion_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512) 44 | outputs = emotion_model(**inputs) 45 | predicted_class_id = torch.argmax(outputs.logits, dim=-1).item() 46 | emotions = emotion_model.config.id2label 47 | return emotions[predicted_class_id] 48 | 49 | # Split audio into chunks 50 | audio_chunks = split_audio(audio, rate, chunk_duration=30) # 30-second chunks 51 | 52 | # Create a DataFrame to store results 53 | df = pd.DataFrame(columns=['Chunk Index', 'Transcription', 'Emotion']) 54 | 55 | # Process each audio chunk 56 | for i, audio_chunk in enumerate(audio_chunks): 57 | transcription = transcribe_audio(audio_chunk, rate) 58 | emotion = detect_emotion(transcription) 59 | 60 | # Append results to DataFrame 61 | df.loc[i] = [i, transcription, emotion] 62 | print(f"Processed Chunk {i+1}/{len(audio_chunks)}") 63 | 64 | # Display the DataFrame 65 | print(df) 66 | 67 | # Optionally, save the DataFrame to a CSV file 68 | df.to_csv('transcriptions_with_emotions.csv', index=False) 69 | -------------------------------------------------------------------------------- /chapter13/7.write_highlights.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pandas as pd 3 | from transformers import WhisperProcessor, WhisperForConditionalGeneration 4 | import librosa 5 | import numpy as np 6 | from langchain.prompts import PromptTemplate 7 | from langchain.chains import LLMChain 8 | from langchain.llms import HuggingFaceHub 9 | 10 | # Load the Whisper processor and model from Hugging Face 11 | whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2") 12 | whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2") 13 | 14 | # Initialize the Hugging Face model 15 | model_name = 
"mistralai/Mistral-Nemo-Instruct-2407" # Using Mistral for instruction-following 16 | 17 | # Your Hugging Face API token 18 | api_token = "add_your_huggigng_face_token" # Replace with your actual API token 19 | 20 | # LangChain setup with few-shot examples 21 | prompt_template = PromptTemplate( 22 | input_variables=["text"], 23 | template='''This is the transcribed text from a YouTube video. Write the key highlights from this video in bullet format. 24 | {text} 25 | Output: 26 | ''' 27 | ) 28 | 29 | huggingface_llm = HuggingFaceHub(repo_id=model_name, huggingfacehub_api_token=api_token, model_kwargs={"task": "text-generation"}) 30 | llm_chain = LLMChain(prompt=prompt_template, llm=huggingface_llm) 31 | 32 | # Define the path to your audio file 33 | audio_path = "chapter13/audio/3.chain orchestrator.mp3" # Replace with your actual audio file path 34 | 35 | # Load the audio file 36 | audio, rate = librosa.load(audio_path, sr=16000) 37 | 38 | # Function to split audio into chunks 39 | def split_audio(audio, rate, chunk_duration=30): 40 | chunk_length = int(rate * chunk_duration) 41 | num_chunks = int(np.ceil(len(audio) / chunk_length)) 42 | return [audio[i*chunk_length:(i+1)*chunk_length] for i in range(num_chunks)] 43 | 44 | # Function to transcribe audio to text using Whisper 45 | def transcribe_audio(audio_chunk, rate): 46 | # Preprocess the audio file for the Whisper model 47 | input_features = whisper_processor(audio_chunk, sampling_rate=rate, return_tensors="pt").input_features 48 | 49 | # Generate the transcription 50 | with torch.no_grad(): 51 | predicted_ids = whisper_model.generate(input_features) 52 | 53 | # Decode the generated transcription 54 | transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0] 55 | return transcription 56 | 57 | # Function to generate key highlights from text using the LLM 58 | def generate_highlights(text): 59 | try: 60 | response = llm_chain.run(text) 61 | return response.strip() # Clean up any whitespace around the response 62 | except Exception as e: 63 | print(f"Error generating highlights: {e}") 64 | return "error" # Handle errors gracefully 65 | 66 | # Split audio into chunks 67 | audio_chunks = split_audio(audio, rate, chunk_duration=30) # 30-second chunks 68 | 69 | # Transcribe each audio chunk 70 | transcriptions = [transcribe_audio(chunk, rate) for chunk in audio_chunks] 71 | 72 | # Join all transcriptions into a single text 73 | full_transcription = " ".join(transcriptions) 74 | 75 | # Generate highlights from the full transcription 76 | highlights = generate_highlights(full_transcription) 77 | 78 | # Create a DataFrame to store results 79 | df = pd.DataFrame(columns=['Full Transcription', 'Highlights']) 80 | df.loc[0] = [full_transcription, highlights] 81 | 82 | # Display the DataFrame 83 | print(df) 84 | 85 | # Optionally, save the DataFrame to a CSV file 86 | df.to_csv('transcriptions_with_highlights.csv', index=False) 87 | 88 | # Print examples of corrections 89 | print("Full Transcription:", full_transcription) 90 | print("Highlights:", highlights) 91 | -------------------------------------------------------------------------------- /chapter13/audio/3.chain orchestrator.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/audio/3.chain orchestrator.mp3 
-------------------------------------------------------------------------------- /chapter13/images/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/1.png -------------------------------------------------------------------------------- /chapter13/images/10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/10.png -------------------------------------------------------------------------------- /chapter13/images/11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/11.png -------------------------------------------------------------------------------- /chapter13/images/12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/12.png -------------------------------------------------------------------------------- /chapter13/images/13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/13.png -------------------------------------------------------------------------------- /chapter13/images/14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/14.png -------------------------------------------------------------------------------- /chapter13/images/15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/15.png -------------------------------------------------------------------------------- /chapter13/images/16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/16.png -------------------------------------------------------------------------------- /chapter13/images/17.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/17.png -------------------------------------------------------------------------------- /chapter13/images/18.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/18.png 
-------------------------------------------------------------------------------- /chapter13/images/19.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/19.png -------------------------------------------------------------------------------- /chapter13/images/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/2.png -------------------------------------------------------------------------------- /chapter13/images/20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/20.png -------------------------------------------------------------------------------- /chapter13/images/21.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/21.png -------------------------------------------------------------------------------- /chapter13/images/22.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/22.png -------------------------------------------------------------------------------- /chapter13/images/23.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/23.png -------------------------------------------------------------------------------- /chapter13/images/24.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/24.png -------------------------------------------------------------------------------- /chapter13/images/25.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/25.png -------------------------------------------------------------------------------- /chapter13/images/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/3.png -------------------------------------------------------------------------------- /chapter13/images/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/4.png 
-------------------------------------------------------------------------------- /chapter13/images/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/5.png -------------------------------------------------------------------------------- /chapter13/images/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/6.png -------------------------------------------------------------------------------- /chapter13/images/7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/7.png -------------------------------------------------------------------------------- /chapter13/images/8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/8.png -------------------------------------------------------------------------------- /chapter13/images/9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/9.png --------------------------------------------------------------------------------