├── LICENSE ├── README.md ├── chapter01 ├── 1.batch.py ├── 2.real_time_streaming.py ├── 3.semi_real_time.py ├── 4.work_with_queue.py ├── 5.sql_databases.py ├── 6.no_sql_databases.py └── 7.api.py ├── chapter02 ├── accuracy.py ├── average_timeliness.py ├── completeness.py ├── consistency.py ├── data_compliance.py ├── data_usage.py ├── duplication.py ├── timeliness.py └── uniqueness.py ├── chapter03 ├── great_expectations │ ├── code │ │ ├── 1.data_set_up.py │ │ ├── 2.mock_test_dataset.py │ │ └── 3.with_pandas_profiler.py │ └── great_expectations │ │ ├── checkpoints │ │ └── expect_iris_ckpnt.yml │ │ ├── expectations │ │ └── expect_iris.json │ │ ├── great_expectations.yml │ │ ├── plugins │ │ └── custom_data_docs │ │ │ └── styles │ │ │ └── data_docs_custom_styles.css │ │ └── uncommitted │ │ └── data_docs │ │ └── local_site │ │ ├── expectations │ │ └── expect_iris.html │ │ ├── index.html │ │ └── static │ │ ├── fonts │ │ └── HKGrotesk │ │ │ ├── HKGrotesk-Italic.otf │ │ │ ├── HKGrotesk-LightItalic.otf │ │ │ ├── HKGrotesk-MediumItalic.otf │ │ │ └── HKGrotesk-SemiBoldItalic.otf │ │ ├── images │ │ ├── favicon.ico │ │ ├── glossary_scroller.gif │ │ ├── iterative-dev-loop.png │ │ ├── logo-long-vector.svg │ │ ├── logo-long.png │ │ ├── short-logo-vector.svg │ │ ├── short-logo.png │ │ └── validation_failed_unexpected_values.gif │ │ └── styles │ │ ├── data_docs_custom_styles_template.css │ │ └── data_docs_default_styles.css ├── intoduction │ └── identify_trends.py └── pandas_profiling │ ├── data_profile_report.html │ ├── pandas_profiler.ipynb │ └── pandas_profiler.json ├── chapter04 ├── 1.descriptive_stats.py ├── 2.rename_columns.py ├── 3.dropping_columns.py ├── 4.data_types.py ├── 5.date_time.py ├── 6.format_date.py ├── 7.extract_datetime_components.py ├── 8.time_deltas.py └── 9.time_zones.py ├── chapter05 ├── 1.use_case.py ├── 2.inner_join.py ├── 3.outer_merge.py ├── 4.right_merge.py ├── 5.left_merge.py ├── 6a.manage_duplicates.py ├── 6b.manage_duplicates_validate.py ├── 6c.merge_and_aggregate.py ├── 6d.dmanage_duplicates_concatenation.py ├── 7a.managed_duplicated_columns.py ├── 7b.drop_columns_merge.py ├── 7c.use_keys_merge.py ├── 8a.perfomance_benchmark_set_index.py ├── 8b.performance_benchmark_sort_indexes.py ├── 8c.performance_benchmark_memory.py ├── 9a.concatenate_row_wise.py ├── 9b.reset_index.py └── 9c.concatenate_column_wise.py ├── chapter06 ├── 1.use_case.py ├── 2.groupby_full_example.py ├── 3.apply_axis0.py ├── 4.apply_axis1.py ├── 5.simple_filtering.py └── 6.advanced_filtering.py ├── chapter07 ├── 1.postgressql.py ├── 2.pymongo.py ├── 3.pymongo_expand.py ├── 4a.kafka_producer.py ├── 4b.kafka_consumer.py ├── 5.time_based_partitioning.py ├── 6.geo_partitioning.py ├── 7.hybrid_partitioning.py ├── __pycache__ │ └── pymongo.cpython-312.pyc ├── setup │ ├── cleanup_script.sh │ ├── docker-compose.yml │ └── setup_postgres.sh ├── template_aws_s3.py └── template_bigquery.py ├── chapter08 ├── 1.detect_missing_data.py ├── 10.winsorizing.py ├── 11.data_transformation.py ├── 12.mahalanobis_distance.py ├── 13.clustering.py ├── 14.multivariate_trimming.py ├── 2.delete_missing_data.py ├── 3.mean_imputation.py ├── 4.median_imputation.py ├── 5.indicator_imputation.py ├── 6.outliers_visualisation.py ├── 7.identify_univariate_outliers.py ├── 8.handle_univariate_outliers_deletions.py └── 9.trimming.py ├── chapter09 ├── min_max_scaling.py ├── robust_scaler.py └── zscaler.py ├── chapter10 ├── 1a.label_encoding.py ├── 1b.label_encoding_forced.py ├── 2.one_hot_encoding.py ├── 3.target_encoding.py ├── 
4.frequency_encoding.py └── 5.binary_encoding.py ├── chapter11 ├── 1.decomposing_time_series │ ├── noise.py │ ├── seasonality.py │ └── trend.py ├── 2.types │ ├── multivariate.py │ └── univariate.py ├── 3.missing_values │ ├── 1.identify_missing_values.py │ ├── 2.remove_missing_values.py │ ├── 3.back_forward_fill.py │ └── 4.interpolation.py ├── 4.analisis │ └── autocorrelation.py ├── 5.outliers │ ├── 1.seasonal_decomposition.py │ ├── 2.autocorrelation.py │ ├── 3.arima.py │ └── 4.moving_average.py └── 6.feature_engineering │ ├── 1.lags.py │ └── 2.seasonal_differencing.py ├── chapter12 ├── 1.text_cleaning.py ├── 10.word_tokenisation.py ├── 11.bpe_tokeniser.py ├── 12.tokenisation_wordpiece.py ├── 13.specialised_tokenisers.py ├── 14.embedding_bert.py ├── 15.embedding_bge.py ├── 16.embedding_gte.py ├── 2.punctuation.py ├── 3.pii_detection.py ├── 4.rare_words.py ├── 5.spelling_checker.py ├── 6.fuzzy_matching.py ├── 7.fixed_chunking.py ├── 8.paragraph_chunking.py ├── 9.semantic_chunking.py └── 9.semantic_similarity.py └── chapter13 ├── 1.image_prerpocessing.py ├── 2.ocr.py ├── 3.ocr_with_llms.py ├── 4.image_captioning.py ├── 5.whisper.py ├── 6.emotion_detection.py ├── 7.write_highlights.py ├── audio └── 3.chain orchestrator.mp3 └── images ├── 1.png ├── 10.png ├── 11.png ├── 12.png ├── 13.png ├── 14.png ├── 15.png ├── 16.png ├── 17.png ├── 18.png ├── 19.png ├── 2.png ├── 20.png ├── 21.png ├── 22.png ├── 23.png ├── 24.png ├── 25.png ├── 3.png ├── 4.png ├── 5.png ├── 6.png ├── 7.png ├── 8.png └── 9.png /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /chapter01/1.batch.py: -------------------------------------------------------------------------------- 1 | import time 2 | import random 3 | 4 | # Step 1: Generate Mock Data 5 | def generate_mock_data(num_records): 6 | data = [] 7 | for _ in range(num_records): 8 | record = { 9 | 'id': random.randint(1, 1000), 10 | 'value': random.random() * 100 11 | } 12 | data.append(record) 13 | return data 14 | 15 | # Step 2: Batch Processing 16 | def process_in_batches(data, batch_size): 17 | for i in range(0, len(data), batch_size): 18 | yield data[i:i + batch_size] 19 | 20 | # Step 3: Transform Data 21 | def transform_data(batch): 22 | transformed_batch = [] 23 | for record in batch: 24 | transformed_record = { 25 | 'id': record['id'], 26 | 'value': record['value'], 27 | 'transformed_value': record['value'] * 1.1 # Example transformation 28 | } 29 | transformed_batch.append(transformed_record) 30 | return transformed_batch 31 | 32 | # Step 4: Load Data 33 | def load_data(batch): 34 | for record in batch: 35 | # Simulate loading data into a database 36 | print(f"Loading record into database: {record}") 37 | 38 | # Main Function 39 | def main(): 40 | # Parameters 41 | num_records = 100 # Total number of records to generate 42 | batch_size = 10 # Number of records per batch 43 | 44 | # Generate data 45 | data = generate_mock_data(num_records) 46 | print("Original data:",data) 47 | 48 | # Process and load data in batches 49 | for batch in process_in_batches(data, batch_size): 50 | transformed_batch = transform_data(batch) 51 | print("Batch before loading:") 52 | for record in transformed_batch: 53 | print(record) 54 | load_data(transformed_batch) 55 | time.sleep(1) # Simulate time delay between batches 56 | 57 | if __name__ == "__main__": 58 | main() 59 | -------------------------------------------------------------------------------- /chapter01/2.real_time_streaming.py: -------------------------------------------------------------------------------- 1 | import time 2 | import random 3 | 4 | # Step 1: Generate Mock Data Continuously 5 | def generate_mock_data(): 6 | while True: 7 | record = { 8 | 'id': random.randint(1, 1000), 9 | 'value': random.random() * 100 10 | } 11 | yield record 12 | time.sleep(0.5) # Simulate data arriving every 0.5 seconds 13 | 14 | # Step 2: Stream Processing with a time limit 15 | def process_stream(run_time_seconds=10): 16 | start_time = time.time() 17 | for record in generate_mock_data(): 18 | transformed_record = transform_data(record) 19 | load_data(transformed_record) 20 | 21 | # Check if the run time has exceeded the limit 22 | if time.time() - start_time > run_time_seconds: 23 | print("Time limit reached. 
Terminating the stream processing.") 24 | break 25 | 26 | # Step 3: Transform Data 27 | def transform_data(record): 28 | transformed_record = { 29 | 'id': record['id'], 30 | 'value': record['value'], 31 | 'transformed_value': record['value'] * 1.1 # Example transformation 32 | } 33 | return transformed_record 34 | 35 | # Step 4: Load Data 36 | def load_data(record): 37 | # Simulate loading data into a database 38 | print(f"Loading record into database: {record}") 39 | 40 | # Main Function 41 | def main(): 42 | process_stream(run_time_seconds=10) # Run the stream for 10 seconds 43 | 44 | if __name__ == "__main__": 45 | main() 46 | -------------------------------------------------------------------------------- /chapter01/3.semi_real_time.py: -------------------------------------------------------------------------------- 1 | import time 2 | import random 3 | from collections import deque 4 | 5 | # Step 1: Generate Mock Data Continuously 6 | def generate_mock_data(): 7 | while True: 8 | record = { 9 | 'id': random.randint(1, 1000), 10 | 'value': random.random() * 100 11 | } 12 | yield record 13 | time.sleep(0.1) # Simulate data arriving every 0.1 seconds 14 | 15 | # Step 2: Process Semi-Real-Time 16 | def process_semi_real_time(batch_size, interval): 17 | buffer = deque() 18 | start_time = time.time() 19 | 20 | for record in generate_mock_data(): 21 | buffer.append(record) 22 | 23 | # Check if interval has elapsed or buffer size reached 24 | if (time.time() - start_time) >= interval or len(buffer) >= batch_size: 25 | # Process and clear the buffer 26 | transformed_batch = transform_data(list(buffer)) # Convert deque to list 27 | print(f"Batch of {len(transformed_batch)} records before loading:") 28 | for rec in transformed_batch: 29 | print(rec) 30 | load_data(transformed_batch) 31 | buffer.clear() 32 | start_time = time.time() # Reset start time 33 | 34 | # Step 3: Transform Data 35 | def transform_data(batch): 36 | transformed_batch = [] 37 | for record in batch: 38 | transformed_record = { 39 | 'id': record['id'], 40 | 'value': record['value'], 41 | 'transformed_value': record['value'] * 1.1 # Example transformation 42 | } 43 | transformed_batch.append(transformed_record) 44 | return transformed_batch 45 | 46 | # Step 4: Load Data 47 | def load_data(batch): 48 | for record in batch: 49 | # Simulate loading data into a database 50 | print(f"Loading record into database: {record}") 51 | 52 | # Main Function 53 | def main(): 54 | batch_size = 5 # Number of records to process per batch 55 | interval = 3.0 # Maximum time interval (in seconds) to process a batch 56 | 57 | process_semi_real_time(batch_size, interval) 58 | 59 | if __name__ == "__main__": 60 | main() 61 | -------------------------------------------------------------------------------- /chapter01/4.work_with_queue.py: -------------------------------------------------------------------------------- 1 | from queue import Queue 2 | 3 | def read_message_queue(): 4 | q = Queue() 5 | 6 | # Adding messages to the queue 7 | for i in range(10): # Mocking messages 8 | q.put(f"message {i}") 9 | 10 | # Reading and processing messages from the queue 11 | while not q.empty(): 12 | message = q.get() 13 | process_message(message) 14 | q.task_done() # Signal that the task is done 15 | 16 | def process_message(message): 17 | print(f"Processing message: {message}") 18 | 19 | # Example usage 20 | read_message_queue() 21 | -------------------------------------------------------------------------------- /chapter01/5.sql_databases.py: 
-------------------------------------------------------------------------------- 1 | def read_sql(): 2 | # Simulating a SQL table with a dictionary 3 | sql_table = [ 4 | {"id": 1, "name": "Alice", "age": 30}, 5 | {"id": 2, "name": "Bob", "age": 24}, 6 | ] 7 | for row in sql_table: 8 | process_row(row) 9 | return sql_table 10 | def process_row(row): 11 | print(f"Processing row: id={row['id']}, name={row['name']}, age={row['age']}") 12 | 13 | # Example usage 14 | sql_table = read_sql() # capture the returned table so the summary print below can use it 15 | 16 | print(f"{'id':<5} {'name':<10} {'age':<3}") 17 | print("-" * 20) 18 | # Print each row 19 | for row in sql_table: 20 | print(f"{row['id']:<5} {row['name']:<10} {row['age']:<3}") 21 | -------------------------------------------------------------------------------- /chapter01/6.no_sql_databases.py: -------------------------------------------------------------------------------- 1 | def read_nosql(): 2 | data_store = { 3 | "1": {"name": "Alice", "age": 30}, 4 | "2": {"name": "Bob", "age": 24}, 5 | } 6 | for key, value in data_store.items(): 7 | process_entry(key, value) 8 | 9 | def process_entry(key, value): 10 | print(f"Processing key: {key} with value: {value}") 11 | 12 | # Example usage 13 | read_nosql() 14 | -------------------------------------------------------------------------------- /chapter01/7.api.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import pandas as pd 3 | 4 | # Define the API endpoint URL 5 | url = "https://www.thecocktaildb.com/api/json/v1/1/search.php?s=margarita" 6 | 7 | # Make the API request 8 | response = requests.get(url) 9 | 10 | # Check if the request was successful (status code 200) 11 | if response.status_code == 200: 12 | # Extract the response JSON data 13 | data = response.json() 14 | 15 | # Check if the API response contains cocktails data 16 | if 'drinks' in data: 17 | # Create DataFrame from drinks data 18 | df = pd.DataFrame(data['drinks']) 19 | 20 | # Print the resulting DataFrame 21 | print(df.head()) 22 | else: 23 | print("No drinks found.") 24 | else: 25 | print(f"Failed to retrieve data from API. Status code: {response.status_code}") 26 | -------------------------------------------------------------------------------- /chapter02/accuracy.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Sample dataset 4 | data = { 5 | 'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'], 6 | 'Age': [25, 30, 28, 28, 22], 7 | 'Gender': ['Female', 'Male', 'Male', 'Male', 'Female'], 8 | 'City': ['New York', 'Los Angeles', 'Chicago', 'New York', 'San Francisco'] 9 | } 10 | 11 | # Reference dataset for accuracy comparison 12 | reference_data = { 13 | 'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'], 14 | 'Age': [25, 30, 29, 28, 22], 15 | 'Gender': ['Female', 'Male', 'Male', 'Male', 'Female'], 16 | 'City': ['New York', 'Los Angeles', 'Chicago', 'New York', 'San Francisco'] 17 | } 18 | 19 | df = pd.DataFrame(data) 20 | reference_df = pd.DataFrame(reference_data) 21 | 22 | # Step 1: Import necessary libraries 23 | # We import the pandas library to work with the dataset. 24 | 25 | # Step 2: Create a sample dataset and a reference dataset 26 | # We create a sample dataset and a reference dataset with the same structure.
27 | 28 | # Step 3: Create DataFrames 29 | df = pd.DataFrame(data) 30 | reference_df = pd.DataFrame(reference_data) 31 | 32 | # Step 4: Compare data to the reference 33 | accuracy_check = df == reference_df 34 | 35 | # Step 5: Calculate accuracy percentage 36 | accuracy_percentage = accuracy_check.mean() * 100 37 | # We calculate the accuracy percentage by taking the mean of the accuracy check for each column and multiplying by 100. 38 | 39 | # Step 6: Display the accuracy results 40 | print("Accuracy Check:") 41 | print(accuracy_check) 42 | print("\nAccuracy Percentage:") 43 | print(accuracy_percentage) 44 | -------------------------------------------------------------------------------- /chapter02/average_timeliness.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from datetime import datetime, timedelta 4 | 5 | # Generate a random dataset with timestamps 6 | np.random.seed(0) # For reproducibility 7 | n_samples = 100 8 | start_time = datetime(2023, 10, 25, 9, 0, 0) 9 | end_time = datetime(2023, 10, 25, 16, 0, 0) 10 | 11 | timestamps = [start_time + timedelta(minutes=np.random.randint(0, (end_time - start_time).total_seconds() / 60)) for _ in range(n_samples)] 12 | values = np.random.randint(50, 101, n_samples) 13 | 14 | df = pd.DataFrame({'Timestamp': timestamps, 'Value': values}) 15 | 16 | # Reference timestamp (current time for this example) 17 | reference_timestamp = datetime(2023, 10, 25, 12, 0, 0) 18 | 19 | # Define a timeliness threshold (in minutes) 20 | timeliness_threshold = 30 21 | 22 | # Calculate timeliness 23 | df['Timeliness'] = (reference_timestamp - df['Timestamp']).dt.total_seconds() / 60 24 | df['Timely'] = df['Timeliness'] <= timeliness_threshold 25 | 26 | # Calculate the average timeliness 27 | average_timeliness = df['Timeliness'].mean() 28 | 29 | # Display results 30 | print("Dataset with Timestamps:") 31 | print(df.head()) 32 | 33 | print("\nAverage Timeliness (in minutes):", average_timeliness) 34 | print("Percentage of Timely Records:", (df['Timely'].sum() / n_samples) * 100, "%") 35 | -------------------------------------------------------------------------------- /chapter02/completeness.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Sample dataset 4 | data = { 5 | 'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'], 6 | 'Age': [25, 30, None, 28, 22], 7 | 'Gender': ['Female', 'Male', 'Male', 'Male', 'Female'], 8 | 'City': ['New York', 'Los Angeles', 'Chicago', None, 'San Francisco'] 9 | } 10 | 11 | df = pd.DataFrame(data) 12 | 13 | # Step 1: Import necessary libraries 14 | # We import the pandas library to work with the dataset. 15 | 16 | # Step 2: Create a sample dataset 17 | # We create a simple dataset with columns 'Name', 'Age', 'Gender', and 'City'. Some values are intentionally missing (represented as 'None'). 18 | 19 | # Step 3: Create a DataFrame 20 | df = pd.DataFrame(data) 21 | # We create a DataFrame using the sample data. 22 | 23 | # Step 4: Check completeness 24 | completeness = df.isnull().sum() 25 | # The .isnull() method checks for missing values in the DataFrame, and .sum() counts the missing values for each column. 
26 | 27 | # Step 5: Calculate completeness percentage 28 | total_records = len(df) 29 | completeness_percentage = (1- completeness / total_records) * 100 30 | # We calculate the completeness percentage by dividing the count of missing values by the total number of records and then multiplying by 100. 31 | 32 | # Step 6: Display the completeness results 33 | print("Completeness Check:") 34 | print(completeness) 35 | print("\nCompleteness Percentage:") 36 | print(completeness_percentage) 37 | -------------------------------------------------------------------------------- /chapter02/consistency.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Create a sample dataset 4 | data = { 5 | 'ProductID': [1, 2, 3, 4, 5], 6 | 'ProductName': ['PROD001', 'PROD002', 'Product003', 'PROD004', 'PROD005'], 7 | } 8 | 9 | df = pd.DataFrame(data) 10 | 11 | # Define the expected prefix 12 | expected_prefix = "PROD" 13 | 14 | # Check consistency and create a boolean mask for inconsistent names 15 | inconsistent_mask = ~df['ProductName'].str.startswith(expected_prefix) 16 | 17 | # Create a new column to indicate consistency 18 | df['Consistency'] = ~inconsistent_mask 19 | 20 | # Calculate the percentage of consistent rows 21 | consistent_percentage = (df['Consistency'].sum() / len(df)) * 100 22 | 23 | # Display the dataset with the consistency check results 24 | print("Dataset with Consistency Check:") 25 | print(df) 26 | 27 | # Display the percentage of consistent rows 28 | print(f"Percentage of Consistent Rows: {consistent_percentage:.2f}%") -------------------------------------------------------------------------------- /chapter02/data_compliance.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | # Simulate a dataset with compliance checks 4 | def simulate_data_compliance(num_records): 5 | data_records = [] 6 | compliant_count = 0 # Counter for compliant records 7 | 8 | for _ in range(num_records): 9 | # Generate a random record (e.g., containing age and consent fields) 10 | age = random.randint(18, 100) 11 | consent_given = random.choice([True, False]) 12 | 13 | # Define compliance rules 14 | age_rule = age >= 18 15 | consent_rule = age >= 18 and consent_given 16 | 17 | # Check compliance with specific regulations 18 | age_compliant = "Age Compliant" if age_rule else "Age Non-Compliant" 19 | consent_compliant = "Consent Compliant" if consent_rule else "Consent Non-Compliant" 20 | 21 | # Define overall compliance status 22 | compliance_status = "Compliant" if age_rule and consent_rule else "Non-Compliant" 23 | 24 | # Count compliant records 25 | if compliance_status == "Compliant": 26 | compliant_count += 1 27 | 28 | data_records.append({ 29 | "Age": age, 30 | "Consent Given": consent_given, 31 | "Age Compliance": age_compliant, 32 | "Consent Compliance": consent_compliant, 33 | "Overall Compliance Status": compliance_status 34 | }) 35 | 36 | # Calculate the percentage of compliant records 37 | percentage_compliant = (compliant_count / num_records) * 100 38 | 39 | return data_records, percentage_compliant 40 | 41 | # Define the number of data records to simulate 42 | num_records = 100 43 | 44 | # Simulate data compliance checks 45 | data_records, percentage_compliant = simulate_data_compliance(num_records) 46 | 47 | # Display the results for a sample of data records and the percentage of compliance 48 | sample_size = 10 49 | for record in data_records[:sample_size]: 50 | 
print(record) 51 | 52 | print(f"\nPercentage of Compliant Records: {percentage_compliant:.2f}%") 53 | -------------------------------------------------------------------------------- /chapter02/data_usage.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | # Simulated data usage metrics 4 | def simulate_data_usage(): 5 | # Simulate the number of users in the organization 6 | num_users = 500 7 | 8 | # Simulate data utilization rates for each user (percentage) 9 | data_utilization_rates = [random.uniform(20, 90) for _ in range(num_users)] 10 | 11 | # Simulate the number of data requests or queries made by each user 12 | data_requests = [random.randint(1, 100) for _ in range(num_users)] 13 | 14 | # Calculate the overall data utilization rate for the organization 15 | organization_data_utilization_rate = sum(data_utilization_rates) / num_users 16 | 17 | # Calculate the total number of data requests or queries 18 | total_data_requests = sum(data_requests) 19 | 20 | # Simulate user satisfaction surveys (on a scale of 1 to 5) 21 | user_satisfaction_scores = [random.randint(1, 5) for _ in range(num_users)] 22 | 23 | # Calculate average user satisfaction score 24 | avg_user_satisfaction_score = sum(user_satisfaction_scores) / num_users 25 | 26 | return { 27 | "data_utilization_rates": data_utilization_rates, 28 | "organization_data_utilization_rate": organization_data_utilization_rate, 29 | "data_requests": data_requests, 30 | "total_data_requests": total_data_requests, 31 | "user_satisfaction_scores": user_satisfaction_scores, 32 | "avg_user_satisfaction_score": avg_user_satisfaction_score, 33 | } 34 | 35 | # Run the simulation 36 | data_usage_metrics = simulate_data_usage() 37 | 38 | # Display the results 39 | print("\nOrganization Data Utilization Rate:") 40 | print(f"{data_usage_metrics['organization_data_utilization_rate']:.2f}%") 41 | print("\nTotal Number of Data Requests or Queries:") 42 | print(data_usage_metrics["total_data_requests"]) 43 | print("\nAverage User Satisfaction Score:") 44 | print(f"{data_usage_metrics['avg_user_satisfaction_score']:.2f}") 45 | -------------------------------------------------------------------------------- /chapter02/duplication.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Create a sample dataset with duplicate records 4 | data = { 5 | 'EmployeeID': [101, 102, 103, 101, 104, 105, 102], 6 | 'FirstName': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Bob'], 7 | 'LastName': ['Smith', 'Johnson', 'Brown', 'Davis', 'Lee', 'White', 'Johnson'], 8 | } 9 | 10 | df = pd.DataFrame(data) 11 | 12 | # Check for duplicate records based on the 'EmployeeID' column 13 | duplicated_mask = df.duplicated(subset='EmployeeID', keep='first') 14 | 15 | # Create a new column to indicate duplicate records 16 | df['IsDuplicate'] = duplicated_mask 17 | 18 | # Calculate the percentage of duplicate records 19 | duplicate_percentage = (df['IsDuplicate'].sum() / len(df)) * 100 20 | 21 | # Display the dataset with the duplicate records marked 22 | print("Dataset with Duplicate Records:") 23 | print(df) 24 | 25 | # Display the percentage of duplicate records 26 | print(f"Percentage of Duplicate Records: {duplicate_percentage:.2f}%") 27 | -------------------------------------------------------------------------------- /chapter02/timeliness.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from 
datetime import datetime 3 | 4 | # Sample dataset with timestamps 5 | data = { 6 | 'Timestamp': ['2023-10-25 10:00:00', '2023-10-25 11:00:00', '2023-10-25 12:00:00'], 7 | 'Value': [50, 55, 60] 8 | } 9 | 10 | # Convert the 'Timestamp' column to datetime objects 11 | df = pd.DataFrame(data) 12 | df['Timestamp'] = pd.to_datetime(df['Timestamp']) 13 | 14 | # Reference timestamp (current time for this example) 15 | reference_timestamp = datetime(2023, 10, 25, 12, 30, 0) 16 | 17 | # Step 1: Import necessary libraries and create the dataset 18 | # We import Pandas and the datetime module and create a sample dataset with timestamps. 19 | 20 | # Step 2: Convert timestamps to datetime objects 21 | # We convert the 'Timestamp' column to datetime objects to work with timestamps effectively. 22 | 23 | # Step 3: Define the reference timestamp 24 | # In this example, we set a reference timestamp, which represents the current time. 25 | 26 | # Step 4: Calculate timeliness 27 | timeliness_check = df['Timestamp'] < reference_timestamp 28 | 29 | # Step 5: Display timeliness results 30 | print("Timeliness Check:") 31 | print(timeliness_check) 32 | -------------------------------------------------------------------------------- /chapter02/uniqueness.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Create a sample dataset 4 | data = { 5 | 'Email': ['john.doe@example.com', 'jane.smith@example.com', 'james.doe@example.com', 'susan.brown@example.com'], 6 | } 7 | 8 | df = pd.DataFrame(data) 9 | 10 | # Check uniqueness and create a boolean mask for duplicated email addresses 11 | duplicated_mask = df['Email'].duplicated(keep='first') 12 | 13 | # Create a new column to indicate uniqueness 14 | df['Uniqueness'] = ~duplicated_mask 15 | 16 | # Calculate the percentage of unique records 17 | unique_percentage = (df['Uniqueness'].sum() / len(df)) * 100 18 | 19 | # Display the dataset with the uniqueness check results 20 | print("Dataset with Uniqueness Check:") 21 | print(df) 22 | 23 | # Display the percentage of unique records 24 | print(f"Percentage of Unique Records: {unique_percentage:.2f}%") 25 | -------------------------------------------------------------------------------- /chapter03/great_expectations/code/1.data_set_up.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | # Load the 'iris' dataset from seaborn library 5 | iris_data = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv') 6 | 7 | iris_data.to_csv('../data/iris_data.csv', index=False) 8 | print("File written! :)") 9 | -------------------------------------------------------------------------------- /chapter03/great_expectations/code/2.mock_test_dataset.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | # Load the 'iris' dataset from seaborn library 5 | iris_data = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv') 6 | 7 | # do some transformtions that will fail the expectations 8 | #update values 9 | iris_data['sepal_length'] = 60 10 | 11 | #rename columns 12 | iris_data.rename(columns={'petal_width': 'petal_w'}, inplace=True) 13 | 14 | #write dataframe 15 | iris_data.to_csv('../data/iris_data_test.csv', index=False) 16 | print("File written! 
:)") 17 | -------------------------------------------------------------------------------- /chapter03/great_expectations/code/3.with_pandas_profiler.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from ydata_profiling import ProfileReport 3 | 4 | # Load the 'iris' dataset from seaborn library 5 | iris_data = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv') 6 | 7 | # Then run Pandas Profiling 8 | profile = ProfileReport(iris_data, title="Pandas Profiling Report", explorative=True) 9 | 10 | # And obtain an Expectation Suite from the profile report 11 | suite = profile.to_expectation_suite(suite_name="my_pandas_profiling_suite") 12 | -------------------------------------------------------------------------------- /chapter03/great_expectations/great_expectations/checkpoints/expect_iris_ckpnt.yml: -------------------------------------------------------------------------------- 1 | name: expect_iris_ckpnt 2 | config_version: 1.0 3 | template_name: 4 | module_name: great_expectations.checkpoint 5 | class_name: SimpleCheckpoint 6 | run_name_template: '%Y%m%d-%H%M%S-my-run-name-template' 7 | expectation_suite_name: 8 | batch_request: {} 9 | action_list: 10 | - name: store_validation_result 11 | action: 12 | class_name: StoreValidationResultAction 13 | - name: store_evaluation_params 14 | action: 15 | class_name: StoreEvaluationParametersAction 16 | - name: update_data_docs 17 | action: 18 | class_name: UpdateDataDocsAction 19 | evaluation_parameters: {} 20 | runtime_configuration: {} 21 | validations: 22 | - batch_request: 23 | datasource_name: iris_data.csv 24 | data_connector_name: default_inferred_data_connector_name 25 | data_asset_name: iris_data_test.csv 26 | data_connector_query: 27 | index: -1 28 | expectation_suite_name: expect_iris 29 | profilers: [] 30 | ge_cloud_id: 31 | expectation_suite_ge_cloud_id: 32 | -------------------------------------------------------------------------------- /chapter03/great_expectations/great_expectations/plugins/custom_data_docs/styles/data_docs_custom_styles.css: -------------------------------------------------------------------------------- 1 | /*index page*/ 2 | .ge-index-page-site-name-title {} 3 | .ge-index-page-table-container {} 4 | .ge-index-page-table {} 5 | .ge-index-page-table-profiling-links-header {} 6 | .ge-index-page-table-expectations-links-header {} 7 | .ge-index-page-table-validations-links-header {} 8 | .ge-index-page-table-profiling-links-list {} 9 | .ge-index-page-table-profiling-links-item {} 10 | .ge-index-page-table-expectation-suite-link {} 11 | .ge-index-page-table-validation-links-list {} 12 | .ge-index-page-table-validation-links-item {} 13 | 14 | /*breadcrumbs*/ 15 | .ge-breadcrumbs {} 16 | .ge-breadcrumbs-item {} 17 | 18 | /*navigation sidebar*/ 19 | .ge-navigation-sidebar-container {} 20 | .ge-navigation-sidebar-content {} 21 | .ge-navigation-sidebar-title {} 22 | .ge-navigation-sidebar-link {} 23 | -------------------------------------------------------------------------------- /chapter03/great_expectations/great_expectations/uncommitted/data_docs/local_site/static/fonts/HKGrotesk/HKGrotesk-Italic.otf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter03/great_expectations/great_expectations/uncommitted/data_docs/local_site/static/fonts/HKGrotesk/HKGrotesk-Italic.otf -------------------------------------------------------------------------------- /chapter03/great_expectations/great_expectations/uncommitted/data_docs/local_site/static/fonts/HKGrotesk/HKGrotesk-LightItalic.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter03/great_expectations/great_expectations/uncommitted/data_docs/local_site/static/fonts/HKGrotesk/HKGrotesk-LightItalic.otf -------------------------------------------------------------------------------- /chapter03/great_expectations/great_expectations/uncommitted/data_docs/local_site/static/fonts/HKGrotesk/HKGrotesk-MediumItalic.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter03/great_expectations/great_expectations/uncommitted/data_docs/local_site/static/fonts/HKGrotesk/HKGrotesk-MediumItalic.otf -------------------------------------------------------------------------------- /chapter03/great_expectations/great_expectations/uncommitted/data_docs/local_site/static/fonts/HKGrotesk/HKGrotesk-SemiBoldItalic.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter03/great_expectations/great_expectations/uncommitted/data_docs/local_site/static/fonts/HKGrotesk/HKGrotesk-SemiBoldItalic.otf -------------------------------------------------------------------------------- /chapter03/great_expectations/great_expectations/uncommitted/data_docs/local_site/static/images/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter03/great_expectations/great_expectations/uncommitted/data_docs/local_site/static/images/favicon.ico -------------------------------------------------------------------------------- /chapter03/great_expectations/great_expectations/uncommitted/data_docs/local_site/static/images/glossary_scroller.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter03/great_expectations/great_expectations/uncommitted/data_docs/local_site/static/images/glossary_scroller.gif -------------------------------------------------------------------------------- /chapter03/great_expectations/great_expectations/uncommitted/data_docs/local_site/static/images/iterative-dev-loop.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter03/great_expectations/great_expectations/uncommitted/data_docs/local_site/static/images/iterative-dev-loop.png -------------------------------------------------------------------------------- /chapter03/great_expectations/great_expectations/uncommitted/data_docs/local_site/static/images/logo-long.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter03/great_expectations/great_expectations/uncommitted/data_docs/local_site/static/images/logo-long.png -------------------------------------------------------------------------------- /chapter03/great_expectations/great_expectations/uncommitted/data_docs/local_site/static/images/short-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter03/great_expectations/great_expectations/uncommitted/data_docs/local_site/static/images/short-logo.png -------------------------------------------------------------------------------- /chapter03/great_expectations/great_expectations/uncommitted/data_docs/local_site/static/images/validation_failed_unexpected_values.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter03/great_expectations/great_expectations/uncommitted/data_docs/local_site/static/images/validation_failed_unexpected_values.gif -------------------------------------------------------------------------------- /chapter03/great_expectations/great_expectations/uncommitted/data_docs/local_site/static/styles/data_docs_custom_styles_template.css: -------------------------------------------------------------------------------- 1 | /*index page*/ 2 | .ge-index-page-site-name-title {} 3 | .ge-index-page-table-container {} 4 | .ge-index-page-table {} 5 | .ge-index-page-table-profiling-links-header {} 6 | .ge-index-page-table-expectations-links-header {} 7 | .ge-index-page-table-validations-links-header {} 8 | .ge-index-page-table-profiling-links-list {} 9 | .ge-index-page-table-profiling-links-item {} 10 | .ge-index-page-table-expectation-suite-link {} 11 | .ge-index-page-table-validation-links-list {} 12 | .ge-index-page-table-validation-links-item {} 13 | 14 | /*breadcrumbs*/ 15 | .ge-breadcrumbs {} 16 | .ge-breadcrumbs-item {} 17 | 18 | /*navigation sidebar*/ 19 | .ge-navigation-sidebar-container {} 20 | .ge-navigation-sidebar-content {} 21 | .ge-navigation-sidebar-title {} 22 | .ge-navigation-sidebar-link {} 23 | -------------------------------------------------------------------------------- /chapter03/intoduction/identify_trends.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import seaborn as sns 3 | import numpy as np 4 | 5 | # Generate hypothetical sales data 6 | np.random.seed(42) 7 | sales_data = np.random.normal(loc=1000, scale=300, size=1000) 8 | 9 | # Plotting the distribution 10 | plt.figure(figsize=(10, 6)) 11 | sns.histplot(sales_data, bins=30, kde=True, color='skyblue') 12 | 
plt.title('Distribution of Daily Sales Revenue') 13 | plt.xlabel('Sales Revenue') 14 | plt.ylabel('Frequency') 15 | plt.show() 16 | -------------------------------------------------------------------------------- /chapter04/1.descriptive_stats.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | 6 | # Create the initial and expanded e-commerce dataset 7 | data = { 8 | 'CustomerID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 9 | 'ProductName': ['Product_A', 'Product_B', 'Product_C', 'Product_A', 'Product_B', 'Product_A', 'Product_B', 'Product_C', 'Product_A', 'Product_B', 'Product_C'], 10 | 'PurchaseAmount': [50, 75, 120, 60, 80, 55, 90, 110, 70, 85, 130], 11 | 'PaymentMethod': ['Card', 'PayPal', 'Cash', 'Card', 'Bank Transfer', 'Card', 'PayPal', 'Cash', 'Card', 'Bank Transfer', 'Card'], 12 | 'Timestamp': ['2022-01-01 08:30:45', '2022-01-02 14:20:30', '2022-01-03 20:15:10', '2022-01-04 12:45:30', '2022-01-05 18:10:55', '2022-01-06 09:30:15', '2022-01-07 15:40:20', '2022-01-08 22:25:50', '2022-01-09 14:55:45', '2022-01-10 19:30:10', '2022-01-11 08:45:30'] 13 | } 14 | 15 | df = pd.DataFrame(data) 16 | 17 | # Convert 'Timestamp' to datetime 18 | df['Timestamp'] = pd.to_datetime(df['Timestamp']) 19 | 20 | # Display the initial and expanded dataset 21 | print("Initial and Expanded Dataset:") 22 | print(df) 23 | 24 | # Remove irrelevant column 'CustomerID' 25 | df = df.drop(columns=['CustomerID']) 26 | 27 | # Descriptive statistics 28 | desc_stats = df.describe() 29 | print("\nDescriptive Statistics:") 30 | print(desc_stats) 31 | 32 | # Visualize distributions 33 | plt.figure(figsize=(15, 8)) 34 | 35 | # Distribution of Purchase Amount 36 | plt.subplot(2, 2, 1) 37 | sns.histplot(df['PurchaseAmount'], kde=True, color='skyblue') 38 | plt.title('Distribution of Purchase Amount') 39 | 40 | # Distribution of Payment Methods 41 | plt.subplot(2, 2, 2) 42 | sns.countplot(x='PaymentMethod', data=df, palette='Set2') 43 | plt.title('Distribution of Payment Methods') 44 | 45 | # Distribution of Product Names 46 | plt.subplot(2, 1, 2) 47 | sns.countplot(x='ProductName', data=df, palette='Set2') 48 | plt.title('Distribution of Product Names') 49 | 50 | plt.tight_layout() 51 | plt.show() 52 | -------------------------------------------------------------------------------- /chapter04/2.rename_columns.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | # Create the initial and expanded e-commerce dataset 5 | data = { 6 | 'CustomerID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 7 | 'ProductName': ['Product_A', 'Product_B', 'Product_C', 'Product_A', 'Product_B', 'Product_A', 'Product_B', 'Product_C', 'Product_A', 'Product_B', 'Product_C'], 8 | 'PurchaseAmount': [50, 75, 120, 60, 80, 55, 90, 110, 70, 85, 130], 9 | 'PaymentMethod': ['Card', 'PayPal', 'Cash', 'Card', 'Bank Transfer', 'Card', 'PayPal', 'Cash', 'Card', 'Bank Transfer', 'Card'], 10 | 'Timestamp': ['2022-01-01 08:30:45', '2022-01-02 14:20:30', '2022-01-03 20:15:10', '2022-01-04 12:45:30', '2022-01-05 18:10:55', '2022-01-06 09:30:15', '2022-01-07 15:40:20', '2022-01-08 22:25:50', '2022-01-09 14:55:45', '2022-01-10 19:30:10', '2022-01-11 08:45:30'] 11 | } 12 | 13 | df = pd.DataFrame(data) 14 | 15 | # Convert 'Timestamp' to datetime 16 | df['Timestamp'] = pd.to_datetime(df['Timestamp']) 17 | 18 | # Display the initial and expanded dataset 19 | print("Initial 
and Expanded Dataset:") 20 | print(df) 21 | 22 | # Scenario: Renaming Columns with Error Handling 23 | 24 | try: 25 | # Attempt to rename a single column 26 | df.rename(columns={'ProductName': 'OldProductName'}, inplace=True) 27 | except ValueError as ve: 28 | print(f"Error: {ve}") 29 | 30 | # Check if the column exists before renaming 31 | if 'OldProductName' in df.columns: 32 | try: 33 | # Attempt to rename multiple columns 34 | df.rename(columns={'OldProductName': 'NewProductName', 'PurchaseAmount': 'NewPurchaseAmount'}, inplace=True) 35 | except ValueError as ve: 36 | print(f"Error: {ve}") 37 | else: 38 | print("Error: Column 'OldProductName' does not exist in the DataFrame.") 39 | 40 | # Display the dataset after renaming (if successful) 41 | print("\nDataset after Renaming (if successful):") 42 | print(df) 43 | -------------------------------------------------------------------------------- /chapter04/3.dropping_columns.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Create the initial e-commerce dataset 4 | data = { 5 | 'CustomerID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 6 | 'NewProductName': ['Product_A', 'Product_B', 'Product_C', 'Product_A', 'Product_B', 'Product_A', 'Product_B', 'Product_C', 'Product_A', 'Product_B', 'Product_C'], 7 | 'NewPurchaseAmount': [50, 75, 120, 60, 80, 55, 90, 110, 70, 85, 130], 8 | 'PaymentMethod': ['Card', 'PayPal', 'Cash', 'Card', 'Bank Transfer', 'Card', 'PayPal', 'Cash', 'Card', 'Bank Transfer', 'Card'], 9 | 'Timestamp': ['2022-01-01 08:30:45', '2022-01-02 14:20:30', '2022-01-03 20:15:10', '2022-01-04 12:45:30', '2022-01-05 18:10:55', '2022-01-06 09:30:15', '2022-01-07 15:40:20', '2022-01-08 22:25:50', '2022-01-09 14:55:45', '2022-01-10 19:30:10', '2022-01-11 08:45:30'] 10 | } 11 | 12 | df = pd.DataFrame(data) 13 | 14 | # Display the initial e-commerce dataset 15 | print("Initial E-commerce Dataset:") 16 | print(df) 17 | 18 | # Display the initial memory usage 19 | print("Initial Memory Usage:") 20 | print(df.memory_usage().sum() / (1024 ** 2), "MB") # Convert bytes to megabytes 21 | 22 | # Save a copy of the DataFrame before dropping columns for comparison 23 | df_before_drop = df.copy() 24 | 25 | # Scenario: Dropping Irrelevant Columns 26 | columns_to_drop = ['CustomerID', 'Timestamp'] # Replace with the names of the columns you want to drop 27 | 28 | try: 29 | # Drop columns considered irrelevant for the current analysis 30 | df.drop(columns=columns_to_drop, inplace=True) 31 | except KeyError as ke: 32 | print(f"Error: {ke}") 33 | 34 | # Display the DataFrame after dropping columns 35 | print("\nDataFrame after Dropping Irrelevant Columns:") 36 | print(df.columns) 37 | 38 | # Display the DataFrame before dropping columns for comparison 39 | print("\nDataFrame Before Dropping Columns:") 40 | print(df_before_drop.columns) 41 | 42 | # Display the memory usage after dropping columns 43 | print("\nMemory Usage After Dropping Columns:") 44 | print(df.memory_usage().sum() / (1024 ** 2), "MB") # Convert bytes to megabytes 45 | 46 | -------------------------------------------------------------------------------- /chapter04/4.data_types.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Create the initial e-commerce dataset 4 | data = { 5 | 'CustomerID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 6 | 'ProductName': ['Product_A', 'Product_B', 'Product_C', 'Product_A', 'Product_B', 'Product_A', 'Product_B', 'Product_C', 
'Product_A', 'Product_B', 'Product_C'], 7 | 'PurchaseAmount': [50, 75, 120, 60, 80, 55, 90, 110, 70, 85, 130], 8 | 'PaymentMethod': ['Card', 'PayPal', 'Cash', 'Card', 'Bank Transfer', 'Card', 'PayPal', 'Cash', 'Card', 'Bank Transfer', 'Card'], 9 | 'Timestamp': ['2022-01-01 08:30:45', '2022-01-02 14:20:30', '2022-01-03 20:15:10', '2022-01-04 12:45:30', '2022-01-05 18:10:55', '2022-01-06 09:30:15', '2022-01-07 15:40:20', '2022-01-08 22:25:50', '2022-01-09 14:55:45', '2022-01-10 19:30:10', '2022-01-11 08:45:30'] 10 | } 11 | 12 | df = pd.DataFrame(data) 13 | 14 | # Display the initial e-commerce dataset 15 | print("Initial E-commerce Dataset:") 16 | print(df) 17 | 18 | # Inspect data types of columns 19 | print("\nData Types of Columns:") 20 | print(df.dtypes) 21 | 22 | # Convert 'PurchaseAmount' to numeric 23 | df['PurchaseAmount'] = pd.to_numeric(df['PurchaseAmount'], errors='coerce') 24 | 25 | # Convert 'ProductName' to string 26 | df['ProductName'] = df['ProductName'].astype('str') 27 | 28 | # Convert 'PaymentMethod' to categorical 29 | df['PaymentMethod'] = df['PaymentMethod'].astype('category') 30 | 31 | # Convert 'CustomerID' to numeric 32 | df['CustomerID'] = pd.to_numeric(df['CustomerID'], errors='coerce') 33 | 34 | # Add a new boolean column 'HasDive' flagging products whose name contains 'Dive' 35 | df['HasDive'] = df['ProductName'].str.contains('Dive', case=False) 36 | df['HasDive'] = df['HasDive'].astype('bool') 37 | 38 | # Display the dataset after type transformations and adding 'HasDive' 39 | print("\nE-commerce Dataset After Type Transformations and Adding 'HasDive':") 40 | print(df) 41 | 42 | # Inspect data types of columns after transformations 43 | print("\nData Types of Columns After Transformations:") 44 | print(df.dtypes) 45 | -------------------------------------------------------------------------------- /chapter04/5.date_time.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from datetime import datetime 3 | from dateutil import parser 4 | 5 | # Sample dataset 6 | data = { 7 | 'CustomerID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 8 | 'ProductName': ['Product_A', 'Product_B', 'Product_C', 'Product_A', 'Product_B', 'Product_A', 'Product_B', 'Product_C', 'Product_A', 'Product_B', 'Product_C'], 9 | 'PurchaseAmount': [50, 75, 120, 60, 80, 55, 90, 110, 70, 85, 130], 10 | 'PaymentMethod': ['Card', 'PayPal', 'Cash', 'Card', 'Bank Transfer', 'Card', 'PayPal', 'Cash', 'Card', 'Bank Transfer', 'Card'], 11 | 'Timestamp': ['2022-01-01 08:30:45', '2022-01-02 14:20:30', '2022-01-03 20:15:10', '2022-01-04 12:45:30', '2022-01-05 18:10:55', '2022-01-06 09:30:15', '2022-01-07 15:40:20', '2022-01-08 22:25:50', '2022-01-09 14:55:45', '2022-01-10 19:30:10', '2022-01-11 08:45:30'] 12 | } 13 | 14 | df = pd.DataFrame(data) 15 | print(df) 16 | 17 | # Method 1: Using strptime 18 | # Comment: Explicitly define the timestamp format for parsing 19 | df['Timestamp1'] = df['Timestamp'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S')) 20 | 21 | # Method 2: Using dateutil.parser.parse() 22 | # Comment: Automatically detect the timestamp format for parsing 23 | df['Timestamp2'] = df['Timestamp'].apply(parser.parse) 24 | 25 | # Method 3: Using pd.to_datetime() 26 | # Comment: A concise method for parsing timestamps in a pandas DataFrame 27 | df['Timestamp3'] = pd.to_datetime(df['Timestamp'], format='%Y-%m-%d %H:%M:%S') 28 | 29 | # Display the DataFrame after parsing 30 | print("\nData Types of Columns:") 31 | print(df.dtypes) 32 | print(df) 33 | 34 |
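# A brief companion sketch, not part of the repository file above: it shows how
# pd.to_datetime can be made tolerant of malformed timestamps. With errors='coerce',
# strings that do not match the expected format become NaT instead of raising, a common
# defensive choice when sources are inconsistent. The 'mixed' series is hypothetical
# example data; pandas is already imported as pd at the top of this script.
mixed = pd.Series(['2022-01-01 08:30:45', 'not-a-timestamp'])
parsed = pd.to_datetime(mixed, format='%Y-%m-%d %H:%M:%S', errors='coerce')
print(parsed)         # the malformed entry becomes NaT
print(parsed.isna())  # boolean mask of rows that failed to parse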
-------------------------------------------------------------------------------- /chapter04/6.format_date.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Sample dataset 4 | data = { 5 | 'CustomerID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 6 | 'ProductName': ['Product_A', 'Product_B', 'Product_C', 'Product_A', 'Product_B', 'Product_A', 'Product_B', 'Product_C', 'Product_A', 'Product_B', 'Product_C'], 7 | 'PurchaseAmount': [50, 75, 120, 60, 80, 55, 90, 110, 70, 85, 130], 8 | 'PaymentMethod': ['Card', 'PayPal', 'Cash', 'Card', 'Bank Transfer', 'Card', 'PayPal', 'Cash', 'Card', 'Bank Transfer', 'Card'], 9 | 'Timestamp': ['2022-01-01 08:30:45', '2022-01-02 14:20:30', '2022-01-03 20:15:10', '2022-01-04 12:45:30', '2022-01-05 18:10:55', '2022-01-06 09:30:15', '2022-01-07 15:40:20', '2022-01-08 22:25:50', '2022-01-09 14:55:45', '2022-01-10 19:30:10', '2022-01-11 08:45:30'] 10 | } 11 | 12 | df = pd.DataFrame(data) 13 | print(df) 14 | 15 | # Ensure 'Timestamp' column is of type datetime 16 | df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%Y-%m-%d %H:%M:%S') 17 | 18 | # Display the DataFrame after parsing 19 | print("\nDataFrame After Parsing:") 20 | print(df) 21 | 22 | # Method 4: Using strftime for custom formatting 23 | # Comment: The strftime method is used to customize the display format of datetime objects 24 | df['FormattedTimestamp'] = df['Timestamp'].dt.strftime('%b %d, %Y %I:%M %p') 25 | 26 | # Display the DataFrame with the formatted timestamp 27 | print("\nDataFrame with Formatted Timestamp:") 28 | print(df[['Timestamp', 'FormattedTimestamp']]) 29 | 30 | # Display data types of columns 31 | print("\nData Types of Columns After Transformations:") 32 | print(df.dtypes) 33 | -------------------------------------------------------------------------------- /chapter04/7.extract_datetime_components.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Sample dataset 4 | data = { 5 | 'CustomerID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 6 | 'ProductName': ['Product_A', 'Product_B', 'Product_C', 'Product_A', 'Product_B', 'Product_A', 'Product_B', 'Product_C', 'Product_A', 'Product_B', 'Product_C'], 7 | 'PurchaseAmount': [50, 75, 120, 60, 80, 55, 90, 110, 70, 85, 130], 8 | 'PaymentMethod': ['Card', 'PayPal', 'Cash', 'Card', 'Bank Transfer', 'Card', 'PayPal', 'Cash', 'Card', 'Bank Transfer', 'Card'], 9 | 'Timestamp': ['2022-01-01 08:30:45', '2022-01-02 14:20:30', '2022-01-03 20:15:10', '2022-01-04 12:45:30', '2022-01-05 18:10:55', '2022-01-06 09:30:15', '2022-01-07 15:40:20', '2022-01-08 22:25:50', '2022-01-09 14:55:45', '2022-01-10 19:30:10', '2022-01-11 08:45:30'] 10 | } 11 | 12 | df = pd.DataFrame(data) 13 | print(df) 14 | 15 | # Ensure 'Timestamp' column is of type datetime 16 | df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%Y-%m-%d %H:%M:%S') 17 | 18 | # Display the DataFrame after parsing 19 | print("DataFrame After Parsing:") 20 | print(df) 21 | 22 | # Extracting Components: Day, Month, Year 23 | df['Day'] = df['Timestamp'].dt.day 24 | df['Month'] = df['Timestamp'].dt.month 25 | df['Year'] = df['Timestamp'].dt.year 26 | 27 | # Display the DataFrame with extracted components 28 | print("\nDataFrame with Extracted Components:") 29 | print(df[['Timestamp', 'Day', 'Month', 'Year']]) 30 | 31 | -------------------------------------------------------------------------------- /chapter04/8.time_deltas.py: 
-------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Sample dataset 4 | data = { 5 | 'CustomerID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 6 | 'ProductName': ['Product_A', 'Product_B', 'Product_C', 'Product_A', 'Product_B', 'Product_A', 'Product_B', 'Product_C', 'Product_A', 'Product_B', 'Product_C'], 7 | 'PurchaseAmount': [50, 75, 120, 60, 80, 55, 90, 110, 70, 85, 130], 8 | 'PaymentMethod': ['Card', 'PayPal', 'Cash', 'Card', 'Bank Transfer', 'Card', 'PayPal', 'Cash', 'Card', 'Bank Transfer', 'Card'], 9 | 'Timestamp': ['2022-01-01 08:30:45', '2022-01-02 14:20:30', '2022-01-03 20:15:10', '2022-01-04 12:45:30', '2022-01-05 18:10:55', '2022-01-06 09:30:15', '2022-01-07 15:40:20', '2022-01-08 22:25:50', '2022-01-09 14:55:45', '2022-01-10 19:30:10', '2022-01-11 08:45:30'] 10 | } 11 | 12 | df = pd.DataFrame(data) 13 | 14 | # Convert 'Timestamp' to datetime 15 | df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%Y-%m-%d %H:%M:%S') 16 | 17 | # Sort DataFrame by 'Timestamp' 18 | df.sort_values(by='Timestamp', inplace=True) 19 | 20 | # Calculate time differences and add to DataFrame 21 | df['TimeSincePreviousPurchase'] = df['Timestamp'].diff() 22 | df['TimeUntilNextPurchase'] = -df['Timestamp'].diff(-1) 23 | 24 | # Display the DataFrame with timedelta columns 25 | print("DataFrame with Time Differences:") 26 | print(df[['Timestamp', 'TimeSincePreviousPurchase', 'TimeUntilNextPurchase']]) 27 | 28 | # Create diff with longer periods 29 | df['TimeDifference2periods'] = df['Timestamp'].diff(periods=2) 30 | 31 | print("DataFrame with Time Differences:") 32 | print(df[['Timestamp', 'TimeSincePreviousPurchase', "TimeDifference2periods"]]) 33 | 34 | # Fill missing values on diff 35 | df['TimeDiff2periods_nonulls'] = df['Timestamp'].diff(periods=2).fillna(0) 36 | print("DataFrame with Time Differences:") 37 | print(df[['Timestamp', 'TimeDiff2periods_nonulls', "TimeDifference2periods"]]) 38 | 39 | -------------------------------------------------------------------------------- /chapter04/9.time_zones.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Sample dataset 4 | data = { 5 | 'CustomerID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 6 | 'ProductName': ['Product_A', 'Product_B', 'Product_C', 'Product_A', 'Product_B', 'Product_A', 'Product_B', 'Product_C', 'Product_A', 'Product_B', 'Product_C'], 7 | 'PurchaseAmount': [50, 75, 120, 60, 80, 55, 90, 110, 70, 85, 130], 8 | 'PaymentMethod': ['Card', 'PayPal', 'Cash', 'Card', 'Bank Transfer', 'Card', 'PayPal', 'Cash', 'Card', 'Bank Transfer', 'Card'], 9 | 'Timestamp': ['2022-01-01 08:30:45', '2022-01-02 14:20:30', '2022-01-03 20:15:10', '2022-01-04 12:45:30', '2022-01-05 18:10:55', '2022-01-06 09:30:15', '2022-01-07 15:40:20', '2022-01-08 22:25:50', '2022-01-09 14:55:45', '2022-01-10 19:30:10', '2022-01-11 08:45:30'] 10 | } 11 | 12 | df = pd.DataFrame(data) 13 | 14 | # Convert 'Timestamp' to datetime 15 | df['Timestamp'] = pd.to_datetime(df['Timestamp']) 16 | 17 | # Localize timestamps to a specific time zone (e.g., 'UTC') 18 | df['Timestamp_UTC'] = df['Timestamp'].dt.tz_localize('UTC') 19 | 20 | # Convert localized timestamps to a different time zone (e.g., 'America/New_York') 21 | df['Timestamp_NY'] = df['Timestamp_UTC'].dt.tz_convert('America/New_York') 22 | 23 | # Display the DataFrame with time zone-handled timestamps 24 | print(df[['Timestamp', 'Timestamp_UTC', 'Timestamp_NY']]) 25 | 26 | 
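# A brief companion sketch, not part of the repository file above: once the timestamps
# are timezone-aware, further conversions and zone-aware formatting follow the same
# pattern as Timestamp_NY. 'Europe/London' is an arbitrary example zone chosen purely
# for illustration; df and its columns come from the script above.
df['Timestamp_London'] = df['Timestamp_UTC'].dt.tz_convert('Europe/London')
print(df['Timestamp_NY'].dt.strftime('%Y-%m-%d %H:%M %Z'))  # zone abbreviation, e.g. EST
print(df[['Timestamp_UTC', 'Timestamp_London']].head())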
-------------------------------------------------------------------------------- /chapter05/1.use_case.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Sample employee data 4 | employee_data = pd.DataFrame({ 5 | 'employee_id': [1, 2, 3, 4, 5], 6 | 'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'], 7 | 'department': ['HR', 'IT', 'Marketing', 'Finance', 'IT'] 8 | }) 9 | 10 | # Sample project assignment data 11 | project_data = pd.DataFrame({ 12 | 'employee_id': [2, 3, 4, 5, 6], 13 | 'project_name': ['ProjectA', 'ProjectB', 'ProjectC', 'ProjectD', 'ProjectE'] 14 | }) 15 | 16 | 17 | # Displaying the results 18 | print("employee_data Result:") 19 | print(employee_data) 20 | 21 | print("project_data Result:") 22 | print(project_data) 23 | 24 | -------------------------------------------------------------------------------- /chapter05/2.inner_join.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Sample employee data 4 | employee_data = pd.DataFrame({ 5 | 'employee_id': [1, 2, 3, 4, 5], 6 | 'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'], 7 | 'department': ['HR', 'IT', 'Marketing', 'Finance', 'IT'] 8 | }) 9 | 10 | # Sample project assignment data 11 | project_data = pd.DataFrame({ 12 | 'employee_id': [2, 3, 4, 5, 6], 13 | 'project_name': ['ProjectA', 'ProjectB', 'ProjectC', 'ProjectD', 'ProjectE'] 14 | }) 15 | 16 | # Performing an inner join 17 | merged_data = pd.merge(employee_data, project_data, on='employee_id', how='inner') 18 | 19 | # Displaying the results 20 | print("Merged Data Result:") 21 | print(merged_data) -------------------------------------------------------------------------------- /chapter05/3.outer_merge.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Sample employee data 4 | employee_data = pd.DataFrame({ 5 | 'employee_id': [1, 2, 3, 4, 5], 6 | 'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'], 7 | 'department': ['HR', 'IT', 'Marketing', 'Finance', 'IT'] 8 | }) 9 | 10 | # Sample project assignment data 11 | project_data = pd.DataFrame({ 12 | 'employee_id': [2, 3, 4, 5, 6], 13 | 'project_name': ['ProjectA', 'ProjectB', 'ProjectC', 'ProjectD', 'ProjectE'] 14 | }) 15 | 16 | # Performing a full outer merge 17 | full_outer_merged_data = pd.merge(employee_data, project_data, on='employee_id', how='outer') 18 | 19 | # Displaying the results 20 | print("Full Outer Merged Data Result:") 21 | print(full_outer_merged_data) -------------------------------------------------------------------------------- /chapter05/4.right_merge.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Sample employee data 4 | employee_data = pd.DataFrame({ 5 | 'employee_id': [1, 2, 3, 4, 5], 6 | 'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'], 7 | 'department': ['HR', 'IT', 'Marketing', 'Finance', 'IT'] 8 | }) 9 | 10 | # Sample project assignment data 11 | project_data = pd.DataFrame({ 12 | 'employee_id': [2, 3, 4, 5, 6], 13 | 'project_name': ['ProjectA', 'ProjectB', 'ProjectC', 'ProjectD', 'ProjectE'] 14 | }) 15 | 16 | # Performing a right merge 17 | right_merged_data = pd.merge(employee_data, project_data, on='employee_id', how='right') 18 | 19 | # Displaying the results 20 | print("Right Merged Data Result:") 21 | print(right_merged_data) -------------------------------------------------------------------------------- 
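The inner, outer, and right merges above differ only in which unmatched employee_ids survive. One way to audit where an unmatched row came from is pandas' indicator=True flag, which adds a _merge column; a minimal sketch reusing the same sample frames:

import pandas as pd

employee_data = pd.DataFrame({
    'employee_id': [1, 2, 3, 4, 5],
    'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    'department': ['HR', 'IT', 'Marketing', 'Finance', 'IT']
})
project_data = pd.DataFrame({
    'employee_id': [2, 3, 4, 5, 6],
    'project_name': ['ProjectA', 'ProjectB', 'ProjectC', 'ProjectD', 'ProjectE']
})

# indicator=True adds a '_merge' column with values 'left_only', 'right_only', or 'both'
audited = pd.merge(employee_data, project_data, on='employee_id', how='outer', indicator=True)
print(audited[audited['_merge'] != 'both'])

Rows flagged left_only are employees with no project assignment (employee 1), while right_only rows are assignments whose employee_id has no employee record (employee 6).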
/chapter05/5.left_merge.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Sample employee data 4 | employee_data = pd.DataFrame({ 5 | 'employee_id': [1, 2, 3, 4, 5], 6 | 'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'], 7 | 'department': ['HR', 'IT', 'Marketing', 'Finance', 'IT'] 8 | }) 9 | 10 | # Sample project assignment data 11 | project_data = pd.DataFrame({ 12 | 'employee_id': [2, 3, 4, 5, 6], 13 | 'project_name': ['ProjectA', 'ProjectB', 'ProjectC', 'ProjectD', 'ProjectE'] 14 | }) 15 | 16 | # Performing a left merge 17 | left_merged_data = pd.merge(employee_data, project_data, on='employee_id', how='left') 18 | 19 | # Displaying the results 20 | print("Left Merged Data Result:") 21 | print(left_merged_data) -------------------------------------------------------------------------------- /chapter05/6a.manage_duplicates.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Sample employee data with potential duplicate keys 4 | employee_data = pd.DataFrame({ 5 | 'employee_id': [1, 2, 2, 3, 4, 5, 5], 6 | 'name': ['Alice', 'Bob', 'Bob', 'Charlie', 'David', 'Eva', 'Eva'], 7 | 'department': ['HR', 'IT', 'IT', 'Marketing', 'Finance', 'IT', 'IT'] 8 | }) 9 | 10 | # Sample project assignment data with potential duplicate keys 11 | project_data = pd.DataFrame({ 12 | 'employee_id': [2, 3, 4, 5, 5, 6], 13 | 'project_name': ['ProjectA', 'ProjectB', 'ProjectC', 'ProjectD', 'ProjectD', 'ProjectE'] 14 | }) 15 | 16 | # Handling duplicates 17 | ## Drop duplicates 18 | employee_data = employee_data.drop_duplicates(subset='employee_id', keep='first') 19 | project_data = project_data.drop_duplicates(subset='employee_id', keep='first') 20 | 21 | # Performing a merge 22 | merged_data = pd.merge(employee_data, project_data, on='employee_id', how='inner') 23 | 24 | # Displaying the results 25 | print("Merged Data Result after handling duplicates:") 26 | print(merged_data) -------------------------------------------------------------------------------- /chapter05/6b.manage_duplicates_validate.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Sample employee data with potential duplicate keys 4 | employee_data = pd.DataFrame({ 5 | 'employee_id': [1, 2, 2, 3, 4, 5, 5], 6 | 'name': ['Alice', 'Bob', 'Bob', 'Charlie', 'David', 'Eva', 'Eva'], 7 | 'department': ['HR', 'IT', 'IT', 'Marketing', 'Finance', 'IT', 'IT'] 8 | }) 9 | 10 | # Sample project assignment data with potential duplicate keys 11 | project_data = pd.DataFrame({ 12 | 'employee_id': [2, 3, 4, 5, 5, 6], 13 | 'project_name': ['ProjectA', 'ProjectB', 'ProjectC', 'ProjectD', 'ProjectD', 'ProjectE'] 14 | }) 15 | 16 | # Performing a merge with validation to ensure no duplicates in the key column of the left DataFrame 17 | try: 18 | merged_data = pd.merge(employee_data, project_data, on='employee_id', how='inner', validate='one_to_many') 19 | print("Merged Data Result:") 20 | print(merged_data) 21 | except ValueError as e: 22 | print("Merge failed:", e) 23 | -------------------------------------------------------------------------------- /chapter05/6c.merge_and_aggregate.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Sample employee data with potential duplicate keys 4 | employee_data = pd.DataFrame({ 5 | 'employee_id': [1, 2, 2, 3, 4, 5, 5], 6 | 'name': ['Alice', 'Bob', 'Bob', 'Charlie', 
'David', 'Eva', 'Eva'], 7 | 'department': ['HR', 'IT', 'IT', 'Marketing', 'Finance', 'IT', 'IT'], 8 | 'salary': [50000, 60000, 60000, 55000, 65000, 70000, 70000] # Added salary for aggregation 9 | }) 10 | 11 | # Sample project assignment data with no duplicate keys 12 | project_data = pd.DataFrame({ 13 | 'employee_id': [2, 3, 4, 5, 7, 6], 14 | 'project_name': ['ProjectA', 'ProjectB', 'ProjectC', 'ProjectD', 'ProjectD', 'ProjectE'] 15 | }) 16 | 17 | # Aggregating duplicate entries in employee_data 18 | aggregated_employee_data = employee_data.groupby('employee_id').agg({ 19 | 'name': 'first', # Keep the first name encountered 20 | 'department': 'first', # Keep the first department encountered 21 | 'salary': 'sum' # Sum the salaries in case of duplicates 22 | }).reset_index() 23 | 24 | # Performing a merge 25 | merged_data = pd.merge(aggregated_employee_data, project_data, on='employee_id', how='inner') 26 | 27 | # Displaying the results 28 | print("Merged Data Result after aggregation:") 29 | print(merged_data) 30 | -------------------------------------------------------------------------------- /chapter05/6d.dmanage_duplicates_concatenation.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Creating a sample DataFrame with potential duplicate keys 4 | employee_data = pd.DataFrame({ 5 | 'employee_id': [1, 2, 2, 3, 4, 5, 5], 6 | 'name': ['Alice', 'Bob', 'Bob', 'Charlie', 'David', 'Eva', 'Eva'], 7 | 'department': ['HR', 'IT', 'Marketing', 'Marketing', 'Finance', 'IT', 'HR'] 8 | }) 9 | 10 | # Displaying the original DataFrame 11 | print("Original Employee Data:") 12 | print(employee_data) 13 | 14 | # Concatenating department names for each employee_id 15 | employee_data['department'] = employee_data.groupby('employee_id')['department'].transform(lambda x: ', '.join(x)) 16 | # Removing duplicate entries based on employee_id 17 | employee_data = employee_data.drop_duplicates('employee_id') 18 | 19 | # Displaying the modified DataFrame 20 | print("\nModified Employee Data after Concatenation and Removing Duplicates:") 21 | print(employee_data) 22 | -------------------------------------------------------------------------------- /chapter05/7a.managed_duplicated_columns.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Creating two sample DataFrames with the same column names 4 | employee_data_1 = pd.DataFrame({ 5 | 'employee_id': [1, 2, 3, 4, 5], 6 | 'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'], 7 | 'department': ['HR', 'IT', 'Marketing', 'Finance', 'IT'] 8 | }) 9 | 10 | employee_data_2 = pd.DataFrame({ 11 | 'employee_id': [6, 7, 8, 9, 10], 12 | 'name': ['Frank', 'Grace', 'Hannah', 'Ian', 'Jill'], 13 | 'department': ['Logistics', 'Marketing', 'IT', 'Marketing', 'Finance'] 14 | }) 15 | 16 | # Merging the two DataFrames with suffixes to differentiate identical columns 17 | merged_data = pd.merge(employee_data_1, employee_data_2, on='employee_id', how='outer', suffixes=('_1', '_2')) 18 | 19 | # Displaying the merged DataFrame 20 | print("Merged Employee Data with Suffixes:") 21 | print(merged_data) -------------------------------------------------------------------------------- /chapter05/7b.drop_columns_merge.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Creating two sample DataFrames with some identical columns 4 | employee_data_1 = pd.DataFrame({ 5 | 'employee_id': [1, 2, 3, 4, 5], 6 
| 'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'], 7 | 'department': ['HR', 'IT', 'Marketing', 'Finance', 'IT'] # More reliable department information 8 | }) 9 | 10 | employee_data_2 = pd.DataFrame({ 11 | 'employee_id': [1, 2, 3, 4, 5], 12 | 'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'], 13 | 'department': ['Human Resources', 'Information Technology', 'Sales', 'Financial', 'Technical'] # Less reliable, drop this 14 | }) 15 | 16 | # Dropping the less reliable 'department' column from the second DataFrame before merging 17 | employee_data_2.drop(columns=['department'], inplace=True) 18 | 19 | # Merging the two DataFrames on 'employee_id' and 'name' which are the reliable keys 20 | merged_data = pd.merge(employee_data_1, employee_data_2, on=['employee_id', 'name'], how='inner') 21 | 22 | # Displaying the merged DataFrame 23 | print("Merged Employee Data with More Reliable Department Information:") 24 | print(merged_data) -------------------------------------------------------------------------------- /chapter05/7c.use_keys_merge.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Creating two sample DataFrames with identical keys and some identical columns 4 | employee_data_1 = pd.DataFrame({ 5 | 'employee_id': [1, 2, 3, 4, 5], 6 | 'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'], 7 | 'department': ['HR', 'IT', 'Marketing', 'Finance', 'IT'], 8 | 'salary': [50000, 60000, 70000, 80000, 90000] 9 | }) 10 | 11 | employee_data_2 = pd.DataFrame({ 12 | 'employee_id': [1, 2, 3, 4, 5], # Identical keys 13 | 'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'], # Identical column 14 | 'department': ['HR', 'IT', 'Sales', 'Finance', 'Operations'], 15 | 'bonus': [3000, 4000, 5000, 6000, 7000] 16 | }) 17 | 18 | # Merging the two DataFrames with suffixes to differentiate identical columns 19 | merged_data = pd.merge(employee_data_1, employee_data_2, on=['employee_id', 'name'], how='inner', suffixes=('_1', '_2')) 20 | 21 | # Displaying the merged DataFrame 22 | print("Merged Employee Data with Identical Keys and Columns:") 23 | print(merged_data) -------------------------------------------------------------------------------- /chapter05/8a.perfomance_benchmark_set_index.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from time import time 4 | 5 | # Number of rows for the benchmarking example 6 | num_rows = 5 7 | 8 | # Creating two sample DataFrames with identical keys and some identical columns 9 | employee_data_1 = pd.DataFrame({ 10 | 'employee_id': np.arange(num_rows), 11 | 'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'], 12 | 'department': ['HR', 'IT', 'Marketing', 'Finance', 'IT'], 13 | 'salary': [50000, 60000, 70000, 80000, 90000] 14 | }) 15 | 16 | employee_data_2 = pd.DataFrame({ 17 | 'employee_id': np.arange(num_rows), 18 | 'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'], 19 | 'department': ['HR', 'IT', 'Sales', 'Finance', 'Operations'], 20 | 'bonus': [3000, 4000, 5000, 6000, 7000] 21 | }) 22 | 23 | # Merge operation 24 | start_time = time() 25 | merged_data = pd.merge(employee_data_1, employee_data_2, on=['employee_id', 'name'], how='inner', suffixes=('_1', '_2')) 26 | end_time = time() 27 | merge_time = end_time - start_time 28 | 29 | # Displaying the merged DataFrame 30 | print("Merged Employee Data:") 31 | print(merged_data) 32 | print(f"Merge operation took: {merge_time:.5f} seconds") 33 | 34 | # Utilizing indexes 35 | 
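# Note: with num_rows = 5 both timed merges mostly measure fixed pandas overhead, so the two
# timings can land in either order from run to run. The advantage of merging on a set (and
# later sorted) index generally only becomes visible once the frames hold many thousands of
# rows, which would also mean generating that many names rather than the five literals above.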
employee_data_1.set_index('employee_id', inplace=True) 36 | employee_data_2.set_index('employee_id', inplace=True) 37 | 38 | # Repeating the merge operation after reducing memory usage 39 | start_time = time() 40 | merged_data_reduced = pd.merge(employee_data_1, employee_data_2, left_index=True, right_index=True, suffixes=('_1', '_2')) 41 | end_time = time() 42 | merge_reduced_time = end_time - start_time 43 | print("Optimised Merged Employee Data:") 44 | print(merged_data_reduced) 45 | print(f"Merge operation with after optimisation took: {merge_reduced_time:.5f} seconds") -------------------------------------------------------------------------------- /chapter05/8b.performance_benchmark_sort_indexes.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from time import time 4 | 5 | # Number of rows for the benchmarking example 6 | num_rows = 5 7 | 8 | # Creating two sample DataFrames with identical keys and some identical columns 9 | employee_data_1 = pd.DataFrame({ 10 | 'employee_id': np.arange(num_rows), 11 | 'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'], 12 | 'department': ['HR', 'IT', 'Marketing', 'Finance', 'IT'], 13 | 'salary': [50000, 60000, 70000, 80000, 90000] 14 | }) 15 | 16 | employee_data_2 = pd.DataFrame({ 17 | 'employee_id': np.arange(num_rows), 18 | 'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'], 19 | 'department': ['HR', 'IT', 'Sales', 'Finance', 'Operations'], 20 | 'bonus': [3000, 4000, 5000, 6000, 7000] 21 | }) 22 | 23 | # Utilizing indexes 24 | employee_data_1.set_index('employee_id', inplace=True) 25 | employee_data_2.set_index('employee_id', inplace=True) 26 | 27 | 28 | # Merge operation 29 | start_time = time() 30 | merged_data = pd.merge(employee_data_1, employee_data_2, left_index=True, right_index=True, suffixes=('_1', '_2')) 31 | end_time = time() 32 | merge_time = end_time - start_time 33 | 34 | # Displaying the merged DataFrame 35 | print("Merged Employee Data:") 36 | print(merged_data) 37 | print(f"Merge operation took: {merge_time:.5f} seconds") 38 | 39 | # Sort indexes 40 | employee_data_1.sort_index(inplace=True) 41 | employee_data_2.sort_index(inplace=True) 42 | 43 | # Repeating the merge operation after reducing memory usage 44 | start_time = time() 45 | merged_data_reduced = pd.merge(employee_data_1, employee_data_2, left_index=True, right_index=True, suffixes=('_1', '_2')) 46 | end_time = time() 47 | merge_reduced_time = end_time - start_time 48 | 49 | print(f"Merge operation after optimisation took: {merge_reduced_time:.5f} seconds") -------------------------------------------------------------------------------- /chapter05/8c.performance_benchmark_memory.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from time import time 4 | 5 | # Number of rows for the benchmarking example 6 | num_rows = 5 7 | 8 | # Creating two sample DataFrames with identical keys and some identical columns 9 | employee_data_1 = pd.DataFrame({ 10 | 'employee_id': np.arange(num_rows), 11 | 'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'], 12 | 'department': ['HR', 'IT', 'Marketing', 'Finance', 'IT'], 13 | 'salary': [50000, 60000, 70000, 80000, 90000] 14 | }) 15 | 16 | employee_data_2 = pd.DataFrame({ 17 | 'employee_id': np.arange(num_rows), 18 | 'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'], 19 | 'department': ['HR', 'IT', 'Sales', 'Finance', 'Operations'], 20 | 'bonus': [3000, 4000, 5000, 
6000, 7000] 21 | }) 22 | 23 | # Utilizing indexes 24 | employee_data_1.set_index('employee_id', inplace=True) 25 | employee_data_2.set_index('employee_id', inplace=True) 26 | 27 | # Sort indexes 28 | employee_data_1.sort_index(inplace=True) 29 | employee_data_2.sort_index(inplace=True) 30 | 31 | # Merge operation 32 | start_time = time() 33 | merged_data = pd.merge(employee_data_1, employee_data_2, left_index=True, right_index=True, suffixes=('_1', '_2')) 34 | end_time = time() 35 | merge_time = end_time - start_time 36 | 37 | # Displaying the merged DataFrame 38 | print("Merged Employee Data:") 39 | print(merged_data) 40 | print(f"Merge operation took: {merge_time:.5f} seconds") 41 | 42 | # Reduce memory usage by downcasting numerical columns 43 | employee_data_1['salary'] = pd.to_numeric(employee_data_1['salary'], downcast='integer') 44 | employee_data_2['bonus'] = pd.to_numeric(employee_data_2['bonus'], downcast='integer') 45 | 46 | # Repeating the merge operation after reducing memory usage 47 | start_time = time() 48 | merged_data_reduced = pd.merge(employee_data_1, employee_data_2, left_index=True, right_index=True, suffixes=('_1', '_2')) 49 | end_time = time() 50 | merge_reduced_time = end_time - start_time 51 | 52 | print(f"Merge operation after optimisation took: {merge_reduced_time:.5f} seconds") -------------------------------------------------------------------------------- /chapter05/9a.concatenate_row_wise.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | # Creating two sample DataFrames with some identical columns 5 | employee_data_1 = pd.DataFrame({ 6 | 'employee_id': np.arange(1, 6), 7 | 'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'], 8 | 'department': ['HR', 'IT', 'Marketing', 'Finance', 'IT'] 9 | }) 10 | 11 | employee_data_2 = pd.DataFrame({ 12 | 'employee_id': np.arange(6, 11), 13 | 'name': ['Frank', 'Grace', 'Hannah', 'Ian', 'Jill'], 14 | 'department': ['Logistics', 'HR', 'IT', 'Marketing', 'Finance'] 15 | }) 16 | 17 | # Concatenating the two DataFrames row-wise 18 | concatenated_data = pd.concat([employee_data_1, employee_data_2], axis=0) 19 | 20 | # Displaying the concatenated DataFrame 21 | print("Concatenated Employee Data:") 22 | print(concatenated_data) -------------------------------------------------------------------------------- /chapter05/9b.reset_index.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | # Creating two sample DataFrames with some identical columns 5 | employee_data_1 = pd.DataFrame({ 6 | 'employee_id': np.arange(1, 6), 7 | 'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'], 8 | 'department': ['HR', 'IT', 'Marketing', 'Finance', 'IT'] 9 | }) 10 | 11 | employee_data_2 = pd.DataFrame({ 12 | 'employee_id': np.arange(6, 11), 13 | 'name': ['Frank', 'Grace', 'Hannah', 'Ian', 'Jill'], 14 | 'department': ['Logistics', 'HR', 'IT', 'Marketing', 'Finance'] 15 | }) 16 | 17 | # Concatenating the two DataFrames row-wise 18 | concatenated_data = pd.concat([employee_data_1, employee_data_2], axis=0) 19 | 20 | # Displaying the concatenated DataFrame before resetting the index 21 | print("Concatenated Employee Data (Before Resetting Index):") 22 | print(concatenated_data) 23 | 24 | # Resetting the index 25 | concatenated_data_reset = concatenated_data.reset_index(drop=True) 26 | 27 | # Displaying the concatenated DataFrame after resetting the index 28 | print("\nConcatenated 
Employee Data (After Resetting Index):") 29 | print(concatenated_data_reset) 30 | -------------------------------------------------------------------------------- /chapter05/9c.concatenate_column_wise.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | # Creating two sample DataFrames with some identical columns 5 | employee_data_1 = pd.DataFrame({ 6 | 'employee_id': np.arange(1, 6), 7 | 'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'], 8 | 'department': ['HR', 'IT', 'Marketing', 'Finance', 'IT'] 9 | }) 10 | 11 | # Creating additional data that could represent a different aspect of employee information 12 | employee_performance = pd.DataFrame({ 13 | 'employee_id': np.arange(1, 6), 14 | 'performance_rating': [3, 4, 5, 3, 4] 15 | }) 16 | 17 | # Concatenating the two DataFrames column-wise 18 | concatenated_data = pd.concat([employee_data_1, employee_performance], axis=1) 19 | 20 | # Displaying the concatenated DataFrame 21 | print("Concatenated Employee Data (Column-wise):") 22 | print(concatenated_data) 23 | -------------------------------------------------------------------------------- /chapter06/1.use_case.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Sample sales data 4 | data = { 5 | 'Category': ['Electronics', 'Electronics', 'Furniture', 'Furniture', 'Clothing', 'Clothing'], 6 | 'Sub-Category': ['Mobile', 'Laptop', 'Chair', 'Table', 'Men', 'Women'], 7 | 'Region': ['North', 'South', 'East', 'West', 'North', 'South'], 8 | 'Sales': [200, 300, 150, 350, 100, 250], 9 | 'Date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05', '2023-01-06'] 10 | } 11 | 12 | df = pd.DataFrame(data) 13 | print(df) 14 | -------------------------------------------------------------------------------- /chapter06/2.groupby_full_example.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Extended sample sales data 4 | data = { 5 | 'Category': [ 6 | 'Electronics', 'Electronics', 'Electronics', 'Electronics', 7 | 'Furniture', 'Furniture', 'Furniture', 'Furniture', 8 | 'Clothing', 'Clothing', 'Clothing', 'Clothing', 9 | 'Electronics', 'Furniture', 'Clothing' 10 | ], 11 | 'Sub-Category': [ 12 | 'Mobile', 'Laptop', 'Tablet', 'Laptop', 13 | 'Chair', 'Table', 'Desk', 'Table', 14 | 'Men', 'Women', 'Kids', 'Men', 15 | 'Mobile', 'Chair', 'Women' 16 | ], 17 | 'Region': [ 18 | 'North', 'South', 'East', 'West', 19 | 'North', 'South', 'East', 'West', 20 | 'North', 'South', 'East', 'West', 21 | 'North', 'West', 'East' 22 | ], 23 | 'Sales': [ 24 | 200, 300, 250, 400, 25 | 150, 350, 200, 400, 26 | 100, 250, 150, 300, 27 | 220, 170, 270 28 | ], 29 | 'Date': [ 30 | '2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', 31 | '2023-01-05', '2023-01-06', '2023-01-07', '2023-01-08', 32 | '2023-01-09', '2023-01-10', '2023-01-11', '2023-01-12', 33 | '2023-01-13', '2023-01-14', '2023-01-15' 34 | ] 35 | } 36 | 37 | df = pd.DataFrame(data) 38 | print("_____________") 39 | print("Sample df is shown below") 40 | print(df) 41 | 42 | # Group by 'Category' and aggregate the 'Sales' column 43 | category_sales = df.groupby('Category')['Sales'].sum().reset_index() 44 | print("_____________") 45 | print("Sales per Category are shown below:") 46 | print(category_sales) 47 | 48 | # Group by 'Category' and 'Region' and aggregate the 'Sales' column 49 | category_region_sales = df.groupby(['Category', 
'Region'])['Sales'].sum().reset_index() 50 | print("_____________") 51 | print("Sales per Category and Region are shown below:") 52 | print(category_region_sales) 53 | 54 | # Group by 'Category' and 'Region' and apply multiple aggregation functions 55 | print("_____________") 56 | print("Total and Mean Sales per Category and Region are shown below:") 57 | category_region_sales_agg = df.groupby(['Category', 'Region'])['Sales'].agg(['sum', 'mean']).reset_index() 58 | print(category_region_sales_agg) 59 | 60 | # Multiple column aggregations 61 | print("_____________") 62 | print("Multiple column aggregations:") 63 | advanced_agg = df.groupby(['Category', 'Region']).agg({ 64 | 'Sales': ['sum', 'mean', 'count'], 65 | 'Sub-Category': 'nunique' # Unique count of Sub-Category 66 | }).reset_index() 67 | print(advanced_agg) 68 | 69 | # ____________________________________________________________________ 70 | # Define custom aggregation functions 71 | print("_____________") 72 | print("Custom Aggregations:") 73 | def range_sales(series): 74 | return series.max() - series.min() 75 | 76 | def coefficient_of_variation(series): 77 | return series.std() / series.mean() 78 | 79 | # Group by 'Category', 'Region', and apply multiple aggregations including custom functions 80 | advanced_agg_custom = df.groupby('Region').agg({ 81 | 'Sales': ['sum', 'mean', 'count', range_sales, coefficient_of_variation], 82 | 'Sub-Category': 'nunique' 83 | }).reset_index() 84 | 85 | # Rename columns for clarity 86 | advanced_agg_custom.columns = [ 87 | 'Region', 'Total Sales', 'Average Sales', 'Number of Transactions', 88 | 'Sales Range', 'Coefficient of Variation', 'Unique Sub-Categories' 89 | ] 90 | 91 | print(advanced_agg_custom) 92 | print(# Displaying only the specified columns 93 | print(advanced_agg_custom[['Region', 'Total Sales', 'Sales Range', 'Coefficient of Variation', 'Unique Sub-Categories']])) 94 | 95 | 96 | 97 | -------------------------------------------------------------------------------- /chapter06/3.apply_axis0.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | # Sample sales data with additional columns 5 | data = { 6 | 'Category': ['Electronics', 'Electronics', 'Furniture', 'Furniture', 'Clothing', 'Clothing'], 7 | 'Sub-Category': ['Mobile', 'Laptop', 'Chair', 'Table', 'Men', 'Women'], 8 | 'Sales': [100, 200, 150, 300, 120, 180], 9 | 'Quantity': [10, 5, 8, 3, 15, 12], 10 | 'Date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05', '2023-01-06'] 11 | } 12 | df = pd.DataFrame(data) 13 | 14 | # Convert 'Date' column to datetime format 15 | df['Date'] = pd.to_datetime(df['Date']) 16 | 17 | # Define a custom function to compute multiple statistics for 'Sales' and 'Quantity' 18 | def compute_statistics(series): 19 | sum_sales = series['Sales'].sum() 20 | mean_sales = series['Sales'].mean() 21 | std_sales = series['Sales'].std() 22 | cv_sales = std_sales / mean_sales 23 | 24 | sum_quantity = series['Quantity'].sum() 25 | mean_quantity = series['Quantity'].mean() 26 | std_quantity = series['Quantity'].std() 27 | cv_quantity = std_quantity / mean_quantity 28 | 29 | return pd.Series([sum_sales, mean_sales, std_sales, cv_sales, sum_quantity, mean_quantity, std_quantity, cv_quantity], 30 | index=['Sum_Sales', 'Mean_Sales', 'Std_Sales', 'CV_Sales', 31 | 'Sum_Quantity', 'Mean_Quantity', 'Std_Quantity', 'CV_Quantity']) 32 | 33 | # Group by 'Category' and apply custom function to compute statistics of 'Sales' and 
'Quantity' 34 | result_complex = df.groupby('Category').apply(compute_statistics).reset_index() 35 | 36 | print("Using apply() for complex function (multiple statistics calculation for 'Sales' and 'Quantity'):") 37 | print(result_complex) -------------------------------------------------------------------------------- /chapter06/4.apply_axis1.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | # Sample sales data with additional columns 5 | data = { 6 | 'Category': ['Electronics', 'Electronics', 'Furniture', 'Furniture', 'Clothing', 'Clothing'], 7 | 'Sub-Category': ['Mobile', 'Laptop', 'Chair', 'Table', 'Men', 'Women'], 8 | 'Sales': [100, 200, 150, 300, 120, 180], 9 | 'Quantity': [10, 5, 8, 3, 15, 12], 10 | 'Date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05', '2023-01-06'] 11 | } 12 | df = pd.DataFrame(data) 13 | 14 | # Convert 'Date' column to datetime format 15 | df['Date'] = pd.to_datetime(df['Date']) 16 | 17 | # Define a function to compute Total_Sales_Quantity and Sales_Quantity_Ratio 18 | def compute_metrics(row): 19 | total_sales_quantity = row['Sales'] + row['Quantity'] 20 | sales_quantity_ratio = row['Sales'] / row['Quantity'] if row['Quantity'] != 0 else np.nan 21 | return pd.Series([total_sales_quantity, sales_quantity_ratio], index=['Total_Sales_Quantity', 'Sales_Quantity_Ratio']) 22 | 23 | # Apply the function row-wise (axis=1) to calculate new metrics 24 | df[['Total_Sales_Quantity', 'Sales_Quantity_Ratio']] = df.apply(compute_metrics, axis=1) 25 | 26 | # Group by 'Category' to calculate metrics per category 27 | category_metrics = df.groupby('Category')[['Total_Sales_Quantity', 'Sales_Quantity_Ratio']].mean().reset_index() 28 | 29 | print("DataFrame with Total_Sales_Quantity and Sales_Quantity_Ratio per Category:") 30 | print(category_metrics) 31 | -------------------------------------------------------------------------------- /chapter06/5.simple_filtering.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Sample sales data 4 | data = { 5 | 'Category': ['Electronics', 'Electronics', 'Furniture', 'Furniture', 'Clothing', 'Clothing'], 6 | 'Sub-Category': ['Mobile', 'Laptop', 'Chair', 'Table', 'Men', 'Women'], 7 | 'Sales': [100, 200, 150, 300, 120, 180], 8 | 'Quantity': [10, 5, 8, 3, 15, 12], 9 | 'Date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05', '2023-01-06'] 10 | } 11 | df = pd.DataFrame(data) 12 | 13 | # Filter to show products with quantity > 10 14 | filtered_data = df[df['Quantity'] > 10] 15 | 16 | print("Filtered Data:") 17 | print(filtered_data) 18 | -------------------------------------------------------------------------------- /chapter06/6.advanced_filtering.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Sample sales data with additional columns 4 | data = { 5 | 'Category': ['Electronics', 'Electronics', 'Electronics', 'Electronics', 'Electronics', 'Electronics'], 6 | 'Sub-Category': ['Mobile', 'Laptop', 'Tablet', 'Headphones', 'Smartwatch', 'Printer'], 7 | 'Sales': [1000, 1500, 800, 300, 400, 600], 8 | 'Quantity': [50, 25, 40, 15, 20, 30], 9 | 'Date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05', '2023-01-06'] 10 | } 11 | df = pd.DataFrame(data) 12 | 13 | # Convert 'Date' column to datetime format 14 | df['Date'] = pd.to_datetime(df['Date']) 15 | 16 | # Filter 
criteria: Sales greater than 1000 and Quantity less than 30 17 | filtered_data = df[(df['Sales'] > 1000) & (df['Quantity'] < 30)] 18 | 19 | print("Filtered Data based on Multiple Criteria:") 20 | print(filtered_data) 21 | -------------------------------------------------------------------------------- /chapter07/1.postgressql.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import psycopg2 3 | from psycopg2 import sql 4 | 5 | # Function to check if a table exists in the database 6 | def table_exists(cursor, table_name): 7 | cursor.execute( 8 | sql.SQL("SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE table_name = %s)"), 9 | [table_name] 10 | ) 11 | return cursor.fetchone()[0] 12 | 13 | # Function to create a table in the database 14 | def create_table(cursor, table_name): 15 | cursor.execute( 16 | sql.SQL(""" 17 | CREATE TABLE {} ( 18 | id SERIAL PRIMARY KEY, 19 | name VARCHAR(255), 20 | age INT 21 | ) 22 | """).format(sql.Identifier(table_name)) 23 | ) 24 | 25 | # Function to insert data into the table 26 | def insert_data(cursor, table_name, data): 27 | cursor.executemany( 28 | sql.SQL("INSERT INTO {} (name, age) VALUES (%s, %s)").format(sql.Identifier(table_name)), 29 | data 30 | ) 31 | 32 | # Function to fetch and print data from the table 33 | def print_table_data(cursor, table_name): 34 | cursor.execute( 35 | sql.SQL("SELECT * FROM {}").format(sql.Identifier(table_name)) 36 | ) 37 | rows = cursor.fetchall() 38 | for row in rows: 39 | print(row) 40 | 41 | # Mock DataFrame 42 | data = { 43 | 'name': ['Alice', 'Bob', 'Charlie'], 44 | 'age': [25, 30, 22] 45 | } 46 | 47 | df = pd.DataFrame(data) 48 | 49 | # PostgreSQL connection parameters 50 | db_params = { 51 | 'dbname': 'learn_sql', 52 | 'user': 'the_great_coder', 53 | 'password': 'the_great_coder_again', 54 | 'host': 'localhost', 55 | 'port': '5432' 56 | } 57 | 58 | # Connect to PostgreSQL 59 | conn = psycopg2.connect(**db_params) 60 | cursor = conn.cursor() 61 | 62 | # Specify the table name 63 | table_name = 'example_table' 64 | 65 | # Check if the table exists, and create it if it doesn't 66 | if not table_exists(cursor, table_name): 67 | create_table(cursor, table_name) 68 | 69 | # Insert data into the table 70 | insert_data(cursor, table_name, df.values.tolist()) 71 | 72 | # Commit the changes 73 | conn.commit() 74 | 75 | # Print the data from the table 76 | print_table_data(cursor, table_name) 77 | 78 | # Close the connection 79 | cursor.close() 80 | conn.close() 81 | -------------------------------------------------------------------------------- /chapter07/2.pymongo.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient 2 | 3 | # MongoDB connection parameters 4 | mongo_params = { 5 | 'host': 'localhost', 6 | 'port': 27017, 7 | } 8 | 9 | # Function to check if a collection exists in the database 10 | def collection_exists(db, collection_name): 11 | return collection_name in db.list_collection_names() 12 | 13 | # Function to create a collection in the database 14 | def create_collection(db, collection_name): 15 | db.create_collection(collection_name) 16 | 17 | # Function to insert data into a collection 18 | def insert_data(collection, data): 19 | collection.insert_many(data) 20 | 21 | # Mock document data 22 | documents = [ 23 | {'name': 'Alice', 'age': 25}, 24 | {'name': 'Bob', 'age': 30}, 25 | {'name': 'Charlie', 'age': 22} 26 | ] 27 | 28 | # MongoDB database and collection names 29 | 
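# Note: insert_many() assigns an _id ObjectId to each dict in `documents` in place, and
# re-running the script inserts the same three people again (nothing here upserts), so
# duplicates accumulate in the collection. A unique index on 'name', or
# update_one(..., upsert=True), would be one possible way to make re-runs idempotent.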
db_name = 'no_sql_db' 30 | collection_name = 'best_collection_ever' 31 | 32 | # Connect to MongoDB 33 | client = MongoClient(**mongo_params) 34 | db = client[db_name] 35 | 36 | # Check if the collection exists, and create it if it doesn't 37 | if not collection_exists(db, collection_name): 38 | create_collection(db, collection_name) 39 | 40 | # Get the collection 41 | collection = db[collection_name] 42 | 43 | # Insert data into the collection 44 | insert_data(collection, documents) 45 | 46 | # Query data from the collection 47 | result = collection.find() 48 | for document in result: 49 | print(document) 50 | 51 | # Close the MongoDB connection 52 | client.close() 53 | -------------------------------------------------------------------------------- /chapter07/3.pymongo_expand.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient 2 | 3 | # MongoDB connection parameters 4 | mongo_params = { 5 | 'host': 'localhost', 6 | 'port': 27017, 7 | } 8 | 9 | # Function to check if a collection exists in the database 10 | def collection_exists(db, collection_name): 11 | return collection_name in db.list_collection_names() 12 | 13 | # Function to create a collection in the database 14 | def create_collection(db, collection_name): 15 | db.create_collection(collection_name) 16 | 17 | # Function to insert data into a collection 18 | def insert_data(collection, data): 19 | collection.insert_many(data) 20 | 21 | # Mock document data with different structures 22 | documents = [ 23 | {'name': 'Alice', 'age': 25, 'email': 'alice@example.com'}, 24 | {'name': 'Bob', 'age': 30, 'address': '123 Main St'}, 25 | {'name': 'Charlie', 'age': 22, 'hobbies': ['reading', 'gaming']}, 26 | {'name': 'David', 'age': 40, 'email': 'david@example.com', 'address': '456 Elm St', 'active': True}, 27 | {'name': 'Eve', 'age': 35, 'email': 'eve@example.com', 'phone': '555-1234'} 28 | ] 29 | 30 | # MongoDB database and collection names 31 | db_name = 'no_sql_db' 32 | collection_name = 'best_collection_ever' 33 | 34 | # Connect to MongoDB 35 | client = MongoClient(**mongo_params) 36 | db = client[db_name] 37 | 38 | # Check if the collection exists, and create it if it doesn't 39 | if not collection_exists(db, collection_name): 40 | create_collection(db, collection_name) 41 | 42 | # Get the collection 43 | collection = db[collection_name] 44 | 45 | # Insert data into the collection 46 | insert_data(collection, documents) 47 | 48 | # Query data from the collection 49 | result = collection.find() 50 | for document in result: 51 | print(document) 52 | 53 | # Close the MongoDB connection 54 | client.close() 55 | -------------------------------------------------------------------------------- /chapter07/4a.kafka_producer.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient 2 | from confluent_kafka import Producer 3 | import json 4 | 5 | # MongoDB connection 6 | mongo_client = MongoClient('mongodb://localhost:27017') 7 | db = mongo_client['no_sql_db'] 8 | collection = db['best_collection_ever'] 9 | 10 | # Kafka producer configuration 11 | kafka_config = { 12 | 'bootstrap.servers': 'localhost:9092' 13 | } 14 | producer = Producer(kafka_config) 15 | 16 | def delivery_report(err, msg): 17 | if err is not None: 18 | print(f'Message delivery failed: {err}') 19 | else: 20 | print(f'Message delivered to {msg.topic()} [{msg.partition()}]') 21 | 22 | # Read from MongoDB and produce to Kafka 23 | for document in 
collection.find(): 24 | # Convert MongoDB document to JSON string 25 | message = json.dumps(document, default=str) 26 | 27 | # Produce message to Kafka 28 | producer.produce('mongodb_topic', value=message.encode('utf-8'), callback=delivery_report) 29 | producer.poll(0) 30 | 31 | producer.flush() -------------------------------------------------------------------------------- /chapter07/4b.kafka_consumer.py: -------------------------------------------------------------------------------- 1 | from confluent_kafka import Consumer, KafkaError 2 | import json 3 | import time 4 | 5 | # Kafka consumer configuration 6 | consumer_config = { 7 | 'bootstrap.servers': 'localhost:9092', 8 | 'group.id': 'mongodb_consumer_group', 9 | 'auto.offset.reset': 'earliest' 10 | } 11 | 12 | consumer = Consumer(consumer_config) 13 | consumer.subscribe(['mongodb_topic']) 14 | 15 | # Set the duration for which the consumer should run (in seconds) 16 | run_duration = 10 # For example, 10 seconds 17 | start_time = time.time() 18 | 19 | print("Starting consumer...") 20 | 21 | while True: 22 | # Check if the specified duration has passed 23 | if time.time() - start_time > run_duration: 24 | print("Time limit reached, shutting down consumer.") 25 | break 26 | 27 | msg = consumer.poll(1.0) 28 | 29 | if msg is None: 30 | continue 31 | if msg.error(): 32 | if msg.error().code() == KafkaError._PARTITION_EOF: 33 | print('Reached end of partition') 34 | else: 35 | print(f'Error: {msg.error()}') 36 | else: 37 | # Process the message 38 | document = json.loads(msg.value().decode('utf-8')) 39 | print(f'Received document: {document}') 40 | # Add your processing logic here 41 | 42 | consumer.close() 43 | print("Consumer closed.") -------------------------------------------------------------------------------- /chapter07/5.time_based_partitioning.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import pyarrow as pa 4 | import pyarrow.parquet as pq 5 | from datetime import datetime 6 | 7 | # Sample data 8 | data = {"timestamp": ["2022-01-01", "2022-01-01", "2022-01-02"], 9 | "value": [10, 15, 12]} 10 | 11 | # Create a Pandas DataFrame 12 | df = pd.DataFrame(data) 13 | 14 | # Convert the timestamp column to a datetime type 15 | df["timestamp"] = pd.to_datetime(df["timestamp"]) 16 | 17 | # Time-based partitioning 18 | base_path = "path_to_write_data" 19 | for timestamp, group in df.groupby(df["timestamp"].dt.date): 20 | # Create the directory if it doesn't exist 21 | os.makedirs(base_path, exist_ok=True) 22 | 23 | partition_path = os.path.join(base_path, str(timestamp)) 24 | 25 | table = pa.Table.from_pandas(group) 26 | pq.write_table(table, partition_path) 27 | 28 | # To read data from a specific partition 29 | specific_partition_path = "/Users/maria.zervou/projects/python_best_practices/data_sinks/data/2022-01-01" 30 | partitioned_data = pq.read_table(specific_partition_path).to_pandas() 31 | -------------------------------------------------------------------------------- /chapter07/6.geo_partitioning.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import pyarrow as pa 4 | import pyarrow.parquet as pq 5 | from datetime import datetime 6 | 7 | # Create a base directory for storing partitioned data 8 | base_directory = "/Users/maria.zervou/projects/python_best_practices/data_sinks/geo_data" 9 | os.makedirs(base_directory, exist_ok=True) 10 | 11 | # Geographic partitioning 12 | 
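# Note: the groupby loop below writes one Parquet file per region by hand. PyArrow can also
# build the partition layout itself; a possible alternative (a sketch, assuming the same
# geo_df and base_directory defined in this script) is:
#   table = pa.Table.from_pandas(geo_df)
#   pq.write_to_dataset(table, root_path=base_directory, partition_cols=["region"])
# which produces Hive-style directories such as region=North/ under base_directory.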
geo_data = {"region": ["North", "South", "East"], 13 | "value": [10, 15, 12]} 14 | geo_df = pd.DataFrame(geo_data) 15 | 16 | for region, group in geo_df.groupby("region"): 17 | # Create a directory for each region within the base directory 18 | region_path = os.path.join(base_directory, region) 19 | 20 | # Convert the group to a PyArrow Table and write it to the partition path 21 | table = pa.Table.from_pandas(group) 22 | pq.write_table(table, region_path) 23 | -------------------------------------------------------------------------------- /chapter07/7.hybrid_partitioning.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import pyarrow as pa 4 | import pyarrow.parquet as pq 5 | from datetime import datetime 6 | 7 | # Create a base directory for storing partitioned data 8 | base_directory = "/Users/maria.zervou/projects/python_best_practices/data_sinks/hybrid_data" 9 | 10 | # Hybrid partitioning 11 | hybrid_data = {"timestamp": ["2022-01-01", "2022-01-01", "2022-01-02"], 12 | "region": ["North", "South", "East"], 13 | "value": [10, 15, 12]} 14 | hybrid_df = pd.DataFrame(hybrid_data) 15 | 16 | for (timestamp, region), group in hybrid_df.groupby(["timestamp", "region"]): 17 | # Create a directory for each timestamp and region combination within the base directory 18 | timestamp_path = os.path.join(base_directory, str(timestamp)) 19 | os.makedirs(timestamp_path, exist_ok=True) 20 | timestamp_region_path = os.path.join(base_directory, str(timestamp), str(region)) 21 | 22 | # Convert the group to a PyArrow Table and write it to the partition path 23 | table = pa.Table.from_pandas(group) 24 | pq.write_table(table, timestamp_region_path) 25 | 26 | -------------------------------------------------------------------------------- /chapter07/__pycache__/pymongo.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter07/__pycache__/pymongo.cpython-312.pyc -------------------------------------------------------------------------------- /chapter07/setup/cleanup_script.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Function to print section headers 4 | print_header() { 5 | echo "========================================" 6 | echo "$1" 7 | echo "========================================" 8 | } 9 | 10 | # Stop and remove Docker containers 11 | print_header "Stopping and removing Docker containers" 12 | docker-compose down -v 13 | docker rm -f $(docker ps -aq) 14 | 15 | # Remove Kafka data 16 | print_header "Removing Kafka data" 17 | rm -rf /tmp/kafka-logs /tmp/zookeeper 18 | 19 | # MongoDB cleanup 20 | print_header "Cleaning up MongoDB" 21 | mongo < chi2_threshold] 47 | 48 | # Drop outliers from the dataframe 49 | df_no_outliers = df[df['Mahalanobis_Distance'] <= chi2_threshold] 50 | 51 | # Visualize the data and outliers in 3D space 52 | fig = plt.figure(figsize=(10, 8)) 53 | ax = fig.add_subplot(111, projection='3d') 54 | 55 | # Plot all data points in blue 56 | ax.scatter(df_no_outliers['X1'], df_no_outliers['X2'], df_no_outliers['Mahalanobis_Distance'], color='blue', label='Data Points') 57 | 58 | # Plot outliers in red 59 | ax.scatter(outliers['X1'], outliers['X2'], outliers['Mahalanobis_Distance'], color='red', label='Outliers') 60 | 61 | ax.set_xlabel('X1') 62 | 
ax.set_ylabel('X2') 63 | ax.set_zlabel('Mahalanobis Distance') 64 | ax.set_title('Outlier Detection using Mahalanobis Distance') 65 | 66 | plt.legend() 67 | plt.show() 68 | 69 | # Describe changes in the dataset 70 | print("\nOriginal Dataset Statistics:") 71 | print(df.describe()) 72 | 73 | print("\nDataset Statistics after Removing Outliers:") 74 | print(df_no_outliers.describe()) 75 | -------------------------------------------------------------------------------- /chapter08/13.clustering.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from sklearn.cluster import DBSCAN 5 | from sklearn.preprocessing import StandardScaler 6 | 7 | # Generate example data 8 | np.random.seed(42) 9 | data = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0.5], [0.5, 1]], size=100) 10 | outliers = np.random.multivariate_normal(mean=[8, 8], cov=[[1, 0], [0, 1]], size=10) 11 | data_with_outliers = np.vstack([data, outliers]) 12 | 13 | # Create a DataFrame 14 | df = pd.DataFrame(data_with_outliers, columns=['Feature1', 'Feature2']) 15 | 16 | # Visualize the data 17 | plt.scatter(df['Feature1'], df['Feature2'], color='blue', label='Inliers') 18 | plt.scatter(outliers[:, 0], outliers[:, 1], color='red', marker='x', label='Outliers') # Use 'x' as the marker for outliers 19 | plt.title('Original Data with Outliers') 20 | plt.xlabel('Feature1') 21 | plt.ylabel('Feature2') 22 | plt.legend() 23 | plt.show() 24 | 25 | # Standardize the data 26 | scaler = StandardScaler() 27 | data_scaled = scaler.fit_transform(df) 28 | 29 | # Apply DBSCAN for outlier detection 30 | dbscan = DBSCAN(eps=0.4, min_samples=5) 31 | df['Outlier'] = dbscan.fit_predict(data_scaled) 32 | 33 | # Visualize the results 34 | plt.scatter(df['Feature1'][df['Outlier'] == -1], df['Feature2'][df['Outlier'] == -1], color='red', marker='x', label='Outliers') 35 | plt.scatter(df['Feature1'][df['Outlier'] != -1], df['Feature2'][df['Outlier'] != -1], color='blue', label='Inliers') 36 | plt.title('Outlier Detection with DBSCAN') 37 | plt.xlabel('Feature1') 38 | plt.ylabel('Feature2') 39 | plt.legend() 40 | plt.show() 41 | -------------------------------------------------------------------------------- /chapter08/14.multivariate_trimming.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | from scipy.stats import chi2 6 | from mpl_toolkits.mplot3d import Axes3D 7 | 8 | # Generate multivariate student data 9 | np.random.seed(42) 10 | data = np.random.multivariate_normal(mean=[0, 0], cov=[[1, 0.5], [0.5, 1]], size=100) 11 | 12 | # Introduce outliers 13 | outliers = np.array([[8, 8], [9, 9]]) 14 | data = np.concatenate([data, outliers]) 15 | 16 | df = pd.DataFrame(data, columns=['X1', 'X2']) 17 | 18 | def mahalanobis_distance(x, mean, inv_cov_matrix): 19 | # Center the data 20 | centered_data = x - mean 21 | 22 | # Calculate Mahalanobis Distance 23 | mahalanobis_dist = np.sqrt(np.dot(centered_data, np.dot(inv_cov_matrix, centered_data))) 24 | 25 | return mahalanobis_dist 26 | 27 | # Ensure data is of type float 28 | df[['X1', 'X2']] = df[['X1', 'X2']].astype(float) 29 | 30 | # Center the data 31 | mean = np.mean(df[['X1', 'X2']], axis=0) 32 | 33 | # Calculate the covariance matrix 34 | cov_matrix = np.cov(df[['X1', 'X2']], rowvar=False) 35 | 36 | # Calculate the inverse of the covariance matrix 
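# Note: np.linalg.inv assumes the covariance matrix is well conditioned; with strongly
# correlated features or very few rows it can be singular. A common fallback (a sketch,
# not used in this example) is the pseudo-inverse:
#   inv_cov_matrix = np.linalg.pinv(cov_matrix)
# scipy.spatial.distance.mahalanobis computes the same distance given this inverse.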
37 | inv_cov_matrix = np.linalg.inv(cov_matrix) 38 | 39 | # Calculate Mahalanobis Distance for each data point 40 | df['Mahalanobis_Distance'] = df.apply(lambda row: mahalanobis_distance(row[['X1', 'X2']], mean, inv_cov_matrix), axis=1) 41 | 42 | # Set a significance level for outlier detection 43 | alpha = 0.1 44 | chi2_threshold = chi2.ppf(1 - alpha, df=2) # df is the degrees of freedom, which is the number of features 45 | 46 | # Identify outliers 47 | outliers = df[df['Mahalanobis_Distance'] > chi2_threshold] 48 | 49 | # Drop outliers from the dataframe 50 | df_no_outliers = df[df['Mahalanobis_Distance'] <= chi2_threshold] 51 | 52 | # Visualize the distribution plots before and after removing outliers 53 | plt.figure(figsize=(12, 5)) 54 | 55 | plt.subplot(1, 2, 1) 56 | plt.title("Distribution of 'X1' Before Outlier Handling") 57 | sns.histplot(df['X1'], bins=20, color='blue', kde=True) 58 | plt.xlabel('X1') 59 | plt.ylabel('Frequency') 60 | 61 | plt.subplot(1, 2, 2) 62 | plt.title("Distribution of 'X2' Before Outlier Handling") 63 | sns.histplot(df['X2'], bins=20, color='orange', kde=True) 64 | plt.xlabel('X2') 65 | plt.ylabel('Frequency') 66 | 67 | plt.tight_layout() 68 | plt.show() 69 | 70 | plt.figure(figsize=(12, 5)) 71 | 72 | plt.subplot(1, 2, 1) 73 | plt.title("Distribution of 'X1' After Outlier Handling") 74 | sns.histplot(df_no_outliers['X1'], bins=20, color='blue', kde=True) 75 | plt.xlabel('X1') 76 | plt.ylabel('Frequency') 77 | 78 | plt.subplot(1, 2, 2) 79 | plt.title("Distribution of 'X2' After Outlier Handling") 80 | sns.histplot(df_no_outliers['X2'], bins=20, color='orange', kde=True) 81 | plt.xlabel('X2') 82 | plt.ylabel('Frequency') 83 | 84 | plt.tight_layout() 85 | plt.show() 86 | 87 | # Visualize the data and outliers in 3D space 88 | fig = plt.figure(figsize=(10, 8)) 89 | ax = fig.add_subplot(111, projection='3d') 90 | 91 | # Plot all data points in blue 92 | ax.scatter(df_no_outliers['X1'], df_no_outliers['X2'], df_no_outliers['Mahalanobis_Distance'], color='blue', label='Data Points') 93 | 94 | # Plot outliers with a different symbol (e.g., 'x') in red 95 | ax.scatter(outliers['X1'], outliers['X2'], outliers['Mahalanobis_Distance'], color='red', marker='x', label='Outliers') 96 | 97 | ax.set_xlabel('X1') 98 | ax.set_ylabel('X2') 99 | ax.set_zlabel('Mahalanobis Distance') 100 | ax.set_title('Outlier Detection using Mahalanobis Distance') 101 | 102 | plt.legend() 103 | plt.show() 104 | 105 | # Describe changes in the dataset 106 | print("\nOriginal Dataset Statistics:") 107 | print(df.describe()) 108 | 109 | print("\nDataset Statistics after Removing Outliers:") 110 | print(df_no_outliers.describe()) 111 | -------------------------------------------------------------------------------- /chapter08/2.delete_missing_data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | # Generate student data with missing ages and test scores 6 | data = {'Age': [18, 20, None, 22, 21, 19, None, 23, 18, 24, 40, 41, 45, None, 34, None, 25, 30, 32, 24, 35, 38, 76, 90], 7 | 'Test_Score': [85, None, 90, 92, None, 88, 94, 91, None, 87, 75, 78, 80, None, 74, 20, 50, 68, None, 58, 48, 59, 10, 5]} 8 | 9 | df = pd.DataFrame(data) 10 | 11 | # Display the original dataset statistics 12 | print("Original Dataset Statistics:") 13 | print(df.describe()) 14 | 15 | # Plot the distributions before deletion 16 | plt.figure(figsize=(12, 5)) 17 | 18 | plt.subplot(1, 2, 1) 19 | 
plt.title("Distribution of 'Age' Before Deletion") 20 | plt.hist(df['Age'].dropna(), bins=10, color='blue', alpha=0.7, label='Original') 21 | plt.legend() 22 | 23 | plt.subplot(1, 2, 2) 24 | plt.title("Distribution of 'Test_Score' Before Deletion") 25 | plt.hist(df['Test_Score'].dropna(), bins=10, color='orange', alpha=0.7, label='Original') 26 | plt.legend() 27 | 28 | plt.tight_layout() 29 | plt.show() 30 | 31 | # Delete rows with any missing values 32 | df_no_missing = df.dropna() 33 | 34 | # Display the dataset after deletion 35 | print("\nDataset after Deleting Rows with Missing Values:") 36 | print(df_no_missing) 37 | 38 | # Display the dataset statistics after deletion 39 | print("\nDataset Statistics after Deleting Rows with Missing Values:") 40 | print(df_no_missing.describe()) 41 | 42 | # Plot the distributions after deletion 43 | plt.figure(figsize=(12, 5)) 44 | 45 | plt.subplot(1, 2, 1) 46 | plt.title("Distribution of 'Age' After Deletion") 47 | plt.hist(df_no_missing['Age'], bins=10, color='blue', alpha=0.7, label='After Deletion') 48 | plt.legend() 49 | 50 | plt.subplot(1, 2, 2) 51 | plt.title("Distribution of 'Test_Score' After Deletion") 52 | plt.hist(df_no_missing['Test_Score'], bins=10, color='orange', alpha=0.7, label='After Deletion') 53 | plt.legend() 54 | 55 | plt.tight_layout() 56 | plt.show() 57 | 58 | # Explain the changes and size drop 59 | print("\nExplanation:") 60 | print("The rows containing missing values were removed, resulting in a smaller dataset.") 61 | -------------------------------------------------------------------------------- /chapter08/3.mean_imputation.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | # Create a sample dataset with missing values 6 | np.random.seed(42) 7 | 8 | # Generate student data with missing ages and test scores 9 | data = {'Age': [18, 20, None, 22, 21, 19, None, 23, 18, 24, 40, 41, 45, None, 34, None, 25, 30, 32, 24, 35, 38, 76, 90], 10 | 'Test_Score': [85, None, 90, 92, None, 88, 94, 91, None, 87, 75, 78, 80, None, 74, 20, 50, 68, None, 58, 48, 59, 10, 5]} 11 | 12 | df = pd.DataFrame(data) 13 | print(df) 14 | 15 | # Display the original dataset statistics 16 | print("Original Dataset Statistics:") 17 | print(df.describe()) 18 | 19 | # Plot the distributions before mean imputation 20 | plt.figure(figsize=(12, 5)) 21 | 22 | plt.subplot(1, 2, 1) 23 | plt.title("Distribution of 'Age' Before Mean Imputation") 24 | plt.hist(df['Age'].dropna(), bins=10, color='blue', alpha=0.7, label='Original') 25 | plt.legend() 26 | 27 | plt.subplot(1, 2, 2) 28 | plt.title("Distribution of 'Test_Score' Before Mean Imputation") 29 | plt.hist(df['Test_Score'].dropna(), bins=10, color='orange', alpha=0.7, label='Original') 30 | plt.legend() 31 | 32 | plt.tight_layout() 33 | plt.show() 34 | 35 | # Mean imputation for missing ages and test scores with rounded mean for 'Age' 36 | df_mean_imputed = df.copy() 37 | df_mean_imputed['Age'].fillna(round(df['Age'].mean()), inplace=True) 38 | df_mean_imputed['Test_Score'].fillna(df['Test_Score'].mean(), inplace=True) 39 | 40 | # Display the dataset after mean imputation 41 | print("\nDataset after Mean Imputation:") 42 | print(df_mean_imputed) 43 | 44 | # Display the dataset statistics after mean imputation 45 | print("\nDataset Statistics after Mean Imputation:") 46 | print(df_mean_imputed.describe()) 47 | 48 | # Plot the distributions after mean imputation 49 | 
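# Note: because the 'Age' column includes large values such as 76 and 90, the column mean is
# pulled upward, so every imputed age inherits that pull and the overall variance shrinks.
# The median-based version in 4.median_imputation.py is less sensitive to those extremes.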
plt.figure(figsize=(12, 5)) 50 | 51 | plt.subplot(1, 2, 1) 52 | plt.title("Distribution of 'Age' After Mean Imputation") 53 | plt.hist(df_mean_imputed['Age'], bins=10, color='blue', alpha=0.7, label='Imputed') 54 | plt.legend() 55 | 56 | plt.subplot(1, 2, 2) 57 | plt.title("Distribution of 'Test_Score' After Mean Imputation") 58 | plt.hist(df_mean_imputed['Test_Score'], bins=10, color='orange', alpha=0.7, label='Imputed') 59 | plt.legend() 60 | 61 | plt.tight_layout() 62 | plt.show() 63 | -------------------------------------------------------------------------------- /chapter08/4.median_imputation.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | # Generate student data with missing ages and test scores 7 | data = {'Age': [18, 20, None, 22, 21, 19, None, 23, 18, 24, 40, 41, 45, None, 34, None, 25, 30, 32, 24, 35, 38, 76, 90], 8 | 'Test_Score': [85, None, 90, 92, None, 88, 94, 91, None, 87, 75, 78, 80, None, 74, 20, 50, 68, None, 58, 48, 59, 10, 5]} 9 | 10 | df = pd.DataFrame(data) 11 | 12 | # Display the original dataset statistics 13 | print("Original Dataset Statistics:") 14 | print(df.describe()) 15 | 16 | # Plot the distributions before median imputation 17 | plt.figure(figsize=(12, 5)) 18 | 19 | plt.subplot(1, 2, 1) 20 | plt.title("Distribution of 'Age' Before Median Imputation") 21 | plt.hist(df['Age'].dropna(), bins=10, color='blue', alpha=0.7, label='Original') 22 | plt.legend() 23 | 24 | plt.subplot(1, 2, 2) 25 | plt.title("Distribution of 'Test_Score' Before Median Imputation") 26 | plt.hist(df['Test_Score'].dropna(), bins=10, color='orange', alpha=0.7, label='Original') 27 | plt.legend() 28 | 29 | plt.tight_layout() 30 | plt.show() 31 | 32 | # Median imputation for missing ages and test scores 33 | df_median_imputed = df.copy() 34 | df_median_imputed['Age'].fillna(df['Age'].median(), inplace=True) 35 | df_median_imputed['Test_Score'].fillna(df['Test_Score'].median(), inplace=True) 36 | 37 | # Display the dataset after median imputation 38 | print("\nDataset after Median Imputation:") 39 | print(df_median_imputed) 40 | 41 | # Display the dataset statistics after median imputation 42 | print("\nDataset Statistics after Median Imputation:") 43 | print(df_median_imputed.describe()) 44 | 45 | # Plot the distributions after median imputation 46 | plt.figure(figsize=(12, 5)) 47 | 48 | plt.subplot(1, 2, 1) 49 | plt.title("Distribution of 'Age' After Median Imputation") 50 | plt.hist(df_median_imputed['Age'], bins=10, color='blue', alpha=0.7, label='Imputed') 51 | plt.legend() 52 | 53 | plt.subplot(1, 2, 2) 54 | plt.title("Distribution of 'Test_Score' After Median Imputation") 55 | plt.hist(df_median_imputed['Test_Score'], bins=10, color='orange', alpha=0.7, label='Imputed') 56 | plt.legend() 57 | 58 | plt.tight_layout() 59 | plt.show() 60 | 61 | 62 | print(df['Age'].median()) -------------------------------------------------------------------------------- /chapter08/5.indicator_imputation.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | # Create a sample dataset with missing values 6 | np.random.seed(42) 7 | 8 | data = {'Age': [18, 20, None, 22, 21, 19, None, 23, 18, 24, 40, 41, 45, None, 34, None, 25, 30, 32, 24, 35, 38, 76, 90], 9 | 'Test_Score': [85, None, 90, 92, None, 88, 94, 91, None, 87, 75, 78, 80, None, 74, 20, 50, 68, None, 
58, 48, 59, 10, 5]} 10 | 11 | df = pd.DataFrame(data) 12 | 13 | # Create indicator variables for missing values 14 | df['Age_missing'] = df['Age'].isnull().astype(int) 15 | df['Test_Score_missing'] = df['Test_Score'].isnull().astype(int) 16 | 17 | # Display the original dataset 18 | print("Original Dataset:") 19 | print(df) 20 | 21 | # Impute missing values with a placeholder (e.g., mean or median) 22 | df_imputed = df.copy() 23 | df_imputed['Age'].fillna(df_imputed['Age'].mean(), inplace=True) 24 | df_imputed['Test_Score'].fillna(df_imputed['Test_Score'].mean(), inplace=True) 25 | 26 | # Display the dataset after imputation 27 | print("\nDataset after Indicator Variable Imputation:") 28 | print(df_imputed) 29 | 30 | # Plot distribution charts for indicator variables 31 | plt.figure(figsize=(12, 5)) 32 | 33 | plt.subplot(1, 2, 1) 34 | plt.title("Distribution of Age_missing") 35 | df['Age_missing'].value_counts().plot(kind='bar', color=['blue', 'orange']) 36 | plt.xlabel("Missing (1) / Not Missing (0)") 37 | plt.ylabel("Count") 38 | 39 | plt.subplot(1, 2, 2) 40 | plt.title("Distribution of Test_Score_missing") 41 | df['Test_Score_missing'].value_counts().plot(kind='bar', color=['blue', 'orange']) 42 | plt.xlabel("Missing (1) / Not Missing (0)") 43 | plt.ylabel("Count") 44 | 45 | plt.tight_layout() 46 | plt.show() 47 | 48 | import seaborn as sns 49 | 50 | plt.figure(figsize=(12, 5)) 51 | 52 | plt.subplot(1, 2, 1) 53 | sns.boxplot(x='Age_missing', y='Test_Score', data=df_imputed) 54 | plt.title("Boxplot of Test_Score by Age_missing") 55 | 56 | plt.subplot(1, 2, 2) 57 | sns.boxplot(x='Test_Score_missing', y='Age', data=df_imputed) 58 | plt.title("Boxplot of Age by Test_Score_missing") 59 | 60 | plt.tight_layout() 61 | plt.show() 62 | 63 | -------------------------------------------------------------------------------- /chapter08/6.outliers_visualisation.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from scipy import stats 5 | 6 | # Original dataset 7 | data = {'Age': [18, 20, None, 22, 21, 19, None, 23, 18, 24, 40, 41, 45, None, 34, None, 25, 30, 32, 24, 35, 38, 76, 90], 8 | 'Test_Score': [85, None, 90, 92, None, 88, 94, 91, None, 87, 75, 78, 80, None, 74, 20, 50, 68, None, 58, 48, 59, 10, 5]} 9 | 10 | df = pd.DataFrame(data) 11 | 12 | # Calculate Z-Scores for each column 13 | z_scores_age = np.abs(stats.zscore(df['Age'].dropna())) 14 | z_scores_test_score = np.abs(stats.zscore(df['Test_Score'].dropna())) 15 | 16 | # Set Z-Score threshold 17 | z_threshold = 3 18 | 19 | # Identify outliers 20 | outliers_age = np.where(z_scores_age > z_threshold)[0] 21 | outliers_test_score = np.where(z_scores_test_score > z_threshold)[0] 22 | 23 | # Output identified outliers (the positions refer to the non-null series used for the Z-Scores) 24 | print("Outliers in 'Age':", df['Age'].dropna().iloc[outliers_age].to_list()) 25 | print("Outliers in 'Test_Score':", df['Test_Score'].dropna().iloc[outliers_test_score].to_list()) 26 | 27 | plt.figure(figsize=(12, 5)) 28 | 29 | plt.subplot(1, 2, 1) 30 | plt.title("Violin Plot for 'Age'") 31 | plt.violinplot(df['Age'].dropna(), vert=False) 32 | 33 | plt.subplot(1, 2, 2) 34 | plt.title("Violin Plot for 'Test_Score'") 35 | plt.violinplot(df['Test_Score'].dropna(), vert=False) 36 | 37 | plt.tight_layout() 38 | plt.show() 39 | 40 | 41 | plt.figure(figsize=(12, 5)) 42 | 43 | plt.subplot(1, 2, 1) 44 | plt.title("Box Plot for 'Age'") 45 | plt.boxplot(df['Age'].dropna(), vert=False) 46 | 47 | plt.subplot(1, 2, 2) 48 | plt.title("Box Plot
for 'Test_Score'") 49 | plt.boxplot(df['Test_Score'].dropna(), vert=False) 50 | 51 | plt.tight_layout() 52 | plt.show() 53 | -------------------------------------------------------------------------------- /chapter08/7.identify_univariate_outliers.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from scipy import stats 5 | 6 | # Original dataset 7 | data = {'Age': [18, 20, None, 22, 21, 19, None, 23, 18, 24, 40, 41, 45, None, 34, None, 25, 30, 32, 24, 35, 38, 76, 90], 8 | 'Test_Score': [85, None, 90, 92, None, 88, 94, 91, None, 87, 75, 78, 80, None, 74, 20, 50, 68, None, 58, 48, 59, 10, 5]} 9 | 10 | df = pd.DataFrame(data) 11 | 12 | # Calculate Z-Scores for each column 13 | z_scores_age = np.abs(stats.zscore(df['Age'].dropna())) 14 | z_scores_test_score = np.abs(stats.zscore(df['Test_Score'].dropna())) 15 | 16 | # Set Z-Score threshold 17 | z_threshold = 3 18 | 19 | # Identify outliers 20 | outliers_age = np.where(z_scores_age > z_threshold)[0] 21 | outliers_test_score = np.where(z_scores_test_score > z_threshold)[0] 22 | 23 | # Plot Z-Scores 24 | plt.figure(figsize=(12, 5)) 25 | 26 | plt.subplot(1, 2, 1) 27 | plt.title("Z-Scores for 'Age'") 28 | plt.scatter(range(len(z_scores_age)), z_scores_age, color='blue', label='Z-Scores') 29 | plt.axhline(y=z_threshold, color='red', linestyle='--', label='Threshold') 30 | plt.legend() 31 | 32 | plt.subplot(1, 2, 2) 33 | plt.title("Z-Scores for 'Test_Score'") 34 | plt.scatter(range(len(z_scores_test_score)), z_scores_test_score, color='orange', label='Z-Scores') 35 | plt.axhline(y=z_threshold, color='red', linestyle='--', label='Threshold') 36 | plt.legend() 37 | 38 | plt.tight_layout() 39 | plt.show() 40 | 41 | # Function to identify outliers using IQR 42 | def identify_outliers(column): 43 | Q1 = df[column].quantile(0.25) 44 | Q3 = df[column].quantile(0.75) 45 | IQR = Q3 - Q1 46 | lower_bound = Q1 - 1.5 * IQR 47 | upper_bound = Q3 + 1.5 * IQR 48 | outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)] 49 | return outliers 50 | 51 | # Identify and print outliers for 'Age' 52 | age_outliers = identify_outliers('Age') 53 | print("Outliers in 'Age':") 54 | print(age_outliers) 55 | 56 | # Identify and print outliers for 'Test_Score' 57 | test_score_outliers = identify_outliers('Test_Score') 58 | print("\nOutliers in 'Test_Score':") 59 | print(test_score_outliers) 60 | 61 | # Visualize the distribution of 'Age' and 'Test_Score' using box plots 62 | plt.figure(figsize=(12, 5)) 63 | 64 | plt.subplot(1, 2, 1) 65 | plt.title("Box Plot of 'Age'") 66 | plt.boxplot(df['Age'].dropna()) 67 | plt.xticks([1], ['Age']) 68 | 69 | plt.subplot(1, 2, 2) 70 | plt.title("Box Plot of 'Test_Score'") 71 | plt.boxplot(df['Test_Score'].dropna()) 72 | plt.xticks([1], ['Test_Score']) 73 | 74 | plt.tight_layout() 75 | plt.show() 76 | -------------------------------------------------------------------------------- /chapter08/8.handle_univariate_outliers_deletions.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | # Generate student data with missing ages and test scores 6 | data = {'Age': [18, 20, None, 22, 21, 19, None, 23, 18, 24, 40, 41, 45, None, 34, None, 25, 30, 32, 24, 35, 38, 76, 90], 7 | 'Test_Score': [85, None, 90, 92, None, 88, 94, 91, None, 87, 75, 78, 80, None, 74, 20, 50, 68, None, 58, 48, 59, 10, 5]} 8 | 9 
| df = pd.DataFrame(data) 10 | 11 | # Fill NaN values with the mean of each column 12 | df.fillna(df.mean(), inplace=True) 13 | 14 | # Display the original dataset statistics 15 | print("Original Dataset Statistics:") 16 | print(df.describe()) 17 | 18 | # Plot the distributions before outlier handling 19 | plt.figure(figsize=(12, 5)) 20 | 21 | plt.subplot(1, 2, 1) 22 | plt.title("Distribution of 'Age' Before Outlier Handling") 23 | plt.hist(df['Age'], bins=10, color='blue', alpha=0.7, label='Original') 24 | plt.legend() 25 | 26 | plt.subplot(1, 2, 2) 27 | plt.title("Distribution of 'Test_Score' Before Outlier Handling") 28 | plt.hist(df['Test_Score'], bins=10, color='orange', alpha=0.7, label='Original') 29 | plt.legend() 30 | 31 | plt.tight_layout() 32 | plt.show() 33 | 34 | # Identify and handle outliers using interquartile range (IQR) 35 | Q1 = df['Test_Score'].quantile(0.25) 36 | Q3 = df['Test_Score'].quantile(0.75) 37 | IQR = Q3 - Q1 38 | 39 | outlier_threshold = 1.5 40 | lower_bound = Q1 - outlier_threshold * IQR 41 | upper_bound = Q3 + outlier_threshold * IQR 42 | 43 | df_no_outliers = df[(df['Test_Score'] >= lower_bound) & (df['Test_Score'] <= upper_bound)].copy() 44 | 45 | # Display the dataset after outlier handling 46 | print("\nDataset after Outlier Handling:") 47 | print(df_no_outliers) 48 | 49 | # Display the dataset statistics after outlier handling 50 | print("\nDataset Statistics after Outlier Handling:") 51 | print(df_no_outliers.describe()) 52 | 53 | # Plot the distributions after outlier handling 54 | plt.figure(figsize=(12, 5)) 55 | 56 | plt.subplot(1, 2, 1) 57 | plt.title("Distribution of 'Age' After Outlier Handling") 58 | plt.hist(df_no_outliers['Age'], bins=10, color='blue', alpha=0.7, label='Cleaned') 59 | plt.legend() 60 | 61 | plt.subplot(1, 2, 2) 62 | plt.title("Distribution of 'Test_Score' After Outlier Handling") 63 | plt.hist(df_no_outliers['Test_Score'], bins=10, color='orange', alpha=0.7, label='Cleaned') 64 | plt.legend() 65 | 66 | plt.tight_layout() 67 | plt.show() 68 | -------------------------------------------------------------------------------- /chapter08/9.trimming.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | # Generate student data with missing ages and test scores 6 | data = {'Age': [18, 20, None, 22, 21, 19, None, 23, 18, 24, 40, 41, 45, None, 34, None, 25, 30, 32, 24, 35, 38, 76, 90], 7 | 'Test_Score': [85, None, 90, 92, None, 88, 94, 91, None, 87, 75, 78, 80, None, 74, 20, 50, 68, None, 58, 48, 59, 10, 5]} 8 | 9 | df = pd.DataFrame(data) 10 | 11 | # Fill NaN values with the mean of each column 12 | df.fillna(df.mean(), inplace=True) 13 | 14 | # Display the original dataset statistics 15 | print("Original Dataset Statistics:") 16 | print(df.describe()) 17 | 18 | # Plot the distributions before outlier handling 19 | plt.figure(figsize=(12, 5)) 20 | 21 | plt.subplot(1, 2, 1) 22 | plt.title("Distribution of 'Age' Before Outlier Handling") 23 | plt.hist(df['Age'], bins=10, color='blue', alpha=0.7, label='Original') 24 | plt.legend() 25 | 26 | plt.subplot(1, 2, 2) 27 | plt.title("Distribution of 'Test_Score' Before Outlier Handling") 28 | plt.hist(df['Test_Score'], bins=10, color='orange', alpha=0.7, label='Original') 29 | plt.legend() 30 | 31 | plt.tight_layout() 32 | plt.show() 33 | 34 | # Drop the 10% of values on each side of the distribution for 'Age' column 35 | df_trimmed = df[(df['Age'] >= 
df['Age'].quantile(0.1)) & (df['Age'] <= df['Age'].quantile(0.9))] 36 | 37 | # Calculate trimmed mean for each column 38 | df_trimmed_mean = df_trimmed.mean() 39 | 40 | # Display the trimmed dataset statistics 41 | print("\nTrimmed Dataset Statistics:") 42 | print(df_trimmed.describe()) 43 | 44 | # Display the trimmed mean for each column 45 | print("\nTrimmed Mean:") 46 | print(df_trimmed_mean) 47 | 48 | # Plot the distributions after trimming 49 | plt.figure(figsize=(12, 5)) 50 | 51 | plt.subplot(1, 2, 1) 52 | plt.title("Distribution of 'Age' After Trimming") 53 | plt.hist(df_trimmed['Age'], bins=10, color='blue', alpha=0.7, label='Trimmed') 54 | plt.legend() 55 | 56 | plt.subplot(1, 2, 2) 57 | plt.title("Distribution of 'Test_Score' After Trimming") 58 | plt.hist(df_trimmed['Test_Score'], bins=10, color='orange', alpha=0.7, label='Trimmed') 59 | plt.legend() 60 | 61 | plt.tight_layout() 62 | plt.show() 63 | -------------------------------------------------------------------------------- /chapter09/min_max_scaling.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from sklearn.preprocessing import MinMaxScaler 5 | 6 | # Create a dataset with features related to housing prices 7 | np.random.seed(42) 8 | num_samples = 100 9 | 10 | # Square footage in square feet 11 | square_footage = np.random.uniform(500, 5000, num_samples) 12 | 13 | # Distance to the nearest school in miles 14 | distance_to_school = np.random.uniform(0.1, 5, num_samples) 15 | 16 | # Commute distance to work in miles 17 | commute_distance = np.random.exponential(5, num_samples) 18 | 19 | # Traffic density (skewed feature) 20 | traffic_density = np.random.exponential(2, num_samples) 21 | 22 | # Create a DataFrame 23 | data = pd.DataFrame({ 24 | 'Square_Footage': square_footage, 25 | 'Distance_to_School': distance_to_school, 26 | 'Commute_Distance': commute_distance, 27 | 'Traffic_Density': traffic_density 28 | }) 29 | 30 | 31 | # Display original dataset statistics 32 | print("Original Dataset Statistics:") 33 | print(data.describe()) 34 | 35 | # Plot the distributions before scaling 36 | plt.figure(figsize=(12, 8)) 37 | 38 | for i, column in enumerate(data.columns): 39 | plt.subplot(2, 2, i+1) 40 | plt.title(f"Distribution of '{column}' Before Scaling") 41 | plt.hist(data[column], bins=20, color='blue', alpha=0.7) 42 | plt.xlabel(column) 43 | 44 | plt.tight_layout() 45 | plt.show() 46 | 47 | # Apply Min-Max scaling 48 | scaler = MinMaxScaler() 49 | data_scaled = pd.DataFrame(scaler.fit_transform(data), columns=data.columns) 50 | 51 | # Display dataset statistics after scaling 52 | print("\nDataset Statistics After Scaling:") 53 | print(data_scaled.describe()) 54 | 55 | # Plot the distributions after scaling 56 | plt.figure(figsize=(12, 8)) 57 | 58 | for i, column in enumerate(data_scaled.columns): 59 | plt.subplot(2, 2, i+1) 60 | plt.title(f"Distribution of '{column}' After Scaling") 61 | plt.hist(data_scaled[column], bins=20, color='green', alpha=0.7) 62 | plt.xlabel(column) 63 | 64 | plt.tight_layout() 65 | plt.show() 66 | -------------------------------------------------------------------------------- /chapter09/robust_scaler.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.preprocessing import RobustScaler 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | 6 | 7 | # Create a dataset with features related to 
housing prices 8 | np.random.seed(42) 9 | num_samples = 100 10 | 11 | # Square footage in square feet 12 | square_footage = np.random.uniform(500, 5000, num_samples) 13 | 14 | # Distance to the nearest school in miles 15 | distance_to_school = np.random.uniform(0.1, 5, num_samples) 16 | 17 | # Commute distance to work in miles 18 | commute_distance = np.random.exponential(5, num_samples) 19 | 20 | # Traffic density (skewed feature) 21 | traffic_density = np.random.exponential(2, num_samples) 22 | 23 | # Create a DataFrame 24 | data = pd.DataFrame({ 25 | 'Square_Footage': square_footage, 26 | 'Distance_to_School': distance_to_school, 27 | 'Commute_Distance': commute_distance, 28 | 'Traffic_Density': traffic_density 29 | }) 30 | 31 | 32 | # Display original dataset statistics 33 | print("Original Dataset Statistics:") 34 | print(data.describe()) 35 | 36 | # Plot the distributions before scaling 37 | plt.figure(figsize=(12, 8)) 38 | 39 | for i, column in enumerate(data.columns): 40 | plt.subplot(2, 2, i+1) 41 | plt.title(f"Distribution of '{column}' Before Scaling") 42 | plt.hist(data[column], bins=20, color='blue', alpha=0.7) 43 | plt.xlabel(column) 44 | 45 | plt.tight_layout() 46 | plt.show() 47 | 48 | # Applying RobustScaler 49 | robust_scaler = RobustScaler() 50 | data_scaled = robust_scaler.fit_transform(data) 51 | 52 | # Converting the scaled data back to a DataFrame 53 | data_scaled = pd.DataFrame(data_scaled, columns=data.columns) 54 | 55 | # Displaying the dataset after scaling 56 | print("\nDataset after Robust Scaling:") 57 | print(data_scaled.describe()) 58 | 59 | # Plotting the distributions after scaling 60 | plt.figure(figsize=(12, 8)) 61 | 62 | for i, col in enumerate(data_scaled.columns, 1): 63 | plt.subplot(2, 2, i) 64 | plt.title(f"Distribution of {col} After Robust Scaling") 65 | plt.hist(data_scaled[col], bins=20, color='orange', alpha=0.7) 66 | 67 | plt.tight_layout() 68 | plt.show() 69 | -------------------------------------------------------------------------------- /chapter09/zscaler.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | np.random.seed(42) 6 | num_samples = 100 7 | 8 | # Square footage in square feet 9 | square_footage = np.random.uniform(500, 5000, num_samples) 10 | 11 | # Distance to the nearest school in miles 12 | distance_to_school = np.random.uniform(0.1, 5, num_samples) 13 | 14 | # Commute distance to work in miles 15 | commute_distance = np.random.exponential(5, num_samples) 16 | 17 | # Traffic density (skewed feature) 18 | traffic_density = np.random.exponential(2, num_samples) 19 | # Create a DataFrame with all features 20 | data = pd.DataFrame({ 21 | 'Square_Footage': square_footage, 22 | 'Distance_to_School': distance_to_school, 23 | 'Commute_Distance': commute_distance, 24 | 'Traffic_Density': traffic_density 25 | }) 26 | 27 | # Print original dataset statistics 28 | print("Original Dataset Statistics:") 29 | print(data.describe()) 30 | 31 | # Visualize the original distributions 32 | data.hist(figsize=(12, 10), bins=20, color='blue', alpha=0.7) 33 | plt.suptitle('Original Data Distributions') 34 | plt.show() 35 | 36 | # Z-score scaling 37 | data_zscore = (data - data.mean()) / data.std() 38 | 39 | # Print dataset statistics after Z-score scaling 40 | print("\nDataset Statistics after Z-score Scaling:") 41 | print(data_zscore.describe()) 42 | 43 | # Visualize the distributions after Z-score scaling 44 | 
data_zscore.hist(figsize=(12, 10), bins=20, color='green', alpha=0.7) 45 | plt.suptitle('Data Distributions after Z-score Scaling') 46 | plt.show() 47 | -------------------------------------------------------------------------------- /chapter10/1a.label_encoding.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.preprocessing import LabelEncoder 3 | import seaborn as sns 4 | import matplotlib.pyplot as plt 5 | 6 | # Sample dataset 7 | data = { 8 | 'Employee Rating': ['Poor', 'Good', 'Satisfactory', 'Excellent', 'Good'], 9 | 'Salary': [35000, 50000, 42000, 60000, 52000], 10 | 'Years of Experience': [2, 5, 3, 8, 6], 11 | 'Department': ['HR', 'IT', 'Finance', 'IT', 'Marketing'] 12 | } 13 | 14 | df = pd.DataFrame(data) 15 | print("Original DataFrame:") 16 | print(df) 17 | 18 | # Initialize the LabelEncoder 19 | label_encoder = LabelEncoder() 20 | 21 | # Apply label encoding to the 'Employee Rating' column 22 | df['Employee Rating (Encoded)'] = label_encoder.fit_transform(df['Employee Rating']) 23 | 24 | print("\nDataFrame after Label Encoding:") 25 | print(df) 26 | 27 | # Plot the distribution of the 'Employee Rating' column before encoding 28 | plt.figure(figsize=(14, 6)) 29 | 30 | plt.subplot(1, 2, 1) 31 | sns.countplot(x='Employee Rating', data=df, order=df['Employee Rating'].value_counts().index) 32 | plt.title('Distribution of Employee Rating (Before Encoding)') 33 | plt.xlabel('Employee Rating') 34 | plt.ylabel('Count') 35 | 36 | # Plot the distribution of the 'Employee Rating (Encoded)' column after encoding 37 | plt.subplot(1, 2, 2) 38 | sns.countplot(x='Employee Rating (Encoded)', data=df, order=df['Employee Rating (Encoded)'].value_counts().index) 39 | plt.title('Distribution of Employee Rating (After Encoding)') 40 | plt.xlabel('Employee Rating (Encoded)') 41 | plt.ylabel('Count') 42 | 43 | plt.tight_layout() 44 | plt.show() 45 | -------------------------------------------------------------------------------- /chapter10/1b.label_encoding_forced.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.preprocessing import LabelEncoder 3 | 4 | # Sample dataset 5 | data = { 6 | 'Employee Rating': ['Poor', 'Good', 'Satisfactory', 'Excellent', 'Good'], 7 | 'Salary': [35000, 50000, 42000, 60000, 52000], 8 | 'Years of Experience': [2, 5, 3, 8, 6], 9 | 'Department': ['HR', 'IT', 'Finance', 'IT', 'Marketing'] 10 | } 11 | 12 | df = pd.DataFrame(data) 13 | print("Original DataFrame:") 14 | print(df) 15 | 16 | # Define the correct order of categories with prefixes 17 | ordered_categories = { 18 | 'Poor': '1.Poor', 19 | 'Satisfactory': '2.Satisfactory', 20 | 'Good': '3.Good', 21 | 'Excellent': '4.Excellent' 22 | } 23 | 24 | # Map the 'Employee Rating' column to the prefixed categories 25 | df['Employee Rating Ordered'] = df['Employee Rating'].map(ordered_categories) 26 | 27 | # Initialize the LabelEncoder 28 | label_encoder = LabelEncoder() 29 | 30 | # Apply label encoding to the 'Employee Rating Ordered' column 31 | df['Employee Rating (Encoded)'] = label_encoder.fit_transform(df['Employee Rating Ordered']) 32 | 33 | # Reverse the mapping for clarity in the DataFrame (optional) 34 | reverse_mapping = {v: k for k, v in ordered_categories.items()} 35 | df['Employee Rating Ordered'] = df['Employee Rating Ordered'].map(reverse_mapping) 36 | 37 | print("\nDataFrame after Label Encoding with Correct Order:") 38 | print(df[['Employee Rating Ordered','Employee 
Rating (Encoded)']]) 39 | -------------------------------------------------------------------------------- /chapter10/2.one_hot_encoding.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import seaborn as sns 3 | import matplotlib.pyplot as plt 4 | from sklearn.preprocessing import OneHotEncoder 5 | 6 | # Sample dataset 7 | data = { 8 | 'Customer ID': [1, 2, 3, 4, 5], 9 | 'Contract Type': ['Month-to-Month', 'One Year', 'Month-to-Month', 'Two Year', 'One Year'], 10 | 'Internet Service': ['DSL', 'Fiber Optic', 'DSL', 'Fiber Optic', 'No Internet Service'], 11 | 'Payment Method': ['Electronic Check', 'Mailed Check', 'Bank Transfer', 'Credit Card', 'Electronic Check'], 12 | } 13 | 14 | df = pd.DataFrame(data) 15 | 16 | # Plot distribution of original 'Contract Type' column 17 | plt.figure(figsize=(8, 6)) 18 | sns.countplot(x='Contract Type', data=df).set_title('Contract Type Distribution') 19 | plt.show() 20 | 21 | # Initialize the OneHotEncoder for 'Contract Type' without dropping any category 22 | one_hot_encoder = OneHotEncoder(sparse_output=False) 23 | 24 | # Fit and transform the 'Contract Type' column 25 | encoded_columns = one_hot_encoder.fit_transform(df[['Contract Type']]) 26 | 27 | # Create a new DataFrame with the one-hot encoded columns for 'Contract Type' 28 | encoded_df = pd.DataFrame(encoded_columns, columns=one_hot_encoder.get_feature_names_out(['Contract Type'])) 29 | 30 | # Concatenate the one-hot encoded DataFrame with the original DataFrame 31 | df_encoded = pd.concat([df, encoded_df], axis=1) 32 | 33 | # Dropping the original 'Contract Type' column as it is now encoded 34 | df_encoded = df_encoded.drop(['Contract Type'], axis=1) 35 | 36 | print(df_encoded) 37 | 38 | # Plot distribution of encoded 'Contract Type' columns 39 | encoded_cols = encoded_df.columns 40 | 41 | fig, axes = plt.subplots(1, len(encoded_cols), figsize=(6 * len(encoded_cols), 5)) 42 | for i, col in enumerate(encoded_cols): 43 | sns.countplot(ax=axes[i], x=encoded_df[col]).set_title(f'{col} Distribution') 44 | plt.tight_layout() 45 | plt.show() 46 | -------------------------------------------------------------------------------- /chapter10/4.frequency_encoding.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import seaborn as sns 3 | import matplotlib.pyplot as plt 4 | from sklearn.model_selection import train_test_split 5 | from category_encoders import CountEncoder # Ensure you have this library installed 6 | 7 | # Create a sample dataset 8 | data = { 9 | 'Customer ID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 10 | 'Product Category': ['Electronics', 'Clothing', 'Electronics', 'Books', 'Books', 'Clothing', 'Electronics', 'Books', 'Clothing', 'Books'], 11 | 'Total Purchases': [5, 2, 3, 8, 7, 4, 2, 5, 1, 6] 12 | } 13 | 14 | df = pd.DataFrame(data) 15 | 16 | # Display the sample dataset 17 | print("Sample Dataset:") 18 | print(df) 19 | 20 | # Define the features 21 | X = df[['Customer ID', 'Product Category', 'Total Purchases']] 22 | 23 | # Split the data into training and testing sets 24 | X_train, X_test = train_test_split(X, test_size=0.2, random_state=42) 25 | 26 | # Initialize the CountEncoder for 'Product Category' 27 | count_encoder = CountEncoder(cols=['Product Category']) 28 | 29 | # Fit and transform the training data 30 | X_train_encoded = count_encoder.fit_transform(X_train) 31 | 32 | # Transform the test data using the same encoder 33 | X_test_encoded = 
count_encoder.transform(X_test) 34 | 35 | # Plot the distribution of the original and encoded 'Product Category' in the training set 36 | fig, axes = plt.subplots(1, 2, figsize=(16, 6)) 37 | 38 | # Original 'Product Category' distribution 39 | sns.countplot(ax=axes[0], x='Product Category', data=X_train).set_title('Original Product Category Distribution (Training Set)') 40 | 41 | # Encoded 'Product Category' distribution 42 | sns.countplot(ax=axes[1], x='Product Category', data=X_train_encoded).set_title('Encoded Product Category Distribution (Training Set)') 43 | 44 | plt.tight_layout() 45 | plt.show() 46 | 47 | # Display the encoded training dataset 48 | print("\nEncoded Training Dataset:") 49 | print(X_train_encoded.head()) 50 | 51 | # Display the encoded testing dataset 52 | print("\nEncoded Testing Dataset:") 53 | print(X_test_encoded.head()) 54 | -------------------------------------------------------------------------------- /chapter10/5.binary_encoding.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import seaborn as sns 3 | import matplotlib.pyplot as plt 4 | from category_encoders import BinaryEncoder 5 | 6 | # Sample data 7 | data = { 8 | 'Country': ['USA', 'Canada', 'USA', 'Canada', 'Mexico', 'USA', 'Mexico', 'Canada'], 9 | 'Age': [25, 30, 35, 40, 45, 50, 55, 60], 10 | 'Income': [50000, 60000, 70000, 80000, 90000, 100000, 110000, 120000], 11 | 'Subscription': [1, 0, 1, 0, 1, 0, 1, 0] 12 | } 13 | 14 | df = pd.DataFrame(data) 15 | 16 | # Plot the distribution of the 'Country' feature before encoding 17 | plt.figure(figsize=(10, 6)) 18 | sns.countplot(x='Country', data=df) 19 | plt.title('Distribution of Country Feature Before Encoding') 20 | plt.show() 21 | 22 | # Apply binary encoding to the 'Country' feature 23 | encoder = BinaryEncoder(cols=['Country']) 24 | df_encoded = encoder.fit_transform(df) 25 | 26 | # Display the encoded dataframe 27 | print(df_encoded) 28 | 29 | # Plot the distribution of the binary encoded features 30 | encoded_cols = [col for col in df_encoded.columns if 'Country' in col] 31 | n_cols = len(encoded_cols) 32 | 33 | fig, axes = plt.subplots(1, n_cols, figsize=(5*n_cols, 5)) 34 | fig.suptitle('Distribution of Country Feature After Binary Encoding') 35 | 36 | for i, col in enumerate(encoded_cols): 37 | sns.histplot(df_encoded[col], kde=True, ax=axes[i], bins=2) 38 | axes[i].set_title(col) 39 | axes[i].set_xlabel('Encoded Value') 40 | axes[i].set_ylabel('Count') 41 | 42 | plt.tight_layout() 43 | plt.show() -------------------------------------------------------------------------------- /chapter11/1.decomposing_time_series/noise.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | 5 | # Generate example data with noise 6 | date_rng = pd.date_range(start='2010-01-01', end='2020-12-31', freq='M') 7 | np.random.seed(42) 8 | noise_data = pd.Series(np.random.normal(0, 2, len(date_rng)), index=date_rng) 9 | 10 | # Plotting the time series data with noise 11 | plt.figure(figsize=(10, 5)) 12 | plt.plot(noise_data, label='Temperature Fluctuations') 13 | plt.title('Time Series Data with Noise') 14 | plt.xlabel('Time') 15 | plt.ylabel('Temperature') 16 | plt.legend() 17 | plt.show() 18 | -------------------------------------------------------------------------------- /chapter11/1.decomposing_time_series/seasonality.py: 
-------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | 4 | # Generate example data with seasonality 5 | date_rng = pd.date_range(start='2010-01-01', end='2020-12-31', freq='M') 6 | seasonal_data = pd.Series([10, 12, 15, 22, 30, 35, 40, 38, 30, 22, 15, 12] * 11, index=date_rng) 7 | 8 | # Plotting the time series data with seasonality 9 | plt.figure(figsize=(10, 5)) 10 | plt.plot(seasonal_data, label='Ice Cream Sales') 11 | plt.title('Time Series Data with Seasonality') 12 | plt.xlabel('Time') 13 | plt.ylabel('Sales') 14 | plt.legend() 15 | plt.show() 16 | -------------------------------------------------------------------------------- /chapter11/1.decomposing_time_series/trend.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | 4 | # Generate example data 5 | date_rng = pd.date_range(start='2010-01-01', end='2020-12-31', freq='M') 6 | sales_data = pd.Series(range(1, len(date_rng) + 1), index=date_rng) 7 | 8 | # Plotting the time series data with a trend 9 | plt.figure(figsize=(10, 5)) 10 | plt.plot(sales_data, label='Sales Data') 11 | plt.title('Time Series Data with Trend') 12 | plt.xlabel('Time') 13 | plt.ylabel('Sales') 14 | plt.legend() 15 | plt.show() 16 | -------------------------------------------------------------------------------- /chapter11/2.types/multivariate.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | 5 | # Generate example multivariate time series data 6 | date_rng = pd.date_range(start='2010-01-01', end='2020-12-31', freq='M') 7 | temperature_data = pd.Series(np.random.normal(20, 5, len(date_rng)), index=date_rng) 8 | rainfall_data = pd.Series(np.random.normal(50, 20, len(date_rng)), index=date_rng) 9 | 10 | # Create a DataFrame with both temperature and rainfall data 11 | multivariate_data = pd.DataFrame({'Temperature': temperature_data, 'Rainfall': rainfall_data}) 12 | 13 | # Plotting the multivariate time series data 14 | plt.figure(figsize=(12, 6)) 15 | 16 | plt.subplot(2, 1, 1) 17 | plt.plot(multivariate_data['Temperature'], label='Temperature Data', color='blue') 18 | plt.title('Multivariate Time Series Data') 19 | plt.xlabel('Time') 20 | plt.ylabel('Temperature (°C)') 21 | plt.legend() 22 | 23 | plt.subplot(2, 1, 2) 24 | plt.plot(multivariate_data['Rainfall'], label='Rainfall Data', color='green') 25 | plt.xlabel('Time') 26 | plt.ylabel('Rainfall (mm)') 27 | plt.legend() 28 | 29 | plt.tight_layout() 30 | plt.show() 31 | -------------------------------------------------------------------------------- /chapter11/2.types/univariate.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | 5 | # Generate example univariate time series data 6 | date_rng = pd.date_range(start='2010-01-01', end='2020-12-31', freq='M') 7 | temperature_data = pd.Series(np.random.normal(20, 5, len(date_rng)), index=date_rng) 8 | 9 | # Plotting the univariate time series data 10 | plt.figure(figsize=(10, 5)) 11 | plt.plot(temperature_data, label='Temperature Data') 12 | plt.title('Univariate Time Series Data') 13 | plt.xlabel('Time') 14 | plt.ylabel('Temperature (°C)') 15 | plt.legend() 16 | plt.show() 17 | 
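Note: the chapter11 scripts above generate trend, seasonality, and noise as separate synthetic series and plot each one on its own. The following is a minimal companion sketch (not one of the repository files) showing how those three components can be combined into a single monthly series and then recovered with statsmodels' seasonal_decompose; the variable names and the period=12 setting are assumptions chosen to match the monthly frequency used above.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose

# Build one synthetic monthly series = trend + seasonality + noise
date_rng = pd.date_range(start='2010-01-01', end='2020-12-31', freq='M')
trend = pd.Series(range(1, len(date_rng) + 1), index=date_rng)
seasonality = pd.Series([10, 12, 15, 22, 30, 35, 40, 38, 30, 22, 15, 12] * 11, index=date_rng)
np.random.seed(42)
noise = pd.Series(np.random.normal(0, 2, len(date_rng)), index=date_rng)
combined = trend + seasonality + noise

# Recover the components; period=12 because the data is monthly
result = seasonal_decompose(combined, model='additive', period=12)
result.plot()
plt.show()

If statsmodels is installed, the trend, seasonal, and residual panels of the resulting plot should roughly match the three standalone series produced by the scripts above.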
-------------------------------------------------------------------------------- /chapter11/3.missing_values/1.identify_missing_values.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | # Set seed for reproducibility 6 | np.random.seed(42) 7 | 8 | # Generate a date range 9 | date_range = pd.date_range(start='2020-01-01', end='2023-12-31', freq='B') # Business days 10 | 11 | # Generate random stock prices 12 | n = len(date_range) 13 | data = { 14 | 'open': np.random.uniform(100, 200, n), 15 | 'high': np.random.uniform(200, 300, n), 16 | 'low': np.random.uniform(50, 100, n), 17 | 'close': np.random.uniform(100, 200, n) 18 | } 19 | 20 | # Create DataFrame 21 | df = pd.DataFrame(data, index=date_range) 22 | 23 | # Introduce random NaN values 24 | nan_indices = np.random.choice(df.index, size=100, replace=False) 25 | df.loc[nan_indices] = np.nan 26 | 27 | # Drop random dates to simulate missing timestamps 28 | missing_dates = np.random.choice(df.index, size=50, replace=False) 29 | df = df.drop(missing_dates) 30 | 31 | # Display the first few rows of the DataFrame 32 | print("Initial DataFrame with Missing Values and Timestamps:\n", df.head()) 33 | 34 | # Step 1: Checking for NaNs or Null Values in columns 35 | missing_values = df.isnull().sum() 36 | print("\nMissing Values in Each Column:\n", missing_values) 37 | 38 | # Step 2: Identifying Missing Timestamps 39 | complete_index = pd.date_range(start=df.index.min(), end=df.index.max(), freq='B') # 'B' is for business days 40 | df_reindexed = df.reindex(complete_index) 41 | missing_timestamps = df_reindexed[df_reindexed.isnull().all(axis=1)] 42 | 43 | # Calculate percentage of missing timestamps 44 | total_timestamps = len(complete_index) 45 | missing_timestamps_count = missing_timestamps.shape[0] 46 | missing_timestamps_percentage = (missing_timestamps_count / total_timestamps) * 100 47 | 48 | print("\nMissing Timestamps:\n", missing_timestamps) 49 | print(f"\nPercentage of Missing Timestamps: {missing_timestamps_percentage:.2f}%") 50 | 51 | # Plotting 52 | plt.figure(figsize=(14, 7)) 53 | 54 | # Plot the closing prices 55 | plt.plot(df.index, df['close'], marker='o', linestyle='-', label='Closing Price', color='blue') 56 | 57 | # Mark missing timestamps with vertical lines 58 | for date in missing_dates: 59 | plt.axvline(x=date, color='red', linestyle='--', linewidth=1) 60 | 61 | # Highlight points with NaN values 62 | nan_dates = df.index[df['close'].isnull()] 63 | plt.scatter(nan_dates, [df['close'].mean()] * len(nan_dates), color='orange', label='NaN Values in Close', zorder=5) 64 | 65 | plt.title('Daily Closing Prices with Missing Timestamps and NaN Values Highlighted') 66 | plt.xlabel('Date') 67 | plt.ylabel('Closing Price') 68 | plt.legend() 69 | plt.grid(True) 70 | plt.show() 71 | 72 | # Summary of Identifying Missing Values 73 | print("\nNaN values were introduced randomly in the dataset and are highlighted in orange on the plot.\n" 74 | "Red dashed lines indicate missing timestamps where no data is available for the dates in the index.\n" 75 | "Blue line shows the closing prices with missing values removed.") 76 | 77 | -------------------------------------------------------------------------------- /chapter11/3.missing_values/2.remove_missing_values.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as 
plt 4 | 5 | # Set seed for reproducibility 6 | np.random.seed(42) 7 | 8 | # Generate date range and random stock prices 9 | date_range = pd.date_range(start='2020-01-01', end='2023-12-31', freq='B') 10 | n = len(date_range) 11 | data = { 12 | 'open': np.random.uniform(100, 200, n), 13 | 'high': np.random.uniform(200, 300, n), 14 | 'low': np.random.uniform(50, 100, n), 15 | 'close': np.random.uniform(100, 200, n) 16 | } 17 | df = pd.DataFrame(data, index=date_range) 18 | 19 | # Introduce random NaN values in 'close' and 'open' columns 20 | nan_indices_close = np.random.choice(df.index, size=50, replace=False) 21 | nan_indices_open = np.random.choice(df.index, size=50, replace=False) 22 | df.loc[nan_indices_close, 'close'] = np.nan 23 | df.loc[nan_indices_open, 'open'] = np.nan 24 | 25 | # Display the first few rows of the DataFrame 26 | print("Initial DataFrame with Missing Values:\n", df.head()) 27 | 28 | # Step 1: Checking for NaNs or Null Values in columns 29 | missing_values = df.isnull().sum() 30 | print("\nMissing Values in Each Column:\n", missing_values) 31 | 32 | # Print percentage of missing values in each column 33 | missing_percentage = (missing_values / len(df)) * 100 34 | print("\nPercentage of Missing Values in Each Column:\n", missing_percentage) 35 | 36 | # Print the number of rows before dropping NaN values 37 | print(f"\nNumber of rows before dropping NaN values: {len(df)}") 38 | 39 | # Step 2: Drop rows with NaN values 40 | df_cleaned = df.dropna() 41 | 42 | # Print the number of rows after dropping NaN values 43 | print(f"\nNumber of rows after dropping NaN values: {len(df_cleaned)}") 44 | 45 | # Print percentage of missing values after dropping NaN values 46 | cleaned_missing_values = df_cleaned.isnull().sum() 47 | cleaned_missing_percentage = (cleaned_missing_values / len(df_cleaned)) * 100 48 | print("\nPercentage of Missing Values After Dropping Rows:\n", cleaned_missing_percentage) 49 | 50 | # Plotting original data with NaN values 51 | plt.figure(figsize=(14, 7)) 52 | plt.plot(df.index, df['close'], marker='o', linestyle='-', label='Original Closing Price', color='blue', alpha=0.5) 53 | 54 | # Highlight points with NaN values in the original dataset 55 | nan_dates_close = df.index[df['close'].isnull()] 56 | nan_dates_open = df.index[df['open'].isnull()] 57 | 58 | # Use 'x' marker for the points to be dropped 59 | plt.scatter(nan_dates_close, [df['close'].mean()] * len(nan_dates_close), color='orange', label='NaN Values in Close (To be Dropped)', marker='x', zorder=5) 60 | plt.scatter(nan_dates_open, [df['close'].mean()] * len(nan_dates_open), color='red', label='NaN Values in Open (To be Dropped)', marker='x', zorder=5) 61 | 62 | plt.title('Original Daily Closing Prices with NaN Values Highlighted') 63 | plt.xlabel('Date') 64 | plt.ylabel('Closing Price') 65 | plt.legend() 66 | plt.grid(True) 67 | plt.show() 68 | 69 | # Plotting cleaned data after dropping rows with NaN values 70 | plt.figure(figsize=(14, 7)) 71 | plt.plot(df_cleaned.index, df_cleaned['close'], marker='o', linestyle='-', label='Cleaned Closing Price', color='green') 72 | 73 | plt.title('Cleaned Daily Closing Prices After Dropping NaN Values') 74 | plt.xlabel('Date') 75 | plt.ylabel('Closing Price') 76 | plt.legend() 77 | plt.grid(True) 78 | plt.show() 79 | -------------------------------------------------------------------------------- /chapter11/3.missing_values/3.back_forward_fill.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | # Set seed for reproducibility 5 | np.random.seed(42) 6 | 7 | # Generate date
range and random stock prices 8 | date_range = pd.date_range(start='2020-01-01', end='2023-12-31', freq='B') 9 | n = len(date_range) 10 | data = { 11 | 'open': np.random.uniform(100, 200, n), 12 | 'high': np.random.uniform(200, 300, n), 13 | 'low': np.random.uniform(50, 100, n), 14 | 'close': np.random.uniform(100, 200, n) 15 | } 16 | df = pd.DataFrame(data, index=date_range) 17 | 18 | # Introduce random NaN values in 'close' and 'open' columns 19 | nan_indices_close = np.random.choice(df.index, size=50, replace=False) 20 | nan_indices_open = np.random.choice(df.index, size=50, replace=False) 21 | df.loc[nan_indices_close, 'close'] = np.nan 22 | df.loc[nan_indices_open, 'open'] = np.nan 23 | 24 | # Fill NaN values using forward fill and backward fill 25 | df['close_ffill'] = df['close'].ffill() # Forward Fill 26 | df['close_bfill'] = df['close'].bfill() # Backward Fill 27 | 28 | # Display the entire DataFrame including original and filled values 29 | print("Complete DataFrame with Original and Filled Values:\n") 30 | print(df[['open', 'close', 'close_ffill', 'close_bfill']].head(20)) # Show first 20 rows 31 | -------------------------------------------------------------------------------- /chapter11/3.missing_values/4.interpolation.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | # Set seed for reproducibility 6 | np.random.seed(42) 7 | 8 | # Generate date range and random stock prices 9 | date_range = pd.date_range(start='2020-01-01', end='2023-12-31', freq='B') 10 | n = len(date_range) 11 | data = { 12 | 'open': np.random.uniform(100, 200, n), 13 | 'high': np.random.uniform(200, 300, n), 14 | 'low': np.random.uniform(50, 100, n), 15 | 'close': np.random.uniform(100, 200, n) 16 | } 17 | df = pd.DataFrame(data, index=date_range) 18 | 19 | # Introduce random NaN values in 'close' and 'open' columns 20 | nan_indices_close = np.random.choice(df.index, size=50, replace=False) 21 | nan_indices_open = np.random.choice(df.index, size=50, replace=False) 22 | df.loc[nan_indices_close, 'close'] = np.nan 23 | df.loc[nan_indices_open, 'open'] = np.nan 24 | 25 | # Interpolation 26 | # Linear Interpolation 27 | df['close_linear'] = df['close'].interpolate(method='linear') 28 | 29 | # Polynomial Interpolation 30 | df['close_poly'] = df['close'].interpolate(method='polynomial', order=3) 31 | 32 | # Spline Interpolation 33 | df['close_spline'] = df['close'].interpolate(method='spline', order=3) 34 | 35 | print(df.head(30)) 36 | 37 | # Function to plot and highlight filled values 38 | def plot_filled(ax, original, filled, label, color): 39 | ax.plot(filled, label=label, linestyle='-', color=color) 40 | filled_values = filled[original.isna()] 41 | ax.plot(filled_values.index, filled_values, 'o', color=color, markersize=5) 42 | ax.legend() 43 | 44 | # Plot the results in separate subplots 45 | fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(14, 18), sharex=True) 46 | 47 | 48 | # Linear Interpolation 49 | plot_filled(axes[0], df['close'], df['close_linear'], 'Linear Interpolation', 'purple') 50 | axes[0].set_title('Linear Interpolation') 51 | 52 | # Polynomial Interpolation 53 | plot_filled(axes[1], df['close'], df['close_poly'], 'Polynomial Interpolation', 'orange') 54 | axes[1].set_title('Polynomial Interpolation') 55 | 56 | # Spline Interpolation 57 | plot_filled(axes[2], df['close'], df['close_spline'], 'Spline Interpolation', 'brown') 58 | axes[2].set_title('Spline 
Interpolation') 59 | 60 | # Set common labels 61 | plt.xlabel('Date') 62 | fig.supylabel('Stock Price (Close)') 63 | 64 | plt.tight_layout() 65 | plt.show() 66 | -------------------------------------------------------------------------------- /chapter11/4.analisis/autocorrelation.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from statsmodels.tsa.arima.model import ARIMA 5 | from statsmodels.graphics.tsaplots import plot_acf, plot_pacf 6 | 7 | # Set seed for reproducibility 8 | np.random.seed(42) 9 | 10 | # Generate date range and random stock prices 11 | date_range = pd.date_range(start='2020-01-01', end='2023-12-31', freq='B') 12 | n = len(date_range) 13 | data = { 14 | 'open': np.random.uniform(100, 200, n), 15 | 'high': np.random.uniform(200, 300, n), 16 | 'low': np.random.uniform(50, 100, n), 17 | 'close': np.random.uniform(100, 200, n) 18 | } 19 | df = pd.DataFrame(data, index=date_range) 20 | 21 | # Plot ACF and PACF 22 | plt.figure(figsize=(14, 6)) 23 | 24 | # ACF plot 25 | plt.subplot(1, 2, 1) 26 | plot_acf(df['close'].dropna(), lags=40, ax=plt.gca()) 27 | plt.title('Autocorrelation Function (ACF)') 28 | 29 | # PACF plot 30 | plt.subplot(1, 2, 2) 31 | plot_pacf(df['close'].dropna(), lags=40, ax=plt.gca()) 32 | plt.title('Partial Autocorrelation Function (PACF)') 33 | 34 | plt.tight_layout() 35 | plt.show() 36 | -------------------------------------------------------------------------------- /chapter11/5.outliers/1.seasonal_decomposition.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from statsmodels.tsa.seasonal import seasonal_decompose 5 | from scipy.stats import zscore 6 | 7 | # Set seed for reproducibility 8 | np.random.seed(42) 9 | 10 | # Generate date range and random stock prices 11 | date_range = pd.date_range(start='2020-01-01', end='2023-12-31', freq='B') 12 | n = len(date_range) 13 | data = { 14 | 'open': np.random.uniform(100, 200, n), 15 | 'high': np.random.uniform(200, 300, n), 16 | 'low': np.random.uniform(50, 100, n), 17 | 'close': np.random.uniform(100, 200, n) 18 | } 19 | df = pd.DataFrame(data, index=date_range) 20 | 21 | 22 | # Introduce more aggressive outliers in the 'close' column 23 | outlier_indices = np.random.choice(df.index, size=10, replace=False) 24 | df.loc[outlier_indices[:5], 'close'] = df['close'] * 1.5 # Increase by 50% 25 | df.loc[outlier_indices[5:], 'close'] = df['close'] * 0.5 # Decrease by 50% 26 | 27 | # Decompose the series 28 | result = seasonal_decompose(df['close'], model='additive', period=252, extrapolate_trend='freq') 29 | 30 | # Add decomposed components to DataFrame 31 | df['trend'] = result.trend 32 | df['seasonal'] = result.seasonal 33 | df['residual'] = result.resid 34 | 35 | # Calculate Z-scores of residuals to identify outliers 36 | df['resid_z'] = zscore(df['residual'].dropna()) 37 | 38 | # Identify outliers (Z-score threshold set to 3) 39 | outliers = df[np.abs(df['resid_z']) > 3] 40 | 41 | # Handling outliers by replacing them with the median of the residuals 42 | median_resid = df['residual'].median() 43 | df.loc[outliers.index, 'close'] = df['close'].median() 44 | 45 | # Print the DataFrame to understand the numbers 46 | print(df[['close', 'close', 'trend', 'seasonal', 'residual', 'resid_z']].head(20)) 47 | 48 | # Plot the decomposed components 49 | fig, axes = plt.subplots(4, 
1, figsize=(14, 18), sharex=True) 50 | 51 | result.observed.plot(ax=axes[0], title='Observed', color='blue') 52 | result.trend.plot(ax=axes[1], title='Trend', color='orange') 53 | result.seasonal.plot(ax=axes[2], title='Seasonal', color='green') 54 | result.resid.plot(ax=axes[3], title='Residual', color='red') 55 | 56 | plt.tight_layout() 57 | plt.show() 58 | -------------------------------------------------------------------------------- /chapter11/5.outliers/2.autocorrelation.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from statsmodels.tsa.arima.model import ARIMA 5 | from statsmodels.graphics.tsaplots import plot_acf, plot_pacf 6 | 7 | # Set seed for reproducibility 8 | np.random.seed(42) 9 | 10 | # Generate date range and random stock prices 11 | date_range = pd.date_range(start='2020-01-01', end='2023-12-31', freq='B') 12 | n = len(date_range) 13 | data = { 14 | 'open': np.random.uniform(100, 200, n), 15 | 'high': np.random.uniform(200, 300, n), 16 | 'low': np.random.uniform(50, 100, n), 17 | 'close': np.random.uniform(100, 200, n) 18 | } 19 | df = pd.DataFrame(data, index=date_range) 20 | 21 | # Introduce more aggressive outliers in the 'close' column 22 | outlier_indices = np.random.choice(df.index, size=10, replace=False) 23 | df.loc[outlier_indices[:5], 'close'] = df['close'] * 1.5 # Increase by 50% 24 | df.loc[outlier_indices[5:], 'close'] = df['close'] * 0.5 # Decrease by 50% 25 | 26 | 27 | # Plot ACF and PACF 28 | plt.figure(figsize=(14, 6)) 29 | 30 | # ACF plot 31 | plt.subplot(1, 2, 1) 32 | plot_acf(df['close'].dropna(), lags=40, ax=plt.gca()) 33 | plt.title('Autocorrelation Function (ACF)') 34 | 35 | # PACF plot 36 | plt.subplot(1, 2, 2) 37 | plot_pacf(df['close'].dropna(), lags=40, ax=plt.gca()) 38 | plt.title('Partial Autocorrelation Function (PACF)') 39 | 40 | plt.tight_layout() 41 | plt.show() 42 | -------------------------------------------------------------------------------- /chapter11/5.outliers/3.arima.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from statsmodels.tsa.arima.model import ARIMA 5 | from scipy.stats import zscore 6 | 7 | # Set seed for reproducibility 8 | np.random.seed(42) 9 | 10 | # Generate date range and random stock prices 11 | date_range = pd.date_range(start='2020-01-01', end='2023-12-31', freq='B') 12 | n = len(date_range) 13 | data = { 14 | 'open': np.random.uniform(100, 200, n), 15 | 'high': np.random.uniform(200, 300, n), 16 | 'low': np.random.uniform(50, 100, n), 17 | 'close': np.random.uniform(100, 200, n) 18 | } 19 | df = pd.DataFrame(data, index=date_range) 20 | 21 | 22 | # Introduce more aggressive outliers in the 'close' column 23 | outlier_indices = np.random.choice(df.index, size=10, replace=False) 24 | df.loc[outlier_indices[:5], 'close'] = df['close'] * 1.5 # Increase by 50% 25 | df.loc[outlier_indices[5:], 'close'] = df['close'] * 0.5 # Decrease by 50% 26 | 27 | 28 | # Fit ARIMA model to close_filled series 29 | model = ARIMA(df['close'], order=(2, 1, 1)) 30 | results = model.fit() 31 | 32 | # Calculate residuals and Z-scores 33 | df['residuals'] = results.resid 34 | df['residuals_z'] = zscore(df['residuals'].dropna()) 35 | 36 | # Identify outliers based on Z-score threshold (e.g., ±3) 37 | outliers_arima = df[np.abs(df['residuals_z']) > 3] 38 | 39 | # Generate smoothed series from 
ARIMA model 40 | df['arima_smooth'] = results.fittedvalues 41 | 42 | # Plotting the original close_filled and ARIMA smoothed series 43 | plt.figure(figsize=(14, 8)) 44 | plt.plot(df['close'], label='Original Close', color='blue') 45 | plt.plot(df['arima_smooth'], label='ARIMA Smoothed', color='red') 46 | plt.scatter(outliers_arima.index, df.loc[outliers_arima.index, 'close'], color='orange', label='Outliers') 47 | plt.title('ARIMA Smoothing and Outlier Detection') 48 | plt.legend() 49 | plt.show() 50 | 51 | # Print the summary of the model 52 | print(results.summary()) 53 | 54 | # Plot the diagnostics to check model fit 55 | results.plot_diagnostics(figsize=(14, 8)) 56 | plt.show() 57 | -------------------------------------------------------------------------------- /chapter11/5.outliers/4.moving_average.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from sklearn.metrics import mean_absolute_error, mean_squared_error 5 | 6 | # Set seed for reproducibility 7 | np.random.seed(42) 8 | 9 | # Generate date range and random stock prices 10 | date_range = pd.date_range(start='2020-01-01', end='2023-12-31', freq='B') 11 | n = len(date_range) 12 | data = { 13 | 'open': np.random.uniform(100, 200, n), 14 | 'high': np.random.uniform(200, 300, n), 15 | 'low': np.random.uniform(50, 100, n), 16 | 'close': np.random.uniform(100, 200, n) 17 | } 18 | df = pd.DataFrame(data, index=date_range) 19 | 20 | # Introduce more aggressive outliers in the 'close' column 21 | outlier_indices = np.random.choice(df.index, size=10, replace=False) 22 | df.loc[outlier_indices[:5], 'close'] = df['close'] * 1.5 # Increase by 50% 23 | df.loc[outlier_indices[5:], 'close'] = df['close'] * 0.5 # Decrease by 50% 24 | 25 | # Define window size for SMA and span for EMA 26 | window_size = 20 27 | span = 20 28 | 29 | # Calculate Simple Moving Average (SMA) 30 | df['SMA'] = df['close'].rolling(window=window_size, min_periods=1).mean() 31 | 32 | # Calculate Exponential Moving Average (EMA) 33 | df['EMA'] = df['close'].ewm(span=span, adjust=False).mean() 34 | 35 | # Calculate residuals for SMA and EMA 36 | df['SMA_residuals'] = df['close'] - df['SMA'] 37 | df['EMA_residuals'] = df['close'] - df['EMA'] 38 | 39 | # Performance Metrics Calculation 40 | sma_mae = mean_absolute_error(df['close'], df['SMA']) 41 | sma_mse = mean_squared_error(df['close'], df['SMA']) 42 | sma_rmse = np.sqrt(sma_mse) 43 | 44 | ema_mae = mean_absolute_error(df['close'], df['EMA']) 45 | ema_mse = mean_squared_error(df['close'], df['EMA']) 46 | ema_rmse = np.sqrt(ema_mse) 47 | 48 | # Plotting original 'close', SMA, and EMA 49 | plt.figure(figsize=(14, 7)) 50 | plt.plot(df.index, df['close'], label='Original Close Price', marker='o', linestyle='-', color='b') 51 | plt.plot(df.index, df['SMA'], label=f'Simple Moving Average (window={window_size})', linestyle='--', color='r') 52 | plt.plot(df.index, df['EMA'], label=f'Exponential Moving Average (span={span})', linestyle='-.', color='g') 53 | plt.title('Simple vs. 
Exponential Moving Average') 54 | plt.xlabel('Date') 55 | plt.ylabel('Price') 56 | plt.legend() 57 | plt.show() 58 | 59 | # Plotting Performance Metrics 60 | metrics = ['MAE', 'MSE', 'RMSE'] 61 | sma_values = [sma_mae, sma_mse, sma_rmse] 62 | ema_values = [ema_mae, ema_mse, ema_rmse] 63 | 64 | plt.figure(figsize=(10, 6)) 65 | bar_width = 0.35 66 | index = np.arange(len(metrics)) 67 | 68 | plt.bar(index, sma_values, bar_width, label='Simple Moving Average (SMA)', color='b') 69 | plt.bar(index + bar_width, ema_values, bar_width, label='Exponential Moving Average (EMA)', color='g') 70 | 71 | plt.xlabel('Metrics') 72 | plt.ylabel('Value') 73 | plt.title('Performance Metrics: SMA vs. EMA') 74 | plt.xticks(index + bar_width / 2, metrics) 75 | plt.legend() 76 | plt.tight_layout() 77 | plt.show() 78 | -------------------------------------------------------------------------------- /chapter11/6.feature_engineering/1.lags.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from sklearn.metrics import mean_absolute_error, mean_squared_error 5 | 6 | # Set seed for reproducibility 7 | np.random.seed(42) 8 | 9 | # Generate date range and random stock prices 10 | date_range = pd.date_range(start='2020-01-01', end='2023-12-31', freq='B') 11 | n = len(date_range) 12 | data = { 13 | 'open': np.random.uniform(100, 200, n), 14 | 'high': np.random.uniform(200, 300, n), 15 | 'low': np.random.uniform(50, 100, n), 16 | 'close': np.random.uniform(100, 200, n) 17 | } 18 | df = pd.DataFrame(data, index=date_range) 19 | 20 | # Introduce more aggressive outliers in the 'close' column 21 | outlier_indices = np.random.choice(df.index, size=10, replace=False) 22 | df.loc[outlier_indices[:5], 'close'] = df['close'] * 1.5 # Increase by 50% 23 | df.loc[outlier_indices[5:], 'close'] = df['close'] * 0.5 # Decrease by 50% 24 | 25 | # Function to create lagged features 26 | def create_lagged_features(df, column, lags): 27 | for lag in lags: 28 | df[f'{column}_lag_{lag}'] = df[column].shift(lag) 29 | return df 30 | 31 | # Define the lags to create 32 | lags = [5, 10, 20] 33 | 34 | # Create lagged features for 'close' column 35 | df = create_lagged_features(df, 'close', lags) 36 | 37 | # Plotting original 'close' and lagged features in separate subplots 38 | plt.figure(figsize=(14, 10)) 39 | 40 | # First subplot for the original 'close' price 41 | plt.subplot(len(lags) + 1, 1, 1) 42 | plt.plot(df.index, df['close'], label='Original Close Price', linestyle='-', color='b') 43 | plt.title('Original Close Price') 44 | plt.xlabel('Date') 45 | plt.ylabel('Price') 46 | plt.legend() 47 | 48 | # Create additional subplots for each lagged feature 49 | for i, lag in enumerate(lags): 50 | plt.subplot(len(lags) + 1, 1, i + 2) 51 | plt.plot(df.index, df[f'close_lag_{lag}'], label=f'Lag {lag}', linestyle='-', color='b') 52 | plt.title(f'Lag {lag} Feature') 53 | plt.xlabel('Date') 54 | plt.ylabel('Price') 55 | plt.legend() 56 | 57 | plt.tight_layout() # Adjust spacing between plots 58 | plt.show() 59 | 60 | # Explanation of significance of lagged features 61 | print("Explanation of Lagged Features:") 62 | print("- Lagged features, such as Lag 1, Lag 5, Lag 10, and Lag 20, represent historical values of the 'close' price.") 63 | print("- They capture temporal dependencies and autocorrelation present in the data.") 64 | print("- Lagged features are important for predicting future movements based on past behavior.") 65 | print("- 
They help in identifying trends, cycles, and seasonality in time series data.") 66 | -------------------------------------------------------------------------------- /chapter11/6.feature_engineering/2.seasonal_differencing.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from statsmodels.tsa.stattools import adfuller 5 | 6 | # Set seed for reproducibility 7 | np.random.seed(42) 8 | 9 | # Generate date range 10 | date_range = pd.date_range(start='2020-01-01', end='2023-12-31', freq='B') 11 | n = len(date_range) 12 | 13 | # Generate a seasonal pattern 14 | seasonal_pattern = np.sin(np.linspace(0, 3 * np.pi, n)) * 20 15 | 16 | # Generate random stock prices with added seasonal component 17 | data = { 18 | 'open': np.random.uniform(100, 200, n) + seasonal_pattern, 19 | 'high': np.random.uniform(200, 300, n) + seasonal_pattern, 20 | 'low': np.random.uniform(50, 100, n) + seasonal_pattern, 21 | 'close': np.random.uniform(100, 200, n) + seasonal_pattern 22 | } 23 | df = pd.DataFrame(data, index=date_range) 24 | 25 | # Introduce more aggressive outliers in the 'close' column 26 | outlier_indices = np.random.choice(df.index, size=10, replace=False) 27 | df.loc[outlier_indices[:5], 'close'] = df['close'] * 1.5 # Increase by 50% 28 | df.loc[outlier_indices[5:], 'close'] = df['close'] * 0.5 # Decrease by 50% 29 | 30 | 31 | # First Differencing 32 | df['First Difference'] = df['close'].diff() 33 | 34 | # Second Differencing 35 | df['Second Difference'] = df['First Difference'].diff() 36 | 37 | # Seasonal Differencing (weekly seasonality) 38 | df['Seasonal Difference'] = df['close'].diff(5) 39 | 40 | # Plotting the original series and differenced series 41 | plt.figure(figsize=(14, 10)) 42 | 43 | plt.subplot(4, 1, 1) 44 | plt.plot(df.index, df['close'], label='Original Series with Seasonality', color='blue') 45 | plt.title('Original Series with Seasonality') 46 | plt.legend(loc='upper right') 47 | 48 | plt.subplot(4, 1, 2) 49 | plt.plot(df.index, df['First Difference'], label='First Difference', color='orange') 50 | plt.title('First Differencing') 51 | plt.legend(loc='upper right') 52 | 53 | plt.subplot(4, 1, 3) 54 | plt.plot(df.index, df['Second Difference'], label='Second Difference', color='green') 55 | plt.title('Second Differencing') 56 | plt.legend(loc='upper right') 57 | 58 | plt.subplot(4, 1, 4) 59 | plt.plot(df.index, df['Seasonal Difference'], label='Seasonal Differencing (Weekly)', color='red') 60 | plt.title('Seasonal Differencing') 61 | plt.legend(loc='upper right') 62 | 63 | plt.tight_layout() 64 | plt.show() 65 | 66 | # Augmented Dickey-Fuller Test 67 | def adf_test(series, title=''): 68 | result = adfuller(series.dropna(), autolag='AIC') 69 | print(f'Augmented Dickey-Fuller Test: {title}') 70 | print(f'ADF Statistic: {result[0]}') 71 | print(f'p-value: {result[1]}') 72 | for key, value in result[4].items(): 73 | print(f' {key}: {value}') 74 | print('\n') 75 | 76 | # Perform ADF test on original, first differenced, second differenced, and seasonal differenced series 77 | adf_test(df['close'], title='Original Series') 78 | adf_test(df['close'].diff(), title='First Difference') 79 | adf_test(df['close'].diff().diff(), title='Second Difference') 80 | adf_test(df['Seasonal Difference'], title='Seasonal Differencing') 81 | -------------------------------------------------------------------------------- /chapter12/1.text_cleaning.py: 
-------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | from transformers import BertTokenizer 3 | 4 | # Sample user reviews 5 | reviews = [ 6 | "This product is amazing!", 7 | "The product is good, but it could be better!!!", 8 | "I've never seen such a terrible product. 0/10", 9 | "The product is AWESOME!!! Highly recommended!", 10 | ] 11 | 12 | # a. Removing HTML tags and Special Characters 13 | def clean_html_tags(text): 14 | soup = BeautifulSoup(text, "html.parser") 15 | return soup.get_text() 16 | 17 | # b. Handling Capitalization and Letter Case 18 | def standardize_case(text): 19 | return text.lower() 20 | 21 | # c. Dealing with Numerical Values and Symbols 22 | def remove_numbers_and_symbols(text): 23 | return ''.join(e for e in text if e.isalpha() or e.isspace()) 24 | 25 | # d. Addressing Whitespace and Formatting Issues 26 | def remove_extra_whitespace(text): 27 | return ' '.join(text.split()) 28 | 29 | 30 | # Applying the text preprocessing pipeline 31 | def preprocess_text(text): 32 | text = clean_html_tags(text) 33 | text = standardize_case(text) 34 | text = remove_numbers_and_symbols(text) 35 | text = remove_extra_whitespace(text) 36 | return text 37 | 38 | # Preprocess all reviews 39 | preprocessed_reviews = [preprocess_text(review) for review in reviews] 40 | 41 | print("Original Reviews:") 42 | for review in reviews: 43 | print(f"- {review}") 44 | 45 | print("\nPreprocessed Reviews:") 46 | for preprocessed_review in preprocessed_reviews: 47 | print(f"- {preprocessed_review}") 48 | 49 | -------------------------------------------------------------------------------- /chapter12/10.word_tokenisation.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize import word_tokenize 3 | 4 | # Download the necessary NLTK data (run this once) 5 | nltk.download('punkt') 6 | 7 | # Sample text 8 | text = "The quick brown fox jumps over the lazy dog. It's unaffordable!" 9 | 10 | # Perform word tokenization 11 | word_tokens = word_tokenize(text) 12 | 13 | print("Word tokens:") 14 | print(word_tokens) -------------------------------------------------------------------------------- /chapter12/11.bpe_tokeniser.py: -------------------------------------------------------------------------------- 1 | from tokenizers import Tokenizer 2 | 3 | # Load the pre-trained GPT-2 BPE tokenizer 4 | tokenizer = Tokenizer.from_pretrained("gpt2") 5 | 6 | # Sample text 7 | text = "Tokenization in medical texts can include words like hyperlipidemia.." 8 | 9 | # Tokenize the text 10 | encoding = tokenizer.encode(text) 11 | 12 | # Print the tokens 13 | print("Tokens:", encoding.tokens) 14 | 15 | # Print the token IDs 16 | print("Token IDs:", encoding.ids) 17 | 18 | # Decode the token IDs back to text 19 | decoded_text = tokenizer.decode(encoding.ids) 20 | print("Decoded Text:", decoded_text) -------------------------------------------------------------------------------- /chapter12/12.tokenisation_wordpiece.py: -------------------------------------------------------------------------------- 1 | from transformers import BertTokenizer 2 | 3 | # Load the pre-trained tokenizer 4 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 5 | 6 | # Sample text 7 | text = "Tokenization in medical texts can include words like hyperlipidemia." 
8 | 9 | 10 | # Tokenize the text 11 | tokens = tokenizer.tokenize(text) 12 | print("Tokens:", tokens) 13 | 14 | # Convert tokens to input IDs 15 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 16 | print("Input IDs:", input_ids) -------------------------------------------------------------------------------- /chapter12/13.specialised_tokenisers.py: -------------------------------------------------------------------------------- 1 | import stanza 2 | from transformers import GPT2Tokenizer, GPT2LMHeadModel 3 | from collections import Counter 4 | import numpy as np 5 | import torch 6 | 7 | # Initialize Stanza for biomedical text 8 | stanza.download('en', package='mimic', processors='tokenize') 9 | nlp = stanza.Pipeline('en', package='mimic', processors='tokenize') 10 | 11 | # Initialize standard GPT-2 tokenizer 12 | standard_tokenizer = GPT2Tokenizer.from_pretrained("gpt2") 13 | standard_tokenizer.pad_token = standard_tokenizer.eos_token # Set pad_token to eos_token 14 | model = GPT2LMHeadModel.from_pretrained("gpt2") 15 | model.config.pad_token_id = model.config.eos_token_id # Set pad_token_id for the model 16 | 17 | # Sample medical corpus 18 | corpus = [ 19 | "The patient suffered a myocardial infarction.", 20 | "Early detection of heart attack is crucial.", 21 | "Treatment for myocardial infarction includes medication.", 22 | "Patients with heart conditions require regular check-ups.", 23 | "Myocardial infarction can lead to severe complications." 24 | ] 25 | 26 | def stanza_tokenize(text): 27 | doc = nlp(text) 28 | tokens = [word.text for sent in doc.sentences for word in sent.words] 29 | return tokens 30 | 31 | def calculate_oov_and_compression(corpus, tokenizer): 32 | oov_count = 0 33 | total_tokens = 0 34 | all_tokens = [] 35 | 36 | for sentence in corpus: 37 | tokens = tokenizer.tokenize(sentence) if hasattr(tokenizer, 'tokenize') else stanza_tokenize(sentence) 38 | all_tokens.extend(tokens) 39 | total_tokens += len(tokens) 40 | oov_count += tokens.count(tokenizer.oov_token) if hasattr(tokenizer, 'oov_token') else 0 41 | 42 | oov_rate = (oov_count / total_tokens) * 100 if total_tokens > 0 else 0 43 | avg_tokens_per_sentence = total_tokens / len(corpus) 44 | 45 | return oov_rate, avg_tokens_per_sentence, all_tokens 46 | 47 | def analyze_token_utilization(tokens): 48 | token_counts = Counter(tokens) 49 | total_tokens = len(tokens) 50 | utilization = {token: count / total_tokens for token, count in token_counts.items()} 51 | return utilization 52 | 53 | def calculate_perplexity(tokenizer, model, text): 54 | inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True) 55 | with torch.no_grad(): 56 | outputs = model(**inputs, labels=inputs["input_ids"]) 57 | return torch.exp(outputs.loss).item() 58 | 59 | # Evaluation 60 | for tokenizer_name, tokenizer in [("Standard GPT-2", standard_tokenizer), ("Stanza Medical", stanza_tokenize)]: 61 | oov_rate, avg_tokens, all_tokens = calculate_oov_and_compression(corpus, tokenizer) 62 | utilization = analyze_token_utilization(all_tokens) 63 | 64 | print(f"\n{tokenizer_name} Tokenizer:") 65 | print(f"OOV Rate: {oov_rate:.2f}%") 66 | print(f"Average Tokens per Sentence: {avg_tokens:.2f}") 67 | print("Top 5 Most Used Tokens:") 68 | for token, freq in sorted(utilization.items(), key=lambda x: x[1], reverse=True)[:5]: 69 | print(f" {token}: {freq:.2%}") 70 | 71 | 72 | # Example output for "myocardial infarction" 73 | term = "myocardial infarction" 74 | print(f"\nTokenizing '{term}':") 75 | print(f"Standard GPT-2: 
{standard_tokenizer.tokenize(term)}") 76 | print(f"Stanza Medical: {stanza_tokenize(term)}") -------------------------------------------------------------------------------- /chapter12/14.embedding_bert.py: -------------------------------------------------------------------------------- 1 | # Import necessary libraries 2 | from transformers import BertTokenizer, BertModel 3 | import torch 4 | 5 | # Load pre-trained BERT tokenizer and model 6 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 7 | model = BertModel.from_pretrained('bert-base-uncased') 8 | 9 | # Input sentence 10 | sentence = "BERT embeddings are very useful for natural language processing tasks." 11 | 12 | # Tokenize the input sentence 13 | inputs = tokenizer(sentence, return_tensors='pt') 14 | 15 | # Generate embeddings 16 | with torch.no_grad(): 17 | outputs = model(**inputs) 18 | 19 | # Extract the last hidden states (embeddings) 20 | last_hidden_states = outputs.last_hidden_state 21 | 22 | # Print the shape of the embeddings tensor 23 | print("Shape of the embeddings tensor:", last_hidden_states.shape) 24 | 25 | # Print the embeddings for the first token (CLS token) 26 | cls_embedding = last_hidden_states[0, 0, :].numpy() 27 | print("CLS token embedding:", cls_embedding) 28 | 29 | # Print the embeddings for the first word 30 | first_word_embedding = last_hidden_states[0, 1, :].numpy() 31 | print("First word embedding:", first_word_embedding) 32 | -------------------------------------------------------------------------------- /chapter12/15.embedding_bge.py: -------------------------------------------------------------------------------- 1 | from langchain_community.embeddings import HuggingFaceBgeEmbeddings 2 | 3 | # Define the model name and parameters 4 | model_name = "BAAI/bge-small-en" 5 | model_kwargs = {"device": "cpu"} 6 | encode_kwargs = {"normalize_embeddings": True} 7 | 8 | # Initialize the embeddings model 9 | bge_embeddings = HuggingFaceBgeEmbeddings( 10 | model_name=model_name, 11 | model_kwargs=model_kwargs, 12 | encode_kwargs=encode_kwargs 13 | ) 14 | 15 | # Sample sentences to embed 16 | sentences = [ 17 | "The quick brown fox jumps over the lazy dog.", 18 | "I love machine learning and natural language processing." 19 | ] 20 | 21 | # Generate embeddings for each sentence 22 | embeddings = [bge_embeddings.embed_query(sentence) for sentence in sentences] 23 | 24 | # Print the embeddings 25 | for i, embedding in enumerate(embeddings): 26 | print(f"Embedding for sentence {i+1}: {embedding[:5]}...") # Print the first 5 values for brevity 27 | print(f"Length of embedding: {len(embedding)}") 28 | -------------------------------------------------------------------------------- /chapter12/16.embedding_gte.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers import SentenceTransformer 2 | 3 | # Load the GTE-base model 4 | model = SentenceTransformer('thenlper/gte-base') 5 | 6 | # Sample texts to embed 7 | texts = [ 8 | "The quick brown fox jumps over the lazy dog.", 9 | "I love machine learning and natural language processing.", 10 | "Embeddings are useful for many NLP tasks." 
11 | ] 12 | 13 | # Generate embeddings 14 | embeddings = model.encode(texts) 15 | 16 | # Print the shape of the embeddings 17 | print(f"Shape of embeddings: {embeddings.shape}") 18 | 19 | # Print the first few values of the first embedding 20 | print(f"First few values of the first embedding: {embeddings[0][:5]}") 21 | -------------------------------------------------------------------------------- /chapter12/2.punctuation.py: -------------------------------------------------------------------------------- 1 | import string 2 | 3 | # Sample text 4 | text = "I love this product!!! It's amazing!!!" 5 | 6 | 7 | # Option 1: Replace symbols and punctuation 8 | replaced_text = text.translate(str.maketrans(string.punctuation, " " * len(string.punctuation))) 9 | print("Replaced Text:", replaced_text) 10 | 11 | # Option 2: Remove symbols and punctuation 12 | removed_text = "".join(char for char in text if char.isalnum() or char.isspace()) 13 | print("Removed Text:", removed_text) 14 | -------------------------------------------------------------------------------- /chapter12/3.pii_detection.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from presidio_analyzer import AnalyzerEngine 3 | from presidio_anonymizer import AnonymizerEngine 4 | from presidio_anonymizer.entities import OperatorConfig 5 | 6 | # Sample DataFrame 7 | data = { 8 | 'text': [ 9 | "Hello, my name is John Doe. My email is john.doe@example.com", 10 | "Contact Jane Smith at jane.smith@work.com", 11 | "Call her at 987-654-3210.", 12 | "This is a test message without PII." 13 | ] 14 | } 15 | 16 | df = pd.DataFrame(data) 17 | 18 | # Initialize the analyzer and anonymizer engines 19 | analyzer = AnalyzerEngine() 20 | anonymizer = AnonymizerEngine() 21 | 22 | def anonymize_text(text): 23 | """ Anonymize PII entities in text """ 24 | # Analyze the text to detect PII entities 25 | analyzer_results = analyzer.analyze(text=text, entities=["PERSON", "EMAIL_ADDRESS", "PHONE_NUMBER"], language="en") 26 | 27 | # Define the anonymization configuration 28 | operators = { 29 | "PERSON": OperatorConfig("mask", {"masking_char": "*", "chars_to_mask": 4, "from_end": True}), 30 | "EMAIL_ADDRESS": OperatorConfig("mask", {"masking_char": "*", "chars_to_mask": 5, "from_end": True}), 31 | "PHONE_NUMBER": OperatorConfig("mask", {"masking_char": "*", "chars_to_mask": 6, "from_end": True}) 32 | } 33 | 34 | # Anonymize the detected PII entities 35 | anonymized_result = anonymizer.anonymize(text=text, analyzer_results=analyzer_results, operators=operators) 36 | 37 | return anonymized_result.text 38 | 39 | # Apply the anonymization function to the DataFrame 40 | df['anonymized_text'] = df['text'].apply(anonymize_text) 41 | 42 | # Display the DataFrame 43 | print(df['anonymized_text']) -------------------------------------------------------------------------------- /chapter12/4.rare_words.py: -------------------------------------------------------------------------------- 1 | from transformers import GPT2LMHeadModel, GPT2Tokenizer 2 | 3 | # Initialize the GPT-2 tokenizer and model 4 | tokenizer = GPT2Tokenizer.from_pretrained("gpt2") 5 | model = GPT2LMHeadModel.from_pretrained("gpt2") 6 | 7 | # Define a text prompt with a rare word 8 | text = "The quokka, a rare marsupial," 9 | 10 | # Encode the input text to tensor 11 | indexed_tokens = tokenizer.encode(text, return_tensors='pt') 12 | 13 | # Generate text until the output length reaches 50 tokens 14 | output_text = model.generate(indexed_tokens, 
max_length=50, num_beams=5, no_repeat_ngram_size=2, early_stopping=True) 15 | 16 | # Decode the output text 17 | output_text_decoded = tokenizer.decode(output_text[0], skip_special_tokens=True) 18 | print(output_text_decoded) -------------------------------------------------------------------------------- /chapter12/5.spelling_checker.py: -------------------------------------------------------------------------------- 1 | from transformers import pipeline 2 | 3 | def fix_spelling(text): 4 | # Initialize the spelling correction pipeline 5 | spell_check = pipeline("text2text-generation", model="oliverguhr/spelling-correction-english-base") 6 | 7 | # Generate the corrected text 8 | corrected = spell_check(text, max_length=2048)[0]['generated_text'] 9 | 10 | return corrected 11 | 12 | # Test the function with some sample text containing spelling mistakes 13 | sample_text = "y name si from Grece." 14 | corrected_text = fix_spelling(sample_text) 15 | 16 | print("Original text:", sample_text) 17 | print("Corrected text:", corrected_text) -------------------------------------------------------------------------------- /chapter12/6.fuzzy_matching.py: -------------------------------------------------------------------------------- 1 | from transformers import pipeline 2 | from thefuzz import process, fuzz 3 | 4 | def fix_spelling(text, threshold=80): 5 | # Initialize the spelling correction pipeline 6 | spell_check = pipeline("text2text-generation", model="oliverguhr/spelling-correction-english-base") 7 | 8 | # Generate the corrected text 9 | corrected = spell_check(text, max_length=2048)[0]['generated_text'] 10 | 11 | # Split the original and corrected texts into words 12 | original_words = text.split() 13 | corrected_words = corrected.split() 14 | 15 | # Create a dictionary of common English words (you can expand this list) 16 | common_words = set(['the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have', 'I', 'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you', 'do', 'at']) 17 | 18 | # Fuzzy match each word 19 | final_words = [] 20 | for orig, corr in zip(original_words, corrected_words): 21 | if orig.lower() in common_words: 22 | final_words.append(orig) # Keep common words as they are 23 | else: 24 | # Use fuzzy matching to find the best match 25 | matches = process.extractOne(orig, [corr], scorer=fuzz.ratio) 26 | if matches[1] >= threshold: 27 | final_words.append(matches[0]) 28 | else: 29 | final_words.append(orig) # Keep the original word if no good match found 30 | 31 | return ' '.join(final_words) 32 | 33 | # Test the function with some sample text containing spelling mistakes 34 | sample_text = "Lets do a copmarsion of speling mistaks in this sentense." 35 | corrected_text = fix_spelling(sample_text) 36 | 37 | print("Original text:", sample_text) 38 | print("Corrected text:", corrected_text) -------------------------------------------------------------------------------- /chapter12/7.fixed_chunking.py: -------------------------------------------------------------------------------- 1 | # Step 1: Load Example Data 2 | reviews = [ 3 | "This smartphone has an excellent camera. The photos are sharp and the colors are vibrant. Overall, very satisfied with my purchase.", 4 | "I was disappointed with the laptop's performance. It frequently lags and the battery life is shorter than expected.", 5 | "The blender works great for making smoothies. It's powerful and easy to clean. Definitely worth the price.", 6 | "Customer support was unresponsive. 
I had to wait a long time for a reply, and my issue was not resolved satisfactorily.", 7 | "The book is a fascinating read. The storyline is engaging and the characters are well-developed. Highly recommend to all readers." 8 | ] 9 | 10 | # Step 2: Create the TokenTextSplitter 11 | from langchain_text_splitters import TokenTextSplitter 12 | 13 | # Initialize the TokenTextSplitter with a chunk size of 50 tokens and no overlap 14 | text_splitter = TokenTextSplitter(chunk_size=50, chunk_overlap=0) 15 | 16 | # Step 3: Join Reviews and Split Text 17 | # Combine the reviews into a single text block for chunking 18 | text_block = " ".join(reviews) 19 | 20 | # Split the text into token-based chunks 21 | chunks = text_splitter.split_text(text_block) 22 | 23 | # Print the chunks 24 | print("Chunks with 50 tokens each:") 25 | for i, chunk in enumerate(chunks): 26 | print(f"Chunk {i + 1}:") 27 | print(chunk) 28 | print("\n") 29 | 30 | # Step 4: Experiment with Different Chunk Sizes 31 | chunk_sizes = [20, 70, 150] 32 | 33 | for size in chunk_sizes: 34 | print(f"Chunk Size: {size}") 35 | text_splitter = TokenTextSplitter(chunk_size=size, chunk_overlap=0) 36 | chunks = text_splitter.split_text(text_block) 37 | 38 | for i, chunk in enumerate(chunks): 39 | print(f"Chunk {i + 1}:") 40 | print(chunk) 41 | print("\n") 42 | -------------------------------------------------------------------------------- /chapter12/8.paragraph_chunking.py: -------------------------------------------------------------------------------- 1 | from langchain.text_splitter import RecursiveCharacterTextSplitter 2 | 3 | reviews = [ 4 | "This smartphone has an excellent camera. The photos are sharp and the colors are vibrant. Overall, very satisfied with my purchase.", 5 | "I was disappointed with the laptop's performance. It frequently lags and the battery life is shorter than expected.", 6 | "The blender works great for making smoothies. It's powerful and easy to clean. Definitely worth the price.", 7 | "Customer support was unresponsive. I had to wait a long time for a reply, and my issue was not resolved satisfactorily.", 8 | "The book is a fascinating read. The storyline is engaging and the characters are well-developed. Highly recommend to all readers." 9 | ] 10 | 11 | # Combine the reviews into a single text block for chunking 12 | text_block = " ".join(reviews) 13 | 14 | # Create a RecursiveCharacterTextSplitter 15 | text_splitter = RecursiveCharacterTextSplitter( 16 | separators=["\n\n", "\n", " ", ""], 17 | chunk_size=200, 18 | chunk_overlap=0, 19 | length_function=len 20 | ) 21 | 22 | # Split the text into chunks 23 | chunks = text_splitter.split_text(text_block) 24 | 25 | # Print the chunks 26 | for i, chunk in enumerate(chunks, 1): 27 | print(f"Chunk {i}:") 28 | print(chunk.strip()) 29 | print("-" * 50) -------------------------------------------------------------------------------- /chapter12/9.semantic_chunking.py: -------------------------------------------------------------------------------- 1 | from langchain_experimental.text_splitter import SemanticChunker 2 | from langchain_huggingface import HuggingFaceEmbeddings 3 | import os 4 | 5 | reviews = [ 6 | "This smartphone has an excellent camera. The photos are sharp and the colors are vibrant. Overall, very satisfied with my purchase.", 7 | "I was disappointed with the laptop's performance. It frequently lags and the battery life is shorter than expected.", 8 | "The blender works great for making smoothies. It's powerful and easy to clean. 
Definitely worth the price.", 9 | "Customer support was unresponsive. I had to wait a long time for a reply, and my issue was not resolved satisfactorily.", 10 | "The book is a fascinating read. The storyline is engaging and the characters are well-developed. Highly recommend to all readers." 11 | ] 12 | # Combine the reviews into a single text block for chunking 13 | text_block = " ".join(reviews) 14 | 15 | text_splitter = SemanticChunker(HuggingFaceEmbeddings()) 16 | 17 | docs = text_splitter.create_documents([text_block]) 18 | 19 | for i, doc in enumerate(docs): 20 | print(f"Chunk {i + 1}:") 21 | print(doc.page_content) 22 | print("\n") -------------------------------------------------------------------------------- /chapter12/9.semantic_similarity.py: -------------------------------------------------------------------------------- 1 | from langchain_experimental.text_splitter import SemanticChunker 2 | from langchain_huggingface import HuggingFaceEmbeddings 3 | 4 | # Load the HuggingFace embedding model 5 | embedding_model = HuggingFaceEmbeddings(model_name='roberta-base') 6 | 7 | # Create a SemanticChunker with the correct parameters 8 | text_splitter = SemanticChunker( 9 | embeddings=embedding_model, 10 | buffer_size=10, # Hypothetical buffer size 11 | add_start_index=True, # Whether to add start index for chunks 12 | breakpoint_threshold_type='interquartile', # Type of breakpoint threshold 13 | breakpoint_threshold_amount=0.7, # Amount for breakpoint threshold 14 | number_of_chunks=5 # Target number of chunks, optional 15 | ) 16 | 17 | # Example data 18 | reviews = [ 19 | "This smartphone has an excellent camera. The photos are sharp and the colors are vibrant. Overall, very satisfied with my purchase.", 20 | "I was disappointed with the laptop's performance. It frequently lags and the battery life is shorter than expected.", 21 | "The blender works great for making smoothies. It's powerful and easy to clean. Definitely worth the price.", 22 | "Customer support was unresponsive. I had to wait a long time for a reply, and my issue was not resolved satisfactorily.", 23 | "The book is a fascinating read. The storyline is engaging and the characters are well-developed. Highly recommend to all readers." 
24 | ] 25 | 26 | # Combine the reviews into a single text block for chunking 27 | text_block = " ".join(reviews) 28 | 29 | # Split the text into semantic chunks 30 | chunks = text_splitter.split_text(text_block) 31 | 32 | # Print the chunks 33 | for i, chunk in enumerate(chunks, 1): 34 | print(f"Chunk {i}:") 35 | print(chunk.strip()) 36 | print("-" * 50) 37 | -------------------------------------------------------------------------------- /chapter13/2.ocr.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | from PIL import Image 4 | from paddleocr import PaddleOCR, draw_ocr 5 | import matplotlib.pyplot as plt 6 | 7 | # Initialize PaddleOCR 8 | ocr = PaddleOCR(use_angle_cls=True, lang='en') 9 | 10 | # Define the folder containing images 11 | folder_path = 'chapter13/images' 12 | 13 | # Supported image extensions 14 | supported_extensions = ('.png', '.jpg', '.jpeg') 15 | 16 | # Get all images in the folder 17 | image_paths = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.lower().endswith(supported_extensions)] 18 | 19 | # Create an empty DataFrame to store results 20 | df = pd.DataFrame(columns=['Image Path', 'Extracted Text']) 21 | 22 | # Check if there are any images found 23 | if not image_paths: 24 | print("No images found in the specified folder.") 25 | else: 26 | # Function to process images and extract text 27 | def process_image(image_path): 28 | # Perform OCR on the image 29 | result = ocr.ocr(image_path, cls=True) 30 | 31 | # Extracting and printing the text 32 | extracted_text = "" 33 | for line in result[0]: 34 | extracted_text += line[1][0] + " " 35 | print(f"Extracted Text from {os.path.basename(image_path)}:\n{extracted_text}\n") 36 | 37 | # Append results to DataFrame 38 | df.loc[len(df)] = [image_path, extracted_text] 39 | 40 | # Process each image in the folder 41 | for image_path in image_paths: 42 | process_image(image_path) 43 | 44 | # Display the DataFrame 45 | print(df) 46 | 47 | # Optionally, save the DataFrame to a CSV file 48 | df.to_csv('extracted_texts.csv', index=False) 49 | -------------------------------------------------------------------------------- /chapter13/3.ocr_with_llms.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json 3 | import re 4 | from langchain import PromptTemplate, LLMChain 5 | from langchain.llms import HuggingFaceHub 6 | 7 | # Read the CSV file 8 | df = pd.read_csv('extracted_texts.csv') 9 | 10 | # Initialize the Hugging Face model 11 | model_name = "mistralai/Mistral-Nemo-Instruct-2407" # Using Mistral for instruction-following 12 | 13 | # Your Hugging Face API token 14 | api_token = "add_your_token" # Replace with your actual API token 15 | 16 | # LangChain setup with few-shot examples 17 | prompt_template = PromptTemplate( 18 | input_variables=["text"], 19 | template='''Correct the following text for spelling errors and return only the corrected text in lowercase. 
Respond using JSON format, strictly according to the following schema: 20 | {{"corrected_text": "corrected text in lowercase"}} 21 | 22 | Examples: 23 | Input: "Open vs Proprietry LLMs" 24 | Output: {{"corrected_text": "open vs proprietary llms"}} 25 | 26 | Input: "HOW TO MITIGATE SaCURITY RISKS IN AI AND ML SYSTEM VECTOR LAB" 27 | Output: {{"corrected_text": "how to mitigate security risks in ai and ml system vector lab"}} 28 | 29 | Input: "BUILDING DBRX-CLASS CUSTOM LLMS WITH MOSAIC A1 TRAINING VECTOR LAB" 30 | Output: {{"corrected_text": "building dbrx-class custom llms with mosaic a1 training vector lab"}} 31 | 32 | Text to correct: 33 | {text} 34 | Output (JSON format only): 35 | ''' 36 | ) 37 | 38 | huggingface_llm = HuggingFaceHub(repo_id=model_name, huggingfacehub_api_token=api_token, model_kwargs={"task": "text-generation"}) 39 | llm_chain = LLMChain(prompt=prompt_template, llm=huggingface_llm) 40 | 41 | def correct_text(text): 42 | # Use the LLMChain to generate a response 43 | response = llm_chain.run(text) 44 | print(f"Raw Response: {response}") # Debugging line to see the raw response 45 | 46 | # Use regex to extract the JSON part that follows "Output (JSON format only):" 47 | json_match = re.search(r'Output \(JSON format only\):\s*(\{.*\})', response) 48 | if json_match: 49 | json_str = json_match.group(1) 50 | try: 51 | response_json = json.loads(json_str) 52 | corrected_text = response_json.get('corrected_text', '') 53 | return corrected_text 54 | except json.JSONDecodeError as json_error: 55 | print(f"JSON Decode Error: {json_error}") 56 | return "error" 57 | else: 58 | print("No valid JSON object found in the response") 59 | return "error" 60 | 61 | # Apply text correction to the 'Extracted Text' column 62 | df['Corrected Text'] = df['Extracted Text'].apply(correct_text) 63 | 64 | # Display the DataFrame 65 | print(df) 66 | 67 | # Optionally, save the updated DataFrame to a new CSV file 68 | df.to_csv('cleaned_texts.csv', index=False) 69 | 70 | # Print examples of corrections 71 | for _, row in df.iterrows(): 72 | print("Original:", row['Extracted Text']) 73 | print("Corrected:", row['Corrected Text']) 74 | print() 75 | -------------------------------------------------------------------------------- /chapter13/4.image_captioning.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | from PIL import Image 4 | import matplotlib.pyplot as plt 5 | from transformers import BlipProcessor, BlipForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM 6 | from langchain import PromptTemplate, LLMChain 7 | from langchain.llms import HuggingFaceHub 8 | 9 | # Define the folder containing images 10 | folder_path = 'chapter13/images' 11 | 12 | # Supported image extensions 13 | supported_extensions = ('.png', '.jpg', '.jpeg') 14 | 15 | # Get all images in the folder 16 | image_paths = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.lower().endswith(supported_extensions)] 17 | 18 | # Create an empty DataFrame to store results 19 | df = pd.DataFrame(columns=['Image Path', 'Generated Caption', 'Refined Caption']) 20 | 21 | # Initialize the BLIP model and processor for image captioning 22 | blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base") 23 | blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base") 24 | 25 | # Initialize the LLM for text refinement 26 | llm_model_name = "google/flan-t5-small" # You can 
choose other models as well 27 | tokenizer = AutoTokenizer.from_pretrained(llm_model_name) 28 | model = AutoModelForSeq2SeqLM.from_pretrained(llm_model_name) 29 | 30 | # LangChain setup 31 | api_token = "" 32 | prompt_template = PromptTemplate(input_variables=["text"], template="Refine and correct the following caption: {text}") 33 | huggingface_llm = HuggingFaceHub(repo_id=llm_model_name, huggingfacehub_api_token=api_token) 34 | llm_chain = LLMChain(prompt=prompt_template, llm=huggingface_llm) 35 | 36 | def refine_caption(caption): 37 | # Run the LLMChain on the raw caption; the chain applies prompt_template itself, 38 | # so formatting the prompt manually here would wrap the instruction twice 39 | refined_caption = llm_chain.run(caption) 40 | return refined_caption 41 | 42 | def generate_caption(image_path): 43 | image = Image.open(image_path).convert("RGB") 44 | inputs = blip_processor(images=image, return_tensors="pt") 45 | outputs = blip_model.generate(**inputs) 46 | caption = blip_processor.decode(outputs[0], skip_special_tokens=True) 47 | return caption 48 | 49 | # Process each image in the folder 50 | if not image_paths: 51 | print("No images found in the specified folder.") 52 | else: 53 | for image_path in image_paths: 54 | # Generate image caption 55 | caption = generate_caption(image_path) 56 | print(f"Generated Caption for {os.path.basename(image_path)}:\n{caption}\n") 57 | 58 | # Refine the caption 59 | refined_caption = refine_caption(caption) 60 | print(f"Refined Caption:\n{refined_caption}\n") 61 | 62 | # Append results to DataFrame 63 | df.loc[len(df)] = [image_path, caption, refined_caption] 64 | 65 | # Display the DataFrame 66 | print(df) 67 | 68 | # Optionally, save the DataFrame to a CSV file 69 | df.to_csv('captions.csv', index=False) 70 | -------------------------------------------------------------------------------- /chapter13/5.whisper.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import WhisperProcessor, WhisperForConditionalGeneration 3 | import librosa 4 | 5 | # Load the Whisper processor and model from Hugging Face 6 | processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2") 7 | model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2") 8 | 9 | # Define the path to your audio file 10 | audio_path = "chapter13/audio/3.chain orchestrator.mp3" # Replace with your actual audio file path 11 | 12 | # Load the audio file 13 | audio, rate = librosa.load(audio_path, sr=16000) 14 | 15 | # Preprocess the audio file for the Whisper model 16 | input_features = processor(audio, sampling_rate=rate, return_tensors="pt").input_features 17 | 18 | # Generate the transcription 19 | with torch.no_grad(): 20 | predicted_ids = model.generate(input_features) 21 | 22 | # Decode the generated transcription 23 | transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0] 24 | 25 | # Print the transcribed text 26 | print("Transcribed Text:") 27 | print(transcription) 28 | -------------------------------------------------------------------------------- /chapter13/6.emotion_detection.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pandas as pd 3 | from transformers import WhisperProcessor, WhisperForConditionalGeneration, AutoModelForSequenceClassification, AutoTokenizer 4 | import librosa 5 | import numpy as np 6 | 7 | # Load the Whisper processor and model from Hugging Face 8 | whisper_processor = 
WhisperProcessor.from_pretrained("openai/whisper-large-v2") 9 | whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2") 10 | 11 | # Load the emotion detection processor and model from Hugging Face 12 | emotion_model_name = "j-hartmann/emotion-english-distilroberta-base" 13 | emotion_tokenizer = AutoTokenizer.from_pretrained(emotion_model_name) 14 | emotion_model = AutoModelForSequenceClassification.from_pretrained(emotion_model_name) 15 | 16 | # Define the path to your audio file 17 | audio_path = "chapter13/audio/3.chain orchestrator.mp3" # Replace with your actual audio file path 18 | 19 | # Load the audio file 20 | audio, rate = librosa.load(audio_path, sr=16000) 21 | 22 | # Function to split audio into chunks 23 | def split_audio(audio, rate, chunk_duration=30): 24 | chunk_length = int(rate * chunk_duration) 25 | num_chunks = int(np.ceil(len(audio) / chunk_length)) 26 | return [audio[i*chunk_length:(i+1)*chunk_length] for i in range(num_chunks)] 27 | 28 | # Function to transcribe audio to text using Whisper 29 | def transcribe_audio(audio_chunk, rate): 30 | # Preprocess the audio file for the Whisper model 31 | input_features = whisper_processor(audio_chunk, sampling_rate=rate, return_tensors="pt").input_features 32 | 33 | # Generate the transcription 34 | with torch.no_grad(): 35 | predicted_ids = whisper_model.generate(input_features) 36 | 37 | # Decode the generated transcription 38 | transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0] 39 | return transcription 40 | 41 | # Function to detect emotions from text using the emotion detection model 42 | def detect_emotion(text): 43 | inputs = emotion_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512) 44 | outputs = emotion_model(**inputs) 45 | predicted_class_id = torch.argmax(outputs.logits, dim=-1).item() 46 | emotions = emotion_model.config.id2label 47 | return emotions[predicted_class_id] 48 | 49 | # Split audio into chunks 50 | audio_chunks = split_audio(audio, rate, chunk_duration=30) # 30-second chunks 51 | 52 | # Create a DataFrame to store results 53 | df = pd.DataFrame(columns=['Chunk Index', 'Transcription', 'Emotion']) 54 | 55 | # Process each audio chunk 56 | for i, audio_chunk in enumerate(audio_chunks): 57 | transcription = transcribe_audio(audio_chunk, rate) 58 | emotion = detect_emotion(transcription) 59 | 60 | # Append results to DataFrame 61 | df.loc[i] = [i, transcription, emotion] 62 | print(f"Processed Chunk {i+1}/{len(audio_chunks)}") 63 | 64 | # Display the DataFrame 65 | print(df) 66 | 67 | # Optionally, save the DataFrame to a CSV file 68 | df.to_csv('transcriptions_with_emotions.csv', index=False) 69 | -------------------------------------------------------------------------------- /chapter13/7.write_highlights.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pandas as pd 3 | from transformers import WhisperProcessor, WhisperForConditionalGeneration 4 | import librosa 5 | import numpy as np 6 | from langchain.prompts import PromptTemplate 7 | from langchain.chains import LLMChain 8 | from langchain.llms import HuggingFaceHub 9 | 10 | # Load the Whisper processor and model from Hugging Face 11 | whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2") 12 | whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2") 13 | 14 | # Initialize the Hugging Face model 15 | model_name = 
"mistralai/Mistral-Nemo-Instruct-2407" # Using Mistral for instruction-following 16 | 17 | # Your Hugging Face API token 18 | api_token = "add_your_huggigng_face_token" # Replace with your actual API token 19 | 20 | # LangChain setup with few-shot examples 21 | prompt_template = PromptTemplate( 22 | input_variables=["text"], 23 | template='''This is the transcribed text from a YouTube video. Write the key highlights from this video in bullet format. 24 | {text} 25 | Output: 26 | ''' 27 | ) 28 | 29 | huggingface_llm = HuggingFaceHub(repo_id=model_name, huggingfacehub_api_token=api_token, model_kwargs={"task": "text-generation"}) 30 | llm_chain = LLMChain(prompt=prompt_template, llm=huggingface_llm) 31 | 32 | # Define the path to your audio file 33 | audio_path = "chapter13/audio/3.chain orchestrator.mp3" # Replace with your actual audio file path 34 | 35 | # Load the audio file 36 | audio, rate = librosa.load(audio_path, sr=16000) 37 | 38 | # Function to split audio into chunks 39 | def split_audio(audio, rate, chunk_duration=30): 40 | chunk_length = int(rate * chunk_duration) 41 | num_chunks = int(np.ceil(len(audio) / chunk_length)) 42 | return [audio[i*chunk_length:(i+1)*chunk_length] for i in range(num_chunks)] 43 | 44 | # Function to transcribe audio to text using Whisper 45 | def transcribe_audio(audio_chunk, rate): 46 | # Preprocess the audio file for the Whisper model 47 | input_features = whisper_processor(audio_chunk, sampling_rate=rate, return_tensors="pt").input_features 48 | 49 | # Generate the transcription 50 | with torch.no_grad(): 51 | predicted_ids = whisper_model.generate(input_features) 52 | 53 | # Decode the generated transcription 54 | transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0] 55 | return transcription 56 | 57 | # Function to generate key highlights from text using the LLM 58 | def generate_highlights(text): 59 | try: 60 | response = llm_chain.run(text) 61 | return response.strip() # Clean up any whitespace around the response 62 | except Exception as e: 63 | print(f"Error generating highlights: {e}") 64 | return "error" # Handle errors gracefully 65 | 66 | # Split audio into chunks 67 | audio_chunks = split_audio(audio, rate, chunk_duration=30) # 30-second chunks 68 | 69 | # Transcribe each audio chunk 70 | transcriptions = [transcribe_audio(chunk, rate) for chunk in audio_chunks] 71 | 72 | # Join all transcriptions into a single text 73 | full_transcription = " ".join(transcriptions) 74 | 75 | # Generate highlights from the full transcription 76 | highlights = generate_highlights(full_transcription) 77 | 78 | # Create a DataFrame to store results 79 | df = pd.DataFrame(columns=['Full Transcription', 'Highlights']) 80 | df.loc[0] = [full_transcription, highlights] 81 | 82 | # Display the DataFrame 83 | print(df) 84 | 85 | # Optionally, save the DataFrame to a CSV file 86 | df.to_csv('transcriptions_with_highlights.csv', index=False) 87 | 88 | # Print examples of corrections 89 | print("Full Transcription:", full_transcription) 90 | print("Highlights:", highlights) 91 | -------------------------------------------------------------------------------- /chapter13/audio/3.chain orchestrator.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/audio/3.chain orchestrator.mp3 
-------------------------------------------------------------------------------- /chapter13/images/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/1.png -------------------------------------------------------------------------------- /chapter13/images/10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/10.png -------------------------------------------------------------------------------- /chapter13/images/11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/11.png -------------------------------------------------------------------------------- /chapter13/images/12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/12.png -------------------------------------------------------------------------------- /chapter13/images/13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/13.png -------------------------------------------------------------------------------- /chapter13/images/14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/14.png -------------------------------------------------------------------------------- /chapter13/images/15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/15.png -------------------------------------------------------------------------------- /chapter13/images/16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/16.png -------------------------------------------------------------------------------- /chapter13/images/17.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/17.png -------------------------------------------------------------------------------- /chapter13/images/18.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/18.png 
-------------------------------------------------------------------------------- /chapter13/images/19.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/19.png -------------------------------------------------------------------------------- /chapter13/images/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/2.png -------------------------------------------------------------------------------- /chapter13/images/20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/20.png -------------------------------------------------------------------------------- /chapter13/images/21.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/21.png -------------------------------------------------------------------------------- /chapter13/images/22.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/22.png -------------------------------------------------------------------------------- /chapter13/images/23.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/23.png -------------------------------------------------------------------------------- /chapter13/images/24.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/24.png -------------------------------------------------------------------------------- /chapter13/images/25.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/25.png -------------------------------------------------------------------------------- /chapter13/images/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/3.png -------------------------------------------------------------------------------- /chapter13/images/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/4.png 
-------------------------------------------------------------------------------- /chapter13/images/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/5.png -------------------------------------------------------------------------------- /chapter13/images/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/6.png -------------------------------------------------------------------------------- /chapter13/images/7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/7.png -------------------------------------------------------------------------------- /chapter13/images/8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/8.png -------------------------------------------------------------------------------- /chapter13/images/9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Python-Data-Cleaning-and-Preparation-Best-Practices/be8a99bb2178efe4f5e3e9b54d3a50f494f86de3/chapter13/images/9.png --------------------------------------------------------------------------------