├── README.md ├── bash-data-cleaning ├── clean_user_data.sh ├── output.md └── users.csv ├── build-with-python └── data-cleaning-n-validation-pipeline │ └── main.py ├── containerizing-python-apps └── currency-api │ ├── .dockerignore │ ├── Dockerfile │ ├── main.py │ └── requirements.txt ├── data-cleaning-essential-guide ├── README.md └── messy_data.csv ├── data-cleaning ├── README.md ├── generate_df.py ├── pandas_data_cleaning_one_liners.ipynb └── pandas_eda_one_liners.ipynb ├── data-science-app ├── README.md ├── app.py └── model_training.py ├── deploy_ml_models ├── Dockerfile ├── README.md ├── app.py └── linear_regression.py ├── docker ├── README.md ├── docker-volume │ └── volume-postgres.md ├── leverage-build-cache │ └── README.md └── minimal-img-python-apps │ ├── Dockerfile │ ├── README.md │ ├── app.py │ └── requirements.txt ├── duckdb-json ├── ecommerce_data.json ├── output.md └── query_json.sql ├── duckdb-miniseries ├── README.md ├── analyze-csv │ ├── query_csv.sql │ ├── query_csv_op.md │ └── shopping_data.csv ├── analyze-pandas-dataframes │ ├── DuckDB_pandas_df.ipynb │ └── README.md ├── analyze-parquet │ ├── query_parquet.sql │ ├── query_parquet_op.md │ └── restaurant_orders.parquet ├── descriptive-statistics │ ├── DuckDB_descriptive_stats.ipynb │ ├── README.md │ └── cab_ride_data.csv └── hypothesis-testing │ ├── DuckDB_hypothesis_testing.ipynb │ ├── README.md │ └── cab_ride_data.csv ├── duckdb ├── README.md ├── generate_csv.py └── main.py ├── fastapi-docker-for-ml-model-deployment └── diabetes-predictor │ ├── app │ ├── __init__.py │ └── main.py │ └── train_model.py ├── fastapi ├── README.md └── main.py ├── machine-learning ├── Feature_Engineering_Tips.ipynb ├── HyperparameterTuning.ipynb ├── LinearRegressionExample.ipynb ├── LogisticRegressionExample.ipynb └── README.md ├── model_deployment ├── Dockerfile ├── README.md ├── app │ ├── __init__.py │ └── main.py ├── model_training.py └── requirements.txt ├── natural-language-processing ├── README.md └── nlp_with_python.ipynb ├── pandas ├── 5_steps_data_cleaning.ipynb ├── README.md ├── common_pandas_errors.ipynb ├── data_cleaning_with_pandas.ipynb ├── pandas_data_quality_checks_one_liners.ipynb └── pandas_plotting_functions.ipynb ├── postgres └── README.md ├── pyspark ├── README.md ├── pyspark_data_cleaning.ipynb ├── pyspark_read_csv.ipynb └── pyspark_write_parquet.ipynb ├── regex ├── learn_regex.py ├── quick-ref-regex.md ├── regex_basics.ipynb ├── regex_contd.ipynb └── regex_examples.ipynb ├── statistical_plots.ipynb └── statistics ├── Basic_Stats_Functions_Python.ipynb ├── DescriptiveStats.ipynb ├── DescriptiveStats[final].ipynb ├── Outlier_Detection_Tips.ipynb ├── README.md ├── Stats_Libraries.ipynb ├── Visualizing_Statistical_Data.ipynb ├── handle_excel_files.ipynb ├── outlier_detection_techniques.ipynb ├── probability ├── README.md ├── beta_distribution.ipynb ├── cauchy_distribution.ipynb ├── geometric_distribution.ipynb ├── joint_and_conditional_pbty.ipynb └── poisson_distribution.ipynb ├── process_csv_files.ipynb ├── scipy_time_series_analysis.ipynb ├── sparse_data_analysis.ipynb ├── sparse_data_analysis_v0_1.ipynb └── time_series_decomposition.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # Data Science Tutorials 2 | > If you're coming from one of my data science tutorials, you'll find the code and the links to the tutorials here. 3 | I hope you find them helpful. Happy learning and coding! 
4 | 5 | data-science-tutorials 6 | 7 | 8 | 9 | |No.| Article| Code| 10 | |----|----|------| 11 | |1|[Build a Data Science App with Python in 10 Easy Steps](https://www.kdnuggets.com/build-data-science-app-with-python-10-easy-steps)|[Code](https://github.com/balapriyac/data-science-tutorials/tree/main/data-science-app)| 12 | |2|[A Practical Guide to Deploying Machine Learning Models](https://machinelearningmastery.com/a-practical-guide-to-deploying-machine-learning-models/)|[Code](https://github.com/balapriyac/data-science-tutorials/tree/main/model_deployment)| 13 | |3|[FastAPI Tutorial: Build APIs with Python in Minutes](https://www.kdnuggets.com/fastapi-tutorial-build-apis-with-python-in-minutes)|[Code](https://github.com/balapriyac/data-science-tutorials/tree/main/fastapi)| 14 | |4|[The Beginner’s Guide to Natural Language Processing with Python](https://machinelearningmastery.com/the-beginners-guide-to-natural-language-processing-with-python/)|[Code](https://github.com/balapriyac/data-science-tutorials/tree/main/natural-language-processing)| 15 | |5|[How to Perform Statistical Analysis on Sparse Data in Python](https://www.statology.org/how-to-perform-statistical-analysis-sparse-data-python/)|[Code](https://github.com/balapriyac/data-science-tutorials/blob/main/statistics/sparse_data_analysis_v0_1.ipynb)| 16 | |6|[10 Essential Statistical Functions in Python](https://www.statology.org/10-essential-statistical-functions-in-python/)|[Code](https://github.com/balapriyac/data-science-tutorials/blob/main/statistics/Basic_Stats_Functions_Python.ipynb)| 17 | |7|[How to Calculate Joint and Conditional Probabilities in Python](https://www.statology.org/how-to-calculate-joint-and-conditional-probabilities-in-python/)|[Code]()| 18 | |8|[How to Use the Geometric Distribution in Python](https://www.statology.org/how-to-use-the-geometric-distribution-in-python/)|[Code](https://github.com/balapriyac/data-science-tutorials/blob/main/statistics/probability/geometric_distribution.ipynb)| 19 | |9|[How to Use the Beta Distribution in Python](https://www.statology.org/how-to-use-the-beta-distribution-in-python/)|[Code](https://github.com/balapriyac/data-science-tutorials/blob/main/statistics/probability/beta_distribution.ipynb)| 20 | |10|[How to Use the Cauchy Distribution in Python](https://www.statology.org/how-to-use-the-cauchy-distribution-in-python/)|[Code](https://github.com/balapriyac/data-science-tutorials/blob/main/statistics/probability/cauchy_distribution.ipynb)| 21 | |11|[Tips for Effective Outlier Detection in Real-World Datasets](https://www.statology.org/tips-for-effective-outlier-detection-in-real-world-datasets/)|[Code](https://github.com/balapriyac/data-science-tutorials/blob/main/statistics/Outlier_Detection_Tips.ipynb)| 22 | |12|[How to Interpret Statistical Plots in Python](https://www.statology.org/how-to-interpret-statistical-plots-python/)|[Code](https://github.com/balapriyac/data-science-tutorials/blob/main/statistical_plots.ipynb)| 23 | |13|[Data Cleaning with Bash: A Handbook for Developers](https://www.kdnuggets.com/data-cleaning-with-bash-a-handbook-for-developers)|[Code](https://github.com/balapriyac/data-science-tutorials/tree/main/bash-data-cleaning)| 24 | |14| [Analyzing JSON Data with DuckDB & SQL](https://www.kdnuggets.com/analyzing-json-data-with-duckdb-sql)|[Code](https://github.com/balapriyac/data-science-tutorials/tree/main/duckdb-json)| 25 | |15| [The Poisson Distribution: From Basics to Real-World 
Examples](https://www.statology.org/the-poisson-distribution-from-basics-to-real-world-examples/)|[Code](https://github.com/balapriyac/data-science-tutorials/blob/main/statistics/probability/poisson_distribution.ipynb)| 26 | |16|[Why & How to Containerize Your Existing Python Apps](https://www.kdnuggets.com/why-how-to-containerize-your-existing-python-apps)|[Code](https://github.com/balapriyac/data-science-tutorials/tree/main/containerizing-python-apps/currency-api)| 27 | |17|[How to Analyze Parquet Files with DuckDB](https://www.statology.org/how-to-analyze-parquet-files-with-duckdb/)|[Code](https://github.com/balapriyac/data-science-tutorials/tree/main/duckdb-miniseries/analyze-parquet)| 28 | |18|[How to Analyze CSV Files with DuckDB](https://www.statology.org/how-to-analyze-csv-files-with-duckdb/)|[Code](https://github.com/balapriyac/data-science-tutorials/tree/main/duckdb-miniseries/analyze-csv)| 29 | |19|[How to Query Pandas DataFrames with DuckDB](https://www.statology.org/how-to-query-pandas-dataframes-with-duckdb/)|[Code](https://github.com/balapriyac/data-science-tutorials/tree/main/duckdb-miniseries/analyze-pandas-dataframes)| 30 | |20|[How to Calculate Descriptive Statistics in DuckDB](https://www.statology.org/how-to-calculate-descriptive-statistics-in-duckdb/)|[Code](https://github.com/balapriyac/data-science-tutorials/tree/main/duckdb-miniseries/descriptive-statistics)| 31 | |21|[How to Perform Hypothesis Testing in DuckDB](https://www.statology.org/how-to-perform-hypothesis-testing-in-duckdb/)|[Code](https://github.com/balapriyac/data-science-tutorials/tree/main/duckdb-miniseries/hypothesis-testing)| 32 | |22|[Top 5 Statistical Techniques to Detect and Handle Outliers in Data](https://www.statology.org/top-5-statistical-techniques-detect-handle-outliers-data/)|[Code](https://github.com/balapriyac/data-science-tutorials/blob/main/statistics/outlier_detection_techniques.ipynb)| 33 | 34 | 35 | -------------------------------------------------------------------------------- /bash-data-cleaning/clean_user_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Define input and output files 4 | INPUT_FILE="users.csv" 5 | OUTPUT_FILE="users_cleaned.csv" 6 | TEMP_FILE="temp.csv" 7 | 8 | echo "Starting data cleaning process..." 9 | 10 | # Step 1: Handle Missing Values 11 | echo "Step 1: Handling missing values..." 12 | sed 's/,,/,NULL,/g; s/,$/,NULL/g' $INPUT_FILE > $OUTPUT_FILE 13 | 14 | # Step 2: Fix Missing First Names 15 | echo "Step 2: Fixing missing first names..." 16 | awk -F, 'BEGIN {OFS=","} {if ($2 == "" || $2 == "NULL") $2 = "Unknown"; print}' $OUTPUT_FILE > $TEMP_FILE 17 | mv $TEMP_FILE $OUTPUT_FILE 18 | 19 | # Step 3: Fix Invalid Email Formats 20 | echo "Step 3: Fixing invalid email formats..." 21 | awk -F, 'BEGIN {OFS=","} {if ($3 !~ /@/ || $3 == "" || $3 == "NULL" || $3 == "not_an_email") $3 = "unknown@example.com"; print}' $OUTPUT_FILE > $TEMP_FILE 22 | mv $TEMP_FILE $OUTPUT_FILE 23 | 24 | # Step 4: Correct Date Formats 25 | echo "Step 4: Correcting date formats..." 26 | awk -F, 'BEGIN {OFS=","} {if ($5 == "invalid_date" || $5 == "" || $5 == "NULL") $5 = "2023-01-20"; print}' $OUTPUT_FILE > $TEMP_FILE 27 | mv $TEMP_FILE $OUTPUT_FILE 28 | 29 | # Step 5: Ensure Last Login Date is Valid 30 | echo "Step 5: Ensuring last login date is valid..." 
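# Field 6 (last_login): fill empty or NULL values with a default date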
31 | awk -F, 'BEGIN {OFS=","} {if ($6 == "" || $6 == "NULL") $6 = "2023-03-23"; print}' $OUTPUT_FILE > $TEMP_FILE 32 | mv $TEMP_FILE $OUTPUT_FILE 33 | 34 | # Step 6: Handle Negative Values 35 | echo "Step 6: Handling negative values..." 36 | awk -F, 'BEGIN {OFS=","} {if ($7 < 0) $7 = 0; print}' $OUTPUT_FILE > $TEMP_FILE 37 | mv $TEMP_FILE $OUTPUT_FILE 38 | 39 | # Validation checks 40 | echo "Running validation checks..." 41 | 42 | # Check for empty fields 43 | EMPTY_FIELDS=$(grep -c ",," $OUTPUT_FILE) 44 | echo "Empty fields remaining: $EMPTY_FIELDS" 45 | 46 | # Check for invalid emails 47 | INVALID_EMAILS=$(grep -v -E '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}' $OUTPUT_FILE | grep -v "email" | wc -l) 48 | echo "Invalid emails remaining: $INVALID_EMAILS" 49 | 50 | # Check for invalid dates 51 | INVALID_DATES=$(grep -v -E '[0-9]{4}-[0-9]{2}-[0-9]{2}' $OUTPUT_FILE | grep -v "signup_date" | wc -l) 52 | echo "Invalid dates remaining: $INVALID_DATES" 53 | 54 | # Check for negative values 55 | NEGATIVE_VALUES=$(awk -F, '$7 < 0 {print}' $OUTPUT_FILE | wc -l) 56 | echo "Negative values remaining: $NEGATIVE_VALUES" 57 | 58 | echo "Data cleaning complete. Cleaned data saved to $OUTPUT_FILE" 59 | 60 | # Optional: Remove temporary file if it exists 61 | if [ -f "$TEMP_FILE" ]; then 62 | rm $TEMP_FILE 63 | fi 64 | -------------------------------------------------------------------------------- /bash-data-cleaning/output.md: -------------------------------------------------------------------------------- 1 | ``` 2 | $ head users.csv 3 | id,first_name,last_name,email,signup_date,last_login,purchase_amount 4 | 1,John,Smith,john.smith@example.com,2023-01-15,2023-03-20,125.99 5 | 2,Jane,Doe,jane.doe@example.com,2023-01-16,2023-03-21,210.50 6 | 3,Bob,Johnson,bob@example.com,2023-01-17,2023-03-22,0 7 | 4,Alice,Williams,alice.williams@example.com,2023-01-18,,75.25 8 | 5,,Brown,mike.brown@example.com,2023-01-19,2023-03-24,150.75 9 | 6,Sarah,Miller,sarah.miller@example.com,invalid_date,2023-03-25,95.00 10 | 7,David,Jones,david.jones@example.com,2023-01-21,2023-03-26,300.00 11 | 8,Lisa,Garcia,lisa.garcia@example.com,2023-01-22,2023-03-27,-50.00 12 | 9,James,Martinez,mymail@example.com,2023-01-23,2023-03-28,125.00 13 | ``` 14 | 15 | ``` 16 | $ head -n 5 users.csv 17 | id,first_name,last_name,email,signup_date,last_login,purchase_amount 18 | 1,John,Smith,john.smith@example.com,2023-01-15,2023-03-20,125.99 19 | 2,Jane,Doe,jane.doe@example.com,2023-01-16,2023-03-21,210.50 20 | 3,Bob,Johnson,bob@example.com,2023-01-17,2023-03-22,0 21 | 4,Alice,Williams,alice.williams@example.com,2023-01-18,,75.25 22 | ``` 23 | ``` 24 | $ grep -c ",," users.csv 25 | 2 26 | ``` 27 | 28 | ``` 29 | grep -n ",," users.csv 30 | 5:4,Alice,Williams,alice.williams@example.com,2023-01-18,,75.25 31 | 6:5,,Brown,mike.brown@example.com,2023-01-19,2023-03-24,150.75 32 | ``` 33 | 34 | ``` 35 | $ grep -v -E '^[0-9]{4}-[0-9]{2}-[0-9]{2}$' users.csv | grep "invalid_date" 36 | 6,Sarah,Miller,sarah.miller@example.com,invalid_date,2023-03-25,95.00 37 | 38 | ``` 39 | 40 | ``` 41 | $ awk -F, '$7 < 0 {print $0}' users.csv 42 | 8,Lisa,Garcia,lisa.garcia@example.com,2023-01-22,2023-03-27,-50.00 43 | ``` 44 | 45 | ``` 46 | $ awk -F, 'NR>1 {sum += $7} END {print "Total purchases: $" sum}' users_cleaned.csv 47 | ``` 48 | ``` 49 | Total purchases: $1282.49 50 | ``` 51 | 52 | ``` 53 | balapriya@balapriya-82C4:~/bash-data-cleaning$ awk -F, 'NR>1 {sum += $7; count++} END {print "Average purchase: $" sum/count}' users_cleaned.csv 54 | Average purchase: $128.249 
55 | ``` 56 | ``` 57 | $ awk -F, 'NR>1 { 58 | split($5, date, "-"); 59 | months[date[2]]++; 60 | } 61 | END { 62 | for (month in months) { 63 | print "Month " month ": " months[month] " users" 64 | } 65 | }' users_cleaned.csv 66 | Month 01: 10 users 67 | ``` 68 | -------------------------------------------------------------------------------- /bash-data-cleaning/users.csv: -------------------------------------------------------------------------------- 1 | id,first_name,last_name,email,signup_date,last_login,purchase_amount 2 | 1,John,Smith,john.smith@example.com,2023-01-15,2023-03-20,125.99 3 | 2,Jane,Doe,jane.doe@example.com,2023-01-16,2023-03-21,210.50 4 | 3,Bob,Johnson,bob@example.com,2023-01-17,2023-03-22,0 5 | 4,Alice,Williams,alice.williams@example.com,2023-01-18,,75.25 6 | 5,,Brown,mike.brown@example.com,2023-01-19,2023-03-24,150.75 7 | 6,Sarah,Miller,sarah.miller@example.com,invalid_date,2023-03-25,95.00 8 | 7,David,Jones,david.jones@example.com,2023-01-21,2023-03-26,300.00 9 | 8,Lisa,Garcia,lisa.garcia@example.com,2023-01-22,2023-03-27,-50.00 10 | 9,James,Martinez,mymail@example.com,2023-01-23,2023-03-28,125.00 11 | 10,Emily,Anderson,emily.anderson@example.com,2023-01-24,2023-03-29,200.00 12 | -------------------------------------------------------------------------------- /build-with-python/data-cleaning-n-validation-pipeline/main.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from pydantic import BaseModel, ValidationError, field_validator 4 | from typing import Optional, List, Dict, Any 5 | 6 | class DataValidator(BaseModel): 7 | """Pydantic model for data validation""" 8 | name: str 9 | age: Optional[int] = None 10 | email: Optional[str] = None 11 | salary: Optional[float] = None 12 | 13 | @field_validator('age') 14 | @classmethod 15 | def validate_age(cls, v): 16 | if v is not None and (v < 0 or v > 120): 17 | raise ValueError('Age must be between 0 and 120') 18 | return v 19 | 20 | @field_validator('email') 21 | @classmethod 22 | def validate_email(cls, v): 23 | if v and '@' not in v: 24 | raise ValueError('Invalid email format') 25 | return v 26 | 27 | class DataPipeline: 28 | def __init__(self): 29 | self.cleaning_stats = {'duplicates_removed': 0, 'nulls_handled': 0, 'validation_errors': 0} 30 | 31 | def clean_data(self, df: pd.DataFrame) -> pd.DataFrame: 32 | """Clean the dataset by handling duplicates and missing values""" 33 | initial_rows = len(df) 34 | 35 | # Remove duplicates 36 | df = df.drop_duplicates() 37 | self.cleaning_stats['duplicates_removed'] = initial_rows - len(df) 38 | 39 | # Handle missing values 40 | numeric_columns = df.select_dtypes(include=[np.number]).columns 41 | df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].median()) 42 | 43 | string_columns = df.select_dtypes(include=['object']).columns 44 | df[string_columns] = df[string_columns].fillna('Unknown') 45 | 46 | self.cleaning_stats['nulls_handled'] = df.isnull().sum().sum() 47 | return df 48 | 49 | def validate_data(self, df: pd.DataFrame) -> pd.DataFrame: 50 | """Validate each row using Pydantic model""" 51 | valid_rows = [] 52 | errors = [] 53 | 54 | for idx, row in df.iterrows(): 55 | try: 56 | validated_row = DataValidator(**row.to_dict()) 57 | valid_rows.append(validated_row.model_dump()) 58 | except ValidationError as e: 59 | errors.append({'row': idx, 'errors': str(e)}) 60 | 61 | self.cleaning_stats['validation_errors'] = len(errors) 62 | return pd.DataFrame(valid_rows), errors 63 
| 64 | def process(self, df: pd.DataFrame) -> Dict[str, Any]: 65 | """Main pipeline method""" 66 | cleaned_df = self.clean_data(df.copy()) 67 | validated_df, validation_errors = self.validate_data(cleaned_df) 68 | 69 | return { 70 | 'cleaned_data': validated_df, 71 | 'validation_errors': validation_errors, 72 | 'stats': self.cleaning_stats 73 | } 74 | 75 | 76 | # Example usage 77 | if __name__ == "__main__": 78 | # Sample messy data 79 | sample_data = pd.DataFrame({ 80 | 'name': ['Tara Jamison', 'Jane Smith', 'Lucy Lee', None, 'Clara Clark','Jane Smith'], 81 | 'age': [25, -5, 25, 35, 150,-5], 82 | 'email': ['taraj@email.com', 'invalid-email', 'lucy@email.com', 'jane@email.com', 'clara@email.com','invalid-email'], 83 | 'salary': [50000, 60000, 50000, None, 75000,60000] 84 | }) 85 | 86 | pipeline = DataPipeline() 87 | result = pipeline.process(sample_data) 88 | 89 | print("Cleaned Data:") 90 | print(result['cleaned_data']) 91 | print(f"\nStats: {result['stats']}") 92 | print(f"Validation Errors: {len(result['validation_errors'])}") 93 | -------------------------------------------------------------------------------- /containerizing-python-apps/currency-api/.dockerignore: -------------------------------------------------------------------------------- 1 | # Virtual environment 2 | venv/ 3 | env/ 4 | ENV/ 5 | 6 | # Python cache files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | *.so 11 | .Python 12 | 13 | # Distribution / packaging 14 | dist/ 15 | build/ 16 | *.egg-info/ 17 | 18 | # Unit test / coverage reports 19 | htmlcov/ 20 | .tox/ 21 | .coverage 22 | .coverage.* 23 | .cache 24 | nosetests.xml 25 | coverage.xml 26 | *.cover 27 | 28 | # Environments 29 | .env 30 | .venv 31 | 32 | # IDE specific files 33 | .idea/ 34 | .vscode/ 35 | *.swp 36 | *.swo 37 | 38 | # Local development files 39 | .DS_Store 40 | -------------------------------------------------------------------------------- /containerizing-python-apps/currency-api/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use official Python image 2 | FROM python:3.11-slim 3 | # Set work directory 4 | WORKDIR /app 5 | # Install dependencies 6 | COPY requirements.txt . 7 | RUN pip install --no-cache-dir -r requirements.txt 8 | # Copy app code 9 | COPY . . 10 | # Expose port 11 | EXPOSE 8000 12 | # Run the app 13 | CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] 14 | -------------------------------------------------------------------------------- /containerizing-python-apps/currency-api/main.py: -------------------------------------------------------------------------------- 1 | from fastapi import FastAPI, HTTPException, Query 2 | from pydantic import BaseModel 3 | from typing import Literal 4 | 5 | app = FastAPI(title="Currency Converter") 6 | 7 | 8 | class ConversionResponse(BaseModel): 9 | from_currency: str 10 | to_currency: str 11 | amount: float 12 | converted: float 13 | rate: float 14 | 15 | 16 | mock_rates = { 17 | ("USD", "EUR"): 0.91, 18 | ("EUR", "USD"): 1.10, 19 | ("USD", "JPY"): 145.0, 20 | } 21 | 22 | 23 | @app.get("/convert", response_model=ConversionResponse) 24 | def convert( 25 | amount: float = Query(..., gt=0), 26 | from_currency: Literal["USD", "EUR"] = "USD", 27 | to_currency: Literal["USD", "EUR", "JPY"] = "EUR", 28 | ): 29 | if from_currency == to_currency: 30 | raise HTTPException( 31 | status_code=400, detail="From and to currencies must differ." 
32 | ) 33 | 34 | rate = mock_rates.get((from_currency, to_currency)) 35 | if not rate: 36 | raise HTTPException(status_code=400, detail="Conversion rate not available.") 37 | 38 | converted = amount * rate 39 | return ConversionResponse( 40 | from_currency=from_currency, 41 | to_currency=to_currency, 42 | amount=amount, 43 | converted=round(converted, 2), 44 | rate=rate, 45 | ) 46 | 47 | -------------------------------------------------------------------------------- /containerizing-python-apps/currency-api/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi 2 | uvicorn 3 | -------------------------------------------------------------------------------- /data-cleaning-essential-guide/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /data-cleaning-essential-guide/messy_data.csv: -------------------------------------------------------------------------------- 1 | id,name,age,gender,email,income,job_title,department,start_date,education,customer_rating,comments,phone_number,country,purchase_amount 2 | 1,john smith,34,M,john.smith@email.com,75000,Sr. Developer,IT,2019-03-15,Bachelor's,4.5,Loyal customer since 2019,555-123-4567,USA,1250.99 3 | 2,Jane Doe,28,F,jane.doe@email.com,82000,Senior Developer,Engineering,2020-01-10,Masters,4.8,Has premium membership,555-987-6543,united states,879.50 4 | 3,Bob Johnson,,Male,bob.j@email,65000,Data Analyst,Analytics,2018-05-20,bachelor,3.2,occasional buyer,5551234567,US,450 5 | 4,,42,M,mike.wilson@email.com,92000,Project Manager,IT,2017-11-05,PHD,4.2,,555-555-5555,United States,1500 6 | 5,Sarah Williams,31,female,sarah.w@email.com,,UX Designer,Design,2021-02-28,Masters Degree,4.9,Very satisfied with service,555-789-0123,usa,2100.75 7 | 6,Alex Brown,38,M,alex@email.com,78500,senior developer,IT,2019-08-12,Bachelors,,"Left a positive review, potential for upsell",555.111.2222,United States,950.25 8 | 7,Emily Wilson,27,F,emily@incomplete,69000,Data Scientist,Analytics,2020/07/15,PHD,4.6,New customer - first purchase,5559876543,US,780 9 | 8,Jane Doe,29,Female,different.jane@email.com,81000,Sr. 
Developer,Engineering,2020-01-10,Masters,4.5,Duplicate customer record,555-987-6543,USA,1200 10 | 9,Chris Martin,45,M,chris.martin@email.com,-5000,Marketing Specialist,Marketing,2022-04-18,bachelor's degree,3.8,Returned last purchase,555-123-7890,U.S.A.,350.99 11 | 10,Lisa Johnson,33,F,lisa.j@email.com,88000,Product Manager,Product,1/15/2021,MBA,6.2,High-value customer,555-444-3333,United States,3500 12 | 11,David Thompson,51,Male,david.t@email.com,110000,Director,Executive,2015-09-30,masters degree,4.1,Prefers phone contact,555)-222-1111,us,2700.50 13 | 12,Jennifer Clark,36,F,jennifer@email,72000,HR Specialist,Human Resources,2018-12-05,Bachelors,3.9,Seasonal buyer,555*789*4561,USA,575.25 14 | 13,Michael Scott,44,M,mscott@email.com,80000,Regional Manager,Management,2022/02/10,Ba chelor,2.1,Multiple complaints filed,5551112222,United states,150 15 | 14,amanda king,39,f,amanda.king@email.com,93500,Lead Engineer,Engineering,2017-06-22,PhD,4.7,Technical customer - reads documentation,555-333-9876,usa,1850.75 16 | 15,Ryan Chen,32,M,ryan.chen@email.com,79000,BI Analyst,Analytics,2019-11-18,MSc,,"Contact attempted, no response yet",555-666-7777,United States,0 17 | -------------------------------------------------------------------------------- /data-cleaning/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /data-cleaning/generate_df.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from datetime import datetime, timedelta 4 | 5 | # Set seed for reproducibility 6 | np.random.seed(42) 7 | 8 | # Create a sample dataset of customer orders 9 | n_rows = 1000 10 | 11 | # Generate random dates in the last year 12 | start_date = datetime(2024, 1, 1) 13 | end_date = datetime(2025, 3, 1) 14 | dates = [start_date + timedelta(days=np.random.randint(0, (end_date - start_date).days)) for _ in range(n_rows)] 15 | 16 | # Generate customer IDs with some duplicates and inconsistent formats 17 | customer_formats = ['CUS-{}', 'C{}', 'CUST-{}', 'Customer {}', '{}'] 18 | customer_ids = [np.random.choice(customer_formats).format(np.random.randint(1000, 9999)) for _ in range(n_rows)] 19 | 20 | # Generate email addresses with some errors 21 | email_domains = ['gmail.com', 'yahoo.com', 'outlook.com', 'hotmail.com', 'company.com'] 22 | emails = [] 23 | for i in range(n_rows): 24 | username = f"user{np.random.randint(100, 999)}" 25 | domain = np.random.choice(email_domains) 26 | # Introduce some errors 27 | if np.random.random() < 0.05: # Missing @ symbol 28 | emails.append(f"{username}{domain}") 29 | elif np.random.random() < 0.05: # Extra spaces 30 | emails.append(f" {username}@{domain} ") 31 | elif np.random.random() < 0.05: # Typos 32 | emails.append(f"{username}@{domain.replace('com', 'cm')}") 33 | else: 34 | emails.append(f"{username}@{domain}") 35 | 36 | # Generate product IDs with some missing values 37 | product_ids = [f"PROD-{np.random.randint(100, 999)}" if np.random.random() > 0.03 else np.nan for _ in range(n_rows)] 38 | 39 | # Generate quantities with some outliers 40 | quantities = [np.random.randint(1, 10) if np.random.random() > 0.02 else np.random.randint(100, 1000) for _ in range(n_rows)] 41 | 42 | # Generate prices with some negative values and inconsistent formats 43 | prices = [] 44 | for _ in range(n_rows): 45 | price = np.random.uniform(9.99, 199.99) 46 | if np.random.random() < 0.02: 
# Negative price 47 | price = -price 48 | if np.random.random() < 0.1: # String format 49 | prices.append(f"${price:.2f}") 50 | elif np.random.random() < 0.1: # Integer format 51 | prices.append(int(price)) 52 | else: 53 | prices.append(price) 54 | 55 | # Generate shipping status with some inconsistent values 56 | status_options = ['Shipped', 'shipped', 'SHIPPED', 'In Transit', 'in transit', 'In-Transit', 'Delivered', 'delivered', 'DELIVERED', 'Pending', 'pending'] 57 | shipping_status = [np.random.choice(status_options) for _ in range(n_rows)] 58 | 59 | # Create the DataFrame 60 | df = pd.DataFrame({ 61 | 'order_date': dates, 62 | 'customer_id': customer_ids, 63 | 'email': emails, 64 | 'product_id': product_ids, 65 | 'quantity': quantities, 66 | 'price': prices, 67 | 'shipping_status': shipping_status 68 | }) 69 | 70 | # Add some completely blank rows 71 | blank_indices = np.random.choice(range(n_rows), size=5, replace=False) 72 | for idx in blank_indices: 73 | df.loc[idx, :] = np.nan 74 | 75 | # Add some duplicate rows 76 | dup_indices = np.random.choice(range(n_rows), size=10, replace=False) 77 | df = pd.concat([df, df.loc[dup_indices]], ignore_index=True) 78 | 79 | # Print the first few rows to see the data 80 | print(df.head()) 81 | -------------------------------------------------------------------------------- /data-science-app/README.md: -------------------------------------------------------------------------------- 1 | ## Building a Simple Data Science App 2 | 3 | Install the required libraries in a virtual environment for the project: 4 | ``` 5 | $ pip3 install fastapi uvicorn scikit-learn pandas 6 | ``` 7 | 8 | Run `model_training.py` to train the logistic regression model. 9 | 10 | To run the FastAPI app, use: 11 | ``` 12 | $ uvicorn app:app --reload 13 | ``` 14 | Use curl to send POST requests to the `/predict` endpoint. 
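For example, with the app running at the default `http://127.0.0.1:8000`, a request might look like this (the field names come from the `WineFeatures` model in `app.py`; the feature values below are only illustrative numbers, not a real sample):
```
$ curl -X POST http://127.0.0.1:8000/predict \
  -H "Content-Type: application/json" \
  -d '{"alcohol": 13.0, "malic_acid": 2.0, "ash": 2.4, "alcalinity_of_ash": 19.0, "magnesium": 100.0, "total_phenols": 2.5, "flavanoids": 2.3, "nonflavanoid_phenols": 0.3, "proanthocyanins": 1.6, "color_intensity": 5.0, "hue": 1.0, "od280_od315_of_diluted_wines": 3.0, "proline": 750.0}'
```
The endpoint responds with the predicted wine class as JSON, e.g. `{"prediction": 0}`.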
15 | -------------------------------------------------------------------------------- /data-science-app/app.py: -------------------------------------------------------------------------------- 1 | from fastapi import FastAPI 2 | from pydantic import BaseModel 3 | import pickle 4 | 5 | app = FastAPI() 6 | 7 | @app.get("/") 8 | def read_root(): 9 | return {"message": "A Simple Prediction API"} 10 | 11 | def load_model(): 12 | with open('model/classifier.pkl', 'rb') as f: 13 | model = pickle.load(f) 14 | return model 15 | 16 | class WineFeatures(BaseModel): 17 | alcohol: float 18 | malic_acid: float 19 | ash: float 20 | alcalinity_of_ash: float 21 | magnesium: float 22 | total_phenols: float 23 | flavanoids: float 24 | nonflavanoid_phenols: float 25 | proanthocyanins: float 26 | color_intensity: float 27 | hue: float 28 | od280_od315_of_diluted_wines: float 29 | proline: float 30 | 31 | @app.post("/predict") 32 | def predict_wine(features: WineFeatures): 33 | model = load_model() 34 | input_data = [[ 35 | features.alcohol, features.malic_acid, features.ash, features.alcalinity_of_ash, 36 | features.magnesium, features.total_phenols, features.flavanoids, 37 | features.nonflavanoid_phenols, features.proanthocyanins, features.color_intensity, 38 | features.hue, features.od280_od315_of_diluted_wines, features.proline 39 | ]] 40 | 41 | prediction = model.predict(input_data) 42 | return {"prediction": int(prediction[0])} 43 | -------------------------------------------------------------------------------- /data-science-app/model_training.py: -------------------------------------------------------------------------------- 1 | from sklearn.datasets import load_wine 2 | from sklearn.model_selection import train_test_split 3 | from sklearn.preprocessing import StandardScaler 4 | from sklearn.linear_model import LogisticRegression 5 | from sklearn.metrics import accuracy_score 6 | import pandas as pd 7 | import pickle 8 | 9 | def load_wine_data(): 10 | wine_data = load_wine() 11 | df = pd.DataFrame(data=wine_data.data, columns=wine_data.feature_names) 12 | df['target'] = wine_data.target # Adding the target (wine quality class) 13 | return df 14 | 15 | def preprocess_data(df): 16 | X = df.drop('target', axis=1) # Features 17 | y = df['target'] # Target (wine quality) 18 | 19 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=27) 20 | 21 | # Feature scaling 22 | scaler = StandardScaler() 23 | X_train_scaled = scaler.fit_transform(X_train) 24 | X_test_scaled = scaler.transform(X_test) 25 | 26 | return X_train_scaled, X_test_scaled, y_train, y_test 27 | 28 | def train_model(X_train, y_train): 29 | model = LogisticRegression(random_state=42) 30 | model.fit(X_train, y_train) 31 | 32 | # Save the trained model using pickle 33 | with open('classifier.pkl', 'wb') as f: 34 | pickle.dump(model, f) 35 | 36 | return model 37 | 38 | def evaluate_model(model, X_test, y_test): 39 | y_pred = model.predict(X_test) 40 | accuracy = accuracy_score(y_test, y_pred) 41 | print(f"Accuracy: {accuracy:.2f}") 42 | 43 | if __name__ == "__main__": 44 | df = load_wine_data() 45 | X_train_scaled, X_test_scaled, y_train, y_test = preprocess_data(df) 46 | model = train_model(X_train_scaled, y_train) 47 | evaluate_model(model, X_test_scaled, y_test) 48 | -------------------------------------------------------------------------------- /deploy_ml_models/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use Python 3.11 as the base image 2 | FROM 
python:3.11-slim 3 | 4 | # Set the working directory inside the container 5 | WORKDIR /app 6 | 7 | # Copy the application files 8 | COPY app.py . 9 | COPY linear_regression_model.pkl . 10 | COPY requirements.txt . 11 | 12 | # Install the required libraries 13 | RUN pip install -r requirements.txt 14 | 15 | # Expose the port that FastAPI will run on 16 | EXPOSE 8000 17 | 18 | # Command to run the FastAPI app with uvicorn 19 | CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"] 20 | -------------------------------------------------------------------------------- /deploy_ml_models/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /deploy_ml_models/app.py: -------------------------------------------------------------------------------- 1 | from fastapi import FastAPI 2 | import pickle 3 | import pandas as pd 4 | from pydantic import BaseModel 5 | 6 | # Load the saved model using pickle 7 | with open('linear_regression_model.pkl', 'rb') as f: 8 | model = pickle.load(f) 9 | 10 | # Create the FastAPI app 11 | app = FastAPI() 12 | 13 | # Define the input schema for the API 14 | class HousingData(BaseModel): 15 | MedInc: float 16 | HouseAge: float 17 | AveRooms: float 18 | AveOccup: float 19 | 20 | # Define the prediction endpoint 21 | @app.post('/predict') 22 | def predict_price(data: HousingData): 23 | # Convert input data to a DataFrame 24 | input_data = pd.DataFrame([data.dict()]) 25 | 26 | # Make a prediction 27 | prediction = model.predict(input_data) 28 | 29 | # Return the predicted price 30 | return {"Predicted Price": prediction[0]} 31 | -------------------------------------------------------------------------------- /deploy_ml_models/linear_regression.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.datasets import fetch_california_housing 3 | from sklearn.model_selection import train_test_split 4 | from sklearn.linear_model import LinearRegression 5 | import pickle 6 | 7 | # Load the California Housing dataset 8 | data = fetch_california_housing(as_frame=True) 9 | df = pd.DataFrame(data.data, columns=data.feature_names) 10 | df['target'] = data.target 11 | 12 | # Define features and target variable 13 | X = df[['MedInc', 'HouseAge', 'AveRooms', 'AveOccup']] # Selecting some example features 14 | y = df['target'] # Target variable is the housing price (scaled) 15 | 16 | # Split the dataset into training and test sets 17 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 18 | 19 | # Initialize and train the linear regression model 20 | model = LinearRegression() 21 | model.fit(X_train, y_train) 22 | 23 | # Evaluate the model on the test set 24 | score = model.score(X_test, y_test) 25 | print(f"Model R-squared: {score:.4f}") 26 | 27 | # Save the model using pickle 28 | with open('linear_regression_model.pkl', 'wb') as f: 29 | pickle.dump(model, f) 30 | -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /docker/docker-volume/volume-postgres.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- 
/docker/leverage-build-cache/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /docker/minimal-img-python-apps/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use the official lightweight Python 3.11-slim image 2 | FROM python:3.11-slim 3 | 4 | # Set the working directory 5 | WORKDIR /app 6 | 7 | # Install dependencies 8 | COPY requirements.txt requirements.txt 9 | RUN pip install --no-cache-dir -r requirements.txt 10 | 11 | # Copy the current directory contents into the container at /app 12 | COPY . . 13 | 14 | # Expose the port the app runs on 15 | EXPOSE 5000 16 | 17 | # Run the application 18 | CMD ["python3", "app.py"] 19 | -------------------------------------------------------------------------------- /docker/minimal-img-python-apps/README.md: -------------------------------------------------------------------------------- 1 | ## Create Minimal Docker Image for a Sample Python Application 2 | 3 | Your project directory should look like so: 4 | 5 | ``` 6 | / 7 | ├── app.py 8 | ├── Dockerfile 9 | ├── requirements.txt 10 | ``` 11 | 12 | Build the Docker image for the Flask app: 13 | 14 | ```sh 15 | $ docker build -t inventory-app:slim . 16 | ``` 17 | -------------------------------------------------------------------------------- /docker/minimal-img-python-apps/app.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, request, jsonify 2 | 3 | app = Flask(__name__) 4 | 5 | # In-memory database for simplicity 6 | inventory = {} 7 | 8 | @app.route('/inventory', methods=['POST']) 9 | def add_item(): 10 | item = request.get_json() 11 | item_id = item.get('id') 12 | if not item_id: 13 | return jsonify({"error": "Item ID is required"}), 400 14 | if item_id in inventory: 15 | return jsonify({"error": "Item already exists"}), 400 16 | inventory[item_id] = item 17 | return jsonify(item), 201 18 | 19 | @app.route('/inventory/<item_id>', methods=['GET']) 20 | def get_item(item_id): 21 | item = inventory.get(item_id) 22 | if not item: 23 | return jsonify({"error": "Item not found"}), 404 24 | return jsonify(item) 25 | 26 | @app.route('/inventory/<item_id>', methods=['PUT']) 27 | def update_item(item_id): 28 | if item_id not in inventory: 29 | return jsonify({"error": "Item not found"}), 404 30 | updated_item = request.get_json() 31 | inventory[item_id] = updated_item 32 | return jsonify(updated_item) 33 | 34 | @app.route('/inventory/<item_id>', methods=['DELETE']) 35 | def delete_item(item_id): 36 | if item_id not in inventory: 37 | return jsonify({"error": "Item not found"}), 404 38 | del inventory[item_id] 39 | return '', 204 40 | 41 | if __name__ == '__main__': 42 | app.run(host='0.0.0.0', port=5000) 43 | -------------------------------------------------------------------------------- /docker/minimal-img-python-apps/requirements.txt: -------------------------------------------------------------------------------- 1 | Flask==3.0.3 2 | -------------------------------------------------------------------------------- /duckdb-json/ecommerce_data.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "order_id": "ORD-1001", 4 | "customer": { 5 | "id": "CUST-101", 6 | "name": "Alex Johnson", 7 | "email": "alex.j@example.com", 8 | "address": { 9 | "street": "123 Main St", 10 | "city": "Boston", 11 | "state": "MA", 12 | "zip": "02108" 13 | }, 14 
| "loyalty_tier": "gold" 15 | }, 16 | "order_date": "2023-10-15T14:30:00", 17 | "items": [ 18 | { 19 | "product_id": "PROD-501", 20 | "name": "Wireless Headphones", 21 | "category": "Electronics", 22 | "price": 129.99, 23 | "quantity": 1, 24 | "tags": ["bluetooth", "noise-cancelling", "audio"] 25 | }, 26 | { 27 | "product_id": "PROD-245", 28 | "name": "Smartphone Case", 29 | "category": "Accessories", 30 | "price": 24.99, 31 | "quantity": 2, 32 | "tags": ["protective", "smartphone", "silicone"] 33 | } 34 | ], 35 | "payment": { 36 | "method": "credit_card", 37 | "total": 179.97, 38 | "status": "completed" 39 | } 40 | }, 41 | { 42 | "order_id": "ORD-1002", 43 | "customer": { 44 | "id": "CUST-102", 45 | "name": "Sarah Miller", 46 | "email": "sarahm@example.com", 47 | "address": { 48 | "street": "456 Oak Ave", 49 | "city": "Seattle", 50 | "state": "WA", 51 | "zip": "98101" 52 | }, 53 | "loyalty_tier": "silver" 54 | }, 55 | "order_date": "2023-10-16T09:15:00", 56 | "items": [ 57 | { 58 | "product_id": "PROD-103", 59 | "name": "Coffee Maker", 60 | "category": "Kitchen", 61 | "price": 89.99, 62 | "quantity": 1, 63 | "tags": ["appliance", "coffee", "automatic"] 64 | }, 65 | { 66 | "product_id": "PROD-107", 67 | "name": "Coffee Beans Premium Blend", 68 | "category": "Food & Beverage", 69 | "price": 15.99, 70 | "quantity": 3, 71 | "tags": ["coffee", "organic", "fair-trade"] 72 | } 73 | ], 74 | "payment": { 75 | "method": "paypal", 76 | "total": 137.96, 77 | "status": "completed" 78 | } 79 | } 80 | ] 81 | -------------------------------------------------------------------------------- /duckdb-json/output.md: -------------------------------------------------------------------------------- 1 | ``` 2 | ┌──────────┬───┬──────────────────────┬──────────────────────┐ 3 | │ order_id │ … │ items │ payment │ 4 | │ varchar │ │ struct(product_id … │ struct("method" va… │ 5 | ├──────────┼───┼──────────────────────┼──────────────────────┤ 6 | │ ORD-1001 │ … │ [{'product_id': PR… │ {'method': credit_… │ 7 | │ ORD-1002 │ … │ [{'product_id': PR… │ {'method': paypal,… │ 8 | ├──────────┴───┴──────────────────────┴──────────────────────┤ 9 | │ 2 rows 5 columns (3 shown) │ 10 | └────────────────────────────────────────────────────────────┘ 11 | ``` 12 | 13 | ``` 14 | ┌─────────────┐ 15 | │ order_count │ 16 | │ int64 │ 17 | ├─────────────┤ 18 | │ 2 │ 19 | └─────────────┘ 20 | ``` 21 | 22 | ``` 23 | ┌──────────┬───────────────┐ 24 | │ order_id │ customer_name │ 25 | │ varchar │ varchar │ 26 | ├──────────┼───────────────┤ 27 | │ ORD-1001 │ Alex Johnson │ 28 | │ ORD-1002 │ Sarah Miller │ 29 | └──────────┴───────────────┘ 30 | ``` 31 | 32 | ``` 33 | ┌──────────┬───────────────┬─────────┬─────────┐ 34 | │ order_id │ customer_name │ city │ state │ 35 | │ varchar │ varchar │ varchar │ varchar │ 36 | ├──────────┼───────────────┼─────────┼─────────┤ 37 | │ ORD-1001 │ Alex Johnson │ Boston │ MA │ 38 | │ ORD-1002 │ Sarah Miller │ Seattle │ WA │ 39 | └──────────┴───────────────┴─────────┴─────────┘ 40 | ``` 41 | ``` 42 | ┌──────────┬───────────────┐ 43 | │ order_id │ customer_name │ 44 | │ varchar │ varchar │ 45 | ├──────────┼───────────────┤ 46 | │ ORD-1002 │ Sarah Miller │ 47 | └──────────┴───────────────┘ 48 | ``` 49 | 50 | ``` 51 | ┌──────────┬────────────────┬───────────────┐ 52 | │ order_id │ payment_method │ total_amount │ 53 | │ varchar │ varchar │ decimal(18,3) │ 54 | ├──────────┼────────────────┼───────────────┤ 55 | │ ORD-1001 │ credit_card │ 179.970 │ 56 | │ ORD-1002 │ paypal │ 137.960 │ 57 | 
└──────────┴────────────────┴───────────────┘ 58 | ``` 59 | 60 | ``` 61 | ┌──────────┬───────────────┬───┬───────────────┬──────────┐ 62 | │ order_id │ customer_name │ … │ price │ quantity │ 63 | │ varchar │ varchar │ │ decimal(18,3) │ int32 │ 64 | ├──────────┼───────────────┼───┼───────────────┼──────────┤ 65 | │ ORD-1001 │ Alex Johnson │ … │ 129.990 │ 1 │ 66 | │ ORD-1001 │ Alex Johnson │ … │ 24.990 │ 2 │ 67 | │ ORD-1002 │ Sarah Miller │ … │ 89.990 │ 1 │ 68 | │ ORD-1002 │ Sarah Miller │ … │ 15.990 │ 3 │ 69 | ├──────────┴───────────────┴───┴───────────────┴──────────┤ 70 | │ 4 rows 6 columns (4 shown) │ 71 | └─────────────────────────────────────────────────────────┘ 72 | ``` 73 | 74 | ``` 75 | ┌──────────┬───────────────┬───────────────┬────────────┐ 76 | │ order_id │ customer_name │ order_total │ item_count │ 77 | │ varchar │ varchar │ decimal(18,3) │ uint64 │ 78 | ├──────────┼───────────────┼───────────────┼────────────┤ 79 | │ ORD-1001 │ Alex Johnson │ 179.970 │ 2 │ 80 | │ ORD-1002 │ Sarah Miller │ 137.960 │ 2 │ 81 | └──────────┴───────────────┴───────────────┴────────────┘ 82 | ``` 83 | ``` 84 | ┌─────────────────┬───────────┐ 85 | │ category │ avg_price │ 86 | │ varchar │ double │ 87 | ├─────────────────┼───────────┤ 88 | │ Electronics │ 129.99 │ 89 | │ Kitchen │ 89.99 │ 90 | │ Accessories │ 24.99 │ 91 | │ Food & Beverage │ 15.99 │ 92 | └─────────────────┴───────────┘ 93 | ``` 94 | ``` 95 | ┌──────────┬───────────────┬───┬───────────────┬──────────┐ 96 | │ order_id │ customer_name │ … │ price │ quantity │ 97 | │ varchar │ varchar │ │ decimal(18,3) │ int32 │ 98 | ├──────────┼───────────────┼───┼───────────────┼──────────┤ 99 | │ ORD-1001 │ Alex Johnson │ … │ 129.990 │ 1 │ 100 | │ ORD-1001 │ Alex Johnson │ … │ 24.990 │ 2 │ 101 | │ ORD-1002 │ Sarah Miller │ … │ 89.990 │ 1 │ 102 | │ ORD-1002 │ Sarah Miller │ … │ 15.990 │ 3 │ 103 | ├──────────┴───────────────┴───┴───────────────┴──────────┤ 104 | │ 4 rows 6 columns (4 shown) │ 105 | └─────────────────────────────────────────────────────────┘ 106 | ``` 107 | 108 | ``` 109 | ┌──────────┬───────────────┬───────────────────────────────────────────────────┐ 110 | │ order_id │ customer_name │ item │ 111 | │ varchar │ varchar │ struct(product_id varchar, "name" varchar, cate… │ 112 | ├──────────┼───────────────┼───────────────────────────────────────────────────┤ 113 | │ ORD-1001 │ Alex Johnson │ {'product_id': PROD-501, 'name': Wireless Headp… │ 114 | │ ORD-1001 │ Alex Johnson │ {'product_id': PROD-245, 'name': Smartphone Cas… │ 115 | │ ORD-1002 │ Sarah Miller │ {'product_id': PROD-103, 'name': Coffee Maker, … │ 116 | │ ORD-1002 │ Sarah Miller │ {'product_id': PROD-107, 'name': Coffee Beans P… │ 117 | └──────────┴───────────────┴───────────────────────────────────────────────────┘ 118 | ``` 119 | -------------------------------------------------------------------------------- /duckdb-json/query_json.sql: -------------------------------------------------------------------------------- 1 | -- Create a table from the JSON file 2 | CREATE TABLE ecommerce AS 3 | SELECT * FROM read_json_auto('ecommerce_data.json'); 4 | 5 | -- View the data 6 | SELECT * FROM ecommerce; 7 | 8 | -- Count the number of orders 9 | SELECT COUNT(*) AS order_count FROM ecommerce; 10 | 11 | -- Get order IDs and customer names 12 | SELECT 13 | order_id, 14 | customer->>'name' AS customer_name 15 | FROM ecommerce; 16 | 17 | -- Extract customer address information 18 | SELECT 19 | order_id, 20 | customer->>'name' AS customer_name, 21 | customer->'address'->>'city' AS city, 22 | 
customer->'address'->>'state' AS state 23 | FROM ecommerce; 24 | 25 | -- Find orders from customers in Seattle 26 | SELECT 27 | order_id, 28 | customer->>'name' AS customer_name 29 | FROM ecommerce 30 | WHERE customer->'address'->>'city' = 'Seattle'; 31 | 32 | -- Get payment details 33 | SELECT 34 | order_id, 35 | payment->>'method' AS payment_method, 36 | CAST(payment->>'total' AS DECIMAL) AS total_amount 37 | FROM ecommerce; 38 | 39 | -- Unnest the items array into separate rows 40 | SELECT 41 | order_id, 42 | customer->>'name' AS customer_name, 43 | unnest(items) AS item 44 | FROM ecommerce; 45 | 46 | -- Get specific item details 47 | SELECT 48 | order_id, 49 | customer->>'name' AS customer_name, 50 | item->>'name' AS product_name, 51 | item->>'category' AS category, 52 | CAST(item->>'price' AS DECIMAL) AS price, 53 | CAST(item->>'quantity' AS INTEGER) AS quantity 54 | FROM ( 55 | SELECT 56 | order_id, 57 | customer, 58 | unnest(items) AS item 59 | FROM ecommerce 60 | ) AS unnested_items; 61 | 62 | -- Calculate total value of each order 63 | SELECT 64 | order_id, 65 | customer->>'name' AS customer_name, 66 | CAST(payment->>'total' AS DECIMAL) AS order_total, 67 | json_array_length(items) AS item_count 68 | FROM ecommerce; 69 | 70 | -- Calculate average price by product category 71 | SELECT 72 | item->>'category' AS category, 73 | AVG(CAST(item->>'price' AS DECIMAL)) AS avg_price 74 | FROM ( 75 | SELECT unnest(items) AS item 76 | FROM ecommerce 77 | ) AS unnested_items 78 | GROUP BY category 79 | ORDER BY avg_price DESC; 80 | 81 | -------------------------------------------------------------------------------- /duckdb-miniseries/README.md: -------------------------------------------------------------------------------- 1 | This directory contains the code and notebooks for a DuckDB mini-series I've been working on. The mini-series focuses on working with common data formats and performing basic statistical analysis with DuckDB. 
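The `.sql` files in each subdirectory can be piped straight into the DuckDB CLI. For example, to run the CSV queries (assuming the DuckDB CLI is installed; run it from the matching subdirectory so the relative data paths resolve):
```sh
$ cd analyze-csv
$ duckdb < query_csv.sql
```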
2 | 3 | - [How to Analyze Parquet Files with DuckDB](https://www.statology.org/how-to-analyze-parquet-files-with-duckdb/) 4 | - [How to Analyze CSV Files with DuckDB](https://www.statology.org/how-to-analyze-csv-files-with-duckdb/) 5 | - [How to Query Pandas DataFrames with DuckDB](https://www.statology.org/how-to-query-pandas-dataframes-with-duckdb/) 6 | - [How to Calculate Descriptive Statistics in DuckDB](https://www.statology.org/how-to-calculate-descriptive-statistics-in-duckdb/) 7 | - [How to Perform Hypothesis Testing in DuckDB](https://www.statology.org/how-to-perform-hypothesis-testing-in-duckdb/) 8 | -------------------------------------------------------------------------------- /duckdb-miniseries/analyze-csv/query_csv.sql: -------------------------------------------------------------------------------- 1 | 2 | SELECT * FROM read_csv('shopping_data.csv') LIMIT 5; 3 | 4 | DESCRIBE SELECT * FROM read_csv('shopping_data.csv') LIMIT 5; 5 | 6 | SELECT 7 | MIN(age) AS min_age, MAX(age) AS max_age, AVG(age) AS avg_age, 8 | MIN(purchase_amount) AS min_purchase, MAX(purchase_amount) AS max_purchase, AVG(purchase_amount) AS avg_purchase 9 | FROM read_csv('shopping_data.csv'); 10 | 11 | SELECT customer_name, age, purchase_amount, category 12 | FROM read_csv_auto('shopping_data.csv') 13 | WHERE purchase_amount > 200 14 | ORDER BY purchase_amount DESC; 15 | 16 | SELECT category, COUNT(*) AS total_purchases, SUM(purchase_amount) AS total_sales, AVG(purchase_amount) AS avg_spent 17 | FROM read_csv_auto('shopping_data.csv') 18 | GROUP BY category 19 | ORDER BY total_sales DESC; 20 | 21 | -------------------------------------------------------------------------------- /duckdb-miniseries/analyze-csv/query_csv_op.md: -------------------------------------------------------------------------------- 1 | ``` 2 | ┌─────────────┬───────────────┬───────┬───┬───────────────┬────────────────┐ 3 | │ customer_id │ customer_name │ age │ … │ purchase_date │ payment_method │ 4 | │ int64 │ varchar │ int64 │ │ date │ varchar │ 5 | ├─────────────┼───────────────┼───────┼───┼───────────────┼────────────────┤ 6 | │ 1 │ Customer 1 │ 56 │ … │ 2024-01-01 │ PayPal │ 7 | │ 2 │ Customer 2 │ 46 │ … │ 2024-01-02 │ Credit Card │ 8 | │ 3 │ Customer 3 │ 32 │ … │ 2024-01-03 │ Cash │ 9 | │ 4 │ Customer 4 │ 60 │ … │ 2024-01-04 │ Cash │ 10 | │ 5 │ Customer 5 │ 25 │ … │ 2024-01-05 │ Debit Card │ 11 | ├─────────────┴───────────────┴───────┴───┴───────────────┴────────────────┤ 12 | │ 5 rows 8 columns (5 shown) │ 13 | └──────────────────────────────────────────────────────────────────────────┘ 14 | ``` 15 | 16 | ``` 17 | ┌─────────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐ 18 | │ column_name │ column_type │ null │ key │ default │ extra │ 19 | │ varchar │ varchar │ varchar │ varchar │ varchar │ varchar │ 20 | ├─────────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤ 21 | │ customer_id │ BIGINT │ YES │ NULL │ NULL │ NULL │ 22 | │ customer_name │ VARCHAR │ YES │ NULL │ NULL │ NULL │ 23 | │ age │ BIGINT │ YES │ NULL │ NULL │ NULL │ 24 | │ gender │ VARCHAR │ YES │ NULL │ NULL │ NULL │ 25 | │ purchase_amount │ DOUBLE │ YES │ NULL │ NULL │ NULL │ 26 | │ category │ VARCHAR │ YES │ NULL │ NULL │ NULL │ 27 | │ purchase_date │ DATE │ YES │ NULL │ NULL │ NULL │ 28 | │ payment_method │ VARCHAR │ YES │ NULL │ NULL │ NULL │ 29 | └─────────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┘ 30 | ``` 31 | 32 | ``` 33 | ┌─────────┬─────────┬───┬──────────────┬────────────────────┐ 34 | │ min_age │ 
max_age │ … │ max_purchase │ avg_purchase │ 35 | │ int64 │ int64 │ │ double │ double │ 36 | ├─────────┼─────────┼───┼──────────────┼────────────────────┤ 37 | │ 19 │ 61 │ … │ 485.1 │ 242.92866666666666 │ 38 | ├─────────┴─────────┴───┴──────────────┴────────────────────┤ 39 | │ 1 rows 6 columns (4 shown) │ 40 | └───────────────────────────────────────────────────────────┘ 41 | ``` 42 | 43 | ``` 44 | ┌───────────────┬───────┬─────────────────┬─────────────┐ 45 | │ customer_name │ age │ purchase_amount │ category │ 46 | │ varchar │ int64 │ double │ varchar │ 47 | ├───────────────┼───────┼─────────────────┼─────────────┤ 48 | │ Customer 16 │ 20 │ 485.1 │ Groceries │ 49 | │ Customer 18 │ 19 │ 470.35 │ Groceries │ 50 | │ Customer 21 │ 47 │ 461.72 │ Groceries │ 51 | │ Customer 9 │ 40 │ 455.57 │ Groceries │ 52 | │ Customer 19 │ 41 │ 448.47 │ Electronics │ 53 | │ Customer 28 │ 61 │ 416.08 │ Clothing │ 54 | │ Customer 1 │ 56 │ 406.11 │ Books │ 55 | │ Customer 17 │ 39 │ 389.82 │ Groceries │ 56 | │ Customer 4 │ 60 │ 345.27 │ Electronics │ 57 | │ Customer 11 │ 28 │ 334.64 │ Books │ 58 | │ Customer 20 │ 61 │ 302.97 │ Books │ 59 | │ Customer 14 │ 57 │ 277.89 │ Books │ 60 | │ Customer 13 │ 53 │ 264.83 │ Books │ 61 | │ Customer 7 │ 56 │ 252.64 │ Electronics │ 62 | │ Customer 5 │ 25 │ 225.67 │ Groceries │ 63 | │ Customer 26 │ 29 │ 200.45 │ Clothing │ 64 | ├───────────────┴───────┴─────────────────┴─────────────┤ 65 | │ 16 rows 4 columns │ 66 | └───────────────────────────────────────────────────────┘ 67 | ``` 68 | 69 | ``` 70 | ┌─────────────┬─────────────────┬────────────────────┬────────────────────┐ 71 | │ category │ total_purchases │ total_sales │ avg_spent │ 72 | │ varchar │ int64 │ double │ double │ 73 | ├─────────────┼─────────────────┼────────────────────┼────────────────────┤ 74 | │ Groceries │ 9 │ 2716.7100000000005 │ 301.85666666666674 │ 75 | │ Electronics │ 12 │ 2119.94 │ 176.66166666666666 │ 76 | │ Books │ 7 │ 1834.6799999999998 │ 262.09714285714284 │ 77 | │ Clothing │ 2 │ 616.53 │ 308.265 │ 78 | └─────────────┴─────────────────┴────────────────────┴────────────────────┘ 79 | ``` 80 | 81 | -------------------------------------------------------------------------------- /duckdb-miniseries/analyze-csv/shopping_data.csv: -------------------------------------------------------------------------------- 1 | customer_id,customer_name,age,gender,purchase_amount,category,purchase_date,payment_method 2 | 1,Customer 1,56,Male,406.11,Books,2024-01-01,PayPal 3 | 2,Customer 2,46,Male,159.26,Electronics,2024-01-02,Credit Card 4 | 3,Customer 3,32,Female,57.86,Electronics,2024-01-03,Cash 5 | 4,Customer 4,60,Female,345.27,Electronics,2024-01-04,Cash 6 | 5,Customer 5,25,Female,225.67,Groceries,2024-01-05,Debit Card 7 | 6,Customer 6,38,Female,69.8,Electronics,2024-01-06,Credit Card 8 | 7,Customer 7,56,Female,252.64,Electronics,2024-01-07,Cash 9 | 8,Customer 8,36,Male,26.85,Electronics,2024-01-08,PayPal 10 | 9,Customer 9,40,Female,455.57,Groceries,2024-01-09,PayPal 11 | 10,Customer 10,28,Female,136.8,Electronics,2024-01-10,Debit Card 12 | 11,Customer 11,28,Male,334.64,Books,2024-01-11,Cash 13 | 12,Customer 12,41,Female,162.74,Electronics,2024-01-12,Credit Card 14 | 13,Customer 13,53,Male,264.83,Books,2024-01-13,PayPal 15 | 14,Customer 14,57,Female,277.89,Books,2024-01-14,Cash 16 | 15,Customer 15,41,Male,100.58,Books,2024-01-15,Cash 17 | 16,Customer 16,20,Female,485.1,Groceries,2024-01-16,Debit Card 18 | 17,Customer 17,39,Female,389.82,Groceries,2024-01-17,PayPal 19 | 18,Customer 
18,19,Male,470.35,Groceries,2024-01-18,PayPal 20 | 19,Customer 19,41,Male,448.47,Electronics,2024-01-19,Credit Card 21 | 20,Customer 20,61,Male,302.97,Books,2024-01-20,PayPal 22 | 21,Customer 21,47,Male,461.72,Groceries,2024-01-21,Credit Card 23 | 22,Customer 22,55,Male,53.36,Groceries,2024-01-22,PayPal 24 | 23,Customer 23,19,Male,106.03,Electronics,2024-01-23,Debit Card 25 | 24,Customer 24,38,Male,32.16,Groceries,2024-01-24,PayPal 26 | 25,Customer 25,50,Male,169.41,Electronics,2024-01-25,Credit Card 27 | 26,Customer 26,29,Female,200.45,Clothing,2024-01-26,Credit Card 28 | 27,Customer 27,39,Female,142.96,Groceries,2024-01-27,Debit Card 29 | 28,Customer 28,61,Male,416.08,Clothing,2024-01-28,PayPal 30 | 29,Customer 29,42,Female,184.81,Electronics,2024-01-29,PayPal 31 | 30,Customer 30,44,Female,147.66,Books,2024-01-30,Debit Card 32 | -------------------------------------------------------------------------------- /duckdb-miniseries/analyze-pandas-dataframes/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /duckdb-miniseries/analyze-parquet/query_parquet.sql: -------------------------------------------------------------------------------- 1 | SELECT * FROM read_parquet('restaurant_orders.parquet') LIMIT 5; 2 | 3 | DESCRIBE SELECT * FROM read_parquet('restaurant_orders.parquet') LIMIT 5; 4 | 5 | SELECT COUNT(*) AS total_orders FROM read_parquet('restaurant_orders.parquet'); 6 | 7 | SELECT SUM(price * quantity) AS total_revenue FROM read_parquet('restaurant_orders.parquet'); 8 | 9 | SELECT menu_item, SUM(quantity) AS total_quantity 10 | FROM read_parquet('restaurant_orders.parquet') 11 | GROUP BY menu_item 12 | ORDER BY total_quantity DESC 13 | LIMIT 5; 14 | 15 | SELECT payment_method, COUNT(*) AS order_count 16 | FROM read_parquet('restaurant_orders.parquet') 17 | GROUP BY payment_method 18 | ORDER BY order_count DESC; 19 | 20 | SELECT order_time, SUM(price * quantity) 21 | OVER (ORDER BY order_time) AS running_revenue 22 | FROM read_parquet('restaurant_orders.parquet'); 23 | 24 | SELECT order_id, customer_name, price * quantity AS order_value, 25 | RANK() OVER (ORDER BY price * quantity DESC) AS rank 26 | FROM read_parquet('restaurant_orders.parquet') 27 | LIMIT 5; 28 | -------------------------------------------------------------------------------- /duckdb-miniseries/analyze-parquet/query_parquet_op.md: -------------------------------------------------------------------------------- 1 | ``` 2 | ┌──────────┬───────────────┬───┬─────────────────────┬────────────────┐ 3 | │ order_id │ customer_name │ … │ order_time │ payment_method │ 4 | │ int64 │ varchar │ │ varchar │ varchar │ 5 | ├──────────┼───────────────┼───┼─────────────────────┼────────────────┤ 6 | │ 1 │ Grace │ … │ 2024-02-01 18:00:00 │ PayPal │ 7 | │ 2 │ David │ … │ 2024-02-01 18:05:00 │ Credit Card │ 8 | │ 3 │ Eve │ … │ 2024-02-01 18:10:00 │ PayPal │ 9 | │ 4 │ Grace │ … │ 2024-02-01 18:15:00 │ PayPal │ 10 | │ 5 │ Charlie │ … │ 2024-02-01 18:20:00 │ Debit Card │ 11 | ├──────────┴───────────────┴───┴─────────────────────┴────────────────┤ 12 | │ 5 rows 8 columns (4 shown) │ 13 | └─────────────────────────────────────────────────────────────────────┘ 14 | ``` 15 | 16 | ``` 17 | ┌────────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐ 18 | │ column_name │ column_type │ null │ key │ default │ extra │ 19 | │ varchar │ varchar │ varchar │ varchar │ varchar │ varchar │ 20 | 
├────────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤ 21 | │ order_id │ BIGINT │ YES │ NULL │ NULL │ NULL │ 22 | │ customer_name │ VARCHAR │ YES │ NULL │ NULL │ NULL │ 23 | │ table_number │ BIGINT │ YES │ NULL │ NULL │ NULL │ 24 | │ menu_item │ VARCHAR │ YES │ NULL │ NULL │ NULL │ 25 | │ price │ DOUBLE │ YES │ NULL │ NULL │ NULL │ 26 | │ quantity │ BIGINT │ YES │ NULL │ NULL │ NULL │ 27 | │ order_time │ VARCHAR │ YES │ NULL │ NULL │ NULL │ 28 | │ payment_method │ VARCHAR │ YES │ NULL │ NULL │ NULL │ 29 | └────────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┘ 30 | ``` 31 | 32 | ``` 33 | ┌──────────────┐ 34 | │ total_orders │ 35 | │ int64 │ 36 | ├──────────────┤ 37 | │ 30 │ 38 | └──────────────┘ 39 | ``` 40 | ``` 41 | ┌────────────────────┐ 42 | │ total_revenue │ 43 | │ double │ 44 | ├────────────────────┤ 45 | │ 1770.9800000000005 │ 46 | └────────────────────┘ 47 | 48 | ``` 49 | 50 | ``` 51 | ┌───────────┬────────────────┐ 52 | │ menu_item │ total_quantity │ 53 | │ varchar │ int128 │ 54 | ├───────────┼────────────────┤ 55 | │ Pizza │ 16 │ 56 | │ Sushi │ 15 │ 57 | │ Salad │ 14 │ 58 | │ Tacos │ 14 │ 59 | │ Soup │ 7 │ 60 | └───────────┴────────────────┘ 61 | ``` 62 | 63 | ``` 64 | ┌────────────────┬─────────────┐ 65 | │ payment_method │ order_count │ 66 | │ varchar │ int64 │ 67 | ├────────────────┼─────────────┤ 68 | │ PayPal │ 9 │ 69 | │ Credit Card │ 8 │ 70 | │ Cash │ 7 │ 71 | │ Debit Card │ 6 │ 72 | └────────────────┴─────────────┘ 73 | ``` 74 | 75 | 76 | 77 | ``` 78 | ┌─────────────────────┬────────────────────┐ 79 | │ order_time │ running_revenue │ 80 | │ varchar │ double │ 81 | ├─────────────────────┼────────────────────┤ 82 | │ 2024-02-01 18:00:00 │ 55.28 │ 83 | │ 2024-02-01 18:05:00 │ 69.36 │ 84 | │ 2024-02-01 18:10:00 │ 128.28 │ 85 | │ 2024-02-01 18:15:00 │ 195.75 │ 86 | │ 2024-02-01 18:20:00 │ 212.96 │ 87 | │ 2024-02-01 18:25:00 │ 339.83000000000004 │ 88 | │ 2024-02-01 18:30:00 │ 360.88000000000005 │ 89 | │ 2024-02-01 18:35:00 │ 413.80000000000007 │ 90 | │ 2024-02-01 18:40:00 │ 472.6400000000001 │ 91 | │ 2024-02-01 18:45:00 │ 506.6600000000001 │ 92 | │ 2024-02-01 18:50:00 │ 547.7600000000001 │ 93 | │ 2024-02-01 18:55:00 │ 556.1100000000001 │ 94 | │ 2024-02-01 19:00:00 │ 654.9300000000001 │ 95 | │ 2024-02-01 19:05:00 │ 774.1800000000001 │ 96 | │ 2024-02-01 19:10:00 │ 816.0000000000001 │ 97 | │ 2024-02-01 19:15:00 │ 826.5000000000001 │ 98 | │ 2024-02-01 19:20:00 │ 951.6000000000001 │ 99 | │ 2024-02-01 19:25:00 │ 1062.0300000000002 │ 100 | │ 2024-02-01 19:30:00 │ 1099.8400000000001 │ 101 | │ 2024-02-01 19:35:00 │ 1218.9700000000003 │ 102 | │ 2024-02-01 19:40:00 │ 1243.9600000000003 │ 103 | │ 2024-02-01 19:45:00 │ 1286.2200000000003 │ 104 | │ 2024-02-01 19:50:00 │ 1306.6400000000003 │ 105 | │ 2024-02-01 19:55:00 │ 1482.0000000000005 │ 106 | │ 2024-02-01 20:00:00 │ 1515.0500000000004 │ 107 | │ 2024-02-01 20:05:00 │ 1574.7200000000005 │ 108 | │ 2024-02-01 20:10:00 │ 1598.3000000000004 │ 109 | │ 2024-02-01 20:15:00 │ 1674.2600000000004 │ 110 | │ 2024-02-01 20:20:00 │ 1733.1500000000005 │ 111 | │ 2024-02-01 20:25:00 │ 1770.9800000000005 │ 112 | ├─────────────────────┴────────────────────┤ 113 | │ 30 rows 2 columns │ 114 | └──────────────────────────────────────────┘ 115 | ``` 116 | 117 | ``` 118 | ┌──────────┬───────────────┬────────────────────┬───────┐ 119 | │ order_id │ customer_name │ order_value │ rank │ 120 | │ int64 │ varchar │ double │ int64 │ 121 | ├──────────┼───────────────┼────────────────────┼───────┤ 122 | │ 24 │ Hannah │ 175.36 │ 1 │ 123 | 
│ 6 │ Hannah │ 126.87 │ 2 │ 124 | │ 17 │ David │ 125.10000000000001 │ 3 │ 125 | │ 14 │ Charlie │ 119.25 │ 4 │ 126 | │ 20 │ Charlie │ 119.13 │ 5 │ 127 | └──────────┴───────────────┴────────────────────┴───────┘ 128 | ``` 129 | 130 | -------------------------------------------------------------------------------- /duckdb-miniseries/analyze-parquet/restaurant_orders.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/balapriyac/data-science-tutorials/c8efaa5894b2c4c651dba10b3e6612d47bbeac6d/duckdb-miniseries/analyze-parquet/restaurant_orders.parquet -------------------------------------------------------------------------------- /duckdb-miniseries/descriptive-statistics/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /duckdb-miniseries/hypothesis-testing/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /duckdb/README.md: -------------------------------------------------------------------------------- 1 | ## To Follow Along 2 | 3 | Install DuckDB, NumPy, and Pandas: 4 | 5 | ``` 6 | $ pip3 install duckdb numpy pandas 7 | ``` 8 | 9 | Run `generate_csv.py` to generate the sample CSV files: 10 | 11 | ``` 12 | $ python3 generate_csv.py 13 | ``` 14 | -------------------------------------------------------------------------------- /duckdb/generate_csv.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | # Step 1: Generate Sales Data 5 | data = { 6 | 'Product_ID': np.arange(1, 101), 7 | 'Product_Name': ['Product_' + str(i) for i in range(1, 101)], 8 | 'Price': np.round(np.random.uniform(10, 500, 100), 2), 9 | 'Quantity_Sold': np.random.randint(1, 100, 100), 10 | 'Region': np.random.choice(['North', 'South', 'East', 'West'], 100) 11 | } 12 | 13 | # Create and save sales data DataFrame 14 | sales_df = pd.DataFrame(data) 15 | sales_csv_file = 'sales_data.csv' 16 | sales_df.to_csv(sales_csv_file, index=False) 17 | 18 | # Step 2: Generate Product Details Data 19 | product_data = { 20 | 'Product_ID': np.arange(1, 101), # Ensure IDs match the sales_data.csv 21 | 'Manufacturer': ['Manufacturer_' + str(np.random.randint(1, 11)) for _ in range(100)] # 10 different manufacturers 22 | } 23 | 24 | # Create and save product details DataFrame 25 | product_details_df = pd.DataFrame(product_data) 26 | product_csv_file = 'product_details.csv' 27 | product_details_df.to_csv(product_csv_file, index=False) 28 | -------------------------------------------------------------------------------- /duckdb/main.py: -------------------------------------------------------------------------------- 1 | import duckdb 2 | 3 | # View the first 5 rows of the data 4 | duckdb.sql("SELECT * FROM 'sales_data.csv' LIMIT 5").df() 5 | 6 | # Calculate total sales (Price * Quantity_Sold) per region 7 | query = """ 8 | SELECT Region, SUM(Price * Quantity_Sold) as Total_Sales 9 | FROM 'sales_data.csv' 10 | GROUP BY Region 11 | ORDER BY Total_Sales DESC 12 | """ 13 | total_sales = duckdb.sql(query).df() 14 | 15 | print("Total sales per region:") 16 | print(total_sales) 17 | 18 | # Find the top 5 best-selling products by quantity 19 | query = """ 20 | SELECT Product_Name, SUM(Quantity_Sold) as Total_Quantity 21 | FROM 'sales_data.csv' 
22 | GROUP BY Product_Name 23 | ORDER BY Total_Quantity DESC 24 | LIMIT 5 25 | """ 26 | top_products = duckdb.sql(query).df() 27 | 28 | print("Top 5 best-selling products:") 29 | print(top_products) 30 | 31 | # Calculate the average price of products by region 32 | query = """ 33 | SELECT Region, AVG(Price) as Average_Price 34 | FROM 'sales_data.csv' 35 | GROUP BY Region 36 | """ 37 | avg_price_region = duckdb.sql(query).df() 38 | 39 | print("Average price per region:") 40 | print(avg_price_region) 41 | 42 | # Calculate total quantity sold by region 43 | query = """ 44 | SELECT Region, SUM(Quantity_Sold) as Total_Quantity 45 | FROM 'sales_data.csv' 46 | GROUP BY Region 47 | ORDER BY Total_Quantity DESC 48 | """ 49 | total_quantity_region = duckdb.sql(query).df() 50 | 51 | print("Total quantity sold per region:") 52 | print(total_quantity_region) 53 | 54 | # A simple join 55 | query = """ 56 | SELECT s.Product_Name, s.Region, s.Price, p.Manufacturer 57 | FROM 'sales_data.csv' s 58 | JOIN 'product_details.csv' p 59 | ON s.Product_ID = p.Product_ID 60 | """ 61 | joined_data = duckdb.sql(query).df() 62 | 63 | print(joined_data.head()) 64 | -------------------------------------------------------------------------------- /fastapi-docker-for-ml-model-deployment/diabetes-predictor/app/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /fastapi-docker-for-ml-model-deployment/diabetes-predictor/app/main.py: -------------------------------------------------------------------------------- 1 | from fastapi import FastAPI 2 | from pydantic import BaseModel 3 | import pickle 4 | import numpy as np 5 | import os 6 | 7 | # Define input data schema 8 | class PatientData(BaseModel): 9 | age: float 10 | sex: float 11 | bmi: float 12 | bp: float # blood pressure 13 | s1: float # serum measurement 1 14 | s2: float # serum measurement 2 15 | s3: float # serum measurement 3 16 | s4: float # serum measurement 4 17 | s5: float # serum measurement 5 18 | s6: float # serum measurement 6 19 | 20 | class Config: 21 | schema_extra = { 22 | "example": { 23 | "age": 0.05, 24 | "sex": 0.05, 25 | "bmi": 0.06, 26 | "bp": 0.02, 27 | "s1": -0.04, 28 | "s2": -0.04, 29 | "s3": -0.02, 30 | "s4": -0.01, 31 | "s5": 0.01, 32 | "s6": 0.02 33 | } 34 | } 35 | 36 | # Initialize FastAPI app 37 | app = FastAPI( 38 | title="Diabetes Progression Predictor", 39 | description="Predicts diabetes progression from physiological features", 40 | version="1.0.0" 41 | ) 42 | 43 | # Load the trained model 44 | model_path = os.path.join("models", "diabetes_model.pkl") 45 | with open(model_path, 'rb') as f: 46 | model = pickle.load(f) 47 | 48 | @app.post("/predict") 49 | def predict_progression(patient: PatientData): 50 | """ 51 | Predict diabetes progression score 52 | """ 53 | # Convert input to numpy array 54 | features = np.array([[ 55 | patient.age, patient.sex, patient.bmi, patient.bp, 56 | patient.s1, patient.s2, patient.s3, patient.s4, 57 | patient.s5, patient.s6 58 | ]]) 59 | 60 | # Make prediction 61 | prediction = model.predict(features)[0] 62 | 63 | # Return result with additional context 64 | return { 65 | "predicted_progression_score": round(prediction, 2), 66 | "interpretation": get_interpretation(prediction) 67 | } 68 | 69 | def get_interpretation(score): 70 | """Provide human-readable interpretation of the score""" 71 | if score < 100: 72 | return "Below average progression" 73 | elif score < 150: 74 | 
return "Average progression" 75 | else: 76 | return "Above average progression" 77 | 78 | @app.get("/") 79 | def health_check(): 80 | return {"status": "healthy", "model": "diabetes_progression_v1"} 81 | 82 | -------------------------------------------------------------------------------- /fastapi-docker-for-ml-model-deployment/diabetes-predictor/train_model.py: -------------------------------------------------------------------------------- 1 | from sklearn.datasets import load_diabetes 2 | from sklearn.model_selection import train_test_split 3 | from sklearn.ensemble import RandomForestRegressor 4 | from sklearn.metrics import mean_squared_error, r2_score 5 | import pickle 6 | import os 7 | 8 | # Load the diabetes dataset 9 | diabetes = load_diabetes() 10 | X, y = diabetes.data, diabetes.target 11 | 12 | print(f"Dataset shape: {X.shape}") 13 | print(f"Features: {diabetes.feature_names}") 14 | print(f"Target range: {y.min():.1f} to {y.max():.1f}") 15 | 16 | # Split the data 17 | X_train, X_test, y_train, y_test = train_test_split( 18 | X, y, test_size=0.2, random_state=42 19 | ) 20 | 21 | print(f"Training samples: {X_train.shape[0]}") 22 | print(f"Test samples: {X_test.shape[0]}") 23 | 24 | # Train Random Forest model 25 | model = RandomForestRegressor( 26 | n_estimators=100, 27 | random_state=42, 28 | max_depth=10 29 | ) 30 | 31 | model.fit(X_train, y_train) 32 | 33 | # Make predictions and evaluate 34 | y_pred = model.predict(X_test) 35 | 36 | mse = mean_squared_error(y_test, y_pred) 37 | r2 = r2_score(y_test, y_pred) 38 | 39 | print(f"Mean Squared Error: {mse:.2f}") 40 | print(f"R² Score: {r2:.3f}") 41 | 42 | # Create models directory and save model 43 | os.makedirs('models', exist_ok=True) 44 | 45 | with open('models/diabetes_model.pkl', 'wb') as f: 46 | pickle.dump(model, f) 47 | 48 | print("Model trained and saved successful") 49 | -------------------------------------------------------------------------------- /fastapi/README.md: -------------------------------------------------------------------------------- 1 | ## Getting Started 2 | 3 | Create and activate a dedicated venv for the project: 4 | 5 | ```bash 6 | $ python3 -m venv v1 7 | $ source v1/bin/activate 8 | ``` 9 | Install FastAPI and Uvicorn with `pip`: 10 | 11 | ```bash 12 | $ pip3 install fastapi uvicorn 13 | ``` 14 | Also install scikit-learn: 15 | 16 | ```bash 17 | $ pip3 install scikit-learn 18 | ``` 19 | Check [main.py](https://github.com/balapriyac/data-science-tutorials/blob/main/fastapi/main.py) for the complete code. 20 | 21 | ## Run the App 22 | 23 | Run the following command: 24 | 25 | ```bash 26 | $ uvicorn main:app --reload 27 | ``` 28 | 29 | ## Query the `/predict/` Endpoint 30 | 31 | Example POST request (using cURL): 32 | 33 | ```bash 34 | curl -X 'POST' \ 35 | 'http://localhost:8000/predict/' \ 36 | -H 'Content-Type: application/json' \ 37 | -d '{ 38 | "sepal_length": 5.1, 39 | "sepal_width": 3.5, 40 | "petal_length": 1.4, 41 | "petal_width": 0.2 42 | }' 43 | 44 | ``` 45 | 46 | 47 | -------------------------------------------------------------------------------- /fastapi/main.py: -------------------------------------------------------------------------------- 1 | # Create a FastAPI app 2 | # Root endpoint returns the app description 3 | 4 | from fastapi import FastAPI 5 | 6 | app = FastAPI() 7 | 8 | # Define a function to return a description of the app 9 | def get_app_description(): 10 | return ( 11 | "Welcome to the Iris Species Prediction API!" 
12 | "This API allows you to predict the species of an iris flower based on its sepal and petal measurements." 13 | "Use the '/predict/' endpoint with a POST request to make predictions." 14 | "Example usage: POST to '/predict/' with JSON data containing sepal_length, sepal_width, petal_length, and petal_width." 15 | ) 16 | 17 | # Define the root endpoint to return the app description 18 | @app.get("/") 19 | async def root(): 20 | return {"message": get_app_description()} 21 | 22 | 23 | # Build a logistic regression classifier 24 | from sklearn.datasets import load_iris 25 | from sklearn.linear_model import LogisticRegression 26 | 27 | # Load the Iris dataset 28 | iris = load_iris() 29 | X, y = iris.data, iris.target 30 | 31 | # Train a logistic regression model 32 | model = LogisticRegression() 33 | model.fit(X, y) 34 | 35 | # Define a function to predict the species 36 | def predict_species(sepal_length, sepal_width, petal_length, petal_width): 37 | features = [[sepal_length, sepal_width, petal_length, petal_width]] 38 | prediction = model.predict(features) 39 | return iris.target_names[prediction[0]] 40 | 41 | # Define the Pydantic model for your input data 42 | from pydantic import BaseModel 43 | 44 | class IrisData(BaseModel): 45 | sepal_length: float 46 | sepal_width: float 47 | petal_length: float 48 | petal_width: float 49 | 50 | # Create API endpoint 51 | @app.post("/predict/") 52 | async def predict_species_api(iris_data: IrisData): 53 | species = predict_species(iris_data.sepal_length, iris_data.sepal_width, iris_data.petal_length, iris_data.petal_width) 54 | return {"species": species} 55 | 56 | -------------------------------------------------------------------------------- /machine-learning/HyperparameterTuning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [] 7 | }, 8 | "kernelspec": { 9 | "name": "python3", 10 | "display_name": "Python 3" 11 | }, 12 | "language_info": { 13 | "name": "python" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "source": [ 20 | "## 1. Start Simple – Train a Baseline Model Without Any Tuning" 21 | ], 22 | "metadata": { 23 | "id": "i2Ju74quwU_E" 24 | } 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": { 30 | "id": "cBt_HjnMv7Qf" 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "from sklearn.tree import DecisionTreeClassifier\n", 35 | "from sklearn.metrics import accuracy_score\n", 36 | "from sklearn.model_selection import train_test_split\n", 37 | "from sklearn.datasets import load_iris\n", 38 | "\n", 39 | "# Load data\n", 40 | "data = load_iris()\n", 41 | "X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2, random_state=25)\n", 42 | "\n", 43 | "# Initialize model with default parameters\n", 44 | "model = DecisionTreeClassifier()\n", 45 | "\n", 46 | "# Train model\n", 47 | "model.fit(X_train, y_train)\n", 48 | "\n", 49 | "# Predict and evaluate\n", 50 | "y_pred = model.predict(X_test)\n", 51 | "baseline_accuracy = accuracy_score(y_test, y_pred)\n", 52 | "print(f'Baseline Accuracy: {baseline_accuracy:.2f}')" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "source": [ 58 | "## 2. 
Use Hyperparameter Search with Cross-Validation" 59 | ], 60 | "metadata": { 61 | "id": "aEjaShhwwaC0" 62 | } 63 | }, 64 | { 65 | "cell_type": "code", 66 | "source": [ 67 | "from sklearn.model_selection import GridSearchCV, RandomizedSearchCV\n", 68 | "from sklearn.tree import DecisionTreeClassifier\n", 69 | "from sklearn.datasets import load_iris\n", 70 | "\n", 71 | "# Load data\n", 72 | "data = load_iris()\n", 73 | "X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2, random_state=25)\n", 74 | "\n", 75 | "# Initialize model\n", 76 | "model = DecisionTreeClassifier()\n", 77 | "\n", 78 | "# Define hyperparameter grid for Grid Search\n", 79 | "param_grid = {\n", 80 | "\t'criterion': ['gini', 'entropy'],\n", 81 | "\t'max_depth': [None, 10, 20, 30],\n", 82 | "\t'min_samples_split': [2, 5, 10]\n", 83 | "}\n", 84 | "\n", 85 | "from sklearn.model_selection import cross_val_score\n", 86 | "\n", 87 | "# Grid Search\n", 88 | "grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')\n", 89 | "grid_search.fit(X_train, y_train)\n", 90 | "best_params_grid = grid_search.best_params_\n", 91 | "best_score_grid = grid_search.best_score_\n", 92 | "\n", 93 | "print(f'Best Parameters (Grid Search): {best_params_grid}')\n", 94 | "print(f'Best Cross-Validation Score (Grid Search): {best_score_grid:.2f}')" 95 | ], 96 | "metadata": { 97 | "id": "-KWQc3JpwfZb" 98 | }, 99 | "execution_count": null, 100 | "outputs": [] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "source": [ 105 | "## 3. Use Randomized Search for Initial Exploration" 106 | ], 107 | "metadata": { 108 | "id": "qMMeeT_JwxHN" 109 | } 110 | }, 111 | { 112 | "cell_type": "code", 113 | "source": [ 114 | "from sklearn.model_selection import RandomizedSearchCV\n", 115 | "from sklearn.tree import DecisionTreeClassifier\n", 116 | "from sklearn.datasets import load_iris\n", 117 | "import numpy as np\n", 118 | "\n", 119 | "# Load data\n", 120 | "data = load_iris()\n", 121 | "X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2, random_state=25)\n", 122 | "\n", 123 | "# Initialize model\n", 124 | "model = DecisionTreeClassifier()\n", 125 | "\n", 126 | "# Define hyperparameter distribution for Random Search\n", 127 | "param_dist = {\n", 128 | "\t'criterion': ['gini', 'entropy'],\n", 129 | "\t'max_depth': [None] + list(range(10, 31)),\n", 130 | "\t'min_samples_split': range(2, 11),\n", 131 | "\t'min_samples_leaf': range(1, 11)\n", 132 | "}\n", 133 | "\n", 134 | "# Random Search\n", 135 | "random_search = RandomizedSearchCV(model, param_dist, n_iter=100, cv=5, scoring='accuracy')\n", 136 | "random_search.fit(X_train, y_train)\n", 137 | "best_params_random = random_search.best_params_\n", 138 | "best_score_random = random_search.best_score_\n", 139 | "\n", 140 | "print(f'Best Parameters (Random Search): {best_params_random}')\n", 141 | "print(f'Best Cross-Validation Score (Random Search): {best_score_random:.2f}')\n", 142 | "\n", 143 | "best_model = DecisionTreeClassifier(**best_params_random)\n", 144 | "best_model.fit(X_train, y_train)\n", 145 | "y_pred = best_model.predict(X_test)\n", 146 | "final_accuracy = accuracy_score(y_test, y_pred)\n", 147 | "\n", 148 | "print(f'Final Model Accuracy: {final_accuracy:.2f}')" 149 | ], 150 | "metadata": { 151 | "id": "TqNS89p_wyqd" 152 | }, 153 | "execution_count": null, 154 | "outputs": [] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "source": [ 159 | "## 4. 
Monitor Overfitting with Validation Curves" 160 | ], 161 | "metadata": { 162 | "id": "Yu1VwYCnyCQJ" 163 | } 164 | }, 165 | { 166 | "cell_type": "code", 167 | "source": [ 168 | "from sklearn.ensemble import RandomForestClassifier\n", 169 | "from sklearn.model_selection import validation_curve\n", 170 | "import matplotlib.pyplot as plt\n", 171 | "import numpy as np\n", 172 | "\n", 173 | "# Define hyperparameter range\n", 174 | "param_range = [10, 100, 200, 400, 800, 1000]\n", 175 | "\n", 176 | "# Calculate validation curve\n", 177 | "train_scores, test_scores = validation_curve(\n", 178 | "\tRandomForestClassifier(), X_train, y_train,\n", 179 | "\tparam_name=\"n_estimators\", param_range=param_range,\n", 180 | "\tcv=5, scoring=\"accuracy\")\n", 181 | "\n", 182 | "# Calculate mean and standard deviation\n", 183 | "train_mean = np.mean(train_scores, axis=1)\n", 184 | "train_std = np.std(train_scores, axis=1)\n", 185 | "test_mean = np.mean(test_scores, axis=1)\n", 186 | "test_std = np.std(test_scores, axis=1)\n", 187 | "\n", 188 | "# Plot validation curve\n", 189 | "plt.plot(param_range, train_mean, label=\"Training score\", color=\"r\")\n", 190 | "plt.fill_between(param_range, train_mean - train_std, train_mean + train_std, color=\"r\", alpha=0.3)\n", 191 | "plt.plot(param_range, test_mean, label=\"Cross-validation score\", color=\"g\")\n", 192 | "plt.fill_between(param_range, test_mean - test_std, test_mean + test_std, color=\"g\", alpha=0.3)\n", 193 | "plt.title(\"Validation Curve with Random Forest\")\n", 194 | "plt.xlabel(\"Number of Estimators\")\n", 195 | "plt.ylabel(\"Accuracy\")\n", 196 | "plt.legend(loc=\"best\")\n", 197 | "plt.show()" 198 | ], 199 | "metadata": { 200 | "id": "hO_wPZKFyFaz" 201 | }, 202 | "execution_count": null, 203 | "outputs": [] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "source": [ 208 | "## 5. 
Use Bayesian Optimization for Efficient Search" 209 | ], 210 | "metadata": { 211 | "id": "zaIfeNuryKXZ" 212 | } 213 | }, 214 | { 215 | "cell_type": "code", 216 | "source": [ 217 | "!pip install scikit-optimize" 218 | ], 219 | "metadata": { 220 | "id": "TBi7DOstyNGo" 221 | }, 222 | "execution_count": null, 223 | "outputs": [] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "source": [ 228 | "from skopt import BayesSearchCV\n", 229 | "from sklearn.tree import DecisionTreeClassifier\n", 230 | "from sklearn.datasets import load_iris\n", 231 | "from sklearn.model_selection import train_test_split\n", 232 | "from sklearn.metrics import accuracy_score\n", 233 | "\n", 234 | "# Load data\n", 235 | "data = load_iris()\n", 236 | "X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2, random_state=25)\n", 237 | "\n", 238 | "# Initialize model\n", 239 | "model = DecisionTreeClassifier()\n", 240 | "\n", 241 | "# Define hyperparameter space for Bayesian Optimization\n", 242 | "param_space = {\n", 243 | "\t'criterion': ['gini', 'entropy'],\n", 244 | "\t'max_depth': [None] + list(range(10, 31)),\n", 245 | "\t'min_samples_split': (2, 10),\n", 246 | "\t'min_samples_leaf': (1, 10)\n", 247 | "}" 248 | ], 249 | "metadata": { 250 | "id": "NA2_2DUeyPiQ" 251 | }, 252 | "execution_count": null, 253 | "outputs": [] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "source": [ 258 | "# Bayesian Optimization\n", 259 | "opt = BayesSearchCV(model, param_space, n_iter=32, cv=5, scoring='accuracy')\n", 260 | "opt.fit(X_train, y_train)\n", 261 | "best_params_bayes = opt.best_params_\n", 262 | "best_score_bayes = opt.best_score_\n", 263 | "\n", 264 | "print(f'Best Parameters (Bayesian Optimization): {best_params_bayes}')\n", 265 | "print(f'Best Cross-Validation Score (Bayesian Optimization): {best_score_bayes:.2f}')" 266 | ], 267 | "metadata": { 268 | "id": "ASnb3BUnySWj" 269 | }, 270 | "execution_count": null, 271 | "outputs": [] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "source": [ 276 | "best_model = DecisionTreeClassifier(**best_params_bayes)\n", 277 | "best_model.fit(X_train, y_train)\n", 278 | "y_pred = best_model.predict(X_test)\n", 279 | "final_accuracy = accuracy_score(y_test, y_pred)\n", 280 | "\n", 281 | "print(f'Final Model Accuracy: {final_accuracy:.2f}')" 282 | ], 283 | "metadata": { 284 | "id": "i4yfsPExyWNe" 285 | }, 286 | "execution_count": null, 287 | "outputs": [] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "source": [], 292 | "metadata": { 293 | "id": "dmwUTtZlzcP0" 294 | }, 295 | "execution_count": null, 296 | "outputs": [] 297 | } 298 | ] 299 | } -------------------------------------------------------------------------------- /machine-learning/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /model_deployment/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use Python 3.11 as the base image 2 | FROM python:3.11-slim 3 | 4 | # Set the working directory inside the container 5 | WORKDIR /code 6 | 7 | # Copy the requirements file into the container 8 | COPY ./requirements.txt /code/requirements.txt 9 | 10 | # Install the Python dependencies 11 | RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt 12 | 13 | # Copy the app folder containing the FastAPI app into the container 14 | COPY ./app /code/app 15 | 16 | # Copy the model directory (with the saved model file) 
into the container 17 | COPY ./model /code/model 18 | 19 | # Expose port 80 for FastAPI 20 | EXPOSE 80 21 | 22 | # Command to run the FastAPI app with Uvicorn 23 | CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "80"] 24 | -------------------------------------------------------------------------------- /model_deployment/README.md: -------------------------------------------------------------------------------- 1 | ## Deploying ML Models 2 | 3 | ``` 4 | project-directory/ 5 | │ 6 | ├── app/ 7 | │ ├── __init__.py # Empty file 8 | │ └── main.py # FastAPI logic 9 | │ 10 | ├── model/ 11 | │ └── linear_regression_model.pkl # Saved model (after running model_training.py) 12 | │ 13 | ├── model_training.py # Model training code 14 | ├── requirements.txt # Python dependencies 15 | └── Dockerfile # Docker configuration 16 | ``` 17 | In your project environment, create and activate a virtual environment: 18 | 19 | ``` 20 | $ python3 -m venv v1 21 | $ source v1/bin/activate 22 | ``` 23 | Install these required packages using pip: 24 | 25 | ``` 26 | $ pip3 install pandas scikit-learn fastapi uvicorn 27 | ``` 28 | 29 | Run the script to train the model and save it: 30 | 31 | ``` 32 | $ python3 model_training.py 33 | ``` 34 | 35 | You should be able to find the .pkl file (`linear_regression_model.pkl`) in the `model/` directory. 36 | 37 | Use FastAPI to build an API to serve model predictions and containerize it using Docker. 38 | 39 | ### Building the Docker Image 40 | 41 | Build the Docker image by running the following `docker build` command: 42 | 43 | ``` 44 | $ docker build -t house-price-prediction-api . 45 | ``` 46 | 47 | Next run the Docker container: 48 | 49 | ``` 50 | $ docker run -d -p 80:80 house-price-prediction-api 51 | ``` 52 | 53 | ### Tagging and Pushing the Image to Docker Hub 54 | 55 | First, login to Docker Hub: 56 | 57 | ``` 58 | $ docker login 59 | ``` 60 | 61 | Tag the Docker image: 62 | 63 | ``` 64 | $ docker tag house-price-prediction-api your_username/house-price-prediction-api:v1 65 | ``` 66 | 67 | Push the image to Docker Hub: 68 | 69 | ``` 70 | $ docker push your_username/house-price-prediction-api:v1 71 | ``` 72 | 73 | Other developers can now pull and run the image like so: 74 | 75 | ``` 76 | $ docker pull your_username/house-price-prediction-api:v1 77 | $ docker run -d -p 80:80 your_username/house-price-prediction-api:v1 78 | ``` 79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /model_deployment/app/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /model_deployment/app/main.py: -------------------------------------------------------------------------------- 1 | # app/main.py 2 | from fastapi import FastAPI 3 | from pydantic import BaseModel 4 | import pickle 5 | import os 6 | 7 | # Define the input data schema using Pydantic 8 | class InputData(BaseModel): 9 | MedInc: float 10 | AveRooms: float 11 | AveOccup: float 12 | 13 | # Initialize FastAPI app 14 | app = FastAPI(title="House Price Prediction API") 15 | 16 | # Load the model during startup 17 | model_path = os.path.join("model", "linear_regression_model.pkl") 18 | with open(model_path, 'rb') as f: 19 | model = pickle.load(f) 20 | 21 | @app.post("/predict") 22 | def predict(data: InputData): 23 | # Prepare the data for prediction 24 | input_features = [[data.MedInc, data.AveRooms, data.AveOccup]] 25 | 26 | # Make 
prediction using the loaded model 27 | prediction = model.predict(input_features) 28 | 29 | # Return the prediction result 30 | return {"predicted_house_price": prediction[0]} 31 | -------------------------------------------------------------------------------- /model_deployment/model_training.py: -------------------------------------------------------------------------------- 1 | # model_training.py 2 | import pandas as pd 3 | from sklearn.datasets import fetch_california_housing 4 | from sklearn.model_selection import train_test_split 5 | from sklearn.linear_model import LinearRegression 6 | import pickle 7 | import os 8 | 9 | # Load the dataset 10 | data = fetch_california_housing(as_frame=True) 11 | df = data['data'] 12 | target = data['target'] 13 | 14 | # Select a few features 15 | selected_features = ['MedInc', 'AveRooms', 'AveOccup'] 16 | X = df[selected_features] 17 | y = target 18 | 19 | # Train-test split 20 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 21 | 22 | # Train the Linear Regression model 23 | model = LinearRegression() 24 | model.fit(X_train, y_train) 25 | 26 | # Create a 'model' folder to save the trained model 27 | os.makedirs('model', exist_ok=True) 28 | 29 | # Save the trained model using pickle 30 | with open('model/linear_regression_model.pkl', 'wb') as f: 31 | pickle.dump(model, f) 32 | 33 | print("Model trained and saved successfully.") 34 | -------------------------------------------------------------------------------- /model_deployment/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi 2 | uvicorn 3 | scikit-learn 4 | pandas 5 | -------------------------------------------------------------------------------- /natural-language-processing/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /natural-language-processing/nlp_with_python.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [] 7 | }, 8 | "kernelspec": { 9 | "name": "python3", 10 | "display_name": "Python 3" 11 | }, 12 | "language_info": { 13 | "name": "python" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "source": [ 20 | "## Installing NLTK" 21 | ], 22 | "metadata": { 23 | "id": "2ik7yq56NsrV" 24 | } 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 17, 29 | "metadata": { 30 | "colab": { 31 | "base_uri": "https://localhost:8080/" 32 | }, 33 | "id": "csTUtV_hIudG", 34 | "outputId": "eabe6b0a-f4e8-4841-f869-67facc108602" 35 | }, 36 | "outputs": [ 37 | { 38 | "output_type": "stream", 39 | "name": "stdout", 40 | "text": [ 41 | "Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (3.9.1)\n", 42 | "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk) (8.1.7)\n", 43 | "Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk) (1.4.2)\n", 44 | "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.10/dist-packages (from nltk) (2024.9.11)\n", 45 | "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from nltk) (4.66.6)\n" 46 | ] 47 | } 48 | ], 49 | "source": [ 50 | "! 
pip install nltk" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "source": [ 56 | "import nltk\n", 57 | "\n", 58 | "# Download essential datasets and models\n", 59 | "nltk.download('punkt') # Tokenizers for sentence and word tokenization\n", 60 | "nltk.download('stopwords') # List of common stop words\n", 61 | "nltk.download('wordnet') # WordNet lexical database for lemmatization\n", 62 | "nltk.download('averaged_perceptron_tagger_eng') # Part-of-speech tagger\n", 63 | "nltk.download('maxent_ne_chunker_tab') # Named Entity Recognition model\n", 64 | "nltk.download('words') # Word corpus for NER\n", 65 | "nltk.download('punkt_tab')\n" 66 | ], 67 | "metadata": { 68 | "colab": { 69 | "base_uri": "https://localhost:8080/" 70 | }, 71 | "id": "khrbb-C8J5Ip", 72 | "outputId": "cd75c3cf-bf1a-4ac0-bcc3-7ce7eb6bf4fe" 73 | }, 74 | "execution_count": 18, 75 | "outputs": [ 76 | { 77 | "output_type": "stream", 78 | "name": "stderr", 79 | "text": [ 80 | "[nltk_data] Downloading package punkt to /root/nltk_data...\n", 81 | "[nltk_data] Package punkt is already up-to-date!\n", 82 | "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", 83 | "[nltk_data] Package stopwords is already up-to-date!\n", 84 | "[nltk_data] Downloading package wordnet to /root/nltk_data...\n", 85 | "[nltk_data] Package wordnet is already up-to-date!\n", 86 | "[nltk_data] Downloading package averaged_perceptron_tagger_eng to\n", 87 | "[nltk_data] /root/nltk_data...\n", 88 | "[nltk_data] Package averaged_perceptron_tagger_eng is already up-to-\n", 89 | "[nltk_data] date!\n", 90 | "[nltk_data] Downloading package maxent_ne_chunker_tab to\n", 91 | "[nltk_data] /root/nltk_data...\n", 92 | "[nltk_data] Unzipping chunkers/maxent_ne_chunker_tab.zip.\n", 93 | "[nltk_data] Downloading package words to /root/nltk_data...\n", 94 | "[nltk_data] Package words is already up-to-date!\n", 95 | "[nltk_data] Downloading package punkt_tab to /root/nltk_data...\n", 96 | "[nltk_data] Package punkt_tab is already up-to-date!\n" 97 | ] 98 | }, 99 | { 100 | "output_type": "execute_result", 101 | "data": { 102 | "text/plain": [ 103 | "True" 104 | ] 105 | }, 106 | "metadata": {}, 107 | "execution_count": 18 108 | } 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "source": [ 114 | "## Text Preprocessing" 115 | ], 116 | "metadata": { 117 | "id": "lIOaDokONw1C" 118 | } 119 | }, 120 | { 121 | "cell_type": "code", 122 | "source": [ 123 | "import string\n", 124 | "from nltk.tokenize import word_tokenize, sent_tokenize\n", 125 | "\n", 126 | "text = \"Natural Language Processing (NLP) is cool! 
Let's explore it.\"\n", 127 | "\n", 128 | "# Remove punctuation using string.punctuation\n", 129 | "cleaned_text = ''.join(char for char in text if char not in string.punctuation)\n", 130 | "print(\"Text without punctuation:\", cleaned_text)\n", 131 | "\n", 132 | "# Sentence Tokenization\n", 133 | "sentences = sent_tokenize(cleaned_text)\n", 134 | "print(\"Sentences:\", sentences)\n", 135 | "\n", 136 | "# Word Tokenization\n", 137 | "words = word_tokenize(cleaned_text)\n", 138 | "print(\"Words:\", words)\n" 139 | ], 140 | "metadata": { 141 | "colab": { 142 | "base_uri": "https://localhost:8080/" 143 | }, 144 | "id": "IuVHrIa7J_9U", 145 | "outputId": "7fecd594-1115-4b68-f2f8-ea48bfb800ae" 146 | }, 147 | "execution_count": 19, 148 | "outputs": [ 149 | { 150 | "output_type": "stream", 151 | "name": "stdout", 152 | "text": [ 153 | "Text without punctuation: Natural Language Processing NLP is cool Lets explore it\n", 154 | "Sentences: ['Natural Language Processing NLP is cool Lets explore it']\n", 155 | "Words: ['Natural', 'Language', 'Processing', 'NLP', 'is', 'cool', 'Lets', 'explore', 'it']\n" 156 | ] 157 | } 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "source": [ 163 | "from nltk.corpus import stopwords\n", 164 | "\n", 165 | "# Load NLTK's stopwords list\n", 166 | "stop_words = set(stopwords.words('english'))\n", 167 | "\n", 168 | "# Filter out stop words\n", 169 | "filtered_words = [word for word in words if word.lower() not in stop_words]\n", 170 | "print(\"Filtered Words:\", filtered_words)\n" 171 | ], 172 | "metadata": { 173 | "colab": { 174 | "base_uri": "https://localhost:8080/" 175 | }, 176 | "id": "VX0bZ2y2KDaL", 177 | "outputId": "d1576147-171d-44ab-862a-71f3383f761c" 178 | }, 179 | "execution_count": 20, 180 | "outputs": [ 181 | { 182 | "output_type": "stream", 183 | "name": "stdout", 184 | "text": [ 185 | "Filtered Words: ['Natural', 'Language', 'Processing', 'NLP', 'cool', 'Lets', 'explore']\n" 186 | ] 187 | } 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "source": [ 193 | "from nltk.stem import PorterStemmer\n", 194 | "\n", 195 | "# Initialize the Porter Stemmer\n", 196 | "stemmer = PorterStemmer()\n", 197 | "\n", 198 | "# Apply stemming to filtered words\n", 199 | "stemmed_words = [stemmer.stem(word) for word in filtered_words]\n", 200 | "print(\"Stemmed Words:\", stemmed_words)\n" 201 | ], 202 | "metadata": { 203 | "colab": { 204 | "base_uri": "https://localhost:8080/" 205 | }, 206 | "id": "4vEw6L9TKFjk", 207 | "outputId": "38084b66-0d8e-4463-a35a-d8f454491250" 208 | }, 209 | "execution_count": 21, 210 | "outputs": [ 211 | { 212 | "output_type": "stream", 213 | "name": "stdout", 214 | "text": [ 215 | "Stemmed Words: ['natur', 'languag', 'process', 'nlp', 'cool', 'let', 'explor']\n" 216 | ] 217 | } 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "source": [ 223 | "## Lemmatization" 224 | ], 225 | "metadata": { 226 | "id": "yIKdaABAN2fD" 227 | } 228 | }, 229 | { 230 | "cell_type": "code", 231 | "source": [ 232 | "from nltk.stem import WordNetLemmatizer\n", 233 | "\n", 234 | "# Initialize the Lemmatizer\n", 235 | "lemmatizer = WordNetLemmatizer()\n", 236 | "\n", 237 | "# Lemmatize each word\n", 238 | "lemmatized_words = [lemmatizer.lemmatize(word, pos='v') for word in filtered_words]\n", 239 | "print(\"Lemmatized Words:\", lemmatized_words)\n" 240 | ], 241 | "metadata": { 242 | "colab": { 243 | "base_uri": "https://localhost:8080/" 244 | }, 245 | "id": "kZMwY0G0KIkT", 246 | "outputId": "ffbad6dd-64f7-40a6-e255-e5234ea43797" 247 | }, 248 | 
"execution_count": 22, 249 | "outputs": [ 250 | { 251 | "output_type": "stream", 252 | "name": "stdout", 253 | "text": [ 254 | "Lemmatized Words: ['Natural', 'Language', 'Processing', 'NLP', 'cool', 'Lets', 'explore']\n" 255 | ] 256 | } 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "source": [ 262 | "## Part-of-Speech (POS) Tagging" 263 | ], 264 | "metadata": { 265 | "id": "zNeQphPZN56x" 266 | } 267 | }, 268 | { 269 | "cell_type": "code", 270 | "source": [ 271 | "from nltk import pos_tag\n", 272 | "\n", 273 | "# Tokenize the text into words\n", 274 | "text = \"She enjoys playing soccer on weekends.\"\n", 275 | "\n", 276 | "# Tokenization (words)\n", 277 | "words = word_tokenize(text)\n", 278 | "\n", 279 | "# POS tagging\n", 280 | "tagged_words = pos_tag(words)\n", 281 | "print(\"Tagged Words:\", tagged_words)\n" 282 | ], 283 | "metadata": { 284 | "colab": { 285 | "base_uri": "https://localhost:8080/" 286 | }, 287 | "id": "mdCZYFc9KK1j", 288 | "outputId": "3b345803-2707-4298-ba49-9560f50e87c4" 289 | }, 290 | "execution_count": 23, 291 | "outputs": [ 292 | { 293 | "output_type": "stream", 294 | "name": "stdout", 295 | "text": [ 296 | "Tagged Words: [('She', 'PRP'), ('enjoys', 'VBZ'), ('playing', 'VBG'), ('soccer', 'NN'), ('on', 'IN'), ('weekends', 'NNS'), ('.', '.')]\n" 297 | ] 298 | } 299 | ] 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "source": [ 304 | "## Named Entity Recognition (NER)" 305 | ], 306 | "metadata": { 307 | "id": "W52aYopXOW7V" 308 | } 309 | }, 310 | { 311 | "cell_type": "code", 312 | "source": [ 313 | "from nltk import ne_chunk, pos_tag, word_tokenize\n", 314 | "\n", 315 | "# Sample text\n", 316 | "text = \"We shall visit the Eiffel Tower on our vacation to Paris.\"\n", 317 | "\n", 318 | "# Tokenize the text into words\n", 319 | "words = word_tokenize(text)\n", 320 | "\n", 321 | "# Part-of-speech tagging\n", 322 | "tagged_words = pos_tag(words)\n", 323 | "\n", 324 | "# Named Entity Recognition\n", 325 | "named_entities = ne_chunk(tagged_words)\n", 326 | "print(\"Named Entities:\", named_entities)\n" 327 | ], 328 | "metadata": { 329 | "colab": { 330 | "base_uri": "https://localhost:8080/" 331 | }, 332 | "id": "fmPqEMqJKQqb", 333 | "outputId": "c3e63838-2539-4eb9-b071-6055acc40153" 334 | }, 335 | "execution_count": 24, 336 | "outputs": [ 337 | { 338 | "output_type": "stream", 339 | "name": "stdout", 340 | "text": [ 341 | "Named Entities: (S\n", 342 | " We/PRP\n", 343 | " shall/MD\n", 344 | " visit/VB\n", 345 | " the/DT\n", 346 | " (ORGANIZATION Eiffel/NNP Tower/NNP)\n", 347 | " on/IN\n", 348 | " our/PRP$\n", 349 | " vacation/NN\n", 350 | " to/TO\n", 351 | " (GPE Paris/NNP)\n", 352 | " ./.)\n" 353 | ] 354 | } 355 | ] 356 | } 357 | ] 358 | } -------------------------------------------------------------------------------- /pandas/5_steps_data_cleaning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [] 7 | }, 8 | "kernelspec": { 9 | "name": "python3", 10 | "display_name": "Python 3" 11 | }, 12 | "language_info": { 13 | "name": "python" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "source": [ 20 | "## Import pandas" 21 | ], 22 | "metadata": { 23 | "id": "M62qZc7zwlwz" 24 | } 25 | }, 26 | { 27 | "cell_type": "code", 28 | "source": [ 29 | "import pandas as pd" 30 | ], 31 | "metadata": { 32 | "id": "GlIV3iz-lFCi" 33 | }, 34 | "execution_count": 1, 35 | "outputs": [] 36 | }, 37 | { 38 | 
"cell_type": "markdown", 39 | "source": [ 40 | "## Step 1 – Run Basic Data Quality Checks" 41 | ], 42 | "metadata": { 43 | "id": "FXkRvvpkvj-t" 44 | } 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 2, 49 | "metadata": { 50 | "id": "Ac1OB3vRiY8z" 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "def check_data_quality(df):\n", 55 | " # Store initial data quality metrics\n", 56 | " quality_report = {\n", 57 | " 'missing_values': df.isnull().sum().to_dict(),\n", 58 | " 'duplicates': df.duplicated().sum(),\n", 59 | " 'total_rows': len(df),\n", 60 | " 'memory_usage': df.memory_usage().sum() / 1024**2 # in MB\n", 61 | " }\n", 62 | " return quality_report\n" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "source": [ 68 | "## Step 2 – Standardize Data Types" 69 | ], 70 | "metadata": { 71 | "id": "OXyrWXTGvqkY" 72 | } 73 | }, 74 | { 75 | "cell_type": "code", 76 | "source": [ 77 | "def standardize_datatypes(df):\n", 78 | " for column in df.columns:\n", 79 | " # Try converting string dates to datetime\n", 80 | " if df[column].dtype == 'object':\n", 81 | " try:\n", 82 | " df[column] = pd.to_datetime(df[column])\n", 83 | " print(f\"Converted {column} to datetime\")\n", 84 | " except ValueError:\n", 85 | " # Try converting to numeric if datetime fails\n", 86 | " try:\n", 87 | " df[column] = pd.to_numeric(df[column].str.replace('$', '').str.replace(',', ''))\n", 88 | " print(f\"Converted {column} to numeric\")\n", 89 | " except:\n", 90 | " pass\n", 91 | " return df\n" 92 | ], 93 | "metadata": { 94 | "id": "EyVN1pbwjz22" 95 | }, 96 | "execution_count": 3, 97 | "outputs": [] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "source": [ 102 | "## Step 3 – Handle Missing Values" 103 | ], 104 | "metadata": { 105 | "id": "94Sbs42VwMdi" 106 | } 107 | }, 108 | { 109 | "cell_type": "code", 110 | "source": [ 111 | "from sklearn.impute import SimpleImputer\n", 112 | "\n", 113 | "def handle_missing_values(df):\n", 114 | " # Handle numeric columns\n", 115 | " numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns\n", 116 | " if len(numeric_columns) > 0:\n", 117 | " num_imputer = SimpleImputer(strategy='median')\n", 118 | " df[numeric_columns] = num_imputer.fit_transform(df[numeric_columns])\n", 119 | "\n", 120 | " # Handle categorical columns\n", 121 | " categorical_columns = df.select_dtypes(include=['object']).columns\n", 122 | " if len(categorical_columns) > 0:\n", 123 | " cat_imputer = SimpleImputer(strategy='most_frequent')\n", 124 | " df[categorical_columns] = cat_imputer.fit_transform(df[categorical_columns])\n", 125 | "\n", 126 | " return df\n" 127 | ], 128 | "metadata": { 129 | "id": "W4lCuzTJkVRb" 130 | }, 131 | "execution_count": 4, 132 | "outputs": [] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "source": [ 137 | "## Step 4 – Detect and Handle Outliers" 138 | ], 139 | "metadata": { 140 | "id": "EyylQ0h1v2Ap" 141 | } 142 | }, 143 | { 144 | "cell_type": "code", 145 | "source": [ 146 | "def remove_outliers(df):\n", 147 | " numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns\n", 148 | " outliers_removed = {}\n", 149 | "\n", 150 | " for column in numeric_columns:\n", 151 | " Q1 = df[column].quantile(0.25)\n", 152 | " Q3 = df[column].quantile(0.75)\n", 153 | " IQR = Q3 - Q1\n", 154 | " lower_bound = Q1 - 1.5 * IQR\n", 155 | " upper_bound = Q3 + 1.5 * IQR\n", 156 | "\n", 157 | " # Count outliers before removing\n", 158 | " outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)].shape[0]\n", 159 | "\n", 160 | " # Cap the 
values instead of removing them\n", 161 | " df[column] = df[column].clip(lower=lower_bound, upper=upper_bound)\n", 162 | "\n", 163 | " if outliers > 0:\n", 164 | " outliers_removed[column] = outliers\n", 165 | "\n", 166 | " return df, outliers_removed\n" 167 | ], 168 | "metadata": { 169 | "id": "Hic0lH3pkaYy" 170 | }, 171 | "execution_count": 5, 172 | "outputs": [] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "source": [ 177 | "## Step 5 – Validate the Results" 178 | ], 179 | "metadata": { 180 | "id": "CCV0vKBcwVVB" 181 | } 182 | }, 183 | { 184 | "cell_type": "code", 185 | "source": [ 186 | "def validate_cleaning(df, original_shape, cleaning_report):\n", 187 | " validation_results = {\n", 188 | " 'rows_remaining': len(df),\n", 189 | " 'missing_values_remaining': df.isnull().sum().sum(),\n", 190 | " 'duplicates_remaining': df.duplicated().sum(),\n", 191 | " 'data_loss_percentage': (1 - len(df)/original_shape[0]) * 100\n", 192 | " }\n", 193 | "\n", 194 | " # Add validation results to the cleaning report\n", 195 | " cleaning_report['validation'] = validation_results\n", 196 | " return cleaning_report\n" 197 | ], 198 | "metadata": { 199 | "id": "5mCT72R8ke2r" 200 | }, 201 | "execution_count": 6, 202 | "outputs": [] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "source": [ 207 | "## Putting It All Together" 208 | ], 209 | "metadata": { 210 | "id": "W46kva14wX7B" 211 | } 212 | }, 213 | { 214 | "cell_type": "code", 215 | "source": [ 216 | "def automated_cleaning_pipeline(df):\n", 217 | " # Store original shape for reporting\n", 218 | " original_shape = df.shape\n", 219 | "\n", 220 | " # Initialize cleaning report\n", 221 | " cleaning_report = {}\n", 222 | "\n", 223 | " # Execute each step and collect metrics\n", 224 | " cleaning_report['initial_quality'] = check_data_quality(df)\n", 225 | "\n", 226 | " df = standardize_datatypes(df)\n", 227 | " df = handle_missing_values(df)\n", 228 | " df, outliers = remove_outliers(df)\n", 229 | " cleaning_report['outliers_removed'] = outliers\n", 230 | "\n", 231 | " # Validate and finalize report\n", 232 | " cleaning_report = validate_cleaning(df, original_shape, cleaning_report)\n", 233 | "\n", 234 | " return df, cleaning_report\n" 235 | ], 236 | "metadata": { 237 | "id": "ybHCRL8Dkmhz" 238 | }, 239 | "execution_count": 7, 240 | "outputs": [] 241 | } 242 | ] 243 | } -------------------------------------------------------------------------------- /pandas/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /pandas/pandas_data_quality_checks_one_liners.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [] 7 | }, 8 | "kernelspec": { 9 | "name": "python3", 10 | "display_name": "Python 3" 11 | }, 12 | "language_info": { 13 | "name": "python" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "colab": { 22 | "base_uri": "https://localhost:8080/" 23 | }, 24 | "id": "BqAJoDh7f_jn", 25 | "outputId": "5631b53b-a7bb-48b4-e6d4-62138ff69cb9" 26 | }, 27 | "outputs": [ 28 | { 29 | "output_type": "stream", 30 | "name": "stdout", 31 | "text": [ 32 | " TransactionID CustomerName Product Price Quantity TransactionDate\n", 33 | "0 101 Jane Rust Laptop 1200 1.0 2024-12-01\n", 34 | "1 102 june young Phone 800 2.0 2024/12/01\n", 35 | "2 
103 Jane Rust Laptop 1200 NaN 01-12-2024\n", 36 | "3 104 None Tablet -300 1.0 None\n", 37 | "4 105 JUNE YOUNG Phone 850 1.0 2024-12-01\n" 38 | ] 39 | } 40 | ], 41 | "source": [ 42 | "import pandas as pd\n", 43 | "import numpy as np\n", 44 | "\n", 45 | "# Sample e-commerce transaction data\n", 46 | "data = {\n", 47 | " \"TransactionID\": [101, 102, 103, 104, 105],\n", 48 | " \"CustomerName\": [\"Jane Rust\", \"june young\", \"Jane Rust\", None, \"JUNE YOUNG\"],\n", 49 | " \"Product\": [\"Laptop\", \"Phone\", \"Laptop\", \"Tablet\", \"Phone\"],\n", 50 | " \"Price\": [1200, 800, 1200, -300, 850], # Negative value indicates an issue\n", 51 | " \"Quantity\": [1, 2, None, 1,1], # Missing value\n", 52 | " \"TransactionDate\": [\"2024-12-01\", \"2024/12/01\", \"01-12-2024\", None, \"2024-12-01\"],\n", 53 | "}\n", 54 | "\n", 55 | "df = pd.DataFrame(data)\n", 56 | "\n", 57 | "# Display the DataFrame\n", 58 | "print(df)\n" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "source": [ 64 | "df.info()" 65 | ], 66 | "metadata": { 67 | "colab": { 68 | "base_uri": "https://localhost:8080/" 69 | }, 70 | "id": "HFs0bFP75P-S", 71 | "outputId": "f3d17322-2d06-4cbd-c125-2a340af1d51c" 72 | }, 73 | "execution_count": null, 74 | "outputs": [ 75 | { 76 | "output_type": "stream", 77 | "name": "stdout", 78 | "text": [ 79 | "\n", 80 | "RangeIndex: 5 entries, 0 to 4\n", 81 | "Data columns (total 6 columns):\n", 82 | " # Column Non-Null Count Dtype \n", 83 | "--- ------ -------------- ----- \n", 84 | " 0 TransactionID 5 non-null int64 \n", 85 | " 1 CustomerName 4 non-null object \n", 86 | " 2 Product 5 non-null object \n", 87 | " 3 Price 5 non-null int64 \n", 88 | " 4 Quantity 4 non-null float64\n", 89 | " 5 TransactionDate 4 non-null object \n", 90 | "dtypes: float64(1), int64(2), object(3)\n", 91 | "memory usage: 368.0+ bytes\n" 92 | ] 93 | } 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "source": [ 99 | "missing_values = df.isnull().sum()\n", 100 | "print(\"Missing Values:\\n\", missing_values)\n" 101 | ], 102 | "metadata": { 103 | "colab": { 104 | "base_uri": "https://localhost:8080/" 105 | }, 106 | "id": "gCthsJ2kgaa4", 107 | "outputId": "12e5ad1d-e9f0-41e4-a421-0caced3a384d" 108 | }, 109 | "execution_count": null, 110 | "outputs": [ 111 | { 112 | "output_type": "stream", 113 | "name": "stdout", 114 | "text": [ 115 | "Missing Values:\n", 116 | " TransactionID 0\n", 117 | "CustomerName 1\n", 118 | "Product 0\n", 119 | "Price 0\n", 120 | "Quantity 1\n", 121 | "TransactionDate 1\n", 122 | "dtype: int64\n" 123 | ] 124 | } 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "source": [ 130 | "print(\"Data Types:\\n\", df.dtypes)" 131 | ], 132 | "metadata": { 133 | "colab": { 134 | "base_uri": "https://localhost:8080/" 135 | }, 136 | "id": "WQthCnS0gcBi", 137 | "outputId": "d3f21204-399c-4fe3-9478-2450c2ee098e" 138 | }, 139 | "execution_count": null, 140 | "outputs": [ 141 | { 142 | "output_type": "stream", 143 | "name": "stdout", 144 | "text": [ 145 | "Data Types:\n", 146 | " TransactionID int64\n", 147 | "CustomerName object\n", 148 | "Product object\n", 149 | "Price int64\n", 150 | "Quantity float64\n", 151 | "TransactionDate object\n", 152 | "dtype: object\n" 153 | ] 154 | } 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "source": [ 160 | "df[\"TransactionDate\"] = pd.to_datetime(df[\"TransactionDate\"], errors=\"coerce\")\n", 161 | "print(df[\"TransactionDate\"])\n" 162 | ], 163 | "metadata": { 164 | "colab": { 165 | "base_uri": "https://localhost:8080/" 166 | }, 167 | "id": 
"Jy-p-Qy3gdx6", 168 | "outputId": "9b96fc62-b902-4026-94f2-255f7dee4674" 169 | }, 170 | "execution_count": null, 171 | "outputs": [ 172 | { 173 | "output_type": "stream", 174 | "name": "stdout", 175 | "text": [ 176 | "0 2024-12-01\n", 177 | "1 NaT\n", 178 | "2 NaT\n", 179 | "3 NaT\n", 180 | "4 2024-12-01\n", 181 | "Name: TransactionDate, dtype: datetime64[ns]\n" 182 | ] 183 | } 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "source": [ 189 | "outliers = df[df[\"Price\"] < 0]\n", 190 | "print(\"Outliers:\\n\", outliers)\n" 191 | ], 192 | "metadata": { 193 | "colab": { 194 | "base_uri": "https://localhost:8080/" 195 | }, 196 | "id": "i7smYGpvgfh4", 197 | "outputId": "277ad03b-1186-4380-dd51-6b6e534f0b82" 198 | }, 199 | "execution_count": null, 200 | "outputs": [ 201 | { 202 | "output_type": "stream", 203 | "name": "stdout", 204 | "text": [ 205 | "Outliers:\n", 206 | " TransactionID CustomerName Product Price Quantity TransactionDate\n", 207 | "3 104 None Tablet -300 1.0 NaT\n" 208 | ] 209 | } 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "source": [ 215 | "duplicates = df.duplicated(subset=[\"CustomerName\", \"Product\"], keep=False)\n", 216 | "print(\"Duplicate Records:\\n\", df[duplicates])\n" 217 | ], 218 | "metadata": { 219 | "colab": { 220 | "base_uri": "https://localhost:8080/" 221 | }, 222 | "id": "dIp2HOilghYL", 223 | "outputId": "7b5a5b29-c2d5-47d4-fe81-78cc3d3655c9" 224 | }, 225 | "execution_count": null, 226 | "outputs": [ 227 | { 228 | "output_type": "stream", 229 | "name": "stdout", 230 | "text": [ 231 | "Duplicate Records:\n", 232 | " TransactionID CustomerName Product Price Quantity TransactionDate\n", 233 | "0 101 Jane Rust Laptop 1200 1.0 2024-12-01\n", 234 | "2 103 Jane Rust Laptop 1200 NaN NaT\n" 235 | ] 236 | } 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "source": [ 242 | "df[\"CustomerName\"] = df[\"CustomerName\"].str.strip().str.title()\n", 243 | "print(df[\"CustomerName\"])\n" 244 | ], 245 | "metadata": { 246 | "colab": { 247 | "base_uri": "https://localhost:8080/" 248 | }, 249 | "id": "mWIW43kvgjIX", 250 | "outputId": "c53f8270-9b4a-4214-9be7-1b5a73cbc3bb" 251 | }, 252 | "execution_count": null, 253 | "outputs": [ 254 | { 255 | "output_type": "stream", 256 | "name": "stdout", 257 | "text": [ 258 | "0 Jane Rust\n", 259 | "1 June Young\n", 260 | "2 Jane Rust\n", 261 | "3 None\n", 262 | "4 June Young\n", 263 | "Name: CustomerName, dtype: object\n" 264 | ] 265 | } 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "source": [ 271 | "invalid_prices = df[~df[\"Price\"].between(0, 5000)]\n", 272 | "print(\"Invalid Prices:\\n\", invalid_prices)\n" 273 | ], 274 | "metadata": { 275 | "colab": { 276 | "base_uri": "https://localhost:8080/" 277 | }, 278 | "id": "C7ciFe6-gkxx", 279 | "outputId": "b2a73dff-c3d9-4a7a-f619-2c011e840aa3" 280 | }, 281 | "execution_count": null, 282 | "outputs": [ 283 | { 284 | "output_type": "stream", 285 | "name": "stdout", 286 | "text": [ 287 | "Invalid Prices:\n", 288 | " TransactionID CustomerName Product Price Quantity TransactionDate\n", 289 | "3 104 None Tablet -300 1.0 NaT\n" 290 | ] 291 | } 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "source": [ 297 | "unique_products = df[\"Product\"].value_counts()\n", 298 | "print(\"Unique Products:\\n\", unique_products)\n" 299 | ], 300 | "metadata": { 301 | "colab": { 302 | "base_uri": "https://localhost:8080/" 303 | }, 304 | "id": "U9_V1fJ_gmS-", 305 | "outputId": "c1842f2f-ebeb-4a17-a1a9-c0887336c207" 306 | }, 307 | "execution_count": null, 308 | 
"outputs": [ 309 | { 310 | "output_type": "stream", 311 | "name": "stdout", 312 | "text": [ 313 | "Unique Products:\n", 314 | " Product\n", 315 | "Laptop 2\n", 316 | "Phone 2\n", 317 | "Tablet 1\n", 318 | "Name: count, dtype: int64\n" 319 | ] 320 | } 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "source": [ 326 | "inconsistent_names = df[\"CustomerName\"].str.contains(r\"[A-Z]{2,}\", na=False)\n", 327 | "print(\"Inconsistent Formatting in Names:\\n\", df[inconsistent_names])\n" 328 | ], 329 | "metadata": { 330 | "colab": { 331 | "base_uri": "https://localhost:8080/" 332 | }, 333 | "id": "2tWS4iMmgn6B", 334 | "outputId": "334ae93d-6628-4237-adf7-d54e1023a577" 335 | }, 336 | "execution_count": null, 337 | "outputs": [ 338 | { 339 | "output_type": "stream", 340 | "name": "stdout", 341 | "text": [ 342 | "Inconsistent Formatting in Names:\n", 343 | " Empty DataFrame\n", 344 | "Columns: [TransactionID, CustomerName, Product, Price, Quantity, TransactionDate]\n", 345 | "Index: []\n" 346 | ] 347 | } 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "source": [ 353 | "issues = df.isnull().sum(axis=1) + (df[\"Price\"] < 0) + (~df[\"TransactionDate\"].notnull())\n", 354 | "problematic_rows = df[issues > 1]\n", 355 | "print(\"Rows with Multiple Issues:\\n\", problematic_rows)\n" 356 | ], 357 | "metadata": { 358 | "colab": { 359 | "base_uri": "https://localhost:8080/" 360 | }, 361 | "id": "ZJEQ8_5Ugp7x", 362 | "outputId": "630c82dd-82b3-4bb5-a0a3-9b883d0985a7" 363 | }, 364 | "execution_count": null, 365 | "outputs": [ 366 | { 367 | "output_type": "stream", 368 | "name": "stdout", 369 | "text": [ 370 | "Rows with Multiple Issues:\n", 371 | " TransactionID CustomerName Product Price Quantity TransactionDate\n", 372 | "1 102 June Young Phone 800 2.0 NaT\n", 373 | "2 103 Jane Rust Laptop 1200 NaN NaT\n", 374 | "3 104 None Tablet -300 1.0 NaT\n" 375 | ] 376 | } 377 | ] 378 | } 379 | ] 380 | } -------------------------------------------------------------------------------- /postgres/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /pyspark/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /pyspark/pyspark_data_cleaning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [] 7 | }, 8 | "kernelspec": { 9 | "name": "python3", 10 | "display_name": "Python 3" 11 | }, 12 | "language_info": { 13 | "name": "python" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "source": [ 20 | "## Install PySpark" 21 | ], 22 | "metadata": { 23 | "id": "EnkdF6a8IJNL" 24 | } 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": { 30 | "colab": { 31 | "base_uri": "https://localhost:8080/" 32 | }, 33 | "id": "40wRo96rr55a", 34 | "outputId": "77638389-c474-44c0-98e4-68932fa52e14" 35 | }, 36 | "outputs": [ 37 | { 38 | "output_type": "stream", 39 | "name": "stdout", 40 | "text": [ 41 | "Collecting pyspark\n", 42 | " Downloading pyspark-3.5.2.tar.gz (317.3 MB)\n", 43 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m317.3/317.3 MB\u001b[0m \u001b[31m5.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 44 | "\u001b[?25h Preparing metadata 
(setup.py) ... \u001b[?25l\u001b[?25hdone\n", 45 | "Requirement already satisfied: py4j==0.10.9.7 in /usr/local/lib/python3.10/dist-packages (from pyspark) (0.10.9.7)\n", 46 | "Building wheels for collected packages: pyspark\n", 47 | " Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 48 | " Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=6bd80e2df67a29c669daab45ed1eb501ce4a7f36d432bceb1d510132c890bcd0\n", 49 | " Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574\n", 50 | "Successfully built pyspark\n", 51 | "Installing collected packages: pyspark\n", 52 | "Successfully installed pyspark-3.5.2\n" 53 | ] 54 | } 55 | ], 56 | "source": [ 57 | "! pip3 install pyspark" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "source": [ 63 | "## 1. Start a PySpark Session" 64 | ], 65 | "metadata": { 66 | "id": "s5NnsPriK2WZ" 67 | } 68 | }, 69 | { 70 | "cell_type": "code", 71 | "source": [ 72 | "from pyspark.sql import SparkSession\n", 73 | "\n", 74 | "# Initialize a Spark session\n", 75 | "spark = SparkSession.builder \\\n", 76 | "\t.appName(\"DataCleaning\") \\\n", 77 | "\t.getOrCreate()\n" 78 | ], 79 | "metadata": { 80 | "id": "sjLGh31-zcjY" 81 | }, 82 | "execution_count": null, 83 | "outputs": [] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "source": [ 88 | "## 2. Generate a Sample Dataset" 89 | ], 90 | "metadata": { 91 | "id": "XQmyi3t9K-vm" 92 | } 93 | }, 94 | { 95 | "cell_type": "code", 96 | "source": [ 97 | "import random\n", 98 | "import pandas as pd\n", 99 | "\n", 100 | "# Function to generate random data with some missing values and duplicates\n", 101 | "def generate_data(n):\n", 102 | " customer_ids = [f'C{str(i).zfill(5)}' for i in range(1, 101)]\n", 103 | " product_categories = ['Electronics', 'Books', 'Clothing', 'Groceries', 'Furniture']\n", 104 | "\n", 105 | " data = []\n", 106 | " for i in range(n):\n", 107 | " customer_id = random.choice(customer_ids) if i % 10 != 0 else None # Introduce some missing values\n", 108 | " transaction_id = f'T{str(random.randint(10000, 99999))}'\n", 109 | " transaction_date = pd.Timestamp('2023-01-01') + pd.to_timedelta(random.randint(0, 180), unit='d')\n", 110 | " amount = round(random.uniform(5, 500), 2)\n", 111 | " product_category = random.choice(product_categories)\n", 112 | " data.append((customer_id, transaction_id, transaction_date, amount, product_category))\n", 113 | "\n", 114 | " # Introduce duplicates\n", 115 | " data.extend(data[:10])\n", 116 | "\n", 117 | " return data" 118 | ], 119 | "metadata": { 120 | "id": "7mZDk8Arzlhj" 121 | }, 122 | "execution_count": null, 123 | "outputs": [] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "source": [ 128 | "# Generate 10,000 rows of data\n", 129 | "data = generate_data(10_000)\n", 130 | "\n", 131 | "# Convert to a Pandas DataFrame and then to PySpark DataFrame\n", 132 | "columns = ['CustomerID', 'TransactionID', 'TransactionDate', 'Amount', 'ProductCategory']\n", 133 | "df = pd.DataFrame(data, columns=columns)\n", 134 | "spark_df = spark.createDataFrame(df)\n", 135 | "\n", 136 | "spark_df.show(5)\n" 137 | ], 138 | "metadata": { 139 | "colab": { 140 | "base_uri": "https://localhost:8080/" 141 | }, 142 | "id": "laHfRBDkzp1z", 143 | "outputId": "137cbd86-ede9-4cc5-bf4b-a753e12bfb4b" 144 | }, 145 | "execution_count": null, 146 | "outputs": [ 147 | { 148 | "output_type": "stream", 149 | "name": "stdout", 150 | "text": [ 151 | 
"+----------+-------------+-------------------+------+---------------+\n", 152 | "|CustomerID|TransactionID| TransactionDate|Amount|ProductCategory|\n", 153 | "+----------+-------------+-------------------+------+---------------+\n", 154 | "| NULL| T17203|2023-03-20 00:00:00|221.92| Books|\n", 155 | "| NULL| T17203|2023-03-20 00:00:00|221.92| Books|\n", 156 | "| C00058| T63296|2023-02-11 00:00:00|157.92| Groceries|\n", 157 | "| NULL| T17203|2023-03-20 00:00:00|221.92| Books|\n", 158 | "| NULL| T17203|2023-03-20 00:00:00|221.92| Books|\n", 159 | "+----------+-------------+-------------------+------+---------------+\n", 160 | "only showing top 5 rows\n", 161 | "\n" 162 | ] 163 | } 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "source": [ 169 | "spark_df.dtypes" 170 | ], 171 | "metadata": { 172 | "colab": { 173 | "base_uri": "https://localhost:8080/" 174 | }, 175 | "id": "Wz3u7w8R8eQF", 176 | "outputId": "181b8c62-3d8f-4585-ed60-1c2adb33b8f5" 177 | }, 178 | "execution_count": null, 179 | "outputs": [ 180 | { 181 | "output_type": "execute_result", 182 | "data": { 183 | "text/plain": [ 184 | "[('CustomerID', 'string'),\n", 185 | " ('TransactionID', 'string'),\n", 186 | " ('TransactionDate', 'date'),\n", 187 | " ('Amount', 'double'),\n", 188 | " ('ProductCategory', 'string')]" 189 | ] 190 | }, 191 | "metadata": {}, 192 | "execution_count": 9 193 | } 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "source": [ 199 | "## 3. Handle Missing Values" 200 | ], 201 | "metadata": { 202 | "id": "nrouSi23Dw_I" 203 | } 204 | }, 205 | { 206 | "cell_type": "code", 207 | "source": [ 208 | "# Fill missing CustomerID with a default value\n", 209 | "spark_df = spark_df.fillna({\"CustomerID\": \"Unknown\"})\n" 210 | ], 211 | "metadata": { 212 | "id": "ku6AdAO6z9PA" 213 | }, 214 | "execution_count": null, 215 | "outputs": [] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "source": [ 220 | "## 4. Remove Duplicates" 221 | ], 222 | "metadata": { 223 | "id": "tuP2lpcYD6Nu" 224 | } 225 | }, 226 | { 227 | "cell_type": "code", 228 | "source": [ 229 | "from pyspark.sql.functions import col, min, max\n", 230 | "\n", 231 | "# Normalize the 'Amount' column\n", 232 | "min_amount = spark_df.agg(min(col(\"Amount\"))).collect()[0][0]\n", 233 | "max_amount = spark_df.agg(max(col(\"Amount\"))).collect()[0][0]\n", 234 | "\n", 235 | "spark_df = spark_df.withColumn(\"Amount\", (col(\"Amount\") - min_amount) / (max_amount - min_amount))\n" 236 | ], 237 | "metadata": { 238 | "id": "eomZcsnW0HCY" 239 | }, 240 | "execution_count": null, 241 | "outputs": [] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "source": [ 246 | "## 5. Transform Columns" 247 | ], 248 | "metadata": { 249 | "id": "nzJzlkajIzkF" 250 | } 251 | }, 252 | { 253 | "cell_type": "code", 254 | "source": [ 255 | "from pyspark.sql.functions import col, min, max\n", 256 | "\n", 257 | "# Normalize the 'Amount' column\n", 258 | "min_amount = spark_df.agg(min(col(\"Amount\"))).collect()[0][0]\n", 259 | "max_amount = spark_df.agg(max(col(\"Amount\"))).collect()[0][0]\n", 260 | "\n", 261 | "spark_df = spark_df.withColumn(\"Amount\", (col(\"Amount\") - min_amount) / (max_amount - min_amount))" 262 | ], 263 | "metadata": { 264 | "id": "5TPaJLV3I1r0" 265 | }, 266 | "execution_count": null, 267 | "outputs": [] 268 | }, 269 | { 270 | "cell_type": "markdown", 271 | "source": [ 272 | "## 6. 
Handle Outliers" 273 | ], 274 | "metadata": { 275 | "id": "y0JtNxWVJCju" 276 | } 277 | }, 278 | { 279 | "cell_type": "code", 280 | "source": [ 281 | "from pyspark.sql.functions import col, expr\n", 282 | "\n", 283 | "# Calculate Q1, Q3, and IQR\n", 284 | "quantiles = spark_df.approxQuantile(\"Amount\", [0.25, 0.75], 0.05)\n", 285 | "Q1 = quantiles[0]\n", 286 | "Q3 = quantiles[1]\n", 287 | "IQR = Q3 - Q1\n", 288 | "\n", 289 | "# Define the upper and lower bounds\n", 290 | "lower_bound = Q1 - 1.5 * IQR\n", 291 | "upper_bound = Q3 + 1.5 * IQR\n", 292 | "\n", 293 | "# Filter out the outliers\n", 294 | "spark_df = spark_df.filter((col(\"Amount\") >= lower_bound) & (col(\"Amount\") <= upper_bound))" 295 | ], 296 | "metadata": { 297 | "id": "KsPQQuGrJHYa" 298 | }, 299 | "execution_count": null, 300 | "outputs": [] 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "source": [ 305 | "\n", 306 | "## 7. Convert Data Types" 307 | ], 308 | "metadata": { 309 | "id": "8mpXH0cdJL8b" 310 | } 311 | }, 312 | { 313 | "cell_type": "code", 314 | "source": [ 315 | "from pyspark.sql.functions import to_date\n", 316 | "\n", 317 | "# Convert 'TransactionDate' to date format\n", 318 | "# (not quite needed for this dataset)\n", 319 | "spark_df = spark_df.withColumn(\"TransactionDate\", to_date(col(\"TransactionDate\")))\n" 320 | ], 321 | "metadata": { 322 | "id": "46QEuADu0LIS" 323 | }, 324 | "execution_count": null, 325 | "outputs": [] 326 | } 327 | ] 328 | } -------------------------------------------------------------------------------- /pyspark/pyspark_read_csv.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [] 7 | }, 8 | "kernelspec": { 9 | "name": "python3", 10 | "display_name": "Python 3" 11 | }, 12 | "language_info": { 13 | "name": "python" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "source": [ 20 | "## Install PySpark" 21 | ], 22 | "metadata": { 23 | "id": "5oNZgpo8Ljzu" 24 | } 25 | }, 26 | { 27 | "cell_type": "code", 28 | "source": [ 29 | "! pip3 install pyspark" 30 | ], 31 | "metadata": { 32 | "colab": { 33 | "base_uri": "https://localhost:8080/" 34 | }, 35 | "id": "T4Se5rOKKjzk", 36 | "outputId": "223f9be2-a51e-4006-e89d-b077b3d546c3" 37 | }, 38 | "execution_count": 1, 39 | "outputs": [ 40 | { 41 | "output_type": "stream", 42 | "name": "stdout", 43 | "text": [ 44 | "Collecting pyspark\n", 45 | " Downloading pyspark-3.5.2.tar.gz (317.3 MB)\n", 46 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m317.3/317.3 MB\u001b[0m \u001b[31m3.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 47 | "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 48 | "Requirement already satisfied: py4j==0.10.9.7 in /usr/local/lib/python3.10/dist-packages (from pyspark) (0.10.9.7)\n", 49 | "Building wheels for collected packages: pyspark\n", 50 | " Building wheel for pyspark (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", 51 | " Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=bbfd80a589ea8e2302f3938fd11b4434a84633b28244a0229ecf62245ae601d1\n", 52 | " Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574\n", 53 | "Successfully built pyspark\n", 54 | "Installing collected packages: pyspark\n", 55 | "Successfully installed pyspark-3.5.2\n" 56 | ] 57 | } 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "source": [ 63 | "## 1. Start a PySpark Session\n" 64 | ], 65 | "metadata": { 66 | "id": "myuR4RIqLmvp" 67 | } 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 2, 72 | "metadata": { 73 | "id": "caz_ZJqBKXU2" 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "from pyspark.sql import SparkSession\n", 78 | "\n", 79 | "# Initialize a Spark session\n", 80 | "spark = SparkSession.builder \\\n", 81 | "\t.appName(\"ReadCSV\") \\\n", 82 | "\t.getOrCreate()\n" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "source": [ 88 | "## 2. Generate a Sample CSV File" 89 | ], 90 | "metadata": { 91 | "id": "zv3SAft7Ltuk" 92 | } 93 | }, 94 | { 95 | "cell_type": "code", 96 | "source": [ 97 | "import random\n", 98 | "import pandas as pd\n", 99 | "\n", 100 | "# Function to generate random transaction data\n", 101 | "def generate_data(n):\n", 102 | " customer_ids = [f'C{str(i).zfill(5)}' for i in range(1, 101)]\n", 103 | " product_categories = ['Electronics', 'Books', 'Clothing', 'Groceries', 'Furniture']\n", 104 | "\n", 105 | " data = []\n", 106 | " for _ in range(n):\n", 107 | " customer_id = random.choice(customer_ids)\n", 108 | " transaction_id = f'T{str(random.randint(10000, 99999))}'\n", 109 | " transaction_date = pd.Timestamp('2023-01-01') + pd.to_timedelta(random.randint(0, 180), unit='d')\n", 110 | " amount = round(random.uniform(5, 500), 2)\n", 111 | " product_category = random.choice(product_categories)\n", 112 | " data.append((customer_id, transaction_id, transaction_date, amount, product_category))\n", 113 | "\n", 114 | " return data\n", 115 | "\n", 116 | "# Generate 10000 rows of transaction data\n", 117 | "data = generate_data(10_000)\n", 118 | "\n", 119 | "# Convert to a Pandas DataFrame\n", 120 | "columns = ['CustomerID', 'TransactionID', 'TransactionDate', 'Amount', 'ProductCategory']\n", 121 | "df = pd.DataFrame(data, columns=columns)\n", 122 | "\n", 123 | "# Create the CSV file\n", 124 | "csv_path = \"sample_transactions.csv\"\n", 125 | "df.to_csv(csv_path, index=False)\n", 126 | "\n", 127 | "print(f\"Sample CSV file '{csv_path}' generated.\")" 128 | ], 129 | "metadata": { 130 | "colab": { 131 | "base_uri": "https://localhost:8080/" 132 | }, 133 | "id": "RRiKSPCyKaah", 134 | "outputId": "0aa43d92-c219-4647-94c3-767f49bf9333" 135 | }, 136 | "execution_count": 3, 137 | "outputs": [ 138 | { 139 | "output_type": "stream", 140 | "name": "stdout", 141 | "text": [ 142 | "Sample CSV file 'sample_transactions.csv' generated.\n" 143 | ] 144 | } 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "source": [ 150 | "## 3. 
Read the CSV File into a PySpark DataFrame" 151 | ], 152 | "metadata": { 153 | "id": "g4PT5XIiLw8U" 154 | } 155 | }, 156 | { 157 | "cell_type": "code", 158 | "source": [ 159 | "spark_df = spark.read.csv(csv_path, header=True, inferSchema=True)\n", 160 | "\n", 161 | "# Show the first 5 rows\n", 162 | "spark_df.show(5)\n" 163 | ], 164 | "metadata": { 165 | "colab": { 166 | "base_uri": "https://localhost:8080/" 167 | }, 168 | "id": "7kGSq4k8KeZL", 169 | "outputId": "7b5677c9-629b-4e3e-dab7-fced000ca15e" 170 | }, 171 | "execution_count": 4, 172 | "outputs": [ 173 | { 174 | "output_type": "stream", 175 | "name": "stdout", 176 | "text": [ 177 | "+----------+-------------+---------------+------+---------------+\n", 178 | "|CustomerID|TransactionID|TransactionDate|Amount|ProductCategory|\n", 179 | "+----------+-------------+---------------+------+---------------+\n", 180 | "| C00006| T58996| 2023-01-09| 17.02| Furniture|\n", 181 | "| C00076| T30519| 2023-02-28|459.67| Books|\n", 182 | "| C00076| T89246| 2023-06-10|404.95| Clothing|\n", 183 | "| C00049| T11436| 2023-06-05| 103.9| Books|\n", 184 | "| C00049| T18176| 2023-04-03|406.55| Furniture|\n", 185 | "+----------+-------------+---------------+------+---------------+\n", 186 | "only showing top 5 rows\n", 187 | "\n" 188 | ] 189 | } 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "source": [ 195 | "## 4. Exploring the DataFrame" 196 | ], 197 | "metadata": { 198 | "id": "_RIITip_L5GO" 199 | } 200 | }, 201 | { 202 | "cell_type": "code", 203 | "source": [ 204 | "# Print the schema of the DataFrame\n", 205 | "spark_df.printSchema()\n" 206 | ], 207 | "metadata": { 208 | "colab": { 209 | "base_uri": "https://localhost:8080/" 210 | }, 211 | "id": "R0xFp27-KgXs", 212 | "outputId": "a06af877-c1a1-418b-a0d0-4be8a01f3377" 213 | }, 214 | "execution_count": 10, 215 | "outputs": [ 216 | { 217 | "output_type": "stream", 218 | "name": "stdout", 219 | "text": [ 220 | "root\n", 221 | " |-- CustomerID: string (nullable = true)\n", 222 | " |-- TransactionID: string (nullable = true)\n", 223 | " |-- TransactionDate: date (nullable = true)\n", 224 | " |-- Amount: double (nullable = true)\n", 225 | " |-- ProductCategory: string (nullable = true)\n", 226 | "\n" 227 | ] 228 | } 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "source": [ 234 | "from pyspark.sql.functions import col\n", 235 | "\n", 236 | "# Filter transactions with an Amount greater than 100\n", 237 | "filtered_df = spark_df.filter(col(\"Amount\") > 100)\n", 238 | "\n", 239 | "# Select specific columns\n", 240 | "selected_df = filtered_df.select(\"CustomerID\", \"TransactionID\", \"Amount\")\n", 241 | "\n", 242 | "# Show the results\n", 243 | "selected_df.show(5)" 244 | ], 245 | "metadata": { 246 | "colab": { 247 | "base_uri": "https://localhost:8080/" 248 | }, 249 | "id": "ua9R3BnoKiiQ", 250 | "outputId": "e19be35b-238c-4f8b-de13-e7a7efcd5afc" 251 | }, 252 | "execution_count": 11, 253 | "outputs": [ 254 | { 255 | "output_type": "stream", 256 | "name": "stdout", 257 | "text": [ 258 | "+----------+-------------+------+\n", 259 | "|CustomerID|TransactionID|Amount|\n", 260 | "+----------+-------------+------+\n", 261 | "| C00076| T30519|459.67|\n", 262 | "| C00076| T89246|404.95|\n", 263 | "| C00049| T11436| 103.9|\n", 264 | "| C00049| T18176|406.55|\n", 265 | "| C00096| T31087|349.47|\n", 266 | "+----------+-------------+------+\n", 267 | "only showing top 5 rows\n", 268 | "\n" 269 | ] 270 | } 271 | ] 272 | } 273 | ] 274 | } 
-------------------------------------------------------------------------------- /pyspark/pyspark_write_parquet.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [] 7 | }, 8 | "kernelspec": { 9 | "name": "python3", 10 | "display_name": "Python 3" 11 | }, 12 | "language_info": { 13 | "name": "python" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "source": [ 20 | "## Install PySpark" 21 | ], 22 | "metadata": { 23 | "id": "rsnBVqAGEUX-" 24 | } 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 1, 29 | "metadata": { 30 | "colab": { 31 | "base_uri": "https://localhost:8080/" 32 | }, 33 | "id": "nmFnWlbOB6lk", 34 | "outputId": "157e06fd-ad57-4541-dec5-3052ad9563fd" 35 | }, 36 | "outputs": [ 37 | { 38 | "output_type": "stream", 39 | "name": "stdout", 40 | "text": [ 41 | "Collecting pyspark\n", 42 | " Downloading pyspark-3.5.2.tar.gz (317.3 MB)\n", 43 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m317.3/317.3 MB\u001b[0m \u001b[31m2.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 44 | "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 45 | "Requirement already satisfied: py4j==0.10.9.7 in /usr/local/lib/python3.10/dist-packages (from pyspark) (0.10.9.7)\n", 46 | "Building wheels for collected packages: pyspark\n", 47 | " Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 48 | " Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=bb3917a42031cc2863a57c06e481281d96d8b540fd03498d256aff244f0d14ae\n", 49 | " Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574\n", 50 | "Successfully built pyspark\n", 51 | "Installing collected packages: pyspark\n", 52 | "Successfully installed pyspark-3.5.2\n" 53 | ] 54 | } 55 | ], 56 | "source": [ 57 | "! pip install pyspark" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "source": [ 63 | "## 1. Start a PySpark Session" 64 | ], 65 | "metadata": { 66 | "id": "rcqidYgeEX69" 67 | } 68 | }, 69 | { 70 | "cell_type": "code", 71 | "source": [ 72 | "from pyspark.sql import SparkSession\n", 73 | "\n", 74 | "# Initialize a Spark session\n", 75 | "spark = SparkSession.builder \\\n", 76 | "\t.appName(\"WriteToParquet\") \\\n", 77 | "\t.getOrCreate()\n" 78 | ], 79 | "metadata": { 80 | "id": "PCL06Q0oB92P" 81 | }, 82 | "execution_count": 2, 83 | "outputs": [] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "source": [ 88 | "## 2. 
Generating a Sample Dataset" 89 | ], 90 | "metadata": { 91 | "id": "ngfWs7IKEbvs" 92 | } 93 | }, 94 | { 95 | "cell_type": "code", 96 | "source": [ 97 | "import random\n", 98 | "import pandas as pd\n", 99 | "\n", 100 | "# Function to generate random transaction data\n", 101 | "def generate_data(n):\n", 102 | " customer_ids = [f'C{str(i).zfill(5)}' for i in range(1, 101)]\n", 103 | " product_categories = ['Electronics', 'Books', 'Clothing', 'Groceries', 'Furniture']\n", 104 | "\n", 105 | " data = []\n", 106 | " for _ in range(n):\n", 107 | " customer_id = random.choice(customer_ids)\n", 108 | " transaction_id = f'T{str(random.randint(10000, 99999))}'\n", 109 | " transaction_date = pd.Timestamp('2023-01-01') + pd.to_timedelta(random.randint(0, 180), unit='d')\n", 110 | " amount = round(random.uniform(5, 500), 2)\n", 111 | " product_category = random.choice(product_categories)\n", 112 | " data.append((customer_id, transaction_id, transaction_date, amount, product_category))\n", 113 | "\n", 114 | " return data\n" 115 | ], 116 | "metadata": { 117 | "id": "RIOrcXxoCCoc" 118 | }, 119 | "execution_count": 3, 120 | "outputs": [] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "source": [ 125 | "# Generate 100,000 rows of transaction data\n", 126 | "data = generate_data(100_000)\n", 127 | "\n", 128 | "# Convert to a Pandas DataFrame\n", 129 | "columns = ['CustomerID', 'TransactionID', 'TransactionDate', 'Amount', 'ProductCategory']\n", 130 | "df = pd.DataFrame(data, columns=columns)\n", 131 | "\n", 132 | "# Convert to a PySpark DataFrame\n", 133 | "spark_df = spark.createDataFrame(df)\n", 134 | "spark_df.show(5)\n" 135 | ], 136 | "metadata": { 137 | "colab": { 138 | "base_uri": "https://localhost:8080/" 139 | }, 140 | "id": "COM3xnQsCE7l", 141 | "outputId": "e8850727-55a3-4b8c-fb68-913195b81b26" 142 | }, 143 | "execution_count": 4, 144 | "outputs": [ 145 | { 146 | "output_type": "stream", 147 | "name": "stdout", 148 | "text": [ 149 | "+----------+-------------+-------------------+------+---------------+\n", 150 | "|CustomerID|TransactionID| TransactionDate|Amount|ProductCategory|\n", 151 | "+----------+-------------+-------------------+------+---------------+\n", 152 | "| C00012| T36462|2023-05-05 00:00:00| 90.91| Furniture|\n", 153 | "| C00037| T81031|2023-03-19 00:00:00|465.54| Electronics|\n", 154 | "| C00092| T98628|2023-02-25 00:00:00| 180.9| Clothing|\n", 155 | "| C00050| T46850|2023-04-16 00:00:00|494.67| Furniture|\n", 156 | "| C00097| T79766|2023-04-11 00:00:00|179.65| Groceries|\n", 157 | "+----------+-------------+-------------------+------+---------------+\n", 158 | "only showing top 5 rows\n", 159 | "\n" 160 | ] 161 | } 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "source": [ 167 | "## 3. Writing DataFrames to Parquet Files" 168 | ], 169 | "metadata": { 170 | "id": "QXoDo2PdEetE" 171 | } 172 | }, 173 | { 174 | "cell_type": "code", 175 | "source": [ 176 | "# Specify the path to the Parquet file\n", 177 | "output_path = \"transactions.parquet\"\n", 178 | "\n", 179 | "# Write the DataFrame to Parquet format\n", 180 | "spark_df.write.parquet(output_path)\n" 181 | ], 182 | "metadata": { 183 | "id": "USrdKz9zCH3-" 184 | }, 185 | "execution_count": 5, 186 | "outputs": [] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "source": [ 191 | "! 
ls" 192 | ], 193 | "metadata": { 194 | "colab": { 195 | "base_uri": "https://localhost:8080/" 196 | }, 197 | "id": "nWc4bKgJCKek", 198 | "outputId": "a522d32b-831e-42b9-d38e-027d7bea5123" 199 | }, 200 | "execution_count": 6, 201 | "outputs": [ 202 | { 203 | "output_type": "stream", 204 | "name": "stdout", 205 | "text": [ 206 | "sample_data transactions.parquet\n" 207 | ] 208 | } 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "source": [ 214 | "## 4. Writing Partitioned Parquet Files\n" 215 | ], 216 | "metadata": { 217 | "id": "jCzZVwHlElYG" 218 | } 219 | }, 220 | { 221 | "cell_type": "code", 222 | "source": [ 223 | "# Write the dataframe to Parquet format, partitioned by 'ProductCategory'\n", 224 | "partitioned_output_path = \"transactions_partitioned.parquet\"\n", 225 | "spark_df.write.partitionBy(\"ProductCategory\").parquet(partitioned_output_path)\n" 226 | ], 227 | "metadata": { 228 | "id": "AE9MaBStCLXH" 229 | }, 230 | "execution_count": 7, 231 | "outputs": [] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "source": [ 236 | "! ls" 237 | ], 238 | "metadata": { 239 | "colab": { 240 | "base_uri": "https://localhost:8080/" 241 | }, 242 | "id": "n3JMnfYIDoaP", 243 | "outputId": "cda8bdc6-f319-4fcf-fddc-1a7aeca95b40" 244 | }, 245 | "execution_count": 8, 246 | "outputs": [ 247 | { 248 | "output_type": "stream", 249 | "name": "stdout", 250 | "text": [ 251 | "sample_data transactions.parquet transactions_partitioned.parquet\n" 252 | ] 253 | } 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "source": [ 259 | "! ls transactions_partitioned.parquet" 260 | ], 261 | "metadata": { 262 | "colab": { 263 | "base_uri": "https://localhost:8080/" 264 | }, 265 | "id": "dHS4VW8qD8nI", 266 | "outputId": "3acd5f08-79d8-4a69-9667-adc1f07ad114" 267 | }, 268 | "execution_count": 10, 269 | "outputs": [ 270 | { 271 | "output_type": "stream", 272 | "name": "stdout", 273 | "text": [ 274 | "'ProductCategory=Books' 'ProductCategory=Electronics' 'ProductCategory=Groceries'\n", 275 | "'ProductCategory=Clothing' 'ProductCategory=Furniture' _SUCCESS\n" 276 | ] 277 | } 278 | ] 279 | }, 280 | { 281 | "cell_type": "markdown", 282 | "source": [ 283 | "## 5. 
Reading Parquet Files" 284 | ], 285 | "metadata": { 286 | "id": "gJdb3rHwEosV" 287 | } 288 | }, 289 | { 290 | "cell_type": "code", 291 | "source": [ 292 | "# Read in the Parquet file\n", 293 | "df_read = spark.read.parquet(output_path)\n", 294 | "\n", 295 | "# Show the content of the DataFrame\n", 296 | "df_read.show(5)\n" 297 | ], 298 | "metadata": { 299 | "colab": { 300 | "base_uri": "https://localhost:8080/" 301 | }, 302 | "id": "xyH_5SMxCNww", 303 | "outputId": "2091c0ba-0c2a-4552-b07a-f439d4ce9502" 304 | }, 305 | "execution_count": 11, 306 | "outputs": [ 307 | { 308 | "output_type": "stream", 309 | "name": "stdout", 310 | "text": [ 311 | "+----------+-------------+-------------------+------+---------------+\n", 312 | "|CustomerID|TransactionID| TransactionDate|Amount|ProductCategory|\n", 313 | "+----------+-------------+-------------------+------+---------------+\n", 314 | "| C00012| T36462|2023-05-05 00:00:00| 90.91| Furniture|\n", 315 | "| C00037| T81031|2023-03-19 00:00:00|465.54| Electronics|\n", 316 | "| C00092| T98628|2023-02-25 00:00:00| 180.9| Clothing|\n", 317 | "| C00050| T46850|2023-04-16 00:00:00|494.67| Furniture|\n", 318 | "| C00097| T79766|2023-04-11 00:00:00|179.65| Groceries|\n", 319 | "+----------+-------------+-------------------+------+---------------+\n", 320 | "only showing top 5 rows\n", 321 | "\n" 322 | ] 323 | } 324 | ] 325 | } 326 | ] 327 | } -------------------------------------------------------------------------------- /regex/learn_regex.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | text = "Data science is cool as you get to work with real-world data" 4 | matches = re.findall(r"data", text) 5 | print(matches) 6 | 7 | matches = re.findall(r"data", text, re.IGNORECASE) 8 | print(matches) 9 | 10 | text = "The cat sat on the mat. The bat flew over the rat." 11 | pattern = r"The ... " 12 | matches = re.findall(pattern, text) 13 | print(matches) 14 | 15 | text = "The cat sat on the mat. The bat flew over the rat." 16 | pattern = r"[cb]at" 17 | matches = re.findall(pattern, text) 18 | print(matches) 19 | 20 | 21 | # Find all lowercase words that start with a-d 22 | pattern = r"\b[a-d][a-z]*\b" 23 | text = "apple banana cherry date elephant fig grape kiwi lemon mango orange" 24 | matches = re.findall(pattern, text) 25 | print(matches) 26 | 27 | 28 | text = "Contact: john.doe@example.com" 29 | pattern = r"(?P[\w.]+)@(?P[\w.]+)" 30 | 31 | match = re.search(pattern, text) 32 | if match: 33 | print(f"Username: {match.group('username')}") 34 | print(f"Domain: {match.group('domain')}") 35 | 36 | 37 | 38 | text = "Phone numbers: 555-1234, 555-5678, 5551234" 39 | pattern = r"\b\d{3}-?\d{4}\b" 40 | matches = re.findall(pattern, text) 41 | print(matches) 42 | 43 | 44 | 45 | text = "Python is popular in data science." 
46 | 47 | # ^ anchors to the start of the string 48 | start_matches = re.findall(r"^Python", text) 49 | print(start_matches) 50 | 51 | # $ anchors to the end of the string 52 | end_matches = re.findall(r"science\.$", text) 53 | print(end_matches) 54 | 55 | text = "Dates: 2023-10-15, 2022-05-22" 56 | pattern = r"(\d{4})-(\d{2})-(\d{2})" 57 | 58 | # findall returns tuples of the captured groups 59 | matches = re.findall(pattern, text) 60 | print(matches) 61 | 62 | # You can use these to create structured data 63 | for year, month, day in matches: 64 | print(f"Year: {year}, Month: {month}, Day: {day}") 65 | 66 | 67 | text = "Contact: john.doe@example.com" 68 | pattern = r"(?P[\w.]+)@(?P[\w.]+)" 69 | 70 | match = re.search(pattern, text) 71 | if match: 72 | print(f"Username: {match.group('username')}") 73 | print(f"Domain: {match.group('domain')}") 74 | -------------------------------------------------------------------------------- /regex/quick-ref-regex.md: -------------------------------------------------------------------------------- 1 | # Regular Expressions Quick Reference Table 2 | 3 | ## Basic Metacharacters 4 | 5 | | Character | Description | Example | Matches | 6 | |-----------|-------------|---------|---------| 7 | | `.` | Any character except newline | `a.b` | "acb", "adb", "a3b", etc. | 8 | | `^` | Start of string | `^Hello` | "Hello world" but not "Say Hello" | 9 | | `$` | End of string | `world$` | "Hello world" but not "world class" | 10 | | `*` | 0 or more repetitions | `ab*c` | "ac", "abc", "abbc", etc. | 11 | | `+` | 1 or more repetitions | `ab+c` | "abc", "abbc", etc. but not "ac" | 12 | | `?` | 0 or 1 repetition | `ab?c` | "ac", "abc" but not "abbc" | 13 | | `{n}` | Exactly n repetitions | `a{3}` | "aaa" | 14 | | `{m,n}` | m to n repetitions | `a{2,4}` | "aa", "aaa", "aaaa" | 15 | | `{m,}` | m or more repetitions | `a{2,}` | "aa", "aaa", "aaaa", etc. | 16 | | `\` | Escape character | `\.` | Literal period "." | 17 | | `[]` | Character class | `[abc]` | "a", "b", or "c" | 18 | | `\|` | Alternation (OR) | `cat\|dog` | "cat" or "dog" | 19 | | `()` | Grouping | `(ab)+` | "ab", "abab", etc. 
| 20 | 21 | ## Character Classes 22 | 23 | | Expression | Description | Equivalent | 24 | |------------|-------------|------------| 25 | | `\d` | Any digit | `[0-9]` | 26 | | `\D` | Any non-digit | `[^0-9]` | 27 | | `\w` | Any word character | `[a-zA-Z0-9_]` | 28 | | `\W` | Any non-word character | `[^a-zA-Z0-9_]` | 29 | | `\s` | Any whitespace | `[ \t\n\r\f\v]` | 30 | | `\S` | Any non-whitespace | `[^ \t\n\r\f\v]` | 31 | | `[abc]` | Any of listed characters | - | 32 | | `[^abc]` | Any character except listed | - | 33 | | `[a-z]` | Any character in range | - | 34 | 35 | ## Assertions 36 | 37 | | Expression | Description | 38 | |------------|-------------| 39 | | `(?=...)` | Positive lookahead | 40 | | `(?!...)` | Negative lookahead | 41 | | `(?<=...)` | Positive lookbehind | 42 | | `(?[\\w.]+)@(?P[\\w.]+)\"\n", 298 | "\n", 299 | "match = re.search(pattern, text)\n", 300 | "if match:\n", 301 | " print(f\"Username: {match.group('username')}\")\n", 302 | " print(f\"Domain: {match.group('domain')}\")\n" 303 | ], 304 | "metadata": { 305 | "colab": { 306 | "base_uri": "https://localhost:8080/" 307 | }, 308 | "id": "L9FOB0uiLexp", 309 | "outputId": "c73b8c7f-af37-4016-b404-d9ad7502514f" 310 | }, 311 | "execution_count": 24, 312 | "outputs": [ 313 | { 314 | "output_type": "stream", 315 | "name": "stdout", 316 | "text": [ 317 | "Username: john.doe\n", 318 | "Domain: example.com\n" 319 | ] 320 | } 321 | ] 322 | } 323 | ] 324 | } -------------------------------------------------------------------------------- /regex/regex_contd.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [] 7 | }, 8 | "kernelspec": { 9 | "name": "python3", 10 | "display_name": "Python 3" 11 | }, 12 | "language_info": { 13 | "name": "python" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "id": "VxllMcawIjlB" 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "import re" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "source": [ 31 | "text = \"
<div>First content</div><div>Second content</div>\"\n", 32 | "\n", 33 | "# Greedy matching (default)\n", 34 | "greedy = re.findall(r\"<div>(.*)</div>\", text)\n", 35 | "print(f\"Greedy: {greedy}\")\n", 36 | "\n", 37 | "# Non-greedy matching\n", 38 | "non_greedy = re.findall(r\"<div>(.*?)</div>\", text)\n", 39 | "print(f\"Non-greedy: {non_greedy}\")\n" 40 | ], 41 | "metadata": { 42 | "colab": { 43 | "base_uri": "https://localhost:8080/" 44 | }, 45 | "id": "Cb35InjGIzW3", 46 | "outputId": "cf4f7b63-8e9c-4ad0-f4f7-a0c6bbd52a03" 47 | }, 48 | "execution_count": 2, 49 | "outputs": [ 50 | { 51 | "output_type": "stream", 52 | "name": "stdout", 53 | "text": [ 54 | "Greedy: ['First content</div><div>
Second content']\n", 55 | "Non-greedy: ['First content', 'Second content']\n" 56 | ] 57 | } 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "source": [ 63 | "# Password validation\n", 64 | "password = \"Password123\"\n", 65 | "has_uppercase = bool(re.search(r\"(?=.*[A-Z])\", password))\n", 66 | "has_lowercase = bool(re.search(r\"(?=.*[a-z])\", password))\n", 67 | "has_digit = bool(re.search(r\"(?=.*\\d)\", password))\n", 68 | "is_long_enough = len(password) >= 8\n", 69 | "\n", 70 | "if all([has_uppercase, has_lowercase, has_digit, is_long_enough]):\n", 71 | " print(\"Password meets requirements\")\n", 72 | "else:\n", 73 | " print(\"Password does not meet all requirements\")\n" 74 | ], 75 | "metadata": { 76 | "colab": { 77 | "base_uri": "https://localhost:8080/" 78 | }, 79 | "id": "N21dkm8NI-TG", 80 | "outputId": "4c0b31c3-422c-4145-e07f-aff64c4a5950" 81 | }, 82 | "execution_count": 3, 83 | "outputs": [ 84 | { 85 | "output_type": "stream", 86 | "name": "stdout", 87 | "text": [ 88 | "Password meets requirements\n" 89 | ] 90 | } 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "source": [], 96 | "metadata": { 97 | "id": "VNTuYz1bI-Jc" 98 | }, 99 | "execution_count": 3, 100 | "outputs": [] 101 | } 102 | ] 103 | } -------------------------------------------------------------------------------- /regex/regex_examples.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [] 7 | }, 8 | "kernelspec": { 9 | "name": "python3", 10 | "display_name": "Python 3" 11 | }, 12 | "language_info": { 13 | "name": "python" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "colab": { 22 | "base_uri": "https://localhost:8080/" 23 | }, 24 | "id": "YPKB2a5Hsw02", 25 | "outputId": "3da0932d-8614-477c-b6a6-d202c795f3da" 26 | }, 27 | "outputs": [ 28 | { 29 | "output_type": "stream", 30 | "name": "stdout", 31 | "text": [ 32 | "Contact info: 1234567890 and 9876543210.\n" 33 | ] 34 | } 35 | ], 36 | "source": [ 37 | "import re\n", 38 | "\n", 39 | "text = \"Contact info: (123)-456-7890 and 987-654-3210.\"\n", 40 | "cleaned_text = re.sub(r'[()-]', '', text)\n", 41 | "print(cleaned_text)\n" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "source": [ 47 | "text = \"Please reach out to us at support@example.org or help@example.org.\"\n", 48 | "emails = re.findall(r'\\b[\\w.-]+?@\\w+?\\.\\w+?\\b', text)\n", 49 | "print(emails)\n" 50 | ], 51 | "metadata": { 52 | "colab": { 53 | "base_uri": "https://localhost:8080/" 54 | }, 55 | "id": "eGS7s-zTs-9T", 56 | "outputId": "74a8b91b-5af7-48f6-9dd3-d690b9f36b21" 57 | }, 58 | "execution_count": null, 59 | "outputs": [ 60 | { 61 | "output_type": "stream", 62 | "name": "stdout", 63 | "text": [ 64 | "['support@example.org', 'help@example.org']\n" 65 | ] 66 | } 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "source": [ 72 | "text = \"This\tis\ta\tstring with multiple unnecessary spaces.\"\n", 73 | "cleaned_text = re.sub(r'\\s+', ' ', text)\n", 74 | "print(cleaned_text)\n" 75 | ], 76 | "metadata": { 77 | "colab": { 78 | "base_uri": "https://localhost:8080/" 79 | }, 80 | "id": "dd0K0-LrtmBi", 81 | "outputId": "5436543f-4b19-4081-f8d4-5ec6f64d6d6b" 82 | }, 83 | "execution_count": null, 84 | "outputs": [ 85 | { 86 | "output_type": "stream", 87 | "name": "stdout", 88 | "text": [ 89 | "This is a string with multiple unnecessary spaces.\n" 90 | ] 91 | } 92 | ] 93 | }, 94 | { 95 | 
"cell_type": "code", 96 | "source": [ 97 | "email = \"test@example.com\"\n", 98 | "if re.match(r'^\\b[\\w.-]+?@\\w+?\\.\\w+?\\b$', email):\n", 99 | " print(\"Valid email\") # Output: Valid email\n", 100 | "else:\n", 101 | " print(\"Invalid email\")\n" 102 | ], 103 | "metadata": { 104 | "colab": { 105 | "base_uri": "https://localhost:8080/" 106 | }, 107 | "id": "j05fGS1UyCfe", 108 | "outputId": "402a5319-ba31-44d8-e870-ccbc35535af3" 109 | }, 110 | "execution_count": null, 111 | "outputs": [ 112 | { 113 | "output_type": "stream", 114 | "name": "stdout", 115 | "text": [ 116 | "Valid email\n" 117 | ] 118 | } 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "source": [ 124 | "text = \"This is sentence one. And this is sentence two! Is this sentence three?\"\n", 125 | "sentences = re.split(r'[.!?]', text)\n", 126 | "print(sentences) # Output: ['This is sentence one', ' And this is sentence two', ' Is this sentence three', '']\n" 127 | ], 128 | "metadata": { 129 | "colab": { 130 | "base_uri": "https://localhost:8080/" 131 | }, 132 | "id": "7f68JqnBzBX9", 133 | "outputId": "455de3e7-3cd0-4ffc-ad69-d058e9ecedff" 134 | }, 135 | "execution_count": null, 136 | "outputs": [ 137 | { 138 | "output_type": "stream", 139 | "name": "stdout", 140 | "text": [ 141 | "['This is sentence one', ' And this is sentence two', ' Is this sentence three', '']\n" 142 | ] 143 | } 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "source": [ 149 | "import pandas as pd\n", 150 | "\n", 151 | "data = {\n", 152 | "\t'names': ['Alice123', 'Bob!@#', 'Charlie$$$'],\n", 153 | "\t'emails': ['alice@example.com', 'bob_at_example.com', 'charlie@example.com']\n", 154 | "}\n", 155 | "df = pd.DataFrame(data)\n", 156 | "\n", 157 | "# Remove non-alphabetic characters from names\n", 158 | "df['names'] = df['names'].str.replace(r'[^a-zA-Z]', '', regex=True)\n", 159 | "\n", 160 | "# Validate email addresses\n", 161 | "df['valid_email'] = df['emails'].apply(lambda x: bool(re.match(r'^\\b[\\w.-]+?@\\w+?\\.\\w+?\\b$', x)))\n", 162 | "\n", 163 | "print(df)\n" 164 | ], 165 | "metadata": { 166 | "colab": { 167 | "base_uri": "https://localhost:8080/" 168 | }, 169 | "id": "qboHFiS30UMQ", 170 | "outputId": "eeb42cb5-ebcf-4ebe-f301-74c2c1ac184a" 171 | }, 172 | "execution_count": null, 173 | "outputs": [ 174 | { 175 | "output_type": "stream", 176 | "name": "stdout", 177 | "text": [ 178 | " names emails valid_email\n", 179 | "0 Alice alice@example.com True\n", 180 | "1 Bob bob_at_example.com False\n", 181 | "2 Charlie charlie@example.com True\n" 182 | ] 183 | } 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "source": [], 189 | "metadata": { 190 | "id": "la5oKWfX0U2Z" 191 | }, 192 | "execution_count": null, 193 | "outputs": [] 194 | } 195 | ] 196 | } -------------------------------------------------------------------------------- /statistics/Basic_Stats_Functions_Python.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [] 7 | }, 8 | "kernelspec": { 9 | "name": "python3", 10 | "display_name": "Python 3" 11 | }, 12 | "language_info": { 13 | "name": "python" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "source": [ 20 | "## Import the Built-In `statistics` Module" 21 | ], 22 | "metadata": { 23 | "id": "s8yOidchG5UV" 24 | } 25 | }, 26 | { 27 | "cell_type": "code", 28 | "source": [ 29 | "import statistics" 30 | ], 31 | "metadata": { 32 | "id": "cOmUhMH9bIAb" 33 | }, 34 
| "execution_count": null, 35 | "outputs": [] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "source": [ 40 | "## 1. Mean" 41 | ], 42 | "metadata": { 43 | "id": "Cy_sSAo4bExW" 44 | } 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": { 50 | "colab": { 51 | "base_uri": "https://localhost:8080/" 52 | }, 53 | "id": "v-3qQD50a9hT", 54 | "outputId": "2a7d7cd5-d8f9-445d-f56f-59ac7f4e57b6" 55 | }, 56 | "outputs": [ 57 | { 58 | "output_type": "stream", 59 | "name": "stdout", 60 | "text": [ 61 | "Mean: 30\n" 62 | ] 63 | } 64 | ], 65 | "source": [ 66 | "data = [10, 20, 30, 40, 50]\n", 67 | "mean = statistics.mean(data)\n", 68 | "print(\"Mean:\", mean)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "source": [ 74 | "## 2. Median" 75 | ], 76 | "metadata": { 77 | "id": "obFi961MbQ46" 78 | } 79 | }, 80 | { 81 | "cell_type": "code", 82 | "source": [ 83 | "data = [15, 20, 35, 40, 50]\n", 84 | "median = statistics.median(data)\n", 85 | "print(\"Median:\", median)" 86 | ], 87 | "metadata": { 88 | "colab": { 89 | "base_uri": "https://localhost:8080/" 90 | }, 91 | "id": "FrS8KYaXbPWy", 92 | "outputId": "f07a115a-5f18-462a-ae47-ab3c239db261" 93 | }, 94 | "execution_count": null, 95 | "outputs": [ 96 | { 97 | "output_type": "stream", 98 | "name": "stdout", 99 | "text": [ 100 | "Median: 35\n" 101 | ] 102 | } 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "source": [ 108 | "## 3. Mode" 109 | ], 110 | "metadata": { 111 | "id": "b9ybgj32bYKy" 112 | } 113 | }, 114 | { 115 | "cell_type": "code", 116 | "source": [ 117 | "data = [1, 2, 2, 3, 4, 4, 4]\n", 118 | "mode = statistics.mode(data)\n", 119 | "print(\"Mode:\", mode)" 120 | ], 121 | "metadata": { 122 | "colab": { 123 | "base_uri": "https://localhost:8080/" 124 | }, 125 | "id": "AgrG9I5fbWU0", 126 | "outputId": "eebf7a08-b1d0-42f7-f982-3a20b9241082" 127 | }, 128 | "execution_count": null, 129 | "outputs": [ 130 | { 131 | "output_type": "stream", 132 | "name": "stdout", 133 | "text": [ 134 | "Mode: 4\n" 135 | ] 136 | } 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "source": [ 142 | "data = [1, 2, 2, 2, 3, 4, 4, 4, 7, 7, 7]\n", 143 | "mode = statistics.mode(data)\n", 144 | "print(\"Modes:\", mode)" 145 | ], 146 | "metadata": { 147 | "colab": { 148 | "base_uri": "https://localhost:8080/" 149 | }, 150 | "id": "d3D3oyVBccaa", 151 | "outputId": "93c7fa31-ca9c-429f-df8d-d93ca9eef080" 152 | }, 153 | "execution_count": null, 154 | "outputs": [ 155 | { 156 | "output_type": "stream", 157 | "name": "stdout", 158 | "text": [ 159 | "Modes: 2\n" 160 | ] 161 | } 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "source": [ 167 | "data = [1, 2, 2, 2, 3, 4, 4, 4, 7, 7, 7]\n", 168 | "modes = statistics.multimode(data)\n", 169 | "print(\"Modes:\", modes)" 170 | ], 171 | "metadata": { 172 | "colab": { 173 | "base_uri": "https://localhost:8080/" 174 | }, 175 | "id": "62_XzwJhcH3d", 176 | "outputId": "e7cf6cd4-50b3-42a5-b1ad-d45be40c602a" 177 | }, 178 | "execution_count": null, 179 | "outputs": [ 180 | { 181 | "output_type": "stream", 182 | "name": "stdout", 183 | "text": [ 184 | "Modes: [2, 4, 7]\n" 185 | ] 186 | } 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "source": [ 192 | "## 4. 
Standard Deviation" 193 | ], 194 | "metadata": { 195 | "id": "neQiIHTC6CtL" 196 | } 197 | }, 198 | { 199 | "cell_type": "code", 200 | "source": [ 201 | "data = [12, 15, 22, 29, 35]\n", 202 | "std_dev = statistics.stdev(data)\n", 203 | "print(f\"Standard Deviation: {std_dev:.3f}\")" 204 | ], 205 | "metadata": { 206 | "colab": { 207 | "base_uri": "https://localhost:8080/" 208 | }, 209 | "id": "uY-DcaV4cRux", 210 | "outputId": "98166ea5-b57c-4e1b-f526-5cfbb3f9aed7" 211 | }, 212 | "execution_count": null, 213 | "outputs": [ 214 | { 215 | "output_type": "stream", 216 | "name": "stdout", 217 | "text": [ 218 | "Standard Deviation: 9.555\n" 219 | ] 220 | } 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "source": [ 226 | "## 5. Variance" 227 | ], 228 | "metadata": { 229 | "id": "q6Ra31AD7jcU" 230 | } 231 | }, 232 | { 233 | "cell_type": "code", 234 | "source": [ 235 | "data = [8, 10, 12, 14, 16]\n", 236 | "variance = statistics.variance(data)\n", 237 | "print(f\"Variance: {variance:.2f}\")" 238 | ], 239 | "metadata": { 240 | "colab": { 241 | "base_uri": "https://localhost:8080/" 242 | }, 243 | "id": "ALOJxc4V6G0a", 244 | "outputId": "ff7c8a7d-8250-4fc3-b48a-a59dc1e15877" 245 | }, 246 | "execution_count": null, 247 | "outputs": [ 248 | { 249 | "output_type": "stream", 250 | "name": "stdout", 251 | "text": [ 252 | "Variance: 10.00\n" 253 | ] 254 | } 255 | ] 256 | }, 257 | { 258 | "cell_type": "markdown", 259 | "source": [ 260 | "## 6. Covariance" 261 | ], 262 | "metadata": { 263 | "id": "oXGwdDsci1AP" 264 | } 265 | }, 266 | { 267 | "cell_type": "code", 268 | "source": [ 269 | "data1 = [2, 4, 6, 8, 10]\n", 270 | "data2 = [1, 3, 5, 7, 9]\n", 271 | "covariance = statistics.covariance(data1, data2)\n", 272 | "print(\"Covariance:\", covariance)" 273 | ], 274 | "metadata": { 275 | "id": "5wjqe8n67uoT", 276 | "colab": { 277 | "base_uri": "https://localhost:8080/" 278 | }, 279 | "outputId": "7c11b3be-9d00-47ef-b05d-ad6faff58c65" 280 | }, 281 | "execution_count": 1, 282 | "outputs": [ 283 | { 284 | "output_type": "stream", 285 | "name": "stdout", 286 | "text": [ 287 | "Covariance: 10.0\n" 288 | ] 289 | } 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "source": [ 295 | "## 7. Quantiles" 296 | ], 297 | "metadata": { 298 | "id": "DqquyE0XmKg-" 299 | } 300 | }, 301 | { 302 | "cell_type": "code", 303 | "source": [ 304 | "data = [1, 5, 7, 9, 10, 12, 16, 18, 19, 21]\n", 305 | "# Quartiles\n", 306 | "quantiles = statistics.quantiles(data, n=4)\n", 307 | "print(\"Quantiles (Quartiles):\", quantiles)" 308 | ], 309 | "metadata": { 310 | "colab": { 311 | "base_uri": "https://localhost:8080/" 312 | }, 313 | "id": "5p1xVng-kwju", 314 | "outputId": "903cb4f8-5bb6-488a-c582-62126fbff758" 315 | }, 316 | "execution_count": 4, 317 | "outputs": [ 318 | { 319 | "output_type": "stream", 320 | "name": "stdout", 321 | "text": [ 322 | "Quantiles (Quartiles): [6.5, 11.0, 18.25]\n" 323 | ] 324 | } 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "source": [ 330 | "## 8. 
Correlation" 331 | ], 332 | "metadata": { 333 | "id": "eUTp6xe2CCVM" 334 | } 335 | }, 336 | { 337 | "cell_type": "code", 338 | "source": [ 339 | "data1 = [1, 2, 3, 4, 5]\n", 340 | "data2 = [2, 4, 6, 8, 10]\n", 341 | "correlation = statistics.correlation(data1, data2)\n", 342 | "print(\"Correlation:\", correlation)" 343 | ], 344 | "metadata": { 345 | "colab": { 346 | "base_uri": "https://localhost:8080/" 347 | }, 348 | "id": "1CFP4t68mO4r", 349 | "outputId": "a43c0f06-8c1e-4229-ab23-1c8a35aef2e7" 350 | }, 351 | "execution_count": 5, 352 | "outputs": [ 353 | { 354 | "output_type": "stream", 355 | "name": "stdout", 356 | "text": [ 357 | "Correlation: 1.0\n" 358 | ] 359 | } 360 | ] 361 | }, 362 | { 363 | "cell_type": "markdown", 364 | "source": [ 365 | "## 9. Linear Regression" 366 | ], 367 | "metadata": { 368 | "id": "AMq5BdfuFMiB" 369 | } 370 | }, 371 | { 372 | "cell_type": "code", 373 | "source": [ 374 | "x = [1, 2, 3, 4, 5]\n", 375 | "y = [3, 4, 2, 5, 7]\n", 376 | "slope, intercept = statistics.linear_regression(x, y)\n", 377 | "print(\"Slope:\", slope)\n", 378 | "print(\"Intercept:\", intercept)" 379 | ], 380 | "metadata": { 381 | "colab": { 382 | "base_uri": "https://localhost:8080/" 383 | }, 384 | "id": "TJVQAIjACFxz", 385 | "outputId": "e79e8709-0a67-4b51-dbb8-634fdc52ad3a" 386 | }, 387 | "execution_count": 7, 388 | "outputs": [ 389 | { 390 | "output_type": "stream", 391 | "name": "stdout", 392 | "text": [ 393 | "Slope: 0.9\n", 394 | "Intercept: 1.5\n" 395 | ] 396 | } 397 | ] 398 | }, 399 | { 400 | "cell_type": "markdown", 401 | "source": [ 402 | "## 10. Normal Distribution" 403 | ], 404 | "metadata": { 405 | "id": "EjcEB4bcGILJ" 406 | } 407 | }, 408 | { 409 | "cell_type": "code", 410 | "source": [ 411 | "# Create a normal distribution with mean 30 and standard deviation 10\n", 412 | "normal_dist = statistics.NormalDist(mu=30, sigma=10)\n", 413 | "\n", 414 | "# Calculate the probability of a value less than or equal to 20\n", 415 | "probability = normal_dist.cdf(20)\n", 416 | "print(f\"Probability (CDF) of 20: {probability:.3f}\")\n", 417 | "\n", 418 | "# Calculate the z-score for a value\n", 419 | "z_score = normal_dist.inv_cdf(0.975)\n", 420 | "print(f\"Z-score for 97.5th percentile: {z_score:.3f}\")" 421 | ], 422 | "metadata": { 423 | "colab": { 424 | "base_uri": "https://localhost:8080/" 425 | }, 426 | "id": "sYkVaHDtFQ6m", 427 | "outputId": "2f1db0e8-3b5e-4764-8151-02e92b413513" 428 | }, 429 | "execution_count": 11, 430 | "outputs": [ 431 | { 432 | "output_type": "stream", 433 | "name": "stdout", 434 | "text": [ 435 | "Probability (CDF) of 20: 0.159\n", 436 | "Z-score for 97.5th percentile: 49.600\n" 437 | ] 438 | } 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "source": [], 444 | "metadata": { 445 | "id": "w8eCKz1tJy55" 446 | }, 447 | "execution_count": null, 448 | "outputs": [] 449 | } 450 | ] 451 | } -------------------------------------------------------------------------------- /statistics/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /statistics/handle_excel_files.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [] 7 | }, 8 | "kernelspec": { 9 | "name": "python3", 10 | "display_name": "Python 3" 11 | }, 12 | "language_info": { 13 | "name": "python" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | 
"cell_type": "markdown", 19 | "source": [ 20 | "## Generating a Sample Excel File" 21 | ], 22 | "metadata": { 23 | "id": "TJ5jHxsTviGe" 24 | } 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": { 30 | "id": "oGEfUQfJolvB", 31 | "colab": { 32 | "base_uri": "https://localhost:8080/" 33 | }, 34 | "outputId": "8b18535d-bdfc-4198-a405-2071610bec82" 35 | }, 36 | "outputs": [ 37 | { 38 | "output_type": "stream", 39 | "name": "stdout", 40 | "text": [ 41 | "Sample Excel file 'employee_data.xlsx' generated successfully.\n" 42 | ] 43 | } 44 | ], 45 | "source": [ 46 | "import pandas as pd\n", 47 | "\n", 48 | "# Sample employee data\n", 49 | "data = {\n", 50 | " 'employee_id': [101, 102, 103, 104, 105],\n", 51 | " 'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],\n", 52 | " 'department': ['HR', 'Finance', 'IT', 'Sales', 'Marketing'],\n", 53 | " 'salary': [55000, 62000, 72000, 50000, 57000],\n", 54 | " 'performance_score': [3.8, 4.2, 4.5, 3.5, 4.0],\n", 55 | " 'years_at_company': [2, 5, 3, 4, 1]\n", 56 | "}\n", 57 | "\n", 58 | "# Create a DataFrame\n", 59 | "df = pd.DataFrame(data)\n", 60 | "\n", 61 | "# Save to an Excel file\n", 62 | "df.to_excel('employee_data.xlsx', index=False)\n", 63 | "\n", 64 | "print(\"Sample Excel file 'employee_data.xlsx' generated successfully.\")" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "source": [ 70 | "## Reading in the Excel File" 71 | ], 72 | "metadata": { 73 | "id": "o0EPIZ8bvlWR" 74 | } 75 | }, 76 | { 77 | "cell_type": "code", 78 | "source": [ 79 | "# Read Excel file into a DataFrame\n", 80 | "df = pd.read_excel('employee_data.xlsx')\n", 81 | "\n", 82 | "print(df.head())" 83 | ], 84 | "metadata": { 85 | "colab": { 86 | "base_uri": "https://localhost:8080/" 87 | }, 88 | "id": "cBRgGP-Wvs-P", 89 | "outputId": "c423480e-8cff-40f5-ef1a-fb3e5a55c0d6" 90 | }, 91 | "execution_count": null, 92 | "outputs": [ 93 | { 94 | "output_type": "stream", 95 | "name": "stdout", 96 | "text": [ 97 | " employee_id name department salary performance_score \\\n", 98 | "0 101 Alice HR 55000 3.8 \n", 99 | "1 102 Bob Finance 62000 4.2 \n", 100 | "2 103 Charlie IT 72000 4.5 \n", 101 | "3 104 David Sales 50000 3.5 \n", 102 | "4 105 Eva Marketing 57000 4.0 \n", 103 | "\n", 104 | " years_at_company \n", 105 | "0 2 \n", 106 | "1 5 \n", 107 | "2 3 \n", 108 | "3 4 \n", 109 | "4 1 \n" 110 | ] 111 | } 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "source": [ 117 | "## Exploring and Summarizing Data" 118 | ], 119 | "metadata": { 120 | "id": "ldEpbTVevteH" 121 | } 122 | }, 123 | { 124 | "cell_type": "code", 125 | "source": [ 126 | "# Get info about the DataFrame\n", 127 | "print(df.info())" 128 | ], 129 | "metadata": { 130 | "colab": { 131 | "base_uri": "https://localhost:8080/" 132 | }, 133 | "id": "MCwxQH84vv5u", 134 | "outputId": "49d9df4b-94e6-4fd2-e515-d6ebdf5a2a85" 135 | }, 136 | "execution_count": null, 137 | "outputs": [ 138 | { 139 | "output_type": "stream", 140 | "name": "stdout", 141 | "text": [ 142 | "\n", 143 | "RangeIndex: 5 entries, 0 to 4\n", 144 | "Data columns (total 6 columns):\n", 145 | " # Column Non-Null Count Dtype \n", 146 | "--- ------ -------------- ----- \n", 147 | " 0 employee_id 5 non-null int64 \n", 148 | " 1 name 5 non-null object \n", 149 | " 2 department 5 non-null object \n", 150 | " 3 salary 5 non-null int64 \n", 151 | " 4 performance_score 5 non-null float64\n", 152 | " 5 years_at_company 5 non-null int64 \n", 153 | "dtypes: float64(1), int64(3), object(2)\n", 154 | "memory usage: 368.0+ bytes\n", 
155 | "None\n" 156 | ] 157 | } 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "source": [ 163 | "# Get descriptive statistics\n", 164 | "print(df.describe())" 165 | ], 166 | "metadata": { 167 | "colab": { 168 | "base_uri": "https://localhost:8080/" 169 | }, 170 | "id": "UF_BXV17wUyv", 171 | "outputId": "ef65974f-d366-4d99-9b6f-2a776bb9a2ca" 172 | }, 173 | "execution_count": null, 174 | "outputs": [ 175 | { 176 | "output_type": "stream", 177 | "name": "stdout", 178 | "text": [ 179 | " employee_id salary performance_score years_at_company\n", 180 | "count 5.000000 5.000000 5.000000 5.000000\n", 181 | "mean 103.000000 59200.000000 4.000000 3.000000\n", 182 | "std 1.581139 8348.652586 0.380789 1.581139\n", 183 | "min 101.000000 50000.000000 3.500000 1.000000\n", 184 | "25% 102.000000 55000.000000 3.800000 2.000000\n", 185 | "50% 103.000000 57000.000000 4.000000 3.000000\n", 186 | "75% 104.000000 62000.000000 4.200000 4.000000\n", 187 | "max 105.000000 72000.000000 4.500000 5.000000\n" 188 | ] 189 | } 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "source": [ 195 | "## Handling Missing Values" 196 | ], 197 | "metadata": { 198 | "id": "rVlV_FmmvwOP" 199 | } 200 | }, 201 | { 202 | "cell_type": "code", 203 | "source": [ 204 | "# Check for missing values\n", 205 | "missing_values = df.isna().sum()\n", 206 | "print(missing_values)" 207 | ], 208 | "metadata": { 209 | "colab": { 210 | "base_uri": "https://localhost:8080/" 211 | }, 212 | "id": "5vMERpSvvzmH", 213 | "outputId": "8c9292f5-aa19-4f19-a2e6-76aaaaee6e82" 214 | }, 215 | "execution_count": null, 216 | "outputs": [ 217 | { 218 | "output_type": "stream", 219 | "name": "stdout", 220 | "text": [ 221 | "employee_id 0\n", 222 | "name 0\n", 223 | "department 0\n", 224 | "salary 0\n", 225 | "performance_score 0\n", 226 | "years_at_company 0\n", 227 | "dtype: int64\n" 228 | ] 229 | } 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "source": [ 235 | "# Fill missing performance scores with the average\n", 236 | "df['performance_score'] = df['performance_score'].fillna(df['performance_score'].mean())" 237 | ], 238 | "metadata": { 239 | "id": "f8QMM_3UweNm" 240 | }, 241 | "execution_count": null, 242 | "outputs": [] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "source": [ 247 | "## Basic Data Manipulation" 248 | ], 249 | "metadata": { 250 | "id": "UoQF_uYxv2kR" 251 | } 252 | }, 253 | { 254 | "cell_type": "code", 255 | "source": [ 256 | " # Filter employees with a performance score above 4\n", 257 | "high_performers = df[df['performance_score'] > 4]\n", 258 | "print(high_performers)" 259 | ], 260 | "metadata": { 261 | "colab": { 262 | "base_uri": "https://localhost:8080/" 263 | }, 264 | "id": "gyWnuUX8v10g", 265 | "outputId": "aad60691-ddfa-461e-efa5-0889de7bf047" 266 | }, 267 | "execution_count": null, 268 | "outputs": [ 269 | { 270 | "output_type": "stream", 271 | "name": "stdout", 272 | "text": [ 273 | " employee_id name department salary performance_score \\\n", 274 | "1 102 Bob Finance 62000 4.2 \n", 275 | "2 103 Charlie IT 72000 4.5 \n", 276 | "\n", 277 | " years_at_company \n", 278 | "1 5 \n", 279 | "2 3 \n" 280 | ] 281 | } 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "source": [ 287 | "# Select specific columns\n", 288 | "selected_columns = df[['name', 'department', 'salary']]\n", 289 | "print(selected_columns)" 290 | ], 291 | "metadata": { 292 | "colab": { 293 | "base_uri": "https://localhost:8080/" 294 | }, 295 | "id": "2NYJXMEywkNm", 296 | "outputId": "5e3fe3ee-fe57-476d-d45b-84f8c41f3324" 
297 | }, 298 | "execution_count": null, 299 | "outputs": [ 300 | { 301 | "output_type": "stream", 302 | "name": "stdout", 303 | "text": [ 304 | " name department salary\n", 305 | "0 Alice HR 55000\n", 306 | "1 Bob Finance 62000\n", 307 | "2 Charlie IT 72000\n", 308 | "3 David Sales 50000\n", 309 | "4 Eva Marketing 57000\n" 310 | ] 311 | } 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "source": [ 317 | "# Add a new column for bonus\n", 318 | "df['bonus'] = df['salary'].apply(lambda x: x * 0.10)\n", 319 | "print(df.head())" 320 | ], 321 | "metadata": { 322 | "colab": { 323 | "base_uri": "https://localhost:8080/" 324 | }, 325 | "id": "jiVSYWN5wmO-", 326 | "outputId": "2aae7e21-c9b5-4b62-9e99-06265a1dfae1" 327 | }, 328 | "execution_count": null, 329 | "outputs": [ 330 | { 331 | "output_type": "stream", 332 | "name": "stdout", 333 | "text": [ 334 | " employee_id name department salary performance_score \\\n", 335 | "0 101 Alice HR 55000 3.8 \n", 336 | "1 102 Bob Finance 62000 4.2 \n", 337 | "2 103 Charlie IT 72000 4.5 \n", 338 | "3 104 David Sales 50000 3.5 \n", 339 | "4 105 Eva Marketing 57000 4.0 \n", 340 | "\n", 341 | " years_at_company bonus \n", 342 | "0 2 5500.0 \n", 343 | "1 5 6200.0 \n", 344 | "2 3 7200.0 \n", 345 | "3 4 5000.0 \n", 346 | "4 1 5700.0 \n" 347 | ] 348 | } 349 | ] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "source": [ 354 | "## Grouping and Aggregating Data" 355 | ], 356 | "metadata": { 357 | "id": "O3B9eGDsv8Bs" 358 | } 359 | }, 360 | { 361 | "cell_type": "code", 362 | "source": [ 363 | "# Calculate average salary grouped by department\n", 364 | "average_salary_by_department = df.groupby('department')['salary'].mean().reset_index()\n", 365 | "print(average_salary_by_department)" 366 | ], 367 | "metadata": { 368 | "colab": { 369 | "base_uri": "https://localhost:8080/" 370 | }, 371 | "id": "nKeNszX6v9QA", 372 | "outputId": "8e5170dc-51f9-4032-9bfa-b0cd5404e651" 373 | }, 374 | "execution_count": null, 375 | "outputs": [ 376 | { 377 | "output_type": "stream", 378 | "name": "stdout", 379 | "text": [ 380 | " department salary\n", 381 | "0 Finance 62000.0\n", 382 | "1 HR 55000.0\n", 383 | "2 IT 72000.0\n", 384 | "3 Marketing 57000.0\n", 385 | "4 Sales 50000.0\n" 386 | ] 387 | } 388 | ] 389 | } 390 | ] 391 | } -------------------------------------------------------------------------------- /statistics/probability/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /statistics/probability/joint_and_conditional_pbty.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [] 7 | }, 8 | "kernelspec": { 9 | "name": "python3", 10 | "display_name": "Python 3" 11 | }, 12 | "language_info": { 13 | "name": "python" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "source": [ 20 | "## Step 1: Creating Sample Data" 21 | ], 22 | "metadata": { 23 | "id": "YjO9ZVZIM8Ye" 24 | } 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": { 30 | "colab": { 31 | "base_uri": "https://localhost:8080/" 32 | }, 33 | "id": "sAXOTiD9Ltz0", 34 | "outputId": "63fdc6fe-0ae1-4b61-ba92-4e481cc8c561" 35 | }, 36 | "outputs": [ 37 | { 38 | "output_type": "stream", 39 | "name": "stdout", 40 | "text": [ 41 | " Age_Group Sports_Interest\n", 42 | "0 Teen Yes\n", 43 | "1 Teen No\n", 44 
| "2 Teen Yes\n", 45 | "3 Adult No\n", 46 | "4 Adult No\n", 47 | "5 Senior Yes\n", 48 | "6 Senior Yes\n", 49 | "7 Senior No\n" 50 | ] 51 | } 52 | ], 53 | "source": [ 54 | "import pandas as pd\n", 55 | "\n", 56 | "# Sample data\n", 57 | "data = {\n", 58 | " \"Age_Group\": [\"Teen\", \"Teen\", \"Teen\", \"Adult\", \"Adult\", \"Senior\", \"Senior\", \"Senior\"],\n", 59 | " \"Sports_Interest\": [\"Yes\", \"No\", \"Yes\", \"No\", \"No\", \"Yes\", \"Yes\", \"No\"]\n", 60 | "}\n", 61 | "\n", 62 | "df = pd.DataFrame(data)\n", 63 | "\n", 64 | "# Display the data\n", 65 | "print(df)\n" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "source": [ 71 | "## Step 2: Calculating Joint Probability" 72 | ], 73 | "metadata": { 74 | "id": "1VY0hLRKMWMr" 75 | } 76 | }, 77 | { 78 | "cell_type": "code", 79 | "source": [ 80 | "# Total number of observations\n", 81 | "total_count = len(df)\n", 82 | "\n", 83 | "# Count occurrences where Age_Group is \"Teen\" and Sports_Interest is \"Yes\"\n", 84 | "joint_count = len(df[(df['Age_Group'] == 'Teen') & (df['Sports_Interest'] == 'Yes')])\n", 85 | "\n", 86 | "# Joint probability\n", 87 | "joint_probability = joint_count / total_count\n", 88 | "\n", 89 | "print(f\"Joint Probability (Teen and Sports Interest Yes): {joint_probability}\")\n" 90 | ], 91 | "metadata": { 92 | "colab": { 93 | "base_uri": "https://localhost:8080/" 94 | }, 95 | "id": "M32eM5NPMHNd", 96 | "outputId": "35e64e55-358f-471c-b583-ca322b9597c0" 97 | }, 98 | "execution_count": null, 99 | "outputs": [ 100 | { 101 | "output_type": "stream", 102 | "name": "stdout", 103 | "text": [ 104 | "Joint Probability (Teen and Sports Interest Yes): 0.25\n" 105 | ] 106 | } 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "source": [ 112 | "## Step 3: Calculating Conditional Probability" 113 | ], 114 | "metadata": { 115 | "id": "OIs1olhPMZgq" 116 | } 117 | }, 118 | { 119 | "cell_type": "code", 120 | "source": [ 121 | "# Filter data for Age_Group = \"Teen\"\n", 122 | "teen_data = df[df['Age_Group'] == 'Teen']\n", 123 | "\n", 124 | "# Count occurrences of Sports_Interest = \"Yes\" among teens\n", 125 | "conditional_count = len(teen_data[teen_data['Sports_Interest'] == 'Yes'])\n", 126 | "\n", 127 | "# Conditional probability\n", 128 | "conditional_probability = conditional_count / len(teen_data)\n", 129 | "\n", 130 | "print(f\"Conditional Probability (Sports Interest Yes | Age Group Teen): {conditional_probability:.3f}\")\n" 131 | ], 132 | "metadata": { 133 | "colab": { 134 | "base_uri": "https://localhost:8080/" 135 | }, 136 | "id": "vMTq6kaKMJdd", 137 | "outputId": "559e5632-7ca7-44bd-9d59-4f2aeb19f50a" 138 | }, 139 | "execution_count": null, 140 | "outputs": [ 141 | { 142 | "output_type": "stream", 143 | "name": "stdout", 144 | "text": [ 145 | "Conditional Probability (Sports Interest Yes | Age Group Teen): 0.667\n" 146 | ] 147 | } 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "source": [ 153 | "## Step 4: Generalizing with Functions" 154 | ], 155 | "metadata": { 156 | "id": "L2uNqq9zM1I2" 157 | } 158 | }, 159 | { 160 | "cell_type": "code", 161 | "source": [ 162 | "def calculate_joint_probability(df, condition1, condition2):\n", 163 | " total_count = len(df)\n", 164 | " joint_count = len(df[(df[condition1[0]] == condition1[1]) & (df[condition2[0]] == condition2[1])])\n", 165 | " return joint_count / total_count\n", 166 | "\n", 167 | "def calculate_conditional_probability(df, given_condition, target_condition):\n", 168 | " subset = df[df[given_condition[0]] == given_condition[1]]\n", 
169 | " conditional_count = len(subset[subset[target_condition[0]] == target_condition[1]])\n", 170 | " return conditional_count / len(subset)\n" 171 | ], 172 | "metadata": { 173 | "id": "VGoD5_-2MMfE" 174 | }, 175 | "execution_count": null, 176 | "outputs": [] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "source": [ 181 | "# Joint Probability of \"Teen\" and \"Sports_Interest = Yes\"\n", 182 | "joint_prob = calculate_joint_probability(df, (\"Age_Group\", \"Teen\"), (\"Sports_Interest\", \"Yes\"))\n", 183 | "print(f\"Joint Probability (Teen and Sports Interest Yes): {joint_prob}\")\n", 184 | "\n", 185 | "# Conditional Probability of \"Sports_Interest = Yes\" given \"Age_Group = Teen\"\n", 186 | "conditional_prob = calculate_conditional_probability(df, (\"Age_Group\", \"Teen\"), (\"Sports_Interest\", \"Yes\"))\n", 187 | "print(f\"Conditional Probability (Sports Interest Yes | Age Group Teen): {conditional_prob:.3f}\")\n" 188 | ], 189 | "metadata": { 190 | "colab": { 191 | "base_uri": "https://localhost:8080/" 192 | }, 193 | "id": "rSEt6qJgMQQN", 194 | "outputId": "0f5e0527-f942-4f4a-8081-7ec6d4591708" 195 | }, 196 | "execution_count": null, 197 | "outputs": [ 198 | { 199 | "output_type": "stream", 200 | "name": "stdout", 201 | "text": [ 202 | "Joint Probability (Teen and Sports Interest Yes): 0.25\n", 203 | "Conditional Probability (Sports Interest Yes | Age Group Teen): 0.667\n" 204 | ] 205 | } 206 | ] 207 | } 208 | ] 209 | } -------------------------------------------------------------------------------- /statistics/sparse_data_analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [] 7 | }, 8 | "kernelspec": { 9 | "name": "python3", 10 | "display_name": "Python 3" 11 | }, 12 | "language_info": { 13 | "name": "python" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "id": "0R9gVhnIMrNH" 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "import numpy as np\n", 26 | "from scipy import sparse\n", 27 | "import pandas as pd\n", 28 | "from scipy import stats\n" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "source": [ 34 | "# Create a sparse matrix where rows are users and columns are products\n", 35 | "# Only storing the actual interactions\n", 36 | "row = np.array([0, 3, 1, 0]) # User IDs\n", 37 | "col = np.array([0, 3, 1, 2]) # Product IDs\n", 38 | "data = np.array([4, 5, 7, 9]) # Interaction values (like ratings)\n", 39 | "\n", 40 | "# Create the sparse matrix\n", 41 | "sparse_matrix = sparse.coo_matrix((data, (row, col)), shape=(4, 4))\n", 42 | "\n", 43 | "# seeing the sparse matrix as a regular matrix\n", 44 | "print(\"Here's our sparse matrix as a regular array:\")\n", 45 | "print(sparse_matrix.toarray())\n" 46 | ], 47 | "metadata": { 48 | "colab": { 49 | "base_uri": "https://localhost:8080/" 50 | }, 51 | "id": "RkAQQ8QCMzM7", 52 | "outputId": "96ee974b-f672-4c59-a965-4626b9bc1cf5" 53 | }, 54 | "execution_count": 2, 55 | "outputs": [ 56 | { 57 | "output_type": "stream", 58 | "name": "stdout", 59 | "text": [ 60 | "Here's our sparse matrix as a regular array:\n", 61 | "[[4 0 9 0]\n", 62 | " [0 7 0 0]\n", 63 | " [0 0 0 0]\n", 64 | " [0 0 0 5]]\n" 65 | ] 66 | } 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "source": [ 72 | "def calculate_sparse_mean(sparse_matrix):\n", 73 | " \"\"\"\n", 74 | " Calculate mean of non-zero elements in a sparse matrix.\n", 75 | " This is 
useful when zeros represent 'no data' rather than actual zeros.\n", 76 | " \"\"\"\n", 77 | " if sparse_matrix.nnz == 0: # nnz is the number of non-zero elements\n", 78 | " return 0.0\n", 79 | " return sparse_matrix.sum() / sparse_matrix.nnz\n", 80 | "\n", 81 | "mean_value = calculate_sparse_mean(sparse_matrix)\n", 82 | "print(f\"\\nMean of non-zero elements: {mean_value:.2f}\")\n" 83 | ], 84 | "metadata": { 85 | "colab": { 86 | "base_uri": "https://localhost:8080/" 87 | }, 88 | "id": "Dz0BFJXXM1ia", 89 | "outputId": "d3b9092d-2218-477a-80c5-551fcbf19cd5" 90 | }, 91 | "execution_count": 3, 92 | "outputs": [ 93 | { 94 | "output_type": "stream", 95 | "name": "stdout", 96 | "text": [ 97 | "\n", 98 | "Mean of non-zero elements: 6.25\n" 99 | ] 100 | } 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "source": [ 106 | "def analyze_row_patterns(sparse_matrix):\n", 107 | " \"\"\"\n", 108 | " Analyze patterns in each row of a sparse matrix.\n", 109 | " Returns dictionary with various row statistics.\n", 110 | " \"\"\"\n", 111 | " # Convert to CSR format for efficient row operations\n", 112 | " csr_matrix = sparse_matrix.tocsr()\n", 113 | "\n", 114 | " # Calculate statistics\n", 115 | " row_sums = np.array(csr_matrix.sum(axis=1)).flatten()\n", 116 | " row_nonzeros = np.diff(csr_matrix.indptr) # Number of non-zeros per row\n", 117 | "\n", 118 | " # Calculate means, handling empty rows\n", 119 | " row_means = np.zeros_like(row_sums, dtype=float)\n", 120 | " mask = row_nonzeros > 0\n", 121 | " row_means[mask] = row_sums[mask] / row_nonzeros[mask]\n", 122 | "\n", 123 | " return {\n", 124 | " 'activity_sum': row_sums, # Total activity per user\n", 125 | " 'interaction_count': row_nonzeros, # Number of interactions per user\n", 126 | " 'average_value': row_means # Average value per user\n", 127 | " }\n" 128 | ], 129 | "metadata": { 130 | "id": "SF3ygrrvM4Ks" 131 | }, 132 | "execution_count": 4, 133 | "outputs": [] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "source": [ 138 | "stats = analyze_row_patterns(sparse_matrix)\n", 139 | "print(\"\\nUser Statistics:\")\n", 140 | "for i, (sum_val, count, mean) in enumerate(zip(\n", 141 | " stats['activity_sum'],\n", 142 | " stats['interaction_count'],\n", 143 | " stats['average_value']\n", 144 | ")):\n", 145 | " print(f\"User {i}: {count} interactions, \"\n", 146 | " f\"total activity = {sum_val}, \"\n", 147 | " f\"average value = {mean:.2f}\")\n" 148 | ], 149 | "metadata": { 150 | "colab": { 151 | "base_uri": "https://localhost:8080/" 152 | }, 153 | "id": "IAzJ8tHRM519", 154 | "outputId": "52a67420-34d2-4d81-ce3f-4af60177be04" 155 | }, 156 | "execution_count": 5, 157 | "outputs": [ 158 | { 159 | "output_type": "stream", 160 | "name": "stdout", 161 | "text": [ 162 | "\n", 163 | "User Statistics:\n", 164 | "User 0: 2 interactions, total activity = 13, average value = 6.50\n", 165 | "User 1: 1 interactions, total activity = 7, average value = 7.00\n", 166 | "User 2: 0 interactions, total activity = 0, average value = 0.00\n", 167 | "User 3: 1 interactions, total activity = 5, average value = 5.00\n" 168 | ] 169 | } 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "source": [ 175 | "def calculate_sparse_correlation(sparse_matrix, min_overlap=2):\n", 176 | " \"\"\"\n", 177 | " Calculate correlation between columns, considering only overlapping non-zero elements.\n", 178 | " Like finding which products are often rated similarly.\n", 179 | " \"\"\"\n", 180 | " # Convert to dense format for this calculation\n", 181 | " # (For very large matrices, 
you'd want to do this differently)\n", 182 | " dense_cols = sparse_matrix.toarray().T\n", 183 | " n_cols = dense_cols.shape[0]\n", 184 | " correlations = np.zeros((n_cols, n_cols))\n", 185 | "\n", 186 | " for i in range(n_cols):\n", 187 | " for j in range(i, n_cols):\n", 188 | " # Find where both columns have non-zero values\n", 189 | " mask = (dense_cols[i] != 0) & (dense_cols[j] != 0)\n", 190 | " if mask.sum() >= min_overlap:\n", 191 | " corr = stats.pearsonr(dense_cols[i][mask],\n", 192 | " dense_cols[j][mask])[0]\n", 193 | " correlations[i, j] = correlations[j, i] = corr\n", 194 | "\n", 195 | " return correlations" 196 | ], 197 | "metadata": { 198 | "id": "ADRakCn4M8KD" 199 | }, 200 | "execution_count": 6, 201 | "outputs": [] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "source": [ 206 | "corr_matrix = calculate_sparse_correlation(sparse_matrix)\n", 207 | "print(\"\\nCorrelation matrix:\")\n", 208 | "print(corr_matrix)" 209 | ], 210 | "metadata": { 211 | "colab": { 212 | "base_uri": "https://localhost:8080/" 213 | }, 214 | "id": "7UuFzRB6M979", 215 | "outputId": "af68a7bc-e862-40bc-ead4-eac28fd5b1f7" 216 | }, 217 | "execution_count": 7, 218 | "outputs": [ 219 | { 220 | "output_type": "stream", 221 | "name": "stdout", 222 | "text": [ 223 | "\n", 224 | "Correlation matrix:\n", 225 | "[[0. 0. 0. 0.]\n", 226 | " [0. 0. 0. 0.]\n", 227 | " [0. 0. 0. 0.]\n", 228 | " [0. 0. 0. 0.]]\n" 229 | ] 230 | } 231 | ] 232 | } 233 | ] 234 | } -------------------------------------------------------------------------------- /statistics/sparse_data_analysis_v0_1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [] 7 | }, 8 | "kernelspec": { 9 | "name": "python3", 10 | "display_name": "Python 3" 11 | }, 12 | "language_info": { 13 | "name": "python" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "source": [ 20 | "## Imports" 21 | ], 22 | "metadata": { 23 | "id": "DPt0ex-tOHxH" 24 | } 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": { 30 | "id": "0R9gVhnIMrNH" 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "import numpy as np\n", 35 | "from scipy import sparse\n", 36 | "import pandas as pd\n", 37 | "from scipy import stats\n" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "source": [ 43 | "## Creating a Sparse Matrix" 44 | ], 45 | "metadata": { 46 | "id": "M7kq8YzvOKG-" 47 | } 48 | }, 49 | { 50 | "cell_type": "code", 51 | "source": [ 52 | "# Create a sparse matrix where rows are users and columns are products\n", 53 | "# Only storing the actual interactions\n", 54 | "row = np.array([0, 3, 1, 0]) # User IDs\n", 55 | "col = np.array([0, 3, 1, 2]) # Product IDs\n", 56 | "data = np.array([4, 5, 7, 9]) # Interaction values (like ratings)\n", 57 | "\n", 58 | "# Create the sparse matrix\n", 59 | "sparse_matrix = sparse.coo_matrix((data, (row, col)), shape=(4, 4))\n", 60 | "\n", 61 | "# seeing the sparse matrix as a regular matrix\n", 62 | "print(\"Here's our sparse matrix as a regular array:\")\n", 63 | "print(sparse_matrix.toarray())\n" 64 | ], 65 | "metadata": { 66 | "colab": { 67 | "base_uri": "https://localhost:8080/" 68 | }, 69 | "id": "RkAQQ8QCMzM7", 70 | "outputId": "96ee974b-f672-4c59-a965-4626b9bc1cf5" 71 | }, 72 | "execution_count": null, 73 | "outputs": [ 74 | { 75 | "output_type": "stream", 76 | "name": "stdout", 77 | "text": [ 78 | "Here's our sparse matrix as a regular array:\n", 79 | 
"[[4 0 9 0]\n", 80 | " [0 7 0 0]\n", 81 | " [0 0 0 0]\n", 82 | " [0 0 0 5]]\n" 83 | ] 84 | } 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "source": [ 90 | "## Basic Statistical Analysis" 91 | ], 92 | "metadata": { 93 | "id": "VRm2aXYiOS3F" 94 | } 95 | }, 96 | { 97 | "cell_type": "code", 98 | "source": [ 99 | "def calculate_sparse_mean(sparse_matrix):\n", 100 | " \"\"\"\n", 101 | " Calculate mean of non-zero elements in a sparse matrix.\n", 102 | " This is useful when zeros represent 'no data' rather than actual zeros.\n", 103 | " \"\"\"\n", 104 | " if sparse_matrix.nnz == 0: # nnz is the number of non-zero elements\n", 105 | " return 0.0\n", 106 | " return sparse_matrix.sum() / sparse_matrix.nnz\n", 107 | "\n", 108 | "mean_value = calculate_sparse_mean(sparse_matrix)\n", 109 | "print(f\"\\nMean of non-zero elements: {mean_value:.2f}\")\n" 110 | ], 111 | "metadata": { 112 | "colab": { 113 | "base_uri": "https://localhost:8080/" 114 | }, 115 | "id": "Dz0BFJXXM1ia", 116 | "outputId": "d3b9092d-2218-477a-80c5-551fcbf19cd5" 117 | }, 118 | "execution_count": null, 119 | "outputs": [ 120 | { 121 | "output_type": "stream", 122 | "name": "stdout", 123 | "text": [ 124 | "\n", 125 | "Mean of non-zero elements: 6.25\n" 126 | ] 127 | } 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "source": [ 133 | "## Handling Row and Column Statistics" 134 | ], 135 | "metadata": { 136 | "id": "fDhW59jyOWl1" 137 | } 138 | }, 139 | { 140 | "cell_type": "code", 141 | "source": [ 142 | "def analyze_row_patterns(sparse_matrix):\n", 143 | " \"\"\"\n", 144 | " Analyze patterns in each row of a sparse matrix.\n", 145 | " Returns dictionary with various row statistics.\n", 146 | " \"\"\"\n", 147 | " # Convert to CSR format for efficient row operations\n", 148 | " csr_matrix = sparse_matrix.tocsr()\n", 149 | "\n", 150 | " # Calculate statistics\n", 151 | " row_sums = np.array(csr_matrix.sum(axis=1)).flatten()\n", 152 | " row_nonzeros = np.diff(csr_matrix.indptr) # Number of non-zeros per row\n", 153 | "\n", 154 | " # Calculate means, handling empty rows\n", 155 | " row_means = np.zeros_like(row_sums, dtype=float)\n", 156 | " mask = row_nonzeros > 0\n", 157 | " row_means[mask] = row_sums[mask] / row_nonzeros[mask]\n", 158 | "\n", 159 | " return {\n", 160 | " 'activity_sum': row_sums, # Total activity per user\n", 161 | " 'interaction_count': row_nonzeros, # Number of interactions per user\n", 162 | " 'average_value': row_means # Average value per user\n", 163 | " }\n" 164 | ], 165 | "metadata": { 166 | "id": "SF3ygrrvM4Ks" 167 | }, 168 | "execution_count": null, 169 | "outputs": [] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "source": [ 174 | "stats = analyze_row_patterns(sparse_matrix)\n", 175 | "print(\"\\nUser Statistics:\")\n", 176 | "for i, (sum_val, count, mean) in enumerate(zip(\n", 177 | " stats['activity_sum'],\n", 178 | " stats['interaction_count'],\n", 179 | " stats['average_value']\n", 180 | ")):\n", 181 | " print(f\"User {i}: {count} interactions, \"\n", 182 | " f\"total activity = {sum_val}, \"\n", 183 | " f\"average value = {mean:.2f}\")\n" 184 | ], 185 | "metadata": { 186 | "colab": { 187 | "base_uri": "https://localhost:8080/" 188 | }, 189 | "id": "IAzJ8tHRM519", 190 | "outputId": "52a67420-34d2-4d81-ce3f-4af60177be04" 191 | }, 192 | "execution_count": null, 193 | "outputs": [ 194 | { 195 | "output_type": "stream", 196 | "name": "stdout", 197 | "text": [ 198 | "\n", 199 | "User Statistics:\n", 200 | "User 0: 2 interactions, total activity = 13, average value = 6.50\n", 201 | 
"User 1: 1 interactions, total activity = 7, average value = 7.00\n", 202 | "User 2: 0 interactions, total activity = 0, average value = 0.00\n", 203 | "User 3: 1 interactions, total activity = 5, average value = 5.00\n" 204 | ] 205 | } 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "source": [ 211 | "## Correlation Analysis" 212 | ], 213 | "metadata": { 214 | "id": "m5ETMgcxOatl" 215 | } 216 | }, 217 | { 218 | "cell_type": "code", 219 | "source": [ 220 | "def calculate_sparse_correlation(sparse_matrix, min_overlap=2):\n", 221 | " \"\"\"\n", 222 | " Calculate correlation between columns, considering only overlapping non-zero elements.\n", 223 | " Like finding which products are often rated similarly.\n", 224 | " \"\"\"\n", 225 | " # Convert to dense format for this calculation\n", 226 | " # (For very large matrices, you'd want to do this differently)\n", 227 | " dense_cols = sparse_matrix.toarray().T\n", 228 | " n_cols = dense_cols.shape[0]\n", 229 | " correlations = np.zeros((n_cols, n_cols))\n", 230 | "\n", 231 | " for i in range(n_cols):\n", 232 | " for j in range(i, n_cols):\n", 233 | " # Find where both columns have non-zero values\n", 234 | " mask = (dense_cols[i] != 0) & (dense_cols[j] != 0)\n", 235 | " if mask.sum() >= min_overlap:\n", 236 | " corr = stats.pearsonr(dense_cols[i][mask],\n", 237 | " dense_cols[j][mask])[0]\n", 238 | " correlations[i, j] = correlations[j, i] = corr\n", 239 | "\n", 240 | " return correlations" 241 | ], 242 | "metadata": { 243 | "id": "ADRakCn4M8KD" 244 | }, 245 | "execution_count": null, 246 | "outputs": [] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "source": [ 251 | "corr_matrix = calculate_sparse_correlation(sparse_matrix)\n", 252 | "print(\"\\nCorrelation matrix:\")\n", 253 | "print(corr_matrix)" 254 | ], 255 | "metadata": { 256 | "colab": { 257 | "base_uri": "https://localhost:8080/" 258 | }, 259 | "id": "7UuFzRB6M979", 260 | "outputId": "af68a7bc-e862-40bc-ead4-eac28fd5b1f7" 261 | }, 262 | "execution_count": null, 263 | "outputs": [ 264 | { 265 | "output_type": "stream", 266 | "name": "stdout", 267 | "text": [ 268 | "\n", 269 | "Correlation matrix:\n", 270 | "[[0. 0. 0. 0.]\n", 271 | " [0. 0. 0. 0.]\n", 272 | " [0. 0. 0. 0.]\n", 273 | " [0. 0. 0. 0.]]\n" 274 | ] 275 | } 276 | ] 277 | } 278 | ] 279 | } --------------------------------------------------------------------------------