├── README.md ├── bash-data-cleaning ├── clean_user_data.sh ├── output.md └── users.csv ├── build-with-python └── data-cleaning-n-validation-pipeline │ └── main.py ├── containerizing-python-apps └── currency-api │ ├── .dockerignore │ ├── Dockerfile │ ├── main.py │ └── requirements.txt ├── data-cleaning-essential-guide ├── README.md └── messy_data.csv ├── data-cleaning ├── README.md ├── generate_df.py ├── pandas_data_cleaning_one_liners.ipynb └── pandas_eda_one_liners.ipynb ├── data-science-app ├── README.md ├── app.py └── model_training.py ├── deploy_ml_models ├── Dockerfile ├── README.md ├── app.py └── linear_regression.py ├── docker ├── README.md ├── docker-volume │ └── volume-postgres.md ├── leverage-build-cache │ └── README.md └── minimal-img-python-apps │ ├── Dockerfile │ ├── README.md │ ├── app.py │ └── requirements.txt ├── duckdb-json ├── ecommerce_data.json ├── output.md └── query_json.sql ├── duckdb-miniseries ├── README.md ├── analyze-csv │ ├── query_csv.sql │ ├── query_csv_op.md │ └── shopping_data.csv ├── analyze-pandas-dataframes │ ├── DuckDB_pandas_df.ipynb │ └── README.md ├── analyze-parquet │ ├── query_parquet.sql │ ├── query_parquet_op.md │ └── restaurant_orders.parquet ├── descriptive-statistics │ ├── DuckDB_descriptive_stats.ipynb │ ├── README.md │ └── cab_ride_data.csv └── hypothesis-testing │ ├── DuckDB_hypothesis_testing.ipynb │ ├── README.md │ └── cab_ride_data.csv ├── duckdb ├── README.md ├── generate_csv.py └── main.py ├── fastapi-docker-for-ml-model-deployment └── diabetes-predictor │ ├── app │ ├── __init__.py │ └── main.py │ └── train_model.py ├── fastapi ├── README.md └── main.py ├── machine-learning ├── Feature_Engineering_Tips.ipynb ├── HyperparameterTuning.ipynb ├── LinearRegressionExample.ipynb ├── LogisticRegressionExample.ipynb └── README.md ├── model_deployment ├── Dockerfile ├── README.md ├── app │ ├── __init__.py │ └── main.py ├── model_training.py └── requirements.txt ├── natural-language-processing ├── README.md └── nlp_with_python.ipynb ├── pandas ├── 5_steps_data_cleaning.ipynb ├── README.md ├── common_pandas_errors.ipynb ├── data_cleaning_with_pandas.ipynb ├── pandas_data_quality_checks_one_liners.ipynb └── pandas_plotting_functions.ipynb ├── postgres └── README.md ├── pyspark ├── README.md ├── pyspark_data_cleaning.ipynb ├── pyspark_read_csv.ipynb └── pyspark_write_parquet.ipynb ├── regex ├── learn_regex.py ├── quick-ref-regex.md ├── regex_basics.ipynb ├── regex_contd.ipynb └── regex_examples.ipynb ├── statistical_plots.ipynb └── statistics ├── Basic_Stats_Functions_Python.ipynb ├── DescriptiveStats.ipynb ├── DescriptiveStats[final].ipynb ├── Outlier_Detection_Tips.ipynb ├── README.md ├── Stats_Libraries.ipynb ├── Visualizing_Statistical_Data.ipynb ├── handle_excel_files.ipynb ├── outlier_detection_techniques.ipynb ├── probability ├── README.md ├── beta_distribution.ipynb ├── cauchy_distribution.ipynb ├── geometric_distribution.ipynb ├── joint_and_conditional_pbty.ipynb └── poisson_distribution.ipynb ├── process_csv_files.ipynb ├── scipy_time_series_analysis.ipynb ├── sparse_data_analysis.ipynb ├── sparse_data_analysis_v0_1.ipynb └── time_series_decomposition.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # Data Science Tutorials 2 | > If you're coming from one of my data science tutorials, you'll find the code and the links to the tutorials here. 3 | I hope you find them helpful. Happy learning and coding! 
4 | 5 | data-science-tutorials 6 | 7 | 8 | 9 | |No.| Article| Code| 10 | |----|----|------| 11 | |1|[Build a Data Science App with Python in 10 Easy Steps](https://www.kdnuggets.com/build-data-science-app-with-python-10-easy-steps)|[Code](https://github.com/balapriyac/data-science-tutorials/tree/main/data-science-app)| 12 | |2|[A Practical Guide to Deploying Machine Learning Models](https://machinelearningmastery.com/a-practical-guide-to-deploying-machine-learning-models/)|[Code](https://github.com/balapriyac/data-science-tutorials/tree/main/model_deployment)| 13 | |3|[FastAPI Tutorial: Build APIs with Python in Minutes](https://www.kdnuggets.com/fastapi-tutorial-build-apis-with-python-in-minutes)|[Code](https://github.com/balapriyac/data-science-tutorials/tree/main/fastapi)| 14 | |4|[The Beginner’s Guide to Natural Language Processing with Python](https://machinelearningmastery.com/the-beginners-guide-to-natural-language-processing-with-python/)|[Code](https://github.com/balapriyac/data-science-tutorials/tree/main/natural-language-processing)| 15 | |5|[How to Perform Statistical Analysis on Sparse Data in Python](https://www.statology.org/how-to-perform-statistical-analysis-sparse-data-python/)|[Code](https://github.com/balapriyac/data-science-tutorials/blob/main/statistics/sparse_data_analysis_v0_1.ipynb)| 16 | |6|[10 Essential Statistical Functions in Python](https://www.statology.org/10-essential-statistical-functions-in-python/)|[Code](https://github.com/balapriyac/data-science-tutorials/blob/main/statistics/Basic_Stats_Functions_Python.ipynb)| 17 | |7|[How to Calculate Joint and Conditional Probabilities in Python](https://www.statology.org/how-to-calculate-joint-and-conditional-probabilities-in-python/)|[Code]()| 18 | |8|[How to Use the Geometric Distribution in Python](https://www.statology.org/how-to-use-the-geometric-distribution-in-python/)|[Code](https://github.com/balapriyac/data-science-tutorials/blob/main/statistics/probability/geometric_distribution.ipynb)| 19 | |9|[How to Use the Beta Distribution in Python](https://www.statology.org/how-to-use-the-beta-distribution-in-python/)|[Code](https://github.com/balapriyac/data-science-tutorials/blob/main/statistics/probability/beta_distribution.ipynb)| 20 | |10|[How to Use the Cauchy Distribution in Python](https://www.statology.org/how-to-use-the-cauchy-distribution-in-python/)|[Code](https://github.com/balapriyac/data-science-tutorials/blob/main/statistics/probability/cauchy_distribution.ipynb)| 21 | |11|[Tips for Effective Outlier Detection in Real-World Datasets](https://www.statology.org/tips-for-effective-outlier-detection-in-real-world-datasets/)|[Code](https://github.com/balapriyac/data-science-tutorials/blob/main/statistics/Outlier_Detection_Tips.ipynb)| 22 | |12|[How to Interpret Statistical Plots in Python](https://www.statology.org/how-to-interpret-statistical-plots-python/)|[Code](https://github.com/balapriyac/data-science-tutorials/blob/main/statistical_plots.ipynb)| 23 | |13|[Data Cleaning with Bash: A Handbook for Developers](https://www.kdnuggets.com/data-cleaning-with-bash-a-handbook-for-developers)|[Code](https://github.com/balapriyac/data-science-tutorials/tree/main/bash-data-cleaning)| 24 | |14| [Analyzing JSON Data with DuckDB & SQL](https://www.kdnuggets.com/analyzing-json-data-with-duckdb-sql)|[Code](https://github.com/balapriyac/data-science-tutorials/tree/main/duckdb-json)| 25 | |15| [The Poisson Distribution: From Basics to Real-World 
Examples](https://www.statology.org/the-poisson-distribution-from-basics-to-real-world-examples/)|[Code](https://github.com/balapriyac/data-science-tutorials/blob/main/statistics/probability/poisson_distribution.ipynb)| 26 | |16|[Why & How to Containerize Your Existing Python Apps](https://www.kdnuggets.com/why-how-to-containerize-your-existing-python-apps)|[Code](https://github.com/balapriyac/data-science-tutorials/tree/main/containerizing-python-apps/currency-api)| 27 | |17|[How to Analyze Parquet Files with DuckDB](https://www.statology.org/how-to-analyze-parquet-files-with-duckdb/)|[Code](https://github.com/balapriyac/data-science-tutorials/tree/main/duckdb-miniseries/analyze-parquet)| 28 | |18|[How to Analyze CSV Files with DuckDB](https://www.statology.org/how-to-analyze-csv-files-with-duckdb/)|[Code](https://github.com/balapriyac/data-science-tutorials/tree/main/duckdb-miniseries/analyze-csv)| 29 | |19|[How to Query Pandas DataFrames with DuckDB](https://www.statology.org/how-to-query-pandas-dataframes-with-duckdb/)|[Code](https://github.com/balapriyac/data-science-tutorials/tree/main/duckdb-miniseries/analyze-pandas-dataframes)| 30 | |20|[How to Calculate Descriptive Statistics in DuckDB](https://www.statology.org/how-to-calculate-descriptive-statistics-in-duckdb/)|[Code](https://github.com/balapriyac/data-science-tutorials/tree/main/duckdb-miniseries/descriptive-statistics)| 31 | |21|[How to Perform Hypothesis Testing in DuckDB](https://www.statology.org/how-to-perform-hypothesis-testing-in-duckdb/)|[Code](https://github.com/balapriyac/data-science-tutorials/tree/main/duckdb-miniseries/hypothesis-testing)| 32 | |22|[Top 5 Statistical Techniques to Detect and Handle Outliers in Data](https://www.statology.org/top-5-statistical-techniques-detect-handle-outliers-data/)|[Code](https://github.com/balapriyac/data-science-tutorials/blob/main/statistics/outlier_detection_techniques.ipynb)| 33 | 34 | 35 | -------------------------------------------------------------------------------- /bash-data-cleaning/clean_user_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Define input and output files 4 | INPUT_FILE="users.csv" 5 | OUTPUT_FILE="users_cleaned.csv" 6 | TEMP_FILE="temp.csv" 7 | 8 | echo "Starting data cleaning process..." 9 | 10 | # Step 1: Handle Missing Values 11 | echo "Step 1: Handling missing values..." 12 | sed 's/,,/,NULL,/g; s/,$/,NULL/g' $INPUT_FILE > $OUTPUT_FILE 13 | 14 | # Step 2: Fix Missing First Names 15 | echo "Step 2: Fixing missing first names..." 16 | awk -F, 'BEGIN {OFS=","} {if ($2 == "" || $2 == "NULL") $2 = "Unknown"; print}' $OUTPUT_FILE > $TEMP_FILE 17 | mv $TEMP_FILE $OUTPUT_FILE 18 | 19 | # Step 3: Fix Invalid Email Formats 20 | echo "Step 3: Fixing invalid email formats..." 21 | awk -F, 'BEGIN {OFS=","} {if ($3 !~ /@/ || $3 == "" || $3 == "NULL" || $3 == "not_an_email") $3 = "unknown@example.com"; print}' $OUTPUT_FILE > $TEMP_FILE 22 | mv $TEMP_FILE $OUTPUT_FILE 23 | 24 | # Step 4: Correct Date Formats 25 | echo "Step 4: Correcting date formats..." 26 | awk -F, 'BEGIN {OFS=","} {if ($5 == "invalid_date" || $5 == "" || $5 == "NULL") $5 = "2023-01-20"; print}' $OUTPUT_FILE > $TEMP_FILE 27 | mv $TEMP_FILE $OUTPUT_FILE 28 | 29 | # Step 5: Ensure Last Login Date is Valid 30 | echo "Step 5: Ensuring last login date is valid..." 
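# Field 6 (last_login): fill empty or NULL values with a default date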
31 | awk -F, 'BEGIN {OFS=","} {if ($6 == "" || $6 == "NULL") $6 = "2023-03-23"; print}' $OUTPUT_FILE > $TEMP_FILE 32 | mv $TEMP_FILE $OUTPUT_FILE 33 | 34 | # Step 6: Handle Negative Values 35 | echo "Step 6: Handling negative values..." 36 | awk -F, 'BEGIN {OFS=","} {if ($7 < 0) $7 = 0; print}' $OUTPUT_FILE > $TEMP_FILE 37 | mv $TEMP_FILE $OUTPUT_FILE 38 | 39 | # Validation checks 40 | echo "Running validation checks..." 41 | 42 | # Check for empty fields 43 | EMPTY_FIELDS=$(grep -c ",," $OUTPUT_FILE) 44 | echo "Empty fields remaining: $EMPTY_FIELDS" 45 | 46 | # Check for invalid emails 47 | INVALID_EMAILS=$(grep -v -E '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}' $OUTPUT_FILE | grep -v "email" | wc -l) 48 | echo "Invalid emails remaining: $INVALID_EMAILS" 49 | 50 | # Check for invalid dates 51 | INVALID_DATES=$(grep -v -E '[0-9]{4}-[0-9]{2}-[0-9]{2}' $OUTPUT_FILE | grep -v "signup_date" | wc -l) 52 | echo "Invalid dates remaining: $INVALID_DATES" 53 | 54 | # Check for negative values 55 | NEGATIVE_VALUES=$(awk -F, '$7 < 0 {print}' $OUTPUT_FILE | wc -l) 56 | echo "Negative values remaining: $NEGATIVE_VALUES" 57 | 58 | echo "Data cleaning complete. Cleaned data saved to $OUTPUT_FILE" 59 | 60 | # Optional: Remove temporary file if it exists 61 | if [ -f "$TEMP_FILE" ]; then 62 | rm $TEMP_FILE 63 | fi 64 | -------------------------------------------------------------------------------- /bash-data-cleaning/output.md: -------------------------------------------------------------------------------- 1 | ``` 2 | $ head users.csv 3 | id,first_name,last_name,email,signup_date,last_login,purchase_amount 4 | 1,John,Smith,john.smith@example.com,2023-01-15,2023-03-20,125.99 5 | 2,Jane,Doe,jane.doe@example.com,2023-01-16,2023-03-21,210.50 6 | 3,Bob,Johnson,bob@example.com,2023-01-17,2023-03-22,0 7 | 4,Alice,Williams,alice.williams@example.com,2023-01-18,,75.25 8 | 5,,Brown,mike.brown@example.com,2023-01-19,2023-03-24,150.75 9 | 6,Sarah,Miller,sarah.miller@example.com,invalid_date,2023-03-25,95.00 10 | 7,David,Jones,david.jones@example.com,2023-01-21,2023-03-26,300.00 11 | 8,Lisa,Garcia,lisa.garcia@example.com,2023-01-22,2023-03-27,-50.00 12 | 9,James,Martinez,mymail@example.com,2023-01-23,2023-03-28,125.00 13 | ``` 14 | 15 | ``` 16 | $ head -n 5 users.csv 17 | id,first_name,last_name,email,signup_date,last_login,purchase_amount 18 | 1,John,Smith,john.smith@example.com,2023-01-15,2023-03-20,125.99 19 | 2,Jane,Doe,jane.doe@example.com,2023-01-16,2023-03-21,210.50 20 | 3,Bob,Johnson,bob@example.com,2023-01-17,2023-03-22,0 21 | 4,Alice,Williams,alice.williams@example.com,2023-01-18,,75.25 22 | ``` 23 | ``` 24 | $ grep -c ",," users.csv 25 | 2 26 | ``` 27 | 28 | ``` 29 | grep -n ",," users.csv 30 | 5:4,Alice,Williams,alice.williams@example.com,2023-01-18,,75.25 31 | 6:5,,Brown,mike.brown@example.com,2023-01-19,2023-03-24,150.75 32 | ``` 33 | 34 | ``` 35 | $ grep -v -E '^[0-9]{4}-[0-9]{2}-[0-9]{2}$' users.csv | grep "invalid_date" 36 | 6,Sarah,Miller,sarah.miller@example.com,invalid_date,2023-03-25,95.00 37 | 38 | ``` 39 | 40 | ``` 41 | $ awk -F, '$7 < 0 {print $0}' users.csv 42 | 8,Lisa,Garcia,lisa.garcia@example.com,2023-01-22,2023-03-27,-50.00 43 | ``` 44 | 45 | ``` 46 | $ awk -F, 'NR>1 {sum += $7} END {print "Total purchases: $" sum}' users_cleaned.csv 47 | ``` 48 | ``` 49 | Total purchases: $1282.49 50 | ``` 51 | 52 | ``` 53 | balapriya@balapriya-82C4:~/bash-data-cleaning$ awk -F, 'NR>1 {sum += $7; count++} END {print "Average purchase: $" sum/count}' users_cleaned.csv 54 | Average purchase: $128.249 
55 | ``` 56 | ``` 57 | $ awk -F, 'NR>1 { 58 | split($5, date, "-"); 59 | months[date[2]]++; 60 | } 61 | END { 62 | for (month in months) { 63 | print "Month " month ": " months[month] " users" 64 | } 65 | }' users_cleaned.csv 66 | Month 01: 10 users 67 | ``` 68 | -------------------------------------------------------------------------------- /bash-data-cleaning/users.csv: -------------------------------------------------------------------------------- 1 | id,first_name,last_name,email,signup_date,last_login,purchase_amount 2 | 1,John,Smith,john.smith@example.com,2023-01-15,2023-03-20,125.99 3 | 2,Jane,Doe,jane.doe@example.com,2023-01-16,2023-03-21,210.50 4 | 3,Bob,Johnson,bob@example.com,2023-01-17,2023-03-22,0 5 | 4,Alice,Williams,alice.williams@example.com,2023-01-18,,75.25 6 | 5,,Brown,mike.brown@example.com,2023-01-19,2023-03-24,150.75 7 | 6,Sarah,Miller,sarah.miller@example.com,invalid_date,2023-03-25,95.00 8 | 7,David,Jones,david.jones@example.com,2023-01-21,2023-03-26,300.00 9 | 8,Lisa,Garcia,lisa.garcia@example.com,2023-01-22,2023-03-27,-50.00 10 | 9,James,Martinez,mymail@example.com,2023-01-23,2023-03-28,125.00 11 | 10,Emily,Anderson,emily.anderson@example.com,2023-01-24,2023-03-29,200.00 12 | -------------------------------------------------------------------------------- /build-with-python/data-cleaning-n-validation-pipeline/main.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from pydantic import BaseModel, ValidationError, field_validator 4 | from typing import Optional, List, Dict, Any 5 | 6 | class DataValidator(BaseModel): 7 | """Pydantic model for data validation""" 8 | name: str 9 | age: Optional[int] = None 10 | email: Optional[str] = None 11 | salary: Optional[float] = None 12 | 13 | @field_validator('age') 14 | @classmethod 15 | def validate_age(cls, v): 16 | if v is not None and (v < 0 or v > 120): 17 | raise ValueError('Age must be between 0 and 120') 18 | return v 19 | 20 | @field_validator('email') 21 | @classmethod 22 | def validate_email(cls, v): 23 | if v and '@' not in v: 24 | raise ValueError('Invalid email format') 25 | return v 26 | 27 | class DataPipeline: 28 | def __init__(self): 29 | self.cleaning_stats = {'duplicates_removed': 0, 'nulls_handled': 0, 'validation_errors': 0} 30 | 31 | def clean_data(self, df: pd.DataFrame) -> pd.DataFrame: 32 | """Clean the dataset by handling duplicates and missing values""" 33 | initial_rows = len(df) 34 | 35 | # Remove duplicates 36 | df = df.drop_duplicates() 37 | self.cleaning_stats['duplicates_removed'] = initial_rows - len(df) 38 | 39 | # Handle missing values 40 | numeric_columns = df.select_dtypes(include=[np.number]).columns 41 | df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].median()) 42 | 43 | string_columns = df.select_dtypes(include=['object']).columns 44 | df[string_columns] = df[string_columns].fillna('Unknown') 45 | 46 | self.cleaning_stats['nulls_handled'] = df.isnull().sum().sum() 47 | return df 48 | 49 | def validate_data(self, df: pd.DataFrame) -> pd.DataFrame: 50 | """Validate each row using Pydantic model""" 51 | valid_rows = [] 52 | errors = [] 53 | 54 | for idx, row in df.iterrows(): 55 | try: 56 | validated_row = DataValidator(**row.to_dict()) 57 | valid_rows.append(validated_row.model_dump()) 58 | except ValidationError as e: 59 | errors.append({'row': idx, 'errors': str(e)}) 60 | 61 | self.cleaning_stats['validation_errors'] = len(errors) 62 | return pd.DataFrame(valid_rows), errors 63 
| 64 | def process(self, df: pd.DataFrame) -> Dict[str, Any]: 65 | """Main pipeline method""" 66 | cleaned_df = self.clean_data(df.copy()) 67 | validated_df, validation_errors = self.validate_data(cleaned_df) 68 | 69 | return { 70 | 'cleaned_data': validated_df, 71 | 'validation_errors': validation_errors, 72 | 'stats': self.cleaning_stats 73 | } 74 | 75 | 76 | # Example usage 77 | if __name__ == "__main__": 78 | # Sample messy data 79 | sample_data = pd.DataFrame({ 80 | 'name': ['Tara Jamison', 'Jane Smith', 'Lucy Lee', None, 'Clara Clark','Jane Smith'], 81 | 'age': [25, -5, 25, 35, 150,-5], 82 | 'email': ['taraj@email.com', 'invalid-email', 'lucy@email.com', 'jane@email.com', 'clara@email.com','invalid-email'], 83 | 'salary': [50000, 60000, 50000, None, 75000,60000] 84 | }) 85 | 86 | pipeline = DataPipeline() 87 | result = pipeline.process(sample_data) 88 | 89 | print("Cleaned Data:") 90 | print(result['cleaned_data']) 91 | print(f"\nStats: {result['stats']}") 92 | print(f"Validation Errors: {len(result['validation_errors'])}") 93 | -------------------------------------------------------------------------------- /containerizing-python-apps/currency-api/.dockerignore: -------------------------------------------------------------------------------- 1 | # Virtual environment 2 | venv/ 3 | env/ 4 | ENV/ 5 | 6 | # Python cache files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | *.so 11 | .Python 12 | 13 | # Distribution / packaging 14 | dist/ 15 | build/ 16 | *.egg-info/ 17 | 18 | # Unit test / coverage reports 19 | htmlcov/ 20 | .tox/ 21 | .coverage 22 | .coverage.* 23 | .cache 24 | nosetests.xml 25 | coverage.xml 26 | *.cover 27 | 28 | # Environments 29 | .env 30 | .venv 31 | 32 | # IDE specific files 33 | .idea/ 34 | .vscode/ 35 | *.swp 36 | *.swo 37 | 38 | # Local development files 39 | .DS_Store 40 | -------------------------------------------------------------------------------- /containerizing-python-apps/currency-api/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use official Python image 2 | FROM python:3.11-slim 3 | # Set work directory 4 | WORKDIR /app 5 | # Install dependencies 6 | COPY requirements.txt . 7 | RUN pip install --no-cache-dir -r requirements.txt 8 | # Copy app code 9 | COPY . . 10 | # Expose port 11 | EXPOSE 8000 12 | # Run the app 13 | CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] 14 | -------------------------------------------------------------------------------- /containerizing-python-apps/currency-api/main.py: -------------------------------------------------------------------------------- 1 | from fastapi import FastAPI, HTTPException, Query 2 | from pydantic import BaseModel 3 | from typing import Literal 4 | 5 | app = FastAPI(title="Currency Converter") 6 | 7 | 8 | class ConversionResponse(BaseModel): 9 | from_currency: str 10 | to_currency: str 11 | amount: float 12 | converted: float 13 | rate: float 14 | 15 | 16 | mock_rates = { 17 | ("USD", "EUR"): 0.91, 18 | ("EUR", "USD"): 1.10, 19 | ("USD", "JPY"): 145.0, 20 | } 21 | 22 | 23 | @app.get("/convert", response_model=ConversionResponse) 24 | def convert( 25 | amount: float = Query(..., gt=0), 26 | from_currency: Literal["USD", "EUR"] = "USD", 27 | to_currency: Literal["USD", "EUR", "JPY"] = "EUR", 28 | ): 29 | if from_currency == to_currency: 30 | raise HTTPException( 31 | status_code=400, detail="From and to currencies must differ." 
32 | ) 33 | 34 | rate = mock_rates.get((from_currency, to_currency)) 35 | if not rate: 36 | raise HTTPException(status_code=400, detail="Conversion rate not available.") 37 | 38 | converted = amount * rate 39 | return ConversionResponse( 40 | from_currency=from_currency, 41 | to_currency=to_currency, 42 | amount=amount, 43 | converted=round(converted, 2), 44 | rate=rate, 45 | ) 46 | 47 | -------------------------------------------------------------------------------- /containerizing-python-apps/currency-api/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi 2 | uvicorn 3 | -------------------------------------------------------------------------------- /data-cleaning-essential-guide/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /data-cleaning-essential-guide/messy_data.csv: -------------------------------------------------------------------------------- 1 | id,name,age,gender,email,income,job_title,department,start_date,education,customer_rating,comments,phone_number,country,purchase_amount 2 | 1,john smith,34,M,john.smith@email.com,75000,Sr. Developer,IT,2019-03-15,Bachelor's,4.5,Loyal customer since 2019,555-123-4567,USA,1250.99 3 | 2,Jane Doe,28,F,jane.doe@email.com,82000,Senior Developer,Engineering,2020-01-10,Masters,4.8,Has premium membership,555-987-6543,united states,879.50 4 | 3,Bob Johnson,,Male,bob.j@email,65000,Data Analyst,Analytics,2018-05-20,bachelor,3.2,occasional buyer,5551234567,US,450 5 | 4,,42,M,mike.wilson@email.com,92000,Project Manager,IT,2017-11-05,PHD,4.2,,555-555-5555,United States,1500 6 | 5,Sarah Williams,31,female,sarah.w@email.com,,UX Designer,Design,2021-02-28,Masters Degree,4.9,Very satisfied with service,555-789-0123,usa,2100.75 7 | 6,Alex Brown,38,M,alex@email.com,78500,senior developer,IT,2019-08-12,Bachelors,,"Left a positive review, potential for upsell",555.111.2222,United States,950.25 8 | 7,Emily Wilson,27,F,emily@incomplete,69000,Data Scientist,Analytics,2020/07/15,PHD,4.6,New customer - first purchase,5559876543,US,780 9 | 8,Jane Doe,29,Female,different.jane@email.com,81000,Sr. 
Developer,Engineering,2020-01-10,Masters,4.5,Duplicate customer record,555-987-6543,USA,1200 10 | 9,Chris Martin,45,M,chris.martin@email.com,-5000,Marketing Specialist,Marketing,2022-04-18,bachelor's degree,3.8,Returned last purchase,555-123-7890,U.S.A.,350.99 11 | 10,Lisa Johnson,33,F,lisa.j@email.com,88000,Product Manager,Product,1/15/2021,MBA,6.2,High-value customer,555-444-3333,United States,3500 12 | 11,David Thompson,51,Male,david.t@email.com,110000,Director,Executive,2015-09-30,masters degree,4.1,Prefers phone contact,555)-222-1111,us,2700.50 13 | 12,Jennifer Clark,36,F,jennifer@email,72000,HR Specialist,Human Resources,2018-12-05,Bachelors,3.9,Seasonal buyer,555*789*4561,USA,575.25 14 | 13,Michael Scott,44,M,mscott@email.com,80000,Regional Manager,Management,2022/02/10,Ba chelor,2.1,Multiple complaints filed,5551112222,United states,150 15 | 14,amanda king,39,f,amanda.king@email.com,93500,Lead Engineer,Engineering,2017-06-22,PhD,4.7,Technical customer - reads documentation,555-333-9876,usa,1850.75 16 | 15,Ryan Chen,32,M,ryan.chen@email.com,79000,BI Analyst,Analytics,2019-11-18,MSc,,"Contact attempted, no response yet",555-666-7777,United States,0 17 | -------------------------------------------------------------------------------- /data-cleaning/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /data-cleaning/generate_df.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from datetime import datetime, timedelta 4 | 5 | # Set seed for reproducibility 6 | np.random.seed(42) 7 | 8 | # Create a sample dataset of customer orders 9 | n_rows = 1000 10 | 11 | # Generate random dates in the last year 12 | start_date = datetime(2024, 1, 1) 13 | end_date = datetime(2025, 3, 1) 14 | dates = [start_date + timedelta(days=np.random.randint(0, (end_date - start_date).days)) for _ in range(n_rows)] 15 | 16 | # Generate customer IDs with some duplicates and inconsistent formats 17 | customer_formats = ['CUS-{}', 'C{}', 'CUST-{}', 'Customer {}', '{}'] 18 | customer_ids = [np.random.choice(customer_formats).format(np.random.randint(1000, 9999)) for _ in range(n_rows)] 19 | 20 | # Generate email addresses with some errors 21 | email_domains = ['gmail.com', 'yahoo.com', 'outlook.com', 'hotmail.com', 'company.com'] 22 | emails = [] 23 | for i in range(n_rows): 24 | username = f"user{np.random.randint(100, 999)}" 25 | domain = np.random.choice(email_domains) 26 | # Introduce some errors 27 | if np.random.random() < 0.05: # Missing @ symbol 28 | emails.append(f"{username}{domain}") 29 | elif np.random.random() < 0.05: # Extra spaces 30 | emails.append(f" {username}@{domain} ") 31 | elif np.random.random() < 0.05: # Typos 32 | emails.append(f"{username}@{domain.replace('com', 'cm')}") 33 | else: 34 | emails.append(f"{username}@{domain}") 35 | 36 | # Generate product IDs with some missing values 37 | product_ids = [f"PROD-{np.random.randint(100, 999)}" if np.random.random() > 0.03 else np.nan for _ in range(n_rows)] 38 | 39 | # Generate quantities with some outliers 40 | quantities = [np.random.randint(1, 10) if np.random.random() > 0.02 else np.random.randint(100, 1000) for _ in range(n_rows)] 41 | 42 | # Generate prices with some negative values and inconsistent formats 43 | prices = [] 44 | for _ in range(n_rows): 45 | price = np.random.uniform(9.99, 199.99) 46 | if np.random.random() < 0.02: 
# Negative price 47 | price = -price 48 | if np.random.random() < 0.1: # String format 49 | prices.append(f"${price:.2f}") 50 | elif np.random.random() < 0.1: # Integer format 51 | prices.append(int(price)) 52 | else: 53 | prices.append(price) 54 | 55 | # Generate shipping status with some inconsistent values 56 | status_options = ['Shipped', 'shipped', 'SHIPPED', 'In Transit', 'in transit', 'In-Transit', 'Delivered', 'delivered', 'DELIVERED', 'Pending', 'pending'] 57 | shipping_status = [np.random.choice(status_options) for _ in range(n_rows)] 58 | 59 | # Create the DataFrame 60 | df = pd.DataFrame({ 61 | 'order_date': dates, 62 | 'customer_id': customer_ids, 63 | 'email': emails, 64 | 'product_id': product_ids, 65 | 'quantity': quantities, 66 | 'price': prices, 67 | 'shipping_status': shipping_status 68 | }) 69 | 70 | # Add some completely blank rows 71 | blank_indices = np.random.choice(range(n_rows), size=5, replace=False) 72 | for idx in blank_indices: 73 | df.loc[idx, :] = np.nan 74 | 75 | # Add some duplicate rows 76 | dup_indices = np.random.choice(range(n_rows), size=10, replace=False) 77 | df = pd.concat([df, df.loc[dup_indices]], ignore_index=True) 78 | 79 | # Print the first few rows to see the data 80 | print(df.head()) 81 | -------------------------------------------------------------------------------- /data-science-app/README.md: -------------------------------------------------------------------------------- 1 | ## Building a Simple Data Science App 2 | 3 | Install the required libraries in a virtual environment for the project: 4 | ``` 5 | $ pip3 install fastapi uvicorn scikit-learn pandas 6 | ``` 7 | 8 | Run `model_training.py` to train the logistic regression model. 9 | 10 | To run the FastAPI app, use: 11 | ``` 12 | $ uvicorn app:app --reload 13 | ``` 14 | Use curl to send POST requests to the `/predict` endpoint. 
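For example, with the app running at the default `http://127.0.0.1:8000`, a request might look like this (the field names come from the `WineFeatures` model in `app.py`; the feature values below are only illustrative numbers, not a real sample):
```
$ curl -X POST http://127.0.0.1:8000/predict \
  -H "Content-Type: application/json" \
  -d '{"alcohol": 13.0, "malic_acid": 2.0, "ash": 2.4, "alcalinity_of_ash": 19.0, "magnesium": 100.0, "total_phenols": 2.5, "flavanoids": 2.3, "nonflavanoid_phenols": 0.3, "proanthocyanins": 1.6, "color_intensity": 5.0, "hue": 1.0, "od280_od315_of_diluted_wines": 3.0, "proline": 750.0}'
```
The endpoint responds with the predicted wine class as JSON, e.g. `{"prediction": 0}`.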
15 | -------------------------------------------------------------------------------- /data-science-app/app.py: -------------------------------------------------------------------------------- 1 | from fastapi import FastAPI 2 | from pydantic import BaseModel 3 | import pickle 4 | 5 | app = FastAPI() 6 | 7 | @app.get("/") 8 | def read_root(): 9 | return {"message": "A Simple Prediction API"} 10 | 11 | def load_model(): 12 | with open('model/classifier.pkl', 'rb') as f: 13 | model = pickle.load(f) 14 | return model 15 | 16 | class WineFeatures(BaseModel): 17 | alcohol: float 18 | malic_acid: float 19 | ash: float 20 | alcalinity_of_ash: float 21 | magnesium: float 22 | total_phenols: float 23 | flavanoids: float 24 | nonflavanoid_phenols: float 25 | proanthocyanins: float 26 | color_intensity: float 27 | hue: float 28 | od280_od315_of_diluted_wines: float 29 | proline: float 30 | 31 | @app.post("/predict") 32 | def predict_wine(features: WineFeatures): 33 | model = load_model() 34 | input_data = [[ 35 | features.alcohol, features.malic_acid, features.ash, features.alcalinity_of_ash, 36 | features.magnesium, features.total_phenols, features.flavanoids, 37 | features.nonflavanoid_phenols, features.proanthocyanins, features.color_intensity, 38 | features.hue, features.od280_od315_of_diluted_wines, features.proline 39 | ]] 40 | 41 | prediction = model.predict(input_data) 42 | return {"prediction": int(prediction[0])} 43 | -------------------------------------------------------------------------------- /data-science-app/model_training.py: -------------------------------------------------------------------------------- 1 | from sklearn.datasets import load_wine 2 | from sklearn.model_selection import train_test_split 3 | from sklearn.preprocessing import StandardScaler 4 | from sklearn.linear_model import LogisticRegression 5 | from sklearn.metrics import accuracy_score 6 | import pandas as pd 7 | import pickle 8 | 9 | def load_wine_data(): 10 | wine_data = load_wine() 11 | df = pd.DataFrame(data=wine_data.data, columns=wine_data.feature_names) 12 | df['target'] = wine_data.target # Adding the target (wine quality class) 13 | return df 14 | 15 | def preprocess_data(df): 16 | X = df.drop('target', axis=1) # Features 17 | y = df['target'] # Target (wine quality) 18 | 19 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=27) 20 | 21 | # Feature scaling 22 | scaler = StandardScaler() 23 | X_train_scaled = scaler.fit_transform(X_train) 24 | X_test_scaled = scaler.transform(X_test) 25 | 26 | return X_train_scaled, X_test_scaled, y_train, y_test 27 | 28 | def train_model(X_train, y_train): 29 | model = LogisticRegression(random_state=42) 30 | model.fit(X_train, y_train) 31 | 32 | # Save the trained model using pickle 33 | with open('classifier.pkl', 'wb') as f: 34 | pickle.dump(model, f) 35 | 36 | return model 37 | 38 | def evaluate_model(model, X_test, y_test): 39 | y_pred = model.predict(X_test) 40 | accuracy = accuracy_score(y_test, y_pred) 41 | print(f"Accuracy: {accuracy:.2f}") 42 | 43 | if __name__ == "__main__": 44 | df = load_wine_data() 45 | X_train_scaled, X_test_scaled, y_train, y_test = preprocess_data(df) 46 | model = train_model(X_train_scaled, y_train) 47 | evaluate_model(model, X_test_scaled, y_test) 48 | -------------------------------------------------------------------------------- /deploy_ml_models/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use Python 3.11 as the base image 2 | FROM 
python:3.11-slim 3 | 4 | # Set the working directory inside the container 5 | WORKDIR /app 6 | 7 | # Copy the application files 8 | COPY app.py . 9 | COPY linear_regression_model.pkl . 10 | COPY requirements.txt . 11 | 12 | # Install the required libraries 13 | RUN pip install -r requirements.txt 14 | 15 | # Expose the port that FastAPI will run on 16 | EXPOSE 8000 17 | 18 | # Command to run the FastAPI app with uvicorn 19 | CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"] 20 | -------------------------------------------------------------------------------- /deploy_ml_models/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /deploy_ml_models/app.py: -------------------------------------------------------------------------------- 1 | from fastapi import FastAPI 2 | import pickle 3 | import pandas as pd 4 | from pydantic import BaseModel 5 | 6 | # Load the saved model using pickle 7 | with open('linear_regression_model.pkl', 'rb') as f: 8 | model = pickle.load(f) 9 | 10 | # Create the FastAPI app 11 | app = FastAPI() 12 | 13 | # Define the input schema for the API 14 | class HousingData(BaseModel): 15 | MedInc: float 16 | HouseAge: float 17 | AveRooms: float 18 | AveOccup: float 19 | 20 | # Define the prediction endpoint 21 | @app.post('/predict') 22 | def predict_price(data: HousingData): 23 | # Convert input data to a DataFrame 24 | input_data = pd.DataFrame([data.dict()]) 25 | 26 | # Make a prediction 27 | prediction = model.predict(input_data) 28 | 29 | # Return the predicted price 30 | return {"Predicted Price": prediction[0]} 31 | -------------------------------------------------------------------------------- /deploy_ml_models/linear_regression.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.datasets import fetch_california_housing 3 | from sklearn.model_selection import train_test_split 4 | from sklearn.linear_model import LinearRegression 5 | import pickle 6 | 7 | # Load the California Housing dataset 8 | data = fetch_california_housing(as_frame=True) 9 | df = pd.DataFrame(data.data, columns=data.feature_names) 10 | df['target'] = data.target 11 | 12 | # Define features and target variable 13 | X = df[['MedInc', 'HouseAge', 'AveRooms', 'AveOccup']] # Selecting some example features 14 | y = df['target'] # Target variable is the housing price (scaled) 15 | 16 | # Split the dataset into training and test sets 17 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 18 | 19 | # Initialize and train the linear regression model 20 | model = LinearRegression() 21 | model.fit(X_train, y_train) 22 | 23 | # Evaluate the model on the test set 24 | score = model.score(X_test, y_test) 25 | print(f"Model R-squared: {score:.4f}") 26 | 27 | # Save the model using pickle 28 | with open('linear_regression_model.pkl', 'wb') as f: 29 | pickle.dump(model, f) 30 | -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /docker/docker-volume/volume-postgres.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- 
/docker/leverage-build-cache/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /docker/minimal-img-python-apps/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use the official lightweight Python 3.11-slim image 2 | FROM python:3.11-slim 3 | 4 | # Set the working directory 5 | WORKDIR /app 6 | 7 | # Install dependencies 8 | COPY requirements.txt requirements.txt 9 | RUN pip install --no-cache-dir -r requirements.txt 10 | 11 | # Copy the current directory contents into the container at /app 12 | COPY . . 13 | 14 | # Expose the port the app runs on 15 | EXPOSE 5000 16 | 17 | # Run the application 18 | CMD ["python3", "app.py"] 19 | -------------------------------------------------------------------------------- /docker/minimal-img-python-apps/README.md: -------------------------------------------------------------------------------- 1 | ## Create Minimal Docker Image for a Sample Python Application 2 | 3 | Your project directory should look like so: 4 | 5 | ``` 6 | / 7 | ├── app.py 8 | ├── Dockerfile 9 | ├── requirements.txt 10 | ``` 11 | 12 | Build the Docker image for the Flask app: 13 | 14 | ```sh 15 | $ docker build -t inventory-app:slim . 16 | ``` 17 | -------------------------------------------------------------------------------- /docker/minimal-img-python-apps/app.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, request, jsonify 2 | 3 | app = Flask(__name__) 4 | 5 | # In-memory database for simplicity 6 | inventory = {} 7 | 8 | @app.route('/inventory', methods=['POST']) 9 | def add_item(): 10 | item = request.get_json() 11 | item_id = item.get('id') 12 | if not item_id: 13 | return jsonify({"error": "Item ID is required"}), 400 14 | if item_id in inventory: 15 | return jsonify({"error": "Item already exists"}), 400 16 | inventory[item_id] = item 17 | return jsonify(item), 201 18 | 19 | @app.route('/inventory/<item_id>', methods=['GET']) 20 | def get_item(item_id): 21 | item = inventory.get(item_id) 22 | if not item: 23 | return jsonify({"error": "Item not found"}), 404 24 | return jsonify(item) 25 | 26 | @app.route('/inventory/<item_id>', methods=['PUT']) 27 | def update_item(item_id): 28 | if item_id not in inventory: 29 | return jsonify({"error": "Item not found"}), 404 30 | updated_item = request.get_json() 31 | inventory[item_id] = updated_item 32 | return jsonify(updated_item) 33 | 34 | @app.route('/inventory/<item_id>', methods=['DELETE']) 35 | def delete_item(item_id): 36 | if item_id not in inventory: 37 | return jsonify({"error": "Item not found"}), 404 38 | del inventory[item_id] 39 | return '', 204 40 | 41 | if __name__ == '__main__': 42 | app.run(host='0.0.0.0', port=5000) 43 | -------------------------------------------------------------------------------- /docker/minimal-img-python-apps/requirements.txt: -------------------------------------------------------------------------------- 1 | Flask==3.0.3 2 | -------------------------------------------------------------------------------- /duckdb-json/ecommerce_data.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "order_id": "ORD-1001", 4 | "customer": { 5 | "id": "CUST-101", 6 | "name": "Alex Johnson", 7 | "email": "alex.j@example.com", 8 | "address": { 9 | "street": "123 Main St", 10 | "city": "Boston", 11 | "state": "MA", 12 | "zip": "02108" 13 | }, 14 
| "loyalty_tier": "gold" 15 | }, 16 | "order_date": "2023-10-15T14:30:00", 17 | "items": [ 18 | { 19 | "product_id": "PROD-501", 20 | "name": "Wireless Headphones", 21 | "category": "Electronics", 22 | "price": 129.99, 23 | "quantity": 1, 24 | "tags": ["bluetooth", "noise-cancelling", "audio"] 25 | }, 26 | { 27 | "product_id": "PROD-245", 28 | "name": "Smartphone Case", 29 | "category": "Accessories", 30 | "price": 24.99, 31 | "quantity": 2, 32 | "tags": ["protective", "smartphone", "silicone"] 33 | } 34 | ], 35 | "payment": { 36 | "method": "credit_card", 37 | "total": 179.97, 38 | "status": "completed" 39 | } 40 | }, 41 | { 42 | "order_id": "ORD-1002", 43 | "customer": { 44 | "id": "CUST-102", 45 | "name": "Sarah Miller", 46 | "email": "sarahm@example.com", 47 | "address": { 48 | "street": "456 Oak Ave", 49 | "city": "Seattle", 50 | "state": "WA", 51 | "zip": "98101" 52 | }, 53 | "loyalty_tier": "silver" 54 | }, 55 | "order_date": "2023-10-16T09:15:00", 56 | "items": [ 57 | { 58 | "product_id": "PROD-103", 59 | "name": "Coffee Maker", 60 | "category": "Kitchen", 61 | "price": 89.99, 62 | "quantity": 1, 63 | "tags": ["appliance", "coffee", "automatic"] 64 | }, 65 | { 66 | "product_id": "PROD-107", 67 | "name": "Coffee Beans Premium Blend", 68 | "category": "Food & Beverage", 69 | "price": 15.99, 70 | "quantity": 3, 71 | "tags": ["coffee", "organic", "fair-trade"] 72 | } 73 | ], 74 | "payment": { 75 | "method": "paypal", 76 | "total": 137.96, 77 | "status": "completed" 78 | } 79 | } 80 | ] 81 | -------------------------------------------------------------------------------- /duckdb-json/output.md: -------------------------------------------------------------------------------- 1 | ``` 2 | ┌──────────┬───┬──────────────────────┬──────────────────────┐ 3 | │ order_id │ … │ items │ payment │ 4 | │ varchar │ │ struct(product_id … │ struct("method" va… │ 5 | ├──────────┼───┼──────────────────────┼──────────────────────┤ 6 | │ ORD-1001 │ … │ [{'product_id': PR… │ {'method': credit_… │ 7 | │ ORD-1002 │ … │ [{'product_id': PR… │ {'method': paypal,… │ 8 | ├──────────┴───┴──────────────────────┴──────────────────────┤ 9 | │ 2 rows 5 columns (3 shown) │ 10 | └────────────────────────────────────────────────────────────┘ 11 | ``` 12 | 13 | ``` 14 | ┌─────────────┐ 15 | │ order_count │ 16 | │ int64 │ 17 | ├─────────────┤ 18 | │ 2 │ 19 | └─────────────┘ 20 | ``` 21 | 22 | ``` 23 | ┌──────────┬───────────────┐ 24 | │ order_id │ customer_name │ 25 | │ varchar │ varchar │ 26 | ├──────────┼───────────────┤ 27 | │ ORD-1001 │ Alex Johnson │ 28 | │ ORD-1002 │ Sarah Miller │ 29 | └──────────┴───────────────┘ 30 | ``` 31 | 32 | ``` 33 | ┌──────────┬───────────────┬─────────┬─────────┐ 34 | │ order_id │ customer_name │ city │ state │ 35 | │ varchar │ varchar │ varchar │ varchar │ 36 | ├──────────┼───────────────┼─────────┼─────────┤ 37 | │ ORD-1001 │ Alex Johnson │ Boston │ MA │ 38 | │ ORD-1002 │ Sarah Miller │ Seattle │ WA │ 39 | └──────────┴───────────────┴─────────┴─────────┘ 40 | ``` 41 | ``` 42 | ┌──────────┬───────────────┐ 43 | │ order_id │ customer_name │ 44 | │ varchar │ varchar │ 45 | ├──────────┼───────────────┤ 46 | │ ORD-1002 │ Sarah Miller │ 47 | └──────────┴───────────────┘ 48 | ``` 49 | 50 | ``` 51 | ┌──────────┬────────────────┬───────────────┐ 52 | │ order_id │ payment_method │ total_amount │ 53 | │ varchar │ varchar │ decimal(18,3) │ 54 | ├──────────┼────────────────┼───────────────┤ 55 | │ ORD-1001 │ credit_card │ 179.970 │ 56 | │ ORD-1002 │ paypal │ 137.960 │ 57 | 
└──────────┴────────────────┴───────────────┘ 58 | ``` 59 | 60 | ``` 61 | ┌──────────┬───────────────┬───┬───────────────┬──────────┐ 62 | │ order_id │ customer_name │ … │ price │ quantity │ 63 | │ varchar │ varchar │ │ decimal(18,3) │ int32 │ 64 | ├──────────┼───────────────┼───┼───────────────┼──────────┤ 65 | │ ORD-1001 │ Alex Johnson │ … │ 129.990 │ 1 │ 66 | │ ORD-1001 │ Alex Johnson │ … │ 24.990 │ 2 │ 67 | │ ORD-1002 │ Sarah Miller │ … │ 89.990 │ 1 │ 68 | │ ORD-1002 │ Sarah Miller │ … │ 15.990 │ 3 │ 69 | ├──────────┴───────────────┴───┴───────────────┴──────────┤ 70 | │ 4 rows 6 columns (4 shown) │ 71 | └─────────────────────────────────────────────────────────┘ 72 | ``` 73 | 74 | ``` 75 | ┌──────────┬───────────────┬───────────────┬────────────┐ 76 | │ order_id │ customer_name │ order_total │ item_count │ 77 | │ varchar │ varchar │ decimal(18,3) │ uint64 │ 78 | ├──────────┼───────────────┼───────────────┼────────────┤ 79 | │ ORD-1001 │ Alex Johnson │ 179.970 │ 2 │ 80 | │ ORD-1002 │ Sarah Miller │ 137.960 │ 2 │ 81 | └──────────┴───────────────┴───────────────┴────────────┘ 82 | ``` 83 | ``` 84 | ┌─────────────────┬───────────┐ 85 | │ category │ avg_price │ 86 | │ varchar │ double │ 87 | ├─────────────────┼───────────┤ 88 | │ Electronics │ 129.99 │ 89 | │ Kitchen │ 89.99 │ 90 | │ Accessories │ 24.99 │ 91 | │ Food & Beverage │ 15.99 │ 92 | └─────────────────┴───────────┘ 93 | ``` 94 | ``` 95 | ┌──────────┬───────────────┬───┬───────────────┬──────────┐ 96 | │ order_id │ customer_name │ … │ price │ quantity │ 97 | │ varchar │ varchar │ │ decimal(18,3) │ int32 │ 98 | ├──────────┼───────────────┼───┼───────────────┼──────────┤ 99 | │ ORD-1001 │ Alex Johnson │ … │ 129.990 │ 1 │ 100 | │ ORD-1001 │ Alex Johnson │ … │ 24.990 │ 2 │ 101 | │ ORD-1002 │ Sarah Miller │ … │ 89.990 │ 1 │ 102 | │ ORD-1002 │ Sarah Miller │ … │ 15.990 │ 3 │ 103 | ├──────────┴───────────────┴───┴───────────────┴──────────┤ 104 | │ 4 rows 6 columns (4 shown) │ 105 | └─────────────────────────────────────────────────────────┘ 106 | ``` 107 | 108 | ``` 109 | ┌──────────┬───────────────┬───────────────────────────────────────────────────┐ 110 | │ order_id │ customer_name │ item │ 111 | │ varchar │ varchar │ struct(product_id varchar, "name" varchar, cate… │ 112 | ├──────────┼───────────────┼───────────────────────────────────────────────────┤ 113 | │ ORD-1001 │ Alex Johnson │ {'product_id': PROD-501, 'name': Wireless Headp… │ 114 | │ ORD-1001 │ Alex Johnson │ {'product_id': PROD-245, 'name': Smartphone Cas… │ 115 | │ ORD-1002 │ Sarah Miller │ {'product_id': PROD-103, 'name': Coffee Maker, … │ 116 | │ ORD-1002 │ Sarah Miller │ {'product_id': PROD-107, 'name': Coffee Beans P… │ 117 | └──────────┴───────────────┴───────────────────────────────────────────────────┘ 118 | ``` 119 | -------------------------------------------------------------------------------- /duckdb-json/query_json.sql: -------------------------------------------------------------------------------- 1 | -- Create a table from the JSON file 2 | CREATE TABLE ecommerce AS 3 | SELECT * FROM read_json_auto('ecommerce_data.json'); 4 | 5 | -- View the data 6 | SELECT * FROM ecommerce; 7 | 8 | -- Count the number of orders 9 | SELECT COUNT(*) AS order_count FROM ecommerce; 10 | 11 | -- Get order IDs and customer names 12 | SELECT 13 | order_id, 14 | customer->>'name' AS customer_name 15 | FROM ecommerce; 16 | 17 | -- Extract customer address information 18 | SELECT 19 | order_id, 20 | customer->>'name' AS customer_name, 21 | customer->'address'->>'city' AS city, 22 | 
customer->'address'->>'state' AS state 23 | FROM ecommerce; 24 | 25 | -- Find orders from customers in Seattle 26 | SELECT 27 | order_id, 28 | customer->>'name' AS customer_name 29 | FROM ecommerce 30 | WHERE customer->'address'->>'city' = 'Seattle'; 31 | 32 | -- Get payment details 33 | SELECT 34 | order_id, 35 | payment->>'method' AS payment_method, 36 | CAST(payment->>'total' AS DECIMAL) AS total_amount 37 | FROM ecommerce; 38 | 39 | -- Unnest the items array into separate rows 40 | SELECT 41 | order_id, 42 | customer->>'name' AS customer_name, 43 | unnest(items) AS item 44 | FROM ecommerce; 45 | 46 | -- Get specific item details 47 | SELECT 48 | order_id, 49 | customer->>'name' AS customer_name, 50 | item->>'name' AS product_name, 51 | item->>'category' AS category, 52 | CAST(item->>'price' AS DECIMAL) AS price, 53 | CAST(item->>'quantity' AS INTEGER) AS quantity 54 | FROM ( 55 | SELECT 56 | order_id, 57 | customer, 58 | unnest(items) AS item 59 | FROM ecommerce 60 | ) AS unnested_items; 61 | 62 | -- Calculate total value of each order 63 | SELECT 64 | order_id, 65 | customer->>'name' AS customer_name, 66 | CAST(payment->>'total' AS DECIMAL) AS order_total, 67 | json_array_length(items) AS item_count 68 | FROM ecommerce; 69 | 70 | -- Calculate average price by product category 71 | SELECT 72 | item->>'category' AS category, 73 | AVG(CAST(item->>'price' AS DECIMAL)) AS avg_price 74 | FROM ( 75 | SELECT unnest(items) AS item 76 | FROM ecommerce 77 | ) AS unnested_items 78 | GROUP BY category 79 | ORDER BY avg_price DESC; 80 | 81 | -------------------------------------------------------------------------------- /duckdb-miniseries/README.md: -------------------------------------------------------------------------------- 1 | This directory contains the code and notebooks for a DuckDB mini-series I've been working on. The mini-series focuses on working with common data formats and performing basic statistical analysis with DuckDB. 
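The `.sql` files in each subdirectory can be piped straight into the DuckDB CLI. For example, to run the CSV queries (assuming the DuckDB CLI is installed; run it from the matching subdirectory so the relative data paths resolve):
```sh
$ cd analyze-csv
$ duckdb < query_csv.sql
```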
2 | 3 | - [How to Analyze Parquet Files with DuckDB](https://www.statology.org/how-to-analyze-parquet-files-with-duckdb/) 4 | - [How to Analyze CSV Files with DuckDB](https://www.statology.org/how-to-analyze-csv-files-with-duckdb/) 5 | - [How to Query Pandas DataFrames with DuckDB](https://www.statology.org/how-to-query-pandas-dataframes-with-duckdb/) 6 | - [How to Calculate Descriptive Statistics in DuckDB](https://www.statology.org/how-to-calculate-descriptive-statistics-in-duckdb/) 7 | - [How to Perform Hypothesis Testing in DuckDB](https://www.statology.org/how-to-perform-hypothesis-testing-in-duckdb/) 8 | -------------------------------------------------------------------------------- /duckdb-miniseries/analyze-csv/query_csv.sql: -------------------------------------------------------------------------------- 1 | 2 | SELECT * FROM read_csv('shopping_data.csv') LIMIT 5; 3 | 4 | DESCRIBE SELECT * FROM read_csv('shopping_data.csv') LIMIT 5; 5 | 6 | SELECT 7 | MIN(age) AS min_age, MAX(age) AS max_age, AVG(age) AS avg_age, 8 | MIN(purchase_amount) AS min_purchase, MAX(purchase_amount) AS max_purchase, AVG(purchase_amount) AS avg_purchase 9 | FROM read_csv('shopping_data.csv'); 10 | 11 | SELECT customer_name, age, purchase_amount, category 12 | FROM read_csv_auto('shopping_data.csv') 13 | WHERE purchase_amount > 200 14 | ORDER BY purchase_amount DESC; 15 | 16 | SELECT category, COUNT(*) AS total_purchases, SUM(purchase_amount) AS total_sales, AVG(purchase_amount) AS avg_spent 17 | FROM read_csv_auto('shopping_data.csv') 18 | GROUP BY category 19 | ORDER BY total_sales DESC; 20 | 21 | -------------------------------------------------------------------------------- /duckdb-miniseries/analyze-csv/query_csv_op.md: -------------------------------------------------------------------------------- 1 | ``` 2 | ┌─────────────┬───────────────┬───────┬───┬───────────────┬────────────────┐ 3 | │ customer_id │ customer_name │ age │ … │ purchase_date │ payment_method │ 4 | │ int64 │ varchar │ int64 │ │ date │ varchar │ 5 | ├─────────────┼───────────────┼───────┼───┼───────────────┼────────────────┤ 6 | │ 1 │ Customer 1 │ 56 │ … │ 2024-01-01 │ PayPal │ 7 | │ 2 │ Customer 2 │ 46 │ … │ 2024-01-02 │ Credit Card │ 8 | │ 3 │ Customer 3 │ 32 │ … │ 2024-01-03 │ Cash │ 9 | │ 4 │ Customer 4 │ 60 │ … │ 2024-01-04 │ Cash │ 10 | │ 5 │ Customer 5 │ 25 │ … │ 2024-01-05 │ Debit Card │ 11 | ├─────────────┴───────────────┴───────┴───┴───────────────┴────────────────┤ 12 | │ 5 rows 8 columns (5 shown) │ 13 | └──────────────────────────────────────────────────────────────────────────┘ 14 | ``` 15 | 16 | ``` 17 | ┌─────────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐ 18 | │ column_name │ column_type │ null │ key │ default │ extra │ 19 | │ varchar │ varchar │ varchar │ varchar │ varchar │ varchar │ 20 | ├─────────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤ 21 | │ customer_id │ BIGINT │ YES │ NULL │ NULL │ NULL │ 22 | │ customer_name │ VARCHAR │ YES │ NULL │ NULL │ NULL │ 23 | │ age │ BIGINT │ YES │ NULL │ NULL │ NULL │ 24 | │ gender │ VARCHAR │ YES │ NULL │ NULL │ NULL │ 25 | │ purchase_amount │ DOUBLE │ YES │ NULL │ NULL │ NULL │ 26 | │ category │ VARCHAR │ YES │ NULL │ NULL │ NULL │ 27 | │ purchase_date │ DATE │ YES │ NULL │ NULL │ NULL │ 28 | │ payment_method │ VARCHAR │ YES │ NULL │ NULL │ NULL │ 29 | └─────────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┘ 30 | ``` 31 | 32 | ``` 33 | ┌─────────┬─────────┬───┬──────────────┬────────────────────┐ 34 | │ min_age │ 
max_age │ … │ max_purchase │ avg_purchase │ 35 | │ int64 │ int64 │ │ double │ double │ 36 | ├─────────┼─────────┼───┼──────────────┼────────────────────┤ 37 | │ 19 │ 61 │ … │ 485.1 │ 242.92866666666666 │ 38 | ├─────────┴─────────┴───┴──────────────┴────────────────────┤ 39 | │ 1 rows 6 columns (4 shown) │ 40 | └───────────────────────────────────────────────────────────┘ 41 | ``` 42 | 43 | ``` 44 | ┌───────────────┬───────┬─────────────────┬─────────────┐ 45 | │ customer_name │ age │ purchase_amount │ category │ 46 | │ varchar │ int64 │ double │ varchar │ 47 | ├───────────────┼───────┼─────────────────┼─────────────┤ 48 | │ Customer 16 │ 20 │ 485.1 │ Groceries │ 49 | │ Customer 18 │ 19 │ 470.35 │ Groceries │ 50 | │ Customer 21 │ 47 │ 461.72 │ Groceries │ 51 | │ Customer 9 │ 40 │ 455.57 │ Groceries │ 52 | │ Customer 19 │ 41 │ 448.47 │ Electronics │ 53 | │ Customer 28 │ 61 │ 416.08 │ Clothing │ 54 | │ Customer 1 │ 56 │ 406.11 │ Books │ 55 | │ Customer 17 │ 39 │ 389.82 │ Groceries │ 56 | │ Customer 4 │ 60 │ 345.27 │ Electronics │ 57 | │ Customer 11 │ 28 │ 334.64 │ Books │ 58 | │ Customer 20 │ 61 │ 302.97 │ Books │ 59 | │ Customer 14 │ 57 │ 277.89 │ Books │ 60 | │ Customer 13 │ 53 │ 264.83 │ Books │ 61 | │ Customer 7 │ 56 │ 252.64 │ Electronics │ 62 | │ Customer 5 │ 25 │ 225.67 │ Groceries │ 63 | │ Customer 26 │ 29 │ 200.45 │ Clothing │ 64 | ├───────────────┴───────┴─────────────────┴─────────────┤ 65 | │ 16 rows 4 columns │ 66 | └───────────────────────────────────────────────────────┘ 67 | ``` 68 | 69 | ``` 70 | ┌─────────────┬─────────────────┬────────────────────┬────────────────────┐ 71 | │ category │ total_purchases │ total_sales │ avg_spent │ 72 | │ varchar │ int64 │ double │ double │ 73 | ├─────────────┼─────────────────┼────────────────────┼────────────────────┤ 74 | │ Groceries │ 9 │ 2716.7100000000005 │ 301.85666666666674 │ 75 | │ Electronics │ 12 │ 2119.94 │ 176.66166666666666 │ 76 | │ Books │ 7 │ 1834.6799999999998 │ 262.09714285714284 │ 77 | │ Clothing │ 2 │ 616.53 │ 308.265 │ 78 | └─────────────┴─────────────────┴────────────────────┴────────────────────┘ 79 | ``` 80 | 81 | -------------------------------------------------------------------------------- /duckdb-miniseries/analyze-csv/shopping_data.csv: -------------------------------------------------------------------------------- 1 | customer_id,customer_name,age,gender,purchase_amount,category,purchase_date,payment_method 2 | 1,Customer 1,56,Male,406.11,Books,2024-01-01,PayPal 3 | 2,Customer 2,46,Male,159.26,Electronics,2024-01-02,Credit Card 4 | 3,Customer 3,32,Female,57.86,Electronics,2024-01-03,Cash 5 | 4,Customer 4,60,Female,345.27,Electronics,2024-01-04,Cash 6 | 5,Customer 5,25,Female,225.67,Groceries,2024-01-05,Debit Card 7 | 6,Customer 6,38,Female,69.8,Electronics,2024-01-06,Credit Card 8 | 7,Customer 7,56,Female,252.64,Electronics,2024-01-07,Cash 9 | 8,Customer 8,36,Male,26.85,Electronics,2024-01-08,PayPal 10 | 9,Customer 9,40,Female,455.57,Groceries,2024-01-09,PayPal 11 | 10,Customer 10,28,Female,136.8,Electronics,2024-01-10,Debit Card 12 | 11,Customer 11,28,Male,334.64,Books,2024-01-11,Cash 13 | 12,Customer 12,41,Female,162.74,Electronics,2024-01-12,Credit Card 14 | 13,Customer 13,53,Male,264.83,Books,2024-01-13,PayPal 15 | 14,Customer 14,57,Female,277.89,Books,2024-01-14,Cash 16 | 15,Customer 15,41,Male,100.58,Books,2024-01-15,Cash 17 | 16,Customer 16,20,Female,485.1,Groceries,2024-01-16,Debit Card 18 | 17,Customer 17,39,Female,389.82,Groceries,2024-01-17,PayPal 19 | 18,Customer 
18,19,Male,470.35,Groceries,2024-01-18,PayPal 20 | 19,Customer 19,41,Male,448.47,Electronics,2024-01-19,Credit Card 21 | 20,Customer 20,61,Male,302.97,Books,2024-01-20,PayPal 22 | 21,Customer 21,47,Male,461.72,Groceries,2024-01-21,Credit Card 23 | 22,Customer 22,55,Male,53.36,Groceries,2024-01-22,PayPal 24 | 23,Customer 23,19,Male,106.03,Electronics,2024-01-23,Debit Card 25 | 24,Customer 24,38,Male,32.16,Groceries,2024-01-24,PayPal 26 | 25,Customer 25,50,Male,169.41,Electronics,2024-01-25,Credit Card 27 | 26,Customer 26,29,Female,200.45,Clothing,2024-01-26,Credit Card 28 | 27,Customer 27,39,Female,142.96,Groceries,2024-01-27,Debit Card 29 | 28,Customer 28,61,Male,416.08,Clothing,2024-01-28,PayPal 30 | 29,Customer 29,42,Female,184.81,Electronics,2024-01-29,PayPal 31 | 30,Customer 30,44,Female,147.66,Books,2024-01-30,Debit Card 32 | -------------------------------------------------------------------------------- /duckdb-miniseries/analyze-pandas-dataframes/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /duckdb-miniseries/analyze-parquet/query_parquet.sql: -------------------------------------------------------------------------------- 1 | SELECT * FROM read_parquet('restaurant_orders.parquet') LIMIT 5; 2 | 3 | DESCRIBE SELECT * FROM read_parquet('restaurant_orders.parquet') LIMIT 5; 4 | 5 | SELECT COUNT(*) AS total_orders FROM read_parquet('restaurant_orders.parquet'); 6 | 7 | SELECT SUM(price * quantity) AS total_revenue FROM read_parquet('restaurant_orders.parquet'); 8 | 9 | SELECT menu_item, SUM(quantity) AS total_quantity 10 | FROM read_parquet('restaurant_orders.parquet') 11 | GROUP BY menu_item 12 | ORDER BY total_quantity DESC 13 | LIMIT 5; 14 | 15 | SELECT payment_method, COUNT(*) AS order_count 16 | FROM read_parquet('restaurant_orders.parquet') 17 | GROUP BY payment_method 18 | ORDER BY order_count DESC; 19 | 20 | SELECT order_time, SUM(price * quantity) 21 | OVER (ORDER BY order_time) AS running_revenue 22 | FROM read_parquet('restaurant_orders.parquet'); 23 | 24 | SELECT order_id, customer_name, price * quantity AS order_value, 25 | RANK() OVER (ORDER BY price * quantity DESC) AS rank 26 | FROM read_parquet('restaurant_orders.parquet') 27 | LIMIT 5; 28 | -------------------------------------------------------------------------------- /duckdb-miniseries/analyze-parquet/query_parquet_op.md: -------------------------------------------------------------------------------- 1 | ``` 2 | ┌──────────┬───────────────┬───┬─────────────────────┬────────────────┐ 3 | │ order_id │ customer_name │ … │ order_time │ payment_method │ 4 | │ int64 │ varchar │ │ varchar │ varchar │ 5 | ├──────────┼───────────────┼───┼─────────────────────┼────────────────┤ 6 | │ 1 │ Grace │ … │ 2024-02-01 18:00:00 │ PayPal │ 7 | │ 2 │ David │ … │ 2024-02-01 18:05:00 │ Credit Card │ 8 | │ 3 │ Eve │ … │ 2024-02-01 18:10:00 │ PayPal │ 9 | │ 4 │ Grace │ … │ 2024-02-01 18:15:00 │ PayPal │ 10 | │ 5 │ Charlie │ … │ 2024-02-01 18:20:00 │ Debit Card │ 11 | ├──────────┴───────────────┴───┴─────────────────────┴────────────────┤ 12 | │ 5 rows 8 columns (4 shown) │ 13 | └─────────────────────────────────────────────────────────────────────┘ 14 | ``` 15 | 16 | ``` 17 | ┌────────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐ 18 | │ column_name │ column_type │ null │ key │ default │ extra │ 19 | │ varchar │ varchar │ varchar │ varchar │ varchar │ varchar │ 20 | 
├────────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤ 21 | │ order_id │ BIGINT │ YES │ NULL │ NULL │ NULL │ 22 | │ customer_name │ VARCHAR │ YES │ NULL │ NULL │ NULL │ 23 | │ table_number │ BIGINT │ YES │ NULL │ NULL │ NULL │ 24 | │ menu_item │ VARCHAR │ YES │ NULL │ NULL │ NULL │ 25 | │ price │ DOUBLE │ YES │ NULL │ NULL │ NULL │ 26 | │ quantity │ BIGINT │ YES │ NULL │ NULL │ NULL │ 27 | │ order_time │ VARCHAR │ YES │ NULL │ NULL │ NULL │ 28 | │ payment_method │ VARCHAR │ YES │ NULL │ NULL │ NULL │ 29 | └────────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┘ 30 | ``` 31 | 32 | ``` 33 | ┌──────────────┐ 34 | │ total_orders │ 35 | │ int64 │ 36 | ├──────────────┤ 37 | │ 30 │ 38 | └──────────────┘ 39 | ``` 40 | ``` 41 | ┌────────────────────┐ 42 | │ total_revenue │ 43 | │ double │ 44 | ├────────────────────┤ 45 | │ 1770.9800000000005 │ 46 | └────────────────────┘ 47 | 48 | ``` 49 | 50 | ``` 51 | ┌───────────┬────────────────┐ 52 | │ menu_item │ total_quantity │ 53 | │ varchar │ int128 │ 54 | ├───────────┼────────────────┤ 55 | │ Pizza │ 16 │ 56 | │ Sushi │ 15 │ 57 | │ Salad │ 14 │ 58 | │ Tacos │ 14 │ 59 | │ Soup │ 7 │ 60 | └───────────┴────────────────┘ 61 | ``` 62 | 63 | ``` 64 | ┌────────────────┬─────────────┐ 65 | │ payment_method │ order_count │ 66 | │ varchar │ int64 │ 67 | ├────────────────┼─────────────┤ 68 | │ PayPal │ 9 │ 69 | │ Credit Card │ 8 │ 70 | │ Cash │ 7 │ 71 | │ Debit Card │ 6 │ 72 | └────────────────┴─────────────┘ 73 | ``` 74 | 75 | 76 | 77 | ``` 78 | ┌─────────────────────┬────────────────────┐ 79 | │ order_time │ running_revenue │ 80 | │ varchar │ double │ 81 | ├─────────────────────┼────────────────────┤ 82 | │ 2024-02-01 18:00:00 │ 55.28 │ 83 | │ 2024-02-01 18:05:00 │ 69.36 │ 84 | │ 2024-02-01 18:10:00 │ 128.28 │ 85 | │ 2024-02-01 18:15:00 │ 195.75 │ 86 | │ 2024-02-01 18:20:00 │ 212.96 │ 87 | │ 2024-02-01 18:25:00 │ 339.83000000000004 │ 88 | │ 2024-02-01 18:30:00 │ 360.88000000000005 │ 89 | │ 2024-02-01 18:35:00 │ 413.80000000000007 │ 90 | │ 2024-02-01 18:40:00 │ 472.6400000000001 │ 91 | │ 2024-02-01 18:45:00 │ 506.6600000000001 │ 92 | │ 2024-02-01 18:50:00 │ 547.7600000000001 │ 93 | │ 2024-02-01 18:55:00 │ 556.1100000000001 │ 94 | │ 2024-02-01 19:00:00 │ 654.9300000000001 │ 95 | │ 2024-02-01 19:05:00 │ 774.1800000000001 │ 96 | │ 2024-02-01 19:10:00 │ 816.0000000000001 │ 97 | │ 2024-02-01 19:15:00 │ 826.5000000000001 │ 98 | │ 2024-02-01 19:20:00 │ 951.6000000000001 │ 99 | │ 2024-02-01 19:25:00 │ 1062.0300000000002 │ 100 | │ 2024-02-01 19:30:00 │ 1099.8400000000001 │ 101 | │ 2024-02-01 19:35:00 │ 1218.9700000000003 │ 102 | │ 2024-02-01 19:40:00 │ 1243.9600000000003 │ 103 | │ 2024-02-01 19:45:00 │ 1286.2200000000003 │ 104 | │ 2024-02-01 19:50:00 │ 1306.6400000000003 │ 105 | │ 2024-02-01 19:55:00 │ 1482.0000000000005 │ 106 | │ 2024-02-01 20:00:00 │ 1515.0500000000004 │ 107 | │ 2024-02-01 20:05:00 │ 1574.7200000000005 │ 108 | │ 2024-02-01 20:10:00 │ 1598.3000000000004 │ 109 | │ 2024-02-01 20:15:00 │ 1674.2600000000004 │ 110 | │ 2024-02-01 20:20:00 │ 1733.1500000000005 │ 111 | │ 2024-02-01 20:25:00 │ 1770.9800000000005 │ 112 | ├─────────────────────┴────────────────────┤ 113 | │ 30 rows 2 columns │ 114 | └──────────────────────────────────────────┘ 115 | ``` 116 | 117 | ``` 118 | ┌──────────┬───────────────┬────────────────────┬───────┐ 119 | │ order_id │ customer_name │ order_value │ rank │ 120 | │ int64 │ varchar │ double │ int64 │ 121 | ├──────────┼───────────────┼────────────────────┼───────┤ 122 | │ 24 │ Hannah │ 175.36 │ 1 │ 123 | 
│ 6 │ Hannah │ 126.87 │ 2 │ 124 | │ 17 │ David │ 125.10000000000001 │ 3 │ 125 | │ 14 │ Charlie │ 119.25 │ 4 │ 126 | │ 20 │ Charlie │ 119.13 │ 5 │ 127 | └──────────┴───────────────┴────────────────────┴───────┘ 128 | ``` 129 | 130 | -------------------------------------------------------------------------------- /duckdb-miniseries/analyze-parquet/restaurant_orders.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/balapriyac/data-science-tutorials/c8efaa5894b2c4c651dba10b3e6612d47bbeac6d/duckdb-miniseries/analyze-parquet/restaurant_orders.parquet -------------------------------------------------------------------------------- /duckdb-miniseries/descriptive-statistics/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /duckdb-miniseries/hypothesis-testing/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /duckdb/README.md: -------------------------------------------------------------------------------- 1 | ## To Follow Along 2 | 3 | Install DuckDB, NumPy, and Pandas: 4 | 5 | ``` 6 | $ pip3 install duckdb numpy pandas 7 | ``` 8 | 9 | Run `generate_csv.py` to generate the sample CSV files: 10 | 11 | ``` 12 | $ python3 generate_csv.py 13 | ``` 14 | -------------------------------------------------------------------------------- /duckdb/generate_csv.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | # Step 1: Generate Sales Data 5 | data = { 6 | 'Product_ID': np.arange(1, 101), 7 | 'Product_Name': ['Product_' + str(i) for i in range(1, 101)], 8 | 'Price': np.round(np.random.uniform(10, 500, 100), 2), 9 | 'Quantity_Sold': np.random.randint(1, 100, 100), 10 | 'Region': np.random.choice(['North', 'South', 'East', 'West'], 100) 11 | } 12 | 13 | # Create and save sales data DataFrame 14 | sales_df = pd.DataFrame(data) 15 | sales_csv_file = 'sales_data.csv' 16 | sales_df.to_csv(sales_csv_file, index=False) 17 | 18 | # Step 2: Generate Product Details Data 19 | product_data = { 20 | 'Product_ID': np.arange(1, 101), # Ensure IDs match the sales_data.csv 21 | 'Manufacturer': ['Manufacturer_' + str(np.random.randint(1, 11)) for _ in range(100)] # 10 different manufacturers 22 | } 23 | 24 | # Create and save product details DataFrame 25 | product_details_df = pd.DataFrame(product_data) 26 | product_csv_file = 'product_details.csv' 27 | product_details_df.to_csv(product_csv_file, index=False) 28 | -------------------------------------------------------------------------------- /duckdb/main.py: -------------------------------------------------------------------------------- 1 | import duckdb 2 | 3 | # View the first 5 rows of the data 4 | duckdb.sql("SELECT * FROM 'sales_data.csv' LIMIT 5").df() 5 | 6 | # Calculate total sales (Price * Quantity_Sold) per region 7 | query = """ 8 | SELECT Region, SUM(Price * Quantity_Sold) as Total_Sales 9 | FROM 'sales_data.csv' 10 | GROUP BY Region 11 | ORDER BY Total_Sales DESC 12 | """ 13 | total_sales = duckdb.sql(query).df() 14 | 15 | print("Total sales per region:") 16 | print(total_sales) 17 | 18 | # Find the top 5 best-selling products by quantity 19 | query = """ 20 | SELECT Product_Name, SUM(Quantity_Sold) as Total_Quantity 21 | FROM 'sales_data.csv' 
22 | GROUP BY Product_Name 23 | ORDER BY Total_Quantity DESC 24 | LIMIT 5 25 | """ 26 | top_products = duckdb.sql(query).df() 27 | 28 | print("Top 5 best-selling products:") 29 | print(top_products) 30 | 31 | # Calculate the average price of products by region 32 | query = """ 33 | SELECT Region, AVG(Price) as Average_Price 34 | FROM 'sales_data.csv' 35 | GROUP BY Region 36 | """ 37 | avg_price_region = duckdb.sql(query).df() 38 | 39 | print("Average price per region:") 40 | print(avg_price_region) 41 | 42 | # Calculate total quantity sold by region 43 | query = """ 44 | SELECT Region, SUM(Quantity_Sold) as Total_Quantity 45 | FROM 'sales_data.csv' 46 | GROUP BY Region 47 | ORDER BY Total_Quantity DESC 48 | """ 49 | total_quantity_region = duckdb.sql(query).df() 50 | 51 | print("Total quantity sold per region:") 52 | print(total_quantity_region) 53 | 54 | # A simple join 55 | query = """ 56 | SELECT s.Product_Name, s.Region, s.Price, p.Manufacturer 57 | FROM 'sales_data.csv' s 58 | JOIN 'product_details.csv' p 59 | ON s.Product_ID = p.Product_ID 60 | """ 61 | joined_data = duckdb.sql(query).df() 62 | 63 | print(joined_data.head()) 64 | -------------------------------------------------------------------------------- /fastapi-docker-for-ml-model-deployment/diabetes-predictor/app/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /fastapi-docker-for-ml-model-deployment/diabetes-predictor/app/main.py: -------------------------------------------------------------------------------- 1 | from fastapi import FastAPI 2 | from pydantic import BaseModel 3 | import pickle 4 | import numpy as np 5 | import os 6 | 7 | # Define input data schema 8 | class PatientData(BaseModel): 9 | age: float 10 | sex: float 11 | bmi: float 12 | bp: float # blood pressure 13 | s1: float # serum measurement 1 14 | s2: float # serum measurement 2 15 | s3: float # serum measurement 3 16 | s4: float # serum measurement 4 17 | s5: float # serum measurement 5 18 | s6: float # serum measurement 6 19 | 20 | class Config: 21 | schema_extra = { 22 | "example": { 23 | "age": 0.05, 24 | "sex": 0.05, 25 | "bmi": 0.06, 26 | "bp": 0.02, 27 | "s1": -0.04, 28 | "s2": -0.04, 29 | "s3": -0.02, 30 | "s4": -0.01, 31 | "s5": 0.01, 32 | "s6": 0.02 33 | } 34 | } 35 | 36 | # Initialize FastAPI app 37 | app = FastAPI( 38 | title="Diabetes Progression Predictor", 39 | description="Predicts diabetes progression from physiological features", 40 | version="1.0.0" 41 | ) 42 | 43 | # Load the trained model 44 | model_path = os.path.join("models", "diabetes_model.pkl") 45 | with open(model_path, 'rb') as f: 46 | model = pickle.load(f) 47 | 48 | @app.post("/predict") 49 | def predict_progression(patient: PatientData): 50 | """ 51 | Predict diabetes progression score 52 | """ 53 | # Convert input to numpy array 54 | features = np.array([[ 55 | patient.age, patient.sex, patient.bmi, patient.bp, 56 | patient.s1, patient.s2, patient.s3, patient.s4, 57 | patient.s5, patient.s6 58 | ]]) 59 | 60 | # Make prediction 61 | prediction = model.predict(features)[0] 62 | 63 | # Return result with additional context 64 | return { 65 | "predicted_progression_score": round(prediction, 2), 66 | "interpretation": get_interpretation(prediction) 67 | } 68 | 69 | def get_interpretation(score): 70 | """Provide human-readable interpretation of the score""" 71 | if score < 100: 72 | return "Below average progression" 73 | elif score < 150: 74 | 
return "Average progression" 75 | else: 76 | return "Above average progression" 77 | 78 | @app.get("/") 79 | def health_check(): 80 | return {"status": "healthy", "model": "diabetes_progression_v1"} 81 | 82 | -------------------------------------------------------------------------------- /fastapi-docker-for-ml-model-deployment/diabetes-predictor/train_model.py: -------------------------------------------------------------------------------- 1 | from sklearn.datasets import load_diabetes 2 | from sklearn.model_selection import train_test_split 3 | from sklearn.ensemble import RandomForestRegressor 4 | from sklearn.metrics import mean_squared_error, r2_score 5 | import pickle 6 | import os 7 | 8 | # Load the diabetes dataset 9 | diabetes = load_diabetes() 10 | X, y = diabetes.data, diabetes.target 11 | 12 | print(f"Dataset shape: {X.shape}") 13 | print(f"Features: {diabetes.feature_names}") 14 | print(f"Target range: {y.min():.1f} to {y.max():.1f}") 15 | 16 | # Split the data 17 | X_train, X_test, y_train, y_test = train_test_split( 18 | X, y, test_size=0.2, random_state=42 19 | ) 20 | 21 | print(f"Training samples: {X_train.shape[0]}") 22 | print(f"Test samples: {X_test.shape[0]}") 23 | 24 | # Train Random Forest model 25 | model = RandomForestRegressor( 26 | n_estimators=100, 27 | random_state=42, 28 | max_depth=10 29 | ) 30 | 31 | model.fit(X_train, y_train) 32 | 33 | # Make predictions and evaluate 34 | y_pred = model.predict(X_test) 35 | 36 | mse = mean_squared_error(y_test, y_pred) 37 | r2 = r2_score(y_test, y_pred) 38 | 39 | print(f"Mean Squared Error: {mse:.2f}") 40 | print(f"R² Score: {r2:.3f}") 41 | 42 | # Create models directory and save model 43 | os.makedirs('models', exist_ok=True) 44 | 45 | with open('models/diabetes_model.pkl', 'wb') as f: 46 | pickle.dump(model, f) 47 | 48 | print("Model trained and saved successful") 49 | -------------------------------------------------------------------------------- /fastapi/README.md: -------------------------------------------------------------------------------- 1 | ## Getting Started 2 | 3 | Create and activate a dedicated venv for the project: 4 | 5 | ```bash 6 | $ python3 -m venv v1 7 | $ source v1/bin/activate 8 | ``` 9 | Install FastAPI and Uvicorn with `pip`: 10 | 11 | ```bash 12 | $ pip3 install fastapi uvicorn 13 | ``` 14 | Also install scikit-learn: 15 | 16 | ```bash 17 | $ pip3 install scikit-learn 18 | ``` 19 | Check [main.py](https://github.com/balapriyac/data-science-tutorials/blob/main/fastapi/main.py) for the complete code. 20 | 21 | ## Run the App 22 | 23 | Run the following command: 24 | 25 | ```bash 26 | $ uvicorn main:app --reload 27 | ``` 28 | 29 | ## Query the `/predict/` Endpoint 30 | 31 | Example POST request (using cURL): 32 | 33 | ```bash 34 | curl -X 'POST' \ 35 | 'http://localhost:8000/predict/' \ 36 | -H 'Content-Type: application/json' \ 37 | -d '{ 38 | "sepal_length": 5.1, 39 | "sepal_width": 3.5, 40 | "petal_length": 1.4, 41 | "petal_width": 0.2 42 | }' 43 | 44 | ``` 45 | 46 | 47 | -------------------------------------------------------------------------------- /fastapi/main.py: -------------------------------------------------------------------------------- 1 | # Create a FastAPI app 2 | # Root endpoint returns the app description 3 | 4 | from fastapi import FastAPI 5 | 6 | app = FastAPI() 7 | 8 | # Define a function to return a description of the app 9 | def get_app_description(): 10 | return ( 11 | "Welcome to the Iris Species Prediction API!" 
12 | "This API allows you to predict the species of an iris flower based on its sepal and petal measurements." 13 | "Use the '/predict/' endpoint with a POST request to make predictions." 14 | "Example usage: POST to '/predict/' with JSON data containing sepal_length, sepal_width, petal_length, and petal_width." 15 | ) 16 | 17 | # Define the root endpoint to return the app description 18 | @app.get("/") 19 | async def root(): 20 | return {"message": get_app_description()} 21 | 22 | 23 | # Build a logistic regression classifier 24 | from sklearn.datasets import load_iris 25 | from sklearn.linear_model import LogisticRegression 26 | 27 | # Load the Iris dataset 28 | iris = load_iris() 29 | X, y = iris.data, iris.target 30 | 31 | # Train a logistic regression model 32 | model = LogisticRegression() 33 | model.fit(X, y) 34 | 35 | # Define a function to predict the species 36 | def predict_species(sepal_length, sepal_width, petal_length, petal_width): 37 | features = [[sepal_length, sepal_width, petal_length, petal_width]] 38 | prediction = model.predict(features) 39 | return iris.target_names[prediction[0]] 40 | 41 | # Define the Pydantic model for your input data 42 | from pydantic import BaseModel 43 | 44 | class IrisData(BaseModel): 45 | sepal_length: float 46 | sepal_width: float 47 | petal_length: float 48 | petal_width: float 49 | 50 | # Create API endpoint 51 | @app.post("/predict/") 52 | async def predict_species_api(iris_data: IrisData): 53 | species = predict_species(iris_data.sepal_length, iris_data.sepal_width, iris_data.petal_length, iris_data.petal_width) 54 | return {"species": species} 55 | 56 | -------------------------------------------------------------------------------- /machine-learning/HyperparameterTuning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [] 7 | }, 8 | "kernelspec": { 9 | "name": "python3", 10 | "display_name": "Python 3" 11 | }, 12 | "language_info": { 13 | "name": "python" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "source": [ 20 | "## 1. Start Simple – Train a Baseline Model Without Any Tuning" 21 | ], 22 | "metadata": { 23 | "id": "i2Ju74quwU_E" 24 | } 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": { 30 | "id": "cBt_HjnMv7Qf" 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "from sklearn.tree import DecisionTreeClassifier\n", 35 | "from sklearn.metrics import accuracy_score\n", 36 | "from sklearn.model_selection import train_test_split\n", 37 | "from sklearn.datasets import load_iris\n", 38 | "\n", 39 | "# Load data\n", 40 | "data = load_iris()\n", 41 | "X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2, random_state=25)\n", 42 | "\n", 43 | "# Initialize model with default parameters\n", 44 | "model = DecisionTreeClassifier()\n", 45 | "\n", 46 | "# Train model\n", 47 | "model.fit(X_train, y_train)\n", 48 | "\n", 49 | "# Predict and evaluate\n", 50 | "y_pred = model.predict(X_test)\n", 51 | "baseline_accuracy = accuracy_score(y_test, y_pred)\n", 52 | "print(f'Baseline Accuracy: {baseline_accuracy:.2f}')" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "source": [ 58 | "## 2. 
Use Hyperparameter Search with Cross-Validation" 59 | ], 60 | "metadata": { 61 | "id": "aEjaShhwwaC0" 62 | } 63 | }, 64 | { 65 | "cell_type": "code", 66 | "source": [ 67 | "from sklearn.model_selection import GridSearchCV, RandomizedSearchCV\n", 68 | "from sklearn.tree import DecisionTreeClassifier\n", 69 | "from sklearn.datasets import load_iris\n", 70 | "\n", 71 | "# Load data\n", 72 | "data = load_iris()\n", 73 | "X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2, random_state=25)\n", 74 | "\n", 75 | "# Initialize model\n", 76 | "model = DecisionTreeClassifier()\n", 77 | "\n", 78 | "# Define hyperparameter grid for Grid Search\n", 79 | "param_grid = {\n", 80 | "\t'criterion': ['gini', 'entropy'],\n", 81 | "\t'max_depth': [None, 10, 20, 30],\n", 82 | "\t'min_samples_split': [2, 5, 10]\n", 83 | "}\n", 84 | "\n", 85 | "from sklearn.model_selection import cross_val_score\n", 86 | "\n", 87 | "# Grid Search\n", 88 | "grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')\n", 89 | "grid_search.fit(X_train, y_train)\n", 90 | "best_params_grid = grid_search.best_params_\n", 91 | "best_score_grid = grid_search.best_score_\n", 92 | "\n", 93 | "print(f'Best Parameters (Grid Search): {best_params_grid}')\n", 94 | "print(f'Best Cross-Validation Score (Grid Search): {best_score_grid:.2f}')" 95 | ], 96 | "metadata": { 97 | "id": "-KWQc3JpwfZb" 98 | }, 99 | "execution_count": null, 100 | "outputs": [] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "source": [ 105 | "## 3. Use Randomized Search for Initial Exploration" 106 | ], 107 | "metadata": { 108 | "id": "qMMeeT_JwxHN" 109 | } 110 | }, 111 | { 112 | "cell_type": "code", 113 | "source": [ 114 | "from sklearn.model_selection import RandomizedSearchCV\n", 115 | "from sklearn.tree import DecisionTreeClassifier\n", 116 | "from sklearn.datasets import load_iris\n", 117 | "import numpy as np\n", 118 | "\n", 119 | "# Load data\n", 120 | "data = load_iris()\n", 121 | "X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2, random_state=25)\n", 122 | "\n", 123 | "# Initialize model\n", 124 | "model = DecisionTreeClassifier()\n", 125 | "\n", 126 | "# Define hyperparameter distribution for Random Search\n", 127 | "param_dist = {\n", 128 | "\t'criterion': ['gini', 'entropy'],\n", 129 | "\t'max_depth': [None] + list(range(10, 31)),\n", 130 | "\t'min_samples_split': range(2, 11),\n", 131 | "\t'min_samples_leaf': range(1, 11)\n", 132 | "}\n", 133 | "\n", 134 | "# Random Search\n", 135 | "random_search = RandomizedSearchCV(model, param_dist, n_iter=100, cv=5, scoring='accuracy')\n", 136 | "random_search.fit(X_train, y_train)\n", 137 | "best_params_random = random_search.best_params_\n", 138 | "best_score_random = random_search.best_score_\n", 139 | "\n", 140 | "print(f'Best Parameters (Random Search): {best_params_random}')\n", 141 | "print(f'Best Cross-Validation Score (Random Search): {best_score_random:.2f}')\n", 142 | "\n", 143 | "best_model = DecisionTreeClassifier(**best_params_random)\n", 144 | "best_model.fit(X_train, y_train)\n", 145 | "y_pred = best_model.predict(X_test)\n", 146 | "final_accuracy = accuracy_score(y_test, y_pred)\n", 147 | "\n", 148 | "print(f'Final Model Accuracy: {final_accuracy:.2f}')" 149 | ], 150 | "metadata": { 151 | "id": "TqNS89p_wyqd" 152 | }, 153 | "execution_count": null, 154 | "outputs": [] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "source": [ 159 | "## 4. 
Monitor Overfitting with Validation Curves" 160 | ], 161 | "metadata": { 162 | "id": "Yu1VwYCnyCQJ" 163 | } 164 | }, 165 | { 166 | "cell_type": "code", 167 | "source": [ 168 | "from sklearn.ensemble import RandomForestClassifier\n", 169 | "from sklearn.model_selection import validation_curve\n", 170 | "import matplotlib.pyplot as plt\n", 171 | "import numpy as np\n", 172 | "\n", 173 | "# Define hyperparameter range\n", 174 | "param_range = [10, 100, 200, 400, 800, 1000]\n", 175 | "\n", 176 | "# Calculate validation curve\n", 177 | "train_scores, test_scores = validation_curve(\n", 178 | "\tRandomForestClassifier(), X_train, y_train,\n", 179 | "\tparam_name=\"n_estimators\", param_range=param_range,\n", 180 | "\tcv=5, scoring=\"accuracy\")\n", 181 | "\n", 182 | "# Calculate mean and standard deviation\n", 183 | "train_mean = np.mean(train_scores, axis=1)\n", 184 | "train_std = np.std(train_scores, axis=1)\n", 185 | "test_mean = np.mean(test_scores, axis=1)\n", 186 | "test_std = np.std(test_scores, axis=1)\n", 187 | "\n", 188 | "# Plot validation curve\n", 189 | "plt.plot(param_range, train_mean, label=\"Training score\", color=\"r\")\n", 190 | "plt.fill_between(param_range, train_mean - train_std, train_mean + train_std, color=\"r\", alpha=0.3)\n", 191 | "plt.plot(param_range, test_mean, label=\"Cross-validation score\", color=\"g\")\n", 192 | "plt.fill_between(param_range, test_mean - test_std, test_mean + test_std, color=\"g\", alpha=0.3)\n", 193 | "plt.title(\"Validation Curve with Random Forest\")\n", 194 | "plt.xlabel(\"Number of Estimators\")\n", 195 | "plt.ylabel(\"Accuracy\")\n", 196 | "plt.legend(loc=\"best\")\n", 197 | "plt.show()" 198 | ], 199 | "metadata": { 200 | "id": "hO_wPZKFyFaz" 201 | }, 202 | "execution_count": null, 203 | "outputs": [] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "source": [ 208 | "## 5. 
Use Bayesian Optimization for Efficient Search" 209 | ], 210 | "metadata": { 211 | "id": "zaIfeNuryKXZ" 212 | } 213 | }, 214 | { 215 | "cell_type": "code", 216 | "source": [ 217 | "!pip install scikit-optimize" 218 | ], 219 | "metadata": { 220 | "id": "TBi7DOstyNGo" 221 | }, 222 | "execution_count": null, 223 | "outputs": [] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "source": [ 228 | "from skopt import BayesSearchCV\n", 229 | "from sklearn.tree import DecisionTreeClassifier\n", 230 | "from sklearn.datasets import load_iris\n", 231 | "from sklearn.model_selection import train_test_split\n", 232 | "from sklearn.metrics import accuracy_score\n", 233 | "\n", 234 | "# Load data\n", 235 | "data = load_iris()\n", 236 | "X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2, random_state=25)\n", 237 | "\n", 238 | "# Initialize model\n", 239 | "model = DecisionTreeClassifier()\n", 240 | "\n", 241 | "# Define hyperparameter space for Bayesian Optimization\n", 242 | "param_space = {\n", 243 | "\t'criterion': ['gini', 'entropy'],\n", 244 | "\t'max_depth': [None] + list(range(10, 31)),\n", 245 | "\t'min_samples_split': (2, 10),\n", 246 | "\t'min_samples_leaf': (1, 10)\n", 247 | "}" 248 | ], 249 | "metadata": { 250 | "id": "NA2_2DUeyPiQ" 251 | }, 252 | "execution_count": null, 253 | "outputs": [] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "source": [ 258 | "# Bayesian Optimization\n", 259 | "opt = BayesSearchCV(model, param_space, n_iter=32, cv=5, scoring='accuracy')\n", 260 | "opt.fit(X_train, y_train)\n", 261 | "best_params_bayes = opt.best_params_\n", 262 | "best_score_bayes = opt.best_score_\n", 263 | "\n", 264 | "print(f'Best Parameters (Bayesian Optimization): {best_params_bayes}')\n", 265 | "print(f'Best Cross-Validation Score (Bayesian Optimization): {best_score_bayes:.2f}')" 266 | ], 267 | "metadata": { 268 | "id": "ASnb3BUnySWj" 269 | }, 270 | "execution_count": null, 271 | "outputs": [] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "source": [ 276 | "best_model = DecisionTreeClassifier(**best_params_bayes)\n", 277 | "best_model.fit(X_train, y_train)\n", 278 | "y_pred = best_model.predict(X_test)\n", 279 | "final_accuracy = accuracy_score(y_test, y_pred)\n", 280 | "\n", 281 | "print(f'Final Model Accuracy: {final_accuracy:.2f}')" 282 | ], 283 | "metadata": { 284 | "id": "i4yfsPExyWNe" 285 | }, 286 | "execution_count": null, 287 | "outputs": [] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "source": [], 292 | "metadata": { 293 | "id": "dmwUTtZlzcP0" 294 | }, 295 | "execution_count": null, 296 | "outputs": [] 297 | } 298 | ] 299 | } -------------------------------------------------------------------------------- /machine-learning/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /model_deployment/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use Python 3.11 as the base image 2 | FROM python:3.11-slim 3 | 4 | # Set the working directory inside the container 5 | WORKDIR /code 6 | 7 | # Copy the requirements file into the container 8 | COPY ./requirements.txt /code/requirements.txt 9 | 10 | # Install the Python dependencies 11 | RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt 12 | 13 | # Copy the app folder containing the FastAPI app into the container 14 | COPY ./app /code/app 15 | 16 | # Copy the model directory (with the saved model file) 
into the container 17 | COPY ./model /code/model 18 | 19 | # Expose port 80 for FastAPI 20 | EXPOSE 80 21 | 22 | # Command to run the FastAPI app with Uvicorn 23 | CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "80"] 24 | -------------------------------------------------------------------------------- /model_deployment/README.md: -------------------------------------------------------------------------------- 1 | ## Deploying ML Models 2 | 3 | ``` 4 | project-directory/ 5 | │ 6 | ├── app/ 7 | │ ├── __init__.py # Empty file 8 | │ └── main.py # FastAPI logic 9 | │ 10 | ├── model/ 11 | │ └── linear_regression_model.pkl # Saved model (after running model_training.py) 12 | │ 13 | ├── model_training.py # Model training code 14 | ├── requirements.txt # Python dependencies 15 | └── Dockerfile # Docker configuration 16 | ``` 17 | In your project environment, create and activate a virtual environment: 18 | 19 | ``` 20 | $ python3 -m venv v1 21 | $ source v1/bin/activate 22 | ``` 23 | Install these required packages using pip: 24 | 25 | ``` 26 | $ pip3 install pandas scikit-learn fastapi uvicorn 27 | ``` 28 | 29 | Run the script to train the model and save it: 30 | 31 | ``` 32 | $ python3 model_training.py 33 | ``` 34 | 35 | You should be able to find the .pkl file (`linear_regression_model.pkl`) in the `model/` directory. 36 | 37 | Use FastAPI to build an API to serve model predictions and containerize it using Docker. 38 | 39 | ### Building the Docker Image 40 | 41 | Build the Docker image by running the following `docker build` command: 42 | 43 | ``` 44 | $ docker build -t house-price-prediction-api . 45 | ``` 46 | 47 | Next run the Docker container: 48 | 49 | ``` 50 | $ docker run -d -p 80:80 house-price-prediction-api 51 | ``` 52 | 53 | ### Tagging and Pushing the Image to Docker Hub 54 | 55 | First, login to Docker Hub: 56 | 57 | ``` 58 | $ docker login 59 | ``` 60 | 61 | Tag the Docker image: 62 | 63 | ``` 64 | $ docker tag house-price-prediction-api your_username/house-price-prediction-api:v1 65 | ``` 66 | 67 | Push the image to Docker Hub: 68 | 69 | ``` 70 | $ docker push your_username/house-price-prediction-api:v1 71 | ``` 72 | 73 | Other developers can now pull and run the image like so: 74 | 75 | ``` 76 | $ docker pull your_username/house-price-prediction-api:v1 77 | $ docker run -d -p 80:80 your_username/house-price-prediction-api:v1 78 | ``` 79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /model_deployment/app/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /model_deployment/app/main.py: -------------------------------------------------------------------------------- 1 | # app/main.py 2 | from fastapi import FastAPI 3 | from pydantic import BaseModel 4 | import pickle 5 | import os 6 | 7 | # Define the input data schema using Pydantic 8 | class InputData(BaseModel): 9 | MedInc: float 10 | AveRooms: float 11 | AveOccup: float 12 | 13 | # Initialize FastAPI app 14 | app = FastAPI(title="House Price Prediction API") 15 | 16 | # Load the model during startup 17 | model_path = os.path.join("model", "linear_regression_model.pkl") 18 | with open(model_path, 'rb') as f: 19 | model = pickle.load(f) 20 | 21 | @app.post("/predict") 22 | def predict(data: InputData): 23 | # Prepare the data for prediction 24 | input_features = [[data.MedInc, data.AveRooms, data.AveOccup]] 25 | 26 | # Make 
prediction using the loaded model 27 | prediction = model.predict(input_features) 28 | 29 | # Return the prediction result 30 | return {"predicted_house_price": prediction[0]} 31 | -------------------------------------------------------------------------------- /model_deployment/model_training.py: -------------------------------------------------------------------------------- 1 | # model_training.py 2 | import pandas as pd 3 | from sklearn.datasets import fetch_california_housing 4 | from sklearn.model_selection import train_test_split 5 | from sklearn.linear_model import LinearRegression 6 | import pickle 7 | import os 8 | 9 | # Load the dataset 10 | data = fetch_california_housing(as_frame=True) 11 | df = data['data'] 12 | target = data['target'] 13 | 14 | # Select a few features 15 | selected_features = ['MedInc', 'AveRooms', 'AveOccup'] 16 | X = df[selected_features] 17 | y = target 18 | 19 | # Train-test split 20 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 21 | 22 | # Train the Linear Regression model 23 | model = LinearRegression() 24 | model.fit(X_train, y_train) 25 | 26 | # Create a 'model' folder to save the trained model 27 | os.makedirs('model', exist_ok=True) 28 | 29 | # Save the trained model using pickle 30 | with open('model/linear_regression_model.pkl', 'wb') as f: 31 | pickle.dump(model, f) 32 | 33 | print("Model trained and saved successfully.") 34 | -------------------------------------------------------------------------------- /model_deployment/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi 2 | uvicorn 3 | scikit-learn 4 | pandas 5 | -------------------------------------------------------------------------------- /natural-language-processing/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /natural-language-processing/nlp_with_python.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [] 7 | }, 8 | "kernelspec": { 9 | "name": "python3", 10 | "display_name": "Python 3" 11 | }, 12 | "language_info": { 13 | "name": "python" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "source": [ 20 | "## Installing NLTK" 21 | ], 22 | "metadata": { 23 | "id": "2ik7yq56NsrV" 24 | } 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 17, 29 | "metadata": { 30 | "colab": { 31 | "base_uri": "https://localhost:8080/" 32 | }, 33 | "id": "csTUtV_hIudG", 34 | "outputId": "eabe6b0a-f4e8-4841-f869-67facc108602" 35 | }, 36 | "outputs": [ 37 | { 38 | "output_type": "stream", 39 | "name": "stdout", 40 | "text": [ 41 | "Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (3.9.1)\n", 42 | "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk) (8.1.7)\n", 43 | "Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk) (1.4.2)\n", 44 | "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.10/dist-packages (from nltk) (2024.9.11)\n", 45 | "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from nltk) (4.66.6)\n" 46 | ] 47 | } 48 | ], 49 | "source": [ 50 | "! 
pip install nltk" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "source": [ 56 | "import nltk\n", 57 | "\n", 58 | "# Download essential datasets and models\n", 59 | "nltk.download('punkt') # Tokenizers for sentence and word tokenization\n", 60 | "nltk.download('stopwords') # List of common stop words\n", 61 | "nltk.download('wordnet') # WordNet lexical database for lemmatization\n", 62 | "nltk.download('averaged_perceptron_tagger_eng') # Part-of-speech tagger\n", 63 | "nltk.download('maxent_ne_chunker_tab') # Named Entity Recognition model\n", 64 | "nltk.download('words') # Word corpus for NER\n", 65 | "nltk.download('punkt_tab')\n" 66 | ], 67 | "metadata": { 68 | "colab": { 69 | "base_uri": "https://localhost:8080/" 70 | }, 71 | "id": "khrbb-C8J5Ip", 72 | "outputId": "cd75c3cf-bf1a-4ac0-bcc3-7ce7eb6bf4fe" 73 | }, 74 | "execution_count": 18, 75 | "outputs": [ 76 | { 77 | "output_type": "stream", 78 | "name": "stderr", 79 | "text": [ 80 | "[nltk_data] Downloading package punkt to /root/nltk_data...\n", 81 | "[nltk_data] Package punkt is already up-to-date!\n", 82 | "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", 83 | "[nltk_data] Package stopwords is already up-to-date!\n", 84 | "[nltk_data] Downloading package wordnet to /root/nltk_data...\n", 85 | "[nltk_data] Package wordnet is already up-to-date!\n", 86 | "[nltk_data] Downloading package averaged_perceptron_tagger_eng to\n", 87 | "[nltk_data] /root/nltk_data...\n", 88 | "[nltk_data] Package averaged_perceptron_tagger_eng is already up-to-\n", 89 | "[nltk_data] date!\n", 90 | "[nltk_data] Downloading package maxent_ne_chunker_tab to\n", 91 | "[nltk_data] /root/nltk_data...\n", 92 | "[nltk_data] Unzipping chunkers/maxent_ne_chunker_tab.zip.\n", 93 | "[nltk_data] Downloading package words to /root/nltk_data...\n", 94 | "[nltk_data] Package words is already up-to-date!\n", 95 | "[nltk_data] Downloading package punkt_tab to /root/nltk_data...\n", 96 | "[nltk_data] Package punkt_tab is already up-to-date!\n" 97 | ] 98 | }, 99 | { 100 | "output_type": "execute_result", 101 | "data": { 102 | "text/plain": [ 103 | "True" 104 | ] 105 | }, 106 | "metadata": {}, 107 | "execution_count": 18 108 | } 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "source": [ 114 | "## Text Preprocessing" 115 | ], 116 | "metadata": { 117 | "id": "lIOaDokONw1C" 118 | } 119 | }, 120 | { 121 | "cell_type": "code", 122 | "source": [ 123 | "import string\n", 124 | "from nltk.tokenize import word_tokenize, sent_tokenize\n", 125 | "\n", 126 | "text = \"Natural Language Processing (NLP) is cool! 
Let's explore it.\"\n", 127 | "\n", 128 | "# Remove punctuation using string.punctuation\n", 129 | "cleaned_text = ''.join(char for char in text if char not in string.punctuation)\n", 130 | "print(\"Text without punctuation:\", cleaned_text)\n", 131 | "\n", 132 | "# Sentence Tokenization\n", 133 | "sentences = sent_tokenize(cleaned_text)\n", 134 | "print(\"Sentences:\", sentences)\n", 135 | "\n", 136 | "# Word Tokenization\n", 137 | "words = word_tokenize(cleaned_text)\n", 138 | "print(\"Words:\", words)\n" 139 | ], 140 | "metadata": { 141 | "colab": { 142 | "base_uri": "https://localhost:8080/" 143 | }, 144 | "id": "IuVHrIa7J_9U", 145 | "outputId": "7fecd594-1115-4b68-f2f8-ea48bfb800ae" 146 | }, 147 | "execution_count": 19, 148 | "outputs": [ 149 | { 150 | "output_type": "stream", 151 | "name": "stdout", 152 | "text": [ 153 | "Text without punctuation: Natural Language Processing NLP is cool Lets explore it\n", 154 | "Sentences: ['Natural Language Processing NLP is cool Lets explore it']\n", 155 | "Words: ['Natural', 'Language', 'Processing', 'NLP', 'is', 'cool', 'Lets', 'explore', 'it']\n" 156 | ] 157 | } 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "source": [ 163 | "from nltk.corpus import stopwords\n", 164 | "\n", 165 | "# Load NLTK's stopwords list\n", 166 | "stop_words = set(stopwords.words('english'))\n", 167 | "\n", 168 | "# Filter out stop words\n", 169 | "filtered_words = [word for word in words if word.lower() not in stop_words]\n", 170 | "print(\"Filtered Words:\", filtered_words)\n" 171 | ], 172 | "metadata": { 173 | "colab": { 174 | "base_uri": "https://localhost:8080/" 175 | }, 176 | "id": "VX0bZ2y2KDaL", 177 | "outputId": "d1576147-171d-44ab-862a-71f3383f761c" 178 | }, 179 | "execution_count": 20, 180 | "outputs": [ 181 | { 182 | "output_type": "stream", 183 | "name": "stdout", 184 | "text": [ 185 | "Filtered Words: ['Natural', 'Language', 'Processing', 'NLP', 'cool', 'Lets', 'explore']\n" 186 | ] 187 | } 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "source": [ 193 | "from nltk.stem import PorterStemmer\n", 194 | "\n", 195 | "# Initialize the Porter Stemmer\n", 196 | "stemmer = PorterStemmer()\n", 197 | "\n", 198 | "# Apply stemming to filtered words\n", 199 | "stemmed_words = [stemmer.stem(word) for word in filtered_words]\n", 200 | "print(\"Stemmed Words:\", stemmed_words)\n" 201 | ], 202 | "metadata": { 203 | "colab": { 204 | "base_uri": "https://localhost:8080/" 205 | }, 206 | "id": "4vEw6L9TKFjk", 207 | "outputId": "38084b66-0d8e-4463-a35a-d8f454491250" 208 | }, 209 | "execution_count": 21, 210 | "outputs": [ 211 | { 212 | "output_type": "stream", 213 | "name": "stdout", 214 | "text": [ 215 | "Stemmed Words: ['natur', 'languag', 'process', 'nlp', 'cool', 'let', 'explor']\n" 216 | ] 217 | } 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "source": [ 223 | "## Lemmatization" 224 | ], 225 | "metadata": { 226 | "id": "yIKdaABAN2fD" 227 | } 228 | }, 229 | { 230 | "cell_type": "code", 231 | "source": [ 232 | "from nltk.stem import WordNetLemmatizer\n", 233 | "\n", 234 | "# Initialize the Lemmatizer\n", 235 | "lemmatizer = WordNetLemmatizer()\n", 236 | "\n", 237 | "# Lemmatize each word\n", 238 | "lemmatized_words = [lemmatizer.lemmatize(word, pos='v') for word in filtered_words]\n", 239 | "print(\"Lemmatized Words:\", lemmatized_words)\n" 240 | ], 241 | "metadata": { 242 | "colab": { 243 | "base_uri": "https://localhost:8080/" 244 | }, 245 | "id": "kZMwY0G0KIkT", 246 | "outputId": "ffbad6dd-64f7-40a6-e255-e5234ea43797" 247 | }, 248 | 
"execution_count": 22, 249 | "outputs": [ 250 | { 251 | "output_type": "stream", 252 | "name": "stdout", 253 | "text": [ 254 | "Lemmatized Words: ['Natural', 'Language', 'Processing', 'NLP', 'cool', 'Lets', 'explore']\n" 255 | ] 256 | } 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "source": [ 262 | "## Part-of-Speech (POS) Tagging" 263 | ], 264 | "metadata": { 265 | "id": "zNeQphPZN56x" 266 | } 267 | }, 268 | { 269 | "cell_type": "code", 270 | "source": [ 271 | "from nltk import pos_tag\n", 272 | "\n", 273 | "# Tokenize the text into words\n", 274 | "text = \"She enjoys playing soccer on weekends.\"\n", 275 | "\n", 276 | "# Tokenization (words)\n", 277 | "words = word_tokenize(text)\n", 278 | "\n", 279 | "# POS tagging\n", 280 | "tagged_words = pos_tag(words)\n", 281 | "print(\"Tagged Words:\", tagged_words)\n" 282 | ], 283 | "metadata": { 284 | "colab": { 285 | "base_uri": "https://localhost:8080/" 286 | }, 287 | "id": "mdCZYFc9KK1j", 288 | "outputId": "3b345803-2707-4298-ba49-9560f50e87c4" 289 | }, 290 | "execution_count": 23, 291 | "outputs": [ 292 | { 293 | "output_type": "stream", 294 | "name": "stdout", 295 | "text": [ 296 | "Tagged Words: [('She', 'PRP'), ('enjoys', 'VBZ'), ('playing', 'VBG'), ('soccer', 'NN'), ('on', 'IN'), ('weekends', 'NNS'), ('.', '.')]\n" 297 | ] 298 | } 299 | ] 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "source": [ 304 | "## Named Entity Recognition (NER)" 305 | ], 306 | "metadata": { 307 | "id": "W52aYopXOW7V" 308 | } 309 | }, 310 | { 311 | "cell_type": "code", 312 | "source": [ 313 | "from nltk import ne_chunk, pos_tag, word_tokenize\n", 314 | "\n", 315 | "# Sample text\n", 316 | "text = \"We shall visit the Eiffel Tower on our vacation to Paris.\"\n", 317 | "\n", 318 | "# Tokenize the text into words\n", 319 | "words = word_tokenize(text)\n", 320 | "\n", 321 | "# Part-of-speech tagging\n", 322 | "tagged_words = pos_tag(words)\n", 323 | "\n", 324 | "# Named Entity Recognition\n", 325 | "named_entities = ne_chunk(tagged_words)\n", 326 | "print(\"Named Entities:\", named_entities)\n" 327 | ], 328 | "metadata": { 329 | "colab": { 330 | "base_uri": "https://localhost:8080/" 331 | }, 332 | "id": "fmPqEMqJKQqb", 333 | "outputId": "c3e63838-2539-4eb9-b071-6055acc40153" 334 | }, 335 | "execution_count": 24, 336 | "outputs": [ 337 | { 338 | "output_type": "stream", 339 | "name": "stdout", 340 | "text": [ 341 | "Named Entities: (S\n", 342 | " We/PRP\n", 343 | " shall/MD\n", 344 | " visit/VB\n", 345 | " the/DT\n", 346 | " (ORGANIZATION Eiffel/NNP Tower/NNP)\n", 347 | " on/IN\n", 348 | " our/PRP$\n", 349 | " vacation/NN\n", 350 | " to/TO\n", 351 | " (GPE Paris/NNP)\n", 352 | " ./.)\n" 353 | ] 354 | } 355 | ] 356 | } 357 | ] 358 | } -------------------------------------------------------------------------------- /pandas/5_steps_data_cleaning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [] 7 | }, 8 | "kernelspec": { 9 | "name": "python3", 10 | "display_name": "Python 3" 11 | }, 12 | "language_info": { 13 | "name": "python" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "source": [ 20 | "## Import pandas" 21 | ], 22 | "metadata": { 23 | "id": "M62qZc7zwlwz" 24 | } 25 | }, 26 | { 27 | "cell_type": "code", 28 | "source": [ 29 | "import pandas as pd" 30 | ], 31 | "metadata": { 32 | "id": "GlIV3iz-lFCi" 33 | }, 34 | "execution_count": 1, 35 | "outputs": [] 36 | }, 37 | { 38 | 
"cell_type": "markdown", 39 | "source": [ 40 | "## Step 1 – Run Basic Data Quality Checks" 41 | ], 42 | "metadata": { 43 | "id": "FXkRvvpkvj-t" 44 | } 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 2, 49 | "metadata": { 50 | "id": "Ac1OB3vRiY8z" 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "def check_data_quality(df):\n", 55 | " # Store initial data quality metrics\n", 56 | " quality_report = {\n", 57 | " 'missing_values': df.isnull().sum().to_dict(),\n", 58 | " 'duplicates': df.duplicated().sum(),\n", 59 | " 'total_rows': len(df),\n", 60 | " 'memory_usage': df.memory_usage().sum() / 1024**2 # in MB\n", 61 | " }\n", 62 | " return quality_report\n" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "source": [ 68 | "## Step 2 – Standardize Data Types" 69 | ], 70 | "metadata": { 71 | "id": "OXyrWXTGvqkY" 72 | } 73 | }, 74 | { 75 | "cell_type": "code", 76 | "source": [ 77 | "def standardize_datatypes(df):\n", 78 | " for column in df.columns:\n", 79 | " # Try converting string dates to datetime\n", 80 | " if df[column].dtype == 'object':\n", 81 | " try:\n", 82 | " df[column] = pd.to_datetime(df[column])\n", 83 | " print(f\"Converted {column} to datetime\")\n", 84 | " except ValueError:\n", 85 | " # Try converting to numeric if datetime fails\n", 86 | " try:\n", 87 | " df[column] = pd.to_numeric(df[column].str.replace('$', '').str.replace(',', ''))\n", 88 | " print(f\"Converted {column} to numeric\")\n", 89 | " except:\n", 90 | " pass\n", 91 | " return df\n" 92 | ], 93 | "metadata": { 94 | "id": "EyVN1pbwjz22" 95 | }, 96 | "execution_count": 3, 97 | "outputs": [] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "source": [ 102 | "## Step 3 – Handle Missing Values" 103 | ], 104 | "metadata": { 105 | "id": "94Sbs42VwMdi" 106 | } 107 | }, 108 | { 109 | "cell_type": "code", 110 | "source": [ 111 | "from sklearn.impute import SimpleImputer\n", 112 | "\n", 113 | "def handle_missing_values(df):\n", 114 | " # Handle numeric columns\n", 115 | " numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns\n", 116 | " if len(numeric_columns) > 0:\n", 117 | " num_imputer = SimpleImputer(strategy='median')\n", 118 | " df[numeric_columns] = num_imputer.fit_transform(df[numeric_columns])\n", 119 | "\n", 120 | " # Handle categorical columns\n", 121 | " categorical_columns = df.select_dtypes(include=['object']).columns\n", 122 | " if len(categorical_columns) > 0:\n", 123 | " cat_imputer = SimpleImputer(strategy='most_frequent')\n", 124 | " df[categorical_columns] = cat_imputer.fit_transform(df[categorical_columns])\n", 125 | "\n", 126 | " return df\n" 127 | ], 128 | "metadata": { 129 | "id": "W4lCuzTJkVRb" 130 | }, 131 | "execution_count": 4, 132 | "outputs": [] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "source": [ 137 | "## Step 4 – Detect and Handle Outliers" 138 | ], 139 | "metadata": { 140 | "id": "EyylQ0h1v2Ap" 141 | } 142 | }, 143 | { 144 | "cell_type": "code", 145 | "source": [ 146 | "def remove_outliers(df):\n", 147 | " numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns\n", 148 | " outliers_removed = {}\n", 149 | "\n", 150 | " for column in numeric_columns:\n", 151 | " Q1 = df[column].quantile(0.25)\n", 152 | " Q3 = df[column].quantile(0.75)\n", 153 | " IQR = Q3 - Q1\n", 154 | " lower_bound = Q1 - 1.5 * IQR\n", 155 | " upper_bound = Q3 + 1.5 * IQR\n", 156 | "\n", 157 | " # Count outliers before removing\n", 158 | " outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)].shape[0]\n", 159 | "\n", 160 | " # Cap the 
values instead of removing them\n", 161 | " df[column] = df[column].clip(lower=lower_bound, upper=upper_bound)\n", 162 | "\n", 163 | " if outliers > 0:\n", 164 | " outliers_removed[column] = outliers\n", 165 | "\n", 166 | " return df, outliers_removed\n" 167 | ], 168 | "metadata": { 169 | "id": "Hic0lH3pkaYy" 170 | }, 171 | "execution_count": 5, 172 | "outputs": [] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "source": [ 177 | "## Step 5 – Validate the Results" 178 | ], 179 | "metadata": { 180 | "id": "CCV0vKBcwVVB" 181 | } 182 | }, 183 | { 184 | "cell_type": "code", 185 | "source": [ 186 | "def validate_cleaning(df, original_shape, cleaning_report):\n", 187 | " validation_results = {\n", 188 | " 'rows_remaining': len(df),\n", 189 | " 'missing_values_remaining': df.isnull().sum().sum(),\n", 190 | " 'duplicates_remaining': df.duplicated().sum(),\n", 191 | " 'data_loss_percentage': (1 - len(df)/original_shape[0]) * 100\n", 192 | " }\n", 193 | "\n", 194 | " # Add validation results to the cleaning report\n", 195 | " cleaning_report['validation'] = validation_results\n", 196 | " return cleaning_report\n" 197 | ], 198 | "metadata": { 199 | "id": "5mCT72R8ke2r" 200 | }, 201 | "execution_count": 6, 202 | "outputs": [] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "source": [ 207 | "## Putting It All Together" 208 | ], 209 | "metadata": { 210 | "id": "W46kva14wX7B" 211 | } 212 | }, 213 | { 214 | "cell_type": "code", 215 | "source": [ 216 | "def automated_cleaning_pipeline(df):\n", 217 | " # Store original shape for reporting\n", 218 | " original_shape = df.shape\n", 219 | "\n", 220 | " # Initialize cleaning report\n", 221 | " cleaning_report = {}\n", 222 | "\n", 223 | " # Execute each step and collect metrics\n", 224 | " cleaning_report['initial_quality'] = check_data_quality(df)\n", 225 | "\n", 226 | " df = standardize_datatypes(df)\n", 227 | " df = handle_missing_values(df)\n", 228 | " df, outliers = remove_outliers(df)\n", 229 | " cleaning_report['outliers_removed'] = outliers\n", 230 | "\n", 231 | " # Validate and finalize report\n", 232 | " cleaning_report = validate_cleaning(df, original_shape, cleaning_report)\n", 233 | "\n", 234 | " return df, cleaning_report\n" 235 | ], 236 | "metadata": { 237 | "id": "ybHCRL8Dkmhz" 238 | }, 239 | "execution_count": 7, 240 | "outputs": [] 241 | } 242 | ] 243 | } -------------------------------------------------------------------------------- /pandas/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /pandas/pandas_data_quality_checks_one_liners.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [] 7 | }, 8 | "kernelspec": { 9 | "name": "python3", 10 | "display_name": "Python 3" 11 | }, 12 | "language_info": { 13 | "name": "python" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "colab": { 22 | "base_uri": "https://localhost:8080/" 23 | }, 24 | "id": "BqAJoDh7f_jn", 25 | "outputId": "5631b53b-a7bb-48b4-e6d4-62138ff69cb9" 26 | }, 27 | "outputs": [ 28 | { 29 | "output_type": "stream", 30 | "name": "stdout", 31 | "text": [ 32 | " TransactionID CustomerName Product Price Quantity TransactionDate\n", 33 | "0 101 Jane Rust Laptop 1200 1.0 2024-12-01\n", 34 | "1 102 june young Phone 800 2.0 2024/12/01\n", 35 | "2 
103 Jane Rust Laptop 1200 NaN 01-12-2024\n", 36 | "3 104 None Tablet -300 1.0 None\n", 37 | "4 105 JUNE YOUNG Phone 850 1.0 2024-12-01\n" 38 | ] 39 | } 40 | ], 41 | "source": [ 42 | "import pandas as pd\n", 43 | "import numpy as np\n", 44 | "\n", 45 | "# Sample e-commerce transaction data\n", 46 | "data = {\n", 47 | " \"TransactionID\": [101, 102, 103, 104, 105],\n", 48 | " \"CustomerName\": [\"Jane Rust\", \"june young\", \"Jane Rust\", None, \"JUNE YOUNG\"],\n", 49 | " \"Product\": [\"Laptop\", \"Phone\", \"Laptop\", \"Tablet\", \"Phone\"],\n", 50 | " \"Price\": [1200, 800, 1200, -300, 850], # Negative value indicates an issue\n", 51 | " \"Quantity\": [1, 2, None, 1,1], # Missing value\n", 52 | " \"TransactionDate\": [\"2024-12-01\", \"2024/12/01\", \"01-12-2024\", None, \"2024-12-01\"],\n", 53 | "}\n", 54 | "\n", 55 | "df = pd.DataFrame(data)\n", 56 | "\n", 57 | "# Display the DataFrame\n", 58 | "print(df)\n" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "source": [ 64 | "df.info()" 65 | ], 66 | "metadata": { 67 | "colab": { 68 | "base_uri": "https://localhost:8080/" 69 | }, 70 | "id": "HFs0bFP75P-S", 71 | "outputId": "f3d17322-2d06-4cbd-c125-2a340af1d51c" 72 | }, 73 | "execution_count": null, 74 | "outputs": [ 75 | { 76 | "output_type": "stream", 77 | "name": "stdout", 78 | "text": [ 79 | "\n", 80 | "RangeIndex: 5 entries, 0 to 4\n", 81 | "Data columns (total 6 columns):\n", 82 | " # Column Non-Null Count Dtype \n", 83 | "--- ------ -------------- ----- \n", 84 | " 0 TransactionID 5 non-null int64 \n", 85 | " 1 CustomerName 4 non-null object \n", 86 | " 2 Product 5 non-null object \n", 87 | " 3 Price 5 non-null int64 \n", 88 | " 4 Quantity 4 non-null float64\n", 89 | " 5 TransactionDate 4 non-null object \n", 90 | "dtypes: float64(1), int64(2), object(3)\n", 91 | "memory usage: 368.0+ bytes\n" 92 | ] 93 | } 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "source": [ 99 | "missing_values = df.isnull().sum()\n", 100 | "print(\"Missing Values:\\n\", missing_values)\n" 101 | ], 102 | "metadata": { 103 | "colab": { 104 | "base_uri": "https://localhost:8080/" 105 | }, 106 | "id": "gCthsJ2kgaa4", 107 | "outputId": "12e5ad1d-e9f0-41e4-a421-0caced3a384d" 108 | }, 109 | "execution_count": null, 110 | "outputs": [ 111 | { 112 | "output_type": "stream", 113 | "name": "stdout", 114 | "text": [ 115 | "Missing Values:\n", 116 | " TransactionID 0\n", 117 | "CustomerName 1\n", 118 | "Product 0\n", 119 | "Price 0\n", 120 | "Quantity 1\n", 121 | "TransactionDate 1\n", 122 | "dtype: int64\n" 123 | ] 124 | } 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "source": [ 130 | "print(\"Data Types:\\n\", df.dtypes)" 131 | ], 132 | "metadata": { 133 | "colab": { 134 | "base_uri": "https://localhost:8080/" 135 | }, 136 | "id": "WQthCnS0gcBi", 137 | "outputId": "d3f21204-399c-4fe3-9478-2450c2ee098e" 138 | }, 139 | "execution_count": null, 140 | "outputs": [ 141 | { 142 | "output_type": "stream", 143 | "name": "stdout", 144 | "text": [ 145 | "Data Types:\n", 146 | " TransactionID int64\n", 147 | "CustomerName object\n", 148 | "Product object\n", 149 | "Price int64\n", 150 | "Quantity float64\n", 151 | "TransactionDate object\n", 152 | "dtype: object\n" 153 | ] 154 | } 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "source": [ 160 | "df[\"TransactionDate\"] = pd.to_datetime(df[\"TransactionDate\"], errors=\"coerce\")\n", 161 | "print(df[\"TransactionDate\"])\n" 162 | ], 163 | "metadata": { 164 | "colab": { 165 | "base_uri": "https://localhost:8080/" 166 | }, 167 | "id": 
"Jy-p-Qy3gdx6", 168 | "outputId": "9b96fc62-b902-4026-94f2-255f7dee4674" 169 | }, 170 | "execution_count": null, 171 | "outputs": [ 172 | { 173 | "output_type": "stream", 174 | "name": "stdout", 175 | "text": [ 176 | "0 2024-12-01\n", 177 | "1 NaT\n", 178 | "2 NaT\n", 179 | "3 NaT\n", 180 | "4 2024-12-01\n", 181 | "Name: TransactionDate, dtype: datetime64[ns]\n" 182 | ] 183 | } 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "source": [ 189 | "outliers = df[df[\"Price\"] < 0]\n", 190 | "print(\"Outliers:\\n\", outliers)\n" 191 | ], 192 | "metadata": { 193 | "colab": { 194 | "base_uri": "https://localhost:8080/" 195 | }, 196 | "id": "i7smYGpvgfh4", 197 | "outputId": "277ad03b-1186-4380-dd51-6b6e534f0b82" 198 | }, 199 | "execution_count": null, 200 | "outputs": [ 201 | { 202 | "output_type": "stream", 203 | "name": "stdout", 204 | "text": [ 205 | "Outliers:\n", 206 | " TransactionID CustomerName Product Price Quantity TransactionDate\n", 207 | "3 104 None Tablet -300 1.0 NaT\n" 208 | ] 209 | } 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "source": [ 215 | "duplicates = df.duplicated(subset=[\"CustomerName\", \"Product\"], keep=False)\n", 216 | "print(\"Duplicate Records:\\n\", df[duplicates])\n" 217 | ], 218 | "metadata": { 219 | "colab": { 220 | "base_uri": "https://localhost:8080/" 221 | }, 222 | "id": "dIp2HOilghYL", 223 | "outputId": "7b5a5b29-c2d5-47d4-fe81-78cc3d3655c9" 224 | }, 225 | "execution_count": null, 226 | "outputs": [ 227 | { 228 | "output_type": "stream", 229 | "name": "stdout", 230 | "text": [ 231 | "Duplicate Records:\n", 232 | " TransactionID CustomerName Product Price Quantity TransactionDate\n", 233 | "0 101 Jane Rust Laptop 1200 1.0 2024-12-01\n", 234 | "2 103 Jane Rust Laptop 1200 NaN NaT\n" 235 | ] 236 | } 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "source": [ 242 | "df[\"CustomerName\"] = df[\"CustomerName\"].str.strip().str.title()\n", 243 | "print(df[\"CustomerName\"])\n" 244 | ], 245 | "metadata": { 246 | "colab": { 247 | "base_uri": "https://localhost:8080/" 248 | }, 249 | "id": "mWIW43kvgjIX", 250 | "outputId": "c53f8270-9b4a-4214-9be7-1b5a73cbc3bb" 251 | }, 252 | "execution_count": null, 253 | "outputs": [ 254 | { 255 | "output_type": "stream", 256 | "name": "stdout", 257 | "text": [ 258 | "0 Jane Rust\n", 259 | "1 June Young\n", 260 | "2 Jane Rust\n", 261 | "3 None\n", 262 | "4 June Young\n", 263 | "Name: CustomerName, dtype: object\n" 264 | ] 265 | } 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "source": [ 271 | "invalid_prices = df[~df[\"Price\"].between(0, 5000)]\n", 272 | "print(\"Invalid Prices:\\n\", invalid_prices)\n" 273 | ], 274 | "metadata": { 275 | "colab": { 276 | "base_uri": "https://localhost:8080/" 277 | }, 278 | "id": "C7ciFe6-gkxx", 279 | "outputId": "b2a73dff-c3d9-4a7a-f619-2c011e840aa3" 280 | }, 281 | "execution_count": null, 282 | "outputs": [ 283 | { 284 | "output_type": "stream", 285 | "name": "stdout", 286 | "text": [ 287 | "Invalid Prices:\n", 288 | " TransactionID CustomerName Product Price Quantity TransactionDate\n", 289 | "3 104 None Tablet -300 1.0 NaT\n" 290 | ] 291 | } 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "source": [ 297 | "unique_products = df[\"Product\"].value_counts()\n", 298 | "print(\"Unique Products:\\n\", unique_products)\n" 299 | ], 300 | "metadata": { 301 | "colab": { 302 | "base_uri": "https://localhost:8080/" 303 | }, 304 | "id": "U9_V1fJ_gmS-", 305 | "outputId": "c1842f2f-ebeb-4a17-a1a9-c0887336c207" 306 | }, 307 | "execution_count": null, 308 | 
"outputs": [ 309 | { 310 | "output_type": "stream", 311 | "name": "stdout", 312 | "text": [ 313 | "Unique Products:\n", 314 | " Product\n", 315 | "Laptop 2\n", 316 | "Phone 2\n", 317 | "Tablet 1\n", 318 | "Name: count, dtype: int64\n" 319 | ] 320 | } 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "source": [ 326 | "inconsistent_names = df[\"CustomerName\"].str.contains(r\"[A-Z]{2,}\", na=False)\n", 327 | "print(\"Inconsistent Formatting in Names:\\n\", df[inconsistent_names])\n" 328 | ], 329 | "metadata": { 330 | "colab": { 331 | "base_uri": "https://localhost:8080/" 332 | }, 333 | "id": "2tWS4iMmgn6B", 334 | "outputId": "334ae93d-6628-4237-adf7-d54e1023a577" 335 | }, 336 | "execution_count": null, 337 | "outputs": [ 338 | { 339 | "output_type": "stream", 340 | "name": "stdout", 341 | "text": [ 342 | "Inconsistent Formatting in Names:\n", 343 | " Empty DataFrame\n", 344 | "Columns: [TransactionID, CustomerName, Product, Price, Quantity, TransactionDate]\n", 345 | "Index: []\n" 346 | ] 347 | } 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "source": [ 353 | "issues = df.isnull().sum(axis=1) + (df[\"Price\"] < 0) + (~df[\"TransactionDate\"].notnull())\n", 354 | "problematic_rows = df[issues > 1]\n", 355 | "print(\"Rows with Multiple Issues:\\n\", problematic_rows)\n" 356 | ], 357 | "metadata": { 358 | "colab": { 359 | "base_uri": "https://localhost:8080/" 360 | }, 361 | "id": "ZJEQ8_5Ugp7x", 362 | "outputId": "630c82dd-82b3-4bb5-a0a3-9b883d0985a7" 363 | }, 364 | "execution_count": null, 365 | "outputs": [ 366 | { 367 | "output_type": "stream", 368 | "name": "stdout", 369 | "text": [ 370 | "Rows with Multiple Issues:\n", 371 | " TransactionID CustomerName Product Price Quantity TransactionDate\n", 372 | "1 102 June Young Phone 800 2.0 NaT\n", 373 | "2 103 Jane Rust Laptop 1200 NaN NaT\n", 374 | "3 104 None Tablet -300 1.0 NaT\n" 375 | ] 376 | } 377 | ] 378 | } 379 | ] 380 | } -------------------------------------------------------------------------------- /postgres/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /pyspark/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /pyspark/pyspark_data_cleaning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [] 7 | }, 8 | "kernelspec": { 9 | "name": "python3", 10 | "display_name": "Python 3" 11 | }, 12 | "language_info": { 13 | "name": "python" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "source": [ 20 | "## Install PySpark" 21 | ], 22 | "metadata": { 23 | "id": "EnkdF6a8IJNL" 24 | } 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": { 30 | "colab": { 31 | "base_uri": "https://localhost:8080/" 32 | }, 33 | "id": "40wRo96rr55a", 34 | "outputId": "77638389-c474-44c0-98e4-68932fa52e14" 35 | }, 36 | "outputs": [ 37 | { 38 | "output_type": "stream", 39 | "name": "stdout", 40 | "text": [ 41 | "Collecting pyspark\n", 42 | " Downloading pyspark-3.5.2.tar.gz (317.3 MB)\n", 43 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m317.3/317.3 MB\u001b[0m \u001b[31m5.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 44 | "\u001b[?25h Preparing metadata 
(setup.py) ... \u001b[?25l\u001b[?25hdone\n", 45 | "Requirement already satisfied: py4j==0.10.9.7 in /usr/local/lib/python3.10/dist-packages (from pyspark) (0.10.9.7)\n", 46 | "Building wheels for collected packages: pyspark\n", 47 | " Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 48 | " Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=6bd80e2df67a29c669daab45ed1eb501ce4a7f36d432bceb1d510132c890bcd0\n", 49 | " Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574\n", 50 | "Successfully built pyspark\n", 51 | "Installing collected packages: pyspark\n", 52 | "Successfully installed pyspark-3.5.2\n" 53 | ] 54 | } 55 | ], 56 | "source": [ 57 | "! pip3 install pyspark" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "source": [ 63 | "## 1. Start a PySpark Session" 64 | ], 65 | "metadata": { 66 | "id": "s5NnsPriK2WZ" 67 | } 68 | }, 69 | { 70 | "cell_type": "code", 71 | "source": [ 72 | "from pyspark.sql import SparkSession\n", 73 | "\n", 74 | "# Initialize a Spark session\n", 75 | "spark = SparkSession.builder \\\n", 76 | "\t.appName(\"DataCleaning\") \\\n", 77 | "\t.getOrCreate()\n" 78 | ], 79 | "metadata": { 80 | "id": "sjLGh31-zcjY" 81 | }, 82 | "execution_count": null, 83 | "outputs": [] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "source": [ 88 | "## 2. Generate a Sample Dataset" 89 | ], 90 | "metadata": { 91 | "id": "XQmyi3t9K-vm" 92 | } 93 | }, 94 | { 95 | "cell_type": "code", 96 | "source": [ 97 | "import random\n", 98 | "import pandas as pd\n", 99 | "\n", 100 | "# Function to generate random data with some missing values and duplicates\n", 101 | "def generate_data(n):\n", 102 | " customer_ids = [f'C{str(i).zfill(5)}' for i in range(1, 101)]\n", 103 | " product_categories = ['Electronics', 'Books', 'Clothing', 'Groceries', 'Furniture']\n", 104 | "\n", 105 | " data = []\n", 106 | " for i in range(n):\n", 107 | " customer_id = random.choice(customer_ids) if i % 10 != 0 else None # Introduce some missing values\n", 108 | " transaction_id = f'T{str(random.randint(10000, 99999))}'\n", 109 | " transaction_date = pd.Timestamp('2023-01-01') + pd.to_timedelta(random.randint(0, 180), unit='d')\n", 110 | " amount = round(random.uniform(5, 500), 2)\n", 111 | " product_category = random.choice(product_categories)\n", 112 | " data.append((customer_id, transaction_id, transaction_date, amount, product_category))\n", 113 | "\n", 114 | " # Introduce duplicates\n", 115 | " data.extend(data[:10])\n", 116 | "\n", 117 | " return data" 118 | ], 119 | "metadata": { 120 | "id": "7mZDk8Arzlhj" 121 | }, 122 | "execution_count": null, 123 | "outputs": [] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "source": [ 128 | "# Generate 10,000 rows of data\n", 129 | "data = generate_data(10_000)\n", 130 | "\n", 131 | "# Convert to a Pandas DataFrame and then to PySpark DataFrame\n", 132 | "columns = ['CustomerID', 'TransactionID', 'TransactionDate', 'Amount', 'ProductCategory']\n", 133 | "df = pd.DataFrame(data, columns=columns)\n", 134 | "spark_df = spark.createDataFrame(df)\n", 135 | "\n", 136 | "spark_df.show(5)\n" 137 | ], 138 | "metadata": { 139 | "colab": { 140 | "base_uri": "https://localhost:8080/" 141 | }, 142 | "id": "laHfRBDkzp1z", 143 | "outputId": "137cbd86-ede9-4cc5-bf4b-a753e12bfb4b" 144 | }, 145 | "execution_count": null, 146 | "outputs": [ 147 | { 148 | "output_type": "stream", 149 | "name": "stdout", 150 | "text": [ 151 | 
"+----------+-------------+-------------------+------+---------------+\n", 152 | "|CustomerID|TransactionID| TransactionDate|Amount|ProductCategory|\n", 153 | "+----------+-------------+-------------------+------+---------------+\n", 154 | "| NULL| T17203|2023-03-20 00:00:00|221.92| Books|\n", 155 | "| NULL| T17203|2023-03-20 00:00:00|221.92| Books|\n", 156 | "| C00058| T63296|2023-02-11 00:00:00|157.92| Groceries|\n", 157 | "| NULL| T17203|2023-03-20 00:00:00|221.92| Books|\n", 158 | "| NULL| T17203|2023-03-20 00:00:00|221.92| Books|\n", 159 | "+----------+-------------+-------------------+------+---------------+\n", 160 | "only showing top 5 rows\n", 161 | "\n" 162 | ] 163 | } 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "source": [ 169 | "spark_df.dtypes" 170 | ], 171 | "metadata": { 172 | "colab": { 173 | "base_uri": "https://localhost:8080/" 174 | }, 175 | "id": "Wz3u7w8R8eQF", 176 | "outputId": "181b8c62-3d8f-4585-ed60-1c2adb33b8f5" 177 | }, 178 | "execution_count": null, 179 | "outputs": [ 180 | { 181 | "output_type": "execute_result", 182 | "data": { 183 | "text/plain": [ 184 | "[('CustomerID', 'string'),\n", 185 | " ('TransactionID', 'string'),\n", 186 | " ('TransactionDate', 'date'),\n", 187 | " ('Amount', 'double'),\n", 188 | " ('ProductCategory', 'string')]" 189 | ] 190 | }, 191 | "metadata": {}, 192 | "execution_count": 9 193 | } 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "source": [ 199 | "## 3. Handle Missing Values" 200 | ], 201 | "metadata": { 202 | "id": "nrouSi23Dw_I" 203 | } 204 | }, 205 | { 206 | "cell_type": "code", 207 | "source": [ 208 | "# Fill missing CustomerID with a default value\n", 209 | "spark_df = spark_df.fillna({\"CustomerID\": \"Unknown\"})\n" 210 | ], 211 | "metadata": { 212 | "id": "ku6AdAO6z9PA" 213 | }, 214 | "execution_count": null, 215 | "outputs": [] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "source": [ 220 | "## 4. Remove Duplicates" 221 | ], 222 | "metadata": { 223 | "id": "tuP2lpcYD6Nu" 224 | } 225 | }, 226 | { 227 | "cell_type": "code", 228 | "source": [ 229 | "from pyspark.sql.functions import col, min, max\n", 230 | "\n", 231 | "# Normalize the 'Amount' column\n", 232 | "min_amount = spark_df.agg(min(col(\"Amount\"))).collect()[0][0]\n", 233 | "max_amount = spark_df.agg(max(col(\"Amount\"))).collect()[0][0]\n", 234 | "\n", 235 | "spark_df = spark_df.withColumn(\"Amount\", (col(\"Amount\") - min_amount) / (max_amount - min_amount))\n" 236 | ], 237 | "metadata": { 238 | "id": "eomZcsnW0HCY" 239 | }, 240 | "execution_count": null, 241 | "outputs": [] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "source": [ 246 | "## 5. Transform Columns" 247 | ], 248 | "metadata": { 249 | "id": "nzJzlkajIzkF" 250 | } 251 | }, 252 | { 253 | "cell_type": "code", 254 | "source": [ 255 | "from pyspark.sql.functions import col, min, max\n", 256 | "\n", 257 | "# Normalize the 'Amount' column\n", 258 | "min_amount = spark_df.agg(min(col(\"Amount\"))).collect()[0][0]\n", 259 | "max_amount = spark_df.agg(max(col(\"Amount\"))).collect()[0][0]\n", 260 | "\n", 261 | "spark_df = spark_df.withColumn(\"Amount\", (col(\"Amount\") - min_amount) / (max_amount - min_amount))" 262 | ], 263 | "metadata": { 264 | "id": "5TPaJLV3I1r0" 265 | }, 266 | "execution_count": null, 267 | "outputs": [] 268 | }, 269 | { 270 | "cell_type": "markdown", 271 | "source": [ 272 | "## 6. 
Handle Outliers" 273 | ], 274 | "metadata": { 275 | "id": "y0JtNxWVJCju" 276 | } 277 | }, 278 | { 279 | "cell_type": "code", 280 | "source": [ 281 | "from pyspark.sql.functions import col, expr\n", 282 | "\n", 283 | "# Calculate Q1, Q3, and IQR\n", 284 | "quantiles = spark_df.approxQuantile(\"Amount\", [0.25, 0.75], 0.05)\n", 285 | "Q1 = quantiles[0]\n", 286 | "Q3 = quantiles[1]\n", 287 | "IQR = Q3 - Q1\n", 288 | "\n", 289 | "# Define the upper and lower bounds\n", 290 | "lower_bound = Q1 - 1.5 * IQR\n", 291 | "upper_bound = Q3 + 1.5 * IQR\n", 292 | "\n", 293 | "# Filter out the outliers\n", 294 | "spark_df = spark_df.filter((col(\"Amount\") >= lower_bound) & (col(\"Amount\") <= upper_bound))" 295 | ], 296 | "metadata": { 297 | "id": "KsPQQuGrJHYa" 298 | }, 299 | "execution_count": null, 300 | "outputs": [] 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "source": [ 305 | "\n", 306 | "## 7. Convert Data Types" 307 | ], 308 | "metadata": { 309 | "id": "8mpXH0cdJL8b" 310 | } 311 | }, 312 | { 313 | "cell_type": "code", 314 | "source": [ 315 | "from pyspark.sql.functions import to_date\n", 316 | "\n", 317 | "# Convert 'TransactionDate' to date format\n", 318 | "# (not quite needed for this dataset)\n", 319 | "spark_df = spark_df.withColumn(\"TransactionDate\", to_date(col(\"TransactionDate\")))\n" 320 | ], 321 | "metadata": { 322 | "id": "46QEuADu0LIS" 323 | }, 324 | "execution_count": null, 325 | "outputs": [] 326 | } 327 | ] 328 | } -------------------------------------------------------------------------------- /pyspark/pyspark_read_csv.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [] 7 | }, 8 | "kernelspec": { 9 | "name": "python3", 10 | "display_name": "Python 3" 11 | }, 12 | "language_info": { 13 | "name": "python" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "source": [ 20 | "## Install PySpark" 21 | ], 22 | "metadata": { 23 | "id": "5oNZgpo8Ljzu" 24 | } 25 | }, 26 | { 27 | "cell_type": "code", 28 | "source": [ 29 | "! pip3 install pyspark" 30 | ], 31 | "metadata": { 32 | "colab": { 33 | "base_uri": "https://localhost:8080/" 34 | }, 35 | "id": "T4Se5rOKKjzk", 36 | "outputId": "223f9be2-a51e-4006-e89d-b077b3d546c3" 37 | }, 38 | "execution_count": 1, 39 | "outputs": [ 40 | { 41 | "output_type": "stream", 42 | "name": "stdout", 43 | "text": [ 44 | "Collecting pyspark\n", 45 | " Downloading pyspark-3.5.2.tar.gz (317.3 MB)\n", 46 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m317.3/317.3 MB\u001b[0m \u001b[31m3.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 47 | "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 48 | "Requirement already satisfied: py4j==0.10.9.7 in /usr/local/lib/python3.10/dist-packages (from pyspark) (0.10.9.7)\n", 49 | "Building wheels for collected packages: pyspark\n", 50 | " Building wheel for pyspark (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", 51 | " Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=bbfd80a589ea8e2302f3938fd11b4434a84633b28244a0229ecf62245ae601d1\n", 52 | " Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574\n", 53 | "Successfully built pyspark\n", 54 | "Installing collected packages: pyspark\n", 55 | "Successfully installed pyspark-3.5.2\n" 56 | ] 57 | } 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "source": [ 63 | "## 1. Start a PySpark Session\n" 64 | ], 65 | "metadata": { 66 | "id": "myuR4RIqLmvp" 67 | } 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 2, 72 | "metadata": { 73 | "id": "caz_ZJqBKXU2" 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "from pyspark.sql import SparkSession\n", 78 | "\n", 79 | "# Initialize a Spark session\n", 80 | "spark = SparkSession.builder \\\n", 81 | "\t.appName(\"ReadCSV\") \\\n", 82 | "\t.getOrCreate()\n" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "source": [ 88 | "## 2. Generate a Sample CSV File" 89 | ], 90 | "metadata": { 91 | "id": "zv3SAft7Ltuk" 92 | } 93 | }, 94 | { 95 | "cell_type": "code", 96 | "source": [ 97 | "import random\n", 98 | "import pandas as pd\n", 99 | "\n", 100 | "# Function to generate random transaction data\n", 101 | "def generate_data(n):\n", 102 | " customer_ids = [f'C{str(i).zfill(5)}' for i in range(1, 101)]\n", 103 | " product_categories = ['Electronics', 'Books', 'Clothing', 'Groceries', 'Furniture']\n", 104 | "\n", 105 | " data = []\n", 106 | " for _ in range(n):\n", 107 | " customer_id = random.choice(customer_ids)\n", 108 | " transaction_id = f'T{str(random.randint(10000, 99999))}'\n", 109 | " transaction_date = pd.Timestamp('2023-01-01') + pd.to_timedelta(random.randint(0, 180), unit='d')\n", 110 | " amount = round(random.uniform(5, 500), 2)\n", 111 | " product_category = random.choice(product_categories)\n", 112 | " data.append((customer_id, transaction_id, transaction_date, amount, product_category))\n", 113 | "\n", 114 | " return data\n", 115 | "\n", 116 | "# Generate 10000 rows of transaction data\n", 117 | "data = generate_data(10_000)\n", 118 | "\n", 119 | "# Convert to a Pandas DataFrame\n", 120 | "columns = ['CustomerID', 'TransactionID', 'TransactionDate', 'Amount', 'ProductCategory']\n", 121 | "df = pd.DataFrame(data, columns=columns)\n", 122 | "\n", 123 | "# Create the CSV file\n", 124 | "csv_path = \"sample_transactions.csv\"\n", 125 | "df.to_csv(csv_path, index=False)\n", 126 | "\n", 127 | "print(f\"Sample CSV file '{csv_path}' generated.\")" 128 | ], 129 | "metadata": { 130 | "colab": { 131 | "base_uri": "https://localhost:8080/" 132 | }, 133 | "id": "RRiKSPCyKaah", 134 | "outputId": "0aa43d92-c219-4647-94c3-767f49bf9333" 135 | }, 136 | "execution_count": 3, 137 | "outputs": [ 138 | { 139 | "output_type": "stream", 140 | "name": "stdout", 141 | "text": [ 142 | "Sample CSV file 'sample_transactions.csv' generated.\n" 143 | ] 144 | } 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "source": [ 150 | "## 3. 
Read the CSV File into a PySpark DataFrame" 151 | ], 152 | "metadata": { 153 | "id": "g4PT5XIiLw8U" 154 | } 155 | }, 156 | { 157 | "cell_type": "code", 158 | "source": [ 159 | "spark_df = spark.read.csv(csv_path, header=True, inferSchema=True)\n", 160 | "\n", 161 | "# Show the first 5 rows\n", 162 | "spark_df.show(5)\n" 163 | ], 164 | "metadata": { 165 | "colab": { 166 | "base_uri": "https://localhost:8080/" 167 | }, 168 | "id": "7kGSq4k8KeZL", 169 | "outputId": "7b5677c9-629b-4e3e-dab7-fced000ca15e" 170 | }, 171 | "execution_count": 4, 172 | "outputs": [ 173 | { 174 | "output_type": "stream", 175 | "name": "stdout", 176 | "text": [ 177 | "+----------+-------------+---------------+------+---------------+\n", 178 | "|CustomerID|TransactionID|TransactionDate|Amount|ProductCategory|\n", 179 | "+----------+-------------+---------------+------+---------------+\n", 180 | "| C00006| T58996| 2023-01-09| 17.02| Furniture|\n", 181 | "| C00076| T30519| 2023-02-28|459.67| Books|\n", 182 | "| C00076| T89246| 2023-06-10|404.95| Clothing|\n", 183 | "| C00049| T11436| 2023-06-05| 103.9| Books|\n", 184 | "| C00049| T18176| 2023-04-03|406.55| Furniture|\n", 185 | "+----------+-------------+---------------+------+---------------+\n", 186 | "only showing top 5 rows\n", 187 | "\n" 188 | ] 189 | } 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "source": [ 195 | "## 4. Exploring the DataFrame" 196 | ], 197 | "metadata": { 198 | "id": "_RIITip_L5GO" 199 | } 200 | }, 201 | { 202 | "cell_type": "code", 203 | "source": [ 204 | "# Print the schema of the DataFrame\n", 205 | "spark_df.printSchema()\n" 206 | ], 207 | "metadata": { 208 | "colab": { 209 | "base_uri": "https://localhost:8080/" 210 | }, 211 | "id": "R0xFp27-KgXs", 212 | "outputId": "a06af877-c1a1-418b-a0d0-4be8a01f3377" 213 | }, 214 | "execution_count": 10, 215 | "outputs": [ 216 | { 217 | "output_type": "stream", 218 | "name": "stdout", 219 | "text": [ 220 | "root\n", 221 | " |-- CustomerID: string (nullable = true)\n", 222 | " |-- TransactionID: string (nullable = true)\n", 223 | " |-- TransactionDate: date (nullable = true)\n", 224 | " |-- Amount: double (nullable = true)\n", 225 | " |-- ProductCategory: string (nullable = true)\n", 226 | "\n" 227 | ] 228 | } 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "source": [ 234 | "from pyspark.sql.functions import col\n", 235 | "\n", 236 | "# Filter transactions with an Amount greater than 100\n", 237 | "filtered_df = spark_df.filter(col(\"Amount\") > 100)\n", 238 | "\n", 239 | "# Select specific columns\n", 240 | "selected_df = filtered_df.select(\"CustomerID\", \"TransactionID\", \"Amount\")\n", 241 | "\n", 242 | "# Show the results\n", 243 | "selected_df.show(5)" 244 | ], 245 | "metadata": { 246 | "colab": { 247 | "base_uri": "https://localhost:8080/" 248 | }, 249 | "id": "ua9R3BnoKiiQ", 250 | "outputId": "e19be35b-238c-4f8b-de13-e7a7efcd5afc" 251 | }, 252 | "execution_count": 11, 253 | "outputs": [ 254 | { 255 | "output_type": "stream", 256 | "name": "stdout", 257 | "text": [ 258 | "+----------+-------------+------+\n", 259 | "|CustomerID|TransactionID|Amount|\n", 260 | "+----------+-------------+------+\n", 261 | "| C00076| T30519|459.67|\n", 262 | "| C00076| T89246|404.95|\n", 263 | "| C00049| T11436| 103.9|\n", 264 | "| C00049| T18176|406.55|\n", 265 | "| C00096| T31087|349.47|\n", 266 | "+----------+-------------+------+\n", 267 | "only showing top 5 rows\n", 268 | "\n" 269 | ] 270 | } 271 | ] 272 | } 273 | ] 274 | } 
-------------------------------------------------------------------------------- /pyspark/pyspark_write_parquet.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [] 7 | }, 8 | "kernelspec": { 9 | "name": "python3", 10 | "display_name": "Python 3" 11 | }, 12 | "language_info": { 13 | "name": "python" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "source": [ 20 | "## Install PySpark" 21 | ], 22 | "metadata": { 23 | "id": "rsnBVqAGEUX-" 24 | } 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 1, 29 | "metadata": { 30 | "colab": { 31 | "base_uri": "https://localhost:8080/" 32 | }, 33 | "id": "nmFnWlbOB6lk", 34 | "outputId": "157e06fd-ad57-4541-dec5-3052ad9563fd" 35 | }, 36 | "outputs": [ 37 | { 38 | "output_type": "stream", 39 | "name": "stdout", 40 | "text": [ 41 | "Collecting pyspark\n", 42 | " Downloading pyspark-3.5.2.tar.gz (317.3 MB)\n", 43 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m317.3/317.3 MB\u001b[0m \u001b[31m2.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 44 | "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 45 | "Requirement already satisfied: py4j==0.10.9.7 in /usr/local/lib/python3.10/dist-packages (from pyspark) (0.10.9.7)\n", 46 | "Building wheels for collected packages: pyspark\n", 47 | " Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 48 | " Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=bb3917a42031cc2863a57c06e481281d96d8b540fd03498d256aff244f0d14ae\n", 49 | " Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574\n", 50 | "Successfully built pyspark\n", 51 | "Installing collected packages: pyspark\n", 52 | "Successfully installed pyspark-3.5.2\n" 53 | ] 54 | } 55 | ], 56 | "source": [ 57 | "! pip install pyspark" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "source": [ 63 | "## 1. Start a PySpark Session" 64 | ], 65 | "metadata": { 66 | "id": "rcqidYgeEX69" 67 | } 68 | }, 69 | { 70 | "cell_type": "code", 71 | "source": [ 72 | "from pyspark.sql import SparkSession\n", 73 | "\n", 74 | "# Initialize a Spark session\n", 75 | "spark = SparkSession.builder \\\n", 76 | "\t.appName(\"WriteToParquet\") \\\n", 77 | "\t.getOrCreate()\n" 78 | ], 79 | "metadata": { 80 | "id": "PCL06Q0oB92P" 81 | }, 82 | "execution_count": 2, 83 | "outputs": [] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "source": [ 88 | "## 2. 
Generating a Sample Dataset" 89 | ], 90 | "metadata": { 91 | "id": "ngfWs7IKEbvs" 92 | } 93 | }, 94 | { 95 | "cell_type": "code", 96 | "source": [ 97 | "import random\n", 98 | "import pandas as pd\n", 99 | "\n", 100 | "# Function to generate random transaction data\n", 101 | "def generate_data(n):\n", 102 | " customer_ids = [f'C{str(i).zfill(5)}' for i in range(1, 101)]\n", 103 | " product_categories = ['Electronics', 'Books', 'Clothing', 'Groceries', 'Furniture']\n", 104 | "\n", 105 | " data = []\n", 106 | " for _ in range(n):\n", 107 | " customer_id = random.choice(customer_ids)\n", 108 | " transaction_id = f'T{str(random.randint(10000, 99999))}'\n", 109 | " transaction_date = pd.Timestamp('2023-01-01') + pd.to_timedelta(random.randint(0, 180), unit='d')\n", 110 | " amount = round(random.uniform(5, 500), 2)\n", 111 | " product_category = random.choice(product_categories)\n", 112 | " data.append((customer_id, transaction_id, transaction_date, amount, product_category))\n", 113 | "\n", 114 | " return data\n" 115 | ], 116 | "metadata": { 117 | "id": "RIOrcXxoCCoc" 118 | }, 119 | "execution_count": 3, 120 | "outputs": [] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "source": [ 125 | "# Generate 100,000 rows of transaction data\n", 126 | "data = generate_data(100_000)\n", 127 | "\n", 128 | "# Convert to a Pandas DataFrame\n", 129 | "columns = ['CustomerID', 'TransactionID', 'TransactionDate', 'Amount', 'ProductCategory']\n", 130 | "df = pd.DataFrame(data, columns=columns)\n", 131 | "\n", 132 | "# Convert to a PySpark DataFrame\n", 133 | "spark_df = spark.createDataFrame(df)\n", 134 | "spark_df.show(5)\n" 135 | ], 136 | "metadata": { 137 | "colab": { 138 | "base_uri": "https://localhost:8080/" 139 | }, 140 | "id": "COM3xnQsCE7l", 141 | "outputId": "e8850727-55a3-4b8c-fb68-913195b81b26" 142 | }, 143 | "execution_count": 4, 144 | "outputs": [ 145 | { 146 | "output_type": "stream", 147 | "name": "stdout", 148 | "text": [ 149 | "+----------+-------------+-------------------+------+---------------+\n", 150 | "|CustomerID|TransactionID| TransactionDate|Amount|ProductCategory|\n", 151 | "+----------+-------------+-------------------+------+---------------+\n", 152 | "| C00012| T36462|2023-05-05 00:00:00| 90.91| Furniture|\n", 153 | "| C00037| T81031|2023-03-19 00:00:00|465.54| Electronics|\n", 154 | "| C00092| T98628|2023-02-25 00:00:00| 180.9| Clothing|\n", 155 | "| C00050| T46850|2023-04-16 00:00:00|494.67| Furniture|\n", 156 | "| C00097| T79766|2023-04-11 00:00:00|179.65| Groceries|\n", 157 | "+----------+-------------+-------------------+------+---------------+\n", 158 | "only showing top 5 rows\n", 159 | "\n" 160 | ] 161 | } 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "source": [ 167 | "## 3. Writing DataFrames to Parquet Files" 168 | ], 169 | "metadata": { 170 | "id": "QXoDo2PdEetE" 171 | } 172 | }, 173 | { 174 | "cell_type": "code", 175 | "source": [ 176 | "# Specify the path to the Parquet file\n", 177 | "output_path = \"transactions.parquet\"\n", 178 | "\n", 179 | "# Write the DataFrame to Parquet format\n", 180 | "spark_df.write.parquet(output_path)\n" 181 | ], 182 | "metadata": { 183 | "id": "USrdKz9zCH3-" 184 | }, 185 | "execution_count": 5, 186 | "outputs": [] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "source": [ 191 | "! 
ls" 192 | ], 193 | "metadata": { 194 | "colab": { 195 | "base_uri": "https://localhost:8080/" 196 | }, 197 | "id": "nWc4bKgJCKek", 198 | "outputId": "a522d32b-831e-42b9-d38e-027d7bea5123" 199 | }, 200 | "execution_count": 6, 201 | "outputs": [ 202 | { 203 | "output_type": "stream", 204 | "name": "stdout", 205 | "text": [ 206 | "sample_data transactions.parquet\n" 207 | ] 208 | } 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "source": [ 214 | "## 4. Writing Partitioned Parquet Files\n" 215 | ], 216 | "metadata": { 217 | "id": "jCzZVwHlElYG" 218 | } 219 | }, 220 | { 221 | "cell_type": "code", 222 | "source": [ 223 | "# Write the dataframe to Parquet format, partitioned by 'ProductCategory'\n", 224 | "partitioned_output_path = \"transactions_partitioned.parquet\"\n", 225 | "spark_df.write.partitionBy(\"ProductCategory\").parquet(partitioned_output_path)\n" 226 | ], 227 | "metadata": { 228 | "id": "AE9MaBStCLXH" 229 | }, 230 | "execution_count": 7, 231 | "outputs": [] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "source": [ 236 | "! ls" 237 | ], 238 | "metadata": { 239 | "colab": { 240 | "base_uri": "https://localhost:8080/" 241 | }, 242 | "id": "n3JMnfYIDoaP", 243 | "outputId": "cda8bdc6-f319-4fcf-fddc-1a7aeca95b40" 244 | }, 245 | "execution_count": 8, 246 | "outputs": [ 247 | { 248 | "output_type": "stream", 249 | "name": "stdout", 250 | "text": [ 251 | "sample_data transactions.parquet transactions_partitioned.parquet\n" 252 | ] 253 | } 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "source": [ 259 | "! ls transactions_partitioned.parquet" 260 | ], 261 | "metadata": { 262 | "colab": { 263 | "base_uri": "https://localhost:8080/" 264 | }, 265 | "id": "dHS4VW8qD8nI", 266 | "outputId": "3acd5f08-79d8-4a69-9667-adc1f07ad114" 267 | }, 268 | "execution_count": 10, 269 | "outputs": [ 270 | { 271 | "output_type": "stream", 272 | "name": "stdout", 273 | "text": [ 274 | "'ProductCategory=Books' 'ProductCategory=Electronics' 'ProductCategory=Groceries'\n", 275 | "'ProductCategory=Clothing' 'ProductCategory=Furniture' _SUCCESS\n" 276 | ] 277 | } 278 | ] 279 | }, 280 | { 281 | "cell_type": "markdown", 282 | "source": [ 283 | "## 5. 
Reading Parquet Files" 284 | ], 285 | "metadata": { 286 | "id": "gJdb3rHwEosV" 287 | } 288 | }, 289 | { 290 | "cell_type": "code", 291 | "source": [ 292 | "# Read in the Parquet file\n", 293 | "df_read = spark.read.parquet(output_path)\n", 294 | "\n", 295 | "# Show the content of the DataFrame\n", 296 | "df_read.show(5)\n" 297 | ], 298 | "metadata": { 299 | "colab": { 300 | "base_uri": "https://localhost:8080/" 301 | }, 302 | "id": "xyH_5SMxCNww", 303 | "outputId": "2091c0ba-0c2a-4552-b07a-f439d4ce9502" 304 | }, 305 | "execution_count": 11, 306 | "outputs": [ 307 | { 308 | "output_type": "stream", 309 | "name": "stdout", 310 | "text": [ 311 | "+----------+-------------+-------------------+------+---------------+\n", 312 | "|CustomerID|TransactionID| TransactionDate|Amount|ProductCategory|\n", 313 | "+----------+-------------+-------------------+------+---------------+\n", 314 | "| C00012| T36462|2023-05-05 00:00:00| 90.91| Furniture|\n", 315 | "| C00037| T81031|2023-03-19 00:00:00|465.54| Electronics|\n", 316 | "| C00092| T98628|2023-02-25 00:00:00| 180.9| Clothing|\n", 317 | "| C00050| T46850|2023-04-16 00:00:00|494.67| Furniture|\n", 318 | "| C00097| T79766|2023-04-11 00:00:00|179.65| Groceries|\n", 319 | "+----------+-------------+-------------------+------+---------------+\n", 320 | "only showing top 5 rows\n", 321 | "\n" 322 | ] 323 | } 324 | ] 325 | } 326 | ] 327 | } -------------------------------------------------------------------------------- /regex/learn_regex.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | text = "Data science is cool as you get to work with real-world data" 4 | matches = re.findall(r"data", text) 5 | print(matches) 6 | 7 | matches = re.findall(r"data", text, re.IGNORECASE) 8 | print(matches) 9 | 10 | text = "The cat sat on the mat. The bat flew over the rat." 11 | pattern = r"The ... " 12 | matches = re.findall(pattern, text) 13 | print(matches) 14 | 15 | text = "The cat sat on the mat. The bat flew over the rat." 16 | pattern = r"[cb]at" 17 | matches = re.findall(pattern, text) 18 | print(matches) 19 | 20 | 21 | # Find all lowercase words that start with a-d 22 | pattern = r"\b[a-d][a-z]*\b" 23 | text = "apple banana cherry date elephant fig grape kiwi lemon mango orange" 24 | matches = re.findall(pattern, text) 25 | print(matches) 26 | 27 | 28 | text = "Contact: john.doe@example.com" 29 | pattern = r"(?P[\w.]+)@(?P[\w.]+)" 30 | 31 | match = re.search(pattern, text) 32 | if match: 33 | print(f"Username: {match.group('username')}") 34 | print(f"Domain: {match.group('domain')}") 35 | 36 | 37 | 38 | text = "Phone numbers: 555-1234, 555-5678, 5551234" 39 | pattern = r"\b\d{3}-?\d{4}\b" 40 | matches = re.findall(pattern, text) 41 | print(matches) 42 | 43 | 44 | 45 | text = "Python is popular in data science." 
46 | 47 | # ^ anchors to the start of the string 48 | start_matches = re.findall(r"^Python", text) 49 | print(start_matches) 50 | 51 | # $ anchors to the end of the string 52 | end_matches = re.findall(r"science\.$", text) 53 | print(end_matches) 54 | 55 | text = "Dates: 2023-10-15, 2022-05-22" 56 | pattern = r"(\d{4})-(\d{2})-(\d{2})" 57 | 58 | # findall returns tuples of the captured groups 59 | matches = re.findall(pattern, text) 60 | print(matches) 61 | 62 | # You can use these to create structured data 63 | for year, month, day in matches: 64 | print(f"Year: {year}, Month: {month}, Day: {day}") 65 | 66 | 67 | text = "Contact: john.doe@example.com" 68 | pattern = r"(?P[\w.]+)@(?P[\w.]+)" 69 | 70 | match = re.search(pattern, text) 71 | if match: 72 | print(f"Username: {match.group('username')}") 73 | print(f"Domain: {match.group('domain')}") 74 | -------------------------------------------------------------------------------- /regex/quick-ref-regex.md: -------------------------------------------------------------------------------- 1 | # Regular Expressions Quick Reference Table 2 | 3 | ## Basic Metacharacters 4 | 5 | | Character | Description | Example | Matches | 6 | |-----------|-------------|---------|---------| 7 | | `.` | Any character except newline | `a.b` | "acb", "adb", "a3b", etc. | 8 | | `^` | Start of string | `^Hello` | "Hello world" but not "Say Hello" | 9 | | `$` | End of string | `world$` | "Hello world" but not "world class" | 10 | | `*` | 0 or more repetitions | `ab*c` | "ac", "abc", "abbc", etc. | 11 | | `+` | 1 or more repetitions | `ab+c` | "abc", "abbc", etc. but not "ac" | 12 | | `?` | 0 or 1 repetition | `ab?c` | "ac", "abc" but not "abbc" | 13 | | `{n}` | Exactly n repetitions | `a{3}` | "aaa" | 14 | | `{m,n}` | m to n repetitions | `a{2,4}` | "aa", "aaa", "aaaa" | 15 | | `{m,}` | m or more repetitions | `a{2,}` | "aa", "aaa", "aaaa", etc. | 16 | | `\` | Escape character | `\.` | Literal period "." | 17 | | `[]` | Character class | `[abc]` | "a", "b", or "c" | 18 | | `\|` | Alternation (OR) | `cat\|dog` | "cat" or "dog" | 19 | | `()` | Grouping | `(ab)+` | "ab", "abab", etc. 
| 20 | 21 | ## Character Classes 22 | 23 | | Expression | Description | Equivalent | 24 | |------------|-------------|------------| 25 | | `\d` | Any digit | `[0-9]` | 26 | | `\D` | Any non-digit | `[^0-9]` | 27 | | `\w` | Any word character | `[a-zA-Z0-9_]` | 28 | | `\W` | Any non-word character | `[^a-zA-Z0-9_]` | 29 | | `\s` | Any whitespace | `[ \t\n\r\f\v]` | 30 | | `\S` | Any non-whitespace | `[^ \t\n\r\f\v]` | 31 | | `[abc]` | Any of listed characters | - | 32 | | `[^abc]` | Any character except listed | - | 33 | | `[a-z]` | Any character in range | - | 34 | 35 | ## Assertions 36 | 37 | | Expression | Description | 38 | |------------|-------------| 39 | | `(?=...)` | Positive lookahead | 40 | | `(?!...)` | Negative lookahead | 41 | | `(?<=...)` | Positive lookbehind | 42 | | `(?[\\w.]+)@(?P[\\w.]+)\"\n", 298 | "\n", 299 | "match = re.search(pattern, text)\n", 300 | "if match:\n", 301 | " print(f\"Username: {match.group('username')}\")\n", 302 | " print(f\"Domain: {match.group('domain')}\")\n" 303 | ], 304 | "metadata": { 305 | "colab": { 306 | "base_uri": "https://localhost:8080/" 307 | }, 308 | "id": "L9FOB0uiLexp", 309 | "outputId": "c73b8c7f-af37-4016-b404-d9ad7502514f" 310 | }, 311 | "execution_count": 24, 312 | "outputs": [ 313 | { 314 | "output_type": "stream", 315 | "name": "stdout", 316 | "text": [ 317 | "Username: john.doe\n", 318 | "Domain: example.com\n" 319 | ] 320 | } 321 | ] 322 | } 323 | ] 324 | } -------------------------------------------------------------------------------- /regex/regex_contd.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [] 7 | }, 8 | "kernelspec": { 9 | "name": "python3", 10 | "display_name": "Python 3" 11 | }, 12 | "language_info": { 13 | "name": "python" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "id": "VxllMcawIjlB" 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "import re" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "source": [ 31 | "text = \"
<div>First content</div><div>Second content</div>\"\n", 32 | "\n", 33 | "# Greedy matching (default)\n", 34 | "greedy = re.findall(r\"<div>(.*)</div>\", text)\n", 35 | "print(f\"Greedy: {greedy}\")\n", 36 | "\n", 37 | "# Non-greedy matching\n", 38 | "non_greedy = re.findall(r\"<div>(.*?)</div>\", text)\n", 39 | "print(f\"Non-greedy: {non_greedy}\")\n" 40 | ], 41 | "metadata": { 42 | "colab": { 43 | "base_uri": "https://localhost:8080/" 44 | }, 45 | "id": "Cb35InjGIzW3", 46 | "outputId": "cf4f7b63-8e9c-4ad0-f4f7-a0c6bbd52a03" 47 | }, 48 | "execution_count": 2, 49 | "outputs": [ 50 | { 51 | "output_type": "stream", 52 | "name": "stdout", 53 | "text": [ 54 | "Greedy: ['First content</div><div>
Second content']\n", 55 | "Non-greedy: ['First content', 'Second content']\n" 56 | ] 57 | } 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "source": [ 63 | "# Password validation\n", 64 | "password = \"Password123\"\n", 65 | "has_uppercase = bool(re.search(r\"(?=.*[A-Z])\", password))\n", 66 | "has_lowercase = bool(re.search(r\"(?=.*[a-z])\", password))\n", 67 | "has_digit = bool(re.search(r\"(?=.*\\d)\", password))\n", 68 | "is_long_enough = len(password) >= 8\n", 69 | "\n", 70 | "if all([has_uppercase, has_lowercase, has_digit, is_long_enough]):\n", 71 | " print(\"Password meets requirements\")\n", 72 | "else:\n", 73 | " print(\"Password does not meet all requirements\")\n" 74 | ], 75 | "metadata": { 76 | "colab": { 77 | "base_uri": "https://localhost:8080/" 78 | }, 79 | "id": "N21dkm8NI-TG", 80 | "outputId": "4c0b31c3-422c-4145-e07f-aff64c4a5950" 81 | }, 82 | "execution_count": 3, 83 | "outputs": [ 84 | { 85 | "output_type": "stream", 86 | "name": "stdout", 87 | "text": [ 88 | "Password meets requirements\n" 89 | ] 90 | } 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "source": [], 96 | "metadata": { 97 | "id": "VNTuYz1bI-Jc" 98 | }, 99 | "execution_count": 3, 100 | "outputs": [] 101 | } 102 | ] 103 | } -------------------------------------------------------------------------------- /regex/regex_examples.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [] 7 | }, 8 | "kernelspec": { 9 | "name": "python3", 10 | "display_name": "Python 3" 11 | }, 12 | "language_info": { 13 | "name": "python" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "colab": { 22 | "base_uri": "https://localhost:8080/" 23 | }, 24 | "id": "YPKB2a5Hsw02", 25 | "outputId": "3da0932d-8614-477c-b6a6-d202c795f3da" 26 | }, 27 | "outputs": [ 28 | { 29 | "output_type": "stream", 30 | "name": "stdout", 31 | "text": [ 32 | "Contact info: 1234567890 and 9876543210.\n" 33 | ] 34 | } 35 | ], 36 | "source": [ 37 | "import re\n", 38 | "\n", 39 | "text = \"Contact info: (123)-456-7890 and 987-654-3210.\"\n", 40 | "cleaned_text = re.sub(r'[()-]', '', text)\n", 41 | "print(cleaned_text)\n" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "source": [ 47 | "text = \"Please reach out to us at support@example.org or help@example.org.\"\n", 48 | "emails = re.findall(r'\\b[\\w.-]+?@\\w+?\\.\\w+?\\b', text)\n", 49 | "print(emails)\n" 50 | ], 51 | "metadata": { 52 | "colab": { 53 | "base_uri": "https://localhost:8080/" 54 | }, 55 | "id": "eGS7s-zTs-9T", 56 | "outputId": "74a8b91b-5af7-48f6-9dd3-d690b9f36b21" 57 | }, 58 | "execution_count": null, 59 | "outputs": [ 60 | { 61 | "output_type": "stream", 62 | "name": "stdout", 63 | "text": [ 64 | "['support@example.org', 'help@example.org']\n" 65 | ] 66 | } 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "source": [ 72 | "text = \"This\tis\ta\tstring with multiple unnecessary spaces.\"\n", 73 | "cleaned_text = re.sub(r'\\s+', ' ', text)\n", 74 | "print(cleaned_text)\n" 75 | ], 76 | "metadata": { 77 | "colab": { 78 | "base_uri": "https://localhost:8080/" 79 | }, 80 | "id": "dd0K0-LrtmBi", 81 | "outputId": "5436543f-4b19-4081-f8d4-5ec6f64d6d6b" 82 | }, 83 | "execution_count": null, 84 | "outputs": [ 85 | { 86 | "output_type": "stream", 87 | "name": "stdout", 88 | "text": [ 89 | "This is a string with multiple unnecessary spaces.\n" 90 | ] 91 | } 92 | ] 93 | }, 94 | { 95 | 
"cell_type": "code", 96 | "source": [ 97 | "email = \"test@example.com\"\n", 98 | "if re.match(r'^\\b[\\w.-]+?@\\w+?\\.\\w+?\\b$', email):\n", 99 | " print(\"Valid email\") # Output: Valid email\n", 100 | "else:\n", 101 | " print(\"Invalid email\")\n" 102 | ], 103 | "metadata": { 104 | "colab": { 105 | "base_uri": "https://localhost:8080/" 106 | }, 107 | "id": "j05fGS1UyCfe", 108 | "outputId": "402a5319-ba31-44d8-e870-ccbc35535af3" 109 | }, 110 | "execution_count": null, 111 | "outputs": [ 112 | { 113 | "output_type": "stream", 114 | "name": "stdout", 115 | "text": [ 116 | "Valid email\n" 117 | ] 118 | } 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "source": [ 124 | "text = \"This is sentence one. And this is sentence two! Is this sentence three?\"\n", 125 | "sentences = re.split(r'[.!?]', text)\n", 126 | "print(sentences) # Output: ['This is sentence one', ' And this is sentence two', ' Is this sentence three', '']\n" 127 | ], 128 | "metadata": { 129 | "colab": { 130 | "base_uri": "https://localhost:8080/" 131 | }, 132 | "id": "7f68JqnBzBX9", 133 | "outputId": "455de3e7-3cd0-4ffc-ad69-d058e9ecedff" 134 | }, 135 | "execution_count": null, 136 | "outputs": [ 137 | { 138 | "output_type": "stream", 139 | "name": "stdout", 140 | "text": [ 141 | "['This is sentence one', ' And this is sentence two', ' Is this sentence three', '']\n" 142 | ] 143 | } 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "source": [ 149 | "import pandas as pd\n", 150 | "\n", 151 | "data = {\n", 152 | "\t'names': ['Alice123', 'Bob!@#', 'Charlie$$$'],\n", 153 | "\t'emails': ['alice@example.com', 'bob_at_example.com', 'charlie@example.com']\n", 154 | "}\n", 155 | "df = pd.DataFrame(data)\n", 156 | "\n", 157 | "# Remove non-alphabetic characters from names\n", 158 | "df['names'] = df['names'].str.replace(r'[^a-zA-Z]', '', regex=True)\n", 159 | "\n", 160 | "# Validate email addresses\n", 161 | "df['valid_email'] = df['emails'].apply(lambda x: bool(re.match(r'^\\b[\\w.-]+?@\\w+?\\.\\w+?\\b$', x)))\n", 162 | "\n", 163 | "print(df)\n" 164 | ], 165 | "metadata": { 166 | "colab": { 167 | "base_uri": "https://localhost:8080/" 168 | }, 169 | "id": "qboHFiS30UMQ", 170 | "outputId": "eeb42cb5-ebcf-4ebe-f301-74c2c1ac184a" 171 | }, 172 | "execution_count": null, 173 | "outputs": [ 174 | { 175 | "output_type": "stream", 176 | "name": "stdout", 177 | "text": [ 178 | " names emails valid_email\n", 179 | "0 Alice alice@example.com True\n", 180 | "1 Bob bob_at_example.com False\n", 181 | "2 Charlie charlie@example.com True\n" 182 | ] 183 | } 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "source": [], 189 | "metadata": { 190 | "id": "la5oKWfX0U2Z" 191 | }, 192 | "execution_count": null, 193 | "outputs": [] 194 | } 195 | ] 196 | } -------------------------------------------------------------------------------- /statistics/Basic_Stats_Functions_Python.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [] 7 | }, 8 | "kernelspec": { 9 | "name": "python3", 10 | "display_name": "Python 3" 11 | }, 12 | "language_info": { 13 | "name": "python" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "source": [ 20 | "## Import the Built-In `statistics` Module" 21 | ], 22 | "metadata": { 23 | "id": "s8yOidchG5UV" 24 | } 25 | }, 26 | { 27 | "cell_type": "code", 28 | "source": [ 29 | "import statistics" 30 | ], 31 | "metadata": { 32 | "id": "cOmUhMH9bIAb" 33 | }, 34 
| "execution_count": null, 35 | "outputs": [] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "source": [ 40 | "## 1. Mean" 41 | ], 42 | "metadata": { 43 | "id": "Cy_sSAo4bExW" 44 | } 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": { 50 | "colab": { 51 | "base_uri": "https://localhost:8080/" 52 | }, 53 | "id": "v-3qQD50a9hT", 54 | "outputId": "2a7d7cd5-d8f9-445d-f56f-59ac7f4e57b6" 55 | }, 56 | "outputs": [ 57 | { 58 | "output_type": "stream", 59 | "name": "stdout", 60 | "text": [ 61 | "Mean: 30\n" 62 | ] 63 | } 64 | ], 65 | "source": [ 66 | "data = [10, 20, 30, 40, 50]\n", 67 | "mean = statistics.mean(data)\n", 68 | "print(\"Mean:\", mean)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "source": [ 74 | "## 2. Median" 75 | ], 76 | "metadata": { 77 | "id": "obFi961MbQ46" 78 | } 79 | }, 80 | { 81 | "cell_type": "code", 82 | "source": [ 83 | "data = [15, 20, 35, 40, 50]\n", 84 | "median = statistics.median(data)\n", 85 | "print(\"Median:\", median)" 86 | ], 87 | "metadata": { 88 | "colab": { 89 | "base_uri": "https://localhost:8080/" 90 | }, 91 | "id": "FrS8KYaXbPWy", 92 | "outputId": "f07a115a-5f18-462a-ae47-ab3c239db261" 93 | }, 94 | "execution_count": null, 95 | "outputs": [ 96 | { 97 | "output_type": "stream", 98 | "name": "stdout", 99 | "text": [ 100 | "Median: 35\n" 101 | ] 102 | } 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "source": [ 108 | "## 3. Mode" 109 | ], 110 | "metadata": { 111 | "id": "b9ybgj32bYKy" 112 | } 113 | }, 114 | { 115 | "cell_type": "code", 116 | "source": [ 117 | "data = [1, 2, 2, 3, 4, 4, 4]\n", 118 | "mode = statistics.mode(data)\n", 119 | "print(\"Mode:\", mode)" 120 | ], 121 | "metadata": { 122 | "colab": { 123 | "base_uri": "https://localhost:8080/" 124 | }, 125 | "id": "AgrG9I5fbWU0", 126 | "outputId": "eebf7a08-b1d0-42f7-f982-3a20b9241082" 127 | }, 128 | "execution_count": null, 129 | "outputs": [ 130 | { 131 | "output_type": "stream", 132 | "name": "stdout", 133 | "text": [ 134 | "Mode: 4\n" 135 | ] 136 | } 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "source": [ 142 | "data = [1, 2, 2, 2, 3, 4, 4, 4, 7, 7, 7]\n", 143 | "mode = statistics.mode(data)\n", 144 | "print(\"Modes:\", mode)" 145 | ], 146 | "metadata": { 147 | "colab": { 148 | "base_uri": "https://localhost:8080/" 149 | }, 150 | "id": "d3D3oyVBccaa", 151 | "outputId": "93c7fa31-ca9c-429f-df8d-d93ca9eef080" 152 | }, 153 | "execution_count": null, 154 | "outputs": [ 155 | { 156 | "output_type": "stream", 157 | "name": "stdout", 158 | "text": [ 159 | "Modes: 2\n" 160 | ] 161 | } 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "source": [ 167 | "data = [1, 2, 2, 2, 3, 4, 4, 4, 7, 7, 7]\n", 168 | "modes = statistics.multimode(data)\n", 169 | "print(\"Modes:\", modes)" 170 | ], 171 | "metadata": { 172 | "colab": { 173 | "base_uri": "https://localhost:8080/" 174 | }, 175 | "id": "62_XzwJhcH3d", 176 | "outputId": "e7cf6cd4-50b3-42a5-b1ad-d45be40c602a" 177 | }, 178 | "execution_count": null, 179 | "outputs": [ 180 | { 181 | "output_type": "stream", 182 | "name": "stdout", 183 | "text": [ 184 | "Modes: [2, 4, 7]\n" 185 | ] 186 | } 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "source": [ 192 | "## 4. 
Standard Deviation" 193 | ], 194 | "metadata": { 195 | "id": "neQiIHTC6CtL" 196 | } 197 | }, 198 | { 199 | "cell_type": "code", 200 | "source": [ 201 | "data = [12, 15, 22, 29, 35]\n", 202 | "std_dev = statistics.stdev(data)\n", 203 | "print(f\"Standard Deviation: {std_dev:.3f}\")" 204 | ], 205 | "metadata": { 206 | "colab": { 207 | "base_uri": "https://localhost:8080/" 208 | }, 209 | "id": "uY-DcaV4cRux", 210 | "outputId": "98166ea5-b57c-4e1b-f526-5cfbb3f9aed7" 211 | }, 212 | "execution_count": null, 213 | "outputs": [ 214 | { 215 | "output_type": "stream", 216 | "name": "stdout", 217 | "text": [ 218 | "Standard Deviation: 9.555\n" 219 | ] 220 | } 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "source": [ 226 | "## 5. Variance" 227 | ], 228 | "metadata": { 229 | "id": "q6Ra31AD7jcU" 230 | } 231 | }, 232 | { 233 | "cell_type": "code", 234 | "source": [ 235 | "data = [8, 10, 12, 14, 16]\n", 236 | "variance = statistics.variance(data)\n", 237 | "print(f\"Variance: {variance:.2f}\")" 238 | ], 239 | "metadata": { 240 | "colab": { 241 | "base_uri": "https://localhost:8080/" 242 | }, 243 | "id": "ALOJxc4V6G0a", 244 | "outputId": "ff7c8a7d-8250-4fc3-b48a-a59dc1e15877" 245 | }, 246 | "execution_count": null, 247 | "outputs": [ 248 | { 249 | "output_type": "stream", 250 | "name": "stdout", 251 | "text": [ 252 | "Variance: 10.00\n" 253 | ] 254 | } 255 | ] 256 | }, 257 | { 258 | "cell_type": "markdown", 259 | "source": [ 260 | "## 6. Covariance" 261 | ], 262 | "metadata": { 263 | "id": "oXGwdDsci1AP" 264 | } 265 | }, 266 | { 267 | "cell_type": "code", 268 | "source": [ 269 | "data1 = [2, 4, 6, 8, 10]\n", 270 | "data2 = [1, 3, 5, 7, 9]\n", 271 | "covariance = statistics.covariance(data1, data2)\n", 272 | "print(\"Covariance:\", covariance)" 273 | ], 274 | "metadata": { 275 | "id": "5wjqe8n67uoT", 276 | "colab": { 277 | "base_uri": "https://localhost:8080/" 278 | }, 279 | "outputId": "7c11b3be-9d00-47ef-b05d-ad6faff58c65" 280 | }, 281 | "execution_count": 1, 282 | "outputs": [ 283 | { 284 | "output_type": "stream", 285 | "name": "stdout", 286 | "text": [ 287 | "Covariance: 10.0\n" 288 | ] 289 | } 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "source": [ 295 | "## 7. Quantiles" 296 | ], 297 | "metadata": { 298 | "id": "DqquyE0XmKg-" 299 | } 300 | }, 301 | { 302 | "cell_type": "code", 303 | "source": [ 304 | "data = [1, 5, 7, 9, 10, 12, 16, 18, 19, 21]\n", 305 | "# Quartiles\n", 306 | "quantiles = statistics.quantiles(data, n=4)\n", 307 | "print(\"Quantiles (Quartiles):\", quantiles)" 308 | ], 309 | "metadata": { 310 | "colab": { 311 | "base_uri": "https://localhost:8080/" 312 | }, 313 | "id": "5p1xVng-kwju", 314 | "outputId": "903cb4f8-5bb6-488a-c582-62126fbff758" 315 | }, 316 | "execution_count": 4, 317 | "outputs": [ 318 | { 319 | "output_type": "stream", 320 | "name": "stdout", 321 | "text": [ 322 | "Quantiles (Quartiles): [6.5, 11.0, 18.25]\n" 323 | ] 324 | } 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "source": [ 330 | "## 8. 
Correlation" 331 | ], 332 | "metadata": { 333 | "id": "eUTp6xe2CCVM" 334 | } 335 | }, 336 | { 337 | "cell_type": "code", 338 | "source": [ 339 | "data1 = [1, 2, 3, 4, 5]\n", 340 | "data2 = [2, 4, 6, 8, 10]\n", 341 | "correlation = statistics.correlation(data1, data2)\n", 342 | "print(\"Correlation:\", correlation)" 343 | ], 344 | "metadata": { 345 | "colab": { 346 | "base_uri": "https://localhost:8080/" 347 | }, 348 | "id": "1CFP4t68mO4r", 349 | "outputId": "a43c0f06-8c1e-4229-ab23-1c8a35aef2e7" 350 | }, 351 | "execution_count": 5, 352 | "outputs": [ 353 | { 354 | "output_type": "stream", 355 | "name": "stdout", 356 | "text": [ 357 | "Correlation: 1.0\n" 358 | ] 359 | } 360 | ] 361 | }, 362 | { 363 | "cell_type": "markdown", 364 | "source": [ 365 | "## 9. Linear Regression" 366 | ], 367 | "metadata": { 368 | "id": "AMq5BdfuFMiB" 369 | } 370 | }, 371 | { 372 | "cell_type": "code", 373 | "source": [ 374 | "x = [1, 2, 3, 4, 5]\n", 375 | "y = [3, 4, 2, 5, 7]\n", 376 | "slope, intercept = statistics.linear_regression(x, y)\n", 377 | "print(\"Slope:\", slope)\n", 378 | "print(\"Intercept:\", intercept)" 379 | ], 380 | "metadata": { 381 | "colab": { 382 | "base_uri": "https://localhost:8080/" 383 | }, 384 | "id": "TJVQAIjACFxz", 385 | "outputId": "e79e8709-0a67-4b51-dbb8-634fdc52ad3a" 386 | }, 387 | "execution_count": 7, 388 | "outputs": [ 389 | { 390 | "output_type": "stream", 391 | "name": "stdout", 392 | "text": [ 393 | "Slope: 0.9\n", 394 | "Intercept: 1.5\n" 395 | ] 396 | } 397 | ] 398 | }, 399 | { 400 | "cell_type": "markdown", 401 | "source": [ 402 | "## 10. Normal Distribution" 403 | ], 404 | "metadata": { 405 | "id": "EjcEB4bcGILJ" 406 | } 407 | }, 408 | { 409 | "cell_type": "code", 410 | "source": [ 411 | "# Create a normal distribution with mean 30 and standard deviation 10\n", 412 | "normal_dist = statistics.NormalDist(mu=30, sigma=10)\n", 413 | "\n", 414 | "# Calculate the probability of a value less than or equal to 20\n", 415 | "probability = normal_dist.cdf(20)\n", 416 | "print(f\"Probability (CDF) of 20: {probability:.3f}\")\n", 417 | "\n", 418 | "# Calculate the z-score for a value\n", 419 | "z_score = normal_dist.inv_cdf(0.975)\n", 420 | "print(f\"Z-score for 97.5th percentile: {z_score:.3f}\")" 421 | ], 422 | "metadata": { 423 | "colab": { 424 | "base_uri": "https://localhost:8080/" 425 | }, 426 | "id": "sYkVaHDtFQ6m", 427 | "outputId": "2f1db0e8-3b5e-4764-8151-02e92b413513" 428 | }, 429 | "execution_count": 11, 430 | "outputs": [ 431 | { 432 | "output_type": "stream", 433 | "name": "stdout", 434 | "text": [ 435 | "Probability (CDF) of 20: 0.159\n", 436 | "Z-score for 97.5th percentile: 49.600\n" 437 | ] 438 | } 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "source": [], 444 | "metadata": { 445 | "id": "w8eCKz1tJy55" 446 | }, 447 | "execution_count": null, 448 | "outputs": [] 449 | } 450 | ] 451 | } -------------------------------------------------------------------------------- /statistics/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /statistics/handle_excel_files.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [] 7 | }, 8 | "kernelspec": { 9 | "name": "python3", 10 | "display_name": "Python 3" 11 | }, 12 | "language_info": { 13 | "name": "python" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | 
"cell_type": "markdown", 19 | "source": [ 20 | "## Generating a Sample Excel File" 21 | ], 22 | "metadata": { 23 | "id": "TJ5jHxsTviGe" 24 | } 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": { 30 | "id": "oGEfUQfJolvB", 31 | "colab": { 32 | "base_uri": "https://localhost:8080/" 33 | }, 34 | "outputId": "8b18535d-bdfc-4198-a405-2071610bec82" 35 | }, 36 | "outputs": [ 37 | { 38 | "output_type": "stream", 39 | "name": "stdout", 40 | "text": [ 41 | "Sample Excel file 'employee_data.xlsx' generated successfully.\n" 42 | ] 43 | } 44 | ], 45 | "source": [ 46 | "import pandas as pd\n", 47 | "\n", 48 | "# Sample employee data\n", 49 | "data = {\n", 50 | " 'employee_id': [101, 102, 103, 104, 105],\n", 51 | " 'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],\n", 52 | " 'department': ['HR', 'Finance', 'IT', 'Sales', 'Marketing'],\n", 53 | " 'salary': [55000, 62000, 72000, 50000, 57000],\n", 54 | " 'performance_score': [3.8, 4.2, 4.5, 3.5, 4.0],\n", 55 | " 'years_at_company': [2, 5, 3, 4, 1]\n", 56 | "}\n", 57 | "\n", 58 | "# Create a DataFrame\n", 59 | "df = pd.DataFrame(data)\n", 60 | "\n", 61 | "# Save to an Excel file\n", 62 | "df.to_excel('employee_data.xlsx', index=False)\n", 63 | "\n", 64 | "print(\"Sample Excel file 'employee_data.xlsx' generated successfully.\")" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "source": [ 70 | "## Reading in the Excel File" 71 | ], 72 | "metadata": { 73 | "id": "o0EPIZ8bvlWR" 74 | } 75 | }, 76 | { 77 | "cell_type": "code", 78 | "source": [ 79 | "# Read Excel file into a DataFrame\n", 80 | "df = pd.read_excel('employee_data.xlsx')\n", 81 | "\n", 82 | "print(df.head())" 83 | ], 84 | "metadata": { 85 | "colab": { 86 | "base_uri": "https://localhost:8080/" 87 | }, 88 | "id": "cBRgGP-Wvs-P", 89 | "outputId": "c423480e-8cff-40f5-ef1a-fb3e5a55c0d6" 90 | }, 91 | "execution_count": null, 92 | "outputs": [ 93 | { 94 | "output_type": "stream", 95 | "name": "stdout", 96 | "text": [ 97 | " employee_id name department salary performance_score \\\n", 98 | "0 101 Alice HR 55000 3.8 \n", 99 | "1 102 Bob Finance 62000 4.2 \n", 100 | "2 103 Charlie IT 72000 4.5 \n", 101 | "3 104 David Sales 50000 3.5 \n", 102 | "4 105 Eva Marketing 57000 4.0 \n", 103 | "\n", 104 | " years_at_company \n", 105 | "0 2 \n", 106 | "1 5 \n", 107 | "2 3 \n", 108 | "3 4 \n", 109 | "4 1 \n" 110 | ] 111 | } 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "source": [ 117 | "## Exploring and Summarizing Data" 118 | ], 119 | "metadata": { 120 | "id": "ldEpbTVevteH" 121 | } 122 | }, 123 | { 124 | "cell_type": "code", 125 | "source": [ 126 | "# Get info about the DataFrame\n", 127 | "print(df.info())" 128 | ], 129 | "metadata": { 130 | "colab": { 131 | "base_uri": "https://localhost:8080/" 132 | }, 133 | "id": "MCwxQH84vv5u", 134 | "outputId": "49d9df4b-94e6-4fd2-e515-d6ebdf5a2a85" 135 | }, 136 | "execution_count": null, 137 | "outputs": [ 138 | { 139 | "output_type": "stream", 140 | "name": "stdout", 141 | "text": [ 142 | "\n", 143 | "RangeIndex: 5 entries, 0 to 4\n", 144 | "Data columns (total 6 columns):\n", 145 | " # Column Non-Null Count Dtype \n", 146 | "--- ------ -------------- ----- \n", 147 | " 0 employee_id 5 non-null int64 \n", 148 | " 1 name 5 non-null object \n", 149 | " 2 department 5 non-null object \n", 150 | " 3 salary 5 non-null int64 \n", 151 | " 4 performance_score 5 non-null float64\n", 152 | " 5 years_at_company 5 non-null int64 \n", 153 | "dtypes: float64(1), int64(3), object(2)\n", 154 | "memory usage: 368.0+ bytes\n", 
155 | "None\n" 156 | ] 157 | } 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "source": [ 163 | "# Get descriptive statistics\n", 164 | "print(df.describe())" 165 | ], 166 | "metadata": { 167 | "colab": { 168 | "base_uri": "https://localhost:8080/" 169 | }, 170 | "id": "UF_BXV17wUyv", 171 | "outputId": "ef65974f-d366-4d99-9b6f-2a776bb9a2ca" 172 | }, 173 | "execution_count": null, 174 | "outputs": [ 175 | { 176 | "output_type": "stream", 177 | "name": "stdout", 178 | "text": [ 179 | " employee_id salary performance_score years_at_company\n", 180 | "count 5.000000 5.000000 5.000000 5.000000\n", 181 | "mean 103.000000 59200.000000 4.000000 3.000000\n", 182 | "std 1.581139 8348.652586 0.380789 1.581139\n", 183 | "min 101.000000 50000.000000 3.500000 1.000000\n", 184 | "25% 102.000000 55000.000000 3.800000 2.000000\n", 185 | "50% 103.000000 57000.000000 4.000000 3.000000\n", 186 | "75% 104.000000 62000.000000 4.200000 4.000000\n", 187 | "max 105.000000 72000.000000 4.500000 5.000000\n" 188 | ] 189 | } 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "source": [ 195 | "## Handling Missing Values" 196 | ], 197 | "metadata": { 198 | "id": "rVlV_FmmvwOP" 199 | } 200 | }, 201 | { 202 | "cell_type": "code", 203 | "source": [ 204 | "# Check for missing values\n", 205 | "missing_values = df.isna().sum()\n", 206 | "print(missing_values)" 207 | ], 208 | "metadata": { 209 | "colab": { 210 | "base_uri": "https://localhost:8080/" 211 | }, 212 | "id": "5vMERpSvvzmH", 213 | "outputId": "8c9292f5-aa19-4f19-a2e6-76aaaaee6e82" 214 | }, 215 | "execution_count": null, 216 | "outputs": [ 217 | { 218 | "output_type": "stream", 219 | "name": "stdout", 220 | "text": [ 221 | "employee_id 0\n", 222 | "name 0\n", 223 | "department 0\n", 224 | "salary 0\n", 225 | "performance_score 0\n", 226 | "years_at_company 0\n", 227 | "dtype: int64\n" 228 | ] 229 | } 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "source": [ 235 | "# Fill missing performance scores with the average\n", 236 | "df['performance_score'] = df['performance_score'].fillna(df['performance_score'].mean())" 237 | ], 238 | "metadata": { 239 | "id": "f8QMM_3UweNm" 240 | }, 241 | "execution_count": null, 242 | "outputs": [] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "source": [ 247 | "## Basic Data Manipulation" 248 | ], 249 | "metadata": { 250 | "id": "UoQF_uYxv2kR" 251 | } 252 | }, 253 | { 254 | "cell_type": "code", 255 | "source": [ 256 | " # Filter employees with a performance score above 4\n", 257 | "high_performers = df[df['performance_score'] > 4]\n", 258 | "print(high_performers)" 259 | ], 260 | "metadata": { 261 | "colab": { 262 | "base_uri": "https://localhost:8080/" 263 | }, 264 | "id": "gyWnuUX8v10g", 265 | "outputId": "aad60691-ddfa-461e-efa5-0889de7bf047" 266 | }, 267 | "execution_count": null, 268 | "outputs": [ 269 | { 270 | "output_type": "stream", 271 | "name": "stdout", 272 | "text": [ 273 | " employee_id name department salary performance_score \\\n", 274 | "1 102 Bob Finance 62000 4.2 \n", 275 | "2 103 Charlie IT 72000 4.5 \n", 276 | "\n", 277 | " years_at_company \n", 278 | "1 5 \n", 279 | "2 3 \n" 280 | ] 281 | } 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "source": [ 287 | "# Select specific columns\n", 288 | "selected_columns = df[['name', 'department', 'salary']]\n", 289 | "print(selected_columns)" 290 | ], 291 | "metadata": { 292 | "colab": { 293 | "base_uri": "https://localhost:8080/" 294 | }, 295 | "id": "2NYJXMEywkNm", 296 | "outputId": "5e3fe3ee-fe57-476d-d45b-84f8c41f3324" 
297 | }, 298 | "execution_count": null, 299 | "outputs": [ 300 | { 301 | "output_type": "stream", 302 | "name": "stdout", 303 | "text": [ 304 | " name department salary\n", 305 | "0 Alice HR 55000\n", 306 | "1 Bob Finance 62000\n", 307 | "2 Charlie IT 72000\n", 308 | "3 David Sales 50000\n", 309 | "4 Eva Marketing 57000\n" 310 | ] 311 | } 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "source": [ 317 | "# Add a new column for bonus\n", 318 | "df['bonus'] = df['salary'].apply(lambda x: x * 0.10)\n", 319 | "print(df.head())" 320 | ], 321 | "metadata": { 322 | "colab": { 323 | "base_uri": "https://localhost:8080/" 324 | }, 325 | "id": "jiVSYWN5wmO-", 326 | "outputId": "2aae7e21-c9b5-4b62-9e99-06265a1dfae1" 327 | }, 328 | "execution_count": null, 329 | "outputs": [ 330 | { 331 | "output_type": "stream", 332 | "name": "stdout", 333 | "text": [ 334 | " employee_id name department salary performance_score \\\n", 335 | "0 101 Alice HR 55000 3.8 \n", 336 | "1 102 Bob Finance 62000 4.2 \n", 337 | "2 103 Charlie IT 72000 4.5 \n", 338 | "3 104 David Sales 50000 3.5 \n", 339 | "4 105 Eva Marketing 57000 4.0 \n", 340 | "\n", 341 | " years_at_company bonus \n", 342 | "0 2 5500.0 \n", 343 | "1 5 6200.0 \n", 344 | "2 3 7200.0 \n", 345 | "3 4 5000.0 \n", 346 | "4 1 5700.0 \n" 347 | ] 348 | } 349 | ] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "source": [ 354 | "## Grouping and Aggregating Data" 355 | ], 356 | "metadata": { 357 | "id": "O3B9eGDsv8Bs" 358 | } 359 | }, 360 | { 361 | "cell_type": "code", 362 | "source": [ 363 | "# Calculate average salary grouped by department\n", 364 | "average_salary_by_department = df.groupby('department')['salary'].mean().reset_index()\n", 365 | "print(average_salary_by_department)" 366 | ], 367 | "metadata": { 368 | "colab": { 369 | "base_uri": "https://localhost:8080/" 370 | }, 371 | "id": "nKeNszX6v9QA", 372 | "outputId": "8e5170dc-51f9-4032-9bfa-b0cd5404e651" 373 | }, 374 | "execution_count": null, 375 | "outputs": [ 376 | { 377 | "output_type": "stream", 378 | "name": "stdout", 379 | "text": [ 380 | " department salary\n", 381 | "0 Finance 62000.0\n", 382 | "1 HR 55000.0\n", 383 | "2 IT 72000.0\n", 384 | "3 Marketing 57000.0\n", 385 | "4 Sales 50000.0\n" 386 | ] 387 | } 388 | ] 389 | } 390 | ] 391 | } -------------------------------------------------------------------------------- /statistics/probability/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /statistics/probability/joint_and_conditional_pbty.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [] 7 | }, 8 | "kernelspec": { 9 | "name": "python3", 10 | "display_name": "Python 3" 11 | }, 12 | "language_info": { 13 | "name": "python" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "source": [ 20 | "## Step 1: Creating Sample Data" 21 | ], 22 | "metadata": { 23 | "id": "YjO9ZVZIM8Ye" 24 | } 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": { 30 | "colab": { 31 | "base_uri": "https://localhost:8080/" 32 | }, 33 | "id": "sAXOTiD9Ltz0", 34 | "outputId": "63fdc6fe-0ae1-4b61-ba92-4e481cc8c561" 35 | }, 36 | "outputs": [ 37 | { 38 | "output_type": "stream", 39 | "name": "stdout", 40 | "text": [ 41 | " Age_Group Sports_Interest\n", 42 | "0 Teen Yes\n", 43 | "1 Teen No\n", 44 
| "2 Teen Yes\n", 45 | "3 Adult No\n", 46 | "4 Adult No\n", 47 | "5 Senior Yes\n", 48 | "6 Senior Yes\n", 49 | "7 Senior No\n" 50 | ] 51 | } 52 | ], 53 | "source": [ 54 | "import pandas as pd\n", 55 | "\n", 56 | "# Sample data\n", 57 | "data = {\n", 58 | " \"Age_Group\": [\"Teen\", \"Teen\", \"Teen\", \"Adult\", \"Adult\", \"Senior\", \"Senior\", \"Senior\"],\n", 59 | " \"Sports_Interest\": [\"Yes\", \"No\", \"Yes\", \"No\", \"No\", \"Yes\", \"Yes\", \"No\"]\n", 60 | "}\n", 61 | "\n", 62 | "df = pd.DataFrame(data)\n", 63 | "\n", 64 | "# Display the data\n", 65 | "print(df)\n" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "source": [ 71 | "## Step 2: Calculating Joint Probability" 72 | ], 73 | "metadata": { 74 | "id": "1VY0hLRKMWMr" 75 | } 76 | }, 77 | { 78 | "cell_type": "code", 79 | "source": [ 80 | "# Total number of observations\n", 81 | "total_count = len(df)\n", 82 | "\n", 83 | "# Count occurrences where Age_Group is \"Teen\" and Sports_Interest is \"Yes\"\n", 84 | "joint_count = len(df[(df['Age_Group'] == 'Teen') & (df['Sports_Interest'] == 'Yes')])\n", 85 | "\n", 86 | "# Joint probability\n", 87 | "joint_probability = joint_count / total_count\n", 88 | "\n", 89 | "print(f\"Joint Probability (Teen and Sports Interest Yes): {joint_probability}\")\n" 90 | ], 91 | "metadata": { 92 | "colab": { 93 | "base_uri": "https://localhost:8080/" 94 | }, 95 | "id": "M32eM5NPMHNd", 96 | "outputId": "35e64e55-358f-471c-b583-ca322b9597c0" 97 | }, 98 | "execution_count": null, 99 | "outputs": [ 100 | { 101 | "output_type": "stream", 102 | "name": "stdout", 103 | "text": [ 104 | "Joint Probability (Teen and Sports Interest Yes): 0.25\n" 105 | ] 106 | } 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "source": [ 112 | "## Step 3: Calculating Conditional Probability" 113 | ], 114 | "metadata": { 115 | "id": "OIs1olhPMZgq" 116 | } 117 | }, 118 | { 119 | "cell_type": "code", 120 | "source": [ 121 | "# Filter data for Age_Group = \"Teen\"\n", 122 | "teen_data = df[df['Age_Group'] == 'Teen']\n", 123 | "\n", 124 | "# Count occurrences of Sports_Interest = \"Yes\" among teens\n", 125 | "conditional_count = len(teen_data[teen_data['Sports_Interest'] == 'Yes'])\n", 126 | "\n", 127 | "# Conditional probability\n", 128 | "conditional_probability = conditional_count / len(teen_data)\n", 129 | "\n", 130 | "print(f\"Conditional Probability (Sports Interest Yes | Age Group Teen): {conditional_probability:.3f}\")\n" 131 | ], 132 | "metadata": { 133 | "colab": { 134 | "base_uri": "https://localhost:8080/" 135 | }, 136 | "id": "vMTq6kaKMJdd", 137 | "outputId": "559e5632-7ca7-44bd-9d59-4f2aeb19f50a" 138 | }, 139 | "execution_count": null, 140 | "outputs": [ 141 | { 142 | "output_type": "stream", 143 | "name": "stdout", 144 | "text": [ 145 | "Conditional Probability (Sports Interest Yes | Age Group Teen): 0.667\n" 146 | ] 147 | } 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "source": [ 153 | "## Step 4: Generalizing with Functions" 154 | ], 155 | "metadata": { 156 | "id": "L2uNqq9zM1I2" 157 | } 158 | }, 159 | { 160 | "cell_type": "code", 161 | "source": [ 162 | "def calculate_joint_probability(df, condition1, condition2):\n", 163 | " total_count = len(df)\n", 164 | " joint_count = len(df[(df[condition1[0]] == condition1[1]) & (df[condition2[0]] == condition2[1])])\n", 165 | " return joint_count / total_count\n", 166 | "\n", 167 | "def calculate_conditional_probability(df, given_condition, target_condition):\n", 168 | " subset = df[df[given_condition[0]] == given_condition[1]]\n", 
169 | " conditional_count = len(subset[subset[target_condition[0]] == target_condition[1]])\n", 170 | " return conditional_count / len(subset)\n" 171 | ], 172 | "metadata": { 173 | "id": "VGoD5_-2MMfE" 174 | }, 175 | "execution_count": null, 176 | "outputs": [] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "source": [ 181 | "# Joint Probability of \"Teen\" and \"Sports_Interest = Yes\"\n", 182 | "joint_prob = calculate_joint_probability(df, (\"Age_Group\", \"Teen\"), (\"Sports_Interest\", \"Yes\"))\n", 183 | "print(f\"Joint Probability (Teen and Sports Interest Yes): {joint_prob}\")\n", 184 | "\n", 185 | "# Conditional Probability of \"Sports_Interest = Yes\" given \"Age_Group = Teen\"\n", 186 | "conditional_prob = calculate_conditional_probability(df, (\"Age_Group\", \"Teen\"), (\"Sports_Interest\", \"Yes\"))\n", 187 | "print(f\"Conditional Probability (Sports Interest Yes | Age Group Teen): {conditional_prob:.3f}\")\n" 188 | ], 189 | "metadata": { 190 | "colab": { 191 | "base_uri": "https://localhost:8080/" 192 | }, 193 | "id": "rSEt6qJgMQQN", 194 | "outputId": "0f5e0527-f942-4f4a-8081-7ec6d4591708" 195 | }, 196 | "execution_count": null, 197 | "outputs": [ 198 | { 199 | "output_type": "stream", 200 | "name": "stdout", 201 | "text": [ 202 | "Joint Probability (Teen and Sports Interest Yes): 0.25\n", 203 | "Conditional Probability (Sports Interest Yes | Age Group Teen): 0.667\n" 204 | ] 205 | } 206 | ] 207 | } 208 | ] 209 | } -------------------------------------------------------------------------------- /statistics/sparse_data_analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [] 7 | }, 8 | "kernelspec": { 9 | "name": "python3", 10 | "display_name": "Python 3" 11 | }, 12 | "language_info": { 13 | "name": "python" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "id": "0R9gVhnIMrNH" 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "import numpy as np\n", 26 | "from scipy import sparse\n", 27 | "import pandas as pd\n", 28 | "from scipy import stats\n" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "source": [ 34 | "# Create a sparse matrix where rows are users and columns are products\n", 35 | "# Only storing the actual interactions\n", 36 | "row = np.array([0, 3, 1, 0]) # User IDs\n", 37 | "col = np.array([0, 3, 1, 2]) # Product IDs\n", 38 | "data = np.array([4, 5, 7, 9]) # Interaction values (like ratings)\n", 39 | "\n", 40 | "# Create the sparse matrix\n", 41 | "sparse_matrix = sparse.coo_matrix((data, (row, col)), shape=(4, 4))\n", 42 | "\n", 43 | "# seeing the sparse matrix as a regular matrix\n", 44 | "print(\"Here's our sparse matrix as a regular array:\")\n", 45 | "print(sparse_matrix.toarray())\n" 46 | ], 47 | "metadata": { 48 | "colab": { 49 | "base_uri": "https://localhost:8080/" 50 | }, 51 | "id": "RkAQQ8QCMzM7", 52 | "outputId": "96ee974b-f672-4c59-a965-4626b9bc1cf5" 53 | }, 54 | "execution_count": 2, 55 | "outputs": [ 56 | { 57 | "output_type": "stream", 58 | "name": "stdout", 59 | "text": [ 60 | "Here's our sparse matrix as a regular array:\n", 61 | "[[4 0 9 0]\n", 62 | " [0 7 0 0]\n", 63 | " [0 0 0 0]\n", 64 | " [0 0 0 5]]\n" 65 | ] 66 | } 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "source": [ 72 | "def calculate_sparse_mean(sparse_matrix):\n", 73 | " \"\"\"\n", 74 | " Calculate mean of non-zero elements in a sparse matrix.\n", 75 | " This is 
useful when zeros represent 'no data' rather than actual zeros.\n", 76 | " \"\"\"\n", 77 | " if sparse_matrix.nnz == 0: # nnz is the number of non-zero elements\n", 78 | " return 0.0\n", 79 | " return sparse_matrix.sum() / sparse_matrix.nnz\n", 80 | "\n", 81 | "mean_value = calculate_sparse_mean(sparse_matrix)\n", 82 | "print(f\"\\nMean of non-zero elements: {mean_value:.2f}\")\n" 83 | ], 84 | "metadata": { 85 | "colab": { 86 | "base_uri": "https://localhost:8080/" 87 | }, 88 | "id": "Dz0BFJXXM1ia", 89 | "outputId": "d3b9092d-2218-477a-80c5-551fcbf19cd5" 90 | }, 91 | "execution_count": 3, 92 | "outputs": [ 93 | { 94 | "output_type": "stream", 95 | "name": "stdout", 96 | "text": [ 97 | "\n", 98 | "Mean of non-zero elements: 6.25\n" 99 | ] 100 | } 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "source": [ 106 | "def analyze_row_patterns(sparse_matrix):\n", 107 | " \"\"\"\n", 108 | " Analyze patterns in each row of a sparse matrix.\n", 109 | " Returns dictionary with various row statistics.\n", 110 | " \"\"\"\n", 111 | " # Convert to CSR format for efficient row operations\n", 112 | " csr_matrix = sparse_matrix.tocsr()\n", 113 | "\n", 114 | " # Calculate statistics\n", 115 | " row_sums = np.array(csr_matrix.sum(axis=1)).flatten()\n", 116 | " row_nonzeros = np.diff(csr_matrix.indptr) # Number of non-zeros per row\n", 117 | "\n", 118 | " # Calculate means, handling empty rows\n", 119 | " row_means = np.zeros_like(row_sums, dtype=float)\n", 120 | " mask = row_nonzeros > 0\n", 121 | " row_means[mask] = row_sums[mask] / row_nonzeros[mask]\n", 122 | "\n", 123 | " return {\n", 124 | " 'activity_sum': row_sums, # Total activity per user\n", 125 | " 'interaction_count': row_nonzeros, # Number of interactions per user\n", 126 | " 'average_value': row_means # Average value per user\n", 127 | " }\n" 128 | ], 129 | "metadata": { 130 | "id": "SF3ygrrvM4Ks" 131 | }, 132 | "execution_count": 4, 133 | "outputs": [] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "source": [ 138 | "stats = analyze_row_patterns(sparse_matrix)\n", 139 | "print(\"\\nUser Statistics:\")\n", 140 | "for i, (sum_val, count, mean) in enumerate(zip(\n", 141 | " stats['activity_sum'],\n", 142 | " stats['interaction_count'],\n", 143 | " stats['average_value']\n", 144 | ")):\n", 145 | " print(f\"User {i}: {count} interactions, \"\n", 146 | " f\"total activity = {sum_val}, \"\n", 147 | " f\"average value = {mean:.2f}\")\n" 148 | ], 149 | "metadata": { 150 | "colab": { 151 | "base_uri": "https://localhost:8080/" 152 | }, 153 | "id": "IAzJ8tHRM519", 154 | "outputId": "52a67420-34d2-4d81-ce3f-4af60177be04" 155 | }, 156 | "execution_count": 5, 157 | "outputs": [ 158 | { 159 | "output_type": "stream", 160 | "name": "stdout", 161 | "text": [ 162 | "\n", 163 | "User Statistics:\n", 164 | "User 0: 2 interactions, total activity = 13, average value = 6.50\n", 165 | "User 1: 1 interactions, total activity = 7, average value = 7.00\n", 166 | "User 2: 0 interactions, total activity = 0, average value = 0.00\n", 167 | "User 3: 1 interactions, total activity = 5, average value = 5.00\n" 168 | ] 169 | } 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "source": [ 175 | "def calculate_sparse_correlation(sparse_matrix, min_overlap=2):\n", 176 | " \"\"\"\n", 177 | " Calculate correlation between columns, considering only overlapping non-zero elements.\n", 178 | " Like finding which products are often rated similarly.\n", 179 | " \"\"\"\n", 180 | " # Convert to dense format for this calculation\n", 181 | " # (For very large matrices, 
you'd want to do this differently)\n", 182 | " dense_cols = sparse_matrix.toarray().T\n", 183 | " n_cols = dense_cols.shape[0]\n", 184 | " correlations = np.zeros((n_cols, n_cols))\n", 185 | "\n", 186 | " for i in range(n_cols):\n", 187 | " for j in range(i, n_cols):\n", 188 | " # Find where both columns have non-zero values\n", 189 | " mask = (dense_cols[i] != 0) & (dense_cols[j] != 0)\n", 190 | " if mask.sum() >= min_overlap:\n", 191 | " corr = stats.pearsonr(dense_cols[i][mask],\n", 192 | " dense_cols[j][mask])[0]\n", 193 | " correlations[i, j] = correlations[j, i] = corr\n", 194 | "\n", 195 | " return correlations" 196 | ], 197 | "metadata": { 198 | "id": "ADRakCn4M8KD" 199 | }, 200 | "execution_count": 6, 201 | "outputs": [] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "source": [ 206 | "corr_matrix = calculate_sparse_correlation(sparse_matrix)\n", 207 | "print(\"\\nCorrelation matrix:\")\n", 208 | "print(corr_matrix)" 209 | ], 210 | "metadata": { 211 | "colab": { 212 | "base_uri": "https://localhost:8080/" 213 | }, 214 | "id": "7UuFzRB6M979", 215 | "outputId": "af68a7bc-e862-40bc-ead4-eac28fd5b1f7" 216 | }, 217 | "execution_count": 7, 218 | "outputs": [ 219 | { 220 | "output_type": "stream", 221 | "name": "stdout", 222 | "text": [ 223 | "\n", 224 | "Correlation matrix:\n", 225 | "[[0. 0. 0. 0.]\n", 226 | " [0. 0. 0. 0.]\n", 227 | " [0. 0. 0. 0.]\n", 228 | " [0. 0. 0. 0.]]\n" 229 | ] 230 | } 231 | ] 232 | } 233 | ] 234 | } -------------------------------------------------------------------------------- /statistics/sparse_data_analysis_v0_1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [] 7 | }, 8 | "kernelspec": { 9 | "name": "python3", 10 | "display_name": "Python 3" 11 | }, 12 | "language_info": { 13 | "name": "python" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "source": [ 20 | "## Imports" 21 | ], 22 | "metadata": { 23 | "id": "DPt0ex-tOHxH" 24 | } 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": { 30 | "id": "0R9gVhnIMrNH" 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "import numpy as np\n", 35 | "from scipy import sparse\n", 36 | "import pandas as pd\n", 37 | "from scipy import stats\n" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "source": [ 43 | "## Creating a Sparse Matrix" 44 | ], 45 | "metadata": { 46 | "id": "M7kq8YzvOKG-" 47 | } 48 | }, 49 | { 50 | "cell_type": "code", 51 | "source": [ 52 | "# Create a sparse matrix where rows are users and columns are products\n", 53 | "# Only storing the actual interactions\n", 54 | "row = np.array([0, 3, 1, 0]) # User IDs\n", 55 | "col = np.array([0, 3, 1, 2]) # Product IDs\n", 56 | "data = np.array([4, 5, 7, 9]) # Interaction values (like ratings)\n", 57 | "\n", 58 | "# Create the sparse matrix\n", 59 | "sparse_matrix = sparse.coo_matrix((data, (row, col)), shape=(4, 4))\n", 60 | "\n", 61 | "# seeing the sparse matrix as a regular matrix\n", 62 | "print(\"Here's our sparse matrix as a regular array:\")\n", 63 | "print(sparse_matrix.toarray())\n" 64 | ], 65 | "metadata": { 66 | "colab": { 67 | "base_uri": "https://localhost:8080/" 68 | }, 69 | "id": "RkAQQ8QCMzM7", 70 | "outputId": "96ee974b-f672-4c59-a965-4626b9bc1cf5" 71 | }, 72 | "execution_count": null, 73 | "outputs": [ 74 | { 75 | "output_type": "stream", 76 | "name": "stdout", 77 | "text": [ 78 | "Here's our sparse matrix as a regular array:\n", 79 | 
"[[4 0 9 0]\n", 80 | " [0 7 0 0]\n", 81 | " [0 0 0 0]\n", 82 | " [0 0 0 5]]\n" 83 | ] 84 | } 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "source": [ 90 | "## Basic Statistical Analysis" 91 | ], 92 | "metadata": { 93 | "id": "VRm2aXYiOS3F" 94 | } 95 | }, 96 | { 97 | "cell_type": "code", 98 | "source": [ 99 | "def calculate_sparse_mean(sparse_matrix):\n", 100 | " \"\"\"\n", 101 | " Calculate mean of non-zero elements in a sparse matrix.\n", 102 | " This is useful when zeros represent 'no data' rather than actual zeros.\n", 103 | " \"\"\"\n", 104 | " if sparse_matrix.nnz == 0: # nnz is the number of non-zero elements\n", 105 | " return 0.0\n", 106 | " return sparse_matrix.sum() / sparse_matrix.nnz\n", 107 | "\n", 108 | "mean_value = calculate_sparse_mean(sparse_matrix)\n", 109 | "print(f\"\\nMean of non-zero elements: {mean_value:.2f}\")\n" 110 | ], 111 | "metadata": { 112 | "colab": { 113 | "base_uri": "https://localhost:8080/" 114 | }, 115 | "id": "Dz0BFJXXM1ia", 116 | "outputId": "d3b9092d-2218-477a-80c5-551fcbf19cd5" 117 | }, 118 | "execution_count": null, 119 | "outputs": [ 120 | { 121 | "output_type": "stream", 122 | "name": "stdout", 123 | "text": [ 124 | "\n", 125 | "Mean of non-zero elements: 6.25\n" 126 | ] 127 | } 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "source": [ 133 | "## Handling Row and Column Statistics" 134 | ], 135 | "metadata": { 136 | "id": "fDhW59jyOWl1" 137 | } 138 | }, 139 | { 140 | "cell_type": "code", 141 | "source": [ 142 | "def analyze_row_patterns(sparse_matrix):\n", 143 | " \"\"\"\n", 144 | " Analyze patterns in each row of a sparse matrix.\n", 145 | " Returns dictionary with various row statistics.\n", 146 | " \"\"\"\n", 147 | " # Convert to CSR format for efficient row operations\n", 148 | " csr_matrix = sparse_matrix.tocsr()\n", 149 | "\n", 150 | " # Calculate statistics\n", 151 | " row_sums = np.array(csr_matrix.sum(axis=1)).flatten()\n", 152 | " row_nonzeros = np.diff(csr_matrix.indptr) # Number of non-zeros per row\n", 153 | "\n", 154 | " # Calculate means, handling empty rows\n", 155 | " row_means = np.zeros_like(row_sums, dtype=float)\n", 156 | " mask = row_nonzeros > 0\n", 157 | " row_means[mask] = row_sums[mask] / row_nonzeros[mask]\n", 158 | "\n", 159 | " return {\n", 160 | " 'activity_sum': row_sums, # Total activity per user\n", 161 | " 'interaction_count': row_nonzeros, # Number of interactions per user\n", 162 | " 'average_value': row_means # Average value per user\n", 163 | " }\n" 164 | ], 165 | "metadata": { 166 | "id": "SF3ygrrvM4Ks" 167 | }, 168 | "execution_count": null, 169 | "outputs": [] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "source": [ 174 | "stats = analyze_row_patterns(sparse_matrix)\n", 175 | "print(\"\\nUser Statistics:\")\n", 176 | "for i, (sum_val, count, mean) in enumerate(zip(\n", 177 | " stats['activity_sum'],\n", 178 | " stats['interaction_count'],\n", 179 | " stats['average_value']\n", 180 | ")):\n", 181 | " print(f\"User {i}: {count} interactions, \"\n", 182 | " f\"total activity = {sum_val}, \"\n", 183 | " f\"average value = {mean:.2f}\")\n" 184 | ], 185 | "metadata": { 186 | "colab": { 187 | "base_uri": "https://localhost:8080/" 188 | }, 189 | "id": "IAzJ8tHRM519", 190 | "outputId": "52a67420-34d2-4d81-ce3f-4af60177be04" 191 | }, 192 | "execution_count": null, 193 | "outputs": [ 194 | { 195 | "output_type": "stream", 196 | "name": "stdout", 197 | "text": [ 198 | "\n", 199 | "User Statistics:\n", 200 | "User 0: 2 interactions, total activity = 13, average value = 6.50\n", 201 | 
"User 1: 1 interactions, total activity = 7, average value = 7.00\n", 202 | "User 2: 0 interactions, total activity = 0, average value = 0.00\n", 203 | "User 3: 1 interactions, total activity = 5, average value = 5.00\n" 204 | ] 205 | } 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "source": [ 211 | "## Correlation Analysis" 212 | ], 213 | "metadata": { 214 | "id": "m5ETMgcxOatl" 215 | } 216 | }, 217 | { 218 | "cell_type": "code", 219 | "source": [ 220 | "def calculate_sparse_correlation(sparse_matrix, min_overlap=2):\n", 221 | " \"\"\"\n", 222 | " Calculate correlation between columns, considering only overlapping non-zero elements.\n", 223 | " Like finding which products are often rated similarly.\n", 224 | " \"\"\"\n", 225 | " # Convert to dense format for this calculation\n", 226 | " # (For very large matrices, you'd want to do this differently)\n", 227 | " dense_cols = sparse_matrix.toarray().T\n", 228 | " n_cols = dense_cols.shape[0]\n", 229 | " correlations = np.zeros((n_cols, n_cols))\n", 230 | "\n", 231 | " for i in range(n_cols):\n", 232 | " for j in range(i, n_cols):\n", 233 | " # Find where both columns have non-zero values\n", 234 | " mask = (dense_cols[i] != 0) & (dense_cols[j] != 0)\n", 235 | " if mask.sum() >= min_overlap:\n", 236 | " corr = stats.pearsonr(dense_cols[i][mask],\n", 237 | " dense_cols[j][mask])[0]\n", 238 | " correlations[i, j] = correlations[j, i] = corr\n", 239 | "\n", 240 | " return correlations" 241 | ], 242 | "metadata": { 243 | "id": "ADRakCn4M8KD" 244 | }, 245 | "execution_count": null, 246 | "outputs": [] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "source": [ 251 | "corr_matrix = calculate_sparse_correlation(sparse_matrix)\n", 252 | "print(\"\\nCorrelation matrix:\")\n", 253 | "print(corr_matrix)" 254 | ], 255 | "metadata": { 256 | "colab": { 257 | "base_uri": "https://localhost:8080/" 258 | }, 259 | "id": "7UuFzRB6M979", 260 | "outputId": "af68a7bc-e862-40bc-ead4-eac28fd5b1f7" 261 | }, 262 | "execution_count": null, 263 | "outputs": [ 264 | { 265 | "output_type": "stream", 266 | "name": "stdout", 267 | "text": [ 268 | "\n", 269 | "Correlation matrix:\n", 270 | "[[0. 0. 0. 0.]\n", 271 | " [0. 0. 0. 0.]\n", 272 | " [0. 0. 0. 0.]\n", 273 | " [0. 0. 0. 0.]]\n" 274 | ] 275 | } 276 | ] 277 | } 278 | ] 279 | } --------------------------------------------------------------------------------