├── data ├── .gitkeep ├── data.csv ├── test2.csv ├── test3.csv ├── test4.csv └── test5.csv ├── docs ├── .gitkeep └── README.md ├── requirements ├── .gitkeep └── requirements.txt ├── src ├── .gitkeep ├── data_analysis.py ├── detect_outliers.py ├── normalize_columns └── util_outlier_detection ├── tests ├── .gitkeep └── test_detect_outliers.py └── util └── util_outlier_detection.py /data/.gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /data/data.csv: -------------------------------------------------------------------------------- 1 | 1,2 2 | 2,4 3 | 3,6 4 | 4,8 5 | 5,10 6 | 100,200 7 | -------------------------------------------------------------------------------- /data/test2.csv: -------------------------------------------------------------------------------- 1 | 1,2 2 | 2,4 3 | 3,6 4 | 4,8 5 | 5,10 6 | 6,12 7 | 7,14 8 | 8,16 9 | 9,18 10 | 10,20 11 | 11,22 12 | 12,24 13 | 13,26 14 | 14,28 15 | 15,30 16 | 16,32 17 | 17,34 18 | 18,36 19 | 19,38 20 | 20,40 21 | -------------------------------------------------------------------------------- /data/test3.csv: -------------------------------------------------------------------------------- 1 | 1 2.9934283060224653 2 | 2 3.723471397657631 3 | 3 7.295377076201385 4 | 4 11.046059712816051 5 | 5 9.531693250553328 6 | 6 11.531726086101639 7 | 7 17.158425631014783 8 | 8 17.534869458305817 9 | 9 17.061051228130097 10 | 10 21.08512008717193 11 | 11 21.073164614375074 12 | 12 23.068540492859487 13 | 13 26.48392454313207 14 | 14 24.173439510684403 15 | 15 26.550164334973935 16 | 16 30.875424941518055 17 | 17 31.974337759331153 18 | 18 36.628494665190544 19 | 19 36.18395184895758 20 | 20 37.17539259732942 21 | 21 44.93129753784311 22 | 22 43.54844739902693 23 | 23 46.13505640937585 24 | 24 45.15050362757309 25 | 25 48.91123455094964 26 | 26 52.22184517941973 27 | 27 51.69801285 28 | 28 56.75139603669135 29 | 29 56.79872262016239 30 | 30 59.41661250041345 31 | 31 60.79658677554121 32 | 32 67.70455636901788 33 | 33 65.97300555052414 34 | 34 65.88457814 35 | 35 71.64508982420638 36 | 36 69.55831270005795 37 | 37 74.41772719000951 38 | 38 72.08065975224045 39 | 39 75.34362790220314 40 | 40 80.39372247173824 41 | 41 83.47693315999082 42 | 42 84.34273656237994 43 | 43 85.76870343522351 44 | 44 87.39779260882142 45 | 45 87.04295601926515 46 | 46 90.56031158321058 47 | 47 93.07872245808042 48 | 48 98.11424445243783 49 | 49 98.68723657913692 50 | 50 96.47391968927454 51 | -------------------------------------------------------------------------------- /data/test4.csv: -------------------------------------------------------------------------------- 1 | x y 2 | 38.07947176588889 79.85127643175483 3 | 95.12071633 191.09827407511327 4 | 73.46740023932911 146.3565591 5 | 60.26718993550662 119.0288613930668 6 | 16.445845403801215 25.499080855765293 7 | 16.443457513284063 29.287693984594583 8 | 6.750277604651747 11.197361354504556 9 | 86.75143843171858 178.78848799453175 10 | 60.510386162577674 122.73886377299766 11 | 71.09918520180851 133.38316962680335 12 | 3.0378649352844422 7.696149717542859 13 | 97.02107536403744 192.1167393259933 14 | 83.41182143924175 163.43903287695372 15 | 22.02157195714934 47.10152535850302 16 | 19.00067175350296 43.156341119485674 17 | 19.15704647548995 42.97049354656089 18 | 31.119982052994235 58.043876489875274 19 | 52.950886731591545 104.35571158392702 20 | 43.76255684556946 89.18143084815675 21 | 
29.83168487960615 64.54109539 22 | 61.573436577515565 120.75100196580468 23 | 14.809892204552142 28.691489525785197 24 | 29.922320204986598 54.31296553994305 25 | 37.269822486075476 68.55861185 26 | 46.150928437486556 96.36448599 27 | 78.73242017790835 164.24604049867082 28 | 20.767704433677615 41.17535825945356 29 | 51.909209402947546 108.83608329535521 30 | 59.64904231734221 121.10626475992258 31 | 5.598590859279775 7.971582945533928 32 | 61.146940338242395 124.10085870402686 33 | 17.88188824504186 43.45395932241357 34 | 7.4401077055426725 14.701085215535587 35 | 94.93966818807999 197.70255465523002 36 | 96.59757127438138 180.09641702831405 37 | 81.03133746352965 166.17218744893543 38 | 31.156763148163698 62.74876163751825 39 | 10.669539286632004 19.84404182093467 40 | 68.73906962470353 137.93694313208456 41 | 44.57509688022053 79.21234918743659 42 | 13.081785249633104 25.065211060078646 43 | 50.02251410101575 101.83059105959023 44 | 4.404463590406621 16.198397404520822 45 | 91.02271980579943 179.4540885202306 46 | 26.619218178401674 49.19596834233741 47 | 66.58970615104421 130.67062708416574 48 | 31.859396532851683 68.29580365421374 49 | 52.48673409660327 106.61722374150497 50 | 55.12431765498469 107.59983429113417 51 | 19.300591097027176 41.16751935962113 52 | -------------------------------------------------------------------------------- /data/test5.csv: -------------------------------------------------------------------------------- 1 | x y 2 | 67.55557730444839 160.33012809598438 3 | 49.319761524817785 126.7446944242207 4 | 82.72402222955374 213.61119316443478 5 | 4.113192375003516 8.167841725111824 6 | 80.99694637311993 209.54627760259422 7 | 56.996124541442526 138.00388004358507 8 | 30.464627371690455 71.39383176190998 9 | 5.622876330188372 16.268118496473182 10 | 99.07211254760882 244.4163575735596 11 | 1.675747571321864 2.2027815313916266 12 | 77.20950979080403 197.36867673023718 13 | 74.92994299843566 183.42615638937653 14 | 38.36645469716155 94.78759347126154 15 | 49.92059777185661 124.33263782989066 16 | 92.96589081746234 233.9289161090991 17 | 40.149950390236214 103.30365456430363 18 | 97.42167338606704 243.13989932670017 19 | 52.917056822148766 127.49781693474209 20 | 10.267696236477505 29.70938289631082 21 | 81.51753286543077 215.29527348307383 22 | 21.956991773346985 58.17373001 23 | 55.88023267447785 141.94302770993264 24 | 29.93464248 73.3340374 25 | 81.79809363873163 203.48681690226584 26 | 82.97621405447433 201.88208134749868 27 | 22.936159801228193 59.80972861048586 28 | 64.83863545872785 161.55344045575197 29 | 10.422980571419512 31.20907126 30 | 41.754660645450905 100.23867402412957 31 | 10.589660865004047 31.931480713091013 32 | 15.25708889186078 36.90289568319627 33 | 22.007421356240144 52.57408491489617 34 | 48.18895929113014 118.53997511720033 35 | 8.683769823401471 19.27348980822208 36 | 24.26933437092893 52.319994736733996 37 | 1.648747265435246 6.679156156184053 38 | 89.96577452016759 228.01165211108943 39 | 55.67120826088866 144.2922982557267 40 | 17.58711614371605 46.78889443666723 41 | 92.95894016607785 235.0212599385139 42 | 54.666682182974284 129.95269731442994 43 | 5.134094598129749 13.56767596534359 44 | 52.96280715623928 127.87412153769112 45 | 64.37411151091817 159.8189359333329 46 | 80.26515820099465 206.24937509920807 47 | 83.65696631737042 209.27129356570805 48 | 26.010590785085874 54.571833876367755 49 | 96.85907776475311 238.14138389110536 50 | 47.15757533627438 115.6238943151569 51 | 27.14502581563233 66.95993242433185 52 | 
-------------------------------------------------------------------------------- /docs/.gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # Outlier Detection Tool 2 | 3 | Author: Yisong Chen 4 | 5 | ## Overview 6 | The **Outlier Detection Tool** is a powerful and efficient Python package designed for data scientists, statisticians, and analysts who require a precise method to detect anomalies in two-dimensional datasets. By leveraging the statistical rigor of **Cook's Distance**, this tool performs regression diagnostics on (x, y) data, identifies influential data points, and flags significant deviations from the general trend. 7 | 8 | ---- 9 | 10 | ## Purpose and Motivation 11 | 12 | Outliers can significantly impact statistical models, distort trends, and bias predictions. Addressing these anomalies is essential for ensuring the accuracy and reliability of data-driven insights. Traditional methods like standard deviation thresholds or IQR (Interquartile Range) often fall short in complex datasets where more nuanced techniques are required. 13 | 14 | The **Outlier Detection Tool**, developed by **Yisong Chen**, provides a more sophisticated approach by utilizing **Cook's Distance**, a metric designed to quantify the influence of each observation in a regression model. This tool enables professionals to: 15 | 16 | - Detect and quantify data points that disproportionately affect regression outcomes. 17 | - Evaluate the statistical influence of each observation and determine its impact on overall model performance. 18 | - Make informed decisions about whether to retain or exclude anomalous data. 19 | 20 | By automating the use of Cook's Distance, the **Outlier Detection Tool** bridges the gap between theoretical rigor and practical application, making it indispensable for any professional working in data preprocessing, anomaly detection, or predictive modeling. 21 | 22 | ---- 23 | 24 | ## Key Features 25 | 26 | - **Cook's Distance Calculation**: Precisely measures the influence of each data point in regression models. 27 | - **Automated Outlier Flagging**: Flags influential data points based on a user-defined threshold. 28 | - **Customizable Sensitivity**: Allows fine-tuning of the outlier detection threshold to fit different datasets. 29 | - **Comprehensive Summary Statistics**: Provides detailed insights, including the number of outliers detected, their proportion, and their influence on the dataset. 30 | - **CSV File Support**: Ingests CSV files with (x, y) data and outputs enriched datasets with diagnostic metrics and anomaly flags. 31 | - **Robust Error Handling**: Includes validation for missing files, malformed data, or incomplete input structures to prevent processing failures. 32 | - **Flexible Data Cleaning**: Offers built-in utilities for removing duplicates, handling missing values, and filtering datasets based on custom criteria. 33 | - **Advanced Data Processing**: Supports additional functionalities such as feature scaling, sorting, renaming columns, and computing correlation matrices. 34 | 35 | ---- 36 | 37 | ## Installation 38 | 39 | To install the **Outlier Detection Tool**, ensure that Python and `pip` are installed on your system. 
You can install the required dependencies from the repository root using:

```bash
pip install -r requirements/requirements.txt
```

or install the key libraries manually:

```bash
pip install pandas numpy statsmodels
```

----

## Usage

The examples below assume that the `src/` and `util/` directories are on your `PYTHONPATH` (or that you run Python from within them).

### 1. Detecting Outliers
The primary function of the tool detects outliers in (x, y) datasets using Cook's Distance. To run outlier detection on a CSV file, use:

```python
from detect_outliers import detect_outliers

file_path = "data.csv"
outlier_data = detect_outliers(file_path, threshold=0.5, output_file="output_with_outliers.csv")
```

### 2. Summarizing Outlier Results

```python
from detect_outliers import summarize_outliers

summary = summarize_outliers(outlier_data)
print(summary)
```

### 3. Cleaning Data

```python
from util_outlier_detection import clean_data

cleaned_data = clean_data("data.csv", output_file="cleaned_data.csv")
```

### 4. Feature Scaling

```python
from detect_outliers import scale_features

scaled_data = scale_features(cleaned_data, ["x", "y"])
```

### 5. Sorting Data

```python
from util_outlier_detection import sort_data

sorted_data = sort_data(cleaned_data, column="x", ascending=True)
```

----

## Example Dataset
To test the tool, use a simple CSV file (`data.csv`) structured as follows:

```
x,y
1,2
2,4
3,6
4,8
5,10
100,500
```

This dataset contains five points on the line y = 2x and one point (`100,500`) that deviates sharply from that trend. Running the tool will flag `100,500` as an influential data point. Note that a point such as (100, 200), which lies exactly on the fitted line, has a zero residual and therefore a Cook's distance of zero; high leverage alone is not enough to be flagged. (The bundled `data/data.csv` omits the `x,y` header row, so add it before running the tool on that file.)

----

## Contributing
We welcome contributions that enhance the functionality of the **Outlier Detection Tool**. If you would like to contribute:

1. Fork the repository.
2. Create a new feature branch.
3. Implement and test your changes.
4. Submit a pull request for review.

For major changes, please open an issue first to discuss your proposed modifications.

----

## License
This project is licensed under the **MIT License**.

----

## Author
**Yisong Chen**
For inquiries or collaborations, please reach out via GitHub or email.
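----

## How It Works (Illustrative)

For a regression with `p` parameters, Cook's distance for observation `i` combines its residual and its leverage: `D_i = (t_i^2 / p) * (h_ii / (1 - h_ii))`, where `t_i` is the internally studentized residual and `h_ii` is the leverage. The snippet below is an illustrative sketch, not part of the package itself; it shows the `statsmodels` calls that `detect_outliers` automates internally:

```python
import pandas as pd
import statsmodels.api as sm

# Minimal sketch of the diagnostic that detect_outliers() wraps.
df = pd.DataFrame({"x": [1, 2, 3, 4, 5, 100], "y": [2, 4, 6, 8, 10, 500]})
model = sm.OLS(df["y"], sm.add_constant(df["x"])).fit()
cooks, _ = model.get_influence().cooks_distance  # distances and p-values
df["outlier"] = cooks > 0.5
print(df[["x", "y", "outlier"]])
```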
137 | 138 | -------------------------------------------------------------------------------- /requirements/.gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /requirements/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas>=1.0.0 2 | numpy>=1.18.0 3 | statsmodels>=0.13.0 4 | -------------------------------------------------------------------------------- /src/.gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/data_analysis.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import logging 3 | 4 | logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") 5 | 6 | def calculate_statistics(data, column): 7 | """ 8 | Calculate basic statistics (mean, median, standard deviation) for a specified column. 9 | 10 | Args: 11 | data (pd.DataFrame): DataFrame containing the data. 12 | column (str): Name of the column to calculate statistics for. 13 | 14 | Returns: 15 | dict: Dictionary containing mean, median, and standard deviation. 16 | """ 17 | if column not in data.columns: 18 | logging.error(f"Column '{column}' not found in the data.") 19 | raise ValueError(f"Column '{column}' does not exist in the DataFrame.") 20 | 21 | logging.info(f"Calculating statistics for column: {column}") 22 | 23 | try: 24 | mean = data[column].mean() 25 | median = data[column].median() 26 | std_dev = data[column].std() 27 | 28 | stats = { 29 | "mean": mean, 30 | "median": median, 31 | "std_dev": std_dev 32 | } 33 | 34 | logging.info(f"Statistics for column '{column}': {stats}") 35 | return stats 36 | 37 | except Exception as e: 38 | logging.error(f"Error while calculating statistics: {e}") 39 | raise 40 | 41 | def detect_missing_values(data): 42 | """ 43 | Detect missing values in each column of the DataFrame. 44 | 45 | Args: 46 | data (pd.DataFrame): DataFrame containing the data. 47 | 48 | Returns: 49 | dict: Dictionary containing the count of missing values per column. 50 | """ 51 | logging.info("Detecting missing values in the dataset...") 52 | try: 53 | missing_values = data.isnull().sum().to_dict() 54 | logging.info(f"Missing values per column: {missing_values}") 55 | return missing_values 56 | except Exception as e: 57 | logging.error(f"Error while detecting missing values: {e}") 58 | raise 59 | 60 | def normalize_column(data, column): 61 | """ 62 | Normalize a specified column using min-max normalization. 63 | 64 | Args: 65 | data (pd.DataFrame): DataFrame containing the data. 66 | column (str): Name of the column to normalize. 67 | 68 | Returns: 69 | pd.DataFrame: DataFrame with the normalized column. 
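    Example (illustrative):
        >>> df = pd.DataFrame({"x": [1, 2, 3, 4]})
        >>> normalize_column(df, "x")["x"].tolist()
        [0.0, 0.3333333333333333, 0.6666666666666666, 1.0]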
70 | """ 71 | if column not in data.columns: 72 | logging.error(f"Column '{column}' not found in the data.") 73 | raise ValueError(f"Column '{column}' does not exist in the DataFrame.") 74 | 75 | logging.info(f"Normalizing column: {column}") 76 | try: 77 | min_val = data[column].min() 78 | max_val = data[column].max() 79 | data[column] = (data[column] - min_val) / (max_val - min_val) 80 | logging.info(f"Column '{column}' normalized successfully.") 81 | return data 82 | except Exception as e: 83 | logging.error(f"Error while normalizing column '{column}': {e}") 84 | raise 85 | def replace_missing_values(data, column, method="mean"): 86 | """ 87 | Replace missing values in a specified column using mean, median, or mode. 88 | 89 | Args: 90 | data (pd.DataFrame): DataFrame containing the data. 91 | column (str): Name of the column to process. 92 | method (str): Strategy to replace missing values (options: "mean", "median", "mode"). 93 | 94 | Returns: 95 | pd.DataFrame: DataFrame with missing values replaced. 96 | """ 97 | if column not in data.columns: 98 | logging.error(f"Column '{column}' not found in the data.") 99 | raise ValueError(f"Column '{column}' does not exist in the DataFrame.") 100 | 101 | logging.info(f"Replacing missing values in column: {column} using {method} method.") 102 | 103 | try: 104 | if method == "mean": 105 | data[column].fillna(data[column].mean(), inplace=True) 106 | elif method == "median": 107 | data[column].fillna(data[column].median(), inplace=True) 108 | elif method == "mode": 109 | data[column].fillna(data[column].mode()[0], inplace=True) 110 | else: 111 | raise ValueError("Method should be 'mean', 'median', or 'mode'.") 112 | 113 | logging.info(f"Missing values in column '{column}' replaced successfully.") 114 | return data 115 | except Exception as e: 116 | logging.error(f"Error while replacing missing values in column '{column}': {e}") 117 | raise 118 | 119 | def compute_correlation_matrix(data): 120 | """ 121 | Compute the correlation matrix for numerical columns in the DataFrame. 122 | 123 | Args: 124 | data (pd.DataFrame): DataFrame containing the data. 125 | 126 | Returns: 127 | pd.DataFrame: Correlation matrix. 128 | """ 129 | logging.info("Computing correlation matrix for numerical columns...") 130 | try: 131 | correlation_matrix = data.corr() 132 | logging.info("Correlation matrix computed successfully.") 133 | return correlation_matrix 134 | except Exception as e: 135 | logging.error(f"Error while computing correlation matrix: {e}") 136 | raise 137 | -------------------------------------------------------------------------------- /src/detect_outliers.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import statsmodels.api as sm 4 | import os 5 | import logging 6 | 7 | # Configure logging 8 | logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") 9 | 10 | def detect_outliers(file_path, threshold=0.5, output_file="output_with_outliers.csv"): 11 | """ 12 | Detect outliers in (x, y) data using Cook's distance. 13 | 14 | Args: 15 | file_path (str): Path to the CSV file containing 'x' and 'y' data. 16 | threshold (float): Threshold for Cook's distance to flag outliers. 17 | output_file (str): Path to save the processed file with outliers flagged. 18 | 19 | Returns: 20 | pd.DataFrame: DataFrame with calculated Cook's distance and outlier flags. 
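    Example (illustrative; assumes "data.csv" holds 'x' and 'y' columns):
        >>> flagged = detect_outliers("data.csv", threshold=0.5)
        >>> flagged[flagged["outlier"]]  # rows whose Cook's distance exceeds 0.5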
    """
    logging.info(f"Starting outlier detection for file: {file_path}")

    if not os.path.exists(file_path):
        logging.error(f"File not found: {file_path}")
        raise FileNotFoundError(f"The file {file_path} does not exist.")

    data = pd.read_csv(file_path)
    logging.info("Data successfully loaded.")

    if 'x' not in data.columns or 'y' not in data.columns:
        logging.error("Missing required columns 'x' and 'y' in the input data.")
        raise ValueError("The input CSV file must contain 'x' and 'y' columns.")

    # Prepare the data for regression
    X = sm.add_constant(data['x'])
    y = data['y']

    logging.info("Fitting regression model...")
    model = sm.OLS(y, X).fit()

    # Calculate Cook's distance and influence metrics
    logging.info("Calculating Cook's distance...")
    influence = model.get_influence()
    cooks = influence.cooks_distance[0]

    # Add results to the DataFrame
    data['cooks_distance'] = cooks
    data['outlier'] = data['cooks_distance'] > threshold

    # Save output to a CSV file
    if output_file:
        data.to_csv(output_file, index=False)
        logging.info(f"Results saved to: {output_file}")

    return data


def summarize_outliers(data):
    """
    Summarize the outlier detection results.

    Args:
        data (pd.DataFrame): DataFrame containing the outlier flags.

    Returns:
        dict: Summary statistics including count of outliers and non-outliers.
    """
    logging.info("Summarizing outlier detection results...")
    total_points = len(data)
    outlier_count = data['outlier'].sum()
    non_outlier_count = total_points - outlier_count

    summary = {
        "total_points": total_points,
        "outliers": outlier_count,
        "non_outliers": non_outlier_count,
        "outlier_percentage": (outlier_count / total_points) * 100,
    }

    logging.info("Summary generated successfully.")
    return summary


def scale_features(data, columns):
    """
    Scale specified numerical columns to a 0-1 range.

    Args:
        data (pd.DataFrame): DataFrame containing the data.
        columns (list): List of column names to scale.

    Returns:
        pd.DataFrame: DataFrame with scaled columns.
    """
    logging.info("Scaling specified features to a 0-1 range...")
    try:
        data = data.copy()
        for col in columns:
            if col in data.columns:
                min_val = data[col].min()
                max_val = data[col].max()
                data[col] = (data[col] - min_val) / (max_val - min_val)
                logging.info(f"Column '{col}' scaled successfully.")
            else:
                logging.warning(f"Column '{col}' not found in the data.")
        return data
    except Exception as e:
        logging.error(f"Error during feature scaling: {e}")
        raise


def main(input_file, threshold=0.5, output_file="output_with_outliers.csv"):
    """
    Main function to detect and summarize outliers.

    Args:
        input_file (str): Path to the input CSV file.
        threshold (float): Threshold for Cook's distance to flag outliers.
        output_file (str): Path to save the output CSV file.
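    Example (illustrative):
        >>> main("data.csv", threshold=0.5, output_file="output_with_outliers.csv")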
130 | """ 131 | try: 132 | # Detect outliers 133 | processed_data = detect_outliers(input_file, threshold, output_file) 134 | 135 | # Summarize results 136 | summary = summarize_outliers(processed_data) 137 | 138 | # Display summary 139 | logging.info("Outlier Detection Completed") 140 | logging.info("Summary Statistics:") 141 | for key, value in summary.items(): 142 | logging.info(f"{key.capitalize()}: {value}") 143 | 144 | # Scale features for further analysis 145 | scaled_data = scale_features(processed_data, ['x', 'y']) 146 | logging.info("Feature scaling completed. Preview of scaled data:") 147 | logging.info(scaled_data.head()) 148 | 149 | except Exception as e: 150 | logging.error(f"Error occurred: {e}") 151 | 152 | def remove_missing_values(data): 153 | """ 154 | Remove rows with missing values from the dataset. 155 | 156 | Args: 157 | data (pd.DataFrame): DataFrame containing the data. 158 | 159 | Returns: 160 | pd.DataFrame: Cleaned DataFrame with missing values removed. 161 | """ 162 | logging.info("Removing rows with missing values...") 163 | cleaned_data = data.dropna() 164 | logging.info(f"Removed {len(data) - len(cleaned_data)} rows with missing values.") 165 | return cleaned_data 166 | 167 | 168 | def detect_high_variance_features(data, threshold=1.0): 169 | """ 170 | Identify columns with high variance exceeding a given threshold. 171 | 172 | Args: 173 | data (pd.DataFrame): DataFrame containing the data. 174 | threshold (float): Variance threshold to identify high variance features. 175 | 176 | Returns: 177 | list: List of column names with variance above the threshold. 178 | """ 179 | logging.info("Detecting high variance features...") 180 | high_variance_features = [col for col in data.select_dtypes(include=[np.number]).columns 181 | if data[col].var() > threshold] 182 | logging.info(f"High variance features detected: {high_variance_features}") 183 | return high_variance_features 184 | 185 | def flag_extreme_values(data, column, z_threshold=3): 186 | """ 187 | Flag extreme values in a column using the Z-score method. 188 | 189 | Args: 190 | data (pd.DataFrame): DataFrame containing the data. 191 | column (str): Column name to evaluate. 192 | z_threshold (float): Z-score threshold for identifying extreme values. 193 | 194 | Returns: 195 | pd.DataFrame: DataFrame with an additional boolean column indicating extreme values. 
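    Example (illustrative):
        >>> df = flag_extreme_values(df, "y", z_threshold=3)
        >>> df["y_extreme"].sum()  # number of rows flagged as extreme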
    """
    logging.info(f"Flagging extreme values in column '{column}' using Z-score threshold: {z_threshold}")

    if column not in data.columns:
        logging.error(f"Column '{column}' not found in the dataset.")
        raise ValueError(f"Column '{column}' does not exist in the DataFrame.")

    try:
        mean = data[column].mean()
        std = data[column].std()
        z_scores = (data[column] - mean) / std
        data[f"{column}_extreme"] = np.abs(z_scores) > z_threshold
        logging.info(f"Extreme values flagged in column '{column}'.")
        return data
    except Exception as e:
        logging.error(f"Error while flagging extreme values in column '{column}': {e}")
        raise


if __name__ == "__main__":
    # Example usage
    input_file = "data.csv"
    output_file = "output_with_outliers.csv"
    threshold = 0.5  # Adjust threshold as needed

    main(input_file, threshold, output_file)
--------------------------------------------------------------------------------
/src/normalize_columns:
--------------------------------------------------------------------------------
# updated 06/03/2025

import pandas as pd
import logging

# Configure logging (matches the other modules in src/)
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

def normalize_columns(data, columns):
    """
    Normalize specified numerical columns to have a mean of 0 and a standard
    deviation of 1 (i.e., z-score standardization).

    Args:
        data (pd.DataFrame): DataFrame containing the data.
        columns (list): List of column names to normalize.

    Returns:
        pd.DataFrame: DataFrame with normalized columns.
    """
    logging.info("Normalizing specified columns to mean=0 and std=1...")
    try:
        data = data.copy()
        for col in columns:
            if col in data.columns:
                mean = data[col].mean()
                std = data[col].std()
                data[col] = (data[col] - mean) / std
                logging.info(f"Column '{col}' normalized successfully.")
            else:
                logging.warning(f"Column '{col}' not found in the data.")
        return data
    except Exception as e:
        logging.error(f"Error during column normalization: {e}")
        raise

def standardize_columns(data, columns):
    """
    Standardize specified numerical columns to a 0-1 range (i.e., min-max scaling).

    Args:
        data (pd.DataFrame): DataFrame containing the data.
        columns (list): List of column names to standardize.

    Returns:
        pd.DataFrame: DataFrame with standardized columns.
    """
    logging.info("Standardizing specified columns to range 0-1...")
    try:
        data = data.copy()
        for col in columns:
            if col in data.columns:
                min_val = data[col].min()
                max_val = data[col].max()
                data[col] = (data[col] - min_val) / (max_val - min_val)
                logging.info(f"Column '{col}' standardized successfully.")
            else:
                logging.warning(f"Column '{col}' not found in the data.")
        return data
    except Exception as e:
        logging.error(f"Error during column standardization: {e}")
        raise

def remove_outliers(data, columns, threshold=3):
    """
    Remove outliers in specified numerical columns using a Z-score threshold.

    Args:
        data (pd.DataFrame): DataFrame containing the data.
        columns (list): List of column names to check for outliers.
        threshold (float): Z-score threshold to identify outliers (default=3).

    Returns:
        pd.DataFrame: DataFrame with outliers removed.
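    Example (illustrative):
        >>> trimmed = remove_outliers(df, ["x", "y"], threshold=3)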
    """
    logging.info(f"Removing outliers using Z-score threshold of {threshold}...")
    try:
        data = data.copy()
        for col in columns:
            if col in data.columns:
                mean = data[col].mean()
                std = data[col].std()
                z_scores = (data[col] - mean) / std
                data = data[abs(z_scores) <= threshold]
                logging.info(f"Outliers removed from column '{col}'.")
            else:
                logging.warning(f"Column '{col}' not found in the data.")
        return data
    except Exception as e:
        logging.error(f"Error during outlier removal: {e}")
        raise

def compute_summary_statistics(data, columns):
    """
    Compute summary statistics (mean, median, std, min, max) for specified numerical columns.

    Args:
        data (pd.DataFrame): DataFrame containing the data.
        columns (list): List of column names to compute statistics for.

    Returns:
        pd.DataFrame: DataFrame with computed summary statistics.
    """
    logging.info("Computing summary statistics...")
    try:
        stats = data[columns].describe().transpose()
        logging.info("Summary statistics computed successfully.")
        return stats
    except Exception as e:
        logging.error(f"Error computing summary statistics: {e}")
        raise

def fill_missing_values(data, columns, strategy="mean"):
    """
    Fill missing values in specified columns using a chosen strategy (mean, median, or mode).

    Args:
        data (pd.DataFrame): DataFrame containing the data.
        columns (list): List of column names to fill missing values.
        strategy (str): Strategy to fill missing values ("mean", "median", "mode").

    Returns:
        pd.DataFrame: DataFrame with missing values filled.
    """
    logging.info(f"Filling missing values using strategy: {strategy}")
    try:
        data = data.copy()
        for col in columns:
            if col in data.columns:
                if strategy == "mean":
                    data[col].fillna(data[col].mean(), inplace=True)
                elif strategy == "median":
                    data[col].fillna(data[col].median(), inplace=True)
                elif strategy == "mode":
                    data[col].fillna(data[col].mode()[0], inplace=True)
                logging.info(f"Missing values filled for column '{col}' using {strategy} strategy.")
            else:
                logging.warning(f"Column '{col}' not found in the data.")
        return data
    except Exception as e:
        logging.error(f"Error filling missing values: {e}")
        raise

def detect_constant_columns(data):
    """
    Detect columns with a constant value across all rows.

    Args:
        data (pd.DataFrame): DataFrame containing the data.

    Returns:
        list: List of column names that have a constant value.
    """
    logging.info("Detecting constant columns...")
    try:
        constant_cols = [col for col in data.columns if data[col].nunique() == 1]
        logging.info(f"Constant columns found: {constant_cols}")
        return constant_cols
    except Exception as e:
        logging.error(f"Error detecting constant columns: {e}")
        raise

def convert_columns_to_numeric(data, columns):
    """
    Convert specified columns to numeric type, coercing errors to NaN.

    Args:
        data (pd.DataFrame): DataFrame containing the data.
        columns (list): List of column names to convert.

    Returns:
        pd.DataFrame: DataFrame with specified columns converted to numeric.
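    Example (illustrative):
        >>> df = convert_columns_to_numeric(df, ["x", "y"])  # unparseable entries become NaN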
    """
    logging.info("Converting specified columns to numeric type...")
    try:
        data = data.copy()
        for col in columns:
            if col in data.columns:
                data[col] = pd.to_numeric(data[col], errors='coerce')
                logging.info(f"Column '{col}' converted to numeric.")
            else:
                logging.warning(f"Column '{col}' not found in the data.")
        return data
    except Exception as e:
        logging.error(f"Error converting columns to numeric: {e}")
        raise

def detect_highly_correlated_columns(data, threshold=0.9):
    """
    Detect pairs of numerical columns with correlation above a specified threshold.

    Args:
        data (pd.DataFrame): DataFrame containing the data.
        threshold (float): Correlation coefficient threshold (default is 0.9).

    Returns:
        list of tuple: List of column name pairs that are highly correlated.
    """
    logging.info(f"Detecting highly correlated column pairs with threshold > {threshold}")
    try:
        corr_matrix = data.corr().abs()
        correlated_pairs = []

        for i in range(len(corr_matrix.columns)):
            for j in range(i + 1, len(corr_matrix.columns)):
                if corr_matrix.iloc[i, j] > threshold:
                    col1 = corr_matrix.columns[i]
                    col2 = corr_matrix.columns[j]
                    correlated_pairs.append((col1, col2))
                    logging.info(f"High correlation detected: {col1} and {col2} -> {corr_matrix.iloc[i, j]}")

        return correlated_pairs
    except Exception as e:
        logging.error(f"Error detecting highly correlated columns: {e}")
        raise

def rename_columns(data, column_mapping):
    """
    Rename columns in the DataFrame using a provided mapping.

    Args:
        data (pd.DataFrame): DataFrame containing the data.
        column_mapping (dict): Dictionary mapping old column names to new names.

    Returns:
        pd.DataFrame: DataFrame with renamed columns.
    """
    logging.info(f"Renaming columns: {column_mapping}")
    try:
        data = data.copy()
        data.rename(columns=column_mapping, inplace=True)
        logging.info("Columns renamed successfully.")
        return data
    except Exception as e:
        logging.error(f"Error renaming columns: {e}")
        raise

def detect_duplicate_rows(data, drop=False):
    """
    Detect duplicate rows in the DataFrame and optionally remove them.

    Args:
        data (pd.DataFrame): DataFrame containing the data.
        drop (bool): If True, return the DataFrame without duplicates; if False, just report them.

    Returns:
        pd.DataFrame: If drop is True, the DataFrame without duplicates;
                      if False, a DataFrame containing only the duplicate rows.
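    Example (illustrative):
        >>> detect_duplicate_rows(df)             # returns the duplicated rows
        >>> detect_duplicate_rows(df, drop=True)  # returns df with duplicates dropped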
241 | 242 | """ 243 | logging.info("Checking for duplicate rows...") 244 | try: 245 | duplicates = data.duplicated() 246 | duplicate_count = duplicates.sum() 247 | logging.info(f"Found {duplicate_count} duplicate rows.") 248 | 249 | if drop: 250 | data_no_duplicates = data.drop_duplicates() 251 | logging.info("Duplicate rows removed.") 252 | return data_no_duplicates 253 | else: 254 | return data[duplicates] 255 | except Exception as e: 256 | logging.error(f"Error detecting duplicate rows: {e}") 257 | raise 258 | 259 | 260 | 261 | -------------------------------------------------------------------------------- /src/util_outlier_detection: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import statsmodels.api as sm 4 | import os 5 | import logging 6 | 7 | # Configure logging 8 | logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") 9 | 10 | def detect_outliers(file_path, threshold=0.5, output_file="output_with_outliers.csv"): 11 | """ 12 | Detect outliers in (x, y) data using Cook's distance. 13 | 14 | Args: 15 | file_path (str): Path to the CSV file containing 'x' and 'y' data. 16 | threshold (float): Threshold for Cook's distance to flag outliers. 17 | output_file (str): Path to save the processed file with outliers flagged. 18 | 19 | Returns: 20 | pd.DataFrame: DataFrame with calculated Cook's distance and outlier flags. 21 | """ 22 | logging.info(f"Starting outlier detection for file: {file_path}") 23 | 24 | if not os.path.exists(file_path): 25 | logging.error(f"File not found: {file_path}") 26 | raise FileNotFoundError(f"The file {file_path} does not exist.") 27 | 28 | data = pd.read_csv(file_path) 29 | logging.info("Data successfully loaded.") 30 | 31 | if 'x' not in data.columns or 'y' not in data.columns: 32 | logging.error("Missing required columns 'x' and 'y' in the input data.") 33 | raise ValueError("The input CSV file must contain 'x' and 'y' columns.") 34 | 35 | # Prepare the data for regression 36 | X = sm.add_constant(data['x']) 37 | y = data['y'] 38 | 39 | logging.info("Fitting regression model...") 40 | model = sm.OLS(y, X).fit() 41 | 42 | # Calculate Cook's distance and influence metrics 43 | logging.info("Calculating Cook's distance...") 44 | influence = model.get_influence() 45 | cooks = influence.cooks_distance[0] 46 | 47 | # Add results to the DataFrame 48 | data['cooks_distance'] = cooks 49 | data['outlier'] = data['cooks_distance'] > threshold 50 | 51 | # Save output to a CSV file 52 | if output_file: 53 | data.to_csv(output_file, index=False) 54 | logging.info(f"Results saved to: {output_file}") 55 | 56 | return data 57 | 58 | 59 | def summarize_outliers(data): 60 | """ 61 | Summarize the outlier detection results. 62 | 63 | Args: 64 | data (pd.DataFrame): DataFrame containing the outlier flags. 65 | 66 | Returns: 67 | dict: Summary statistics including count of outliers and non-outliers. 
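    Example (illustrative; expects an 'outlier' boolean column):
        >>> summary = summarize_outliers(flagged_data)
        >>> summary["outlier_percentage"]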
68 | """ 69 | logging.info("Summarizing outlier detection results...") 70 | total_points = len(data) 71 | outlier_count = data['outlier'].sum() 72 | non_outlier_count = total_points - outlier_count 73 | 74 | summary = { 75 | "total_points": total_points, 76 | "outliers": outlier_count, 77 | "non_outliers": non_outlier_count, 78 | "outlier_percentage": (outlier_count / total_points) * 100, 79 | } 80 | 81 | logging.info("Summary generated successfully.") 82 | return summary 83 | 84 | 85 | def add_random_data(data, num_rows=100): 86 | """ 87 | Add random data to an existing DataFrame. 88 | 89 | Args: 90 | data (pd.DataFrame): Original DataFrame to which random data will be added. 91 | num_rows (int): Number of random rows to add. 92 | 93 | Returns: 94 | pd.DataFrame: DataFrame with added random data. 95 | """ 96 | logging.info(f"Adding {num_rows} random rows to the data.") 97 | random_data = pd.DataFrame({ 98 | 'x': np.random.rand(num_rows), 99 | 'y': np.random.rand(num_rows), 100 | 'cooks_distance': np.nan, 101 | 'outlier': False 102 | }) 103 | return pd.concat([data, random_data], ignore_index=True) 104 | 105 | 106 | def remove_duplicates(data): 107 | """ 108 | Remove duplicate rows from the dataset. 109 | 110 | Args: 111 | data (pd.DataFrame): DataFrame containing the data. 112 | 113 | Returns: 114 | pd.DataFrame: DataFrame without duplicate rows. 115 | """ 116 | logging.info("Removing duplicate rows...") 117 | cleaned_data = data.drop_duplicates() 118 | logging.info(f"Removed {len(data) - len(cleaned_data)} duplicate rows.") 119 | return cleaned_data 120 | 121 | 122 | def calculate_correlation(data): 123 | """ 124 | Calculate the correlation matrix for numerical columns in the dataset. 125 | 126 | Args: 127 | data (pd.DataFrame): DataFrame containing the data. 128 | 129 | Returns: 130 | pd.DataFrame: Correlation matrix of numerical columns. 131 | """ 132 | logging.info("Calculating correlation matrix...") 133 | correlation_matrix = data.corr() 134 | logging.info("Correlation matrix calculated successfully.") 135 | return correlation_matrix 136 | 137 | 138 | def main(input_file, threshold=0.5, output_file="output_with_outliers.csv"): 139 | """ 140 | Main function to detect and summarize outliers. 141 | 142 | Args: 143 | input_file (str): Path to the input CSV file. 144 | threshold (float): Threshold for Cook's distance to flag outliers. 145 | output_file (str): Path to save the output CSV file. 
    """
    try:
        # Detect outliers
        processed_data = detect_outliers(input_file, threshold, output_file)

        # Summarize results
        summary = summarize_outliers(processed_data)

        # Remove duplicate rows
        processed_data = remove_duplicates(processed_data)

        # Calculate correlation matrix
        correlation_matrix = calculate_correlation(processed_data)
        logging.info("Correlation matrix:")
        logging.info(f"{correlation_matrix}")

        # Save the deduplicated data
        processed_data.to_csv(output_file, index=False)

        # Display summary
        logging.info("Outlier Detection Completed")
        logging.info("Summary Statistics:")
        for key, value in summary.items():
            logging.info(f"{key.capitalize()}: {value}")

    except Exception as e:
        logging.error(f"Error occurred: {e}")


if __name__ == "__main__":
    # Example usage
    input_file = "data.csv"
    output_file = "output_with_outliers.csv"
    threshold = 0.5  # Adjust threshold as needed

    main(input_file, threshold, output_file)
--------------------------------------------------------------------------------
/tests/.gitkeep:
--------------------------------------------------------------------------------


--------------------------------------------------------------------------------
/tests/test_detect_outliers.py:
--------------------------------------------------------------------------------
import os

import pandas as pd
from detect_outliers import detect_outliers, summarize_outliers  # assumes the src/ directory is on PYTHONPATH

def test_detect_outliers_basic():
    """
    Test the basic functionality of detecting outliers in a small dataset.
    """
    # Create a sample dataset; the last point deviates sharply from the y = 2x trend.
    # (A point lying exactly on the fitted line has a zero residual and is never flagged.)
    data = pd.DataFrame({"x": [1, 2, 3, 4, 5, 100], "y": [2, 4, 6, 8, 10, 500]})
    test_file = "test_data.csv"
    data.to_csv(test_file, index=False)

    # Run outlier detection
    result = detect_outliers(test_file, threshold=0.5)

    # Assertions
    assert 'cooks_distance' in result.columns, "Cook's distance column is missing."
    assert 'outlier' in result.columns, "Outlier flag column is missing."
    assert result['outlier'].iloc[-1], "The last row should be flagged as an outlier."
    assert not result['outlier'].iloc[:-1].any(), "Non-outlier rows incorrectly flagged."

    # Clean up
    os.remove(test_file)
    print("test_detect_outliers_basic passed.")

def test_summarize_outliers():
    """
    Test the summarize_outliers function for correct statistical output.
    """
    data = pd.DataFrame({
        "x": [1, 2, 3, 4, 5, 100],
        "y": [2, 4, 6, 8, 10, 200],
        "cooks_distance": [0.001, 0.002, 0.003, 0.004, 0.005, 0.8],
        "outlier": [False, False, False, False, False, True],
    })

    summary = summarize_outliers(data)

    assert summary["total_points"] == 6, "Total points calculation is incorrect."
    assert summary["outliers"] == 1, "Outlier count calculation is incorrect."
    assert summary["non_outliers"] == 5, "Non-outlier count calculation is incorrect."
    assert summary["outlier_percentage"] == (1 / 6) * 100, "Outlier percentage is incorrect."

    print("test_summarize_outliers passed.")

def test_missing_file():
    """
    Test behavior when a non-existent file is provided.
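    detect_outliers is expected to raise FileNotFoundError with a message
    beginning "The file" when the path does not exist.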
    """
    try:
        detect_outliers("non_existent_file.csv")
    except FileNotFoundError as e:
        assert str(e).startswith("The file"), "FileNotFoundError not raised correctly."
        print("test_missing_file passed.")
    else:
        raise AssertionError("FileNotFoundError was not raised as expected.")

def test_invalid_columns():
    """
    Test behavior when input data does not contain required columns.
    """
    data = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    test_file = "test_invalid_columns.csv"
    data.to_csv(test_file, index=False)

    try:
        detect_outliers(test_file)
    except ValueError as e:
        assert str(e).startswith("The input CSV file must contain"), "ValueError not raised correctly for missing columns."
        print("test_invalid_columns passed.")
    else:
        raise AssertionError("ValueError was not raised as expected.")
    finally:
        # Clean up even if the expected error was not raised
        os.remove(test_file)

def test_no_outliers_detected():
    """
    Test behavior when no outliers are present in the dataset.
    """
    # Points close to, but not exactly on, the line y = 2x; with an exact fit the
    # residuals are all zero and Cook's distance is numerically undefined.
    data = pd.DataFrame({
        "x": list(range(1, 11)),
        "y": [2.1, 3.9, 6.2, 8.1, 9.8, 12.2, 13.9, 16.1, 18.0, 20.1],
    })
    test_file = "test_no_outliers.csv"
    data.to_csv(test_file, index=False)

    # Run outlier detection
    result = detect_outliers(test_file, threshold=1.0)  # Set a high threshold to avoid outliers

    # Assertions
    assert 'cooks_distance' in result.columns, "Cook's distance column is missing."
    assert 'outlier' in result.columns, "Outlier flag column is missing."
    assert not result['outlier'].any(), "No rows should be flagged as outliers."

    # Clean up
    os.remove(test_file)
    print("test_no_outliers_detected passed.")

def test_outlier_removal_effect():
    """
    Test whether removing outliers changes the dataset size correctly.
    """
    data = pd.DataFrame({
        "x": [1, 2, 3, 4, 5, 100],
        "y": [2, 4, 6, 8, 10, 200],
        "outlier": [False, False, False, False, False, True]
    })

    data_filtered = data[~data['outlier']]

    assert len(data_filtered) == 5, "Outlier removal did not adjust dataset size correctly."
    print("test_outlier_removal_effect passed.")

def test_outlier_summary_consistency():
    """
    Test if summary statistics remain consistent before and after removing outliers.
    """
    data = pd.DataFrame({
        "x": [1, 2, 3, 4, 5, 100],
        "y": [2, 4, 6, 8, 10, 200],
        "outlier": [False, False, False, False, False, True]
    })

    summary_before = summarize_outliers(data)
    data_filtered = data[~data['outlier']]
    summary_after = summarize_outliers(data_filtered)

    assert summary_before["outliers"] == 1, "Initial summary miscounts outliers."
    assert summary_after["outliers"] == 0, "Filtered summary should not contain outliers."
    print("test_outlier_summary_consistency passed.")

def test_outlier_percentage_consistency():
    """
    Test if outlier percentage calculation remains consistent after filtering.
    """
    data = pd.DataFrame({
        "x": range(1, 21),
        "y": range(2, 42, 2),
        "outlier": [False] * 18 + [True, True]
    })

    summary_before = summarize_outliers(data)
    filtered_data = data[~data["outlier"]]
    summary_after = summarize_outliers(filtered_data)

    assert summary_before["outlier_percentage"] > summary_after["outlier_percentage"], "Outlier percentage should decrease after filtering."
    print("test_outlier_percentage_consistency passed.")

def test_large_dataset_performance():
    """
    Test the performance of the outlier detection function with a large dataset.
    """
    data = pd.DataFrame({
        "x": range(1, 10001),
        "y": range(2, 20002, 2)
    })

    test_file = "large_test_data.csv"
    data.to_csv(test_file, index=False)

    try:
        result = detect_outliers(test_file, threshold=0.5)
        assert len(result) == 10000, "The result should contain the same number of rows as the input."
        print("test_large_dataset_performance passed.")
    finally:
        os.remove(test_file)


if __name__ == "__main__":
    test_detect_outliers_basic()
    test_summarize_outliers()
    test_missing_file()
    test_invalid_columns()
    test_no_outliers_detected()
    test_outlier_removal_effect()
    test_outlier_summary_consistency()
    test_outlier_percentage_consistency()
    test_large_dataset_performance()
    print("All tests passed successfully.")
--------------------------------------------------------------------------------
/util/util_outlier_detection.py:
--------------------------------------------------------------------------------

import pandas as pd
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

def clean_data(file_path, output_file="cleaned_data.csv"):
    """
    Cleans the input CSV data by removing duplicates and handling missing values.

    Args:
        file_path (str): Path to the input CSV file.
        output_file (str): Path to save the cleaned data.

    Returns:
        pd.DataFrame: Cleaned DataFrame.
    """
    logging.info(f"Starting data cleaning for file: {file_path}")

    try:
        data = pd.read_csv(file_path)
        logging.info("Data successfully loaded.")

        # Remove duplicates
        data = data.drop_duplicates()
        logging.info("Duplicates removed.")

        # Handle missing values
        data = data.dropna()
        logging.info("Missing values removed.")

        # Save cleaned data
        if output_file:
            data.to_csv(output_file, index=False)
            logging.info(f"Cleaned data saved to: {output_file}")

        return data

    except Exception as e:
        logging.error(f"Error occurred during data cleaning: {e}")
        raise

def filter_data(data, column, threshold):
    """
    Filters data based on a threshold value for a specific column.

    Args:
        data (pd.DataFrame): DataFrame containing the data.
        column (str): Column name to apply the threshold.
        threshold (float): Threshold value to filter the data.

    Returns:
        pd.DataFrame: Filtered DataFrame.
    """
    logging.info(f"Filtering data where {column} >= {threshold}")
    try:
        filtered_data = data[data[column] >= threshold]
        logging.info(f"Filtered {len(data) - len(filtered_data)} rows below the threshold.")
        return filtered_data
    except Exception as e:
        logging.error(f"Error occurred during data filtering: {e}")
        raise

def convert_column_to_numeric(data, column):
    """
    Converts a specified column to numeric format, handling errors gracefully.

    Args:
        data (pd.DataFrame): DataFrame containing the data.
        column (str): Column name to convert to numeric.

    Returns:
        pd.DataFrame: DataFrame with the column converted to numeric.
    """
    logging.info(f"Converting column {column} to numeric format...")
    try:
        data[column] = pd.to_numeric(data[column], errors='coerce')
        logging.info(f"Column {column} converted to numeric successfully.")
        return data
    except Exception as e:
        logging.error(f"Error occurred during column conversion: {e}")
        raise

def rename_columns(data, column_mappings):
    """
    Rename columns in the DataFrame based on a given mapping.

    Args:
        data (pd.DataFrame): DataFrame containing the data.
        column_mappings (dict): Dictionary mapping old column names to new names.

    Returns:
        pd.DataFrame: DataFrame with renamed columns.
    """
    logging.info(f"Renaming columns: {column_mappings}")
    try:
        data = data.rename(columns=column_mappings)
        logging.info("Columns renamed successfully.")
        return data
    except Exception as e:
        logging.error(f"Error occurred during column renaming: {e}")
        raise

def sort_data(data, column, ascending=True):
    """
    Sort the DataFrame based on a specified column.

    Args:
        data (pd.DataFrame): DataFrame containing the data.
        column (str): Column name to sort by.
        ascending (bool): Whether to sort in ascending order (default=True).

    Returns:
        pd.DataFrame: Sorted DataFrame.
    """
    logging.info(f"Sorting data by column '{column}', ascending={ascending}")
    try:
        sorted_data = data.sort_values(by=column, ascending=ascending)
        logging.info("Data sorted successfully.")
        return sorted_data
    except Exception as e:
        logging.error(f"Error occurred during sorting: {e}")
        raise

def compute_unique_values(data, column):
    """
    Compute the number of unique values in a specified column.

    Args:
        data (pd.DataFrame): DataFrame containing the data.
        column (str): Column name to count unique values.

    Returns:
        int: Number of unique values in the column.
    """
    logging.info(f"Computing unique values in column '{column}'")
    try:
        unique_count = data[column].nunique()
        logging.info(f"Column '{column}' has {unique_count} unique values.")
        return unique_count
    except Exception as e:
        logging.error(f"Error occurred while computing unique values: {e}")
        raise

def detect_outliers_iqr(data, column, threshold=1.5):
    """
    Detects outliers in a numerical column using the Interquartile Range (IQR) method.

    Args:
        data (pd.DataFrame): DataFrame containing the data.
        column (str): Column name to check for outliers.
        threshold (float): Threshold multiplier for defining outliers (default is 1.5).

    Returns:
        pd.DataFrame: Subset of the original DataFrame containing only outlier rows.
    """
    logging.info(f"Detecting outliers in column '{column}' using IQR method.")
    try:
        Q1 = data[column].quantile(0.25)
        Q3 = data[column].quantile(0.75)
        IQR = Q3 - Q1

        lower_bound = Q1 - (threshold * IQR)
        upper_bound = Q3 + (threshold * IQR)

        outliers = data[(data[column] < lower_bound) | (data[column] > upper_bound)]
        logging.info(f"Detected {len(outliers)} outliers in column '{column}'.")

        return outliers
    except Exception as e:
        logging.error(f"Error occurred while detecting outliers: {e}")
        raise

def replace_missing_values(data, strategy="mean"):
    """
    Replaces missing values in numerical columns based on the specified strategy.

    Args:
        data (pd.DataFrame): DataFrame containing the data.
        strategy (str): Strategy for replacing missing values ('mean', 'median', 'mode').

    Returns:
        pd.DataFrame: DataFrame with missing values replaced.
    """
    logging.info(f"Replacing missing values using strategy: {strategy}")
    try:
        if strategy not in ["mean", "median", "mode"]:
            raise ValueError("Invalid strategy. Choose 'mean', 'median', or 'mode'.")

        if strategy == "mean":
            data = data.fillna(data.mean())
        elif strategy == "median":
            data = data.fillna(data.median())
        elif strategy == "mode":
            data = data.fillna(data.mode().iloc[0])

        logging.info("Missing values replaced successfully.")
        return data
    except Exception as e:
        logging.error(f"Error occurred while replacing missing values: {e}")
        raise
--------------------------------------------------------------------------------