├── data ├── .gitkeep ├── data.csv ├── test2.csv ├── test3.csv ├── test4.csv └── test5.csv ├── docs ├── .gitkeep └── README.md ├── requirements ├── .gitkeep └── requirements.txt ├── src ├── .gitkeep ├── data_analysis.py ├── detect_outliers.py ├── normalize_columns └── util_outlier_detection ├── tests ├── .gitkeep └── test_detect_outliers.py └── util └── util_outlier_detection.py /data/.gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /data/data.csv: -------------------------------------------------------------------------------- 1 | 1,2 2 | 2,4 3 | 3,6 4 | 4,8 5 | 5,10 6 | 100,200 7 | -------------------------------------------------------------------------------- /data/test2.csv: -------------------------------------------------------------------------------- 1 | 1,2 2 | 2,4 3 | 3,6 4 | 4,8 5 | 5,10 6 | 6,12 7 | 7,14 8 | 8,16 9 | 9,18 10 | 10,20 11 | 11,22 12 | 12,24 13 | 13,26 14 | 14,28 15 | 15,30 16 | 16,32 17 | 17,34 18 | 18,36 19 | 19,38 20 | 20,40 21 | -------------------------------------------------------------------------------- /data/test3.csv: -------------------------------------------------------------------------------- 1 | 1 2.9934283060224653 2 | 2 3.723471397657631 3 | 3 7.295377076201385 4 | 4 11.046059712816051 5 | 5 9.531693250553328 6 | 6 11.531726086101639 7 | 7 17.158425631014783 8 | 8 17.534869458305817 9 | 9 17.061051228130097 10 | 10 21.08512008717193 11 | 11 21.073164614375074 12 | 12 23.068540492859487 13 | 13 26.48392454313207 14 | 14 24.173439510684403 15 | 15 26.550164334973935 16 | 16 30.875424941518055 17 | 17 31.974337759331153 18 | 18 36.628494665190544 19 | 19 36.18395184895758 20 | 20 37.17539259732942 21 | 21 44.93129753784311 22 | 22 43.54844739902693 23 | 23 46.13505640937585 24 | 24 45.15050362757309 25 | 25 48.91123455094964 26 | 26 52.22184517941973 27 | 27 51.69801285 28 | 28 56.75139603669135 29 | 29 56.79872262016239 30 | 30 59.41661250041345 31 | 31 60.79658677554121 32 | 32 67.70455636901788 33 | 33 65.97300555052414 34 | 34 65.88457814 35 | 35 71.64508982420638 36 | 36 69.55831270005795 37 | 37 74.41772719000951 38 | 38 72.08065975224045 39 | 39 75.34362790220314 40 | 40 80.39372247173824 41 | 41 83.47693315999082 42 | 42 84.34273656237994 43 | 43 85.76870343522351 44 | 44 87.39779260882142 45 | 45 87.04295601926515 46 | 46 90.56031158321058 47 | 47 93.07872245808042 48 | 48 98.11424445243783 49 | 49 98.68723657913692 50 | 50 96.47391968927454 51 | -------------------------------------------------------------------------------- /data/test4.csv: -------------------------------------------------------------------------------- 1 | x y 2 | 38.07947176588889 79.85127643175483 3 | 95.12071633 191.09827407511327 4 | 73.46740023932911 146.3565591 5 | 60.26718993550662 119.0288613930668 6 | 16.445845403801215 25.499080855765293 7 | 16.443457513284063 29.287693984594583 8 | 6.750277604651747 11.197361354504556 9 | 86.75143843171858 178.78848799453175 10 | 60.510386162577674 122.73886377299766 11 | 71.09918520180851 133.38316962680335 12 | 3.0378649352844422 7.696149717542859 13 | 97.02107536403744 192.1167393259933 14 | 83.41182143924175 163.43903287695372 15 | 22.02157195714934 47.10152535850302 16 | 19.00067175350296 43.156341119485674 17 | 19.15704647548995 42.97049354656089 18 | 31.119982052994235 58.043876489875274 19 | 52.950886731591545 104.35571158392702 20 | 43.76255684556946 89.18143084815675 21 | 
29.83168487960615 64.54109539 22 | 61.573436577515565 120.75100196580468 23 | 14.809892204552142 28.691489525785197 24 | 29.922320204986598 54.31296553994305 25 | 37.269822486075476 68.55861185 26 | 46.150928437486556 96.36448599 27 | 78.73242017790835 164.24604049867082 28 | 20.767704433677615 41.17535825945356 29 | 51.909209402947546 108.83608329535521 30 | 59.64904231734221 121.10626475992258 31 | 5.598590859279775 7.971582945533928 32 | 61.146940338242395 124.10085870402686 33 | 17.88188824504186 43.45395932241357 34 | 7.4401077055426725 14.701085215535587 35 | 94.93966818807999 197.70255465523002 36 | 96.59757127438138 180.09641702831405 37 | 81.03133746352965 166.17218744893543 38 | 31.156763148163698 62.74876163751825 39 | 10.669539286632004 19.84404182093467 40 | 68.73906962470353 137.93694313208456 41 | 44.57509688022053 79.21234918743659 42 | 13.081785249633104 25.065211060078646 43 | 50.02251410101575 101.83059105959023 44 | 4.404463590406621 16.198397404520822 45 | 91.02271980579943 179.4540885202306 46 | 26.619218178401674 49.19596834233741 47 | 66.58970615104421 130.67062708416574 48 | 31.859396532851683 68.29580365421374 49 | 52.48673409660327 106.61722374150497 50 | 55.12431765498469 107.59983429113417 51 | 19.300591097027176 41.16751935962113 52 | -------------------------------------------------------------------------------- /data/test5.csv: -------------------------------------------------------------------------------- 1 | x y 2 | 67.55557730444839 160.33012809598438 3 | 49.319761524817785 126.7446944242207 4 | 82.72402222955374 213.61119316443478 5 | 4.113192375003516 8.167841725111824 6 | 80.99694637311993 209.54627760259422 7 | 56.996124541442526 138.00388004358507 8 | 30.464627371690455 71.39383176190998 9 | 5.622876330188372 16.268118496473182 10 | 99.07211254760882 244.4163575735596 11 | 1.675747571321864 2.2027815313916266 12 | 77.20950979080403 197.36867673023718 13 | 74.92994299843566 183.42615638937653 14 | 38.36645469716155 94.78759347126154 15 | 49.92059777185661 124.33263782989066 16 | 92.96589081746234 233.9289161090991 17 | 40.149950390236214 103.30365456430363 18 | 97.42167338606704 243.13989932670017 19 | 52.917056822148766 127.49781693474209 20 | 10.267696236477505 29.70938289631082 21 | 81.51753286543077 215.29527348307383 22 | 21.956991773346985 58.17373001 23 | 55.88023267447785 141.94302770993264 24 | 29.93464248 73.3340374 25 | 81.79809363873163 203.48681690226584 26 | 82.97621405447433 201.88208134749868 27 | 22.936159801228193 59.80972861048586 28 | 64.83863545872785 161.55344045575197 29 | 10.422980571419512 31.20907126 30 | 41.754660645450905 100.23867402412957 31 | 10.589660865004047 31.931480713091013 32 | 15.25708889186078 36.90289568319627 33 | 22.007421356240144 52.57408491489617 34 | 48.18895929113014 118.53997511720033 35 | 8.683769823401471 19.27348980822208 36 | 24.26933437092893 52.319994736733996 37 | 1.648747265435246 6.679156156184053 38 | 89.96577452016759 228.01165211108943 39 | 55.67120826088866 144.2922982557267 40 | 17.58711614371605 46.78889443666723 41 | 92.95894016607785 235.0212599385139 42 | 54.666682182974284 129.95269731442994 43 | 5.134094598129749 13.56767596534359 44 | 52.96280715623928 127.87412153769112 45 | 64.37411151091817 159.8189359333329 46 | 80.26515820099465 206.24937509920807 47 | 83.65696631737042 209.27129356570805 48 | 26.010590785085874 54.571833876367755 49 | 96.85907776475311 238.14138389110536 50 | 47.15757533627438 115.6238943151569 51 | 27.14502581563233 66.95993242433185 52 | 
-------------------------------------------------------------------------------- /docs/.gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # Outlier Detection Tool 2 | 3 | Author: Yisong Chen 4 | 5 | ## Overview 6 | The **Outlier Detection Tool** is a powerful and efficient Python package designed for data scientists, statisticians, and analysts who require a precise method to detect anomalies in two-dimensional datasets. By leveraging the statistical rigor of **Cook's Distance**, this tool performs regression diagnostics on (x, y) data, identifies influential data points, and flags significant deviations from the general trend. 7 | 8 | ---- 9 | 10 | ## Purpose and Motivation 11 | 12 | Outliers can significantly impact statistical models, distort trends, and bias predictions. Addressing these anomalies is essential for ensuring the accuracy and reliability of data-driven insights. Traditional methods like standard deviation thresholds or IQR (Interquartile Range) often fall short in complex datasets where more nuanced techniques are required. 13 | 14 | The **Outlier Detection Tool**, developed by **Yisong Chen**, provides a more sophisticated approach by utilizing **Cook's Distance**, a metric designed to quantify the influence of each observation in a regression model. This tool enables professionals to: 15 | 16 | - Detect and quantify data points that disproportionately affect regression outcomes. 17 | - Evaluate the statistical influence of each observation and determine its impact on overall model performance. 18 | - Make informed decisions about whether to retain or exclude anomalous data. 19 | 20 | By automating the use of Cook's Distance, the **Outlier Detection Tool** bridges the gap between theoretical rigor and practical application, making it indispensable for any professional working in data preprocessing, anomaly detection, or predictive modeling. 21 | 22 | ---- 23 | 24 | ## Key Features 25 | 26 | - **Cook's Distance Calculation**: Precisely measures the influence of each data point in regression models. 27 | - **Automated Outlier Flagging**: Flags influential data points based on a user-defined threshold. 28 | - **Customizable Sensitivity**: Allows fine-tuning of the outlier detection threshold to fit different datasets. 29 | - **Comprehensive Summary Statistics**: Provides detailed insights, including the number of outliers detected, their proportion, and their influence on the dataset. 30 | - **CSV File Support**: Ingests CSV files with (x, y) data and outputs enriched datasets with diagnostic metrics and anomaly flags. 31 | - **Robust Error Handling**: Includes validation for missing files, malformed data, or incomplete input structures to prevent processing failures. 32 | - **Flexible Data Cleaning**: Offers built-in utilities for removing duplicates, handling missing values, and filtering datasets based on custom criteria. 33 | - **Advanced Data Processing**: Supports additional functionalities such as feature scaling, sorting, renaming columns, and computing correlation matrices. 34 | 35 | ---- 36 | 37 | ## Installation 38 | 39 | To install the **Outlier Detection Tool**, ensure that Python and `pip` are installed on your system. 
You can install the required dependencies from the repository root using:

```bash
pip install -r requirements/requirements.txt
```

or install the key libraries manually:

```bash
pip install pandas numpy statsmodels
```

----

## Usage

The examples below assume that the `src/` and `util/` directories are on your `PYTHONPATH` (or that you run Python from within them).

### 1. Detecting Outliers
The primary function of the tool detects outliers in (x, y) datasets using Cook's Distance. To run outlier detection on a CSV file, use:

```python
from detect_outliers import detect_outliers

file_path = "data.csv"
outlier_data = detect_outliers(file_path, threshold=0.5, output_file="output_with_outliers.csv")
```

### 2. Summarizing Outlier Results

```python
from detect_outliers import summarize_outliers

summary = summarize_outliers(outlier_data)
print(summary)
```

### 3. Cleaning Data

```python
from util_outlier_detection import clean_data

cleaned_data = clean_data("data.csv", output_file="cleaned_data.csv")
```

### 4. Feature Scaling

```python
from detect_outliers import scale_features

scaled_data = scale_features(cleaned_data, ["x", "y"])
```

### 5. Sorting Data

```python
from util_outlier_detection import sort_data

sorted_data = sort_data(cleaned_data, column="x", ascending=True)
```

----

## Example Dataset
To test the tool, use a simple CSV file (`data.csv`) structured as follows:

```
x,y
1,2
2,4
3,6
4,8
5,10
100,500
```

This dataset contains five points on the line y = 2x and one point (`100,500`) that deviates sharply from that trend. Running the tool will flag `100,500` as an influential data point. Note that a point such as (100, 200), which lies exactly on the fitted line, has a zero residual and therefore a Cook's distance of zero; high leverage alone is not enough to be flagged. (The bundled `data/data.csv` omits the `x,y` header row, so add it before running the tool on that file.)

----

## Contributing
We welcome contributions that enhance the functionality of the **Outlier Detection Tool**. If you would like to contribute:

1. Fork the repository.
2. Create a new feature branch.
3. Implement and test your changes.
4. Submit a pull request for review.

For major changes, please open an issue first to discuss your proposed modifications.

----

## License
This project is licensed under the **MIT License**.

----

## Author
**Yisong Chen**
For inquiries or collaborations, please reach out via GitHub or email.
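----

## How It Works (Illustrative)

For a regression with `p` parameters, Cook's distance for observation `i` combines its residual and its leverage: `D_i = (t_i^2 / p) * (h_ii / (1 - h_ii))`, where `t_i` is the internally studentized residual and `h_ii` is the leverage. The snippet below is an illustrative sketch, not part of the package itself; it shows the `statsmodels` calls that `detect_outliers` automates internally:

```python
import pandas as pd
import statsmodels.api as sm

# Minimal sketch of the diagnostic that detect_outliers() wraps.
df = pd.DataFrame({"x": [1, 2, 3, 4, 5, 100], "y": [2, 4, 6, 8, 10, 500]})
model = sm.OLS(df["y"], sm.add_constant(df["x"])).fit()
cooks, _ = model.get_influence().cooks_distance  # distances and p-values
df["outlier"] = cooks > 0.5
print(df[["x", "y", "outlier"]])
```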
137 | 138 | -------------------------------------------------------------------------------- /requirements/.gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /requirements/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas>=1.0.0 2 | numpy>=1.18.0 3 | statsmodels>=0.13.0 4 | -------------------------------------------------------------------------------- /src/.gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/data_analysis.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import logging 3 | 4 | logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") 5 | 6 | def calculate_statistics(data, column): 7 | """ 8 | Calculate basic statistics (mean, median, standard deviation) for a specified column. 9 | 10 | Args: 11 | data (pd.DataFrame): DataFrame containing the data. 12 | column (str): Name of the column to calculate statistics for. 13 | 14 | Returns: 15 | dict: Dictionary containing mean, median, and standard deviation. 16 | """ 17 | if column not in data.columns: 18 | logging.error(f"Column '{column}' not found in the data.") 19 | raise ValueError(f"Column '{column}' does not exist in the DataFrame.") 20 | 21 | logging.info(f"Calculating statistics for column: {column}") 22 | 23 | try: 24 | mean = data[column].mean() 25 | median = data[column].median() 26 | std_dev = data[column].std() 27 | 28 | stats = { 29 | "mean": mean, 30 | "median": median, 31 | "std_dev": std_dev 32 | } 33 | 34 | logging.info(f"Statistics for column '{column}': {stats}") 35 | return stats 36 | 37 | except Exception as e: 38 | logging.error(f"Error while calculating statistics: {e}") 39 | raise 40 | 41 | def detect_missing_values(data): 42 | """ 43 | Detect missing values in each column of the DataFrame. 44 | 45 | Args: 46 | data (pd.DataFrame): DataFrame containing the data. 47 | 48 | Returns: 49 | dict: Dictionary containing the count of missing values per column. 50 | """ 51 | logging.info("Detecting missing values in the dataset...") 52 | try: 53 | missing_values = data.isnull().sum().to_dict() 54 | logging.info(f"Missing values per column: {missing_values}") 55 | return missing_values 56 | except Exception as e: 57 | logging.error(f"Error while detecting missing values: {e}") 58 | raise 59 | 60 | def normalize_column(data, column): 61 | """ 62 | Normalize a specified column using min-max normalization. 63 | 64 | Args: 65 | data (pd.DataFrame): DataFrame containing the data. 66 | column (str): Name of the column to normalize. 67 | 68 | Returns: 69 | pd.DataFrame: DataFrame with the normalized column. 
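    Example (illustrative):
        >>> df = pd.DataFrame({"x": [1, 2, 3, 4]})
        >>> normalize_column(df, "x")["x"].tolist()
        [0.0, 0.3333333333333333, 0.6666666666666666, 1.0]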
70 | """ 71 | if column not in data.columns: 72 | logging.error(f"Column '{column}' not found in the data.") 73 | raise ValueError(f"Column '{column}' does not exist in the DataFrame.") 74 | 75 | logging.info(f"Normalizing column: {column}") 76 | try: 77 | min_val = data[column].min() 78 | max_val = data[column].max() 79 | data[column] = (data[column] - min_val) / (max_val - min_val) 80 | logging.info(f"Column '{column}' normalized successfully.") 81 | return data 82 | except Exception as e: 83 | logging.error(f"Error while normalizing column '{column}': {e}") 84 | raise 85 | def replace_missing_values(data, column, method="mean"): 86 | """ 87 | Replace missing values in a specified column using mean, median, or mode. 88 | 89 | Args: 90 | data (pd.DataFrame): DataFrame containing the data. 91 | column (str): Name of the column to process. 92 | method (str): Strategy to replace missing values (options: "mean", "median", "mode"). 93 | 94 | Returns: 95 | pd.DataFrame: DataFrame with missing values replaced. 96 | """ 97 | if column not in data.columns: 98 | logging.error(f"Column '{column}' not found in the data.") 99 | raise ValueError(f"Column '{column}' does not exist in the DataFrame.") 100 | 101 | logging.info(f"Replacing missing values in column: {column} using {method} method.") 102 | 103 | try: 104 | if method == "mean": 105 | data[column].fillna(data[column].mean(), inplace=True) 106 | elif method == "median": 107 | data[column].fillna(data[column].median(), inplace=True) 108 | elif method == "mode": 109 | data[column].fillna(data[column].mode()[0], inplace=True) 110 | else: 111 | raise ValueError("Method should be 'mean', 'median', or 'mode'.") 112 | 113 | logging.info(f"Missing values in column '{column}' replaced successfully.") 114 | return data 115 | except Exception as e: 116 | logging.error(f"Error while replacing missing values in column '{column}': {e}") 117 | raise 118 | 119 | def compute_correlation_matrix(data): 120 | """ 121 | Compute the correlation matrix for numerical columns in the DataFrame. 122 | 123 | Args: 124 | data (pd.DataFrame): DataFrame containing the data. 125 | 126 | Returns: 127 | pd.DataFrame: Correlation matrix. 128 | """ 129 | logging.info("Computing correlation matrix for numerical columns...") 130 | try: 131 | correlation_matrix = data.corr() 132 | logging.info("Correlation matrix computed successfully.") 133 | return correlation_matrix 134 | except Exception as e: 135 | logging.error(f"Error while computing correlation matrix: {e}") 136 | raise 137 | -------------------------------------------------------------------------------- /src/detect_outliers.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import statsmodels.api as sm 4 | import os 5 | import logging 6 | 7 | # Configure logging 8 | logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") 9 | 10 | def detect_outliers(file_path, threshold=0.5, output_file="output_with_outliers.csv"): 11 | """ 12 | Detect outliers in (x, y) data using Cook's distance. 13 | 14 | Args: 15 | file_path (str): Path to the CSV file containing 'x' and 'y' data. 16 | threshold (float): Threshold for Cook's distance to flag outliers. 17 | output_file (str): Path to save the processed file with outliers flagged. 18 | 19 | Returns: 20 | pd.DataFrame: DataFrame with calculated Cook's distance and outlier flags. 
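    Example (illustrative; assumes "data.csv" holds 'x' and 'y' columns):
        >>> flagged = detect_outliers("data.csv", threshold=0.5)
        >>> flagged[flagged["outlier"]]  # rows whose Cook's distance exceeds 0.5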
    """
    logging.info(f"Starting outlier detection for file: {file_path}")

    if not os.path.exists(file_path):
        logging.error(f"File not found: {file_path}")
        raise FileNotFoundError(f"The file {file_path} does not exist.")

    data = pd.read_csv(file_path)
    logging.info("Data successfully loaded.")

    if 'x' not in data.columns or 'y' not in data.columns:
        logging.error("Missing required columns 'x' and 'y' in the input data.")
        raise ValueError("The input CSV file must contain 'x' and 'y' columns.")

    # Prepare the data for regression
    X = sm.add_constant(data['x'])
    y = data['y']

    logging.info("Fitting regression model...")
    model = sm.OLS(y, X).fit()

    # Calculate Cook's distance and influence metrics
    logging.info("Calculating Cook's distance...")
    influence = model.get_influence()
    cooks = influence.cooks_distance[0]

    # Add results to the DataFrame
    data['cooks_distance'] = cooks
    data['outlier'] = data['cooks_distance'] > threshold

    # Save output to a CSV file
    if output_file:
        data.to_csv(output_file, index=False)
        logging.info(f"Results saved to: {output_file}")

    return data


def summarize_outliers(data):
    """
    Summarize the outlier detection results.

    Args:
        data (pd.DataFrame): DataFrame containing the outlier flags.

    Returns:
        dict: Summary statistics including count of outliers and non-outliers.
    """
    logging.info("Summarizing outlier detection results...")
    total_points = len(data)
    outlier_count = data['outlier'].sum()
    non_outlier_count = total_points - outlier_count

    summary = {
        "total_points": total_points,
        "outliers": outlier_count,
        "non_outliers": non_outlier_count,
        "outlier_percentage": (outlier_count / total_points) * 100,
    }

    logging.info("Summary generated successfully.")
    return summary


def scale_features(data, columns):
    """
    Scale specified numerical columns to a 0-1 range.

    Args:
        data (pd.DataFrame): DataFrame containing the data.
        columns (list): List of column names to scale.

    Returns:
        pd.DataFrame: DataFrame with scaled columns.
    """
    logging.info("Scaling specified features to a 0-1 range...")
    try:
        data = data.copy()
        for col in columns:
            if col in data.columns:
                min_val = data[col].min()
                max_val = data[col].max()
                data[col] = (data[col] - min_val) / (max_val - min_val)
                logging.info(f"Column '{col}' scaled successfully.")
            else:
                logging.warning(f"Column '{col}' not found in the data.")
        return data
    except Exception as e:
        logging.error(f"Error during feature scaling: {e}")
        raise


def main(input_file, threshold=0.5, output_file="output_with_outliers.csv"):
    """
    Main function to detect and summarize outliers.

    Args:
        input_file (str): Path to the input CSV file.
        threshold (float): Threshold for Cook's distance to flag outliers.
        output_file (str): Path to save the output CSV file.
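    Example (illustrative):
        >>> main("data.csv", threshold=0.5, output_file="output_with_outliers.csv")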
130 | """ 131 | try: 132 | # Detect outliers 133 | processed_data = detect_outliers(input_file, threshold, output_file) 134 | 135 | # Summarize results 136 | summary = summarize_outliers(processed_data) 137 | 138 | # Display summary 139 | logging.info("Outlier Detection Completed") 140 | logging.info("Summary Statistics:") 141 | for key, value in summary.items(): 142 | logging.info(f"{key.capitalize()}: {value}") 143 | 144 | # Scale features for further analysis 145 | scaled_data = scale_features(processed_data, ['x', 'y']) 146 | logging.info("Feature scaling completed. Preview of scaled data:") 147 | logging.info(scaled_data.head()) 148 | 149 | except Exception as e: 150 | logging.error(f"Error occurred: {e}") 151 | 152 | def remove_missing_values(data): 153 | """ 154 | Remove rows with missing values from the dataset. 155 | 156 | Args: 157 | data (pd.DataFrame): DataFrame containing the data. 158 | 159 | Returns: 160 | pd.DataFrame: Cleaned DataFrame with missing values removed. 161 | """ 162 | logging.info("Removing rows with missing values...") 163 | cleaned_data = data.dropna() 164 | logging.info(f"Removed {len(data) - len(cleaned_data)} rows with missing values.") 165 | return cleaned_data 166 | 167 | 168 | def detect_high_variance_features(data, threshold=1.0): 169 | """ 170 | Identify columns with high variance exceeding a given threshold. 171 | 172 | Args: 173 | data (pd.DataFrame): DataFrame containing the data. 174 | threshold (float): Variance threshold to identify high variance features. 175 | 176 | Returns: 177 | list: List of column names with variance above the threshold. 178 | """ 179 | logging.info("Detecting high variance features...") 180 | high_variance_features = [col for col in data.select_dtypes(include=[np.number]).columns 181 | if data[col].var() > threshold] 182 | logging.info(f"High variance features detected: {high_variance_features}") 183 | return high_variance_features 184 | 185 | def flag_extreme_values(data, column, z_threshold=3): 186 | """ 187 | Flag extreme values in a column using the Z-score method. 188 | 189 | Args: 190 | data (pd.DataFrame): DataFrame containing the data. 191 | column (str): Column name to evaluate. 192 | z_threshold (float): Z-score threshold for identifying extreme values. 193 | 194 | Returns: 195 | pd.DataFrame: DataFrame with an additional boolean column indicating extreme values. 
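    Example (illustrative):
        >>> df = flag_extreme_values(df, "y", z_threshold=3)
        >>> df["y_extreme"].sum()  # number of rows flagged as extreme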
    """
    logging.info(f"Flagging extreme values in column '{column}' using Z-score threshold: {z_threshold}")

    if column not in data.columns:
        logging.error(f"Column '{column}' not found in the dataset.")
        raise ValueError(f"Column '{column}' does not exist in the DataFrame.")

    try:
        mean = data[column].mean()
        std = data[column].std()
        z_scores = (data[column] - mean) / std
        data[f"{column}_extreme"] = np.abs(z_scores) > z_threshold
        logging.info(f"Extreme values flagged in column '{column}'.")
        return data
    except Exception as e:
        logging.error(f"Error while flagging extreme values in column '{column}': {e}")
        raise


if __name__ == "__main__":
    # Example usage
    input_file = "data.csv"
    output_file = "output_with_outliers.csv"
    threshold = 0.5  # Adjust threshold as needed

    main(input_file, threshold, output_file)
--------------------------------------------------------------------------------
/src/normalize_columns:
--------------------------------------------------------------------------------
# updated 06/03/2025

import pandas as pd
import logging

# Configure logging (matches the other modules in src/)
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

def normalize_columns(data, columns):
    """
    Normalize specified numerical columns to have a mean of 0 and a standard
    deviation of 1 (i.e., z-score standardization).

    Args:
        data (pd.DataFrame): DataFrame containing the data.
        columns (list): List of column names to normalize.

    Returns:
        pd.DataFrame: DataFrame with normalized columns.
    """
    logging.info("Normalizing specified columns to mean=0 and std=1...")
    try:
        data = data.copy()
        for col in columns:
            if col in data.columns:
                mean = data[col].mean()
                std = data[col].std()
                data[col] = (data[col] - mean) / std
                logging.info(f"Column '{col}' normalized successfully.")
            else:
                logging.warning(f"Column '{col}' not found in the data.")
        return data
    except Exception as e:
        logging.error(f"Error during column normalization: {e}")
        raise

def standardize_columns(data, columns):
    """
    Standardize specified numerical columns to a 0-1 range (i.e., min-max scaling).

    Args:
        data (pd.DataFrame): DataFrame containing the data.
        columns (list): List of column names to standardize.

    Returns:
        pd.DataFrame: DataFrame with standardized columns.
    """
    logging.info("Standardizing specified columns to range 0-1...")
    try:
        data = data.copy()
        for col in columns:
            if col in data.columns:
                min_val = data[col].min()
                max_val = data[col].max()
                data[col] = (data[col] - min_val) / (max_val - min_val)
                logging.info(f"Column '{col}' standardized successfully.")
            else:
                logging.warning(f"Column '{col}' not found in the data.")
        return data
    except Exception as e:
        logging.error(f"Error during column standardization: {e}")
        raise

def remove_outliers(data, columns, threshold=3):
    """
    Remove outliers in specified numerical columns using a Z-score threshold.

    Args:
        data (pd.DataFrame): DataFrame containing the data.
        columns (list): List of column names to check for outliers.
        threshold (float): Z-score threshold to identify outliers (default=3).

    Returns:
        pd.DataFrame: DataFrame with outliers removed.
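    Example (illustrative):
        >>> trimmed = remove_outliers(df, ["x", "y"], threshold=3)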
    """
    logging.info(f"Removing outliers using Z-score threshold of {threshold}...")
    try:
        data = data.copy()
        for col in columns:
            if col in data.columns:
                mean = data[col].mean()
                std = data[col].std()
                z_scores = (data[col] - mean) / std
                data = data[abs(z_scores) <= threshold]
                logging.info(f"Outliers removed from column '{col}'.")
            else:
                logging.warning(f"Column '{col}' not found in the data.")
        return data
    except Exception as e:
        logging.error(f"Error during outlier removal: {e}")
        raise

def compute_summary_statistics(data, columns):
    """
    Compute summary statistics (mean, median, std, min, max) for specified numerical columns.

    Args:
        data (pd.DataFrame): DataFrame containing the data.
        columns (list): List of column names to compute statistics for.

    Returns:
        pd.DataFrame: DataFrame with computed summary statistics.
    """
    logging.info("Computing summary statistics...")
    try:
        stats = data[columns].describe().transpose()
        logging.info("Summary statistics computed successfully.")
        return stats
    except Exception as e:
        logging.error(f"Error computing summary statistics: {e}")
        raise

def fill_missing_values(data, columns, strategy="mean"):
    """
    Fill missing values in specified columns using a chosen strategy (mean, median, or mode).

    Args:
        data (pd.DataFrame): DataFrame containing the data.
        columns (list): List of column names to fill missing values.
        strategy (str): Strategy to fill missing values ("mean", "median", "mode").

    Returns:
        pd.DataFrame: DataFrame with missing values filled.
    """
    logging.info(f"Filling missing values using strategy: {strategy}")
    try:
        data = data.copy()
        for col in columns:
            if col in data.columns:
                if strategy == "mean":
                    data[col].fillna(data[col].mean(), inplace=True)
                elif strategy == "median":
                    data[col].fillna(data[col].median(), inplace=True)
                elif strategy == "mode":
                    data[col].fillna(data[col].mode()[0], inplace=True)
                logging.info(f"Missing values filled for column '{col}' using {strategy} strategy.")
            else:
                logging.warning(f"Column '{col}' not found in the data.")
        return data
    except Exception as e:
        logging.error(f"Error filling missing values: {e}")
        raise

def detect_constant_columns(data):
    """
    Detect columns with a constant value across all rows.

    Args:
        data (pd.DataFrame): DataFrame containing the data.

    Returns:
        list: List of column names that have a constant value.
    """
    logging.info("Detecting constant columns...")
    try:
        constant_cols = [col for col in data.columns if data[col].nunique() == 1]
        logging.info(f"Constant columns found: {constant_cols}")
        return constant_cols
    except Exception as e:
        logging.error(f"Error detecting constant columns: {e}")
        raise

def convert_columns_to_numeric(data, columns):
    """
    Convert specified columns to numeric type, coercing errors to NaN.

    Args:
        data (pd.DataFrame): DataFrame containing the data.
        columns (list): List of column names to convert.

    Returns:
        pd.DataFrame: DataFrame with specified columns converted to numeric.
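    Example (illustrative):
        >>> df = convert_columns_to_numeric(df, ["x", "y"])  # unparseable entries become NaN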
    """
    logging.info("Converting specified columns to numeric type...")
    try:
        data = data.copy()
        for col in columns:
            if col in data.columns:
                data[col] = pd.to_numeric(data[col], errors='coerce')
                logging.info(f"Column '{col}' converted to numeric.")
            else:
                logging.warning(f"Column '{col}' not found in the data.")
        return data
    except Exception as e:
        logging.error(f"Error converting columns to numeric: {e}")
        raise

def detect_highly_correlated_columns(data, threshold=0.9):
    """
    Detect pairs of numerical columns with correlation above a specified threshold.

    Args:
        data (pd.DataFrame): DataFrame containing the data.
        threshold (float): Correlation coefficient threshold (default is 0.9).

    Returns:
        list of tuple: List of column name pairs that are highly correlated.
    """
    logging.info(f"Detecting highly correlated column pairs with threshold > {threshold}")
    try:
        corr_matrix = data.corr().abs()
        correlated_pairs = []

        for i in range(len(corr_matrix.columns)):
            for j in range(i + 1, len(corr_matrix.columns)):
                if corr_matrix.iloc[i, j] > threshold:
                    col1 = corr_matrix.columns[i]
                    col2 = corr_matrix.columns[j]
                    correlated_pairs.append((col1, col2))
                    logging.info(f"High correlation detected: {col1} and {col2} -> {corr_matrix.iloc[i, j]}")

        return correlated_pairs
    except Exception as e:
        logging.error(f"Error detecting highly correlated columns: {e}")
        raise

def rename_columns(data, column_mapping):
    """
    Rename columns in the DataFrame using a provided mapping.

    Args:
        data (pd.DataFrame): DataFrame containing the data.
        column_mapping (dict): Dictionary mapping old column names to new names.

    Returns:
        pd.DataFrame: DataFrame with renamed columns.
    """
    logging.info(f"Renaming columns: {column_mapping}")
    try:
        data = data.copy()
        data.rename(columns=column_mapping, inplace=True)
        logging.info("Columns renamed successfully.")
        return data
    except Exception as e:
        logging.error(f"Error renaming columns: {e}")
        raise

def detect_duplicate_rows(data, drop=False):
    """
    Detect duplicate rows in the DataFrame and optionally remove them.

    Args:
        data (pd.DataFrame): DataFrame containing the data.
        drop (bool): If True, return the DataFrame without duplicates; if False, just report them.

    Returns:
        pd.DataFrame: If drop is True, the DataFrame without duplicates;
                      if False, a DataFrame containing only the duplicate rows.
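    Example (illustrative):
        >>> detect_duplicate_rows(df)             # returns the duplicated rows
        >>> detect_duplicate_rows(df, drop=True)  # returns df with duplicates dropped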
241 | 242 | """ 243 | logging.info("Checking for duplicate rows...") 244 | try: 245 | duplicates = data.duplicated() 246 | duplicate_count = duplicates.sum() 247 | logging.info(f"Found {duplicate_count} duplicate rows.") 248 | 249 | if drop: 250 | data_no_duplicates = data.drop_duplicates() 251 | logging.info("Duplicate rows removed.") 252 | return data_no_duplicates 253 | else: 254 | return data[duplicates] 255 | except Exception as e: 256 | logging.error(f"Error detecting duplicate rows: {e}") 257 | raise 258 | 259 | 260 | 261 | -------------------------------------------------------------------------------- /src/util_outlier_detection: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import statsmodels.api as sm 4 | import os 5 | import logging 6 | 7 | # Configure logging 8 | logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") 9 | 10 | def detect_outliers(file_path, threshold=0.5, output_file="output_with_outliers.csv"): 11 | """ 12 | Detect outliers in (x, y) data using Cook's distance. 13 | 14 | Args: 15 | file_path (str): Path to the CSV file containing 'x' and 'y' data. 16 | threshold (float): Threshold for Cook's distance to flag outliers. 17 | output_file (str): Path to save the processed file with outliers flagged. 18 | 19 | Returns: 20 | pd.DataFrame: DataFrame with calculated Cook's distance and outlier flags. 21 | """ 22 | logging.info(f"Starting outlier detection for file: {file_path}") 23 | 24 | if not os.path.exists(file_path): 25 | logging.error(f"File not found: {file_path}") 26 | raise FileNotFoundError(f"The file {file_path} does not exist.") 27 | 28 | data = pd.read_csv(file_path) 29 | logging.info("Data successfully loaded.") 30 | 31 | if 'x' not in data.columns or 'y' not in data.columns: 32 | logging.error("Missing required columns 'x' and 'y' in the input data.") 33 | raise ValueError("The input CSV file must contain 'x' and 'y' columns.") 34 | 35 | # Prepare the data for regression 36 | X = sm.add_constant(data['x']) 37 | y = data['y'] 38 | 39 | logging.info("Fitting regression model...") 40 | model = sm.OLS(y, X).fit() 41 | 42 | # Calculate Cook's distance and influence metrics 43 | logging.info("Calculating Cook's distance...") 44 | influence = model.get_influence() 45 | cooks = influence.cooks_distance[0] 46 | 47 | # Add results to the DataFrame 48 | data['cooks_distance'] = cooks 49 | data['outlier'] = data['cooks_distance'] > threshold 50 | 51 | # Save output to a CSV file 52 | if output_file: 53 | data.to_csv(output_file, index=False) 54 | logging.info(f"Results saved to: {output_file}") 55 | 56 | return data 57 | 58 | 59 | def summarize_outliers(data): 60 | """ 61 | Summarize the outlier detection results. 62 | 63 | Args: 64 | data (pd.DataFrame): DataFrame containing the outlier flags. 65 | 66 | Returns: 67 | dict: Summary statistics including count of outliers and non-outliers. 
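    Example (illustrative; expects an 'outlier' boolean column):
        >>> summary = summarize_outliers(flagged_data)
        >>> summary["outlier_percentage"]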
68 | """ 69 | logging.info("Summarizing outlier detection results...") 70 | total_points = len(data) 71 | outlier_count = data['outlier'].sum() 72 | non_outlier_count = total_points - outlier_count 73 | 74 | summary = { 75 | "total_points": total_points, 76 | "outliers": outlier_count, 77 | "non_outliers": non_outlier_count, 78 | "outlier_percentage": (outlier_count / total_points) * 100, 79 | } 80 | 81 | logging.info("Summary generated successfully.") 82 | return summary 83 | 84 | 85 | def add_random_data(data, num_rows=100): 86 | """ 87 | Add random data to an existing DataFrame. 88 | 89 | Args: 90 | data (pd.DataFrame): Original DataFrame to which random data will be added. 91 | num_rows (int): Number of random rows to add. 92 | 93 | Returns: 94 | pd.DataFrame: DataFrame with added random data. 95 | """ 96 | logging.info(f"Adding {num_rows} random rows to the data.") 97 | random_data = pd.DataFrame({ 98 | 'x': np.random.rand(num_rows), 99 | 'y': np.random.rand(num_rows), 100 | 'cooks_distance': np.nan, 101 | 'outlier': False 102 | }) 103 | return pd.concat([data, random_data], ignore_index=True) 104 | 105 | 106 | def remove_duplicates(data): 107 | """ 108 | Remove duplicate rows from the dataset. 109 | 110 | Args: 111 | data (pd.DataFrame): DataFrame containing the data. 112 | 113 | Returns: 114 | pd.DataFrame: DataFrame without duplicate rows. 115 | """ 116 | logging.info("Removing duplicate rows...") 117 | cleaned_data = data.drop_duplicates() 118 | logging.info(f"Removed {len(data) - len(cleaned_data)} duplicate rows.") 119 | return cleaned_data 120 | 121 | 122 | def calculate_correlation(data): 123 | """ 124 | Calculate the correlation matrix for numerical columns in the dataset. 125 | 126 | Args: 127 | data (pd.DataFrame): DataFrame containing the data. 128 | 129 | Returns: 130 | pd.DataFrame: Correlation matrix of numerical columns. 131 | """ 132 | logging.info("Calculating correlation matrix...") 133 | correlation_matrix = data.corr() 134 | logging.info("Correlation matrix calculated successfully.") 135 | return correlation_matrix 136 | 137 | 138 | def main(input_file, threshold=0.5, output_file="output_with_outliers.csv"): 139 | """ 140 | Main function to detect and summarize outliers. 141 | 142 | Args: 143 | input_file (str): Path to the input CSV file. 144 | threshold (float): Threshold for Cook's distance to flag outliers. 145 | output_file (str): Path to save the output CSV file. 
    """
    try:
        # Detect outliers
        processed_data = detect_outliers(input_file, threshold, output_file)

        # Summarize results
        summary = summarize_outliers(processed_data)

        # Remove duplicate rows
        processed_data = remove_duplicates(processed_data)

        # Calculate correlation matrix
        correlation_matrix = calculate_correlation(processed_data)
        logging.info("Correlation matrix:")
        logging.info(f"{correlation_matrix}")

        # Save the deduplicated data
        processed_data.to_csv(output_file, index=False)

        # Display summary
        logging.info("Outlier Detection Completed")
        logging.info("Summary Statistics:")
        for key, value in summary.items():
            logging.info(f"{key.capitalize()}: {value}")

    except Exception as e:
        logging.error(f"Error occurred: {e}")


if __name__ == "__main__":
    # Example usage
    input_file = "data.csv"
    output_file = "output_with_outliers.csv"
    threshold = 0.5  # Adjust threshold as needed

    main(input_file, threshold, output_file)
--------------------------------------------------------------------------------
/tests/.gitkeep:
--------------------------------------------------------------------------------


--------------------------------------------------------------------------------
/tests/test_detect_outliers.py:
--------------------------------------------------------------------------------
import os

import pandas as pd
from detect_outliers import detect_outliers, summarize_outliers  # assumes the src/ directory is on PYTHONPATH

def test_detect_outliers_basic():
    """
    Test the basic functionality of detecting outliers in a small dataset.
    """
    # Create a sample dataset; the last point deviates sharply from the y = 2x trend.
    # (A point lying exactly on the fitted line has a zero residual and is never flagged.)
    data = pd.DataFrame({"x": [1, 2, 3, 4, 5, 100], "y": [2, 4, 6, 8, 10, 500]})
    test_file = "test_data.csv"
    data.to_csv(test_file, index=False)

    # Run outlier detection
    result = detect_outliers(test_file, threshold=0.5)

    # Assertions
    assert 'cooks_distance' in result.columns, "Cook's distance column is missing."
    assert 'outlier' in result.columns, "Outlier flag column is missing."
    assert result['outlier'].iloc[-1], "The last row should be flagged as an outlier."
    assert not result['outlier'].iloc[:-1].any(), "Non-outlier rows incorrectly flagged."

    # Clean up
    os.remove(test_file)
    print("test_detect_outliers_basic passed.")

def test_summarize_outliers():
    """
    Test the summarize_outliers function for correct statistical output.
    """
    data = pd.DataFrame({
        "x": [1, 2, 3, 4, 5, 100],
        "y": [2, 4, 6, 8, 10, 200],
        "cooks_distance": [0.001, 0.002, 0.003, 0.004, 0.005, 0.8],
        "outlier": [False, False, False, False, False, True],
    })

    summary = summarize_outliers(data)

    assert summary["total_points"] == 6, "Total points calculation is incorrect."
    assert summary["outliers"] == 1, "Outlier count calculation is incorrect."
    assert summary["non_outliers"] == 5, "Non-outlier count calculation is incorrect."
    assert summary["outlier_percentage"] == (1 / 6) * 100, "Outlier percentage is incorrect."

    print("test_summarize_outliers passed.")

def test_missing_file():
    """
    Test behavior when a non-existent file is provided.
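    detect_outliers is expected to raise FileNotFoundError with a message
    beginning "The file" when the path does not exist.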
    """
    try:
        detect_outliers("non_existent_file.csv")
    except FileNotFoundError as e:
        assert str(e).startswith("The file"), "FileNotFoundError not raised correctly."
        print("test_missing_file passed.")
    else:
        raise AssertionError("FileNotFoundError was not raised as expected.")

def test_invalid_columns():
    """
    Test behavior when input data does not contain required columns.
    """
    data = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    test_file = "test_invalid_columns.csv"
    data.to_csv(test_file, index=False)

    try:
        detect_outliers(test_file)
    except ValueError as e:
        assert str(e).startswith("The input CSV file must contain"), "ValueError not raised correctly for missing columns."
        print("test_invalid_columns passed.")
    else:
        raise AssertionError("ValueError was not raised as expected.")
    finally:
        # Clean up even if the expected error was not raised
        os.remove(test_file)

def test_no_outliers_detected():
    """
    Test behavior when no outliers are present in the dataset.
    """
    # Points close to, but not exactly on, the line y = 2x; with an exact fit the
    # residuals are all zero and Cook's distance is numerically undefined.
    data = pd.DataFrame({
        "x": list(range(1, 11)),
        "y": [2.1, 3.9, 6.2, 8.1, 9.8, 12.2, 13.9, 16.1, 18.0, 20.1],
    })
    test_file = "test_no_outliers.csv"
    data.to_csv(test_file, index=False)

    # Run outlier detection
    result = detect_outliers(test_file, threshold=1.0)  # Set a high threshold to avoid outliers

    # Assertions
    assert 'cooks_distance' in result.columns, "Cook's distance column is missing."
    assert 'outlier' in result.columns, "Outlier flag column is missing."
    assert not result['outlier'].any(), "No rows should be flagged as outliers."

    # Clean up
    os.remove(test_file)
    print("test_no_outliers_detected passed.")

def test_outlier_removal_effect():
    """
    Test whether removing outliers changes the dataset size correctly.
    """
    data = pd.DataFrame({
        "x": [1, 2, 3, 4, 5, 100],
        "y": [2, 4, 6, 8, 10, 200],
        "outlier": [False, False, False, False, False, True]
    })

    data_filtered = data[~data['outlier']]

    assert len(data_filtered) == 5, "Outlier removal did not adjust dataset size correctly."
    print("test_outlier_removal_effect passed.")

def test_outlier_summary_consistency():
    """
    Test if summary statistics remain consistent before and after removing outliers.
    """
    data = pd.DataFrame({
        "x": [1, 2, 3, 4, 5, 100],
        "y": [2, 4, 6, 8, 10, 200],
        "outlier": [False, False, False, False, False, True]
    })

    summary_before = summarize_outliers(data)
    data_filtered = data[~data['outlier']]
    summary_after = summarize_outliers(data_filtered)

    assert summary_before["outliers"] == 1, "Initial summary miscounts outliers."
    assert summary_after["outliers"] == 0, "Filtered summary should not contain outliers."
    print("test_outlier_summary_consistency passed.")

def test_outlier_percentage_consistency():
    """
    Test if outlier percentage calculation remains consistent after filtering.
    """
    data = pd.DataFrame({
        "x": range(1, 21),
        "y": range(2, 42, 2),
        "outlier": [False] * 18 + [True, True]
    })

    summary_before = summarize_outliers(data)
    filtered_data = data[~data["outlier"]]
    summary_after = summarize_outliers(filtered_data)

    assert summary_before["outlier_percentage"] > summary_after["outlier_percentage"], "Outlier percentage should decrease after filtering."
    print("test_outlier_percentage_consistency passed.")

def test_large_dataset_performance():
    """
    Test the performance of the outlier detection function with a large dataset.
    """
    data = pd.DataFrame({
        "x": range(1, 10001),
        "y": range(2, 20002, 2)
    })

    test_file = "large_test_data.csv"
    data.to_csv(test_file, index=False)

    try:
        result = detect_outliers(test_file, threshold=0.5)
        assert len(result) == 10000, "The result should contain the same number of rows as the input."
        print("test_large_dataset_performance passed.")
    finally:
        os.remove(test_file)


if __name__ == "__main__":
    test_detect_outliers_basic()
    test_summarize_outliers()
    test_missing_file()
    test_invalid_columns()
    test_no_outliers_detected()
    test_outlier_removal_effect()
    test_outlier_summary_consistency()
    test_outlier_percentage_consistency()
    test_large_dataset_performance()
    print("All tests passed successfully.")
--------------------------------------------------------------------------------
/util/util_outlier_detection.py:
--------------------------------------------------------------------------------

import pandas as pd
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

def clean_data(file_path, output_file="cleaned_data.csv"):
    """
    Cleans the input CSV data by removing duplicates and handling missing values.

    Args:
        file_path (str): Path to the input CSV file.
        output_file (str): Path to save the cleaned data.

    Returns:
        pd.DataFrame: Cleaned DataFrame.
    """
    logging.info(f"Starting data cleaning for file: {file_path}")

    try:
        data = pd.read_csv(file_path)
        logging.info("Data successfully loaded.")

        # Remove duplicates
        data = data.drop_duplicates()
        logging.info("Duplicates removed.")

        # Handle missing values
        data = data.dropna()
        logging.info("Missing values removed.")

        # Save cleaned data
        if output_file:
            data.to_csv(output_file, index=False)
            logging.info(f"Cleaned data saved to: {output_file}")

        return data

    except Exception as e:
        logging.error(f"Error occurred during data cleaning: {e}")
        raise

def filter_data(data, column, threshold):
    """
    Filters data based on a threshold value for a specific column.

    Args:
        data (pd.DataFrame): DataFrame containing the data.
        column (str): Column name to apply the threshold.
        threshold (float): Threshold value to filter the data.

    Returns:
        pd.DataFrame: Filtered DataFrame.
    """
    logging.info(f"Filtering data where {column} >= {threshold}")
    try:
        filtered_data = data[data[column] >= threshold]
        logging.info(f"Filtered {len(data) - len(filtered_data)} rows below the threshold.")
        return filtered_data
    except Exception as e:
        logging.error(f"Error occurred during data filtering: {e}")
        raise

def convert_column_to_numeric(data, column):
    """
    Converts a specified column to numeric format, handling errors gracefully.

    Args:
        data (pd.DataFrame): DataFrame containing the data.
        column (str): Column name to convert to numeric.

    Returns:
        pd.DataFrame: DataFrame with the column converted to numeric.
    """
    logging.info(f"Converting column {column} to numeric format...")
    try:
        data[column] = pd.to_numeric(data[column], errors='coerce')
        logging.info(f"Column {column} converted to numeric successfully.")
        return data
    except Exception as e:
        logging.error(f"Error occurred during column conversion: {e}")
        raise

def rename_columns(data, column_mappings):
    """
    Rename columns in the DataFrame based on a given mapping.

    Args:
        data (pd.DataFrame): DataFrame containing the data.
        column_mappings (dict): Dictionary mapping old column names to new names.

    Returns:
        pd.DataFrame: DataFrame with renamed columns.
    """
    logging.info(f"Renaming columns: {column_mappings}")
    try:
        data = data.rename(columns=column_mappings)
        logging.info("Columns renamed successfully.")
        return data
    except Exception as e:
        logging.error(f"Error occurred during column renaming: {e}")
        raise

def sort_data(data, column, ascending=True):
    """
    Sort the DataFrame based on a specified column.

    Args:
        data (pd.DataFrame): DataFrame containing the data.
        column (str): Column name to sort by.
        ascending (bool): Whether to sort in ascending order (default=True).

    Returns:
        pd.DataFrame: Sorted DataFrame.
    """
    logging.info(f"Sorting data by column '{column}', ascending={ascending}")
    try:
        sorted_data = data.sort_values(by=column, ascending=ascending)
        logging.info("Data sorted successfully.")
        return sorted_data
    except Exception as e:
        logging.error(f"Error occurred during sorting: {e}")
        raise

def compute_unique_values(data, column):
    """
    Compute the number of unique values in a specified column.

    Args:
        data (pd.DataFrame): DataFrame containing the data.
        column (str): Column name to count unique values.

    Returns:
        int: Number of unique values in the column.
    """
    logging.info(f"Computing unique values in column '{column}'")
    try:
        unique_count = data[column].nunique()
        logging.info(f"Column '{column}' has {unique_count} unique values.")
        return unique_count
    except Exception as e:
        logging.error(f"Error occurred while computing unique values: {e}")
        raise

def detect_outliers_iqr(data, column, threshold=1.5):
    """
    Detects outliers in a numerical column using the Interquartile Range (IQR) method.

    Args:
        data (pd.DataFrame): DataFrame containing the data.
        column (str): Column name to check for outliers.
        threshold (float): Threshold multiplier for defining outliers (default is 1.5).

    Returns:
        pd.DataFrame: Subset of the original DataFrame containing only outlier rows.
    """
    logging.info(f"Detecting outliers in column '{column}' using IQR method.")
    try:
        Q1 = data[column].quantile(0.25)
        Q3 = data[column].quantile(0.75)
        IQR = Q3 - Q1

        lower_bound = Q1 - (threshold * IQR)
        upper_bound = Q3 + (threshold * IQR)

        outliers = data[(data[column] < lower_bound) | (data[column] > upper_bound)]
        logging.info(f"Detected {len(outliers)} outliers in column '{column}'.")

        return outliers
    except Exception as e:
        logging.error(f"Error occurred while detecting outliers: {e}")
        raise

def replace_missing_values(data, strategy="mean"):
    """
    Replaces missing values in numerical columns based on the specified strategy.

    Args:
        data (pd.DataFrame): DataFrame containing the data.
        strategy (str): Strategy for replacing missing values ('mean', 'median', 'mode').

    Returns:
        pd.DataFrame: DataFrame with missing values replaced.
    """
    logging.info(f"Replacing missing values using strategy: {strategy}")
    try:
        if strategy not in ["mean", "median", "mode"]:
            raise ValueError("Invalid strategy. Choose 'mean', 'median', or 'mode'.")

        if strategy == "mean":
            data = data.fillna(data.mean())
        elif strategy == "median":
            data = data.fillna(data.median())
        elif strategy == "mode":
            data = data.fillna(data.mode().iloc[0])

        logging.info("Missing values replaced successfully.")
        return data
    except Exception as e:
        logging.error(f"Error occurred while replacing missing values: {e}")
        raise
--------------------------------------------------------------------------------