├── schema
│   ├── __init__.py
│   ├── scenario.py
│   ├── error_type.py
│   ├── model.py
│   ├── dataset.py
│   └── clean_method.py
├── .gitignore
├── result.zip
├── analysis.zip
├── TechReport.pdf
├── DatasetDescriptions.pdf
├── requirements.txt
├── main.py
├── config.py
├── clean.py
├── init.py
├── experiment.py
├── inject.py
├── README.md
├── train.py
├── preprocess.py
├── relation.py
└── utils.py
/schema/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.csv 2 | *.npy 3 | *.pyc 4 | .DS_Store 5 | datasets/* 6 | */trash/* -------------------------------------------------------------------------------- /result.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chu-data-lab/CleanML/HEAD/result.zip -------------------------------------------------------------------------------- /analysis.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chu-data-lab/CleanML/HEAD/analysis.zip -------------------------------------------------------------------------------- /TechReport.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chu-data-lab/CleanML/HEAD/TechReport.pdf -------------------------------------------------------------------------------- /DatasetDescriptions.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chu-data-lab/CleanML/HEAD/DatasetDescriptions.pdf -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | imbalanced-learn==0.11.0 2 | matplotlib==3.3.4 3 | numpy==1.20.1 4 | pandas==1.3.4 5 | scikit-learn==1.3.0 6 | scipy==1.9.1 7 | xgboost==1.7.6 8 | statsmodels==0.12.2 9 | openpyxl==3.0.7 -------------------------------------------------------------------------------- /schema/scenario.py: -------------------------------------------------------------------------------- 1 | """Define the domain of scenarios""" 2 | from .error_type import * 3 | 4 | scenarios = { 5 | "missing_values":["CD"], 6 | "outliers":["BD", "CD"], 7 | "mislabel":["BD", "CD"], 8 | "inconsistency": ["BD", "CD"], 9 | "duplicates": ["BD", "CD"] 10 | } -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | """Main function""" 2 | 3 | import numpy as np 4 | import utils 5 | import json 6 | import argparse 7 | import datetime 8 | import time 9 | import config 10 | from experiment import experiment 11 | from relation import populate 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('--run_experiments', default=False, action='store_true') 15 | parser.add_argument('--run_analysis', default=False, action='store_true') 16 | parser.add_argument('--cpu', default=1, type=int) 17 | parser.add_argument('--error_type', default=None) 18 | parser.add_argument('--seeds', default=None, type=int, nargs='+') 19 | parser.add_argument('--log', default=False, action='store_true') 20 | parser.add_argument('--dataset', default=None) 21 | parser.add_argument('--nosave', default=False, action='store_true') 22
| parser.add_argument('--alpha', default=0.05, type=float) 23 | 24 | args = parser.parse_args() 25 | 26 | # run experiments on datasets 27 | if args.run_experiments: 28 | datasets = [utils.get_dataset(args.dataset)] if args.dataset is not None else config.datasets 29 | experiment(datasets, args.log, args.cpu, args.nosave, args.error_type, args.seeds) 30 | 31 | # run analysis on results 32 | if args.run_analysis: 33 | populate([args.alpha]) -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | """Configuration of experiment and schema""" 2 | from schema import dataset, model, error_type, clean_method, scenario 3 | 4 | # ============================================================================= 5 | # Directory Configuration 6 | # ============================================================================= 7 | data_dir = 'data' # dir storing data 8 | result_dir = 'result' # dir saving experiment results 9 | analysis_dir = 'analysis' # dir saving analysis results 10 | plot_dir = 'plot' # dir saving plots 11 | 12 | # ============================================================================= 13 | # Experiment Configuration 14 | # ============================================================================= 15 | root_seed = 1 # root seed for all experiments 16 | n_resplit = 20 # number of re-splits to account for split randomness 17 | n_retrain = 5 # number of re-trainings to account for random search randomness 18 | test_ratio = 0.3 # fraction of data held out as the test set 19 | max_size = 15000 # maximum data size for training 20 | 21 | # ============================================================================= 22 | # Schema Configuration 23 | # ============================================================================= 24 | datasets = dataset.datasets 25 | models = model.models 26 | error_types = error_type.error_types 27 | scenarios = scenario.scenarios -------------------------------------------------------------------------------- /schema/error_type.py: -------------------------------------------------------------------------------- 1 | # define the domain of error type 2 | from .clean_method import * 3 | 4 | # details of each error type 5 | missing_values = { 6 | "name": "missing_values", 7 | "clean_methods": {"delete": MVCleaner("delete"), 8 | "impute_holoclean": MVHoloCleaner(), 9 | "impute_mean_mode": MVCleaner("impute", num="mean", cat="mode"), 10 | "impute_mean_dummy": MVCleaner("impute", num="mean", cat="dummy"), 11 | "impute_median_mode": MVCleaner("impute", num="median", cat="mode"), 12 | "impute_median_dummy": MVCleaner("impute", num="median", cat="dummy"), 13 | "impute_mode_mode": MVCleaner("impute", num="mode", cat="mode"), 14 | "impute_mode_dummy": MVCleaner("impute", num="mode", cat="dummy"), 15 | } 16 | } 17 | 18 | outliers = { 19 | "name": "outliers", 20 | "clean_methods": {"clean_HC_impute_holoclean": OutlierHoloCleaner(), 21 | "clean_SD_impute_mean_dummy": OutlierCleaner(detect_method="SD", repairer=MVCleaner("impute", num="mean", cat="dummy")), 22 | "clean_SD_impute_mode_dummy": OutlierCleaner(detect_method="SD", repairer=MVCleaner("impute", num="mode", cat="dummy")), 23 | "clean_SD_impute_median_dummy": OutlierCleaner(detect_method="SD", repairer=MVCleaner("impute", num="median", cat="dummy")), 24 | "clean_IQR_impute_mean_dummy": OutlierCleaner(detect_method="IQR", repairer=MVCleaner("impute", num="mean", cat="dummy")), 25 | "clean_IQR_impute_mode_dummy":
OutlierCleaner(detect_method="IQR", repairer=MVCleaner("impute", num="mode", cat="dummy")), 26 | "clean_IQR_impute_median_dummy": OutlierCleaner(detect_method="IQR", repairer=MVCleaner("impute", num="median", cat="dummy")), 27 | "clean_IF_impute_mean_dummy": OutlierCleaner(detect_method="IF", repairer=MVCleaner("impute", num="mean", cat="dummy")), 28 | "clean_IF_impute_mode_dummy": OutlierCleaner(detect_method="IF", repairer=MVCleaner("impute", num="mode", cat="dummy")), 29 | "clean_IF_impute_median_dummy": OutlierCleaner(detect_method="IF", repairer=MVCleaner("impute", num="median", cat="dummy")), 30 | } 31 | } 32 | 33 | mislabel = { 34 | "name": "mislabel", 35 | "clean_methods": {"cleanlab": MislabelCleaner()} 36 | } 37 | 38 | duplicates = { 39 | "name": "duplicates", 40 | "clean_methods": {"clean": DuplicatesCleaner(), "AutoER": AutoERCleaner()} 41 | } 42 | 43 | inconsistency = { 44 | "name": "inconsistency", 45 | "clean_methods": {"clean": InconsistencyCleaner()} 46 | } 47 | 48 | # domain of error types 49 | error_types = [missing_values, outliers, mislabel, inconsistency, duplicates] -------------------------------------------------------------------------------- /clean.py: -------------------------------------------------------------------------------- 1 | """Clean datasets""" 2 | import pandas as pd 3 | import config 4 | import os 5 | import argparse 6 | import utils 7 | import sys 8 | import schema.clean_method 9 | 10 | def clean_error(dataset, error): 11 | """ Clean one error in the dataset. 12 | 13 | Args: 14 | dataset (dict): dataset dict in dataset.py 15 | error (string): error type 16 | """ 17 | # create saving folder 18 | save_dir = utils.get_dir(dataset, error, create_folder=True) 19 | 20 | # load dirty data 21 | dirty_path_pfx = utils.get_dir(dataset, 'raw', 'dirty') 22 | dirty_train, dirty_test, version = utils.load_dfs(dataset, dirty_path_pfx, return_version=True) 23 | 24 | # delete missing values if error type is not missing values 25 | if error != 'missing_values': 26 | dirty_train = dirty_train.dropna().reset_index(drop=True) 27 | dirty_test = dirty_test.dropna().reset_index(drop=True) 28 | 29 | # save dirty data 30 | dirty_path_pfx = os.path.join(save_dir, 'dirty') 31 | utils.save_dfs(dirty_train, dirty_test, dirty_path_pfx, version) 32 | 33 | # clean the error in the dataset with various cleaning methods 34 | error_type = utils.get_error(error) 35 | for clean_method, cleaner in error_type['clean_methods'].items(): 36 | print(" - Clean the error with method '{}'".format(clean_method)) 37 | # fit on dirty train and clean both train and test 38 | cleaner.fit(dataset, dirty_train) 39 | clean_train, ind_train, clean_test, ind_test = cleaner.clean(dirty_train, dirty_test) 40 | 41 | # save clean train and test data 42 | clean_path_pfx = os.path.join(save_dir, clean_method) 43 | utils.save_dfs(clean_train, clean_test, clean_path_pfx, version) 44 | 45 | # save indicator 46 | ind_path_pfx = os.path.join(save_dir, 'indicator_{}'.format(clean_method)) 47 | utils.save_dfs(ind_train, ind_test, ind_path_pfx) 48 | 49 | def clean(dataset, error_type=None): 50 | """ Clean each error in the dataset. 
51 | 52 | Args: 53 | dataset (dict): dataset dict in dataset.py 54 | error_type (string): if specified, only clean this error type 55 | """ 56 | print("- Clean dataset '{}'".format(dataset['data_dir'])) 57 | for error in dataset['error_types']: 58 | if error_type is not None and error != error_type: 59 | continue 60 | print(" - Clean error type '{}'".format(error)) 61 | clean_error(dataset, error) 62 | print(" - Finished") 63 | 64 | 65 | if __name__ == '__main__': 66 | parser = argparse.ArgumentParser() 67 | parser.add_argument('--dataset', default=None) 68 | args = parser.parse_args() 69 | 70 | # datasets to be cleaned, clean all datasets if not specified 71 | datasets = [utils.get_dataset(args.dataset)] if args.dataset is not None else config.datasets 72 | 73 | # clean datasets 74 | for dataset in datasets: 75 | clean(dataset) -------------------------------------------------------------------------------- /schema/model.py: -------------------------------------------------------------------------------- 1 | """Define the domain of ML model""" 2 | from sklearn.linear_model import Lasso 3 | from sklearn.linear_model import LogisticRegression 4 | from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor 5 | from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor 6 | from sklearn.svm import LinearSVC, SVC 7 | from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor, RandomForestClassifier, RandomForestRegressor 8 | from sklearn.neural_network import MLPClassifier 9 | from sklearn.naive_bayes import GaussianNB 10 | from sklearn.naive_bayes import MultinomialNB 11 | from sklearn.linear_model import RANSACRegressor 12 | from xgboost import XGBClassifier 13 | 14 | # details of each model 15 | logistic_reg = { 16 | "name": "logistic_regression", 17 | "fn": LogisticRegression, 18 | "fixed_params": {"solver":"lbfgs", "max_iter":5000, "multi_class":'auto'}, 19 | "parallelable": True, 20 | "type": "classification", 21 | "hyperparams": "C", 22 | "hyperparams_type": "real", 23 | "hyperparams_range": [-5, 5] 24 | } 25 | 26 | knn_clf = { 27 | "name": "knn_classification", 28 | "fn": KNeighborsClassifier, 29 | "fixed_params":{}, 30 | "parallelable": True, 31 | "type": "classification", 32 | "hyperparams": "n_neighbors", 33 | "hyperparams_type": "int", 34 | "hyperparams_range": [1, 95] 35 | } 36 | 37 | dt_clf = { 38 | "name": "decision_tree_classification", 39 | "fn": DecisionTreeClassifier, 40 | "fixed_params": {}, 41 | "type": "classification", 42 | "hyperparams": "max_depth", 43 | "hyperparams_type": "int", 44 | "hyperparams_range": [1, 200] 45 | } 46 | 47 | linear_svm = { 48 | "name": "linear_svm", 49 | "fn": SVC, 50 | "fixed_params": {"kernel":"linear", "cache_size":7000}, 51 | "type": "classification", 52 | "hyperparams": "C", 53 | "hyperparams_type": "real", 54 | "hyperparams_range": [-5, 5] 55 | } 56 | 57 | adaboost_clf = { 58 | "name": "adaboost_classification", 59 | "fn": AdaBoostClassifier, 60 | "fixed_params": {"n_estimators":200}, 61 | "type": "classification", 62 | "hyperparams": "learning_rate", 63 | "hyperparams_type": "real", 64 | "hyperparams_range": [-9, 1] 65 | } 66 | 67 | random_forest_clf = { 68 | "name": "random_forest_classification", 69 | "fn": RandomForestClassifier, 70 | "fixed_params": {"n_estimators":100}, 71 | "parallelable": True, 72 | "type": "classification", 73 | "hyperparams": "max_depth", 74 | "hyperparams_type": "int", 75 | "hyperparams_range": [1, 200] 76 | } 77 | 78 | gaussian_nb = { 79 | "name": "gaussian_naive_bayes", 80 | "fn": GaussianNB, 81 | "fixed_params": {}, 82 | "type":
"classification" 83 | } 84 | 85 | xgb_clf = { 86 | "name":"XGBoost", 87 | "fn": XGBClassifier, 88 | "fixed_params": {}, 89 | "type": "classification", 90 | "hyperparams": "max_depth", 91 | "hyperparams_type": "int", 92 | "hyperparams_range": [1, 100] 93 | } 94 | 95 | # model domain 96 | models = [logistic_reg, knn_clf, dt_clf, adaboost_clf, random_forest_clf, gaussian_nb, xgb_clf] 97 | -------------------------------------------------------------------------------- /init.py: -------------------------------------------------------------------------------- 1 | """ Initialize datasets: 2 | Delete missing labels of raw.csv for each dataset to ensure Supervised Learning. 3 | Delete missing features for dataset not having "missing_values" in config 4 | Split dataset into train/test 5 | """ 6 | import config 7 | import utils 8 | import pandas as pd 9 | import numpy as np 10 | import os 11 | import argparse 12 | 13 | def delete_missing_labels(raw, label_name): 14 | """ Delete missing labels""" 15 | label = raw[label_name] 16 | is_missing_label = label.isnull() 17 | dirty = raw[is_missing_label == False] 18 | return dirty 19 | 20 | def delete_missing_values(raw): 21 | """ Delete missing values""" 22 | dirty = raw.dropna() 23 | return dirty 24 | 25 | def split(data, test_ratio, seed, max_size=None): 26 | """ Shuffle and split data to train / test""" 27 | # random shuffle 28 | np.random.seed(seed) 29 | N = data.shape[0] 30 | idx = np.random.permutation(N) 31 | 32 | # only use first max_size data if N > max_size 33 | if max_size is not None: 34 | N = min(N, int(max_size)) 35 | 36 | # split train and test 37 | test_size = int(N * test_ratio) 38 | idx_train = idx[test_size:N] 39 | idx_test = idx[:test_size] 40 | train = data.iloc[idx_train] 41 | test = data.iloc[idx_test] 42 | idx_train = pd.DataFrame(idx_train, columns=["index"]) 43 | idx_test = pd.DataFrame(idx_test, columns=["index"]) 44 | 45 | return train, test, idx_train, idx_test 46 | 47 | def reset(dataset): 48 | """ Reset dataset""" 49 | # delete folders for each error 50 | for error in dataset['error_types']: 51 | utils.remove(utils.get_dir(dataset, error)) 52 | 53 | # delete dirty_train and dirty_test in raw folder 54 | utils.remove(utils.get_dir(dataset, 'raw', 'dirty_train.csv')) 55 | utils.remove(utils.get_dir(dataset, 'raw', 'dirty_test.csv')) 56 | utils.remove(utils.get_dir(dataset, 'raw', 'dirty.csv')) 57 | utils.remove(utils.get_dir(dataset, 'raw', 'idx_train.csv')) 58 | utils.remove(utils.get_dir(dataset, 'raw', 'idx_test.csv')) 59 | utils.remove(utils.get_dir(dataset, 'raw', 'version.json')) 60 | 61 | def init(dataset, test_ratio=0.3, seed=1, max_size=None): 62 | """ Initialize dataset: raw -> dirty -> dirty_train, dirty_test 63 | 64 | Args: 65 | dataset (dict): dataset dict in config.py 66 | max_size (int): maximum limit of dataset size 67 | test_ratio: ratio of test set 68 | seed: seed used to split dataset 69 | """ 70 | print("Initialize dataset {}".format(dataset['data_dir'])) 71 | 72 | # load raw data 73 | raw_path = utils.get_dir(dataset, 'raw', 'raw.csv') 74 | raw = pd.read_csv(raw_path) 75 | 76 | # delete missing labels or all missing values 77 | if 'missing_values' not in dataset['error_types']: 78 | dirty = delete_missing_values(raw) 79 | else: 80 | dirty = delete_missing_labels(raw, dataset['label']) 81 | 82 | # split dataset 83 | train, test, idx_train, idx_test = split(dirty, test_ratio, seed, max_size) 84 | 85 | # save train / test 86 | save_path_pfx = utils.get_dir(dataset, 'raw', 'dirty') 87 | utils.save_dfs(train, 
test, save_path_pfx) 88 | 89 | # save the version (seed) of dataset 90 | utils.save_version(save_path_pfx, seed) 91 | 92 | # save index 93 | save_path_pfx = utils.get_dir(dataset, 'raw', 'idx') 94 | utils.save_dfs(idx_train, idx_test, save_path_pfx) 95 | 96 | # save dirty 97 | # save_path = utils.get_dir(dataset, 'raw', 'dirty.csv') 98 | # dirty.to_csv(save_path, index=False) 99 | 100 | if __name__ == '__main__': 101 | parser = argparse.ArgumentParser() 102 | parser.add_argument('--dataset', default=None) 103 | parser.add_argument('--max_size', type=int, default=None) 104 | parser.add_argument('--test_ratio', type=float, default=0.3) 105 | parser.add_argument('--seed', type=int, default=1) 106 | parser.add_argument('--reset', default=False, action='store_true') 107 | args = parser.parse_args() 108 | 109 | # datasets to be initialized, initialize all datasets if not specified 110 | datasets = [utils.get_dataset(args.dataset)] if args.dataset is not None else config.datasets 111 | 112 | # raw -> dirty 113 | for dataset in datasets: 114 | if args.reset: 115 | reset(dataset) 116 | else: 117 | init(dataset, max_size=args.max_size, test_ratio=args.test_ratio, seed=args.seed) -------------------------------------------------------------------------------- /experiment.py: -------------------------------------------------------------------------------- 1 | """Run experiments""" 2 | from train import train_and_evaluate 3 | from preprocess import preprocess 4 | import numpy as np 5 | import utils 6 | import json 7 | import argparse 8 | import config 9 | import datetime 10 | from init import init 11 | from clean import clean 12 | import time 13 | import logging 14 | 15 | def one_search_experiment(dataset, error_type, train_file, model, seed, n_jobs=1, hyperparams=None): 16 | """One experiment on the dataset given an error type, a train file, a model and a random search seed 17 | 18 | Args: 19 | dataset (dict): dataset dict in config.py 20 | error_type (string): error type 21 | train_file (string): filename of training set (dirty or clean) 22 | model (dict): ml model dict in model.py 23 | seed (int): seed for this experiment 24 | """ 25 | np.random.seed(seed) 26 | # generate random seeds for down sample and training 27 | down_sample_seed, train_seed = np.random.randint(1000, size=2) 28 | 29 | # load and preprocess data 30 | X_train, y_train, X_test_list, y_test_list, test_files = \ 31 | preprocess(dataset, error_type, train_file, normalize=True, down_sample_seed=down_sample_seed) 32 | 33 | # train and evaluate 34 | result = train_and_evaluate(X_train, y_train, X_test_list, y_test_list, test_files, model, n_jobs=n_jobs, seed=train_seed, hyperparams=hyperparams) 35 | return result 36 | 37 | def one_split_experiment(dataset, n_retrain=5, seed=1, n_jobs=1, nosave=True, error_type=None): 38 | """Run experiments on one dataset for one split.
39 | 40 | Args: 41 | dataset (dict): dataset dict in config.py 42 | n_retrain (int): number of repeated trainings per configuration 43 | seed (int): experiment seed 44 | n_jobs (int): num of threads 45 | nosave (bool): if True, do not save results; error_type (string): if given, only run this error type 46 | """ 47 | # generate seeds for n experiments 48 | np.random.seed(seed) 49 | seeds = np.random.randint(10000, size=n_retrain) 50 | 51 | # load result dict 52 | result = utils.load_result(dataset['data_dir']) 53 | 54 | # run experiments 55 | for error in dataset["error_types"]: 56 | if error_type is not None and error != error_type: 57 | continue 58 | 59 | for train_file in utils.get_train_files(error): 60 | for model in config.models: 61 | for seed in seeds: 62 | version = utils.get_version(utils.get_dir(dataset, error, train_file)) 63 | key = "/".join((dataset['data_dir'], 'v'+str(version), error, train_file, model['name'], str(seed))) 64 | 65 | if key in result.keys(): 66 | print("Ignore experiment {} that has been completed before.".format(key)) 67 | continue 68 | 69 | print("{} Processing {}".format(datetime.datetime.now(), key)) 70 | res = one_search_experiment(dataset, error, train_file, model, seed, n_jobs=n_jobs) 71 | 72 | if not nosave: 73 | utils.save_result(dataset['data_dir'], key, res) 74 | 75 | def experiment(datasets, log=False, n_jobs=1, nosave=False, error_type=None, arg_seeds=None): 76 | """Run experiments on all datasets for all splits""" 77 | # set logger for experiments 78 | if log: 79 | logging.captureWarnings(False) 80 | logging.basicConfig(filename='logging_{}.log'.format(datetime.datetime.now()), level=logging.DEBUG) 81 | 82 | # set seeds for experiments 83 | np.random.seed(config.root_seed) 84 | split_seeds = np.random.randint(10000, size=config.n_resplit) 85 | experiment_seed = np.random.randint(10000) 86 | 87 | # run experiments 88 | for dataset in datasets: 89 | if log: 90 | logging.debug("{}: Experiment on {}".format(datetime.datetime.now(), dataset['data_dir'])) 91 | 92 | for i, seed in enumerate(split_seeds): 93 | if arg_seeds is not None: 94 | if i not in arg_seeds: 95 | continue 96 | 97 | if utils.check_completed(dataset, seed, experiment_seed): 98 | print("Ignore {}-th experiment on {} that has been completed before.".format(i, dataset['data_dir'])) 99 | continue 100 | 101 | tic = time.time() 102 | init(dataset, seed=seed, max_size=config.max_size) 103 | clean(dataset, error_type) 104 | one_split_experiment(dataset, n_retrain=config.n_retrain, n_jobs=n_jobs, nosave=nosave, seed=experiment_seed, error_type=error_type) 105 | toc = time.time() 106 | t = (toc - tic) / 60 107 | remaining = t*(len(split_seeds)-i-1) 108 | if log: 109 | logging.debug("{}: {}-th experiment takes {} min. Estimated remaining time: {} min".format(datetime.datetime.now(), i, t, remaining)) -------------------------------------------------------------------------------- /inject.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import config 3 | import os 4 | import argparse 5 | import utils 6 | 7 | def uniform_class_noise(df, label, percentage=0.05, random_state=123): 8 | """Uniform class noise in a binary classification dataset. 9 | x% of the examples in each class are corrupted. 10 | The labels of these examples are flipped to the other class.
11 | - flip 5% in each class, in total 5% of the labels are changed 12 | 13 | Args: 14 | df: pandas dataframe 15 | percentage: the fraction of labels to flip in each class 16 | label: the column of the label 17 | """ 18 | ## inspect the class distribution 19 | dist = df[label].value_counts(ascending=True) 20 | # print('class distribution before injection:\n', dist) 21 | 22 | classes = list(dist.index) 23 | 24 | ## flip within the majority class 25 | train1 = df[df[label]==classes[1]].copy() 26 | train1.loc[train1.sample(frac=percentage, random_state=random_state).index, label] = classes[0] 27 | 28 | ## flip within the minority class 29 | train0 = df[df[label]==classes[0]].copy() 30 | train0.loc[train0.sample(frac=percentage, random_state=random_state).index, label] = classes[1] 31 | 32 | ## concatenate the noisy sets 33 | uniform_df = pd.concat([train1, train0]) 34 | # print('\nclass distribution after uniform injection:\n', uniform_df[label].value_counts(ascending=True)) 35 | return uniform_df 36 | 37 | def pairwise_class_noise(df, label, percentage=0.05, random_state=123): 38 | """ Pairwise class noise. 39 | Let X be the majority class and Y the second majority class. 40 | An example with label X has a probability of x/100 of 41 | being incorrectly labeled as Y. 42 | - flip 5% of the labels in class A and keep the labels for class B 43 | - flip 5% of the labels in class B and keep the labels for class A 44 | 45 | Args: 46 | df: pandas dataframe 47 | percentage: the fraction of labels to flip 48 | label: the column of the label 49 | random_state: seed for sampling 50 | """ 51 | ## inspect the class distribution 52 | dist = df[label].value_counts(ascending=True) 53 | # print('class distribution before injection:\n', dist) 54 | 55 | classes = list(dist.index) 56 | 57 | flip_major = df.copy() 58 | flip_major.loc[df[df[label]==classes[1]].sample(frac=percentage, random_state=random_state).index, label] = classes[0] 59 | flip_minor = df.copy() 60 | flip_minor.loc[df[df[label]==classes[0]].sample(frac=percentage, random_state=random_state).index, label] = classes[1] 61 | 62 | # print('\nclass distribution after injection (flip majority class):\n', flip_major[label].value_counts(ascending=True)) 63 | # print('\nclass distribution after injection (flip minority class):\n', flip_minor[label].value_counts(ascending=True)) 64 | return flip_major, flip_minor 65 | 66 | def inject(dataset): 67 | """ Inject mislabels 68 | Args: 69 | dataset (dict): dataset dict in config 70 | """ 71 | # create saving folder 72 | major_save_dir = utils.makedirs([config.data_dir, dataset["data_dir"] + "_major", 'raw']) 73 | minor_save_dir = utils.makedirs([config.data_dir, dataset["data_dir"] + "_minor", 'raw']) 74 | uniform_save_dir = utils.makedirs([config.data_dir, dataset["data_dir"] + "_uniform", 'raw']) 75 | 76 | # load clean data 77 | clean_path = utils.get_dir(dataset, 'raw', 'raw.csv') 78 | clean = utils.load_df(dataset, clean_path) 79 | clean = clean.dropna().reset_index(drop=True) 80 | 81 | major_clean_path = os.path.join(major_save_dir, 'mislabel_clean_raw.csv') 82 | minor_clean_path = os.path.join(minor_save_dir, 'mislabel_clean_raw.csv') 83 | uniform_clean_path = os.path.join(uniform_save_dir, 'mislabel_clean_raw.csv') 84 | clean.to_csv(major_clean_path, index=False) 85 | clean.to_csv(minor_clean_path, index=False) 86 | clean.to_csv(uniform_clean_path, index=False) 87 | 88 | label = dataset['label'] 89 | 90 | # uniform flip 91 | uniform = uniform_class_noise(clean, label) 92 | # pairwise flip 93 | major, minor = pairwise_class_noise(clean, label) 94 | 95 | major_raw_path =
os.path.join(major_save_dir, 'raw.csv') 96 | minor_raw_path = os.path.join(minor_save_dir, 'raw.csv') 97 | uniform_raw_path = os.path.join(uniform_save_dir, 'raw.csv') 98 | 99 | major.to_csv(major_raw_path, index=False) 100 | minor.to_csv(minor_raw_path, index=False) 101 | uniform.to_csv(uniform_raw_path, index=False) 102 | 103 | if __name__ == '__main__': 104 | parser = argparse.ArgumentParser() 105 | parser.add_argument('--dataset', default=None) 106 | args = parser.parse_args() 107 | 108 | # dataset to inject mislabels into (must be specified via --dataset) 109 | datasets = [utils.get_dataset(args.dataset)] 110 | 111 | # inject mislabels 112 | for dataset in datasets: 113 | inject(dataset) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CleanML 2 | 3 | This is the CleanML Benchmark for Joint Data Cleaning and Machine Learning. 4 | 5 | The details of the benchmark methodology and design are described in the paper: 6 | [CleanML: A Study for Evaluating the Impact of Data Cleaning on ML Classification Tasks](https://arxiv.org/pdf/1904.09483.pdf) 7 | 8 | 9 | ## Basic Usage 10 | ### Run Experiments 11 | To run experiments, download and unzip the [datasets](https://www.dropbox.com/s/nerfrhbrseev928/CleanML-datasets-2020.zip?dl=0), place the unzipped folder under the project home directory, and execute the following command from the project home directory: 12 | 13 | ``` 14 | python3 main.py --run_experiments [--dataset <dataset_name>] [--cpu <num_cpu>] [--log] 15 | ``` 16 | 17 | #### Options: 18 | --dataset: the experiment dataset. If not specified, the program will run experiments on all datasets.
19 | --cpu: the number of CPUs used for the experiment. Default is 1.
20 | --log: whether to log the experiment progress 21 | 22 | #### Output: 23 | The experimental results for each dataset will be saved in the `/result` directory as a JSON file named `<dataset_name>_result.json`. Each result is a key-value pair. The key is a string in the format `<dataset>/<version>/<error_type>/<train_file>/<model>/<seed>`. The value is a set of key-value pairs with each evaluation metric and its result. Our experimental results are provided in `result.zip`. 24 | 25 | ### Run Analysis 26 | To run analysis for populating relations described in the paper, unzip `result.zip` and execute the following command from the project home directory: 27 | 28 | ``` 29 | python3 main.py --run_analysis [--alpha <significance_level>] 30 | ``` 31 | 32 | #### Options: 33 | --alpha: the significance level for the multiple hypothesis test. Default is 0.05. 34 | 35 | #### Output: 36 | The relations R1, R2 and R3 will be saved in the `/analysis` directory. Our analysis results are provided in `analysis.zip`. 37 | 38 | ## Extend Domain of Attributes 39 | ### Add new datasets: 40 | To add a new dataset, first create a new folder with the dataset name under `/data`, and create a `raw` folder under it. The `raw` folder must contain the raw data named `raw.csv`. For datasets with inconsistencies, it must also contain an inconsistency-cleaned version named `inconsistency_clean_raw.csv`. For datasets with mislabels, it must also contain a mislabel-cleaned version named `mislabel_clean_raw.csv`. The structure of the directory looks like: 41 |
42 | .
43 | └── data
44 |     └── new_dataset
45 |         └── raw
46 |             ├── raw.csv
47 |             ├── inconsistency_clean_raw.csv (for dataset with inconsistencies)
48 |             └── mislabel_clean_raw.csv (for dataset with mislabels)
49 | 
50 | 51 | Then add a dictionary to `/schema/dataset.py` and append it to the `datasets` array at the end of the file, for example:
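A minimal, hypothetical entry (the dataset name, label column, and error types below are placeholders, not one of the benchmark datasets):

```python
# /schema/dataset.py -- hypothetical new entry
NewDataset = {
    "data_dir": "new_dataset",           # folder name under /data
    "error_types": ["missing_values"],   # error types the dataset contains
    "label": "target",                   # label column of the ML task
    "ml_task": "classification",
    # optional keys, e.g.
    # "class_imbalance": True,
    # "drop_variables": ["id"],
}

# then include it in the domain at the end of the file:
# datasets = [..., NewDataset]
```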
52 | 53 | The new dictionary must contain the following keys:
54 | ```yaml 55 | data_dir: the name of the dataset (the folder name under /data). 56 | error_types: a list of error types that the dataset contains. 57 | label: the label column of the ML task. 58 | ml_task: the type of ML task ("classification"). 59 | ``` 60 | The following keys are optional:
61 | ```yaml 62 | class_imbalance: whether the dataset is class imbalanced. 63 | categorical_variables: a list of categorical attributes. 64 | text_variables: a list of text attributes. 65 | key_columns: a list of key columns used for deduplication. 66 | drop_variables: a list of irrelevant attributes to drop. 67 | ``` 68 | ### Add new error types: 69 | To add a new error type, add a dictionary to `/schema/error_type.py` and append it to the `error_types` array at the end of the file, for example:
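A hypothetical sketch; `NewErrorCleaner` is a placeholder for a cleaning-method class that would have to be defined in `/schema/clean_method.py`:

```python
# /schema/error_type.py -- hypothetical new error type
new_error = {
    "name": "new_error",
    "clean_methods": {"clean": NewErrorCleaner()}  # method name -> cleaning-method object
}

# then include it in the domain at the end of the file:
# error_types = [missing_values, outliers, mislabel, inconsistency, duplicates, new_error]
```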
70 | 71 | The new dictionary must contain the following keys:
72 | ```yaml 73 | name: the name of the error type. 74 | clean_methods: a dictionary, {cleaning method name: cleaning method object}. 75 | ``` 76 | ### Add new models: 77 | To add a new ML model, add a dictionary to `/schema/model.py` and append it to the `models` array at the end of the file, for example:
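A hypothetical sketch using a scikit-learn estimator that is not part of the benchmark's model domain; all values below are illustrative assumptions:

```python
# /schema/model.py -- hypothetical new model entry
from sklearn.linear_model import SGDClassifier

sgd_clf = {
    "name": "sgd_classification",
    "fn": SGDClassifier,                  # model constructor
    "fixed_params": {"loss": "log_loss"}, # parameters held constant
    "type": "classification",
    "hyperparams": "alpha",               # the single hyperparameter to tune
    "hyperparams_type": "real",
    "hyperparams_range": [-6, 0],         # log10 bounds for "real" hyperparameters
}

# then include it in the domain at the end of the file:
# models = [..., sgd_clf]
```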
78 | 79 | The new dictionary must contain the following keys:
80 | ```yaml 81 | name: the name of the model. 82 | fn: the model constructor (e.g. a scikit-learn estimator class). 83 | fixed_params: parameters not to be tuned. 84 | hyperparams: the hyperparameter to be tuned. 85 | hyperparams_type: the type of the hyperparameter, "real" or "int". 86 | hyperparams_range: the search range. Use log10 bounds for "real" hyperparameters. 87 | ``` 88 | ### Add new cleaning methods: 89 | To add a new cleaning method, add a class to `/schema/clean_method.py`, for example:
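A minimal sketch, assuming a toy mean-imputation repair (an illustration only, not one of the benchmark's cleaning methods):

```python
# /schema/clean_method.py -- hypothetical cleaning method
class MyCleaner(object):
    def fit(self, dataset, dirty_train):
        # compute statistics on the dirty training set;
        # `dataset` is the dataset dict from /schema/dataset.py
        self.means = dirty_train.mean(numeric_only=True)

    def clean(self, dirty_train, dirty_test):
        # indicators mark the cells that get repaired
        ind_train = dirty_train.isnull()
        ind_test = dirty_test.isnull()
        clean_train = dirty_train.fillna(self.means)
        clean_test = dirty_test.fillna(self.means)
        return clean_train, ind_train, clean_test, ind_test
```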
90 | 91 | The class must contain two methods:
92 | 93 | `fit(dataset, dirty_train)`: takes in the dataset dictionary and the dirty training set; computes statistics or trains models on the training set for data cleaning.
94 | `clean(dirty_train, dirty_test)`: takes in the dirty training set and the dirty test set; cleans the error in both sets. Returns `(clean_train, indicator_train, clean_test, indicator_test)`: the cleaned versions of the two sets and indicators that mark the locations of the errors. 95 | 96 | ### Add new scenarios: 97 | We consider "BD" and "CD" scenarios in our paper. To investigate other scenarios, add scenarios to `/schema/scenario.py`. 98 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | """Train the model""" 2 | import numpy as np 3 | import pandas as pd 4 | import argparse 5 | import config 6 | import utils 7 | import sys 8 | from sklearn.model_selection import RandomizedSearchCV, GridSearchCV 9 | import pickle 10 | from preprocess import preprocess 11 | from sklearn.metrics import f1_score 12 | from sklearn.feature_selection import SelectKBest, chi2 13 | from sklearn.model_selection import cross_val_score 14 | 15 | def parse_searcher(searcher): 16 | """Get results from grid search 17 | 18 | Args: 19 | searcher: GridSearchCV object 20 | """ 21 | train_accs = searcher.cv_results_['mean_train_score'] 22 | val_accs = searcher.cv_results_['mean_test_score'] 23 | best_idx = searcher.best_index_ 24 | best_params = searcher.best_params_ 25 | train_acc, val_acc = train_accs[best_idx], val_accs[best_idx] 26 | best_model = searcher.best_estimator_ 27 | return best_model, best_params, train_acc, val_acc 28 | 29 | def train(X_train, y_train, estimator, param_grid, seed=1, n_jobs=1, skip=False): 30 | """Train the model 31 | 32 | Args: 33 | X_train (pd.DataFrame): features (train) 34 | y_train (pd.DataFrame): label (train) 35 | estimator (sklearn.model): model 36 | param_grid (dict): hyper-parameters to tune 37 | seed (int): seed for training 38 | n_jobs (int): num of threads; skip (bool): if True, fit directly without tuning 39 | """ 40 | np.random.seed(seed) 41 | 42 | # cleanml 2020: fit with the given parameters, no search 43 | if skip: 44 | best_model = estimator 45 | best_model.fit(X_train, y_train) 46 | result = {} 47 | return best_model, result 48 | 49 | # train and tune hyperparameter with 5-fold cross validation 50 | if param_grid is not None: 51 | searcher = GridSearchCV(estimator, param_grid, cv=5, n_jobs=n_jobs, return_train_score=True) 52 | searcher.fit(X_train, y_train) 53 | best_model, best_params, train_acc, val_acc = parse_searcher(searcher) 54 | else: 55 | # if no hyper parameter is given, train directly 56 | best_model = estimator 57 | val_acc = cross_val_score(best_model, X_train, y_train, cv=5).mean() 58 | best_model.fit(X_train, y_train) 59 | train_acc = best_model.score(X_train, y_train) 60 | best_params = {} 61 | 62 | result = {"best_params": best_params, "train_acc":train_acc, "val_acc": val_acc} 63 | return best_model, result 64 | 65 | def evaluate(best_model, X_test_list, y_test_list, test_files): 66 | # evaluate on test sets 67 | result = {} 68 | for X_test, y_test, file in zip(X_test_list, y_test_list, test_files): 69 | y_pred = best_model.predict(X_test) 70 | test_acc = np.mean(y_pred == y_test) 71 | result[file + "_test_acc"] = test_acc 72 | 73 | if len(set(y_test)) > 2: 74 | test_f1 = f1_score(y_test, y_pred, average='macro') 75 | else: 76 | test_f1 = f1_score(y_test, y_pred) 77 | result[file + "_test_f1"] = test_f1 78 | return result 79 | 80 | def get_coarse_grid(model, seed, n_jobs, N): 81 | """Get hyper parameters (coarse random search) """ 82 | np.random.seed(seed) 83 | low, high = model["hyperparams_range"] 84 |
if model["hyperparams_type"] == "real": 85 | param_grid = {model['hyperparams']: 10 ** np.random.uniform(low, high, 20)} 86 | if model["hyperparams_type"] == "int": 87 | if model["name"] == "knn_classification": 88 | high = min(high, int(N/5*4)) 89 | param_grid = {model['hyperparams']: np.random.randint(low, high, 20)} 90 | return param_grid 91 | 92 | def get_fine_grid(model, best_param_coarse, n_jobs, N): 93 | """Get hyper parameters (fine grid search, around the best parameter in coarse search) """ 94 | if model["hyperparams_type"] == "real": 95 | base = np.log10(best_param_coarse) 96 | param_grid = {model['hyperparams']: np.linspace(10**(base-0.5), 10**(base+0.5), 20)} 97 | if model["hyperparams_type"] == "int": 98 | low = max(best_param_coarse - 10, 1) 99 | high = low + 20 100 | if model["name"] == "knn_classification": 101 | high = min(high, int(N/5*4)) 102 | param_grid = {model['hyperparams']: np.arange(low, high)} 103 | return param_grid 104 | 105 | def hyperparam_search(X_train, y_train, model, n_jobs=1, seed=1, hyperparams=None): 106 | np.random.seed(seed) 107 | coarse_param_seed, coarse_train_seed, fine_train_seed = np.random.randint(1000, size=3) 108 | fixed_params = model["fixed_params"] 109 | if "parallelable" in model.keys() and model['parallelable']: 110 | fixed_params["n_jobs"] = n_jobs 111 | 112 | if hyperparams is not None: 113 | if "hyperparams_type" in model and model["hyperparams_type"] == "int": 114 | hyperparams[model["hyperparams"]] = int(hyperparams[model["hyperparams"]]) 115 | fixed_params.update(hyperparams) 116 | 117 | estimator = model["fn"](**fixed_params) 118 | 119 | # hyperparameter search 120 | if "hyperparams" not in model.keys() or hyperparams is not None: 121 | # if no hyper parmeter, train directly 122 | best_model, result = train(X_train, y_train, estimator, None, n_jobs=n_jobs, seed=coarse_train_seed, skip=(hyperparams is not None)) 123 | else: 124 | # coarse random search 125 | param_grid = get_coarse_grid(model, coarse_param_seed, n_jobs, len(y_train)) 126 | best_model_coarse, result_coarse = train(X_train, y_train, estimator, param_grid, n_jobs=n_jobs, seed=coarse_train_seed) 127 | val_acc_coarse = result_coarse['val_acc'] 128 | 129 | # fine grid search 130 | best_param_coarse = result_coarse['best_params'][model['hyperparams']] 131 | param_grid = get_fine_grid(model, best_param_coarse, n_jobs, len(y_train)) 132 | best_model_fine, result_fine = train(X_train, y_train, estimator, param_grid, n_jobs=n_jobs, seed=fine_train_seed) 133 | val_acc_fine = result_fine['val_acc'] 134 | 135 | if val_acc_fine > val_acc_coarse: 136 | result = result_fine 137 | best_model = best_model_fine 138 | else: 139 | result = result_coarse 140 | best_model = best_model_coarse 141 | 142 | # convert int to float to avoid json error 143 | if model["hyperparams_type"] == "int": 144 | result['best_params'][model["hyperparams"]] *= 1.0 145 | 146 | return best_model, result 147 | 148 | def train_and_evaluate(X_train, y_train, X_test_list, y_test_list, test_files, model, n_jobs=1, seed=1, hyperparams=None): 149 | """Search hyperparameters and evaluate 150 | 151 | Args: 152 | X_train (pd.DataFrame): features (train) 153 | y_train (pd.DataFrame): label (train) 154 | X_test_list (list): list of features (test) 155 | y_test_list (list): list of label (test) 156 | test_files (list): list of filenames of test set 157 | model (dict): ml model dict in model.py 158 | seed (int): seed for training 159 | n_jobs (int): num of threads 160 | """ 161 | best_model, result_train = 
hyperparam_search(X_train, y_train, model, n_jobs, seed, hyperparams) 162 | result_test = evaluate(best_model, X_test_list, y_test_list, test_files) 163 | result = {**result_train, **result_test} 164 | return result -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | """ Load and preprocess data""" 2 | from sklearn.preprocessing import LabelEncoder 3 | from imblearn.under_sampling import RandomUnderSampler 4 | from sklearn.preprocessing import StandardScaler 5 | from sklearn.feature_extraction.text import TfidfVectorizer 6 | from sklearn.feature_selection import SelectKBest, chi2 7 | import numpy as np 8 | import pandas as pd 9 | import utils 10 | 11 | def check_version(dataset, error_type, train_file): 12 | """Check whether train and test are of the same version""" 13 | train_path_pfx = utils.get_dir(dataset, error_type, train_file) 14 | train_version = utils.get_version(train_path_pfx) 15 | test_files = utils.get_test_files(error_type, train_file) 16 | for test_file in test_files: 17 | test_path_pfx = utils.get_dir(dataset, error_type, test_file) 18 | test_version = utils.get_version(test_path_pfx) 19 | assert(train_version == test_version) 20 | 21 | def load_data(dataset, train_path, test_path_list): 22 | """Load and split data into features and label. 23 | 24 | Args: 25 | dataset (dict): dataset dict in config 26 | train_path (string): path for training set 27 | test_path_list (list): a list of paths for test set (missing values and outliers have multiple test sets) 28 | """ 29 | # load data 30 | train = utils.load_df(dataset, train_path) 31 | test_list = [utils.load_df(dataset, test_dir) for test_dir in test_path_list] 32 | 33 | # split X, y 34 | label = dataset['label'] 35 | features = [v for v in train.columns if not v == label] 36 | X_train, y_train = train.loc[:, features], train.loc[:, label] 37 | X_test_list = [test.loc[:, features] for test in test_list] 38 | y_test_list = [test.loc[:, label] for test in test_list] 39 | 40 | return X_train, y_train, X_test_list, y_test_list 41 | 42 | def drop_variables(X_train, X_test_list, drop_columns): 43 | """Drop irrelevant features""" 44 | n_test_files = len(X_test_list) 45 | X_train.drop(columns=drop_columns, inplace=True) 46 | for i in range(n_test_files): 47 | X_test_list[i].drop(columns=drop_columns, inplace=True) 48 | 49 | def down_sample(X, y, random_state): 50 | rus = RandomUnderSampler(random_state=random_state) 51 | X_rus, y_rus = rus.fit_resample(X, y) 52 | indices = rus.sample_indices_ 53 | X_train = X.iloc[indices, :].reset_index(drop=True) 54 | y_train = y.iloc[indices].reset_index(drop=True) 55 | return X_train, y_train 56 | 57 | def encode_cat_label(y_train, y_test_list): 58 | n_tr = y_train.shape[0] 59 | n_te_list = [y_test.shape[0] for y_test in y_test_list] 60 | test_split = np.cumsum(n_te_list)[:-1] 61 | 62 | y = pd.concat([y_train, *y_test_list], axis=0) 63 | le = LabelEncoder() 64 | y = le.fit_transform(y.values.ravel()) 65 | 66 | y_train = y[:n_tr] 67 | y_test_list = np.split(y[n_tr:], test_split) 68 | return y_train, y_test_list 69 | 70 | def text_embedding(corpus_train, corpus_test_list, y_train): 71 | vectorizer = TfidfVectorizer(stop_words='english') 72 | x_train_raw = vectorizer.fit_transform(corpus_train) 73 | x_test_list_raw = [vectorizer.transform(corpus_test) for corpus_test in corpus_test_list] 74 | feature_names = vectorizer.get_feature_names_out() 75 | 76 | k = min(200,
x_train_raw.shape[1]) 77 | ch2 = SelectKBest(chi2, k=k) 78 | x_train = ch2.fit_transform(x_train_raw, y_train) 79 | x_test_list = [ch2.transform(x_test) for x_test in x_test_list_raw] 80 | feature_names = [feature_names[i] for i in ch2.get_support(indices=True)] 81 | x_train = pd.DataFrame(x_train.toarray(), columns=feature_names) 82 | x_test_list = [pd.DataFrame(x_test.toarray(), columns=feature_names) for x_test in x_test_list] 83 | return x_train, x_test_list 84 | 85 | def encode_text_features(X_train, X_test_list, y_train, text_columns): 86 | n_test_files = len(X_test_list) 87 | text_train = pd.DataFrame(X_train, columns=text_columns) 88 | text_test_list = [pd.DataFrame(X_test, columns=text_columns) for X_test in X_test_list] 89 | X_train.drop(columns=text_columns, inplace=True) 90 | for i in range(n_test_files): 91 | X_test_list[i].drop(columns=text_columns, inplace=True) 92 | 93 | for tc in text_columns: 94 | corpus_train = text_train.loc[:, tc] 95 | corpus_test_list = [text_test.loc[:, tc] for text_test in text_test_list] 96 | x_train, x_test_list = text_embedding(corpus_train, corpus_test_list, y_train) 97 | X_train = pd.concat([X_train, x_train], axis=1) 98 | for i in range(n_test_files): 99 | X_test_list[i] = pd.concat([X_test_list[i], x_test_list[i]], axis=1) 100 | return X_train, X_test_list 101 | 102 | def encode_cat_features(X_train, X_test_list): 103 | n_tr = X_train.shape[0] 104 | n_te_list = [X_test.shape[0] for X_test in X_test_list] 105 | test_split = np.cumsum(n_te_list)[:-1] 106 | 107 | X = pd.concat([X_train, *X_test_list], axis=0) 108 | X = pd.get_dummies(X, drop_first=True).values.astype(float) 109 | 110 | X_train = X[:n_tr, :] 111 | X_test_list = np.split(X[n_tr:], test_split) 112 | return X_train, X_test_list 113 | 114 | def preprocess(dataset, error_type, train_file, normalize=True, down_sample_seed=1): 115 | """Load and preprocess data 116 | 117 | Args: 118 | dataset (dict): dataset dict in config 119 | error_type (string): error type 120 | train_file (string): filename prefix of the training set 121 | normalize (bool): whether to standardize the data 122 | down_sample_seed: seed for down sampling 123 | """ 124 | # check train and test version are consistent 125 | check_version(dataset, error_type, train_file) 126 | 127 | # get path of train file and test files 128 | train_path = utils.get_dir(dataset, error_type, train_file + "_train.csv") 129 | test_files = utils.get_test_files(error_type, train_file) 130 | test_path_list = [utils.get_dir(dataset, error_type, test_file + "_test.csv") for test_file in test_files] 131 | 132 | # load data 133 | X_train, y_train, X_test_list, y_test_list = load_data(dataset, train_path, test_path_list) 134 | 135 | ## preprocess data 136 | # drop irrelevant features 137 | if "drop_variables" in dataset.keys(): 138 | drop_columns = dataset['drop_variables'] 139 | drop_variables(X_train, X_test_list, drop_columns) 140 | 141 | # down sample if imbalanced 142 | if "class_imbalance" in dataset.keys() and dataset["class_imbalance"]: 143 | X_train, y_train = down_sample(X_train, y_train, down_sample_seed) 144 | 145 | # encode label 146 | if dataset['ml_task'] == 'classification': 147 | y_train, y_test_list = encode_cat_label(y_train, y_test_list) 148 | 149 | # text embedding 150 | if "text_variables" in dataset.keys(): 151 | text_columns = dataset["text_variables"] 152 | X_train, X_test_list = encode_text_features(X_train, X_test_list, y_train, text_columns) 153 | 154 | # encode categorical features 155 | X_train, X_test_list =
encode_cat_features(X_train, X_test_list) 156 | 157 | # normalize data 158 | if normalize: 159 | scaler = StandardScaler() 160 | X_train = scaler.fit_transform(X_train) 161 | X_test_list = [scaler.transform(X_test) for X_test in X_test_list] 162 | 163 | return X_train, y_train, X_test_list, y_test_list, test_files 164 | 165 | -------------------------------------------------------------------------------- /schema/dataset.py: -------------------------------------------------------------------------------- 1 | """Define the domain of dataset""" 2 | from .error_type import * 3 | 4 | # details of each dataset 5 | Citation = { 6 | "data_dir": "Citation", 7 | "error_types": ["duplicates"], 8 | 'key_columns': ['title'], 9 | "label":"CS", 10 | "ml_task": "classification", 11 | "text_variables":["title"], 12 | } 13 | 14 | Marketing = { 15 | "data_dir": "Marketing", 16 | "error_types": ["missing_values"], 17 | "label": 'Income', 18 | "ml_task": "classification" 19 | } 20 | 21 | Airbnb = { 22 | "data_dir": "Airbnb", 23 | "error_types": ["duplicates", "outliers", "missing_values"], 24 | "label": 'Rating', 25 | "categorical_variables": ['Rating'], 26 | "ml_task": "classification", 27 | 'key_columns': ['latitude', 'longitude'], 28 | } 29 | 30 | Titanic = { 31 | "data_dir": "Titanic", 32 | "error_types": ["missing_values"], 33 | "drop_variables": ['PassengerId', 'Name'], 34 | "label": "Survived", 35 | "categorical_variables":["Survived"], 36 | "ml_task": "classification" 37 | } 38 | 39 | EEG = { 40 | "data_dir": "EEG", 41 | "error_types": ["outliers"], 42 | 'label':'Eye', 43 | "categorical_variables":['Eye'], 44 | "ml_task": "classification" 45 | } 46 | 47 | USCensus = { 48 | "data_dir": "USCensus", 49 | "error_types": ["missing_values"], 50 | "label": 'Income', 51 | "ml_task": "classification" 52 | } 53 | 54 | Restaurant = { 55 | "data_dir": "Restaurant", 56 | "error_types": ["duplicates", "inconsistency"], 57 | "label": "priceRange", 58 | "ml_task": "classification", 59 | "drop_variables": ["streetAddress", "telephone", "website"], 60 | "text_variables": ["name", "categories", "neighborhood"], 61 | "key_columns": ["telephone"] 62 | } 63 | 64 | Credit = { 65 | "data_dir": "Credit", 66 | "error_types": ["outliers", "missing_values"], 67 | "label": "SeriousDlqin2yrs", 68 | "categorical_variables":["SeriousDlqin2yrs"], 69 | "ml_task": "classification", 70 | "class_imbalance":True 71 | } 72 | 73 | Sensor = { 74 | "data_dir": "Sensor", 75 | "error_types": ["outliers"], 76 | "categorical_variables": ['moteid'], 77 | "label": 'moteid', 78 | "ml_task": "classification" 79 | } 80 | 81 | Movie = { 82 | "data_dir": "Movie", 83 | "error_types": ["duplicates", "inconsistency"], 84 | "key_columns": ["title", "year"], 85 | "categorical_variables": ["genres"], 86 | "text_variables": ["title"], 87 | "label": "genres", 88 | "ml_task": "classification" 89 | } 90 | 91 | Company = { 92 | "data_dir": "Company", 93 | "error_types": ["inconsistency"], 94 | "label": "Sentiment", 95 | "ml_task": "classification", 96 | "drop_variables": ["Date", "Unnamed: 0", "City"] 97 | } 98 | 99 | University = { 100 | "data_dir": "University", 101 | "error_types": ["inconsistency"], 102 | "label": "expenses thous$", 103 | "ml_task": "classification", 104 | "drop_variables": ["university name", "academic-emphasis"] 105 | } 106 | 107 | KDD_major = { 108 | "data_dir": "KDD_major", 109 | "error_types": ["mislabel"], 110 | "label": 'is_exciting_20', 111 | "ml_task": "classification", 112 | "class_imbalance": True, 113 | 
"categorical_variables":['is_exciting_20'], 114 | } 115 | 116 | KDD_minor = { 117 | "data_dir": "KDD_minor", 118 | "error_types": ["mislabel"], 119 | "label": 'is_exciting_20', 120 | "ml_task": "classification", 121 | "class_imbalance": True, 122 | "categorical_variables":['is_exciting_20'], 123 | } 124 | 125 | KDD_uniform = { 126 | "data_dir": "KDD_uniform", 127 | "error_types": ["mislabel"], 128 | "label": 'is_exciting_20', 129 | "ml_task": "classification", 130 | "class_imbalance": True, 131 | "categorical_variables":['is_exciting_20'], 132 | } 133 | 134 | USCensus_major = { 135 | "data_dir": "USCensus_major", 136 | "error_types": ["mislabel"], 137 | "label": 'Income', 138 | "ml_task": "classification" 139 | } 140 | 141 | USCensus_minor = { 142 | "data_dir": "USCensus_minor", 143 | "error_types": ["mislabel"], 144 | "label": 'Income', 145 | "ml_task": "classification" 146 | } 147 | 148 | USCensus_uniform = { 149 | "data_dir": "USCensus_uniform", 150 | "error_types": ["mislabel"], 151 | "label": 'Income', 152 | "ml_task": "classification" 153 | } 154 | 155 | EEG_major = { 156 | "data_dir": "EEG_major", 157 | "error_types": ["mislabel"], 158 | 'label':'Eye', 159 | "categorical_variables":['Eye'], 160 | "ml_task": "classification" 161 | } 162 | 163 | EEG_minor = { 164 | "data_dir": "EEG_minor", 165 | "error_types": ["mislabel"], 166 | 'label':'Eye', 167 | "categorical_variables":['Eye'], 168 | "ml_task": "classification" 169 | } 170 | 171 | EEG_uniform = { 172 | "data_dir": "EEG_uniform", 173 | "error_types": ["mislabel"], 174 | 'label':'Eye', 175 | "categorical_variables":['Eye'], 176 | "ml_task": "classification" 177 | } 178 | 179 | Titanic_uniform = { 180 | "data_dir": "Titanic_uniform", 181 | "error_types": ["mislabel"], 182 | "drop_variables": ['PassengerId', 'Name'], 183 | "label": "Survived", 184 | "categorical_variables":["Survived"], 185 | "ml_task": "classification" 186 | } 187 | 188 | Titanic_major = { 189 | "data_dir": "Titanic_major", 190 | "error_types": ["mislabel"], 191 | "drop_variables": ['PassengerId', 'Name'], 192 | "label": "Survived", 193 | "categorical_variables":["Survived"], 194 | "ml_task": "classification" 195 | } 196 | 197 | Titanic_minor = { 198 | "data_dir": "Titanic_minor", 199 | "error_types": ["mislabel"], 200 | "drop_variables": ['PassengerId', 'Name'], 201 | "label": "Survived", 202 | "categorical_variables":["Survived"], 203 | "ml_task": "classification" 204 | } 205 | 206 | Marketing_uniform = { 207 | "data_dir": "Marketing_uniform", 208 | "error_types": ["mislabel"], 209 | "label": 'Income', 210 | "ml_task": "classification" 211 | } 212 | 213 | Marketing_minor = { 214 | "data_dir": "Marketing_minor", 215 | "error_types": ["mislabel"], 216 | "label": 'Income', 217 | "ml_task": "classification" 218 | } 219 | 220 | Marketing_major = { 221 | "data_dir": "Marketing_major", 222 | "error_types": ["mislabel"], 223 | "label": 'Income', 224 | "ml_task": "classification" 225 | } 226 | Credit_uniform = { 227 | "data_dir": "Credit_uniform", 228 | "error_types": ["mislabel"], 229 | "label": "SeriousDlqin2yrs", 230 | "categorical_variables":["SeriousDlqin2yrs"], 231 | "ml_task": "classification", 232 | "class_imbalance":True 233 | } 234 | Credit_major = { 235 | "data_dir": "Credit_major", 236 | "error_types": ["mislabel"], 237 | "label": "SeriousDlqin2yrs", 238 | "categorical_variables":["SeriousDlqin2yrs"], 239 | "ml_task": "classification", 240 | "class_imbalance":True 241 | } 242 | Credit_minor = { 243 | "data_dir": "Credit_minor", 244 | "error_types": 
["mislabel"], 245 | "label": "SeriousDlqin2yrs", 246 | "categorical_variables":["SeriousDlqin2yrs"], 247 | "ml_task": "classification", 248 | "class_imbalance":True 249 | } 250 | 251 | BabyProduct = { 252 | "data_dir": "BabyProduct", 253 | "error_types": ["missing_values"], 254 | "label": "class", 255 | "ml_task": "classification" 256 | } 257 | 258 | Clothing = { 259 | "data_dir": "Clothing", 260 | "error_types": ["mislabel"], 261 | "label": "label", 262 | "ml_task": "classification" 263 | } 264 | 265 | # domain of dataset 266 | datasets = [Credit, Airbnb, USCensus, EEG, Titanic, 267 | Marketing, Sensor, Movie, Restaurant, Citation, 268 | Company, University, USCensus_uniform, USCensus_major, 269 | USCensus_minor, EEG_uniform, EEG_minor, EEG_major, 270 | Titanic_uniform, Titanic_minor, Titanic_major, 271 | Marketing_uniform, Marketing_major, Marketing_minor, 272 | Credit_uniform, Credit_major, Credit_minor, 273 | BabyProduct, Clothing] -------------------------------------------------------------------------------- /relation.py: -------------------------------------------------------------------------------- 1 | """Populate relations using training results""" 2 | import json 3 | import pandas as pd 4 | import numpy as np 5 | import utils 6 | from scipy.stats import ttest_rel 7 | import config 8 | import os 9 | from matplotlib import pyplot as plt 10 | from statsmodels.stats.multitest import multipletests, fdrcorrection_twostage 11 | import json 12 | import sys 13 | 14 | """Compare class""" 15 | class Compare(object): 16 | def __init__(self, result, compare_method, compare_metric): 17 | super(Compare, self).__init__() 18 | """ Compare 19 | Args: 20 | result (dict): result dict 21 | compare_method (fn): function to compare two metrics 22 | compare_metric (fn): function to specify metrics to be compared 23 | """ 24 | self.result = result 25 | self.compare_metric = compare_metric 26 | self.compare_method = compare_method 27 | 28 | self.four_metrics = {} 29 | self.compare_result = {} 30 | for error_type in config.error_types: 31 | self.compare_result[error_type['name']], self.four_metrics[error_type['name']] = self.compare_error(error_type['name']) 32 | 33 | # key order: error/clean_method/dataset/models/scenario/ [compare_keys...] 34 | self.compare_result = utils.flatten_dict(self.compare_result) 35 | 36 | # rearrange key order: error/dataset/clean_method/models/scenario/ [compare_keys...] 
37 | self.compare_result = utils.rearrange_dict(self.compare_result, [0, 2, 1, 3, 4]) 38 | 39 | def get_four_metrics(self, error_type, file_types): 40 | """Get four metrics (A, B, C, D) for all datasets in a table (pd.DataFrame) 41 | 42 | Args: 43 | error_type (string): error type 44 | file_types (list): names of two types of train or test files 45 | """ 46 | four_metrics = {} 47 | for (dataset, split_seed, error, train_file, model), value in self.result.items(): 48 | if error == error_type and train_file in file_types: 49 | for test_file in file_types: 50 | metric_name = self.compare_metric(dataset, error_type, test_file) 51 | metric = value[metric_name] 52 | four_metrics[(dataset, split_seed, train_file, model, test_file)] = metric 53 | 54 | four_metrics = utils.dict_to_df(four_metrics, [0, 2, 1], [3, 4]).sort_index() 55 | return four_metrics 56 | 57 | def compare_four_metrics(self, error_type, four_metrics, file_types): 58 | """Compute the relative difference between four metrics 59 | 60 | Args: 61 | four_metrics (pandas.DataFrame): four metrics 62 | file_types (list): names of two types of train or test files 63 | compare_method (fn): function to compare two metrics 64 | """ 65 | A = lambda m: m.loc[file_types[0], file_types[0]] 66 | B = lambda m: m.loc[file_types[0], file_types[1]] 67 | C = lambda m: m.loc[file_types[1], file_types[0]] 68 | D = lambda m: m.loc[file_types[1], file_types[1]] 69 | 70 | scenarios = { 71 | "CD":lambda m: self.compare_method(C(m), D(m)), 72 | "BD":lambda m: self.compare_method(B(m), D(m)), 73 | "AB":lambda m: self.compare_method(A(m), B(m)), 74 | "AC":lambda m: self.compare_method(A(m), C(m)) 75 | } 76 | 77 | comparison = {} 78 | datasets = list(set(four_metrics.index.get_level_values(0))) 79 | models = list(set(four_metrics.columns.get_level_values(0))) 80 | for dataset in datasets: 81 | for model in models: 82 | m = four_metrics.loc[dataset, model] 83 | for s in config.scenarios[error_type]: 84 | comparison[(dataset, model, s)] = scenarios[s](m) 85 | # comparison = utils.dict_to_df(comparison, [0, 1], [2]) 86 | return comparison 87 | 88 | def compare_error(self, error_type): 89 | """Compare four metrics based on compared method given error_type 90 | 91 | Args: 92 | error_type (string): error type 93 | 94 | Return: 95 | clean_method/dataset/model/scenario/compare_method:result 96 | 97 | """ 98 | ## each error has two types of files 99 | # file type 1 100 | file1 = "delete" if error_type == "missing_values" else "dirty" 101 | file2 = list(set([k[3] for k in self.result.keys() if k[2] == error_type and k[3] != file1])) 102 | comparisons = {} 103 | metrics = {} 104 | 105 | for f2 in file2: 106 | file_types = [file1, f2] 107 | four_metrics = self.get_four_metrics(error_type, file_types) 108 | comparison = self.compare_four_metrics(error_type, four_metrics, file_types) 109 | metrics[f2] = four_metrics 110 | comparisons[f2] = comparison 111 | return comparisons, metrics 112 | 113 | def save_four_metrics(self, save_dir): 114 | for error_type in config.error_types: 115 | save_path = os.path.join(save_dir, "{}_four_metrics.xlsx".format(error_type['name'])) 116 | utils.dfs_to_xls(self.four_metrics[error_type['name']], save_path) 117 | flat_metrics = utils.flatten_dict(self.four_metrics) 118 | 119 | """Comparing method""" 120 | def t_test(dirty, clean): 121 | def two_tailed_t_test(dirty, clean): 122 | n_d = len(dirty) 123 | n_c = len(clean) 124 | n = min(n_d, n_c) 125 | t, p = ttest_rel(clean[:n], dirty[:n]) 126 | if np.isnan(t): 127 | t, p = 0, 1 128 | return 
{"t-stats":t, "p-value":p} 129 | 130 | def one_tailed_t_test(dirty, clean, direction): 131 | two_tail = two_tailed_t_test(dirty, clean) 132 | t, p_two = two_tail['t-stats'], two_tail['p-value'] 133 | if direction == 'positive': 134 | if t > 0 : 135 | p = p_two * 0.5 136 | else: 137 | p = 1 - p_two * 0.5 138 | else: 139 | if t < 0: 140 | p = p_two * 0.5 141 | else: 142 | p = 1 - p_two * 0.5 143 | return {"t-stats":t, "p-value":p} 144 | 145 | result = {} 146 | result['two_tail'] = two_tailed_t_test(dirty, clean) 147 | result['one_tail_pos'] = one_tailed_t_test(dirty, clean, 'positive') 148 | result['one_tail_neg'] = one_tailed_t_test(dirty, clean, 'negative') 149 | return result 150 | 151 | def mean_f1(dirty, clean): 152 | result = {"dirty_f1": np.mean(dirty), "clean_f1":np.mean(clean)} 153 | return result 154 | 155 | def mean_acc(dirty, clean): 156 | result = {"dirty_acc": np.mean(dirty), "clean_acc":np.mean(clean)} 157 | return result 158 | 159 | def diff_f1(dirty, clean): 160 | result = {"diff_f1": np.mean((clean - dirty) / dirty)} 161 | return result 162 | 163 | def diff_acc(dirty, clean): 164 | result = {"diff_acc": np.mean((clean - dirty) / dirty)} 165 | return result 166 | 167 | def direct_count(dirty, clean): 168 | result = {"pos_count": np.sum(dirty - clean < -1e-8), "neg_count": np.sum(dirty - clean > 1e-8), "same_count": np.sum(np.abs(dirty - clean) < 1e-8)} 169 | return result 170 | 171 | """Comparing metrics""" 172 | def test_f1(dataset_name, error_type, test_file): 173 | metric = test_file + "_test_f1" 174 | return metric 175 | 176 | def test_acc(dataset_name, error_type, test_file): 177 | metric = test_file + "_test_acc" 178 | return metric 179 | 180 | def mixed_f1_acc(dataset_name, error_type, test_file): 181 | if error_type == 'mislabel': 182 | dataset_name = dataset_name.split('_')[0] 183 | dataset = utils.get_dataset(dataset_name) 184 | if ('class_imbalance' in dataset.keys() and dataset['class_imbalance']): 185 | metric = test_file + "_test_f1" 186 | else: 187 | metric = test_file + "_test_acc" 188 | return metric 189 | 190 | """Multiple hypothesis test """ 191 | def hypothesis_test(t_test_results, alpha=0.05, multiple_test_method='fdr_by'): 192 | # convert to pd.DataFrame 193 | t_test_results_df = utils.dict_to_df(t_test_results, [0, 1, 2, 3, 4], [5, 6]) 194 | 195 | # run BY procedure 196 | rejects = {} 197 | correct_p_vals = {} 198 | test_types = ['two_tail', 'one_tail_pos','one_tail_neg'] 199 | pvals = [t_test_results_df.loc[:, (test_type, 'p-value')].values for test_type in test_types] 200 | pvals = np.concatenate(pvals, axis=0) 201 | print("# hypothesis:", len(pvals)) 202 | rej, cor_p, m0, alpha_stages = multipletests(pvals, method=multiple_test_method, alpha=alpha) 203 | # print(np.max(pvals[rej]), np.max(cor_p[rej])) 204 | rej = np.split(rej, 3) 205 | cor_p = np.split(cor_p, 3) 206 | for test_type, r, p in zip(test_types, rej, cor_p): 207 | rejects[test_type] = pd.DataFrame(r, index=t_test_results_df.index, columns=['reject']) 208 | correct_p_vals[test_type] = pd.DataFrame(p, index=t_test_results_df.index, columns=['p-value']) 209 | 210 | hypothesis_result = {} 211 | for e, d, c, m, s, _, _ in t_test_results.keys(): 212 | hypothesis_result[(e, d, c, m, s, 'two_tail_pvalue')] = correct_p_vals['two_tail'].loc[(e, d, c, m, s),'p-value'] 213 | hypothesis_result[(e, d, c, m, s, 'pos_pvalue')] = correct_p_vals['one_tail_pos'].loc[(e, d, c, m, s),'p-value'] 214 | hypothesis_result[(e, d, c, m, s, 'neg_pvalue')] = correct_p_vals['one_tail_neg'].loc[(e, d, c, m, 
s),'p-value'] 215 | pos = rejects['one_tail_pos'].loc[(e, d, c, m, s), 'reject'] 216 | neg = rejects['one_tail_neg'].loc[(e, d, c, m, s), 'reject'] 217 | sig = rejects['two_tail'].loc[(e, d, c, m, s), 'reject'] 218 | 219 | if sig and pos: 220 | hypothesis_result[(e, d, c, m, s, 'flag')] = 'P' 221 | elif sig and neg: 222 | hypothesis_result[(e, d, c, m, s, 'flag')] = 'N' 223 | else: 224 | hypothesis_result[(e, d, c, m, s, 'flag')] = 'S' 225 | return hypothesis_result 226 | 227 | """Group and split the result """ 228 | def split_clean_method(result): 229 | new_result = {} 230 | for (error, dataset, clean_method, model, scenario, comp_key), value in result.items(): 231 | if error == 'outliers': 232 | detect = clean_method.split('_')[1] 233 | repair = clean_method.replace('_{}'.format(detect), '') 234 | else: 235 | detect = 'detect' 236 | repair = clean_method 237 | new_result[(error, dataset, detect, repair, model, scenario, comp_key)] = value 238 | return new_result 239 | 240 | def group_by_mean(result): 241 | # group by training seed and reduce by mean 242 | result = utils.group(result, 5) 243 | result = utils.reduce_by_mean(result) 244 | return result 245 | 246 | def group_by_max(result): 247 | result = utils.group(result, 5) 248 | result = utils.reduce_by_max_val(result) 249 | return result 250 | 251 | def group_by_best_model(result): 252 | # select best model by max val acc 253 | result = utils.group(result, 5) 254 | result = utils.reduce_by_max_val(result) 255 | result = utils.group(result, 4, keepdim=True) 256 | result = utils.reduce_by_max_val(result, dim=4, dim_name="model") 257 | return result 258 | 259 | def group_by_best_model_clean(result_best_model): 260 | # select best model by max val acc 261 | result = utils.group_reduce_by_best_clean(result_best_model) 262 | return result 263 | 264 | def elim_redundant_dim(relation, dims): 265 | new_rel = {} 266 | for k, v in relation.items(): 267 | new_key = tuple([k[i] for i in range(len(k)) if i not in dims]) 268 | new_rel[new_key] = v 269 | return new_rel 270 | 271 | """Populate relations""" 272 | def populate_relation(result, name, alphas=[0.05], split_detect=True, multiple_test_method='fdr_by'): 273 | print("Populate relation", name) 274 | # create save folder 275 | save_dir = utils.makedirs([config.analysis_dir, name]) 276 | relation_dir = utils.makedirs([save_dir, 'relations']) 277 | metric_dir = utils.makedirs([save_dir, 'four_metrics']) 278 | 279 | # get other attributes 280 | attr_mean_acc = Compare(result, mean_acc, test_acc).compare_result # attr: dirty_acc, clean_acc 281 | attr_diff_acc = Compare(result, diff_acc, test_acc).compare_result # attr: diff_acc 282 | attr_mean_f1 = Compare(result, mean_f1, test_f1).compare_result # attr: dirty_f1, clean_f1 283 | attr_diff_f1 = Compare(result, diff_f1, test_f1).compare_result # attr: diff_f1 284 | attr_count = Compare(result, direct_count, mixed_f1_acc).compare_result # attr: pos count, neg count, same count 285 | 286 | # run t-test 287 | t_test_comp = Compare(result, t_test, mixed_f1_acc) 288 | t_test_comp.save_four_metrics(metric_dir) 289 | 290 | # hypothesis test 291 | for alpha in alphas: 292 | # print(alpha) 293 | # get attribute flag by multiple hypothesis test 294 | attr_flag = hypothesis_test(t_test_comp.compare_result, alpha, multiple_test_method) 295 | 296 | # populate relation with all of attributes 297 | relation = {**attr_flag, **attr_mean_acc, **attr_mean_f1, **attr_diff_acc, **attr_diff_f1, **attr_count} 298 | 299 | # split detect 300 | if split_detect and name != "R3": 
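# Example of the split performed below (the tag is one of the outlier clean
# methods used elsewhere in this repo): for error type "outliers", a combined
# tag such as "clean_SD_impute_mean_dummy" becomes detect="SD" and
# repair="clean_impute_mean_dummy"; every other error type gets the
# placeholder detect="detect" and keeps the original clean method as repair.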
301 | relation = split_clean_method(relation) 302 | 303 | # eliminate redundant attribute for R2 and R3 304 | 305 | if name == "R2": 306 | redundant_dims = [4] if split_detect else [3] 307 | relation = elim_redundant_dim(relation, redundant_dims) 308 | if name == "R3": 309 | redundant_dims = [2, 3] 310 | relation = elim_redundant_dim(relation, redundant_dims) 311 | 312 | # convert dict to df 313 | n_key = len(list(relation.keys())[0]) 314 | relation_df = utils.dict_to_df(relation, list(range(n_key-1)), [n_key-1]) 315 | 316 | # save relation to csv and pkl 317 | relation_csv_dir = utils.makedirs([relation_dir, 'csv']) 318 | save_path = os.path.join(relation_csv_dir, '{}_{}.csv'.format(name, "{:.6f}".format(alpha).rstrip('0'))) 319 | 320 | relation_df = relation_df.reset_index() 321 | 322 | if name == "R1": 323 | relation_df.rename(columns={"level_0": "error_type", "level_1":"dataset", "level_2": "detect_method", 324 | "level_3": "repair_method", "level_4": "model", "level_5": "scenario"}, inplace=True) 325 | elif name == "R2": 326 | relation_df.rename(columns={"level_0": "error_type", "level_1":"dataset", "level_2": "detect_method", 327 | "level_3": "repair_method", "level_4": "scenario"}, inplace=True) 328 | else: 329 | relation_df.rename(columns={"level_0": "error_type", "level_1":"dataset", "level_2": "scenario"}, inplace=True) 330 | 331 | relation_df.to_csv(save_path, index=False) 332 | 333 | relation_pkl_dir = utils.makedirs([relation_dir, 'pkl']) 334 | save_path = os.path.join(relation_pkl_dir, '{}_{}.pkl'.format(name, "{:.6f}".format(alpha).rstrip('0'))) 335 | utils.df_to_pickle(relation_df, save_path) 336 | 337 | def populate(alphas, save_training=False): 338 | """Populate R1, R2 and R3""" 339 | result = utils.load_result(parse_key=True) 340 | 341 | if save_training: 342 | save_dir = os.path.join(config.analysis_dir, "training_result") 343 | utils.result_to_table(result, save_dir) 344 | 345 | # populate R1 346 | result_vanilla = group_by_max(result) 347 | populate_relation(result_vanilla, "R1", alphas=alphas) 348 | 349 | # populate R2 350 | result_best_model = group_by_best_model(result) 351 | populate_relation(result_best_model, "R2", alphas=alphas) 352 | 353 | # # populate R3 354 | result_best_model_clean = group_by_best_model_clean(result_best_model) 355 | populate_relation(result_best_model_clean, "R3", alphas=alphas) -------------------------------------------------------------------------------- /schema/clean_method.py: -------------------------------------------------------------------------------- 1 | # define the domain of cleaning method 2 | import numpy as np 3 | import pandas as pd 4 | from sklearn.neighbors import LocalOutlierFactor 5 | from sklearn.ensemble import IsolationForest 6 | from sklearn import preprocessing 7 | from sklearn.preprocessing import StandardScaler 8 | from sklearn.cluster import DBSCAN 9 | import sys 10 | import utils 11 | import os 12 | 13 | class MVCleaner(object): 14 | def __init__(self, method='delete', **kwargs): 15 | self.method = method 16 | self.kwargs = kwargs 17 | self.is_fit = False 18 | if method == 'impute': 19 | if 'num' not in kwargs or 'cat' not in kwargs: 20 | print("Must give imputation method for numerical and categorical data") 21 | sys.exit(1) 22 | self.tag = "impute_{}_{}".format(kwargs['num'], kwargs['cat']) 23 | else: 24 | self.tag = "delete" 25 | 26 | def detect(self, df): 27 | return df.isnull() 28 | 29 | def fit(self, dataset, df): 30 | if self.method == 'impute': 31 | num_method = self.kwargs['num'] 32 | cat_method = 
self.kwargs['cat'] 33 | num_df = df.select_dtypes(include='number') 34 | cat_df = df.select_dtypes(exclude='number') 35 | if num_method == "mean": 36 | num_imp = num_df.mean() 37 | if num_method == "median": 38 | num_imp = num_df.median() 39 | if num_method == "mode": 40 | num_imp = num_df.mode().iloc[0] 41 | 42 | if cat_method == "mode": 43 | cat_imp = cat_df.mode().iloc[0] 44 | if cat_method == "dummy": 45 | cat_imp = ['missing'] * len(cat_df.columns) 46 | cat_imp = pd.Series(cat_imp, index=cat_df.columns) 47 | self.impute = pd.concat([num_imp, cat_imp], axis=0) 48 | self.is_fit = True 49 | 50 | def repair(self, df): 51 | if self.method == 'delete': 52 | df_clean = df.dropna() 53 | 54 | if self.method == 'impute': 55 | df_clean = df.fillna(value=self.impute) 56 | return df_clean 57 | 58 | def clean_df(self, df): 59 | if not self.is_fit: 60 | print('Must fit before clean.') 61 | sys.exit() 62 | mv_mat = self.detect(df) 63 | df_clean = self.repair(df) 64 | return df_clean, mv_mat 65 | 66 | def clean(self, dirty_train, dirty_test): 67 | clean_train, indicator_train = self.clean_df(dirty_train) 68 | clean_test, indicator_test = self.clean_df(dirty_test) 69 | return clean_train, indicator_train, clean_test, indicator_test 70 | 71 | class DuplicatesCleaner(object): 72 | def __init__(self): 73 | super(DuplicatesCleaner, self).__init__() 74 | 75 | def fit(self, dataset, df): 76 | self.keys = dataset['key_columns'] 77 | 78 | def detect(self, df, keys): 79 | key_col = pd.DataFrame(df, columns=keys) 80 | is_dup = key_col.duplicated(keep='first') 81 | is_dup = pd.DataFrame(is_dup, columns=['is_dup']) 82 | return is_dup 83 | 84 | def repair(self, df, is_dup): 85 | not_dup = (is_dup.values == False) 86 | df_clean = df[not_dup] 87 | return df_clean 88 | 89 | def clean_df(self, df): 90 | is_dup = self.detect(df, self.keys) 91 | df_clean = self.repair(df, is_dup) 92 | return df_clean, is_dup 93 | 94 | def clean(self, dirty_train, dirty_test): 95 | clean_train, indicator_train = self.clean_df(dirty_train) 96 | clean_test, indicator_test = self.clean_df(dirty_test) 97 | return clean_train, indicator_train, clean_test, indicator_test 98 | 99 | class InconsistencyCleaner(object): 100 | def __init__(self): 101 | super(InconsistencyCleaner, self).__init__() 102 | 103 | def fit(self, dataset, dirty_train): 104 | dirty_raw_path = utils.get_dir(dataset, 'raw', 'raw.csv') 105 | clean_raw_path = utils.get_dir(dataset, 'raw', 'inconsistency_clean_raw.csv') 106 | if not os.path.exists(clean_raw_path): 107 | print("Must provide clean version of raw data for cleaning inconsistency") 108 | sys.exit(1) 109 | dirty_raw = utils.load_df(dataset, dirty_raw_path) 110 | clean_raw = utils.load_df(dataset, clean_raw_path) 111 | N, m = dirty_raw.shape 112 | dirty_raw = dirty_raw.values 113 | clean_raw = clean_raw.values 114 | mask = (dirty_raw != clean_raw) 115 | dirty = dirty_raw[mask] 116 | clean = clean_raw[mask] 117 | self.incon_dict = dict(zip(dirty, clean)) 118 | 119 | def clean_df(self, df): 120 | df_clean = df.copy() 121 | N, m = df_clean.shape 122 | indicator = np.zeros_like(df_clean).astype(bool) 123 | 124 | for i in range(N): 125 | for j in range(m): 126 | if df_clean.iloc[i, j] in self.incon_dict.keys(): 127 | df_clean.iloc[i, j] = self.incon_dict[df_clean.iloc[i, j]] 128 | indicator[i, j] = True 129 | indicator = pd.DataFrame(indicator, columns=df.columns) 130 | return df_clean, indicator 131 | 132 | def clean(self, dirty_train, dirty_test): 133 | clean_train, indicator_train = self.clean_df(dirty_train) 134 | 
clean_test, indicator_test = self.clean_df(dirty_test) 135 | return clean_train, indicator_train, clean_test, indicator_test 136 | 137 | class InconsistencyHumanCleaner(object): 138 | def __init__(self): 139 | super(InconsistencyHumanCleaner, self).__init__() 140 | 141 | def fit(self, dataset, dirty_train): 142 | dirty_raw_path = utils.get_dir(dataset, 'raw', 'raw.csv') 143 | clean_raw_path = utils.get_dir(dataset, 'raw', 'inconsistency_human-clean_raw.csv') 144 | if not os.path.exists(clean_raw_path): 145 | print("Must provide clean version of raw data for cleaning inconsistency") 146 | sys.exit(1) 147 | dirty_raw = utils.load_df(dataset, dirty_raw_path) 148 | clean_raw = utils.load_df(dataset, clean_raw_path) 149 | N, m = dirty_raw.shape 150 | dirty_raw = dirty_raw.values 151 | clean_raw = clean_raw.values 152 | mask = (dirty_raw != clean_raw) 153 | dirty = dirty_raw[mask] 154 | clean = clean_raw[mask] 155 | self.incon_dict = dict(zip(dirty, clean)) 156 | 157 | def clean_df(self, df): 158 | df_clean = df.copy() 159 | N, m = df_clean.shape 160 | indicator = np.zeros_like(df_clean).astype(bool) 161 | 162 | for i in range(N): 163 | for j in range(m): 164 | if df_clean.iloc[i, j] in self.incon_dict.keys(): 165 | df_clean.iloc[i, j] = self.incon_dict[df_clean.iloc[i, j]] 166 | indicator[i, j] = True 167 | indicator = pd.DataFrame(indicator, columns=df.columns) 168 | return df_clean, indicator 169 | 170 | def clean(self, dirty_train, dirty_test): 171 | clean_train, indicator_train = self.clean_df(dirty_train) 172 | clean_test, indicator_test = self.clean_df(dirty_test) 173 | return clean_train, indicator_train, clean_test, indicator_test 174 | 175 | def SD(x, nstd=3.0): 176 | # Standard Deviation Method (Univariate) 177 | mean, std = np.mean(x), np.std(x) 178 | cut_off = std * nstd 179 | lower, upper = mean - cut_off, mean + cut_off 180 | return lambda y: (y > upper) | (y < lower) 181 | 182 | def IQR(x, k=1.5): 183 | # Interquartile Range (Univariate) 184 | q25, q75 = np.percentile(x, 25), np.percentile(x, 75) 185 | iqr = q75 - q25 186 | cut_off = iqr * k 187 | lower, upper = q25 - cut_off, q75 + cut_off 188 | return lambda y: (y > upper) | (y < lower) 189 | 190 | def IF(x, contamination=0.01): 191 | # Isolation Forest (Univariate) 192 | clf = IsolationForest(contamination=contamination) 193 | clf.fit(x.reshape(-1, 1)) 194 | return lambda y: (clf.predict(y.reshape(-1, 1)) == -1) 195 | 196 | class OutlierCleaner(object): 197 | def __init__(self, detect_method, repairer=MVCleaner('delete'), **kwargs): 198 | super(OutlierCleaner, self).__init__() 199 | detect_fn_dict = {'SD':SD, 'IQR':IQR, "IF":IF} 200 | self.detect_method = detect_method 201 | self.detect_fn = detect_fn_dict[detect_method] 202 | self.repairer = repairer 203 | self.kwargs = kwargs 204 | self.tag = "{}_{}".format(detect_method, repairer.tag) 205 | self.is_fit = False 206 | 207 | def fit(self, dataset, df): 208 | num_df = df.select_dtypes(include='number') 209 | cat_df = df.select_dtypes(exclude='number') 210 | X = num_df.values 211 | m = X.shape[1] 212 | 213 | self.detectors = [] 214 | for i in range(m): 215 | x = X[:, i] 216 | detector = self.detect_fn(x, **self.kwargs) 217 | self.detectors.append(detector) 218 | 219 | ind = self.detect(df) 220 | df_copy = df.copy() 221 | df_copy[ind] = np.nan 222 | self.repairer.fit(dataset, df_copy) 223 | self.is_fit = True 224 | 225 | def detect(self, df): 226 | num_df = df.select_dtypes(include='number') 227 | cat_df = df.select_dtypes(exclude='number') 228 | X = num_df.values 229 | m =
X.shape[1] 230 | 231 | ind_num = np.zeros_like(num_df).astype('bool') 232 | ind_cat = np.zeros_like(cat_df).astype('bool') 233 | for i in range(m): 234 | x = X[:, i] 235 | detector = self.detectors[i] 236 | is_outlier = detector(x) 237 | ind_num[:, i] = is_outlier 238 | 239 | ind_num = pd.DataFrame(ind_num, columns=num_df.columns) 240 | ind_cat = pd.DataFrame(ind_cat, columns=cat_df.columns) 241 | ind = pd.concat([ind_num, ind_cat], axis=1).reindex(columns=df.columns) 242 | return ind 243 | 244 | def repair(self, df, ind): 245 | df_copy = df.copy() 246 | df_copy[ind] = np.nan 247 | df_clean, _ = self.repairer.clean_df(df_copy) 248 | return df_clean 249 | 250 | def clean_df(self, df, ignore=None): 251 | if not self.is_fit: 252 | print("Must fit before clean") 253 | sys.exit() 254 | ind = self.detect(df) 255 | if ignore is not None: 256 | ind.loc[:, ignore] = False 257 | df_clean = self.repair(df, ind) 258 | return df_clean, ind 259 | 260 | def clean(self, dirty_train, dirty_test): 261 | clean_train, indicator_train = self.clean_df(dirty_train) 262 | clean_test, indicator_test = self.clean_df(dirty_test) 263 | return clean_train, indicator_train, clean_test, indicator_test 264 | 265 | class MislabelCleaner(object): 266 | def __init__(self): 267 | super(MislabelCleaner, self).__init__() 268 | 269 | def fit(self, dataset, dirty_train): 270 | index_train_path = utils.get_dir(dataset, 'raw', 'idx_train.csv') 271 | index_test_path = utils.get_dir(dataset, 'raw', 'idx_test.csv') 272 | index_train = pd.read_csv(index_train_path).values.reshape(-1) 273 | index_test = pd.read_csv(index_test_path).values.reshape(-1) 274 | clean_path = utils.get_dir(dataset, 'raw', 'mislabel_clean_raw.csv') 275 | clean = utils.load_df(dataset, clean_path) 276 | self.clean_train = clean.loc[index_train, :].reset_index(drop=True) 277 | self.clean_test = clean.loc[index_test, :].reset_index(drop=True) 278 | 279 | def clean(self, dirty_train, dirty_test): 280 | indicator_train = pd.DataFrame(dirty_train.values != self.clean_train.values, columns=dirty_train.columns) 281 | indicator_test = pd.DataFrame(dirty_test.values != self.clean_test.values, columns=dirty_train.columns) 282 | return self.clean_train, indicator_train, self.clean_test, indicator_test 283 | 284 | class MislabelHumanCleaner(object): 285 | def __init__(self): 286 | super(MislabelHumanCleaner, self).__init__() 287 | 288 | def fit(self, dataset, dirty_train): 289 | index_train_path = utils.get_dir(dataset, 'raw', 'idx_train.csv') 290 | index_test_path = utils.get_dir(dataset, 'raw', 'idx_test.csv') 291 | index_train = pd.read_csv(index_train_path).values.reshape(-1) 292 | index_test = pd.read_csv(index_test_path).values.reshape(-1) 293 | clean_path = utils.get_dir(dataset, 'raw', 'Humanclean_mislabel_clean.csv') 294 | clean = utils.load_df(dataset, clean_path) 295 | self.clean_train = clean.loc[index_train, :].reset_index(drop=True) 296 | self.clean_test = clean.loc[index_test, :].reset_index(drop=True) 297 | 298 | def clean(self, dirty_train, dirty_test): 299 | indicator_train = pd.DataFrame(dirty_train.values != self.clean_train.values, columns=dirty_train.columns) 300 | indicator_test = pd.DataFrame(dirty_test.values != self.clean_test.values, columns=dirty_train.columns) 301 | return self.clean_train, indicator_train, self.clean_test, indicator_test 302 | 303 | class AutoERCleaner(object): 304 | """docstring for AutoERCleaner""" 305 | def __init__(self, remove_mv=True): 306 | super(AutoERCleaner, self).__init__() 307 | self.remove_mv = remove_mv 308 | 309 
| def fit(self, dataset, dirty_train): 310 | index_train_path = utils.get_dir(dataset, 'raw', 'idx_train.csv') 311 | index_test_path = utils.get_dir(dataset, 'raw', 'idx_test.csv') 312 | dirty_train_path = utils.get_dir(dataset, 'raw', 'dirty_train.csv') 313 | dirty_test_path = utils.get_dir(dataset, 'raw', 'dirty_test.csv') 314 | 315 | index_train = pd.read_csv(index_train_path).values.reshape(-1) 316 | index_test = pd.read_csv(index_test_path).values.reshape(-1) 317 | ind_path = utils.get_dir(dataset, 'raw', 'AutoER.csv') 318 | 319 | autoer_result = pd.read_csv(ind_path).values.reshape(-1) 320 | 321 | ind_train = autoer_result[index_train] 322 | ind_test = autoer_result[index_test] 323 | 324 | dirty_train = pd.read_csv(dirty_train_path) 325 | dirty_test = pd.read_csv(dirty_test_path) 326 | 327 | if self.remove_mv: 328 | train_mv = dirty_train.isnull().values.any(axis=1) 329 | test_mv = dirty_test.isnull().values.any(axis=1) 330 | ind_train = ind_train[train_mv == False] 331 | ind_test = ind_test[test_mv == False] 332 | 333 | ind_train = pd.DataFrame(ind_train.reshape(-1, 1), columns=["label"]) 334 | ind_test = pd.DataFrame(ind_test.reshape(-1, 1), columns=["label"]) 335 | 336 | self.ind_train = ind_train.duplicated(keep="first").values 337 | self.ind_test = ind_test.duplicated(keep="first").values 338 | self.ind_train[ind_train["label"] == -1] = False 339 | self.ind_test[ind_test["label"] == -1] = False 340 | 341 | def repair(self, df, is_dup): 342 | assert len(df) == len(is_dup) 343 | df_clean = df[is_dup == False] 344 | return df_clean 345 | 346 | def clean(self, dirty_train, dirty_test): 347 | clean_train = self.repair(dirty_train, self.ind_train) 348 | clean_test = self.repair(dirty_test, self.ind_test) 349 | ind_train = pd.DataFrame(self.ind_train, columns=["is_dup"]) 350 | ind_test = pd.DataFrame(self.ind_test, columns=["is_dup"]) 351 | 352 | return clean_train, ind_train, clean_test, ind_test 353 | 354 | class FDCleaner(object): 355 | def __init__(self): 356 | super(FDCleaner, self).__init__() 357 | 358 | def fit(self, dataset, dirty_train): 359 | dirty_raw_path = utils.get_dir(dataset, 'raw', 'raw.csv') 360 | clean_raw_path = utils.get_dir(dataset, 'raw', 'FD.csv') 361 | if not os.path.exists(clean_raw_path): 362 | print("Must provide clean version of raw data for cleaning inconsistency") 363 | sys.exit(1) 364 | dirty_raw = utils.load_df(dataset, dirty_raw_path) 365 | clean_raw = utils.load_df(dataset, clean_raw_path) 366 | 367 | N, m = dirty_raw.shape 368 | dirty_raw = dirty_raw.values 369 | clean_raw = clean_raw.values 370 | mask = (dirty_raw != clean_raw) 371 | dirty = dirty_raw[mask] 372 | clean = clean_raw[mask] 373 | self.incon_dict = dict(zip(dirty, clean)) 374 | 375 | def clean_df(self, df): 376 | df_clean = df.copy() 377 | N, m = df_clean.shape 378 | indicator = np.zeros_like(df_clean).astype(bool) 379 | 380 | for i in range(N): 381 | for j in range(m): 382 | if df_clean.iloc[i, j] in self.incon_dict.keys(): 383 | df_clean.iloc[i, j] = self.incon_dict[df_clean.iloc[i, j]] 384 | indicator[i, j] = True 385 | indicator = pd.DataFrame(indicator, columns=df.columns) 386 | return df_clean, indicator 387 | 388 | def clean(self, dirty_train, dirty_test): 389 | clean_train, indicator_train = self.clean_df(dirty_train) 390 | clean_test, indicator_test = self.clean_df(dirty_test) 391 | return clean_train, indicator_train, clean_test, indicator_test 392 | 393 | class MVHoloCleaner(object): 394 | def __init__(self): 395 | self.tag = "impute_holoclean" 396 | 397 | def detect(self, 
df): 398 | return df.isnull() 399 | 400 | def fit(self, dataset, df): 401 | clean_raw_path = utils.get_dir(dataset, 'raw', 'Holoclean_mv_clean.csv') 402 | clean_raw = pd.read_csv(clean_raw_path) 403 | 404 | index_train_path = utils.get_dir(dataset, 'raw', 'idx_train.csv') 405 | index_test_path = utils.get_dir(dataset, 'raw', 'idx_test.csv') 406 | index_train = pd.read_csv(index_train_path).values.reshape(-1) 407 | index_test = pd.read_csv(index_test_path).values.reshape(-1) 408 | 409 | self.clean_train = clean_raw.iloc[index_train, :] 410 | self.clean_test = clean_raw.iloc[index_test, :] 411 | 412 | def clean(self, dirty_train, dirty_test): 413 | indicator_train = self.detect(dirty_train) 414 | indicator_test = self.detect(dirty_test) 415 | 416 | clean_train = self.clean_train 417 | clean_test =self.clean_test 418 | return clean_train, indicator_train, clean_test, indicator_test 419 | 420 | class MVHumanCleaner(object): 421 | def __init__(self): 422 | self.tag = "impute_human" 423 | 424 | def detect(self, df): 425 | return df.isnull() 426 | 427 | def fit(self, dataset, df): 428 | clean_raw_path = utils.get_dir(dataset, 'raw', 'Humanclean_mv_clean.csv') 429 | clean_raw = pd.read_csv(clean_raw_path) 430 | 431 | index_train_path = utils.get_dir(dataset, 'raw', 'idx_train.csv') 432 | index_test_path = utils.get_dir(dataset, 'raw', 'idx_test.csv') 433 | index_train = pd.read_csv(index_train_path).values.reshape(-1) 434 | index_test = pd.read_csv(index_test_path).values.reshape(-1) 435 | 436 | self.clean_train = clean_raw.iloc[index_train, :] 437 | self.clean_test = clean_raw.iloc[index_test, :] 438 | 439 | def clean(self, dirty_train, dirty_test): 440 | indicator_train = self.detect(dirty_train) 441 | indicator_test = self.detect(dirty_test) 442 | 443 | clean_train = self.clean_train 444 | clean_test =self.clean_test 445 | return clean_train, indicator_train, clean_test, indicator_test 446 | 447 | class OutlierHoloCleaner(object): 448 | def __init__(self): 449 | self.tag = "impute_holoclean" 450 | 451 | def fit(self, dataset, df): 452 | clean_raw_path = utils.get_dir(dataset, 'raw', 'Holoclean_outlier_clean.csv') 453 | index_train_path = utils.get_dir(dataset, 'raw', 'idx_train.csv') 454 | index_test_path = utils.get_dir(dataset, 'raw', 'idx_test.csv') 455 | 456 | index_train = pd.read_csv(index_train_path).values.reshape(-1) 457 | index_test = pd.read_csv(index_test_path).values.reshape(-1) 458 | clean_raw = pd.read_csv(clean_raw_path) 459 | 460 | if 'missing_values' in dataset['error_types']: 461 | dirty_train = pd.read_csv(utils.get_dir(dataset, 'raw', 'dirty_train.csv')) 462 | dirty_test = pd.read_csv(utils.get_dir(dataset, 'raw', 'dirty_test.csv')) 463 | raw = pd.read_csv(utils.get_dir(dataset, 'raw', 'raw.csv')) 464 | raw_mv_rows = raw.isnull().values.any(axis=1) 465 | train_mv_rows = dirty_train.isnull().values.any(axis=1) 466 | test_mv_rows = dirty_test.isnull().values.any(axis=1) 467 | 468 | old_index = np.arange(len(raw))[raw_mv_rows == False] 469 | new_index = np.arange(len(raw) - sum(raw_mv_rows)) 470 | index_map = {} 471 | 472 | for o, n in zip(old_index, new_index): 473 | index_map[o] = n 474 | 475 | index_train_no_mv = index_train[train_mv_rows == False] 476 | index_test_no_mv = index_test[test_mv_rows == False] 477 | 478 | index_train = [index_map[i] for i in index_train_no_mv] 479 | index_test = [index_map[i] for i in index_test_no_mv] 480 | 481 | self.clean_train = clean_raw.iloc[index_train, :] 482 | self.clean_test = clean_raw.iloc[index_test, :] 483 | 484 | def clean(self, 
dirty_train, dirty_test): 485 | indicator_train = pd.DataFrame(dirty_train.values != self.clean_train.values, columns=dirty_train.columns) 486 | indicator_test = pd.DataFrame(dirty_test.values != self.clean_test.values, columns=dirty_train.columns) 487 | 488 | clean_train = self.clean_train 489 | clean_test =self.clean_test 490 | return clean_train, indicator_train, clean_test, indicator_test -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | import config 4 | import sys 5 | import json 6 | import numpy as np 7 | from matplotlib import pyplot as plt 8 | import shutil 9 | from collections import defaultdict 10 | 11 | # ============================================================================= 12 | # Data related utils 13 | # ============================================================================= 14 | 15 | def get_dataset(name): 16 | """Get dataset dict in config.py given name 17 | 18 | Args: 19 | name (string): dataset name 20 | """ 21 | dataset = [d for d in config.datasets if d['data_dir'] == name] 22 | if len(dataset) == 0: 23 | print('Dataset {} does not exist.'.format(name)) 24 | sys.exit() 25 | return dataset[0] 26 | 27 | def get_error(name): 28 | """Get error dict in config.py given name 29 | 30 | Args: 31 | name (string): dataset name 32 | """ 33 | error_type = [e for e in config.error_types if e['name'] == name] 34 | if len(error_type) == 0: 35 | print('Error type {} does not exist.'.format(name)) 36 | sys.exit() 37 | return error_type[0] 38 | 39 | def get_model(name): 40 | """Get model dict in config.py given name 41 | 42 | Args: 43 | name (string): model name 44 | """ 45 | model = [m for m in config.models if m['name'] == name ] 46 | if len(model) == 0: 47 | print("Model {} does not exist.".format(name)) 48 | sys.exit() 49 | return model[0] 50 | 51 | def get_dir(dataset, folder=None, file=None, create_folder=False): 52 | """Get directory or path given dataset, folder name (optional) and filename (optional) 53 | 54 | Args: 55 | dataset(dict): dataset dict in config.py 56 | folder (string): raw/missing_values/outliers/duplicates/inconsistency/mislabel 57 | file (string): file name 58 | create_folder (bool): whether create folder if not exist 59 | """ 60 | data_dir = os.path.join(config.data_dir, dataset['data_dir']) 61 | if folder is None: 62 | return data_dir 63 | 64 | folder_dir = os.path.join(data_dir, folder) 65 | if create_folder and not os.path.exists(folder_dir): 66 | os.makedirs(folder_dir) 67 | 68 | if file is None: 69 | return folder_dir 70 | 71 | file_dir = os.path.join(folder_dir, file) 72 | return file_dir 73 | 74 | def load_df(dataset, file_path): 75 | """load data file into pandas dataframe and convert categorical variables to string 76 | 77 | Args: 78 | dataset (dict): dataset in config.py 79 | file_path (string): path of data file 80 | """ 81 | df = pd.read_csv(file_path) 82 | if 'categorical_variables' in dataset.keys(): 83 | categories = dataset['categorical_variables'] 84 | for cat in categories: 85 | df[cat] = df[cat].astype(str).replace('nan', np.nan) 86 | return df 87 | 88 | def load_dfs(dataset, file_path_pfx, return_version=False): 89 | """load train and test files into pandas dataframes 90 | 91 | Args: 92 | dataset (dict): dataset in config.py 93 | file_path_pfx (string): prefix of data file 94 | return_version (bool): whether to return the version (split seed) of data 95 | """ 96 | 
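# Usage sketch (illustrative; the dataset and file prefix are examples):
# dataset = get_dataset('Titanic')
# prefix = get_dir(dataset, 'missing_values', 'delete')
# train, test = load_dfs(dataset, prefix) # reads delete_train.csv / delete_test.csv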
train_dir = file_path_pfx + '_train.csv' 97 | test_dir = file_path_pfx + '_test.csv' 98 | train = load_df(dataset, train_dir) 99 | test = load_df(dataset, test_dir) 100 | if return_version: 101 | version = get_version(file_path_pfx) 102 | return train, test, version 103 | else: 104 | return train, test 105 | 106 | def save_dfs(train, test, save_path_pfx, version=None): 107 | """Save train and test pandas dataframes in csv file 108 | 109 | Args: 110 | train (pd.DataFrame): training set 111 | test (pd.DataFrame): test set 112 | save_path_pfx (string): prefix of save path 113 | version (int): version of data (optional) 114 | """ 115 | train_save_path = save_path_pfx + '_train.csv' 116 | test_save_path = save_path_pfx + '_test.csv' 117 | train.to_csv(train_save_path, index=False) 118 | test.to_csv(test_save_path, index=False) 119 | if version is not None: 120 | save_version(save_path_pfx, version) 121 | 122 | def save_version(file_path_pfx, seed): 123 | """Save version of data in json file 124 | 125 | Args: 126 | file_path_pfx (string): prefix of path of data file 127 | seed (int): split seed of data 128 | """ 129 | directory, file = os.path.split(file_path_pfx) 130 | version_path = os.path.join(directory, "version.json") 131 | if os.path.exists(version_path): 132 | version = json.load(open(version_path, 'r')) 133 | else: 134 | version = {} 135 | version[file] = str(seed) 136 | json.dump(version, open(version_path, 'w')) 137 | 138 | def get_version(file_path_pfx): 139 | """Get version of data 140 | 141 | Args: 142 | file_path_pfx (string): prefix of path of data file 143 | """ 144 | directory, file = os.path.split(file_path_pfx) 145 | version_path = os.path.join(directory, "version.json") 146 | if os.path.exists(version_path): 147 | version = json.load(open(version_path, 'r')) 148 | return int(version[file]) 149 | else: 150 | return None 151 | 152 | def remove(path): 153 | """Remove file or directory 154 | 155 | Args: 156 | path (string): path of file or directory 157 | """ 158 | if os.path.isfile(path): 159 | os.remove(path) 160 | elif os.path.isdir(path): 161 | shutil.rmtree(path) 162 | 163 | # ============================================================================= 164 | # Training related utils 165 | # ============================================================================= 166 | 167 | def get_train_files(error_type): 168 | """Get training files given error type 169 | 170 | Args: 171 | error_type (string): missing_values/outliers/mislabel/duplicates/inconsistency 172 | """ 173 | 174 | error_dict = [e for e in config.error_types if e["name"] == error_type][0] 175 | if error_type == 'missing_values': 176 | filenames = list(error_dict["clean_methods"].keys()) 177 | else: 178 | filenames = ["dirty"] + list(error_dict["clean_methods"].keys()) 179 | 180 | # if error_type == 'missing_values': 181 | # filenames = [ 182 | # "delete", 183 | # "impute_holoclean", 184 | # "impute_mean_mode", 185 | # "impute_mean_dummy", 186 | # "impute_median_mode", 187 | # "impute_median_dummy", 188 | # "impute_mode_mode", 189 | # "impute_mode_dummy"] 190 | # elif error_type == 'outliers': 191 | # filenames = ["dirty", 192 | # "clean_HC_impute_holoclean", 193 | # "clean_SD_delete", 194 | # "clean_IF_delete", 195 | # "clean_IQR_delete", 196 | # "clean_SD_impute_mean_dummy", 197 | # "clean_IQR_impute_mean_dummy", 198 | # "clean_IF_impute_mean_dummy", 199 | # "clean_SD_impute_median_dummy", 200 | # "clean_IQR_impute_median_dummy", 201 | # "clean_IF_impute_median_dummy", 202 | # "clean_SD_impute_mode_dummy", 
203 | # "clean_IQR_impute_mode_dummy", 204 | # "clean_IF_impute_mode_dummy"] 205 | # elif error_type == 'mislabel': 206 | # filenames = ["dirty", 207 | # "clean"] 208 | # elif error_type == 'duplicates': 209 | # filenames = ["dirty", "clean", "AutoER"] 210 | # elif error_type == "inconsistency": 211 | # filenames = ["dirty", "clean", "FD"] 212 | # else: 213 | # filenames = ["dirty", "clean"] 214 | return filenames 215 | 216 | def get_test_files(error_type, train_file): 217 | """Get test files given error type and training file 218 | Each error has two types of files: dirty and clean (delete and impute for missing values) 219 | Test files for one training file include the test file corresponding to itself and all of test 220 | files in another type (e.g. For outliers, test files for dirty_train are dirty_test and all of 221 | clean_***_test. Test files for outliers clean_SD_delete_train are clean_SD_delete_test and 222 | dirty_test.) 223 | 224 | Args: 225 | error_type (string): missing_values/outliers/mislabel/duplicates/inconsistency 226 | train_file (string): training file specified in get_train_files() 227 | """ 228 | if error_type == "missing_values": 229 | if train_file == "delete": 230 | return get_train_files(error_type) 231 | else: 232 | return ["delete", train_file] 233 | else: 234 | if train_file == "dirty": 235 | return get_train_files(error_type) 236 | else: 237 | return ["dirty", train_file] 238 | 239 | def check_completed(dataset, split_seed, experiment_seed): 240 | """Check whether all experiments for the dataset with split_seed have been completed 241 | 242 | Args: 243 | dataset (dict): dataset dict in config.py 244 | split_seed (int): split seed 245 | experiment_seed (int): experiment seed 246 | """ 247 | 248 | result = load_result(dataset['data_dir']) 249 | np.random.seed(experiment_seed) 250 | seeds = np.random.randint(10000, size=config.n_retrain) 251 | 252 | for error in dataset['error_types']: 253 | for model in config.models: 254 | for train_file in get_train_files(error): 255 | for s in seeds: 256 | key = "{}/v{}/{}/{}/{}/{}".format(dataset['data_dir'], split_seed, error, train_file, model['name'], s) 257 | if key not in result.keys(): 258 | return False 259 | return True 260 | 261 | # ============================================================================= 262 | # Result related utils 263 | # ============================================================================= 264 | 265 | def load_result(dataset_name=None, parse_key=False): 266 | """Load result of one dataset or all datasets (if no argument) from json to dict 267 | 268 | Args: 269 | dataset_name (string): dataset name. If not specified, load results of all datasets. 
270 | parse_key (bool): whether to convert the key from string to tuple 271 | """ 272 | if dataset_name is None: 273 | files = [file for file in os.listdir(config.result_dir) if file.endswith('_result.json')] 274 | result_path = [os.path.join(config.result_dir, file) for file in files] 275 | else: 276 | result_path = [os.path.join(config.result_dir, '{}_result.json'.format(dataset_name))] 277 | 278 | result = {} 279 | for path in result_path: 280 | if os.path.exists(path): 281 | result.update(json.load(open(path, 'r'))) 282 | 283 | if parse_key: 284 | new_result = {} 285 | for key, value in result.items(): 286 | new_key = tuple(key.split('/')) 287 | new_result[new_key] = value 288 | result = new_result 289 | 290 | return result 291 | 292 | def load_result2019(dataset_name=None, parse_key=False): 293 | """Load 2019 result of one dataset or all datasets (if no argument) from json in the result2019 folder to dict 294 | 295 | Args: 296 | dataset_name (string): dataset name. If not specified, load results of all datasets. 297 | parse_key (bool): whether to convert the key from string to tuple 298 | """ 299 | if dataset_name is None: 300 | files = [file for file in os.listdir("result2019") if file.endswith('_result.json')] 301 | result_path = [os.path.join("result2019", file) for file in files] 302 | else: 303 | result_path = [os.path.join("result2019", '{}_result.json'.format(dataset_name))] 304 | 305 | result = {} 306 | for path in result_path: 307 | if os.path.exists(path): 308 | result.update(json.load(open(path, 'r'))) 309 | 310 | if parse_key: 311 | new_result = {} 312 | for key, value in result.items(): 313 | new_key = tuple(key.split('/')) 314 | new_result[new_key] = value 315 | result = new_result 316 | 317 | return result 318 | 319 | def save_result(dataset_name, key, res): 320 | """Save result to json 321 | 322 | Args: 323 | dataset_name (string): dataset name. 324 | key (string): key of result in form: dataset_name/split_seed/error_type/clean_method/model_name/seed 325 | res (dict): result dict {metric_name: metric result} 326 | """ 327 | result = load_result(dataset_name) 328 | result[key] = res 329 | result_path = os.path.join(config.result_dir, '{}_result.json'.format(dataset_name)) 330 | if not os.path.exists(config.result_dir): 331 | os.makedirs(config.result_dir) 332 | json.dump(result, open(result_path, 'w'), indent=4) 333 | 334 | def dict_to_df(dic, row_keys_idx, col_keys_idx): 335 | """Convert dict to data frame 336 | 337 | Args: 338 | dic: result dictionary. Keys are tuples.
339 | row_keys_idx: index of keys for rows, ordered hierarchically 340 | col_keys_idx: index of keys for columns, ordered hierarchically 341 | """ 342 | col_keys = sorted(set([tuple([k[i] for i in col_keys_idx]) for k in dic.keys()]))[::-1] 343 | row_keys = sorted(set([tuple([k[i] for i in row_keys_idx]) for k in dic.keys()]))[::-1] 344 | sheet_idx = [i for i in np.arange(len(list(dic.keys())[0])) if i not in row_keys_idx and i not in col_keys_idx] 345 | sheet_keys = sorted(set([tuple([k[i] for i in sheet_idx]) for k in dic.keys()])) 346 | 347 | if len(sheet_keys) > 1: 348 | print(sheet_keys) 349 | print("sheet key must be unique in the same sheet.") 350 | sys.exit() 351 | else: 352 | sheet_key = sheet_keys[0] 353 | 354 | order = col_keys_idx + row_keys_idx + sheet_idx 355 | 356 | index = pd.MultiIndex.from_tuples(row_keys) 357 | columns = pd.MultiIndex.from_tuples(col_keys) 358 | data = [] 359 | 360 | for r in row_keys: 361 | row = [] 362 | for c in col_keys: 363 | disorder_key = c + r + sheet_key 364 | key = tuple([d for o, d in sorted(zip(order, disorder_key))]) 365 | 366 | if key in dic.keys(): 367 | row.append(dic[key]) 368 | else: 369 | row.append(np.nan) 370 | data.append(row) 371 | df = pd.DataFrame(data, index=index, columns=columns) 372 | return df 373 | 374 | def dict_to_dfs(dic, row_keys_idx, col_keys_idx, df_idx): 375 | """Convert dict to multiple dataframes saved in one dict 376 | 377 | Args: 378 | dic (dict): result dictionary. Keys are tuples. 379 | row_keys_idx (list): index of keys for rows, ordered hierarchically 380 | col_keys_idx (list): index of keys for columns, ordered hierarchically 381 | df_idx (int): index of keys for splitting dict to multiple dfs. 382 | """ 383 | dfs = {} 384 | df_keys = sorted(set([k[df_idx] for k in dic.keys()])) 385 | for k in df_keys: 386 | filtered_dic = {key:value for key, value in dic.items() if key[df_idx] == k} 387 | df = dict_to_df(filtered_dic, row_keys_idx, col_keys_idx) 388 | dfs[k] = df 389 | return dfs 390 | 391 | def df_to_xls(df, save_path): 392 | """Save a single pd.DataFrame to an Excel file""" 393 | directory = os.path.dirname(save_path) 394 | if not os.path.exists(directory): 395 | os.makedirs(directory) 396 | writer = pd.ExcelWriter(save_path) 397 | df.to_excel(writer) 398 | writer.save() 399 | 400 | def df_to_pickle(df, save_path): 401 | """Save a single pd.DataFrame to a pickle file""" 402 | directory = os.path.dirname(save_path) 403 | if not os.path.exists(directory): 404 | os.makedirs(directory) 405 | df.to_pickle(save_path) 406 | 407 | def dfs_to_xls(dfs, save_path): 408 | """Save multiple pd.DataFrames in a dict to an Excel file 409 | 410 | Args: 411 | dfs (dict): {sheet_name: pd.DataFrame} 412 | """ 413 | directory = os.path.dirname(save_path) 414 | if not os.path.exists(directory): 415 | os.makedirs(directory) 416 | writer = pd.ExcelWriter(save_path) 417 | for k, df in dfs.items(): 418 | df.to_excel(writer, '%s'%k) 419 | writer.save() 420 | 421 | def dict_to_xls(dic, row_keys_idx, col_keys_idx, save_path, sheet_idx=None): 422 | """Convert dict to Excel 423 | 424 | Args: 425 | dic: result dictionary. Keys are tuples.
426 | row_keys_idx: index of keys for rows, ordered hierarchically 427 | col_keys_idx: index of keys for columns, ordered hierarchically 428 | sheet_idx: index of keys for sheet 429 | """ 430 | if sheet_idx is None: 431 | df = dict_to_df(dic, row_keys_idx, col_keys_idx) 432 | df_to_xls(df, save_path) 433 | else: 434 | dfs = dict_to_dfs(dic, row_keys_idx, col_keys_idx, sheet_idx) 435 | dfs_to_xls(dfs, save_path) 436 | 437 | def flatten_dict(dictionary): 438 | """Convert a hierarchical dictionary into a flat dict by extending the dimension of keys. 439 | (e.g. {"a": {"b":"c"}} -> {("a", "b"): "c"}) 440 | """ 441 | values = list(dictionary.values()) 442 | if any([type(v) != dict for v in values]): 443 | return dictionary 444 | 445 | flat_dict = {} 446 | for k, v in dictionary.items(): 447 | if type(k) != tuple: 448 | k = (k,) 449 | for vk, vv in v.items(): 450 | if type(vk) != tuple: 451 | vk = (vk,) 452 | new_key = k + vk 453 | flat_dict[new_key] = vv 454 | return flatten_dict(flat_dict) 455 | 456 | def rearrange_dict(dictionary, order): 457 | """Rearrange the order of the key components of a dictionary""" 458 | new_dict = {} 459 | for key, value in dictionary.items(): 460 | if len(key) < len(order): 461 | print("Length of order must not exceed the length of key") 462 | sys.exit() 463 | 464 | new_order = np.arange(len(key)) 465 | for i, o in enumerate(order): 466 | new_order[i] = o 467 | 468 | new_key = tuple([key[i] for i in new_order]) 469 | new_dict[new_key] = value 470 | return new_dict 471 | 472 | def makedirs(dir_list): 473 | save_dir = os.path.join(*dir_list) 474 | if not os.path.exists(save_dir): 475 | os.makedirs(save_dir) 476 | return save_dir 477 | 478 | def result_to_table(result, save_dir, csv=True, xls=True): 479 | """Convert result to tables. One table for each dataset. 480 | 481 | Args: 482 | result (dict): key: (dataset_name, split_seed, error_type, train_file, model_name, seed) 483 | csv (bool): save csv table 484 | xls (bool): save xls table 485 | 486 | """ 487 | 488 | # save csv table 489 | if csv: 490 | csv_dir = makedirs([save_dir, 'csv']) 491 | flat_result = flatten_dict({k + ('result',):v for k, v in result.items()}) 492 | result_df = dict_to_df(flat_result, [0, 1, 2, 3, 4, 5, 7], [6]) 493 | save_path = os.path.join(csv_dir, "training_result.csv") 494 | result_df.to_csv(save_path, index_label=['dataset', 'split_seed', 'error_type', 'train_file', 'model_name', 'seed', 'metric']) 495 | 496 | if xls: 497 | xls_dir = makedirs([save_dir, 'xls']) 498 | datasets = list({k[0] for k in result.keys()}) 499 | 500 | for dataset in datasets: 501 | dataset_result = flatten_dict({k:v for k, v in result.items() if k[0] == dataset}) 502 | save_path = os.path.join(xls_dir, '{}_result.xls'.format(dataset)) 503 | dict_to_xls(dataset_result, [0, 1, 3, 4, 5], [6], save_path, sheet_idx=2) 504 | 505 | def group(result, idx, keepdim=False): 506 | """Group results on one dimension (key component) into a list 507 | 508 | Args: 509 | result (dict): result dict 510 | key (tuple): e.g.
(dataset_name, split_seed, error_type, train_file, model_name, seed) 511 | value (dict): {metric_name: metric} 512 | idx: the index of dimension (key component) by which the result is grouped 513 | keepdim (bool): keep or delete dimension by which the result is grouped 514 | """ 515 | 516 | # get domain in given dimension (key component) 517 | domain = list({k[idx] for k in result.keys()}) 518 | 519 | # loop through each value in domain, append corresponding results into a list 520 | new_result = {} 521 | for x in domain: 522 | for old_key, v in result.items(): 523 | 524 | if x != old_key[idx]: 525 | continue 526 | 527 | # new key (eliminate the given dimension) 528 | new_key = tuple([old_key[i] for i in range(len(old_key)) if i != idx]) 529 | 530 | # new value 531 | if new_key not in new_result.keys(): 532 | new_result[new_key] = defaultdict(list) 533 | 534 | # append results into list 535 | for vk, vv in v.items(): 536 | # don't include best params saved in result 537 | if vk != "best_params": 538 | new_result[new_key][vk].append(vv) 539 | 540 | if keepdim: 541 | new_result[new_key]["group_key"].append(old_key[idx]) 542 | 543 | if keepdim: 544 | final_result = {} 545 | for k, v in new_result.items(): 546 | group_key = "/".join(v["group_key"]) 547 | new_k = k[0:idx] + (group_key,) + k[idx:] 548 | del v["group_key"] 549 | final_result[new_k] = v 550 | new_result = final_result 551 | return new_result 552 | 553 | def reduce_by_mean(result): 554 | """Reduce a list of results into a single result by mean 555 | 556 | Args: 557 | result (dict): result dict 558 | key (tuple): (dataset_name, split_seed, error_type, train_file, model_name) 559 | value (dict): {metric_name: [metric lists]} 560 | """ 561 | new_result = {} 562 | for k, v in result.items(): 563 | new_value = {} 564 | for vk, vv in v.items(): 565 | new_value[vk] = np.mean(vv) 566 | new_result[k] = new_value 567 | return new_result 568 | 569 | def reduce_by_max_val(result, dim=None, dim_name=None): 570 | """Reduce a list of results into a single result by the result corresponding to the best val_acc 571 | 572 | Args: 573 | result (dict): result dict 574 | key (tuple): (dataset_name, split_seed, error_type, train_file, model_name) 575 | value (dict): {metric_name: [metric lists]} 576 | """ 577 | new_result = {} 578 | for k, v in result.items(): 579 | new_value = {} 580 | 581 | if np.isnan(v['val_acc']).all(): 582 | best_val_idx = 0 583 | else: 584 | best_val_idx = np.nanargmax(v['val_acc']) 585 | 586 | if dim is not None: 587 | best = k[dim].split('/')[best_val_idx] 588 | new_key = k[0:dim] + (dim_name,) + k[dim+1:] 589 | else: 590 | new_key = k 591 | 592 | for vk, vv in v.items(): 593 | new_value[vk] = vv[best_val_idx] 594 | 595 | if dim is not None: 596 | new_value[dim_name] = best 597 | 598 | new_result[new_key] = new_value 599 | 600 | return new_result 601 | 602 | def get_dirty_clean_train_files(error_type): 603 | files = get_train_files(error_type) 604 | dirty_file = files[0] 605 | clean_file = files[1:] 606 | return dirty_file, clean_file 607 | 608 | def group_reduce_by_best_clean(result): 609 | """Group by clean method and then reduce a list of results into a single result by the result corresponding to the best val_acc 610 | 611 | Args: 612 | result (dict): result dict 613 | key (tuple): (dataset_name, split_seed, error_type, train_file, model_name) 614 | value (dict): {metric_name: [metric lists]} 615 | """ 616 | dirty = {} 617 | clean = {} 618 | 619 | for k, v in result.items(): 620 | error_type = k[2] 621 | dirty_file,
clean_file = get_dirty_clean_train_files(error_type) 622 | 623 | train_file = k[3] 624 | if train_file in dirty_file: 625 | dirty[k] = v 626 | else: 627 | new_v = {} 628 | 629 | for vk, vv in v.items(): 630 | new_vk = vk 631 | 632 | for c in clean_file: 633 | if c in vk: 634 | if error_type == "missing_values": 635 | new_vk = new_vk.replace(c, "impute") 636 | else: 637 | new_vk = new_vk.replace(c, "clean") 638 | break 639 | 640 | new_v[new_vk] = vv 641 | clean[k] = new_v 642 | 643 | clean = group(clean, 3, keepdim=True) 644 | 645 | clean = reduce_by_max_val(clean, dim=3, dim_name="clean") 646 | 647 | new_clean = {} 648 | for k, v in clean.items(): 649 | if k[2] == 'missing_values': 650 | new_k = k[0:3] + ('impute',) + k[4:] 651 | else: 652 | new_k = k 653 | new_clean[new_k] = v 654 | 655 | new_dirty = {} 656 | for k, v in dirty.items(): 657 | new_v = {} 658 | clean_key = k[0:3] + ("clean",) + k[4:] 659 | clean_method = clean[clean_key]["clean"] 660 | 661 | new_v = {} 662 | for vk, vv in v.items(): 663 | vk_list = vk.split('_') 664 | if vk_list[0] not in ['clean', 'impute']: 665 | new_v[vk] = vv 666 | 667 | if k[2] == 'missing_values': 668 | new_v["impute_test_acc"] = v["{}_test_acc".format(clean_method)] 669 | new_v["impute_test_f1"] = v["{}_test_f1".format(clean_method)] 670 | else: 671 | new_v["clean_test_acc"] = v["{}_test_acc".format(clean_method)] 672 | new_v["clean_test_f1"] = v["{}_test_f1".format(clean_method)] 673 | 674 | new_dirty[k] = new_v 675 | 676 | new_result = {**new_dirty, **new_clean} 677 | return new_result --------------------------------------------------------------------------------
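A minimal end-to-end usage sketch (not part of the repository sources above; module and function names are taken from relation.py and utils.py, and it assumes training results already exist under config.result_dir):

# analysis_driver_sketch.py -- hypothetical helper script, not in the repo
from relation import populate

if __name__ == "__main__":
    # Populate relations R1 (per model), R2 (best model) and R3 (best model
    # plus best clean method) at two significance levels. populate() loads
    # results via utils.load_result and writes the relations as CSV and PKL
    # files under config.analysis_dir.
    populate(alphas=[0.05, 0.01], save_training=False)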