├── .idea
│   ├── .gitignore
│   ├── vcs.xml
│   ├── misc.xml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── modules.xml
│   └── framework.iml
├── .gitattributes
├── src
│   ├── config.py
│   ├── dispatcher.py
│   ├── create_folds.py
│   ├── predict.py
│   ├── feature_gen.py
│   ├── feature_eval.py
│   ├── train.py
│   ├── feature_impute.py
│   ├── metrics.py
│   ├── numerical.py
│   ├── categorical.py
│   ├── cross_validation.py
│   └── engine.py
├── LICENSE
├── .gitignore
└── README.md

/.idea/.gitignore:
--------------------------------------------------------------------------------
# Default ignored files
/shelf/
/workspace.xml
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
# Auto detect text files and perform LF normalization
* text=auto
--------------------------------------------------------------------------------
/src/config.py:
--------------------------------------------------------------------------------
# The paths and file names below are placeholders -- point them at your own
# data before running anything (see step 6 of the README).
DATA_PATH = r"./input/"
TUNE_PATH = r"./tune/"
MODEL_PATH = r"./models/"
RAW_DATA = DATA_PATH + "train.csv"  # raw training csv, used by create_folds.py
TRAINING_DATA = DATA_PATH + "train_folds.csv"  # folded csv, used by train.py
TEST_DATA = DATA_PATH + "test.csv"  # test csv, used by train.py and predict.py
FOLDS = 5
--------------------------------------------------------------------------------
/src/dispatcher.py:
--------------------------------------------------------------------------------
from sklearn import ensemble
from sklearn import linear_model
from sklearn import svm

# TODO Implement models with good default arguments

REGRESSION_MODELS = dict(
    lr=linear_model.LinearRegression(),
    svm=svm.SVR(C=0.001, epsilon=0.001, gamma="scale"),
    randomforest=ensemble.RandomForestRegressor(
        n_estimators=100, n_jobs=-1, verbose=1  # n_jobs=-1 uses all available cores
    ),
    extratrees=ensemble.ExtraTreesRegressor(n_estimators=100, n_jobs=-1, verbose=1),
)


CLASSIFICATION_MODELS = dict(
    lr=linear_model.LogisticRegression(),
    svm=svm.SVC(),
    randomforest=ensemble.RandomForestClassifier(
        n_estimators=200, n_jobs=-1, verbose=1
    ),
    extratrees=ensemble.ExtraTreesClassifier(n_estimators=100, n_jobs=-1, verbose=1),
)
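
# Example lookup (a sketch -- X_train and y_train are stand-ins for your data):
# engine.py and train.py pick a model out of these dicts by its key, e.g.
#
#     clf = REGRESSION_MODELS["randomforest"]
#     clf.fit(X_train, y_train)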
"Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/create_folds.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from src import config 3 | from src.cross_validation import CrossValidation 4 | 5 | RAW_DATA = config.RAW_DATA 6 | FOLDS_DATA = config.DATA_PATH + r"\diamonds_folds.csv" 7 | # REG_DATA = config.REG_DATA 8 | # FOLDS_DATA_REG = config.DATA_PATH+r'\train_folds_reg.csv' 9 | 10 | 11 | if __name__ == "__main__": 12 | # df = pd.read_csv(REG_DATA) 13 | # df['kfold'] = -1 14 | df = pd.read_csv(r"C:\Users\abhis\Documents\01_proj\input_data\diamonds.csv") 15 | df = df.sample(frac=1).reset_index(drop=True) 16 | 17 | # kf = model_selection.StratifiedKFold(n_splits=5, shuffle=False, random_state=42) 18 | 19 | # for fold, (train_idx, val_idx) in enumerate(kf.split(X=df, y=df.target.values)): 20 | # print(len(train_idx), len(val_idx)) 21 | # df.loc[val_idx, 'kfold'] = fold 22 | 23 | # cross_val = CrossValidation(df = df, target_cols=['price'], problem_type='single_col_regression', 24 | # stratified_regression = True) 25 | cross_val = CrossValidation( 26 | df=df, 27 | target_cols=["target"], 28 | problem_type="single_col_regression", 29 | stratified_regression=True, 30 | ) 31 | df_folds = cross_val.split() 32 | # df.to_csv(FOLDS_DATA_REG, index=False) 33 | df_folds.to_csv(FOLDS_DATA, index=False) 34 | -------------------------------------------------------------------------------- /src/predict.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | import pandas as pd 4 | 5 | import joblib 6 | 7 | from src import config 8 | 9 | TRAINING_DATA = config.TRAINING_DATA 10 | TEST_DATA = config.TEST_DATA 11 | FOLDS = config.FOLDS 12 | 13 | 14 | def predict(MODEL, FOLDS): 15 | MODEL = MODEL 16 | df = pd.read_csv(TEST_DATA) 17 | text_idx = df["id"].values 18 | predictions = None 19 | 20 | for FOLD in range(FOLDS): 21 | print(FOLD) 22 | df = pd.read_csv(TEST_DATA) 23 | encoders = joblib.load(f"models/{MODEL}_{FOLD}_label_encoder.pkl") 24 | cols = joblib.load(f"models/{MODEL}_{FOLD}_columns.pkl") 25 | for c in encoders: 26 | print(c) 27 | lbl = encoders[c] 28 | df.loc[:, c] = lbl.transform(df[c].values.tolist()) 29 | 30 | clf = joblib.load(f"models/{MODEL}_{FOLD}_.pkl") 31 | df = df[cols] 32 | preds = clf.predict_proba(df)[:, 1] 33 | 34 | if FOLD == 0: 35 | predictions = preds 36 | else: 37 | predictions += preds 38 | 39 | predictions /= 5 40 | 41 | sub = pd.DataFrame( 42 | 
    sub = pd.DataFrame(
        np.column_stack((text_idx, predictions)), columns=["id", "target"]
    )
    return sub


if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument("model", help="Type in the model you want to run", type=str)
    args = parser.parse_args()

    MODEL = args.model

    submission = predict(MODEL, FOLDS)
    submission.id = submission.id.astype(int)
    submission.to_csv(f"models/{MODEL}.csv", index=False)
--------------------------------------------------------------------------------
/src/feature_gen.py:
--------------------------------------------------------------------------------
import pandas as pd
from sklearn import preprocessing


class FeatureGen:
    def __init__(
        self,
        df,
        target_cols: list = None,
        degree: int = 2,
        interaction_only: bool = False,
        include_bias: bool = True,
        feature_gen: str = "poly",
    ):
        """
        :param df: Dataframe that needs to be used for generation of features
        :param target_cols: List of columns that the method needs to be applied on
        :param feature_gen: Method to be used to generate features; poly = Polynomial feature generator from sklearn
        """
        self.df = df
        self.feature_gen = feature_gen
        self.target_cols = target_cols
        self.degree = degree
        self.interaction_only = interaction_only
        self.include_bias = include_bias

        if self.target_cols is None:
            self.target_cols = self.df.columns

    def fit_transform(self):
        if self.feature_gen == "poly":
            # interaction_only and include_bias are keyword-only in recent
            # versions of scikit-learn, so pass them by name.
            polynomial = preprocessing.PolynomialFeatures(
                degree=self.degree,
                interaction_only=self.interaction_only,
                include_bias=self.include_bias,
            )
            new_features = polynomial.fit_transform(self.df[self.target_cols].values)
            # Prefix the generated columns so they cannot collide with existing ones.
            new_features = pd.DataFrame(
                new_features,
                columns=[f"poly_{i}" for i in range(new_features.shape[1])],
            )
            output_df = pd.concat([self.df, new_features], axis=1)
            return output_df
        else:
            raise Exception("Feature generation method not understood")


if __name__ == "__main__":
    df = pd.read_csv(
        r"C:\Users\abhis\Documents\01_proj\kaggle_comp\sample_ihsm\input\train_sample.csv"
    )
    poly = FeatureGen(df, degree=2)
    new_df = poly.fit_transform()
    print(new_df.head())
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# Other files
*.csv
*.pkl
.vscode
.idea
--------------------------------------------------------------------------------
/src/feature_eval.py:
--------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns


class FeatEvaluation:
    def __init__(self, df, target_col: str = None):
        """
        :param df: Dataframe which will be analysed
        :param target_col: String of the column name that is the target for this analysis in the dataframe
        """
        self.df = df
        self.target = target_col

    def stat_desc(self, col):
        if self.df[col].dtype == "O":
            return "Categorical Data"
        else:
            return self.df[col].describe().loc[["min", "max"]]

    def feature_report(self):
        print("Feature Report Generated for all the columns in the Dataframe")
        for col in self.df.columns:
            print("\n")
            print(f"Feature Report for Column: {col}")
            print("~~~~~~==================~~~~~~")
            print(str(self.stat_desc(col)))
            print(f"No of Unique Values: {self.df[col].nunique()}")
            print(f"Value counts for the column:\n{self.df[col].value_counts()}")
        return

    def feature_plot(self):
        for col in self.df.columns:
            print("Plotting the Distribution for: {0}".format(col))
            if self.df[col].dtype == "O":
                plt.figure(figsize=(16, 9))
                sns.boxplot(x=col, y=self.target, data=self.df)
                plt.show()
            else:
                plt.figure(figsize=(16, 9))
                # distplot was removed from recent seaborn releases; histplot
                # with a kde overlay is the equivalent plot.
                sns.histplot(self.df[col].values, kde=True)
                plt.show()
        return

    def correlation_plot(self):
        # Restrict to numeric columns, since df.corr() no longer silently
        # drops non-numeric ones in recent pandas versions.
        corr = self.df.corr(numeric_only=True)
        plt.figure(figsize=(16, 9))
        sns.heatmap(
            corr,
            annot=True,
            vmin=-1,
            vmax=1,
            center=0,
            cmap="coolwarm",
            linewidths=1.5,
55 | linecolor="black", 56 | ) 57 | plt.show() 58 | return 59 | 60 | 61 | # if __name__ == "__main__": 62 | # import config 63 | # import pandas as pd 64 | # RAW_TRAIN_DATA = config.RAW_DATA 65 | # TEST_DATA = config.TEST_DATA 66 | # 67 | # train_df = pd.read_csv(RAW_TRAIN_DATA) 68 | # test_df = pd.read_csv(TEST_DATA) 69 | # test_df['target'] = -99999 70 | # eval = FeatEvaluation(train_df, 'price') 71 | # print(eval.feature_report()) 72 | -------------------------------------------------------------------------------- /src/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pandas as pd 3 | 4 | from sklearn import preprocessing 5 | from sklearn import metrics 6 | 7 | import joblib 8 | 9 | from src import dispatcher, config 10 | 11 | TRAINING_DATA = config.TRAINING_DATA 12 | TEST_DATA = config.TEST_DATA 13 | FOLDS = config.FOLDS 14 | 15 | FOLD_MAPPING = { 16 | 0: [1, 2, 3, 4], 17 | 1: [0, 2, 3, 4], 18 | 2: [0, 1, 3, 4], 19 | 3: [0, 1, 2, 4], 20 | 4: [0, 1, 2, 3], 21 | } 22 | 23 | if __name__ == "__main__": 24 | # TODO Update the train.py to use the ML Framework for all the work 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument("model", help="Type in the model you want to run", type=str) 27 | args = parser.parse_args() 28 | 29 | MODEL = args.model 30 | 31 | df = pd.read_csv(TRAINING_DATA) 32 | df_test = pd.read_csv(TEST_DATA) 33 | idx = [] 34 | predictions = [] 35 | 36 | for FOLD in range(FOLDS): 37 | train_df = df[df.kfold.isin(FOLD_MAPPING.get(FOLD))] 38 | valid_df = df[df.kfold == FOLD] 39 | 40 | ytrain = train_df.target.values 41 | yvalid = valid_df.target.values 42 | 43 | idx.extend(valid_df["id"].values.tolist()) 44 | 45 | train_df = train_df.drop(["id", "kfold", "target"], axis=1) 46 | valid_df = valid_df.drop(["id", "kfold", "target"], axis=1) 47 | valid_df = valid_df[train_df.columns] 48 | 49 | label_encoder = {} 50 | for c in train_df.columns: 51 | lbl = preprocessing.LabelEncoder() 52 | lbl.fit( 53 | train_df[c].values.tolist() 54 | + valid_df[c].values.tolist() 55 | + df_test[c].values.tolist() 56 | ) 57 | train_df.loc[:, c] = lbl.transform(train_df[c].values.tolist()) 58 | valid_df.loc[:, c] = lbl.transform(valid_df[c].values.tolist()) 59 | label_encoder[c] = lbl 60 | 61 | clf = dispatcher.MODELS[MODEL] 62 | clf.fit(train_df, ytrain) 63 | preds = clf.predict_proba(valid_df)[:, 1] 64 | predictions.extend(preds.tolist()) 65 | print(metrics.roc_auc_score(yvalid, preds)) 66 | 67 | joblib.dump(label_encoder, f"models/{MODEL}_{FOLD}_label_encoder.pkl") 68 | joblib.dump(clf, f"models/{MODEL}_{FOLD}_.pkl") 69 | joblib.dump(train_df.columns, f"models/{MODEL}_{FOLD}_columns.pkl") 70 | oof_dict = {"id": idx, "Predictions": predictions} 71 | oof_pred = pd.DataFrame(oof_dict) 72 | oof_pred.to_csv(f"models/{MODEL}_oof_predictions.csv") 73 | -------------------------------------------------------------------------------- /src/feature_impute.py: -------------------------------------------------------------------------------- 1 | from sklearn import impute 2 | from sklearn.experimental import enable_iterative_imputer 3 | from sklearn import linear_model 4 | from sklearn import ensemble 5 | from sklearn import neighbors 6 | 7 | 8 | class FeatureImpute: 9 | def __init__( 10 | self, 11 | dataframe, 12 | target_col: list, 13 | impute_method: str = "simple", 14 | impute_model: str = "lr", 15 | impute_stratergy: str = "mean", 16 | ): 17 | """ 18 | 19 | :param dataframe: Dataframe that is to be imputed 20 | :param 
        :param target_col: List of columns on which imputation is to be performed
        :param impute_method: String to define the imputation method: 'simple', 'model' or 'knn'
        :param impute_model: String to define which model is to be used for imputation. Values: 'lr', 'et', 'knn'
        :param impute_strategy: String to define the strategy used by the simple imputer (e.g. 'mean', 'median')
        """

        self.df = dataframe
        self.target = target_col
        self.impute_method = impute_method
        self.model = impute_model
        self.strategy = impute_strategy

        if self.model == "et":
            self.estimator = ensemble.ExtraTreesRegressor(
                n_estimators=50, random_state=42
            )
        elif self.model == "knn":
            self.estimator = neighbors.KNeighborsRegressor(n_neighbors=15)
        else:
            self.estimator = linear_model.LinearRegression()

        self.output_df = self.df.copy(deep=True)

    def _simple_impute(self):
        # SimpleImputer works column-wise and expects a 2D input, so all the
        # target columns can be imputed in one call on df[self.target].
        s_impute = impute.SimpleImputer(strategy=self.strategy)
        self.output_df.loc[:, self.target] = s_impute.fit_transform(self.df[self.target])
        return self.output_df

    def _model_impute(self):
        # IterativeImputer predicts each column from the others, so the target
        # columns are imputed jointly rather than one at a time.
        m_impute = impute.IterativeImputer(estimator=self.estimator, random_state=42)
        self.output_df.loc[:, self.target] = m_impute.fit_transform(self.df[self.target])
        return self.output_df

    def _knn_impute(self):
        # KNNImputer likewise uses the other target columns to find neighbours.
        k_impute = impute.KNNImputer()
        self.output_df.loc[:, self.target] = k_impute.fit_transform(self.df[self.target])
        return self.output_df

    def fit_transform(self):
        if self.impute_method == "simple":
            return self._simple_impute()
        elif self.impute_method == "model":
            return self._model_impute()
        elif self.impute_method == "knn":
            return self._knn_impute()
        else:
            raise Exception("Imputation Type not defined")
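
# if __name__ == "__main__":
#     # Minimal usage sketch (the frame below is made-up data with missing values).
#     import numpy as np
#     import pandas as pd
#
#     df = pd.DataFrame({"x": [1.0, np.nan, 3.0, 4.0], "y": [4.0, 5.0, np.nan, 7.0]})
#     imputer = FeatureImpute(df, target_col=["x", "y"], impute_method="knn")
#     print(imputer.fit_transform())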
--------------------------------------------------------------------------------
/src/metrics.py:
--------------------------------------------------------------------------------
from sklearn import metrics as skmetrics
import numpy as np


class RegressionMetric:
    def __init__(self):
        self.metrics = {
            "mae": self._mae,
            "mse": self._mse,
            "rmse": self._rmse,
            "msle": self._msle,
            "rmsle": self._rmsle,
            "r2": self._r2,
            "mape": self._mape,
        }

    def __call__(self, metric, y_true, y_pred):
        if metric not in self.metrics:
            raise Exception("Metric not implemented")
        else:
            return self.metrics[metric](y_true=y_true, y_pred=y_pred)

    @staticmethod
    def _mae(y_true, y_pred):
        return skmetrics.mean_absolute_error(y_true=y_true, y_pred=y_pred)

    @staticmethod
    def _mse(y_true, y_pred):
        return skmetrics.mean_squared_error(y_true=y_true, y_pred=y_pred)

    def _rmse(self, y_true, y_pred):
        return np.sqrt(self._mse(y_true=y_true, y_pred=y_pred))

    @staticmethod
    def _msle(y_true, y_pred):
        return skmetrics.mean_squared_log_error(y_true=y_true, y_pred=y_pred)

    def _rmsle(self, y_true, y_pred):
        return np.sqrt(self._msle(y_true=y_true, y_pred=y_pred))

    @staticmethod
    def _r2(y_true, y_pred):
        return skmetrics.r2_score(y_true=y_true, y_pred=y_pred)

    @staticmethod
    def _mape(y_true, y_pred):
        y_true, y_pred = np.array(y_true), np.array(y_pred)
        # Guard against division by zero for targets that are exactly 0.
        epsilon = np.finfo(np.float64).eps
        mape = np.abs(y_pred - y_true) / np.maximum(np.abs(y_true), epsilon)
        output_errors = np.average(mape)
        return output_errors


class ClassificationMetric:
    def __init__(self):
        self.metrics = {
            "accuracy": self._accuracy,
            "f1": self._f1,
            "recall": self._recall,
            "precision": self._precision,
            "auc": self._auc,
            "logloss": self._logloss,
        }

    def __call__(self, metric, y_true, y_pred, y_proba=None):
        if metric not in self.metrics:
            raise Exception("Metric not implemented")
        if metric == "auc":
            if y_proba is not None:
                return self._auc(y_true=y_true, y_pred=y_proba)
            else:
                raise Exception("y_proba cannot be None for AUC")
        elif metric == "logloss":
            if y_proba is not None:
                # Log loss scores the predicted probabilities.
                return self._logloss(y_true=y_true, y_pred=y_proba)
            else:
                raise Exception("y_proba cannot be None for LogLoss")
        else:
            return self.metrics[metric](y_true=y_true, y_pred=y_pred)

    @staticmethod
    def _auc(y_true, y_pred):
        return skmetrics.roc_auc_score(y_true=y_true, y_score=y_pred)

    @staticmethod
    def _accuracy(y_true, y_pred):
        return skmetrics.accuracy_score(y_true=y_true, y_pred=y_pred)

    @staticmethod
    def _f1(y_true, y_pred):
        return skmetrics.f1_score(y_true=y_true, y_pred=y_pred)

    @staticmethod
    def _recall(y_true, y_pred):
        return skmetrics.recall_score(y_true=y_true, y_pred=y_pred)

    @staticmethod
    def _precision(y_true, y_pred):
        return skmetrics.precision_score(y_true=y_true, y_pred=y_pred)

    @staticmethod
    def _logloss(y_true, y_pred):
        return skmetrics.log_loss(y_true=y_true, y_pred=y_pred)
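
# if __name__ == "__main__":
#     # Minimal usage sketch with made-up values.
#     reg_metric = RegressionMetric()
#     print(reg_metric("rmse", y_true=[3.0, 5.0], y_pred=[2.5, 5.5]))
#
#     clf_metric = ClassificationMetric()
#     print(clf_metric("auc", y_true=[0, 1, 1], y_pred=None, y_proba=[0.2, 0.7, 0.9]))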
--------------------------------------------------------------------------------
/src/numerical.py:
--------------------------------------------------------------------------------
from sklearn import preprocessing
import numpy as np


class NumericalFeatures:
    def __init__(self, df, numerical_features, encoding_type, handle_na=False):
        """
        df: pandas dataframe
        numerical_features: List of columns that we want to transform
        encoding_type: standard, min-max, power, log
        """
        self.df = df
        self.num_feats = numerical_features
        self.enc_type = encoding_type
        self.handle_na = handle_na
        self.stan_scaler = dict()
        self.min_max_encoder = dict()
        self.power_transform_encoder = dict()
        self.log_transform = dict()

        if self.handle_na:
            for c in self.num_feats:
                # Fill with a numeric sentinel: these columns feed scalers, so
                # they have to stay numeric, and fillna must run while the
                # missing values are still NaN.
                self.df.loc[:, c] = self.df.loc[:, c].fillna(-9999999999)

        self.output_df = self.df.copy(deep=True)

    def _standard_scaler(self):
        for c in self.num_feats:
            ss = preprocessing.StandardScaler()
            ss.fit(self.df[c].values.reshape(-1, 1))
            self.output_df.loc[:, c] = ss.transform(self.df[c].values.reshape(-1, 1))
            self.stan_scaler[c] = ss
        return self.output_df, self.stan_scaler

    def _min_max_scaler(self):
        for c in self.num_feats:
            mms = preprocessing.MinMaxScaler()
            mms.fit(self.df[c].values.reshape(-1, 1))
            self.output_df.loc[:, c] = mms.transform(self.df[c].values.reshape(-1, 1))
            self.min_max_encoder[c] = mms
        return self.output_df, self.min_max_encoder

    def _power_transform(self):
        for c in self.num_feats:
            powt = preprocessing.PowerTransformer()
            powt.fit(self.df[c].values.reshape(-1, 1))
            self.output_df.loc[:, c] = powt.transform(self.df[c].values.reshape(-1, 1))
            self.power_transform_encoder[c] = powt
        return self.output_df, self.power_transform_encoder

    def _log_transform(self):
        for c in self.num_feats:
            logt = preprocessing.FunctionTransformer(
                np.log1p, inverse_func=np.expm1, validate=True
            )
            logt.fit(self.df[c].values.reshape(-1, 1))
            self.output_df.loc[:, c] = logt.transform(self.df[c].values.reshape(-1, 1))
            self.log_transform[c] = logt
        return self.output_df, self.log_transform

    def fit_transform(self):
        if self.enc_type == "min-max":
            return self._min_max_scaler()
        elif self.enc_type == "standard":
            return self._standard_scaler()
        elif self.enc_type == "power":
            return self._power_transform()
        elif self.enc_type == "log":
            return self._log_transform()
        else:
            raise Exception("Transformation Type not understood")
    def transform(self, dataframe):
        if self.handle_na:
            for c in self.num_feats:
                # Use the same numeric sentinel as at fit time.
                dataframe.loc[:, c] = dataframe.loc[:, c].fillna(-9999999999)

        if self.enc_type == "min-max":
            for c, mms in self.min_max_encoder.items():
                dataframe.loc[:, c] = mms.transform(dataframe[c].values.reshape(-1, 1))
            return dataframe
        elif self.enc_type == "standard":
            for c, ss in self.stan_scaler.items():
                dataframe.loc[:, c] = ss.transform(dataframe[c].values.reshape(-1, 1))
            return dataframe
        elif self.enc_type == "power":
            for c, powt in self.power_transform_encoder.items():
                dataframe.loc[:, c] = powt.transform(dataframe[c].values.reshape(-1, 1))
            return dataframe
        elif self.enc_type == "log":
            for c, logt in self.log_transform.items():
                dataframe.loc[:, c] = logt.transform(dataframe[c].values.reshape(-1, 1))
            return dataframe
        else:
            raise Exception("Transformation not understood")


# if __name__ == "__main__":
#     import pandas as pd
#
#     df = pd.read_csv(r'C:\Users\abhis\OneDrive - IHS Markit\Python\00_practice\00_practice\diamonds.csv',
#                      encoding='latin-1')
#     num_cols = ['price']
#     num_feat_transform = NumericalFeatures(df, num_cols, encoding_type='log', handle_na=False)
#     # fit_transform returns both the transformed frame and the fitted transformers
#     transformed_df, transformers = num_feat_transform.fit_transform()
#     print(transformed_df.head())
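#
#     # The fitted transformers can then be reused on unseen data, e.g. (sketch):
#     # test_df = num_feat_transform.transform(test_df)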
--------------------------------------------------------------------------------
/src/categorical.py:
--------------------------------------------------------------------------------
from sklearn import preprocessing


class CategoricalFeatures:
    def __init__(self, df, categorical_features, encoding_type, handle_na=False):
        """
        df: pandas dataframe
        categorical_features: List of columns that we want to encode
        encoding_type: label, binary, one-hot-encoding (ohe)
        """
        self.df = df
        self.cat_feats = categorical_features
        self.enc_type = encoding_type
        self.label_encoders = dict()
        self.binary_encoders = dict()
        self.handle_na = handle_na

        if self.handle_na:
            for c in self.cat_feats:
                # fillna has to run before astype(str); once a NaN has been
                # cast to the string "nan" it can no longer be filled.
                self.df.loc[:, c] = self.df.loc[:, c].fillna("-9999999999").astype(str)

        self.output_df = self.df.copy(deep=True)

    def _label_encoding(self):
        for c in self.cat_feats:
            lbl = preprocessing.LabelEncoder()
            lbl.fit(self.df[c].values)
            self.output_df.loc[:, c] = lbl.transform(self.df[c].values)
            self.label_encoders[c] = lbl
        return self.output_df

    def _label_binarizer(self):
        for c in self.cat_feats:
            lbl = preprocessing.LabelBinarizer()
            lbl.fit(self.df[c].values)
            val = lbl.transform(self.df[c].values)
            self.output_df = self.output_df.drop(c, axis=1)
            for j in range(val.shape[1]):
                new_col_name = c + f"__bin_{j}"
                self.output_df[new_col_name] = val[:, j]
            self.binary_encoders[c] = lbl
        return self.output_df

    def _one_hot(self):
        ohe = preprocessing.OneHotEncoder()
        ohe.fit(self.df[self.cat_feats].values)
        self.ohe = ohe
        return ohe.transform(self.df[self.cat_feats].values)

    def fit_transform(self):
        if self.enc_type == "label":
            return self._label_encoding()
        elif self.enc_type == "binary":
            return self._label_binarizer()
        elif self.enc_type == "ohe":
            return self._one_hot()
        else:
            raise Exception("Encoding Type not understood")

    def transform(self, dataframe):
        if self.handle_na:
            for c in self.cat_feats:
                dataframe.loc[:, c] = (
                    dataframe.loc[:, c].fillna("-9999999999").astype(str)
                )

        if self.enc_type == "label":
            for c, lbl in self.label_encoders.items():
                dataframe.loc[:, c] = lbl.transform(dataframe[c].values)
            return dataframe
        elif self.enc_type == "binary":
            for c, lbl in self.binary_encoders.items():
                val = lbl.transform(dataframe[c].values)
                dataframe = dataframe.drop(c, axis=1)
                for j in range(val.shape[1]):
                    new_col_name = c + f"__bin_{j}"
                    dataframe[new_col_name] = val[:, j]
            return dataframe
        elif self.enc_type == "ohe":
            return self.ohe.transform(dataframe[self.cat_feats].values)
        else:
            raise Exception("Encoding Type not understood")


# if __name__ == "__main__":
#     import pandas as pd
#     import config
#     from sklearn import linear_model
#
#     DATA_PATH = config.DATA_PATH
#     TRAINING_DATA = DATA_PATH + r'\train_cat.csv'
#     TEST_DATA = DATA_PATH + r'\test_cat.csv'
#     df = pd.read_csv(TRAINING_DATA)  # .head(50)
#     df_test = pd.read_csv(TEST_DATA)  # .head(50)
#     submission = pd.read_csv(r'C:\Users\abhis\Documents\01_proj\input_data\submission.csv')
#     train_len = len(df)
#
#     df_test['target'] = -1
#
#     full_data = pd.concat([df, df_test])
#
#     cols = [c for c in df.columns if c not in ['id', 'target']]
#     print(cols)
#     cat_feats = CategoricalFeatures(full_data, categorical_features=cols, encoding_type='ohe', handle_na=True)
#
#     full_data_transformed = cat_feats.fit_transform()
#     # test_transformed = cat_feats.transform(df_test)
#
#     train_transformed = full_data_transformed[:train_len, :]
#     test_transformed = full_data_transformed[train_len:, :]
#
#     print(train_transformed.shape)
#     print(test_transformed.shape)
#
#     clf = linear_model.LogisticRegression()
#     clf.fit(train_transformed, df.target.values)
#     pred = clf.predict_proba(test_transformed)[:, 1]
#
#     submission.loc[:, 'target'] = pred
#     submission.to_csv(r'C:\Users\abhis\Documents\01_proj\input_data\submission.csv', index=False)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Hackathon Framework

The objective of this project is to enable quick experimentation in data analytics projects with minimal cookie-cutter programming.
Getting rid of all the fit_transforms!

---
***NOTE***

- This is a work in progress. The underlying modules are still under development.
- As this project matures there will be changes in scripts such as `train.py` and `predict.py`
- TODO
    * Create modules for `tuning` and `stacking`
    * Removal of some modules that are redundant


---

The framework is designed to make the data science flow easier to perform, by encapsulating the different techniques for each step within one method.
There are classes for each of the steps listed below:

- Feature Evaluation
    * Report to give an intuition of the dataset

- Feature Engineering
    * Modules to perform feature transformations on categorical and numerical data.
    * The applicable techniques are encapsulated within these modules and are accessed with an argument.

- Feature Generation
    * Module to create new features based on different techniques

- Cross Validation
    * Stratified folding, both for regression and classification

- Training
    * Run multiple models using one class.
    * Evaluating and saving the results in an organized manner

- Tuning
    * Hyper-parameter tuning of multiple models, based on json arguments for parameter values.

- Prediction

- Evaluating the model




## Steps to use the framework



1. Clone the repo.
2. Create 3 folders: `input`, `models` and `tune`.
3. Save the training, testing and sample submission files in the `input` folder.
4. The outputs generated from training, such as the trained models, encoders and oof_preds, will be saved in the `models` folder.
5. The parameters for fine tuning the models should be saved in the `tune` folder.
6. Update `config.py` to point it to the correct paths for data, models and tuning.
7. Update `dispatcher.py` with the model/models you want to run your dataset on.
8. Use the sample notebook to understand how to use this framework after this initial configuration is completed; a minimal sketch is also shown below.
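
A minimal end-to-end sketch of the intended flow (the file name `input/train.csv` and the column names `id` and `price` are placeholders for your own dataset, and the features are assumed to already be numeric):

```python
import pandas as pd

from src.cross_validation import CrossValidation
from src.engine import Engine

# 1. Create stratified folds for a single-column regression target.
df = pd.read_csv("input/train.csv")
cross_val = CrossValidation(
    df=df,
    target_cols=["price"],
    problem_type="single_col_regression",
    stratified_regression=True,
)
df_folds = cross_val.split()

# 2. Train every model registered in dispatcher.py and save the
#    out-of-fold predictions (and, optionally, the models) under ./models/.
engine = Engine(
    dataframe=df_folds,
    id_col="id",
    target_col="price",
    folds=5,
    problem_type="regression",
    save_model=True,
)
engine.train_models()

# 3. Evaluate the saved out-of-fold predictions.
print(engine.evaluate(model_list=["lr"], metric="rmse"))
```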

## Description of Files and their Purpose

- `config.py`: Config file giving the paths of all the datasets and other standard configuration items, such as csv file paths, random seed, etc.

- `feature_eval.py`: This script and the class inside it are used to analyse the dataframe and its columns to get the following output:
    - min, max and unique values of each column
    - histogram / distribution of each column
    - correlation of columns using a heat map

- `feature_gen.py`: Encapsulates methods to generate new features. Currently implements the `Polynomial features` method from sklearn.
  Returns a dataframe with the new features.

- `feature_impute.py`: Encapsulates methods to impute blank values in a dataframe.
  Currently, it supports 3 imputation methods:
    - Simple imputer
    - Model-based imputer: Extra Trees or knn
    - Knn-based imputer
    - Returns the updated dataframe

- `cross_validation.py`: This class is used to perform cross validation on any dataframe based on the type of problem statement. It is used to create the cross-validated dataset.

- `categorical.py`: This class can be used for encoding of categorical features in a given dataframe.
    - Inputs: Dataframe, Categorical Columns List, Type of Encoding
    - Output: Encoded Dataframe
    - Supported Encoding Techniques:
        - Label Encoding
        - Binary Encoding
        - One Hot Encoding

- `numerical.py`: This class can be used for transformation of numerical features in a given dataframe.
    - Inputs: Dataframe, Numerical Columns List, Type of Transformation
    - Output: Transformed Dataframe, Transformer Object for later use.
    - Supported Techniques:
        - Standard Scaler
        - Min-Max Scaler
        - Power Transformer
        - Log Transformer
- `metrics.py`: This class can be used to evaluate the results of given predictions against the actual values.

- `dispatcher.py`: Python file with models and parameters. It has been designed to supply the models to `engine.py` for training on a given dataset.

- `engine.py`: This script encapsulates the methods to train and evaluate multiple models in one run.
    - Leverages `dispatcher.py` and `metrics.py` for models and metrics
    - The results for each fold are also saved in the `models` folder as `oof_predictions.csv` for each model.
    - **To Do** Stacking module to support stacking of multiple models

- **Scripts to be ignored for now**:
    - `train.py`: For training
    - `predict.py`: For prediction
    - `tune.py`: For tuning hyper-parameters
    - `create_folds.py`: To create the folded dataframe
--------------------------------------------------------------------------------
/src/cross_validation.py:
--------------------------------------------------------------------------------
import numpy as np
from sklearn import model_selection


"""

Categories to tackle
- binary classification
- multi class classification
- multi label classification
- single column regression
- multi column regression
- hold out

"""


class CrossValidation:
    def __init__(
        self,
        df,
        target_cols,
        shuffle=False,
        problem_type="binary_classification",
        stratified_regression=False,
        multilabel_delimiter=",",
        n_folds=5,
        random_state=42,
    ):
        self.dataframe = df
        self.target_cols = target_cols
        self.num_targets = len(target_cols)
        self.shuffle = shuffle
        self.problem_type = problem_type
        self.stratified_regression = stratified_regression
        self.num_folds = n_folds
        self.random_state = random_state
        self.multilabel_delimiter = multilabel_delimiter

        if self.shuffle is True:
            self.dataframe = self.dataframe.sample(frac=1).reset_index(drop=True)

        self.dataframe["kfold"] = -1

    @staticmethod
    def _sort_partition(y, num_folds):
        # Sort-based stratification for regression targets: rank the target
        # values, then give each consecutive group of `num_folds` ranked
        # samples the same bin label. StratifiedKFold on these bins then puts
        # roughly one sample from every part of the target distribution into
        # each fold.
        n = len(y)
        cats = np.empty(n, dtype="u4")
        div, mod = divmod(n, num_folds)
        cats[: n - mod] = np.repeat(range(div), num_folds)
        cats[n - mod :] = div + 1  # leftover samples share one extra bin
        return cats[np.argsort(np.argsort(y))]
    def split(self):
        if self.problem_type in ("binary_classification", "multiclass_classification"):
            if self.num_targets != 1:
                raise Exception(
                    "Invalid number of targets for this type of problem statement"
                )
            target = self.target_cols[0]
            unique_values = self.dataframe[target].nunique()
            if unique_values == 1:
                raise Exception("Only one class present in the data, no ML is needed")
            elif unique_values > 1:
                kf = model_selection.StratifiedKFold(
                    n_splits=self.num_folds, shuffle=False
                )
                for fold, (train_idx, val_idx) in enumerate(
                    kf.split(X=self.dataframe, y=self.dataframe[target].values)
                ):
                    self.dataframe.loc[val_idx, "kfold"] = fold

        # The stratified-regression case has to be checked before the plain
        # regression case, otherwise it can never be reached.
        elif (
            self.problem_type == "single_col_regression"
            and self.stratified_regression
        ):
            if self.num_targets != 1:
                raise Exception("Invalid number of targets for this problem type")
            target = self.target_cols[0]
            y = self.dataframe[target].values
            y_categorized = self._sort_partition(y, self.num_folds)
            kf = model_selection.StratifiedKFold(
                n_splits=self.num_folds, shuffle=False
            )
            for fold, (train_idx, val_idx) in enumerate(
                kf.split(X=self.dataframe, y=y_categorized)
            ):
                self.dataframe.loc[val_idx, "kfold"] = fold

        elif self.problem_type in ("single_col_regression", "multi_col_regression"):
            if self.num_targets != 1 and self.problem_type == "single_col_regression":
                raise Exception("Invalid number of targets for this type of problem")
            if self.num_targets < 2 and self.problem_type == "multi_col_regression":
                raise Exception("Invalid number of targets for this type of problem")
            kf = model_selection.KFold(n_splits=self.num_folds)
            for fold, (train_idx, val_idx) in enumerate(kf.split(X=self.dataframe)):
                self.dataframe.loc[val_idx, "kfold"] = fold

        elif self.problem_type.startswith("holdout_"):
            # e.g. "holdout_20" keeps the last 20% of the rows as fold 1.
            holdout_percentage = int(self.problem_type.split("_")[1])
            num_holdout_samples = int(len(self.dataframe) * holdout_percentage / 100)
            self.dataframe.loc[: len(self.dataframe) - num_holdout_samples, "kfold"] = 0
            self.dataframe.loc[len(self.dataframe) - num_holdout_samples :, "kfold"] = 1

        elif self.problem_type == "multilabel_classification":
            if self.num_targets != 1:
                raise Exception("Invalid number of targets for this problem type")
            # Stratify on the number of labels per sample.
            targets = self.dataframe[self.target_cols[0]].apply(
                lambda x: len(str(x).split(self.multilabel_delimiter))
            )
            kf = model_selection.StratifiedKFold(n_splits=self.num_folds)
            for fold, (train_idx, val_idx) in enumerate(
                kf.split(X=self.dataframe, y=targets)
            ):
                self.dataframe.loc[val_idx, "kfold"] = fold

        else:
            raise Exception("Problem type not understood!")

        return self.dataframe


# if __name__ == "__main__":
#     import config
#     import pandas as pd
#
#     REG_DATA = config.RAW_DATA
#     df = pd.read_csv(REG_DATA)
#     cross_val = CrossValidation(df=df, target_cols=['price'], problem_type='single_col_regression',
#                                 stratified_regression=True)
#     df_folds = cross_val.split()
#     print(df_folds.head())
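#
#     # The same class also handles simple hold-out splits; a sketch for a 20%
#     # hold-out (fold 1 becomes the hold-out set):
#     # hold_out = CrossValidation(df=df, target_cols=['price'], problem_type='holdout_20')
#     # df_holdout = hold_out.split()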
--------------------------------------------------------------------------------
/src/engine.py:
--------------------------------------------------------------------------------
import pandas as pd
from src import dispatcher
import joblib
from pathlib import Path
from src.metrics import RegressionMetric, ClassificationMetric


class Engine:
    def __init__(
        self,
        dataframe,
        id_col: str,
        target_col: str,
        folds: int,
        unused_col: list = None,
        problem_type: str = "regression",
        model_list: list = None,
        save_model: bool = False,
    ):
        """

        :param dataframe: Dataframe for the folded dataset to be used for training/tuning
        :param id_col: String for the column name that is used for identifying the rows in the dataframe
        :param target_col: String of the column name that is to be used for prediction
        :param folds: Number of folds in the dataframe
        :param unused_col: List of additional columns that are not to be considered for training
        :param model_list: List of models that are to be run for comparison, tuning or training.
        :param problem_type: This will define the type of models to pick. Possible values: regression/classification
        :param save_model: True or False to specify if you want to save the model file
        """
        self.df = dataframe
        self.id = id_col
        self.target = target_col
        self.folds = folds
        self.problem = problem_type
        self.models = model_list
        self.save_model = save_model
        self.model_dict = dict()
        self.result_dict = dict()

        # The id, target and fold columns must never be used as features;
        # any user-supplied columns are dropped on top of those.
        base_unused = [self.id, self.target, "kfold"]
        if unused_col is None:
            self.unused_col = base_unused
        else:
            self.unused_col = list(dict.fromkeys(base_unused + unused_col))

        if self.problem == "regression":
            self.models_used = dispatcher.REGRESSION_MODELS
        else:
            self.models_used = dispatcher.CLASSIFICATION_MODELS

        self.output_path = Path("./models/")

    @staticmethod
    def _generate_mapping(folds):
        # For every fold i, train on all the other folds.
        fold_dict = dict()
        for i in range(folds):
            fold_dict[i] = [x for x in range(folds) if x != i]
        return fold_dict

    def _save_model(self, model: str, fold: int, clf):
        joblib.dump(clf, f"{self.output_path}/{str(model)}__{str(fold)}__.pkl")
        return

    def _save_result(self):
        for model_result in self.result_dict.keys():
            result_df = pd.DataFrame(self.result_dict[model_result])
            result_df.to_csv(
                f"{self.output_path}/{str(model_result)}__oof_predictions.csv",
                index=False,
            )
        return

    def _train_model(self, model, fold_dict: dict):
        idx = list()
        actuals = list()
        predictions = list()
        fold_list = list()
        for fold in range(self.folds):
            train_df = self.df[self.df.kfold.isin(fold_dict.get(fold))]
            valid_df = self.df[self.df.kfold == fold]
            ytrain = train_df[self.target].values
            yvalid = valid_df[self.target].values
            idx.extend(valid_df[self.id].values.tolist())
            train_df = train_df.drop(self.unused_col, axis=1)
            valid_df = valid_df.drop(self.unused_col, axis=1)
            valid_df = valid_df[train_df.columns]
            clf = self.models_used[model]
            clf.fit(train_df, ytrain)
            preds = clf.predict(valid_df)
            predictions.extend(preds.tolist())
            actuals.extend(yvalid.tolist())
            fold_list.extend([fold] * len(yvalid))
            if self.save_model:
                self._save_model(model, fold, clf)
        result = {
            self.id: idx,
            "Predictions": predictions,
            "Actuals": actuals,
            "Fold": fold_list,
        }
        return result

    def train_models(self):
        fold_dict = self._generate_mapping(self.folds)
        if self.models is None:
            self.models = list(self.models_used.keys())
        for model in self.models:
            if self.save_model:
                print(
                    f"Training model: {str(model)}, and saving the model and results at: {str(self.output_path)}"
                )
            else:
                print(
                    f"Training model: {str(model)}, and saving only the results at: {str(self.output_path)}"
                )
            self.result_dict[model] = self._train_model(model, fold_dict)
        print("Saving the results of the trained models")
        self._save_result()
        return
    def stack(self):
        # TODO Implement stacking in Engine. IP: model_preds, meta_model | OP: Pred, result
        return

    def evaluate(
        self, model_list: list, metric: str = None, target_transformer: dict = None
    ):
        """

        :param model_list: List of models which need to be evaluated
        :param metric: Metric that is to be calculated for the models
        :param target_transformer: If any transformer was used on the target, pass that transformer object here.
        :return: dict mapping each model name to its computed metric
        """
        # Collect one score per model rather than keeping only the last one.
        results = dict()
        for model in model_list:
            print("Evaluating the result for model {0}".format(str(model)))
            result_df = pd.read_csv(
                f"{self.output_path}/{str(model)}__oof_predictions.csv"
            )
            if self.problem == "regression":
                metric_type = RegressionMetric()
            else:
                metric_type = ClassificationMetric()
            if target_transformer is None:
                results[model] = metric_type(
                    metric,
                    result_df["Actuals"].values.reshape(-1, 1),
                    result_df["Predictions"].values.reshape(-1, 1),
                )
            else:
                # Undo the target transformation before scoring.
                results[model] = metric_type(
                    metric,
                    target_transformer[self.target].inverse_transform(
                        result_df["Actuals"].values.reshape(-1, 1)
                    ),
                    target_transformer[self.target].inverse_transform(
                        result_df["Predictions"].values.reshape(-1, 1)
                    ),
                )

        return results


# if __name__ == "__main__":
#     import pandas as pd
#
#     df = pd.read_csv(r'C:\Users\abhis\Documents\01_proj\kaggle_comp\sample_ihsm\input\train_folds.csv')
#     engine = Engine(
#         dataframe=df,
#         id_col='vehicle_id',
#         target_col='Price_USD',
#         folds=5,
#         save_model=True,
#         model_list=['lr']
#     )
#     engine.train_models()
#     my_result = engine.evaluate(model_list=['lr'], metric='mape')
#     print(my_result)
--------------------------------------------------------------------------------