├── .idea
│   ├── .gitignore
│   ├── vcs.xml
│   ├── misc.xml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── modules.xml
│   └── framework.iml
├── .gitattributes
├── src
│   ├── config.py
│   ├── dispatcher.py
│   ├── create_folds.py
│   ├── predict.py
│   ├── feature_gen.py
│   ├── feature_eval.py
│   ├── train.py
│   ├── feature_impute.py
│   ├── metrics.py
│   ├── numerical.py
│   ├── categorical.py
│   ├── cross_validation.py
│   └── engine.py
├── LICENSE
├── .gitignore
└── README.md
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
--------------------------------------------------------------------------------
/src/config.py:
--------------------------------------------------------------------------------
1 | DATA_PATH = r"/input/"
2 | TUNE_PATH = r"./tune/"
3 | MODEL_PATH = r"./models/"
4 | FOLDS = 5
5 |
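6 | # NOTE: assumed defaults (hypothetical filenames). create_folds.py, train.py
7 | # and predict.py expect config.RAW_DATA, config.TRAINING_DATA and
8 | # config.TEST_DATA to exist; point these at your own input files.
9 | RAW_DATA = DATA_PATH + "diamonds.csv"
10 | TRAINING_DATA = DATA_PATH + "diamonds_folds.csv"
11 | TEST_DATA = DATA_PATH + "test.csv"
12 |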
--------------------------------------------------------------------------------
/src/dispatcher.py:
--------------------------------------------------------------------------------
1 | from sklearn import ensemble
2 | from sklearn import linear_model
3 | from sklearn import svm
4 |
5 | # TODO Implement models with good default arguments
6 |
7 | REGRESSION_MODELS = dict(
8 | lr=linear_model.LinearRegression(),
9 | svm=svm.SVR(C=0.001, epsilon=0.001, gamma="scale"),
10 |     randomforest=ensemble.RandomForestRegressor(
11 |         n_estimators=100, n_jobs=-1, verbose=1
12 |     ),
13 |     extratrees=ensemble.ExtraTreesRegressor(n_estimators=100, n_jobs=-1, verbose=1),
14 | )
15 |
16 |
17 | CLASSIFICATION_MODELS = dict(
18 | lr=linear_model.LogisticRegression(),
19 | svm=svm.SVC(),
20 |     randomforest=ensemble.RandomForestClassifier(
21 |         n_estimators=200, n_jobs=-1, verbose=1
22 |     ),
23 |     extratrees=ensemble.ExtraTreesClassifier(n_estimators=100, n_jobs=-1, verbose=1),
24 | )
25 |
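26 | # Usage sketch: engine.py picks models out of these dicts by key, e.g.
27 | #   clf = REGRESSION_MODELS["randomforest"]
28 | #   clf.fit(X_train, y_train)  # X_train/y_train are placeholders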
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Abhishek Kumar Mishra
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/src/create_folds.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from src import config
3 | from src.cross_validation import CrossValidation
4 |
5 | RAW_DATA = config.RAW_DATA
6 | FOLDS_DATA = config.DATA_PATH + "diamonds_folds.csv"
7 | # REG_DATA = config.REG_DATA
8 | # FOLDS_DATA_REG = config.DATA_PATH+r'\train_folds_reg.csv'
9 |
10 |
11 | if __name__ == "__main__":
12 | # df = pd.read_csv(REG_DATA)
13 | # df['kfold'] = -1
14 |     df = pd.read_csv(RAW_DATA)
15 | df = df.sample(frac=1).reset_index(drop=True)
16 |
17 | # kf = model_selection.StratifiedKFold(n_splits=5, shuffle=False, random_state=42)
18 |
19 | # for fold, (train_idx, val_idx) in enumerate(kf.split(X=df, y=df.target.values)):
20 | # print(len(train_idx), len(val_idx))
21 | # df.loc[val_idx, 'kfold'] = fold
22 |
23 | # cross_val = CrossValidation(df = df, target_cols=['price'], problem_type='single_col_regression',
24 | # stratified_regression = True)
25 | cross_val = CrossValidation(
26 | df=df,
27 |         target_cols=["price"],
28 | problem_type="single_col_regression",
29 | stratified_regression=True,
30 | )
31 | df_folds = cross_val.split()
32 | # df.to_csv(FOLDS_DATA_REG, index=False)
33 | df_folds.to_csv(FOLDS_DATA, index=False)
34 |
--------------------------------------------------------------------------------
/src/predict.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import numpy as np
3 | import pandas as pd
4 |
5 | import joblib
6 |
7 | from src import config
8 |
9 | TRAINING_DATA = config.TRAINING_DATA
10 | TEST_DATA = config.TEST_DATA
11 | FOLDS = config.FOLDS
12 |
13 |
14 | def predict(MODEL, FOLDS):
15 |     # average the predictions from each fold's model
16 | df = pd.read_csv(TEST_DATA)
17 | text_idx = df["id"].values
18 | predictions = None
19 |
20 | for FOLD in range(FOLDS):
21 | print(FOLD)
22 | df = pd.read_csv(TEST_DATA)
23 | encoders = joblib.load(f"models/{MODEL}_{FOLD}_label_encoder.pkl")
24 | cols = joblib.load(f"models/{MODEL}_{FOLD}_columns.pkl")
25 | for c in encoders:
26 | print(c)
27 | lbl = encoders[c]
28 | df.loc[:, c] = lbl.transform(df[c].values.tolist())
29 |
30 | clf = joblib.load(f"models/{MODEL}_{FOLD}_.pkl")
31 | df = df[cols]
32 | preds = clf.predict_proba(df)[:, 1]
33 |
34 | if FOLD == 0:
35 | predictions = preds
36 | else:
37 | predictions += preds
38 |
39 |     predictions /= FOLDS
40 |
41 | sub = pd.DataFrame(
42 | np.column_stack((text_idx, predictions)), columns=["id", "target"]
43 | )
44 | return sub
45 |
46 |
47 | if __name__ == "__main__":
48 |
49 | parser = argparse.ArgumentParser()
50 | parser.add_argument("model", help="Type in the model you want to run", type=str)
51 | args = parser.parse_args()
52 |
53 | MODEL = args.model
54 |
55 | submission = predict(MODEL, FOLDS)
56 | submission.id = submission.id.astype(int)
57 | submission.to_csv(f"models/{MODEL}.csv", index=False)
58 |
--------------------------------------------------------------------------------
/src/feature_gen.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from sklearn import preprocessing
3 |
4 |
5 | class FeatureGen:
6 | def __init__(
7 | self,
8 | df,
9 | target_cols: list = None,
10 | degree: int = 2,
11 | interaction_only: bool = False,
12 | include_bias: bool = True,
13 | feature_gen: str = "poly",
14 | ):
15 | """
16 | :param df: Dataframe that needs to be used for generation of feature
17 | :param target_cols: List of columns that the method needs to be applied on
18 |         :param feature_gen: Method used to generate features; "poly" = PolynomialFeatures from sklearn
19 | """
20 | self.df = df
21 | self.feature_gen = feature_gen
22 | self.target_cols = target_cols
23 | self.degree = degree
24 | self.interaction_only = interaction_only
25 | self.include_bias = include_bias
26 |
27 | if self.target_cols is None:
28 | self.target_cols = self.df.columns
29 |
30 | def fit_transform(self):
31 | if self.feature_gen == "poly":
32 |             polynomial = preprocessing.PolynomialFeatures(
33 |                 degree=self.degree, interaction_only=self.interaction_only, include_bias=self.include_bias
34 |             )
35 | new_features = polynomial.fit_transform(self.df[self.target_cols].values)
36 | new_features = pd.DataFrame(new_features)
37 | output_df = pd.concat([self.df, new_features], axis=1)
38 | return output_df
39 |
40 |
41 | if __name__ == "__main__":
42 | import pandas as pd
43 |
44 | df = pd.read_csv(
45 | r"C:\Users\abhis\Documents\01_proj\kaggle_comp\sample_ihsm\input\train_sample.csv"
46 | )
47 | poly = FeatureGen(df, degree=2)
48 | new_df = poly.fit_transform()
49 | print(new_df.head())
50 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # celery beat schedule file
95 | celerybeat-schedule
96 |
97 | # SageMath parsed files
98 | *.sage.py
99 |
100 | # Environments
101 | .env
102 | .venv
103 | env/
104 | venv/
105 | ENV/
106 | env.bak/
107 | venv.bak/
108 |
109 | # Spyder project settings
110 | .spyderproject
111 | .spyproject
112 |
113 | # Rope project settings
114 | .ropeproject
115 |
116 | # mkdocs documentation
117 | /site
118 |
119 | # mypy
120 | .mypy_cache/
121 | .dmypy.json
122 | dmypy.json
123 |
124 | # Pyre type checker
125 | .pyre/
126 |
127 | # Other files
128 | *.csv
129 | *.pkl
130 | .vscode
131 | .idea
--------------------------------------------------------------------------------
/src/feature_eval.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import seaborn as sns
3 |
4 |
5 | class FeatEvaluation:
6 | def __init__(self, df, target_col: str = None):
7 | """
8 | :param df: Dataframe which will be analysed
9 |         :param target_col: String of the column name that is the target for this analysis in the dataframe
10 | """
11 | self.df = df
12 | self.target = target_col
13 |
14 | def stat_desc(self, col):
15 | if self.df[col].dtype == "O":
16 | return "Categorical Data"
17 | else:
18 | return self.df[col].describe().loc[["min", "max"]]
19 |
20 | def feature_report(self):
21 | print("Feature Report Generated for all the columns in the Dataframe")
22 | for col in self.df.columns:
23 | print("\n")
24 | print(f"Feature Report for Column: {col}")
25 | print("~~~~~~==================~~~~~~")
26 | print(str(self.stat_desc(col)))
27 | print(f"No of Unique Values: {self.df[col].nunique()}")
28 |             print(f"Value counts for the column: {self.df[col].value_counts()}")
29 | return
30 |
31 | def feature_plot(self):
32 | for col in self.df.columns:
33 | print("Plotting the Distribution for: {0}".format(col))
34 | if self.df[col].dtype == "O":
35 | plt.figure(figsize=(16, 9))
36 | sns.boxplot(x=col, y=self.target, data=self.df)
37 | plt.show()
38 | else:
39 | plt.figure(figsize=(16, 9))
40 |                 sns.histplot(self.df[col].values, kde=True)
41 | plt.show()
42 | return
43 |
44 |     def correlation_plot(self):
45 | corr = self.df.corr()
46 | plt.figure(figsize=(16, 9))
47 | sns.heatmap(
48 | corr,
49 | annot=True,
50 | vmin=-1,
51 | vmax=1,
52 | center=0,
53 | cmap="coolwarm",
54 | linewidths=1.5,
55 | linecolor="black",
56 | )
57 | plt.show()
58 | return
59 |
60 |
61 | # if __name__ == "__main__":
62 | # import config
63 | # import pandas as pd
64 | # RAW_TRAIN_DATA = config.RAW_DATA
65 | # TEST_DATA = config.TEST_DATA
66 | #
67 | # train_df = pd.read_csv(RAW_TRAIN_DATA)
68 | # test_df = pd.read_csv(TEST_DATA)
69 | # test_df['target'] = -99999
70 | # eval = FeatEvaluation(train_df, 'price')
71 | # print(eval.feature_report())
72 |
--------------------------------------------------------------------------------
/src/train.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import pandas as pd
3 |
4 | from sklearn import preprocessing
5 | from sklearn import metrics
6 |
7 | import joblib
8 |
9 | from src import dispatcher, config
10 |
11 | TRAINING_DATA = config.TRAINING_DATA
12 | TEST_DATA = config.TEST_DATA
13 | FOLDS = config.FOLDS
14 |
15 | FOLD_MAPPING = {
16 | 0: [1, 2, 3, 4],
17 | 1: [0, 2, 3, 4],
18 | 2: [0, 1, 3, 4],
19 | 3: [0, 1, 2, 4],
20 | 4: [0, 1, 2, 3],
21 | }
22 |
23 | if __name__ == "__main__":
24 | # TODO Update the train.py to use the ML Framework for all the work
25 | parser = argparse.ArgumentParser()
26 | parser.add_argument("model", help="Type in the model you want to run", type=str)
27 | args = parser.parse_args()
28 |
29 | MODEL = args.model
30 |
31 | df = pd.read_csv(TRAINING_DATA)
32 | df_test = pd.read_csv(TEST_DATA)
33 | idx = []
34 | predictions = []
35 |
36 | for FOLD in range(FOLDS):
37 | train_df = df[df.kfold.isin(FOLD_MAPPING.get(FOLD))]
38 | valid_df = df[df.kfold == FOLD]
39 |
40 | ytrain = train_df.target.values
41 | yvalid = valid_df.target.values
42 |
43 | idx.extend(valid_df["id"].values.tolist())
44 |
45 | train_df = train_df.drop(["id", "kfold", "target"], axis=1)
46 | valid_df = valid_df.drop(["id", "kfold", "target"], axis=1)
47 | valid_df = valid_df[train_df.columns]
48 |
49 | label_encoder = {}
50 | for c in train_df.columns:
51 | lbl = preprocessing.LabelEncoder()
52 | lbl.fit(
53 | train_df[c].values.tolist()
54 | + valid_df[c].values.tolist()
55 | + df_test[c].values.tolist()
56 | )
57 | train_df.loc[:, c] = lbl.transform(train_df[c].values.tolist())
58 | valid_df.loc[:, c] = lbl.transform(valid_df[c].values.tolist())
59 | label_encoder[c] = lbl
60 |
61 |         clf = dispatcher.CLASSIFICATION_MODELS[MODEL]
62 | clf.fit(train_df, ytrain)
63 | preds = clf.predict_proba(valid_df)[:, 1]
64 | predictions.extend(preds.tolist())
65 | print(metrics.roc_auc_score(yvalid, preds))
66 |
67 | joblib.dump(label_encoder, f"models/{MODEL}_{FOLD}_label_encoder.pkl")
68 | joblib.dump(clf, f"models/{MODEL}_{FOLD}_.pkl")
69 | joblib.dump(train_df.columns, f"models/{MODEL}_{FOLD}_columns.pkl")
70 | oof_dict = {"id": idx, "Predictions": predictions}
71 | oof_pred = pd.DataFrame(oof_dict)
72 | oof_pred.to_csv(f"models/{MODEL}_oof_predictions.csv")
73 |
--------------------------------------------------------------------------------
/src/feature_impute.py:
--------------------------------------------------------------------------------
1 | from sklearn import impute
2 | from sklearn.experimental import enable_iterative_imputer
3 | from sklearn import linear_model
4 | from sklearn import ensemble
5 | from sklearn import neighbors
6 |
7 |
8 | class FeatureImpute:
9 | def __init__(
10 | self,
11 | dataframe,
12 | target_col: list,
13 | impute_method: str = "simple",
14 | impute_model: str = "lr",
15 |         impute_strategy: str = "mean",
16 | ):
17 | """
18 |
19 | :param dataframe: Dataframe that is to be imputed
20 | :param target_col: List of columns on which imputation is to be performed
21 |         :param impute_method: String to define the imputation method: 'simple', 'model', 'knn'
22 |         :param impute_model: String to define which model is used for imputation. Values: 'lr', 'et', 'knn'
23 |         :param impute_strategy: String to define the strategy used by the SimpleImputer (e.g. 'mean')
24 | """
25 |
26 | self.df = dataframe
27 | self.target = target_col
28 | self.impute_method = impute_method
29 | self.model = impute_model
30 |         self.strategy = impute_strategy
31 |
32 | if self.model == "et":
33 | self.estimator = ensemble.ExtraTreesRegressor(
34 | n_estimators=50, random_state=42
35 | )
36 | elif self.model == "knn":
37 | self.estimator = neighbors.KNeighborsRegressor(n_neighbors=15)
38 | else:
39 | self.estimator = linear_model.LinearRegression()
40 |
41 | self.output_df = self.df.copy(deep=True)
42 |
43 | def _simple_impute(self):
44 | for col in self.target:
45 |             s_impute = impute.SimpleImputer(strategy=self.strategy)
46 |             # SimpleImputer expects a 2D array, so reshape the single column
47 |             self.output_df.loc[:, col] = s_impute.fit_transform(self.df[col].values.reshape(-1, 1))
48 | return self.output_df
49 |
50 | def _model_impute(self):
51 | for col in self.target:
52 |             m_impute = impute.IterativeImputer(
53 |                 estimator=self.estimator, random_state=42
54 |             )
55 |             # IterativeImputer expects a 2D array, so reshape the single column
56 |             self.output_df.loc[:, col] = m_impute.fit_transform(self.df[col].values.reshape(-1, 1))
57 | return self.output_df
58 |
59 | def _knn_impute(self):
60 | for col in self.target:
61 |             k_impute = impute.KNNImputer()
62 |             # KNNImputer expects a 2D array, so reshape the single column
63 |             self.output_df.loc[:, col] = k_impute.fit_transform(self.df[col].values.reshape(-1, 1))
64 | return self.output_df
65 |
66 |     def fit_transform(self):
67 | if self.impute_method == "simple":
68 | return self._simple_impute()
69 | elif self.impute_method == "model":
70 | return self._model_impute()
71 | elif self.impute_method == "knn":
72 | return self._knn_impute()
73 | else:
74 | raise Exception("Imputation Type not defined")
75 |
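76 |
77 | # Minimal usage sketch (assumed toy data):
78 | # if __name__ == "__main__":
79 | #     import numpy as np
80 | #     import pandas as pd
81 | #
82 | #     df = pd.DataFrame({"a": [1.0, np.nan, 3.0, 4.0]})
83 | #     imputer = FeatureImpute(df, target_col=["a"], impute_method="simple")
84 | #     print(imputer.fit_transform().head())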
--------------------------------------------------------------------------------
/src/metrics.py:
--------------------------------------------------------------------------------
1 | from sklearn import metrics as skmetrics
2 | import numpy as np
3 |
4 |
5 | class RegressionMetric:
6 | def __init__(self):
7 | self.metrics = {
8 | "mae": self._mae,
9 | "mse": self._mse,
10 | "rmse": self._rmse,
11 | "msle": self._msle,
12 | "rmsle": self._rmsle,
13 | "r2": self._r2,
14 | "mape": self._mape,
15 | }
16 |
17 | def __call__(self, metric, y_true, y_pred):
18 | if metric not in self.metrics:
19 | raise Exception("Metric not implemented")
20 | else:
21 | return self.metrics[metric](y_true=y_true, y_pred=y_pred)
22 |
23 | @staticmethod
24 | def _mae(y_true, y_pred):
25 | return skmetrics.mean_absolute_error(y_true=y_true, y_pred=y_pred)
26 |
27 | @staticmethod
28 | def _mse(y_true, y_pred):
29 | return skmetrics.mean_squared_error(y_true=y_true, y_pred=y_pred)
30 |
31 | def _rmse(self, y_true, y_pred):
32 | return np.sqrt(self._mse(y_true=y_true, y_pred=y_pred))
33 |
34 | @staticmethod
35 | def _msle(y_true, y_pred):
36 | return skmetrics.mean_squared_log_error(y_true=y_true, y_pred=y_pred)
37 |
38 | def _rmsle(self, y_true, y_pred):
39 | return np.sqrt(self._msle(y_true=y_true, y_pred=y_pred))
40 |
41 | @staticmethod
42 | def _r2(y_true, y_pred):
43 | return skmetrics.r2_score(y_true=y_true, y_pred=y_pred)
44 |
45 | @staticmethod
46 | def _mape(y_true, y_pred):
47 | y_true, y_pred = np.array(y_true), np.array(y_pred)
48 | epsilon = np.finfo(np.float64).eps
49 | mape = np.abs(y_pred - y_true) / np.maximum(np.abs(y_true), epsilon)
50 | output_errors = np.average(mape)
51 | return output_errors
52 |
53 |
54 | class ClassificationMetric:
55 | def __init__(self):
56 | self.metrics = {
57 | "accuracy": self._accuracy,
58 | "f1": self._f1,
59 | "recall": self._recall,
60 | "precision": self._precision,
61 | "auc": self._auc,
62 | "logloss": self._logloss,
63 | }
64 |
65 | def __call__(self, metric, y_true, y_pred, y_proba=None):
66 | if metric not in self.metrics:
67 | raise Exception("Metric not implemented")
68 | if metric == "auc":
69 | if y_proba is not None:
70 | return self._auc(y_true=y_true, y_pred=y_proba)
71 | else:
72 | raise Exception("y_proba cannot be None for AUC")
73 | elif metric == "logloss":
74 | if y_proba is not None:
75 |                 return self._logloss(y_true=y_true, y_pred=y_proba)
76 | else:
77 | raise Exception("y_proba cannot be None for LogLoss")
78 | else:
79 | return self.metrics[metric](y_true=y_true, y_pred=y_pred)
80 |
81 | @staticmethod
82 | def _auc(y_true, y_pred):
83 | return skmetrics.roc_auc_score(y_true=y_true, y_score=y_pred)
84 |
85 | @staticmethod
86 | def _accuracy(y_true, y_pred):
87 | return skmetrics.accuracy_score(y_true=y_true, y_pred=y_pred)
88 |
89 | @staticmethod
90 | def _f1(y_true, y_pred):
91 | return skmetrics.f1_score(y_true=y_true, y_pred=y_pred)
92 |
93 | @staticmethod
94 | def _recall(y_true, y_pred):
95 | return skmetrics.recall_score(y_true=y_true, y_pred=y_pred)
96 |
97 | @staticmethod
98 | def _precision(y_true, y_pred):
99 | return skmetrics.precision_score(y_true=y_true, y_pred=y_pred)
100 |
101 | @staticmethod
102 | def _logloss(y_true, y_pred):
103 | return skmetrics.log_loss(y_true=y_true, y_pred=y_pred)
104 |
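105 |
106 | # Minimal usage sketch (assumed toy values):
107 | # if __name__ == "__main__":
108 | #     reg_metric = RegressionMetric()
109 | #     print(reg_metric("rmse", y_true=[3.0, 5.0], y_pred=[2.5, 5.5]))  # 0.5
110 | #     clf_metric = ClassificationMetric()
111 | #     print(clf_metric("auc", y_true=[0, 1, 1], y_pred=None, y_proba=[0.1, 0.8, 0.7]))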
--------------------------------------------------------------------------------
/src/numerical.py:
--------------------------------------------------------------------------------
1 | from sklearn import preprocessing
2 | import numpy as np
3 |
4 |
5 | class NumericalFeatures:
6 | def __init__(self, df, numerical_features, encoding_type, handle_na=False):
7 | """
8 | df: pandas dataframe
9 |         numerical_features: List of columns that we want to transform
10 |         encoding_type: standard, min-max, power, log
11 | """
12 | self.df = df
13 | self.num_feats = numerical_features
14 | self.enc_type = encoding_type
15 | self.handle_na = handle_na
16 | self.stan_scaler = dict()
17 | self.min_max_encoder = dict()
18 | self.power_transform_encoder = dict()
19 | self.log_transform = dict()
20 |
21 | if self.handle_na:
22 | for c in self.num_feats:
23 |                 self.df.loc[:, c] = self.df.loc[:, c].fillna(-9999999999)
24 |
25 | self.output_df = self.df.copy(deep=True)
26 |
27 | def _standard_scaler(self):
28 | for c in self.num_feats:
29 | ss = preprocessing.StandardScaler()
30 | ss.fit(self.df[c].values.reshape(-1, 1))
31 | self.output_df.loc[:, c] = ss.transform(self.df[c].values.reshape(-1, 1))
32 | self.stan_scaler[c] = ss
33 | return self.output_df, self.stan_scaler
34 |
35 | def _min_max_scaler(self):
36 | for c in self.num_feats:
37 | mms = preprocessing.MinMaxScaler()
38 | mms.fit(self.df[c].values.reshape(-1, 1))
39 | self.output_df.loc[:, c] = mms.transform(self.df[c].values.reshape(-1, 1))
40 | self.min_max_encoder[c] = mms
41 | return self.output_df, self.min_max_encoder
42 |
43 | def _power_transform(self):
44 | for c in self.num_feats:
45 | powt = preprocessing.PowerTransformer()
46 | powt.fit(self.df[c].values.reshape(-1, 1))
47 | self.output_df.loc[:, c] = powt.transform(self.df[c].values.reshape(-1, 1))
48 | self.power_transform_encoder[c] = powt
49 | return self.output_df, self.power_transform_encoder
50 |
51 | def _log_transform(self):
52 | for c in self.num_feats:
53 | logt = preprocessing.FunctionTransformer(
54 | np.log1p, inverse_func=np.expm1, validate=True
55 | )
56 | logt.fit(self.df[c].values.reshape(-1, 1))
57 | self.output_df.loc[:, c] = logt.transform(self.df[c].values.reshape(-1, 1))
58 | self.log_transform[c] = logt
59 | return self.output_df, self.log_transform
60 |
61 | def fit_transform(self):
62 | if self.enc_type == "min-max":
63 | return self._min_max_scaler()
64 | elif self.enc_type == "standard":
65 | return self._standard_scaler()
66 | elif self.enc_type == "power":
67 | return self._power_transform()
68 | elif self.enc_type == "log":
69 | return self._log_transform()
70 | else:
71 | raise Exception("Transformation Type not understood")
72 |
73 | def transform(self, dataframe):
74 | if self.handle_na:
75 | for c in self.num_feats:
76 |                 dataframe.loc[:, c] = dataframe.loc[:, c].fillna(-9999999999)
77 |
78 | if self.enc_type == "min-max":
79 | for c, mms in self.min_max_encoder.items():
80 | dataframe.loc[:, c] = mms.transform(dataframe[c].values.reshape(-1, 1))
81 | return dataframe
82 | elif self.enc_type == "standard":
83 | for c, ss in self.stan_scaler.items():
84 | dataframe.loc[:, c] = ss.transform(dataframe[c].values.reshape(-1, 1))
85 | return dataframe
86 | elif self.enc_type == "power":
87 | for c, powt in self.power_transform_encoder.items():
88 | dataframe.loc[:, c] = powt.transform(dataframe[c].values.reshape(-1, 1))
89 | return dataframe
90 | elif self.enc_type == "log":
91 | for c, logt in self.log_transform.items():
92 | dataframe.loc[:, c] = logt.transform(dataframe[c].values.reshape(-1, 1))
93 | return dataframe
94 | else:
95 | raise Exception("Transformation not understood")
96 |
97 |
98 | # if __name__ == "__main__":
99 | # import pandas as pd
100 | #
101 | # df = pd.read_csv(r'C:\Users\abhis\OneDrive - IHS Markit\Python\00_practice\00_practice\diamonds.csv',
102 | # encoding='latin-1')
103 | # num_cols = ['price']
104 | # num_feat_transform = NumericalFeatures(df, num_cols, encoding_type='log', handle_na=False)
105 | # transformed_df = num_feat_transform.fit_transform()
106 | # print(transformed_df.head())
107 |
--------------------------------------------------------------------------------
/src/categorical.py:
--------------------------------------------------------------------------------
1 | from sklearn import preprocessing
2 |
3 |
4 | class CategoricalFeatures:
5 | def __init__(self, df, categorical_features, encoding_type, handle_na=False):
6 | """
7 | df: pandas dataframe
8 | categorical_features: List of Columns that we want to encode
9 |         encoding_type: label, binary, one-hot encoding (ohe)
10 | """
11 | self.df = df
12 | self.cat_feats = categorical_features
13 | self.enc_type = encoding_type
14 | self.label_encoders = dict()
15 | self.binary_encoders = dict()
16 | self.handle_na = handle_na
17 |
18 | if self.handle_na:
19 | for c in self.cat_feats:
20 | self.df.loc[:, c] = self.df.loc[:, c].astype(str).fillna("-9999999999")
21 |
22 | self.output_df = self.df.copy(deep=True)
23 |
24 | def _label_encoding(self):
25 | for c in self.cat_feats:
26 | lbl = preprocessing.LabelEncoder()
27 | lbl.fit(self.df[c].values)
28 | self.output_df.loc[:, c] = lbl.transform(self.df[c].values)
29 | self.label_encoders[c] = lbl
30 | return self.output_df
31 |
32 | def _label_binarizer(self):
33 | for c in self.cat_feats:
34 | lbl = preprocessing.LabelBinarizer()
35 | lbl.fit(self.df[c].values)
36 | val = lbl.transform(self.df[c].values)
37 | self.output_df = self.output_df.drop(c, axis=1)
38 | for j in range(val.shape[1]):
39 | new_col_name = c + f"__bin_{j}"
40 | self.output_df[new_col_name] = val[:, j]
41 | self.binary_encoders[c] = lbl
42 | return self.output_df
43 |
44 | def _one_hot(self):
45 | ohe = preprocessing.OneHotEncoder()
46 | ohe.fit(self.df[self.cat_feats].values)
47 | self.ohe = ohe
48 | return ohe.transform(self.df[self.cat_feats].values)
49 |
50 | def fit_transform(self):
51 | if self.enc_type == "label":
52 | return self._label_encoding()
53 | elif self.enc_type == "binary":
54 | return self._label_binarizer()
55 | elif self.enc_type == "ohe":
56 | return self._one_hot()
57 | else:
58 | raise Exception("Encoding Type not understood")
59 |
60 | def transform(self, dataframe):
61 | if self.handle_na:
62 | for c in self.cat_feats:
63 | dataframe.loc[:, c] = (
64 | dataframe.loc[:, c].astype(str).fillna("-9999999999")
65 | )
66 |
67 | if self.enc_type == "label":
68 | for c, lbl in self.label_encoders.items():
69 | dataframe.loc[:, c] = lbl.transform(dataframe[c].values)
70 | return dataframe
71 | elif self.enc_type == "binary":
72 | for c, lbl in self.binary_encoders.items():
73 | val = lbl.transform(dataframe[c].values)
74 | dataframe = dataframe.drop(c, axis=1)
75 | for j in range(val.shape[1]):
76 | new_col_name = c + f"__bin_{j}"
77 | dataframe[new_col_name] = val[:, j]
78 | return dataframe
79 | elif self.enc_type == "ohe":
80 | return self.ohe.transform(dataframe[self.cat_feats].values)
81 | else:
82 | raise Exception("Encoding Type not understood")
83 |
84 |
85 | # if __name__ == "__main__":
86 | # import pandas as pd
87 | # import config
88 | # from sklearn import linear_model
89 | # DATA_PATH = config.DATA_PATH
90 | # TRAINING_DATA = DATA_PATH + r'\train_cat.csv'
91 | # TEST_DATA = DATA_PATH + r'\test_cat.csv'
92 | # df = pd.read_csv(TRAINING_DATA)#.head(50)
93 | # df_test = pd.read_csv(TEST_DATA)#.head(50)
94 | # submission = pd.read_csv(r'C:\Users\abhis\Documents\01_proj\input_data\submission.csv')
95 | # train_len = len(df)
96 |
97 | # df_test['target'] = -1
98 |
99 | # full_data = pd.concat([df, df_test])
100 |
101 | # cols = [c for c in df.columns if c not in ['id', 'target']]
102 | # print(cols)
103 | # cat_feats = CategoricalFeatures(full_data, categorical_features=cols, encoding_type='ohe', handle_na=True)
104 |
105 | # full_data_transformed = cat_feats.fit_transform()
106 | # # test_transformed = cat_feats.transform(df_test)
107 |
108 | # train_transformed = full_data_transformed[:train_len, :]
109 | # test_transformed = full_data_transformed[train_len:, :]
110 |
111 | # print(train_transformed.shape)
112 | # print(test_transformed.shape)
113 |
114 | # clf = linear_model.LogisticRegression()
115 | # clf.fit(train_transformed, df.target.values)
116 | # pred = clf.predict_proba(test_transformed)[:,1]
117 |
118 | # submission.loc[:, 'target'] = pred
119 | # submission.to_csv(r'C:\Users\abhis\Documents\01_proj\input_data\submission.csv', index = False)
120 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Hackathon Framework
2 |
3 | The objective of this project is to enable quick experimentation in Data Analytics projects with minimal cookie-cutter programming.
4 | Getting rid of all the fit_transforms!
5 |
6 | ---
7 | ***NOTE***
8 |
9 | - This is a work in progress. The underlying modules are still under development.
10 | - As this project matures there will be changes in scripts such as `train.py` and `predict.py`
11 | - TODO
12 | * Create modules for `tuning`, `stacking`
13 | * Removal of some of the modules that are redundant
14 |
15 |
16 | ---
17 |
18 | The framework is designed to make the Data Science flow easier to perform by encapsulating the different techniques for each step within one method.
19 | There are classes for each of the below listed steps:
20 |
21 | - Feature Evaluation
22 | * Report to give an intuition of the dataset
23 |
24 | - Feature Engineering
25 | * Modules to perform feature transformations on categorical and numerical datasets.
26 | * The various applicable techniques are encoded within these modules and accessed via an argument.
27 |
28 | - Feature Generation
29 | * Module to create new features based on different techniques
30 |
31 | - Cross Validation
32 | * Stratified Folding both for Regression and Classification
33 |
34 | - Training
35 | * Run multiple models using one class.
36 | * Evaluate and save the results in an organized manner
37 |
38 | - Tuning
39 | * Hyper-parameter tuning of multiple models, based on json arguments for parameter values.
40 |
41 | - Prediction
42 |
43 | - Evaluating the model
44 |
45 |
46 |
47 |
48 | ## Steps to use the framework
49 |
50 |
51 |
52 | 1. Clone the repo.
53 | 2. Create three folders: `input`, `models` and `tune`.
54 | 3. Save the training, testing and sample submission files in the `input` folder.
55 | 4. The outputs generated from training, such as trained models, encoders and OOF predictions, will be saved in the `models` folder.
56 | 5. The parameters for fine tuning the models should be saved in the `tune` folder.
57 | 6. Update the `config.py` to point it to the correct path for data, model and tuning.
58 | 7. Update the `dispatcher.py` with model/models you want to run your dataset on.
59 | 8. Use the sample notebook to understand how to use this framework after this initial configuration is completed.
60 |
61 |
62 |
63 |
64 | ## Description of Files and their Purpose
65 |
66 | - `config.py`: Config file giving the paths of all the datasets and other standard configuration items, such as CSV file paths, random seed, etc.
67 |
68 | - `feature_eval.py`: This script and the class inside is used to analyze the dataframe and its columns to get the following output:
69 | - min, max and unique values of each column
70 | - histogram/ distribution of each column
71 | - correlation of columns using a heat map
72 |
73 | - `feature_gen.py`: Encapsulates methods to generate new features. Currently implements the `PolynomialFeatures` method from sklearn.
74 | Returns a dataframe with the new features.
75 |
76 | - `feature_impute.py`: Encapsulates the method to impute blank values in a dataframe.
77 | Currently, it supports 3 imputation methods:
78 | - Simple Imputer
79 | - Model-based imputer: Linear Regression, Extra Trees or KNN
80 | - Knn based imputer
81 | - Returns updated Dataframe
82 |
83 | - `cross_validation.py`: This class is used to perform cross validation on any dataframe based on the type of problem statement. It is used to create cross validated dataset.
84 |
85 | - `categorical.py`: This class can be used for encoding of categorical features in a given dataframe.
86 | - Inputs : Dataframe, Categorical Columns List, Type of Encoding
87 | - Output: Encoded Dataframe
88 | - Supported Encoding Techniques:
89 | - Label Encoding
90 | - Binary Encoding
91 | - One Hot Encoding
92 |
93 | - `numerical.py`: This class can be used for transforming numerical features in a given dataframe.
94 | - Inputs : Dataframe, Numerical Columns List, Type of Encoding
95 | - Output: Encoded Dataframe, Transformer Object for later use.
96 | - Supported Techniques:
97 | - Standard Scaler
98 | - Min-Max Scaler
99 | - Power Transformer
100 | - Log Transformer
101 |
102 | - `metrics.py`: This class can be used to evaluate the results of given predictions and actual value.
103 |
104 | - `dispatcher.py`: Python file with models and their parameters, designed to supply models to `engine.py` for training on a given dataset.
105 |
106 | - `engine.py`: This script encapsulates the methods to train and evaluate multiple models in a single run
107 | - Leverages `dispatcher.py` and `metrics.py` for models and metrics
108 | - The results for each fold are also saved in the `models` folder as `oof_predictions.csv` for each model.
109 | - **To Do** Stacking module to support stacking of multiple models
110 |
111 | - **Scripts to be ignored for now**:
112 | - `train.py`: For training
113 | - `predict.py`: For prediction
114 | - `tune.py`: For hyper-parameter tuning
115 | - `create_folds.py`: To create a folded dataframe
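116 |
117 | ## Minimal usage sketch
118 |
119 | A rough end-to-end flow (a sketch under assumptions, not the definitive API: the file and column names below are placeholders, adapt them to your data):
120 |
121 | ```python
122 | import pandas as pd
123 | from src.cross_validation import CrossValidation
124 | from src.engine import Engine
125 |
126 | # 1. create stratified folds on the raw data
127 | df = pd.read_csv("input/train.csv")
128 | cv = CrossValidation(
129 |     df,
130 |     target_cols=["target"],
131 |     problem_type="single_col_regression",
132 |     stratified_regression=True,
133 | )
134 | df_folds = cv.split()
135 |
136 | # 2. train every regression model from dispatcher.py, saving models and OOF predictions
137 | engine = Engine(
138 |     dataframe=df_folds,
139 |     id_col="id",
140 |     target_col="target",
141 |     folds=5,
142 |     unused_col=["id", "target", "kfold"],
143 |     save_model=True,
144 | )
145 | engine.train_models()
146 |
147 | # 3. evaluate the out-of-fold predictions
148 | print(engine.evaluate(model_list=["lr"], metric="rmse"))
149 | ```
150 |
151 | The encoder classes follow the same pattern (assuming `cut` is a categorical column and `price` a numerical one):
152 |
153 | ```python
154 | from src.categorical import CategoricalFeatures
155 | from src.numerical import NumericalFeatures
156 |
157 | cat = CategoricalFeatures(df, categorical_features=["cut"], encoding_type="label", handle_na=True)
158 | df = cat.fit_transform()
159 |
160 | num = NumericalFeatures(df, numerical_features=["price"], encoding_type="log")
161 | df, transformers = num.fit_transform()
162 | ```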
--------------------------------------------------------------------------------
/src/cross_validation.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn import model_selection
3 |
4 |
5 | """
6 |
7 | Categories to tackle
8 | - binary classification
9 | - multi class classification
10 | - multi label classification
11 | - single column regression
12 | - multi column regression
13 | - hold out
14 |
15 | """
16 |
17 |
18 | class CrossValidation:
19 | def __init__(
20 | self,
21 | df,
22 | target_cols,
23 | shuffle=False,
24 | problem_type="binary_classification",
25 | stratified_regression=False,
26 | multilabel_delimiter=",",
27 | n_folds=5,
28 | random_state=42,
29 | ):
30 | self.dataframe = df
31 | self.target_cols = target_cols
32 | self.num_targets = len(target_cols)
33 | self.shuffle = shuffle
34 | self.problem_type = problem_type
35 | self.stratified_regression = stratified_regression
36 | self.num_folds = n_folds
37 | self.random_state = random_state
38 | self.multilabel_delimiter = multilabel_delimiter
39 |
40 | if self.shuffle is True:
41 | self.dataframe = self.dataframe.sample(frac=1).reset_index(drop=True)
42 |
43 | self.dataframe["kfold"] = -1
44 |
45 |     @staticmethod
46 |     def _sort_partition(y, num_folds):
47 |         n = len(y)  # bin the sorted target so each run of num_folds values is one stratum
48 |         cats = np.empty(n, dtype="u4")
49 |         div, mod = divmod(n, num_folds)
50 |         cats[: n - mod] = np.repeat(range(div), num_folds)  # full bins of size num_folds
51 |         cats[n - mod :] = div + 1  # leftover samples form their own small bin
52 |         return cats[np.argsort(np.argsort(y))]  # double argsort = ranks; map bins back to original order
53 |
54 | def split(self):
55 | if self.problem_type in ("binary_classification", "multiclass_classification"):
56 | if self.num_targets != 1:
57 | raise Exception(
58 | "Invalid number of target for this type of Problem statement"
59 | )
60 | target = self.target_cols[0]
61 |             unique_values = self.dataframe[target].nunique()
62 |             if unique_values == 1:
63 |                 raise Exception("Only one class present in the data, No ML is needed")
64 |             elif unique_values > 1:
65 | kf = model_selection.StratifiedKFold(
66 | n_splits=self.num_folds, shuffle=False
67 | )
68 | for fold, (train_idx, val_idx) in enumerate(
69 | kf.split(X=self.dataframe, y=self.dataframe[target].values)
70 | ):
71 | self.dataframe.loc[val_idx, "kfold"] = fold
72 |
73 |         # the stratified branch must come before the plain KFold branch,
74 |         # otherwise it is unreachable for "single_col_regression"
75 |         elif self.problem_type == "single_col_regression" and self.stratified_regression:
76 |             if self.num_targets != 1:
77 |                 raise Exception("Invalid number of targets for this problem type")
78 |             target = self.target_cols[0]
79 |             y = self.dataframe[target].values
80 |             y_categorized = self._sort_partition(y, self.num_folds)
81 |             kf = model_selection.StratifiedKFold(n_splits=self.num_folds, shuffle=False)
82 |             for fold, (train_idx, val_idx) in enumerate(
83 |                 kf.split(X=self.dataframe, y=y_categorized)
84 |             ):
85 |                 self.dataframe.loc[val_idx, "kfold"] = fold
86 |
87 |         elif self.problem_type in ("single_col_regression", "multi_col_regression"):
88 |             if self.num_targets != 1 and self.problem_type == "single_col_regression":
89 |                 raise Exception("Invalid number of targets for this type of problem")
90 |             if self.num_targets < 2 and self.problem_type == "multi_col_regression":
91 |                 raise Exception("Invalid number of targets for this type of problem")
92 |             kf = model_selection.KFold(n_splits=self.num_folds)
93 |             for fold, (train_idx, val_idx) in enumerate(kf.split(X=self.dataframe)):
94 |                 self.dataframe.loc[val_idx, "kfold"] = fold
95 |
96 | elif self.problem_type.startswith("holdout_"):
97 | holdout_percentage = int(self.problem_type.split("_")[1])
98 | num_holdout_samples = int(len(self.dataframe) * holdout_percentage / 100)
99 | self.dataframe.loc[: len(self.dataframe) - num_holdout_samples, "kfold"] = 0
100 | self.dataframe.loc[len(self.dataframe) - num_holdout_samples :, "kfold"] = 1
101 |
102 | elif self.problem_type == "multilabel_classification":
103 | if self.num_targets != 1:
104 | raise Exception("Invalid number of targets for this problem type")
105 | targets = self.dataframe[self.target_cols[0]].apply(
106 | lambda x: len(str(x).split(self.multilabel_delimiter))
107 | )
108 | kf = model_selection.StratifiedKFold(n_splits=self.num_folds)
109 | for fold, (train_idx, val_idx) in enumerate(
110 | kf.split(X=self.dataframe, y=targets)
111 | ):
112 | self.dataframe.loc[val_idx, "kfold"] = fold
113 |
114 | else:
115 | raise Exception("Problem type not understood!")
116 |
117 | return self.dataframe
118 |
119 |
120 | # if __name__ == "__main__":
121 | # import config
122 | # import pandas as pd
123 | # REG_DATA = config.RAW_DATA
124 | # df = pd.read_csv(REG_DATA)
125 | # cross_val = CrossValidation(df=df, target_cols=['price'], problem_type='single_col_regression', stratified_regression = True)
126 | # df_folds = cross_val.split()
127 | # print(df_folds.head())
128 |
--------------------------------------------------------------------------------
/src/engine.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from src import dispatcher
3 | import joblib
4 | from pathlib import Path
5 | from src.metrics import RegressionMetric, ClassificationMetric
6 |
7 |
8 | class Engine:
9 | def __init__(
10 | self,
11 | dataframe,
12 | id_col: str,
13 | target_col: str,
14 | folds: int,
15 | unused_col: list = None,
16 | problem_type: str = "regression",
17 | model_list: list = None,
18 | save_model: bool = False,
19 | ):
20 | """
21 |
22 | :param dataframe: Dataframe for the folded dataset to be used for training/tuning
23 |         :param id_col: String for the column name that is used for identifying the rows in the dataframe
24 |         :param target_col: String of the column name that is to be used for prediction
25 |         :param folds: Number of folds in the dataframe
26 |         :param unused_col: List of columns that are not to be considered for training
27 |         :param model_list: List of models that are to be run for comparison, tuning or training.
28 |         :param problem_type: This will define the type of models to pick. Possible values: regression/classification
29 |         :param save_model: True or False to specify if you want to save the model file
30 | """
31 | self.df = dataframe
32 | self.id = id_col
33 | self.target = target_col
34 | self.folds = folds
35 | self.problem = problem_type
36 | self.models = model_list
37 | self.save_model = save_model
38 | self.model_dict = dict()
39 | self.result_dict = dict()
40 |
41 | if unused_col is None:
42 |             self.unused_col = self.target.split()  # wrap the target name in a list
43 | else:
44 | self.unused_col = unused_col
45 |
46 | if self.problem == "regression":
47 | self.models_used = dispatcher.REGRESSION_MODELS
48 | else:
49 | self.models_used = dispatcher.CLASSIFICATION_MODELS
50 |
51 | self.output_path = Path("./models/")
52 |
53 | @staticmethod
54 | def _generate_mapping(folds):
55 | fold_dict = dict()
56 | for i in range(folds):
57 | fold_dict[i] = [x for x in range(folds) if x != i]
58 | return fold_dict
59 |
60 |     # persist the fitted model for a given fold under self.output_path
61 |     def _save_model(self, model: str, fold: int, clf):
62 | joblib.dump(clf, f"{self.output_path}/{str(model)}__{str(fold)}__.pkl")
63 | return
64 |
65 |     # write each model's out-of-fold predictions to a CSV in self.output_path
66 |     def _save_result(self):
67 | for model_result in self.result_dict.keys():
68 | result_df = pd.DataFrame(self.result_dict[model_result])
69 | result_df.to_csv(
70 | f"{self.output_path}/{str(model_result)}__oof_predictions.csv",
71 | index=False,
72 | )
73 | return
74 |
75 |     # run k-fold training for one model and collect its out-of-fold predictions
76 |     def _train_model(self, model, fold_dict: dict):
77 | idx = list()
78 | actuals = list()
79 | predictions = list()
80 | fold_list = list()
81 | for fold in range(self.folds):
82 | train_df = self.df[self.df.kfold.isin(fold_dict.get(fold))]
83 | valid_df = self.df[self.df.kfold == fold]
84 | ytrain = train_df[self.target].values
85 | yvalid = valid_df[self.target].values
86 | idx.extend(valid_df[self.id].values.tolist())
87 | train_df = train_df.drop(self.unused_col, axis=1)
88 | valid_df = valid_df.drop(self.unused_col, axis=1)
89 | valid_df = valid_df[train_df.columns]
90 | clf = self.models_used[model]
91 | clf.fit(train_df, ytrain)
92 | preds = clf.predict(valid_df)
93 | predictions.extend(preds.tolist())
94 | actuals.extend(yvalid.tolist())
95 |             fold_list.extend([fold] * len(yvalid))
96 | if self.save_model:
97 |                 self._save_model(model, fold, clf)
98 | result = {
99 | self.id: idx,
100 | "Predictions": predictions,
101 | "Actuals": actuals,
102 | "Fold": fold_list,
103 | }
104 | return result
105 |
106 | def train_models(self):
107 | fold_dict = self._generate_mapping(self.folds)
108 | if self.models is None:
109 | self.models = list(self.models_used.keys())
110 | for model in self.models:
111 | if self.save_model:
112 | print(
113 | f"Training model: {str(model)}, and saving the model and results at: {str(self.output_path)}"
114 | )
115 | else:
116 | print(
117 | f"Training model: {str(model)}, and saving only the results at: {str(self.output_path)}"
118 | )
119 |             self.result_dict[model] = self._train_model(model, fold_dict)
120 |         print("Saving the results of the trained models")
121 |         self._save_result()
122 | return
123 |
124 | def stack(self):
125 | # TODO Implement stacking in Engine. IP: model_preds, meta_model | OP: Pred, result
126 | return
127 |
128 | def evaluate(
129 | self, model_list: list, metric: str = None, target_transformer: dict = None
130 | ):
131 | """
132 |
133 | :param model_list: List of models which needs to be evaluated
134 |         :param metric: Metric that is to be calculated for the models
135 | :param target_transformer: If any transformer is used on the target, pass that transformer object here.
136 |         :return: dict mapping each model name to its computed metric value
137 |         """
138 |         results = dict()
139 |         for model in model_list:
140 |             print("Evaluating the result for model {0}".format(str(model)))
141 |             result_df = pd.read_csv(
142 |                 f"{self.output_path}/{str(model)}__oof_predictions.csv"
143 |             )
144 |             if self.problem == "regression":
145 |                 metric_type = RegressionMetric()
146 |             else:
147 |                 metric_type = ClassificationMetric()
148 |             if target_transformer is None:
149 |                 results[model] = metric_type(
150 |                     metric,
151 |                     result_df["Actuals"].values.reshape(-1, 1),
152 |                     result_df["Predictions"].values.reshape(-1, 1),
153 |                 )
154 |             else:
155 |                 results[model] = metric_type(
156 |                     metric,
157 |                     target_transformer[self.target].inverse_transform(
158 |                         result_df["Actuals"].values.reshape(-1, 1)
159 |                     ),
160 |                     target_transformer[self.target].inverse_transform(
161 |                         result_df["Predictions"].values.reshape(-1, 1)
162 |                     ),
163 |                 )
164 |         return results
165 |
166 |
167 | # if __name__ == "__main__":
168 | # import pandas as pd
169 | #
170 | # df = pd.read_csv(r'C:\Users\abhis\Documents\01_proj\kaggle_comp\sample_ihsm\input\train_folds.csv')
171 | # engine = Engine(
172 | # dataframe=df,
173 | # id_col='vehicle_id',
174 | # target_col='Price_USD',
175 | # folds=5,
176 | # save_model=True,
177 | # model_list=['lr']
178 | # )
179 | # engine.train_models()
180 | # my_result = engine.evaluate(model_list=['lr'], metric='mape')
181 | # print(my_result)
182 |
--------------------------------------------------------------------------------