├── NOTICE
├── CODE_OF_CONDUCT.md
├── model
│   ├── linear_regressor.py
│   ├── searcher.py
│   ├── extra_trees.py
│   ├── random_forest.py
│   ├── light_gbm.py
│   ├── neural_network.py
│   └── neural_aggregator.py
├── README.md
├── CONTRIBUTING.md
├── data
│   ├── process_data.py
│   ├── download_openml.py
│   └── data_loader.py
├── merge_nested_data.py
├── util
│   ├── misc.py
│   └── metric.py
├── LICENSE
├── nested_aggr_quantile_models.py
└── nested_base_quantile_models.py
/NOTICE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /model/linear_regressor.py: -------------------------------------------------------------------------------- 1 | import statsmodels.api as sm 2 | from model.searcher import QuantileSearcher 3 | from sklearn.base import BaseEstimator, RegressorMixin 4 | 5 | 6 | class SMQRmodel(BaseEstimator, RegressorMixin): 7 | def __init__(self, 8 | quantile, 9 | fit_intercept=True): 10 | self.quantile = quantile 11 | self.fit_intercept = fit_intercept 12 | 13 | def fit(self, X, y): 14 | if self.fit_intercept: 15 | X = sm.add_constant(X) 16 | self.model_ = sm.QuantReg(y, X) 17 | self.results_ = self.model_.fit(q=self.quantile, max_iter=10000000) 18 | 19 | def predict(self, X): 20 | if self.fit_intercept: 21 | X = sm.add_constant(X, has_constant='add') 22 | return self.results_.predict(X) 23 | 24 | 25 | class QuantileRegressor(QuantileSearcher): 26 | def __init__(self, 27 | quantile=0.5, 28 | **kwargs): 29 | self.searcher = SMQRmodel(quantile=quantile) 30 | self.quantile = quantile 31 | 32 | def fit(self, x_train, y_train): 33 | self.searcher.fit(x_train, y_train.reshape(-1)) 34 | 35 | def predict(self, x_data, quantile=None): 36 | return self.searcher.predict(x_data).reshape(-1, 1) 37 | 38 | def get_init_model(self): 39 | return SMQRmodel(quantile=self.quantile) 40 | 41 | -------------------------------------------------------------------------------- /model/searcher.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class QuantileSearcher: 5 | quantile = 0.5 6 | searcher = None 7 | base_model = None 8 | 9 | def fit(self, x_train, y_train): 10 | self.searcher.fit(x_train, y_train.reshape(-1)) 11 | 12 | def predict(self, x_data, quantile=None): 13 | if quantile is None: 14 | quantile = self.quantile 15 | return self.searcher.best_estimator_.predict(x_data, int(quantile * 100)).reshape(-1, 1) 16 | 17 | def eval_loss(self, x_data, y_data, quantile=None): 18 | if quantile is None: 19 | quantile = self.quantile 20 | error_data = y_data - self.predict(x_data, quantile) 21 | loss_data = np.maximum(quantile * error_data, (quantile - 1) * error_data) 22 | return loss_data.mean() 23 | 24 | def get_init_model(self): 25 | return self.base_model(**self.searcher.best_params_) 26 | 27 | 28 | class MeanSearcher: 29 | searcher = None 30 | base_model = None 31 | 32 | def fit(self,
x_train, y_train): 33 | self.searcher.fit(x_train, y_train.reshape(-1)) 34 | 35 | def predict(self, x_data): 36 | return self.searcher.best_estimator_.predict(x_data).reshape(-1, 1) 37 | 38 | def eval_loss(self, x_data, y_data): 39 | error_data = y_data - self.predict(x_data) 40 | loss_data = error_data * error_data 41 | loss_data = np.sqrt(loss_data) 42 | return loss_data.mean() 43 | 44 | def get_init_model(self): 45 | return self.base_model(**self.searcher.best_params_) 46 | 47 | -------------------------------------------------------------------------------- /model/extra_trees.py: -------------------------------------------------------------------------------- 1 | from model.searcher import QuantileSearcher, MeanSearcher 2 | from model.forests import ExtraTreesQuantileRegressor 3 | from sklearn.ensemble import ExtraTreesRegressor 4 | from sklearn.model_selection import RandomizedSearchCV, GridSearchCV 5 | 6 | TREE_PARAM_GRID = {'n_estimators': [50], 7 | 'min_samples_split': [8, 16, 64], 8 | 'min_samples_leaf': [8, 16, 64]} 9 | 10 | TREE_NUM_GRIDS = 1 11 | for param in TREE_PARAM_GRID.values(): 12 | TREE_NUM_GRIDS *= len(param) 13 | 14 | 15 | class QuantileExtraTrees(QuantileSearcher): 16 | def __init__(self, 17 | num_iters, 18 | num_folds, 19 | quantile=0.5, 20 | num_jobs=-1, 21 | rand_seed=111): 22 | self.base_model = ExtraTreesQuantileRegressor 23 | if TREE_NUM_GRIDS > num_iters: 24 | self.searcher = RandomizedSearchCV(estimator=self.base_model(n_jobs=-1), 25 | param_distributions=TREE_PARAM_GRID, 26 | n_iter=num_iters, 27 | cv=num_folds, 28 | random_state=rand_seed, 29 | n_jobs=num_jobs) 30 | else: 31 | self.searcher = GridSearchCV(estimator=self.base_model(n_jobs=-1), 32 | param_grid=TREE_PARAM_GRID, 33 | cv=num_folds, 34 | n_jobs=num_jobs) 35 | self.quantile = quantile 36 | 37 | def full_predict(self, x_data, quantile_list): 38 | return self.searcher.best_estimator_.predict(x_data, quantile_list) 39 | 40 | def get_init_model(self): 41 | return self.base_model(**self.searcher.best_params_, n_jobs=-1) 42 | 43 | -------------------------------------------------------------------------------- /model/random_forest.py: -------------------------------------------------------------------------------- 1 | from model.searcher import QuantileSearcher, MeanSearcher 2 | from model.forests import RandomForestQuantileRegressor 3 | from sklearn.ensemble import RandomForestRegressor 4 | from sklearn.model_selection import RandomizedSearchCV, GridSearchCV 5 | 6 | TREE_PARAM_GRID = {'n_estimators': [50], 7 | 'min_samples_split': [8, 16, 64], 8 | 'min_samples_leaf': [8, 16, 64]} 9 | 10 | TREE_NUM_GRIDS = 1 11 | for param in TREE_PARAM_GRID.values(): 12 | TREE_NUM_GRIDS *= len(param) 13 | 14 | 15 | class QuantileRandomForest(QuantileSearcher): 16 | def __init__(self, 17 | num_iters, 18 | num_folds, 19 | quantile=0.5, 20 | num_jobs=-1, 21 | rand_seed=111): 22 | self.base_model = RandomForestQuantileRegressor 23 | if TREE_NUM_GRIDS > num_iters: 24 | self.searcher = RandomizedSearchCV(estimator=self.base_model(n_jobs=-1), 25 | param_distributions=TREE_PARAM_GRID, 26 | n_iter=num_iters, 27 | cv=num_folds, 28 | random_state=rand_seed, 29 | n_jobs=num_jobs) 30 | else: 31 | self.searcher = GridSearchCV(estimator=self.base_model(n_jobs=-1), 32 | param_grid=TREE_PARAM_GRID, 33 | cv=num_folds, 34 | n_jobs=num_jobs) 35 | self.quantile = quantile 36 | 37 | def full_predict(self, x_data, quantile_list): 38 | return self.searcher.best_estimator_.predict(x_data, quantile_list) 39 | 40 | def get_init_model(self): 41 | 
return self.base_model(**self.searcher.best_params_, n_jobs=-1) 42 | 43 | -------------------------------------------------------------------------------- /model/light_gbm.py: -------------------------------------------------------------------------------- 1 | from model.searcher import QuantileSearcher, MeanSearcher 2 | from lightgbm import LGBMRegressor 3 | from sklearn.model_selection import RandomizedSearchCV, GridSearchCV 4 | 5 | GBM_PARAM_GRID = {'n_estimators': [50], 6 | 'num_leaves': [10, 50, 100], 7 | 'min_child_samples': [3, 9, 15], 8 | 'min_child_weight': [1e-2, 1e-1, 1], 9 | 'subsample': [0.4, 0.6, 0.8], 10 | 'colsample_bytree': [0.4, 0.6], 11 | 'reg_alpha': [1e-1, 1, 5], 12 | 'reg_lambda': [1e-1, 1, 5]} 13 | GBM_NUM_GRIDS = 1 14 | for param in GBM_PARAM_GRID.values(): 15 | GBM_NUM_GRIDS *= len(param) 16 | 17 | 18 | class QuantileLightGBM(QuantileSearcher): 19 | def __init__(self, 20 | num_iters, 21 | num_folds, 22 | quantile=0.5, 23 | num_jobs=-1, 24 | rand_seed=111): 25 | self.base_model = LGBMRegressor 26 | if GBM_NUM_GRIDS > num_iters: 27 | self.searcher = RandomizedSearchCV(estimator=self.base_model(objective='quantile', metric='quantile', 28 | alpha=quantile, n_jobs=-1), 29 | param_distributions=GBM_PARAM_GRID, 30 | n_iter=num_iters, 31 | cv=num_folds, 32 | random_state=rand_seed, 33 | n_jobs=num_jobs) 34 | else: 35 | self.searcher = GridSearchCV(estimator=self.base_model(objective='quantile', metric='quantile', 36 | alpha=quantile, n_jobs=-1), 37 | param_grid=GBM_PARAM_GRID, 38 | cv=num_folds, 39 | n_jobs=num_jobs) 40 | self.quantile = quantile 41 | 42 | def predict(self, x_data, quantile=None): 43 | return self.searcher.predict(x_data).reshape(-1, 1) 44 | 45 | def get_init_model(self): 46 | return self.base_model(max_depth=-1, objective='quantile', metric='quantile', 47 | alpha=self.quantile, n_jobs=-1, **self.searcher.best_params_) 48 | 49 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Flexible Model Aggregation for Quantile Regression 2 | ============================================= 3 | Quantile regression is a fundamental problem in statistical learning motivated by the need to quantify uncertainty in predictions, or to model a diverse population without being overly reductive. For instance, epidemiological forecasts, cost estimates, and revenue predictions all benefit from being able to quantify the range of possible values accurately. As such, many models have been developed for this problem over many years of research in econometrics, statistics, and machine learning. 4 | 5 | Rather than proposing yet another (new) algorithm for quantile regression, we adopt a meta viewpoint: we investigate methods for aggregating any number of conditional quantile models, in order to improve accuracy and robustness. We consider weighted ensembles where weights may vary not only over individual 6 | models, but also over quantile levels and feature values. All of the models we consider in this paper can be fit using modern deep learning toolkits, and hence are widely accessible (from an implementation point of view) and scalable. 7 | 8 | To improve the accuracy of the predicted quantiles (or equivalently, prediction intervals), we develop tools for ensuring that quantiles remain monotonically ordered, and apply conformal calibration methods. These can be used without any modification of the original library of base models.
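As a rough illustration of the quantile (pinball) loss these models optimize and of what "monotonically ordered" means in practice, here is a minimal NumPy sketch; it is not the repository's exact code (the real implementations are `pinball_loss_np` in `util/metric.py` and the `fix_crossing`/`isotonic`/soft-sorting utilities in `util/misc.py`), and the names below are chosen only for this example:

```
import numpy as np

def pinball_loss(pred, target, quantiles):
    # pred: (n, q) predicted quantiles, target: (n,) or (n, 1), quantiles: (q,) levels in (0, 1)
    err = target.reshape(-1, 1) - pred
    return np.maximum(quantiles * err, (quantiles - 1) * err).mean()

def sort_quantiles(pred):
    # simple post-hoc non-crossing fix: re-sort each row so the predicted quantiles
    # are monotonically non-decreasing in the quantile level
    return np.sort(pred, axis=-1)
```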
We also review some basic theory surrounding quantile aggregation and related scoring rules, and contribute a few new results to this literature (for example, the fact that post sorting or post isotonic regression can only improve the weighted interval score). Finally, we provide an extensive suite of empirical comparisons across 34 data sets from two different benchmark repositories. 9 | 10 | This repository provides the implementation of [Flexible Model Aggregation for Quantile Regression](https://arxiv.org/abs/2103.00083). If you use this code, please cite the paper using the following BibTeX: 11 | 12 | ``` 13 | @article{fakoor2022quantile, 14 | title={Flexible Model Aggregation for Quantile Regression}, 15 | author={Rasool Fakoor and Taesup Kim and Jonas Mueller and Alexander J. Smola and Ryan J. Tibshirani}, 16 | journal={arXiv preprint arXiv:2103.00083}, 17 | year={2021}, 18 | } 19 | 20 | 21 | ``` 22 | ## Getting Started 23 | ``` 24 | Run the following commands in the specified order: 25 | 26 | 1) python -u nested_base_quantile_models.py --DATA_PATH ~/mydata/ --data_loc ~/rawdata/ --task-id yacht --seed 1 27 | 28 | 2) python -u merge_nested_data.py --DATA_PATH ~/mydata/ --task-id yacht --seed 1 29 | 30 | 3) python -u nested_aggr_quantile_models.py --DATA_PATH ~/mydata/ --task-id yacht --seed 1 --RESULT_PATH ~/myresult/ 31 | 32 | ``` 33 | The code works on both GPU and CPU machines. 34 | 35 | In order to run this code, you will need to install pytorch, lightgbm, numpy, openml, scikit_learn, scipy, autogluon, statsmodels, etc. 36 | 37 | ## License 38 | This project is licensed under the Apache-2.0 License. 39 | 40 | ## Contact 41 | 42 | Please open an issue on the [issues tracker](https://github.com/amazon-research/quantile-aggregation) to report problems or ask questions, or send an email to [Rasool Fakoor](https://github.com/rasoolfa). 43 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3.
You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute to. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /data/process_data.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import pandas as pd 3 | import numpy as np 4 | from autogluon import TabularPrediction as task 5 | 6 | 7 | def processData(data, label_column=None, ag_predictor=None, 8 | problem_type=None, eval_metric=None): 9 | """ Converts a pandas DataFrame to a matrix of entirely numerical values (stored in a DataFrame). 10 | Performs the same data preprocessing as used for AutoGluon's tabular neural network model, 11 | to deal with issues such as: missing value imputation, one-hot encoding of categoricals, 12 | handling of high-cardinality categoricals, handling unknown categorical feature-levels at test-time, etc. 13 | 14 | If ag_predictor is not None, uses the existing autogluon predictor object to process the data (it must have tabularNN as its first model). 15 | To process training data, ag_predictor should be None; for test data, it should not be None. 16 | Returns: 17 | Tuple (X, y, ag_predictor) 18 | where y may be None if labels are not present in test data. 19 | """ 20 | 21 | # fit dummy neural network model just to preprocess data. Here we ensure no embedding layers are used.
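# The 0-epoch fit below ('num_epochs': 0) never actually trains the network; the resulting predictor is
# kept only so that AutoGluon's feature-preprocessing pipeline can be reused on the test data later.
# 'proc.embed_min_categories': np.inf forces one-hot encoding for every categorical feature, so no
# embedding layers are created and the processed features stay a plain numeric matrix.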
22 | if ag_predictor is None: 23 | if label_column is None: 24 | raise ValueError("when processing training data, label_column cannot be None") 25 | elif not label_column in data.columns: 26 | raise ValueError("label_column cannot be missing from training data") 27 | ag_predictor = task.fit(train_data=task.Dataset(data), tuning_data=task.Dataset(data), label=label_column, 28 | hyperparameter_tune=False, problem_type=problem_type, eval_metric=eval_metric, 29 | hyperparameters={'NN': {'num_epochs': 0, 'proc.embed_min_categories': np.inf}}, 30 | num_bagging_folds=0, stack_ensemble_levels=0, label_count_threshold=1, verbosity=2) 31 | 32 | model = ag_predictor._trainer.load_model(ag_predictor._trainer.get_model_names_all()[ 33 | 0]) # This must be the neural net model which contains data processor 34 | if 'NeuralNet' not in model.name: 35 | raise ValueError("Data preprocessing error. This model should be the NeuralNet, not the: %s" % model.name) 36 | bad_inds = [] # row-indices to remove from dataset 37 | if label_column is not None and label_column in data.columns: 38 | label_cleaner = ag_predictor._learner.label_cleaner 39 | y = data[label_column].values 40 | data = data.drop([label_column], axis=1, inplace=False) 41 | y = label_cleaner.transform(y) 42 | if np.sum(y.isna()) > 0: 43 | bad_inds = y.index[ 44 | y.apply(np.isnan)].tolist() # remove these inds as label is NaN (due to very rare classes) 45 | warnings.warn("Dropped these rows from data in preprocessing, due to missing labels: " + str(bad_inds)) 46 | else: 47 | y = None 48 | data_initial_processed = ag_predictor._learner.transform_features(data) # general autogluon data processing. 49 | tabNN_data = model.process_test_data(data_initial_processed, batch_size=64, num_dataloading_workers=4) 50 | # neural net-specific autogluon data processing required to turn tabular data into numerical matrix. 
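# The processed features are recovered from the neural-net model's test-data loader: exactly one
# mxnet NDArray is expected, converted to a 2D numpy array, and wrapped in a DataFrame with generic
# 'feature0', 'feature1', ... column names; rows whose labels were dropped above are removed here too.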
51 | numeric_data = tabNN_data.dataset._data # list of mxnet.NDArrays 52 | if len(numeric_data) != 1: 53 | raise ValueError("Data Preprocessing failed.") 54 | numpy_data = numeric_data[0].asnumpy() # 2D Numpy array 55 | X = pd.DataFrame(numpy_data) 56 | X.columns = ['feature' + str(i) for i in range(X.shape[1])] 57 | if len(bad_inds) > 0: 58 | y.drop(index=bad_inds, inplace=True) 59 | X.drop(index=bad_inds, axis=0, inplace=True) 60 | return X, y, ag_predictor 61 | -------------------------------------------------------------------------------- /data/download_openml.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle as pkl 3 | import openml 4 | import collections 5 | import pandas as pd 6 | from sklearn.model_selection import train_test_split 7 | from sklearn.preprocessing import StandardScaler 8 | from argparse import Namespace 9 | from process_data import processData 10 | import os.path 11 | import argparse 12 | 13 | OPENML_DICT = \ 14 | { 15 | '359934': 'tecator', 16 | '359931': 'sensory', 17 | '359947': 'MIP-2016-regression', 18 | '359932': 'socmob', 19 | '167210': 'Moneyball', 20 | '359951': 'house_prices_nominal', 21 | '359945': 'us_crime', 22 | '359930': 'quake', 23 | '359933': 'space_ga', 24 | '359944': 'abalone', 25 | '233215': 'Mercedes_Benz_Greener_Manufacturing', 26 | '359948': 'SAT11-HAND-runtime-regression', 27 | '233214': 'Santander_transaction_value', 28 | '13854': 'QSAR-TID-11', 29 | '14097': 'QSAR-TID-10980', 30 | '359935': 'wine_quality', 31 | '359942': 'colleges', 32 | '359939': 'topo_2_1', 33 | '359940': 'yprop_4_1', 34 | '317612': 'Brazilian_houses', 35 | '359946': 'pol', 36 | '359936': 'elevators', 37 | '359949': 'house_sales', 38 | '359952': 'house_16H', 39 | '359941': 'OnlineNewsPopularity', 40 | '233211': 'diamonds', 41 | } 42 | 43 | def load_openml(task_id, random_seed=1): 44 | data_path = "./dataset/openml_{}_seed{}.pkl".format(task_id, random_seed) 45 | if os.path.isfile(data_path): 46 | with open(data_path, 'rb') as f: 47 | return pkl.load(f) 48 | 49 | # otherwise load 50 | task = openml.tasks.get_task(task_id) 51 | 52 | # get label 53 | label = task.target_name 54 | 55 | # get full pd_frame 56 | full_data = task.get_dataset().get_data()[0] 57 | full_size = full_data.shape[0] 58 | 59 | # split data 60 | train_data, test_data = train_test_split(full_data, test_size=0.1, random_state=random_seed) 61 | if hasattr(train_data, 'sparse'): 62 | train_data = train_data.sparse.to_dense() 63 | if hasattr(test_data, 'sparse'): 64 | test_data = test_data.sparse.to_dense() 65 | 66 | # preprocess x_train 67 | x_train, y_train, x_transformer = processData(data=train_data, label_column=label, problem_type='regression') 68 | x_test, y_test, _ = processData(data=test_data, label_column=label, ag_predictor=x_transformer) 69 | 70 | # convert to numpy 71 | x_train = x_train.values 72 | y_train = y_train.values.reshape(-1, 1) 73 | x_test = x_test.values 74 | y_test = y_test.values.reshape(-1, 1) 75 | 76 | # y normalizer based on train data 77 | y_transformer = StandardScaler().fit(y_train) 78 | 79 | # transform data 80 | y_train = y_transformer.transform(y_train) 81 | y_test = y_transformer.transform(y_test) 82 | 83 | # dataset 84 | dataset = Namespace(size=full_size, x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test) 85 | 86 | info = {'task_id': task_id, 87 | 'x_train.shape':x_train.shape, 88 | 'y_train.shape': y_train.shape, 89 | 'x_test.shape':x_test.shape, 90 | 'y_test.shape':y_test.shape, 91 
| 'seed':random_seed 92 | } 93 | print('-------------------------') 94 | print('task_id', task_id) 95 | print('train-->', x_train.shape, y_train.shape) 96 | print('test-->', x_test.shape, y_test.shape) 97 | print('-------------------------') 98 | 99 | # pickle 100 | with open(data_path, 'wb') as f: 101 | pkl.dump(dataset, f) 102 | 103 | # return dataset 104 | return dataset, info 105 | 106 | parser = argparse.ArgumentParser() 107 | parser.add_argument('--seed', type=int, default=1) 108 | 109 | if __name__ == "__main__": 110 | 111 | args = parser.parse_args() 112 | print('------------') 113 | print(args.__dict__) 114 | print('------------') 115 | 116 | all_info = [] 117 | for i in OPENML_DICT.keys(): 118 | print('task', i) 119 | _, info = load_openml(i, random_seed=args.seed) 120 | all_info.append(info) 121 | print('*******') 122 | print('Done') 123 | print('---------------------------------------') 124 | print('all_info', all_info) 125 | -------------------------------------------------------------------------------- /data/data_loader.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle as pkl 3 | import openml 4 | import collections 5 | import pandas as pd 6 | from sklearn.model_selection import train_test_split 7 | from sklearn.preprocessing import StandardScaler 8 | from argparse import Namespace 9 | from data.process_data import processData 10 | import os.path 11 | 12 | NUM_RI_DATASETS = 13 13 | 14 | UCI_LIST = \ 15 | ['yacht', 16 | 'boston', 17 | 'energy', 18 | 'concrete', 19 | 'kin8nm', 20 | 'power', 21 | 'naval', 22 | 'protein'] 23 | 24 | OPENML_DICT = \ 25 | {'359949': 'house_sales', 26 | '359945': 'us_crime', 27 | '359943': 'nyc-taxi-green-dec-2016', 28 | '359942': 'colleges', 29 | '359944': 'abalone', 30 | '359941': 'OnlineNewsPopularity', 31 | '359926': 'Airlines_DepDelay_1M', 32 | '317614': 'Yolanda', 33 | '317612': 'Brazilian_houses', 34 | '233214': 'Santander_transaction_value', 35 | '233212': 'Allstate_Claims_Severity', 36 | '233215': 'Mercedes_Benz_Greener_Manufacturing', 37 | '359951': 'house_prices_nominal', 38 | '233211': 'diamonds', 39 | '359948': 'SAT11-HAND-runtime-regression', 40 | '359947': 'MIP-2016-regression', 41 | '168891': 'black_friday', 42 | '167210': 'Moneyball', 43 | '233213': 'Buzzinsocialmedia_Twitter', 44 | '14097': 'QSAR-TID-10980', 45 | '13854': 'QSAR-TID-11', 46 | '359952': 'house_16H', 47 | '359930': 'quake', 48 | '359931': 'sensory', 49 | '359932': 'socmob', 50 | '4857': 'boston', 51 | '359933': 'space_ga', 52 | '359934': 'tecator', 53 | '359939': 'topo_2_1', 54 | '359940': 'yprop_4_1', 55 | '359935': 'wine_quality', 56 | '359936': 'elevators', 57 | '359946': 'pol'} 58 | 59 | 60 | def load_uci(dataset_name, random_seed=111, data_loc='./data/dataset/'): 61 | if dataset_name not in UCI_LIST: 62 | raise NotImplementedError('not available dataset') 63 | # load data 64 | data = np.loadtxt(os.path.join(data_loc ,"{}.txt".format(dataset_name))) 65 | x_full = data[:, :-1] 66 | y_full = data[:, -1].reshape(-1, 1) 67 | 68 | # split into train / test 69 | x_train, x_test, y_train, y_test = train_test_split(x_full, y_full, test_size=0.1, random_state=random_seed) 70 | 71 | # normalizer based on train data 72 | x_transformer = StandardScaler().fit(x_train) 73 | y_transformer = StandardScaler().fit(y_train) 74 | 75 | # transform data 76 | x_train = x_transformer.transform(x_train) 77 | y_train = y_transformer.transform(y_train) 78 | x_test = x_transformer.transform(x_test) 79 | y_test = 
y_transformer.transform(y_test) 80 | 81 | return Namespace(size=x_full.shape[0], x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test) 82 | 83 | 84 | def load_openml(task_id, random_seed=1, data_loc='./data/dataset/'): 85 | data_path = os.path.join(data_loc ,"openml_{}_seed{}.pkl".format(task_id, random_seed)) 86 | if os.path.isfile(data_path): 87 | with open(data_path, 'rb') as f: 88 | return pkl.load(f) 89 | 90 | # otherwise load 91 | task = openml.tasks.get_task(task_id) 92 | 93 | # get label 94 | label = task.target_name 95 | 96 | # get full pd_frame 97 | full_data = task.get_dataset().get_data()[0] 98 | full_size = full_data.shape[0] 99 | 100 | # split data 101 | train_data, test_data = train_test_split(full_data, test_size=0.1, random_state=random_seed) 102 | if hasattr(train_data, 'sparse'): 103 | train_data = train_data.sparse.to_dense() 104 | if hasattr(test_data, 'sparse'): 105 | test_data = test_data.sparse.to_dense() 106 | 107 | # preprocess x_train 108 | x_train, y_train, x_transformer = processData(data=train_data, label_column=label, problem_type='regression') 109 | x_test, y_test, _ = processData(data=test_data, label_column=label, ag_predictor=x_transformer) 110 | 111 | # convert to numpy 112 | x_train = x_train.values 113 | y_train = y_train.values.reshape(-1, 1) 114 | x_test = x_test.values 115 | y_test = y_test.values.reshape(-1, 1) 116 | 117 | # y normalizer based on train data 118 | y_transformer = StandardScaler().fit(y_train) 119 | 120 | # transform data 121 | y_train = y_transformer.transform(y_train) 122 | y_test = y_transformer.transform(y_test) 123 | 124 | # dataset 125 | dataset = Namespace(size=full_size, x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test) 126 | 127 | # pickle 128 | with open(data_path, 'wb') as f: 129 | pkl.dump(dataset, f) 130 | 131 | # return dataset 132 | return dataset 133 | 134 | -------------------------------------------------------------------------------- /merge_nested_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import numpy as np 4 | import pickle as pkl 5 | MODEL_LIST = \ 6 | ['QuantileConditionalGaussianNetwork', 7 | 'QuantileSingleNeuralNetwork', 8 | 'QuantileJointNeuralNetwork', 9 | 'QuantileRandomForest', 10 | 'QuantileExtraTrees', 11 | 'QuantileLightGBM', 12 | ] 13 | 14 | def merge_results(dataset, seed, eparams): 15 | merge_z_test = [] 16 | merge_z_val = [] 17 | 18 | merge_oof_x_train = {} 19 | merge_oof_y_train = {} 20 | merge_oof_z_train = {} 21 | for model in MODEL_LIST: 22 | file_path = eparams.DATA_PATH + eparams.log_id + 'quantile_nested_{}_z_test_{}_cv5_iter20_seed{}.npy'.format(model, dataset, seed) 23 | file_path_val = eparams.DATA_PATH + eparams.log_id + 'quantile_nested_{}_z_val_{}_cv5_iter20_seed{}.npy'.format(model, dataset, seed) 24 | 25 | if os.path.exists(file_path): 26 | merge_z_test.append(np.load(file_path)) 27 | 28 | if os.path.exists(file_path_val): 29 | merge_z_val.append(np.load(file_path_val)) 30 | 31 | 32 | file_path = eparams.DATA_PATH + eparams.log_id + 'quantile_nested_{}_oof_x_train_{}_cv5_iter20_seed{}.pkl'.format(model, dataset, seed) 33 | if os.path.exists(file_path) and len(merge_oof_x_train) == 0: 34 | with open(file_path, 'rb') as handle: 35 | merge_oof_x_train = pkl.load(handle) 36 | 37 | file_path = eparams.DATA_PATH + eparams.log_id + 'quantile_nested_{}_oof_y_train_{}_cv5_iter20_seed{}.pkl'.format(model, dataset, seed) 38 | if os.path.exists(file_path) and len(merge_oof_y_train) == 
0: 39 | with open(file_path, 'rb') as handle: 40 | merge_oof_y_train = pkl.load(handle) 41 | 42 | file_path = eparams.DATA_PATH + eparams.log_id + 'quantile_nested_{}_oof_z_train_{}_cv5_iter20_seed{}.pkl'.format(model, dataset, seed) 43 | if os.path.exists(file_path): 44 | with open(file_path, 'rb') as handle: 45 | oof_z_train = pkl.load(handle) 46 | 47 | if len(merge_oof_z_train) == 0: 48 | merge_oof_z_train = oof_z_train 49 | for pair_key in merge_oof_z_train.keys(): 50 | if '-' in pair_key: 51 | tmp0, tmp1 = merge_oof_z_train[pair_key] 52 | merge_oof_z_train[pair_key] = [[tmp0], [tmp1]] 53 | else: 54 | tmp0 = merge_oof_z_train[pair_key] 55 | merge_oof_z_train[pair_key] = [tmp0] 56 | else: 57 | for pair_key in merge_oof_z_train.keys(): 58 | if '-' in pair_key: 59 | tmp0, tmp1 = oof_z_train[pair_key] 60 | merge_oof_z_train[pair_key][0].append(tmp0) 61 | merge_oof_z_train[pair_key][1].append(tmp1) 62 | else: 63 | tmp0 = oof_z_train[pair_key] 64 | merge_oof_z_train[pair_key].append(tmp0) 65 | 66 | merge_z_test = np.stack(merge_z_test, 1) 67 | merge_z_val = np.stack(merge_z_val, 1) 68 | 69 | for pair_key in merge_oof_z_train.keys(): 70 | if '-' in pair_key: 71 | tmp0, tmp1 = merge_oof_z_train[pair_key] 72 | merge_oof_z_train[pair_key] = [np.stack(tmp0, 1), np.stack(tmp1, 1)] 73 | else: 74 | tmp0 = merge_oof_z_train[pair_key] 75 | merge_oof_z_train[pair_key] = np.stack(tmp0, 1) 76 | 77 | np.save(eparams.DATA_PATH + eparams.log_id + 'quantile_nested_base_z_test_{}_cv5_iter20_seed{}.npy'.format(dataset, seed), merge_z_test) 78 | np.save(eparams.DATA_PATH + eparams.log_id + 'quantile_nested_base_z_val_{}_cv5_iter20_seed{}.npy'.format(dataset, seed), merge_z_val) 79 | 80 | file_path = eparams.DATA_PATH + eparams.log_id + 'quantile_nested_base_oof_x_train_{}_cv5_iter20_seed{}.pkl'.format(dataset, seed) 81 | with open(file_path, 'wb') as handle: 82 | pkl.dump(merge_oof_x_train, handle, protocol=pkl.HIGHEST_PROTOCOL) 83 | 84 | file_path = eparams.DATA_PATH + eparams.log_id + 'quantile_nested_base_oof_y_train_{}_cv5_iter20_seed{}.pkl'.format(dataset, seed) 85 | with open(file_path, 'wb') as handle: 86 | pkl.dump(merge_oof_y_train, handle, protocol=pkl.HIGHEST_PROTOCOL) 87 | 88 | file_path = eparams.DATA_PATH + eparams.log_id + 'quantile_nested_base_oof_z_train_{}_cv5_iter20_seed{}.pkl'.format(dataset, seed) 89 | with open(file_path, 'wb') as handle: 90 | pkl.dump(merge_oof_z_train, handle, protocol=pkl.HIGHEST_PROTOCOL) 91 | 92 | return 93 | 94 | if __name__ == "__main__": 95 | # arguments 96 | parser = argparse.ArgumentParser() 97 | 98 | # parser 99 | parser.add_argument('--task-id', type=str, help='task id') 100 | parser.add_argument('--seed', type=int, default=1, help='random seed') 101 | parser.add_argument('--DATA_PATH', default='./output/data/') 102 | parser.add_argument('--log_id', default='mylogid') 103 | 104 | args = parser.parse_args() 105 | print('------------') 106 | print(args.__dict__) 107 | print('------------') 108 | 109 | merge_results(args.task_id, args.seed, args) 110 | print('Done') 111 | -------------------------------------------------------------------------------- /util/misc.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import torch 4 | from sklearn.isotonic import IsotonicRegression 5 | import multiprocessing as mp 6 | from fast_soft_sort.pytorch_ops import soft_sort 7 | 8 | 9 | def set2mask(set_data_list, input_size): 10 | # init mask 11 | mask_data = np.zeros((input_size, input_size)) 12 | 13 | 
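# set2mask turns a list of index sets (the PAVA partitions) into an averaging matrix: every column whose
# index belongs to a set receives weight 1/|set| on exactly the rows of that set, so right-multiplying a
# row of predictions by this mask replaces each entry with the mean of its partition (see pava_forward).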
# for each set 14 | for i, set_data in enumerate(set_data_list): 15 | num_elements = len(set_data) 16 | tmp_mask_data = np.zeros((input_size, 1)) 17 | tmp_mask_data[set_data] = 1.0 18 | tmp_mask_data /= float(num_elements) 19 | mask_data[:, set_data] = tmp_mask_data 20 | return mask_data 21 | 22 | 23 | def single_pava(z): 24 | # init parition info 25 | p_val = [] 26 | p_cnt = [] 27 | p_set = [] 28 | p_idx = -1 29 | 30 | # for each value 31 | for i, val in enumerate(z): 32 | # if first, or current value is larger than others 33 | if i == 0 or val > p_val[p_idx]: 34 | # add value as new partition 35 | p_set.append([i]) 36 | p_val.append(val) 37 | p_cnt.append(1) 38 | p_idx += 1 39 | continue 40 | # if the value is same as the latest one, just insert 41 | elif val == p_val[p_idx]: 42 | # only count up 43 | p_set[p_idx].append(i) 44 | p_cnt[p_idx] += 1 45 | continue 46 | 47 | # if current value is smaller than the current value 48 | assert val < p_val[p_idx] 49 | # update partition info 50 | p_set[p_idx].append(i) 51 | p_val[p_idx] = (p_val[p_idx] * p_cnt[p_idx] + val) / float(p_cnt[p_idx] + 1) 52 | p_cnt[p_idx] += 1 53 | 54 | # clean up 55 | while p_idx > 0: 56 | # if current parition is equal or smaller than the previous partition 57 | if p_val[p_idx] <= p_val[p_idx - 1]: 58 | # merge 59 | p_set[p_idx - 1] += p_set[p_idx] 60 | p_val[p_idx - 1] = (p_val[p_idx] * p_cnt[p_idx] + p_val[p_idx - 1] * p_cnt[p_idx - 1]) / float( 61 | p_cnt[p_idx] + p_cnt[p_idx - 1]) 62 | p_cnt[p_idx - 1] = p_cnt[p_idx] + p_cnt[p_idx - 1] 63 | p_set.pop(p_idx) 64 | p_val.pop(p_idx) 65 | p_cnt.pop(p_idx) 66 | p_idx -= 1 67 | else: 68 | break 69 | return set2mask(p_set, len(z)) 70 | 71 | 72 | def multi_pava(z_array): 73 | pool = mp.Pool(processes=mp.cpu_count()) 74 | output_mask = np.stack(pool.map(single_pava, z_array), 0) 75 | pool.close() 76 | return output_mask 77 | 78 | 79 | # forward pava (differentiable) 80 | def pava_forward(input_data): 81 | # data size 82 | batch_size, num_quantiles = input_data.size() 83 | 84 | # for each data, make mask 85 | input_mask = [single_pava(input_data[i].cpu().data.numpy()) for i in range(batch_size)] 86 | #input_mask = multi_pava(input_data.cpu().data.numpy()) 87 | input_mask = np.stack(input_mask, 0) 88 | input_mask = torch.Tensor(input_mask).to(input_data.device) 89 | 90 | # based on mask, compute pava output 91 | output_data = torch.bmm(input_data.unsqueeze(1), input_mask.detach()).squeeze(1) 92 | return output_data 93 | 94 | 95 | def isotonic(input_data, quantile_list): 96 | quantile_list = np.array(quantile_list).reshape(-1) 97 | batch_size = input_data.shape[0] 98 | new_output_data = [] 99 | for i in range(batch_size): 100 | new_output_data.append(IsotonicRegression().fit_transform(quantile_list, input_data[i])) 101 | return np.stack(new_output_data, 0) 102 | 103 | 104 | def fix_crossing(predict_data, fix_type=0): 105 | is_torch = True 106 | if type(predict_data) is not torch.Tensor: 107 | is_torch = False 108 | predict_data = torch.Tensor(predict_data) 109 | 110 | # number of quantiles 111 | num_quantiles = predict_data.size()[-1] 112 | 113 | # above 50% and below 50% 114 | if fix_type == 0: 115 | # split into below 50% and above 50% 116 | idx_50 = num_quantiles // 2 117 | 118 | # below 50% 119 | below_50 = predict_data[:, :(idx_50 + 1)].contiguous() 120 | below_50 = torch.flip(torch.cummin(torch.flip(below_50, [-1]), -1)[0], [-1]) 121 | 122 | # above 50% 123 | above_50 = predict_data[:, idx_50:].contiguous() 124 | above_50 = torch.cummax(above_50, -1)[0] 125 | 126 | # 
refined output 127 | ordered_data = torch.cat([below_50[:, :-1], above_50], -1) 128 | # from 0% to 100% 129 | elif fix_type == 1: 130 | ordered_data = torch.cummax(predict_data, -1)[0] 131 | # from 0% to 100% and from 100% to 0% 132 | elif fix_type == 2: 133 | min_ordered_data = torch.flip(torch.cummin(torch.flip(predict_data, [-1]), -1)[0], [-1]) 134 | max_ordered_data = torch.cummax(predict_data, -1)[0] 135 | ordered_data = 0.5 * (min_ordered_data + max_ordered_data) 136 | else: 137 | ordered_data = predict_data 138 | 139 | if is_torch: 140 | return ordered_data 141 | else: 142 | return ordered_data.data.cpu().numpy() 143 | 144 | 145 | # forward sorting (differentiable) 146 | def sort_forward(input_data, regularization_strength): 147 | return soft_sort(input_data, regularization_strength=regularization_strength) 148 | 149 | # make directory 150 | def make_dir(dir_path): 151 | if not os.path.exists(dir_path): 152 | try: 153 | os.makedirs(dir_path) 154 | except OSError as e: 155 | raise ValueError(e) 156 | -------------------------------------------------------------------------------- /util/metric.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | # numpy pinball loss 6 | def pinball_loss_np(predict_data, target_data, quantiles, mean=False): 7 | error_data = target_data.reshape(-1, 1) - predict_data 8 | loss_data = np.maximum(quantiles * error_data, (quantiles - 1) * error_data) 9 | if mean: 10 | return loss_data.mean() 11 | else: 12 | return loss_data.mean(0) 13 | 14 | 15 | # pinball loss 16 | def pinball_loss(predict_data, target_data, quantiles): 17 | error_data = target_data.reshape(-1, 1) - predict_data 18 | loss_data = torch.max(quantiles * error_data, (quantiles - 1) * error_data) 19 | return loss_data.mean() 20 | 21 | 22 | # huber-pinball loss 23 | def huber_loss(predict_data, target_data, quantiles, alpha=0.01): 24 | if alpha == 0.0: 25 | return pinball_loss(predict_data, target_data, quantiles) 26 | 27 | error_data = target_data.reshape(-1, 1) - predict_data 28 | loss_data = torch.where(torch.abs(error_data) < alpha, 29 | 0.5 * error_data * error_data, 30 | alpha * (torch.abs(error_data) - 0.5 * alpha)) 31 | loss_data /= alpha 32 | 33 | scale = torch.where(error_data >= 0, 34 | torch.ones_like(error_data) * quantiles, 35 | torch.ones_like(error_data) * (1 - quantiles)) 36 | loss_data *= scale 37 | return loss_data.mean() 38 | 39 | 40 | # margin loss (between neighbored prediction) 41 | def margin_loss(predict_data, margin_data): 42 | # number of samples 43 | batch_size, num_quantiles = predict_data.size() 44 | 45 | # compute margin loss (batch_size x output_size(above) x output_size(below)) 46 | error_data = predict_data.unsqueeze(1) - predict_data.unsqueeze(2) 47 | 48 | # len(np.shape(margin_data)) ==0, means a scalar 49 | if len(np.shape(margin_data)) == 1: 50 | # margin data (num_quantiles) ===> (num_quantiles x num_quantiles) 51 | if type(margin_data) is not torch.Tensor: 52 | margin_data = torch.tensor(margin_data, device=predict_data.device) 53 | 54 | margin_data = margin_data.reshape(1, -1) 55 | margin_data = margin_data.permute(1, 0) - margin_data 56 | margin_data = torch.tril(margin_data, -1).relu() 57 | 58 | 59 | loss_data = torch.tril(error_data + margin_data, diagonal=-1) 60 | loss_data = loss_data.relu() 61 | loss_data = loss_data.sum() / np.float32(batch_size * (num_quantiles * num_quantiles - num_quantiles) * 0.5) 62 | 63 | # compute accumulated margin 64 | #if only_neighbored: 
65 | # loss_data = torch.tril(torch.triu(error_data + margin_data, diagonal=-1), diagonal=-1) 66 | # loss_data = loss_data.relu() 67 | # loss_data = loss_data.sum() / np.float32(batch_size * (num_quantiles - 1)) 68 | 69 | return loss_data 70 | 71 | 72 | # PICP, percentage of captured points (ratio of true observations falling inside the estimated prediction) 73 | def prediction_interval_coverage_rate(y_target, y_lower, y_upper): 74 | return np.mean((y_target >= y_lower) & (y_target <= y_upper)) 75 | 76 | 77 | def mean_prediction_interval_coverage_rate(y_target, y_quantile, quantile_list): 78 | picp_list = [] 79 | error_list = [] 80 | 81 | # for each quantile level (from 1% to 99%) 82 | num_samples = y_quantile.shape[0] 83 | num_quantiles = len(quantile_list) 84 | assert num_quantiles == y_quantile.shape[1] 85 | y_target = y_target.reshape([num_samples, 1]) 86 | 87 | # for each interval 88 | for i in range(num_quantiles // 2): 89 | # lower and upper index 90 | lower_idx = i 91 | upper_idx = -(i + 1) 92 | 93 | # lower and upper quantile 94 | lower_quantile = quantile_list[lower_idx] 95 | upper_quantile = quantile_list[upper_idx] 96 | 97 | # get predicted lower and upper values 98 | y_lower = y_quantile[:, lower_idx].reshape([num_samples, 1]) 99 | y_upper = y_quantile[:, upper_idx].reshape([num_samples, 1]) 100 | 101 | # compute picp 102 | picp = prediction_interval_coverage_rate(y_target=y_target, y_lower=y_lower, y_upper=y_upper) 103 | interval_size = upper_quantile - lower_quantile 104 | 105 | picp_list.append(picp) 106 | error_list.append(np.abs(picp - interval_size)) 107 | 108 | # mean over all intervals 109 | return picp_list, np.array(error_list).mean() 110 | 111 | 112 | # MeanPredictionIntervalWidth (MPIW) 113 | def mean_prediction_interval_width(y_full, y_lower, y_higher): 114 | # width of intervals 115 | y_range = y_full.max() - y_full.min() 116 | return np.abs(y_higher - y_lower).mean() / y_range 117 | 118 | 119 | def mean_abs_calibration_error(y_target, y_quantile, quantile_list): 120 | # y_target (batch_size x 1) 121 | y_target = y_target.reshape(-1, 1) 122 | num_samples = y_target.shape[0] 123 | 124 | # y_quantile (batch_size x num_quantiles) 125 | y_quantile = y_quantile.reshape(num_samples, -1) 126 | num_quantiles = y_quantile.shape[1] 127 | assert num_quantiles == len(quantile_list) 128 | 129 | # compute coverage (num_quantiles) 130 | mean_calibration = (y_target <= y_quantile).mean(0) 131 | 132 | # compute error (mean over quantile-levels) 133 | return mean_calibration.tolist(), np.abs(mean_calibration - quantile_list).mean() 134 | 135 | 136 | def root_mean_squared_calibration_error(y_target, y_quantile, quantile_list): 137 | # y_target (batch_size x 1) 138 | y_target = y_target.reshape(-1, 1) 139 | num_samples = y_target.shape[0] 140 | 141 | # y_quantile (batch_size x num_quantiles) 142 | y_quantile = y_quantile.reshape(num_samples, -1) 143 | num_quantiles = y_quantile.shape[1] 144 | assert num_quantiles == len(quantile_list) 145 | 146 | # compute coverage (num_quantiles) 147 | mean_calibration = (y_target <= y_quantile).mean(0) 148 | 149 | # compute error (mean over quantile-levels) 150 | return np.sqrt(np.mean(np.square(mean_calibration - quantile_list))) 151 | 152 | 153 | def mean_interval_score(y_target, y_quantile, quantile_list): 154 | # assume quantile list is symmetry centered in 50% 155 | # for each quantile level (from 1% to 99%) 156 | interval_score_list = [] 157 | num_quantiles = len(quantile_list) 158 | for i in range(num_quantiles // 2): 159 | # lower and 
upper quantile 160 | lower_quantile = quantile_list[i] 161 | upper_quantile = quantile_list[-(i + 1)] 162 | 163 | # get predicted lower and upper values 164 | y_lower = y_quantile[:, quantile_list == lower_quantile] 165 | y_upper = y_quantile[:, quantile_list == upper_quantile] 166 | 167 | # get mask below lower, above upper 168 | below_lower = (y_lower > y_target).astype('float') 169 | above_upper = (y_upper < y_target).astype('float') 170 | 171 | # compute score 172 | interval_score = (y_upper - y_lower) 173 | interval_score += (1.0 / lower_quantile) * (y_lower - y_target) * below_lower 174 | interval_score += (1.0 / lower_quantile) * (y_target - y_upper) * above_upper 175 | 176 | # mean over samples 177 | interval_score = interval_score.mean() 178 | interval_score_list.append(interval_score) 179 | 180 | # mean over all intervals 181 | return interval_score_list 182 | 183 | 184 | def compute_quantile_results(prediction, target, quantile_list): 185 | check = pinball_loss_np(predict_data=prediction, target_data=target, quantiles=quantile_list) 186 | interval = mean_interval_score(y_target=target, y_quantile=prediction, quantile_list=quantile_list) 187 | results_value = check.tolist() + [np.mean(check)] + interval + [np.mean(interval)] 188 | return results_value 189 | 190 | 191 | def compute_calibration_results(prediction, target, quantile_list): 192 | picp_list, mean_error = mean_prediction_interval_coverage_rate(y_target=target, y_quantile=prediction, quantile_list=quantile_list) 193 | calib_list, mace = mean_abs_calibration_error(y_target=target, y_quantile=prediction, quantile_list=quantile_list) 194 | results_value = picp_list + [mean_error] + calib_list + [mace] 195 | return results_value 196 | 197 | 198 | def compute_mean_results(prediction, target): 199 | error_data = target - prediction 200 | results_value = [np.sqrt(np.mean(error_data * error_data)), 201 | np.mean(error_data * error_data), 202 | np.mean(np.abs(error_data))] 203 | return results_value 204 | 205 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 
30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 
176 | -------------------------------------------------------------------------------- /nested_aggr_quantile_models.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | import argparse 4 | import numpy as np 5 | import pickle as pkl 6 | from model.neural_aggregator import QuantileLocalAggregatorTrainer, QuantileGlobalAggregatorTrainer 7 | from util.misc import make_dir 8 | from data.data_loader import UCI_LIST, OPENML_DICT 9 | import warnings 10 | from util.others import dump_to_json 11 | import os 12 | 13 | warnings.filterwarnings('ignore') 14 | 15 | # quantile list (from 1% to 99%) 16 | QUANTILE_LIST = np.arange(1, 100, 1) / 100.0 17 | NUM_QUANTILES = len(QUANTILE_LIST) 18 | #CV_RATIO = 0.8 19 | 20 | def model_prediction(model, x_data, z_data): 21 | batch_size = 512 22 | data_size = x_data.shape[0] 23 | num_batches = int(np.ceil(float(data_size) / float(batch_size))) 24 | 25 | e_data = [] 26 | for i in range(num_batches): 27 | e_data.append(model.predict(x_data[i * batch_size:(i + 1) * batch_size], z_data[i * batch_size:(i + 1) * batch_size])) 28 | 29 | return np.concatenate(e_data, 0) 30 | 31 | def run_exp(task_id, use_local=True, 32 | share_weight=False, cross_weight=True, 33 | normalize=True, margin_type=None, 34 | trans_type=None, use_grad=False, 35 | num_searches=20, num_folds=5, rand_seed=1, device=-1, 36 | eparams=None): 37 | 38 | print('--------------------') 39 | print('use_local: ', use_local) 40 | print('share_weight: ', share_weight) 41 | print('cross_weight: ', cross_weight) 42 | print('margin_type: ', margin_type) 43 | print('trans_type: ', trans_type) 44 | print('use_grad: ', use_grad) 45 | print('num_searches: ', num_searches) 46 | print('num_folds: ', num_folds) 47 | print('rand_seed: ', rand_seed) 48 | print('eparams.use_mean_pt: ', eparams.use_mean_pt) 49 | print('device: ', device) 50 | print('regularization_strength:', eparams.regularization_strength) 51 | print('--------------------') 52 | 53 | # set seed 54 | random.seed(rand_seed) 55 | np.random.seed(rand_seed) 56 | torch.manual_seed(rand_seed) 57 | torch.cuda.manual_seed_all(rand_seed) 58 | 59 | if torch.cuda.is_available() and device > -1: 60 | torch.cuda.manual_seed(rand_seed) 61 | torch.backends.cudnn.deterministic = True 62 | torch.backends.cudnn.benchmark = False 63 | 64 | # exp setting 65 | task_name = task_id 66 | if task_id not in UCI_LIST: 67 | task_name = OPENML_DICT[task_id] 68 | 69 | exp_name = [task_id, num_folds, num_searches, rand_seed] 70 | 71 | # load dataset 72 | with open(eparams.DATA_PATH + eparams.log_id_base + 'quantile_nested_base_oof_x_train_{}_cv{}_iter{}_seed{}.pkl'.format(*exp_name), 'rb') as handle: 73 | oof_x_train = pkl.load(handle) 74 | 75 | with open(eparams.DATA_PATH + eparams.log_id_base + 'quantile_nested_base_oof_y_train_{}_cv{}_iter{}_seed{}.pkl'.format(*exp_name), 'rb') as handle: 76 | oof_y_train = pkl.load(handle) 77 | 78 | with open(eparams.DATA_PATH + eparams.log_id_base + 'quantile_nested_base_oof_z_train_{}_cv{}_iter{}_seed{}.pkl'.format(*exp_name), 'rb') as handle: 79 | oof_z_train = pkl.load(handle) 80 | 81 | x_train, y_train, z_train = [], [], [] 82 | for k in range(num_folds): 83 | x_train.append(oof_x_train['{}'.format(k)]) 84 | y_train.append(oof_y_train['{}'.format(k)]) 85 | z_train.append(oof_z_train['{}'.format(k)]) 86 | 87 | x_train = np.concatenate(x_train, 0) 88 | y_train = np.concatenate(y_train, 0) 89 | z_train = np.concatenate(z_train, 0) 90 | 91 | x_test = np.load(eparams.DATA_PATH + 
eparams.log_id_base + 'quantile_nested_base_x_test_{}_cv{}_iter{}_seed{}.npy'.format(*exp_name)) 92 | y_test = np.load(eparams.DATA_PATH + eparams.log_id_base + 'quantile_nested_base_y_test_{}_cv{}_iter{}_seed{}.npy'.format(*exp_name)) 93 | z_test = np.load(eparams.DATA_PATH + eparams.log_id_base + 'quantile_nested_base_z_test_{}_cv{}_iter{}_seed{}.npy'.format(*exp_name)) 94 | 95 | x_val = np.load(eparams.DATA_PATH + eparams.log_id_base + 'quantile_nested_base_x_val_{}_cv{}_iter{}_seed{}.npy'.format(*exp_name)) 96 | y_val = np.load(eparams.DATA_PATH + eparams.log_id_base + 'quantile_nested_base_y_val_{}_cv{}_iter{}_seed{}.npy'.format(*exp_name)) 97 | z_val = np.load(eparams.DATA_PATH + eparams.log_id_base + 'quantile_nested_base_z_val_{}_cv{}_iter{}_seed{}.npy'.format(*exp_name)) 98 | 99 | # get empirical quantile-margin 100 | if margin_type == 'non': 101 | margin_list = None 102 | 103 | elif margin_type == 'single': 104 | margin_list = None 105 | 106 | elif margin_type == 'adapt': 107 | # compute error (from estimated median) 108 | if eparams.use_mean_pt == True: 109 | print('using mean for adapt..') 110 | e_train = y_train.reshape(-1) - np.mean(z_train[..., NUM_QUANTILES // 2], 1).reshape(-1).astype(np.float32) 111 | 112 | else: 113 | print('using median..') 114 | e_train = y_train.reshape(-1) - np.median(z_train[..., NUM_QUANTILES // 2], 1).reshape(-1).astype(np.float32) 115 | 116 | margin_list = np.quantile(e_train.reshape(-1), QUANTILE_LIST, 0).astype(np.float32) 117 | 118 | elif margin_type == 'vec': 119 | margin_list = (QUANTILE_LIST.reshape(-1) * 1e-2).astype(np.float32) 120 | 121 | else: 122 | raise ValueError('%s margin not supported' % margin_type) 123 | 124 | # data size 125 | feature_size = x_train.shape[1] 126 | print('Data: {} (seed {}, train size {}, feature size {}, val size {}, test size {})'.format( 127 | task_name, rand_seed, x_train.shape[0], feature_size, x_val.shape[0], x_test.shape[0])) 128 | 129 | #train_size = x_train.shape[0] 130 | train_idx_list = np.load(eparams.DATA_PATH + eparams.log_id_base +'quantile_all_train_idx_{}_cv{}_iter{}_seed{}.npy'.format(*exp_name)) 131 | cv_split = np.load(eparams.DATA_PATH + eparams.log_id_base +'quantile_train_val_idx_{}_cv{}_iter{}_seed{}.npy'.format(*exp_name), allow_pickle=True) 132 | 133 | # num of base models and quantiles 134 | num_models, num_quantiles = z_train.shape[1], z_train.shape[2] 135 | assert num_quantiles == NUM_QUANTILES 136 | 137 | # model name 138 | if use_local: 139 | model_name = 'Local' 140 | 141 | else: 142 | model_name = 'Global' 143 | 144 | if share_weight: 145 | model_name += '-Coarse' 146 | 147 | elif cross_weight: 148 | model_name += '-Fine' 149 | 150 | else: 151 | model_name += '-Medium' 152 | 153 | print('Train start: ', model_name) 154 | 155 | # full experiment name 156 | if trans_type is not None: 157 | output_name = 'DQA_{}_norm{}_grad{}_{}_{}_margin_results_{}_cv{}_iter{}_seed{}'.format(model_name, 158 | int(normalize), 159 | int(use_grad), 160 | trans_type, 161 | margin_type, 162 | *exp_name) 163 | else: 164 | output_name = 'DQA_{}_norm{}_{}_margin_results_{}_cv{}_iter{}_seed{}'.format(model_name, 165 | int(normalize), 166 | margin_type, 167 | *exp_name) 168 | 169 | print(eparams.run_id + output_name) 170 | 171 | # set model learner 172 | batch_size = int(2 ** (3 + np.floor(np.log10(y_train.shape[0] + y_test.shape[0])))) 173 | model_trainer = QuantileLocalAggregatorTrainer if use_local else QuantileGlobalAggregatorTrainer 174 | model = model_trainer(quantile_list=QUANTILE_LIST, 
num_searches=num_searches, cv_split=cv_split, 175 | share_weight=share_weight, cross_weight=cross_weight, 176 | normalize=normalize, margin_list=margin_list, 177 | trans_type=trans_type, use_grad=use_grad, 178 | batch_size=batch_size, rand_seed=rand_seed, device=device, 179 | margin_type=margin_type, 180 | regularization_strength=eparams.regularization_strength) 181 | 182 | # fit model 183 | if use_local: 184 | model.fit(c_train=x_train, x_train=z_train, y_train=y_train, 185 | c_val=x_val, x_val=z_val, y_val=y_val, 186 | ac_train=None, ax_train=None) 187 | 188 | else: 189 | #model.fit(z_train, y_train, None) 190 | model.fit(x_train=z_train, y_train=y_train, 191 | x_val=z_val, y_val=y_val, 192 | ax_train=None) 193 | 194 | 195 | # compute test prediction 196 | if use_local: 197 | org_e_test = model_prediction(model, x_test, z_test) 198 | 199 | else: 200 | org_e_test = model.predict(z_test) 201 | 202 | # save results 203 | np.save(eparams.RESULT_PATH + eparams.run_id + eparams.log_id_out + output_name + '_org_e_test.npy', org_e_test) 204 | np.save(eparams.RESULT_PATH + eparams.run_id + output_name + '_org_e_test.npy', org_e_test) 205 | 206 | print('Done.') 207 | 208 | if __name__ == "__main__": 209 | # arguments 210 | parser = argparse.ArgumentParser() 211 | 212 | # parser 213 | parser.add_argument('--gpu', type=int, default=0, help='gpu id') 214 | parser.add_argument('--fold', type=int, default=5, help='folds for out of fold predictions') 215 | parser.add_argument('--iter', type=int, default=20, help='number of iterations for grid search') 216 | parser.add_argument('--task-id', type=str, default='boston', help='task id') 217 | parser.add_argument('--seed', type=int, default=1, help='random seed') 218 | 219 | parser.add_argument('--local', type=int, default=1) 220 | parser.add_argument('--share', type=int, default=0, help='share combination') 221 | parser.add_argument('--cross', type=int, default=1, help='full combination') 222 | parser.add_argument('--grad', type=int, default=0, help='use grad') 223 | parser.add_argument('--trans', type=str, default='sort', help='non-crossing type') 224 | parser.add_argument('--margin', type=str, default='single', help='margin type') 225 | 226 | parser.add_argument('--norm', type=int, default=1, help='normalize weight') 227 | 228 | parser.add_argument('--DATA_PATH', default='./output/data/') 229 | parser.add_argument('--RESULT_PATH', default='./output/result/') 230 | parser.add_argument('--log_id_base', default='mylogid') 231 | parser.add_argument('--log_id_out', default='exNone') 232 | parser.add_argument('--run_id', default='p1_') 233 | parser.add_argument('--use_mean_pt', default=False, action='store_true') 234 | parser.add_argument('--regularization_strength', type=float, default=0.1) 235 | 236 | args = parser.parse_args() 237 | print('------------') 238 | print(args.__dict__) 239 | print('------------') 240 | 241 | make_dir(args.DATA_PATH) 242 | make_dir(args.RESULT_PATH) 243 | 244 | fname_json = os.path.join(args.RESULT_PATH, args.log_id_out + '_' + 245 | args.log_id_base +'_' + args.run_id + args.task_id +'_args_aggr.json' ) 246 | print(fname_json) 247 | dump_to_json(fname_json, {'args': args.__dict__}) 248 | 249 | # run 250 | run_exp(task_id=args.task_id,use_local=bool(args.local), 251 | share_weight=args.share, cross_weight=args.cross, 252 | normalize=bool(args.norm), margin_type=args.margin, 253 | trans_type=args.trans, use_grad=bool(args.grad), 254 | num_searches=args.iter, num_folds=args.fold, 255 | rand_seed=args.seed, device=args.gpu, 256 | 
eparams=args, 257 | ) 258 | -------------------------------------------------------------------------------- /nested_base_quantile_models.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | import argparse 4 | import numpy as np 5 | import pickle as pkl 6 | from model.linear_regressor import QuantileRegressor 7 | from model.random_forest import QuantileRandomForest 8 | from model.extra_trees import QuantileExtraTrees 9 | from model.light_gbm import QuantileLightGBM 10 | from model.neural_network import QuantileJointNeuralNetwork 11 | from model.neural_network import QuantileSingleNeuralNetwork 12 | from model.neural_network import QuantileConditionalGaussianNetwork 13 | from util.misc import make_dir 14 | from sklearn.model_selection import KFold 15 | from data.data_loader import load_uci, load_openml, UCI_LIST, OPENML_DICT 16 | from util.others import dump_to_json 17 | import os 18 | import copy 19 | 20 | # quantile list (from 1% to 99%) 21 | QUANTILE_LIST = np.arange(1, 100, 1) / 100.0 22 | CV_RATIO = 0.8 23 | 24 | # run neural networks 25 | def run_neural(model, 26 | x_train, y_train, 27 | x_test, y_test, 28 | cv_split, kfolder, 29 | num_iters, rand_seed=1, device=0, 30 | eparams=None): 31 | # model name 32 | model_name = model.__name__ 33 | print('Train start: ', model_name) 34 | 35 | # set model learner 36 | batch_size = int(2 ** (3 + np.floor(np.log10(x_train.shape[0] + x_test.shape[0])))) 37 | model_learner = model(quantile_list=QUANTILE_LIST, 38 | num_iters=num_iters, 39 | cv_split=cv_split, 40 | batch_size=batch_size, 41 | rand_seed=rand_seed, 42 | device=device, 43 | use_grad=eparams.use_grad, 44 | trans_type=eparams.trans_type, 45 | use_margin=eparams.use_margin, 46 | margin_type=eparams.margin_type 47 | ) 48 | ######### 49 | # train/val split happens inside model_learner based on cv_split 50 | ######### 51 | # fit model where x_train and y_train are split into val and train according to cv_plsit 52 | model_learner.fit(x_train, y_train, None) 53 | 54 | # compute test prediction 55 | z_test = model_learner.predict(x_test) 56 | print('z_test', z_test.shape) 57 | 58 | # model prediction on validation 59 | z_val = model_learner.predict(x_train[cv_split[0][1]]) 60 | print('z_val', z_val.shape) 61 | 62 | ######################################### 63 | #### remove validation from train set ### 64 | ######################################### 65 | print('Full training size', x_train.shape, y_train.shape) 66 | x_train = x_train[cv_split[0][0]].copy() 67 | y_train = y_train[cv_split[0][0]].copy() 68 | print('Training size after removing validation', x_train.shape, y_train.shape) 69 | 70 | # get nested out-of-fold predictions 71 | full_index = np.arange(x_train.shape[0]) 72 | # fold_list[5,2] where [:,0] is training and [:,1] validation 73 | fold_list = list(kfolder.split(x_train)) 74 | num_folds = len(fold_list) 75 | 76 | # for oof 77 | oof_x_train = {} 78 | oof_y_train = {} 79 | oof_z_train = {} 80 | for k0 in range(num_folds): 81 | 82 | # compute oof for k0 and train 83 | oof_index0 = fold_list[k0][1] 84 | #train_index = np.setdiff1d(full_index, oof_index0) 85 | train_index = np.setdiff1d(fold_list[k0][0], oof_index0) 86 | 87 | # split train / valid 88 | cv_x_train, cv_y_train = x_train[train_index], y_train[train_index] 89 | cv_x_valid0, cv_y_valid0 = x_train[oof_index0], y_train[oof_index0] 90 | 91 | # fit on cv 92 | model_learner.refit_model(cv_x_train, cv_y_train, None) 93 | 94 | # obtain prediction over quantiles 
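# Shape note (illustrative, assuming the default 99-level QUANTILE_LIST): for a held-out
# fold of n_k rows, cv_z_valid0 below has shape (n_k, 99); oof_x_train / oof_y_train /
# oof_z_train each keep one array per fold under the string key str(k0).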
95 | cv_z_valid0 = model_learner.predict(cv_x_valid0) 96 | 97 | oof_x_train['{}'.format(k0)] = cv_x_valid0 98 | oof_y_train['{}'.format(k0)] = cv_y_valid0 99 | oof_z_train['{}'.format(k0)] = cv_z_valid0 100 | 101 | return oof_x_train, oof_y_train, oof_z_train, z_test, z_val 102 | 103 | # run tree models 104 | def run_tree(model, 105 | x_train, y_train, 106 | x_test, y_test, 107 | cv_split, kfolder, 108 | num_iters, rand_seed=1, 109 | eparams=None, 110 | **kwargs): 111 | # model name 112 | model_name = model.__name__ 113 | print('Train start: ', model_name) 114 | 115 | # set model learner 116 | model_learner = model(num_iters=num_iters, num_folds=cv_split, rand_seed=rand_seed) 117 | model_learner.fit(x_train, y_train) 118 | 119 | # compute test prediction on best hyper-params 120 | z_test = model_learner.full_predict(x_test, list(QUANTILE_LIST)) 121 | print('z_test', z_test.shape) 122 | 123 | # model prediction on validation 124 | z_val = model_learner.full_predict(x_train[cv_split[0][1]], list(QUANTILE_LIST)) 125 | print('z_val', z_val.shape) 126 | 127 | ######################################### 128 | #### remove validation from train set ### 129 | ######################################### 130 | print('Full training sizes:', x_train.shape, y_train.shape) 131 | x_train = x_train[cv_split[0][0]].copy() 132 | y_train = y_train[cv_split[0][0]].copy() 133 | print('Training size after removing validation', x_train.shape, y_train.shape) 134 | 135 | # get nested out-of-fold predictions 136 | full_index = np.arange(x_train.shape[0]) 137 | fold_list = list(kfolder.split(x_train)) 138 | num_folds = len(fold_list) 139 | 140 | # for oof 141 | oof_x_train = {} 142 | oof_y_train = {} 143 | oof_z_train = {} 144 | for k0 in range(num_folds): 145 | 146 | # compute oof for k0 and train 147 | oof_index0 = fold_list[k0][1] 148 | #train_index = np.setdiff1d(full_index, oof_index0) 149 | train_index = np.setdiff1d(fold_list[k0][0], oof_index0) 150 | 151 | # split train / valid 152 | cv_x_train, cv_y_train = x_train[train_index], y_train[train_index] 153 | cv_x_valid0, cv_y_valid0 = x_train[oof_index0], y_train[oof_index0] 154 | 155 | # fit on cv 156 | cv_model = model_learner.get_init_model() 157 | cv_model.fit(cv_x_train, cv_y_train.reshape(-1)) 158 | 159 | # obtain prediction over quantiles 160 | cv_z_valid0 = cv_model.predict(cv_x_valid0, list(QUANTILE_LIST)) 161 | 162 | oof_x_train['{}'.format(k0)] = cv_x_valid0 163 | oof_y_train['{}'.format(k0)] = cv_y_valid0 164 | oof_z_train['{}'.format(k0)] = cv_z_valid0 165 | 166 | return oof_x_train, oof_y_train, oof_z_train, z_test, z_val 167 | 168 | # run other models 169 | def run_others(model, 170 | x_train, y_train, 171 | x_test, y_test, 172 | cv_split, kfolder, 173 | num_iters, rand_seed=1, 174 | eparams=None, 175 | **kwargs): 176 | # model name 177 | model_name = model.__name__ 178 | print('Train start: ', model_name) 179 | 180 | ######################################### 181 | #### remove validation from train set ### 182 | ######################################### 183 | x_train_org = x_train.copy() 184 | y_train_org = y_train.copy() 185 | 186 | x_val = x_train[cv_split[0][1]].copy() 187 | print('x_val', x_val.shape) 188 | 189 | print('Full training sizes:', x_train.shape, y_train.shape) 190 | x_train = x_train[cv_split[0][0]].copy() 191 | y_train = y_train[cv_split[0][0]].copy() 192 | print('Training size after removing validation', x_train.shape, y_train.shape) 193 | 194 | # get nested out-of-fold predictions 195 | full_index = np.arange(x_train.shape[0]) 196 
| fold_list = list(kfolder.split(x_train)) 197 | num_folds = len(fold_list) 198 | 199 | # for each quantile 200 | oof_x_train = {} 201 | oof_y_train = {} 202 | oof_z_train = {} 203 | z_test = [] 204 | z_val = [] 205 | 206 | for quantile in QUANTILE_LIST: 207 | # find best model 208 | model_learner = model(quantile=quantile, num_iters=num_iters, num_folds=cv_split, rand_seed=rand_seed) 209 | model_learner.fit(x_train_org, y_train_org) 210 | 211 | # compute test prediction 212 | z_test.append(model_learner.predict(x_test).reshape(-1, 1)) 213 | 214 | # val prediction 215 | z_val.append(model_learner.predict(x_val).reshape(-1, 1)) 216 | 217 | # get out-of-fold predictions 218 | for k0 in range(num_folds): 219 | 220 | # and compute oof for k0 and k1 221 | oof_index0 = fold_list[k0][1] 222 | #train_index = np.setdiff1d(full_index, oof_index0) 223 | train_index = np.setdiff1d(fold_list[k0][0], oof_index0) 224 | 225 | # split train / valid 226 | cv_x_train, cv_y_train = x_train[train_index], y_train[train_index] 227 | cv_x_valid0, cv_y_valid0 = x_train[oof_index0], y_train[oof_index0] 228 | 229 | # fit on cv 230 | cv_model = model_learner.get_init_model() 231 | cv_model.fit(cv_x_train, cv_y_train.reshape(-1)) 232 | 233 | # get oof 234 | cv_z_valid0 = cv_model.predict(cv_x_valid0).reshape(-1, 1) 235 | if '{}'.format(k0) not in oof_x_train: 236 | oof_x_train['{}'.format(k0)] = cv_x_valid0 237 | if '{}'.format(k0) not in oof_y_train: 238 | oof_y_train['{}'.format(k0)] = cv_y_valid0 239 | if '{}'.format(k0) not in oof_z_train: 240 | oof_z_train['{}'.format(k0)] = cv_z_valid0 241 | else: 242 | cv_z_valid0 = np.concatenate([oof_z_train['{}'.format(k0)], cv_z_valid0], -1) 243 | oof_z_train['{}'.format(k0)] = cv_z_valid0 244 | 245 | z_test = np.concatenate(z_test, 1) 246 | z_val = np.concatenate(z_val, 1) 247 | 248 | return oof_x_train, oof_y_train, oof_z_train, z_test, z_val 249 | 250 | def prepare_data(task_id, 251 | num_iters=20, 252 | num_folds=5, 253 | rand_seed=1, 254 | eparams=None): 255 | 256 | print('Data preparation') 257 | # set seed 258 | random.seed(rand_seed) 259 | np.random.seed(rand_seed) 260 | torch.manual_seed(rand_seed) 261 | torch.cuda.manual_seed_all(rand_seed) 262 | 263 | # load dataset 264 | if task_id in UCI_LIST: 265 | task_name = task_id 266 | dataset = load_uci(task_id, random_seed=rand_seed, data_loc=eparams.data_loc) 267 | 268 | else: 269 | assert task_id in OPENML_DICT 270 | task_name = OPENML_DICT[task_id] 271 | dataset = load_openml(task_id, random_seed=rand_seed, data_loc=eparams.data_loc) 272 | 273 | dataset_size = dataset.size 274 | x_train, y_train = dataset.x_train, dataset.y_train 275 | x_test, y_test = dataset.x_test, dataset.y_test 276 | feature_size = x_train.shape[1] 277 | print('Data: {} (seed {}, dataset size {}, feature size {})'.format(task_name, rand_seed, dataset_size, feature_size)) 278 | 279 | # save data split 280 | exp_name = [task_id, num_folds, num_iters, rand_seed] 281 | np.save(eparams.DATA_PATH + eparams.log_id +'quantile_nested_base_x_test_{}_cv{}_iter{}_seed{}.npy'.format(*exp_name), x_test) 282 | np.save(eparams.DATA_PATH + eparams.log_id +'quantile_nested_base_y_test_{}_cv{}_iter{}_seed{}.npy'.format(*exp_name), y_test) 283 | 284 | # set cv splits 285 | train_size = x_train.shape[0] 286 | train_idx_list = np.arange(train_size) 287 | np.random.shuffle(train_idx_list) 288 | cv_split = [[train_idx_list[:int(train_size * CV_RATIO)], train_idx_list[int(train_size * CV_RATIO):]]] 289 | ######## 290 | # save the train and val split 291 | ######## 292 | 
print(eparams.log_id +'quantile_all_train_idx_{}_cv{}_iter{}_seed{}.npy'.format(*exp_name)) 293 | np.save(eparams.DATA_PATH + eparams.log_id +'quantile_all_train_idx_{}_cv{}_iter{}_seed{}.npy'.format(*exp_name), train_idx_list) 294 | np.save(eparams.DATA_PATH + eparams.log_id +'quantile_train_val_idx_{}_cv{}_iter{}_seed{}.npy'.format(*exp_name), cv_split) 295 | print(eparams.log_id +'quantile_train_val_idx_{}_cv{}_iter{}_seed{}.npy'.format(*exp_name)) 296 | 297 | ######################################### 298 | #### validation is removed from training set 299 | ######################################### 300 | print('Train data:', x_train[cv_split[0][0]].shape, y_train[cv_split[0][0]].shape) 301 | np.save(eparams.DATA_PATH + eparams.log_id +'quantile_nested_base_x_train_{}_cv{}_iter{}_seed{}.npy'.format(*exp_name), x_train[cv_split[0][0]]) 302 | np.save(eparams.DATA_PATH + eparams.log_id +'quantile_nested_base_y_train_{}_cv{}_iter{}_seed{}.npy'.format(*exp_name), y_train[cv_split[0][0]]) 303 | 304 | print('Val data:', x_train[cv_split[0][1]].shape, y_train[cv_split[0][1]].shape) 305 | np.save(eparams.DATA_PATH + eparams.log_id +'quantile_nested_base_x_val_{}_cv{}_iter{}_seed{}.npy'.format(*exp_name), x_train[cv_split[0][1]]) 306 | np.save(eparams.DATA_PATH + eparams.log_id +'quantile_nested_base_y_val_{}_cv{}_iter{}_seed{}.npy'.format(*exp_name), y_train[cv_split[0][1]]) 307 | 308 | print('Test data:', x_test.shape, y_test.shape) 309 | 310 | output = {'x_train':x_train, 311 | 'y_train':y_train, 312 | 'x_test' :x_test, 313 | 'y_test' :y_test, 314 | 'cv_split':cv_split, 315 | } 316 | return output 317 | 318 | def run_exp(task_id, 319 | model_name, 320 | num_iters=20, 321 | num_folds=5, 322 | rand_seed=1, 323 | device=-1, 324 | cleaned_data=None, 325 | eparams=None): 326 | 327 | # set seed 328 | random.seed(rand_seed) 329 | np.random.seed(rand_seed) 330 | torch.manual_seed(rand_seed) 331 | torch.cuda.manual_seed_all(rand_seed) 332 | if torch.cuda.is_available() and device > -1: 333 | torch.backends.cudnn.deterministic = True 334 | torch.backends.cudnn.benchmark = False 335 | 336 | exp_name = [task_id, num_folds, num_iters, rand_seed] 337 | 338 | # set k-Folder for out of fold prediction (no need shuffle) 339 | kfolder = KFold(n_splits=num_folds) 340 | 341 | model, exp_fn = None, None 342 | if model_name == 'cgn': 343 | model = QuantileConditionalGaussianNetwork 344 | exp_fn = run_neural 345 | 346 | elif model_name == 'sqr': 347 | model = QuantileSingleNeuralNetwork 348 | exp_fn = run_neural 349 | 350 | elif model_name == 'mqr': 351 | model = QuantileJointNeuralNetwork 352 | exp_fn = run_neural 353 | 354 | elif model_name == 'rf': 355 | model = QuantileRandomForest 356 | exp_fn = run_tree 357 | 358 | elif model_name == 'xt': 359 | model = QuantileExtraTrees 360 | exp_fn = run_tree 361 | 362 | elif model_name == 'lgbm': 363 | model = QuantileLightGBM 364 | exp_fn = run_others 365 | 366 | elif model_name == 'qr': 367 | model = QuantileRegressor 368 | exp_fn = run_others 369 | 370 | # run exp 371 | oof_x_train, oof_y_train, oof_z_train, z_test, z_val = exp_fn(model=model, 372 | x_train=cleaned_data['x_train'].copy(), 373 | y_train=cleaned_data['y_train'].copy(), 374 | x_test=cleaned_data['x_test'].copy(), 375 | y_test=cleaned_data['y_test'].copy(), 376 | cv_split=copy.deepcopy(cleaned_data['cv_split']), 377 | kfolder=kfolder, 378 | num_iters=num_iters, 379 | rand_seed=rand_seed, 380 | device=device, 381 | eparams=eparams) 382 | 383 | # save 384 | print('Saving') 385 | print('z_test', z_test.shape) 
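# Filename sketch (illustrative values, not defaults enforced here): with model
# QuantileRandomForest, --task-id boston, --cv 5, --iter 20, --seed 1 and the default
# --log_id 'mylogid', the save below writes
#   ./output/data/mylogidquantile_nested_QuantileRandomForest_z_test_boston_cv5_iter20_seed1.npy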
386 | np.save(eparams.DATA_PATH + eparams.log_id +'quantile_nested_{}_z_test_{}_cv{}_iter{}_seed{}.npy'.format(model.__name__, *exp_name), z_test) 387 | print(eparams.log_id +'quantile_nested_{}_z_test_{}_cv{}_iter{}_seed{}.npy'.format(model.__name__, *exp_name)) 388 | 389 | print('z_val', z_val.shape) 390 | np.save(eparams.DATA_PATH + eparams.log_id +'quantile_nested_{}_z_val_{}_cv{}_iter{}_seed{}.npy'.format(model.__name__, *exp_name), z_val) 391 | print(eparams.log_id +'quantile_nested_{}_z_val_{}_cv{}_iter{}_seed{}.npy'.format(model.__name__, *exp_name)) 392 | 393 | with open(eparams.DATA_PATH + eparams.log_id +'quantile_nested_{}_oof_x_train_{}_cv{}_iter{}_seed{}.pkl'.format(model.__name__, *exp_name), 'wb') as handle: 394 | pkl.dump(oof_x_train, handle, protocol=pkl.HIGHEST_PROTOCOL) 395 | print(eparams.log_id +'quantile_nested_{}_oof_x_train_{}_cv{}_iter{}_seed{}.pkl'.format(model.__name__, *exp_name)) 396 | 397 | with open(eparams.DATA_PATH + eparams.log_id +'quantile_nested_{}_oof_y_train_{}_cv{}_iter{}_seed{}.pkl'.format(model.__name__, *exp_name), 'wb') as handle: 398 | pkl.dump(oof_y_train, handle, protocol=pkl.HIGHEST_PROTOCOL) 399 | print(eparams.log_id +'quantile_nested_{}_oof_y_train_{}_cv{}_iter{}_seed{}.pkl'.format(model.__name__, *exp_name)) 400 | 401 | with open(eparams.DATA_PATH + eparams.log_id +'quantile_nested_{}_oof_z_train_{}_cv{}_iter{}_seed{}.pkl'.format(model.__name__, *exp_name), 'wb') as handle: 402 | pkl.dump(oof_z_train, handle, protocol=pkl.HIGHEST_PROTOCOL) 403 | print(eparams.log_id +'quantile_nested_{}_oof_z_train_{}_cv{}_iter{}_seed{}.pkl'.format(model.__name__, *exp_name)) 404 | 405 | if __name__ == "__main__": 406 | # arguments 407 | parser = argparse.ArgumentParser() 408 | 409 | # parser 410 | parser.add_argument('--task-id', type=str, help='task id') 411 | parser.add_argument('--seed', type=int, default=1, help='random seed') 412 | parser.add_argument('--gpu', type=int, default=0, help='gpu') 413 | parser.add_argument('--cv', type=int, default=5, help='folds for out of fold predictions') 414 | parser.add_argument('--iter', type=int, default=20, help='number of iterations for grid search') 415 | parser.add_argument('--trans_type', default='pava') 416 | parser.add_argument('--use_grad', default=False, action='store_true') 417 | parser.add_argument('--use_margin', default=False, action='store_true') 418 | parser.add_argument('--DATA_PATH', default='./output/data/') 419 | parser.add_argument('--log_id', default='mylogid') 420 | parser.add_argument('--data_loc', default='./data/dataset/') 421 | parser.add_argument('--margin_type', type=str, default='single', help='margin type') 422 | 423 | 424 | args = parser.parse_args() 425 | print('------------') 426 | print(args.__dict__) 427 | print('------------') 428 | 429 | make_dir(args.DATA_PATH) 430 | fname_json = os.path.join(args.DATA_PATH, args.log_id + '-t' + \ 431 | str(args.task_id) + '_cv' + str(args.cv) + \ 432 | '_i' + str(args.iter) + \ 433 | '_S' + str(args.seed) + '.json' ) 434 | dump_to_json(fname_json, {'args': args.__dict__}) 435 | print(fname_json) 436 | 437 | # first prepare data 438 | cleaned_dt = prepare_data(task_id=args.task_id, 439 | num_iters=args.iter, 440 | num_folds=args.cv, 441 | rand_seed=args.seed, 442 | eparams=args) 443 | print('---------------------------------------------------') 444 | print() 445 | # run 446 | 447 | model_list = ['cgn', 'sqr', 'mqr', 'rf', 'xt', 'lgbm'] 448 | 449 | for model in model_list: 450 | print(model , '...') 451 | run_exp(task_id=args.task_id, 452 | 
model_name=model, 453 | num_iters=args.iter, 454 | num_folds=args.cv, 455 | rand_seed=args.seed, 456 | device=args.gpu, 457 | cleaned_data=cleaned_dt, 458 | eparams=args, 459 | ) 460 | print('---------------------------------------------------') 461 | 462 | print('Done.') 463 | 464 | -------------------------------------------------------------------------------- /model/neural_network.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import random 3 | import torch 4 | import numpy as np 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from scipy.stats import norm 8 | from util.metric import pinball_loss, huber_loss, margin_loss, pinball_loss_np 9 | from util.misc import fix_crossing, pava_forward, sort_forward 10 | from torch.utils.data import DataLoader, TensorDataset 11 | from sklearn.model_selection import ParameterSampler 12 | EVAL_STEPS = 10 13 | STOP_STEPS = EVAL_STEPS * 50 14 | QUANTILE_LOSS_PARAM_GRID = {'alpha': [0.0, 0.01], 15 | 'lr': [1e-3, 3e-4], 16 | 'wd': [1e-5, 1e-7], 17 | 'weight': [0.0, 1.0]} 18 | QUANTILE_MARGIN_PARAM_GRID = {'margin': [0.0, 0.5, 1.0, 5.0], 19 | 'scale': [1e-3, 1e-4]} 20 | 21 | QUANTILE_MARGIN_PARAM_GRID_FIX = {'margin': [0.0, 0.5, 1.0, 5.0], 22 | 'margin_delta': [0.0001, 0.001, 0.005, 0.0075, 0.0099]} 23 | 24 | MEAN_LOSS_PARAM_GRID = {'lr': [1e-3, 1e-4], 25 | 'wd': [1e-5, 1e-7]} 26 | NETWORK_PARAM_GRID = {'hidden_size': [64, 128], 27 | 'num_layers': [2, 3], 28 | 'dropout': [0.0, 0.05, 0.1]} 29 | 30 | 31 | class NeuralJointQuantileRegressor(nn.Module): 32 | def __init__(self, 33 | quantile_list, 34 | input_size, 35 | hidden_size=64, 36 | num_layers=3, 37 | dropout=0.0, 38 | activation='elu', 39 | use_grad=True, 40 | trans_type='mono', 41 | use_margin=True, 42 | margin_type=''): 43 | super(NeuralJointQuantileRegressor, self).__init__() 44 | # quantile list to handle 45 | self.num_quantiles = len(quantile_list) 46 | self.register_buffer('quantile_list', torch.Tensor(quantile_list).float()) 47 | 48 | # activation 49 | act_fn = nn.ELU() 50 | if activation == 'elu': 51 | act_fn = nn.ELU() 52 | elif activation == 'relu': 53 | act_fn = nn.ReLU() 54 | elif activation == 'tanh': 55 | act_fn = nn.Tanh() 56 | 57 | # network with predicting quantiles 58 | layers = [nn.Linear(input_size, hidden_size), act_fn] 59 | for _ in range(num_layers - 1): 60 | layers.append(nn.Dropout(dropout)) 61 | layers.append(nn.Linear(hidden_size, hidden_size)) 62 | layers.append(act_fn) 63 | layers.append(nn.Linear(hidden_size, self.num_quantiles)) 64 | self.network = nn.Sequential(*layers) 65 | 66 | # post-process 67 | self.use_grad = use_grad 68 | if trans_type is None: 69 | self.use_grad = False 70 | self.trans_type = trans_type 71 | self.use_margin = use_margin 72 | self.margin_type = margin_type 73 | 74 | #print('----------------') 75 | #print('NeuralJointQuantileRegressor') 76 | #print('use_grad: ', self.use_grad) 77 | #print('trans_type: ', self.trans_type) 78 | #print('use_margin: ', self.use_margin) 79 | #print('margin_type: ', self.margin_type) 80 | #print('----------------') 81 | 82 | def forward(self, input_data): 83 | # get output values 84 | output_data = self.network(input_data) 85 | return output_data 86 | 87 | def compute_loss(self, 88 | input_data, 89 | target_data, 90 | aux_data=None, 91 | margin=0.0, 92 | scale=0.0, 93 | alpha=0.0, 94 | weight=0.0, 95 | margin_delta=0.0): 96 | # train mode 97 | self.train() 98 | 99 | # margin loss 100 | if margin > 0.0 and self.use_margin: 101 | # include aux-data 
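# When auxiliary inputs (aux_data) are supplied, they are concatenated with the batch so
# the margin (non-crossing) penalty sees both; the quantile/huber loss further down is
# then computed only on the first batch_size rows, which are sliced back out below.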
102 | if aux_data is not None: 103 | batch_size = input_data.size()[0] 104 | full_input_data = torch.cat([input_data, aux_data], 0) 105 | predict_data = self(full_input_data) 106 | m_loss = margin_loss(predict_data, self.quantile_list * scale) 107 | predict_data = predict_data[:batch_size].contiguous() 108 | else: 109 | predict_data = self(input_data) 110 | if self.margin_type == 'single': 111 | m_loss = margin_loss(predict_data, margin_delta) 112 | 113 | else: 114 | m_loss = margin_loss(predict_data, self.quantile_list * scale) 115 | else: 116 | predict_data = self(input_data) 117 | m_loss = 0.0 118 | 119 | # fix crossing 120 | if self.use_grad: 121 | h_loss = weight * huber_loss(predict_data, target_data, self.quantile_list, alpha=alpha) 122 | if self.trans_type == 'mono': 123 | predict_data = fix_crossing(predict_data) 124 | 125 | elif self.trans_type == 'pava': 126 | predict_data = pava_forward(predict_data) 127 | 128 | elif self.trans_type == 'sort': 129 | predict_data = sort_forward(predict_data) 130 | 131 | else: 132 | NotImplementedError() 133 | h_loss += (1 - weight) * huber_loss(predict_data, target_data, self.quantile_list, alpha=alpha) 134 | 135 | else: 136 | h_loss = huber_loss(predict_data, target_data, self.quantile_list, alpha=alpha) 137 | 138 | # combine 139 | return h_loss + margin * m_loss 140 | 141 | def eval_loss(self, input_data, target_data): 142 | self.eval() 143 | with torch.no_grad(): 144 | predict_data = self(input_data) 145 | 146 | if self.trans_type == 'mono': 147 | predict_data = fix_crossing(predict_data) 148 | 149 | elif self.trans_type == 'pava': 150 | predict_data = pava_forward(predict_data) 151 | 152 | elif self.trans_type == 'sort': 153 | predict_data = sort_forward(predict_data)#torch.sort(predict_data, -1)[0] 154 | 155 | return pinball_loss(predict_data, target_data, self.quantile_list).item() 156 | 157 | def predict(self, input_data): 158 | self.eval() 159 | with torch.no_grad(): 160 | predict_data = self(input_data) 161 | if self.trans_type == 'mono': 162 | predict_data = fix_crossing(predict_data) 163 | 164 | elif self.trans_type == 'pava': 165 | predict_data = pava_forward(predict_data) 166 | 167 | elif self.trans_type == 'sort': 168 | predict_data = sort_forward(predict_data)#torch.sort(predict_data, -1)[0] 169 | 170 | return predict_data.data.cpu().numpy() 171 | 172 | class NeuralSingleQuantileRegressor(nn.Module): 173 | def __init__(self, 174 | input_size, 175 | hidden_size=64, 176 | num_layers=3, 177 | dropout=0.0, 178 | activation='elu'): 179 | super(NeuralSingleQuantileRegressor, self).__init__() 180 | # activation 181 | act_fn = nn.ELU() 182 | if activation == 'elu': 183 | act_fn = nn.ELU() 184 | elif activation == 'relu': 185 | act_fn = nn.ReLU() 186 | elif activation == 'tanh': 187 | act_fn = nn.Tanh() 188 | 189 | # network with predicting quantiles 190 | layers = [nn.Linear(input_size + 1, hidden_size), act_fn] 191 | for _ in range(num_layers - 1): 192 | layers.append(nn.Dropout(dropout)) 193 | layers.append(nn.Linear(hidden_size, hidden_size)) 194 | layers.append(act_fn) 195 | 196 | layers.append(nn.Linear(hidden_size, 1)) 197 | self.network = nn.Sequential(*layers) 198 | 199 | def forward(self, input_data, quantile_data): 200 | quantile_data = quantile_data.reshape(-1, 1) 201 | assert quantile_data.size()[0] == input_data.size()[0] 202 | 203 | # get output values 204 | output_data = self.network(torch.cat([input_data, quantile_data - 0.5], 1)) 205 | return output_data 206 | 207 | def compute_loss(self, input_data, target_data, 
quantile_data, alpha=0.01): 208 | # train mode 209 | self.train() 210 | 211 | # prediction 212 | predict_data = self(input_data, quantile_data) 213 | 214 | # compute huber loss 215 | return huber_loss(predict_data, target_data, quantile_data, alpha) 216 | 217 | def eval_loss(self, input_data, target_data, quantile_data): 218 | self.eval() 219 | with torch.no_grad(): 220 | predict_data = self(input_data, quantile_data) 221 | return pinball_loss(predict_data, target_data, quantile_data).item() 222 | 223 | def predict(self, input_data, quantile_data): 224 | self.eval() 225 | with torch.no_grad(): 226 | predict_data = self(input_data, quantile_data) 227 | return predict_data.data.cpu().numpy() 228 | 229 | 230 | class NeuralCondtionalGaussian(nn.Module): 231 | def __init__(self, 232 | input_size, 233 | hidden_size=64, 234 | num_layers=3, 235 | dropout=0.0, 236 | activation='elu'): 237 | 238 | super(NeuralCondtionalGaussian, self).__init__() 239 | # activation 240 | act_fn = nn.ELU() 241 | if activation == 'elu': 242 | act_fn = nn.ELU() 243 | elif activation == 'relu': 244 | act_fn = nn.ReLU() 245 | elif activation == 'tanh': 246 | act_fn = nn.Tanh() 247 | 248 | # network with predicting quantiles 249 | layers = [nn.Linear(input_size, hidden_size), act_fn] 250 | for _ in range(num_layers - 1): 251 | layers.append(nn.Dropout(dropout)) 252 | layers.append(nn.Linear(hidden_size, hidden_size)) 253 | layers.append(act_fn) 254 | 255 | layers.append(nn.Linear(hidden_size, 2)) 256 | self.network = nn.Sequential(*layers) 257 | 258 | def forward(self, input_data): 259 | output_data = self.network(input_data) 260 | mean_data = output_data[:, 0].reshape(-1, 1) 261 | var_data = F.softplus(output_data[:, 1].reshape(-1, 1)) + 1e-6 262 | return mean_data, var_data 263 | 264 | def compute_loss(self, input_data, target_data): 265 | # train mode 266 | self.train() 267 | 268 | # prediction 269 | mean_data, var_data = self(input_data) 270 | 271 | # compute negative log-likelihood 272 | nll_loss = torch.pow(target_data - mean_data, 2).div(2 * var_data) 273 | nll_loss += var_data.log().div(2) 274 | return nll_loss.mean() 275 | 276 | def eval_loss(self, input_data, target_data, quantile_data=None): 277 | if quantile_data is None: 278 | self.eval() 279 | with torch.no_grad(): 280 | mean_data, var_data = self(input_data) 281 | error_data = target_data - mean_data 282 | return torch.sqrt(torch.mean(error_data * error_data)).item() 283 | else: 284 | predict_data = self.predict(input_data, quantile_data) 285 | target_data = target_data.data.cpu().numpy() 286 | return pinball_loss_np(predict_data, target_data, quantile_data, True) 287 | 288 | def predict(self, input_data, quantile_data=None): 289 | self.eval() 290 | with torch.no_grad(): 291 | mean_data, var_data = self(input_data) 292 | if quantile_data is None: 293 | return mean_data.data.cpu().numpy() 294 | else: 295 | std_data = torch.sqrt(var_data).data.cpu().numpy() 296 | ppf_data = norm.ppf(quantile_data).reshape(1, -1) 297 | predict_data = mean_data.data.cpu().numpy() + std_data * ppf_data 298 | return predict_data 299 | 300 | 301 | class QuantileJointNeuralNetwork: 302 | def __init__(self, 303 | quantile_list, 304 | num_iters, 305 | cv_split, 306 | batch_size=64, 307 | use_grad=True, 308 | trans_type='mono', 309 | use_margin=True, 310 | rand_seed=1, 311 | device=-1, 312 | margin_type='', 313 | **kwargs): 314 | 315 | self.num_iters = num_iters 316 | self.cv_split = cv_split 317 | self.quantile_list = quantile_list 318 | self.use_grad = use_grad 319 | self.trans_type 
= trans_type 320 | self.use_margin = use_margin 321 | self.rand_seed = rand_seed 322 | 323 | self.input_size = None 324 | self.batch_size = batch_size 325 | 326 | self.best_model = None 327 | self.best_params = None 328 | 329 | self.margin_type = margin_type 330 | 331 | print('----------------') 332 | print('QuantileJointNeuralNetwork') 333 | print('use_grad: ', self.use_grad) 334 | print('trans_type: ', self.trans_type) 335 | print('use_margin: ', self.use_margin) 336 | print('margin_type: ', self.margin_type) 337 | print('num_iters: ', self.num_iters) 338 | print('----------------') 339 | 340 | if torch.cuda.is_available() and device > -1: 341 | self.device = torch.device("cuda:{}".format(device)) 342 | else: 343 | self.device = torch.device("cpu") 344 | 345 | def fit(self, x_train, y_train, ax_train=None): 346 | # train models 347 | x_train = torch.FloatTensor(x_train).to(self.device) 348 | y_train = torch.FloatTensor(y_train.reshape(-1, 1)).to(self.device) 349 | 350 | if ax_train is not None: 351 | ax_train = torch.FloatTensor(ax_train).to(self.device) 352 | 353 | # get input size 354 | self.input_size = x_train.size()[1] 355 | 356 | # build params list 357 | full_param_grid = {**NETWORK_PARAM_GRID, 358 | **QUANTILE_LOSS_PARAM_GRID} 359 | 360 | if self.use_margin == True and self.margin_type == 'single': 361 | full_param_grid = {**full_param_grid, 362 | **QUANTILE_MARGIN_PARAM_GRID_FIX} 363 | 364 | elif self.use_margin: 365 | full_param_grid = {**full_param_grid, 366 | **QUANTILE_MARGIN_PARAM_GRID} 367 | 368 | if not self.use_grad or self.trans_type in ['pava', 'sort']: 369 | full_param_grid['weight'] = [0.0] 370 | 371 | params_list = list(ParameterSampler(param_distributions=full_param_grid, 372 | n_iter=self.num_iters, 373 | random_state=self.rand_seed)) 374 | 375 | # set data loader 376 | train_loader = DataLoader(TensorDataset(x_train[self.cv_split[0][0]], 377 | y_train[self.cv_split[0][0]]), 378 | shuffle=True, 379 | batch_size=self.batch_size, 380 | drop_last=False, 381 | worker_init_fn=np.random.seed(self.rand_seed)) 382 | 383 | valid_loader = DataLoader(TensorDataset(x_train[self.cv_split[0][1]], 384 | y_train[self.cv_split[0][1]]), 385 | shuffle=False, 386 | batch_size=1024, 387 | drop_last=False) 388 | 389 | if ax_train is not None: 390 | aux_loader = DataLoader(dataset=TensorDataset(torch.cat([ax_train, x_train[self.cv_split[0][1]]], 0)), 391 | shuffle=True, 392 | batch_size=self.batch_size, 393 | drop_last=False, 394 | worker_init_fn=np.random.seed(self.rand_seed)) 395 | else: 396 | aux_loader = None 397 | 398 | # for each param 399 | best_eval_loss = np.inf 400 | best_eval_step = 0 401 | for p, params in enumerate(params_list): 402 | print('iter:',p, ' ', params) 403 | # fit model with given data and params 404 | eval_loss, eval_step = self.fit_model(train_loader, valid_loader, aux_loader, **params) 405 | if eval_loss < best_eval_loss: 406 | best_eval_loss = eval_loss 407 | best_eval_step = eval_step 408 | self.best_params = params 409 | 410 | print('eval_loss : %.4f, best_eval_sofar: %.4f, eval_step: %d' %(eval_loss, best_eval_loss, eval_step)) 411 | print() 412 | 413 | self.best_params['num_steps'] = best_eval_step 414 | 415 | # retrain model with train split with best hyper-params 416 | train_loader = DataLoader(TensorDataset(x_train[self.cv_split[0][0]], 417 | y_train[self.cv_split[0][0]]), 418 | shuffle=True, 419 | batch_size=self.batch_size, 420 | drop_last=False, 421 | worker_init_fn=np.random.seed(self.rand_seed)) 422 | 423 | if ax_train is not None: 424 | 
aux_loader = DataLoader(TensorDataset(ax_train), 425 | shuffle=True, 426 | batch_size=self.batch_size, 427 | drop_last=False, 428 | worker_init_fn=np.random.seed(self.rand_seed)) 429 | else: 430 | aux_loader = None 431 | 432 | print('best_params:', self.best_params) 433 | self.best_model = self.fit_model(train_loader, None, aux_loader, **self.best_params) 434 | 435 | def fit_model(self, 436 | train_loader, 437 | valid_loader=None, 438 | aux_loader=None, 439 | hidden_size=64, 440 | num_layers=3, 441 | dropout=0.5, 442 | activation='elu', 443 | lr=1e-3, 444 | wd=1e-5, 445 | num_steps=None, 446 | margin=0.0, 447 | scale=0.0, 448 | alpha=0.0, 449 | weight=0.0, 450 | margin_delta=0.0): 451 | 452 | # init model 453 | random.seed(self.rand_seed) 454 | if self.device == torch.device("cpu"): 455 | torch.manual_seed(self.rand_seed) 456 | 457 | else: 458 | torch.cuda.manual_seed_all(self.rand_seed) 459 | 460 | model = NeuralJointQuantileRegressor(quantile_list=self.quantile_list, 461 | input_size=self.input_size, 462 | hidden_size=hidden_size, 463 | num_layers=num_layers, 464 | dropout=dropout, 465 | activation=activation, 466 | use_grad=self.use_grad, 467 | trans_type=self.trans_type, 468 | use_margin=self.use_margin, 469 | margin_type=self.margin_type) 470 | model = model.to(self.device) 471 | 472 | # init optimizer 473 | optimizer = torch.optim.Adam(params=model.parameters(), 474 | lr=lr, weight_decay=wd, amsgrad=True) 475 | 476 | # init aux_loader 477 | if aux_loader is None: 478 | aux_loader_iterator = None 479 | else: 480 | aux_loader_iterator = iter(aux_loader) 481 | 482 | # for each update 483 | steps = 0 484 | best_valid_loss = np.inf 485 | best_step = 0 486 | while True: 487 | # for each batch (update) 488 | for x_batch, y_batch in train_loader: 489 | # aux data 490 | if aux_loader_iterator is None: 491 | aux_batch = None 492 | else: 493 | try: 494 | aux_batch = next(aux_loader_iterator)[0] 495 | except StopIteration: 496 | aux_loader_iterator = iter(aux_loader) 497 | aux_batch = next(aux_loader_iterator)[0] 498 | 499 | # compute loss 500 | weight = weight * (np.cos(min((steps / float(STOP_STEPS)), 1.0) * np.pi) + 1) * 0.5 501 | batch_loss = model.compute_loss(input_data=x_batch, 502 | target_data=y_batch, 503 | aux_data=aux_batch, 504 | margin=margin, 505 | scale=scale, 506 | alpha=alpha, 507 | weight=weight, 508 | margin_delta=margin_delta) 509 | 510 | # backprop and update 511 | optimizer.zero_grad() 512 | batch_loss.backward() 513 | optimizer.step() 514 | 515 | # step up 516 | steps += 1 517 | 518 | # validate 519 | if steps % 100 == 0 and valid_loader is not None: 520 | valid_loss = 0.0 521 | valid_size = 0.0 522 | 523 | for x_batch, y_batch in valid_loader: 524 | batch_size = x_batch.size()[0] 525 | batch_loss = model.eval_loss(input_data=x_batch, target_data=y_batch) 526 | valid_loss += batch_loss * batch_size 527 | valid_size += batch_size 528 | valid_loss /= valid_size 529 | 530 | if best_valid_loss > valid_loss: 531 | best_valid_loss = valid_loss 532 | best_step = steps 533 | elif steps - best_step >= STOP_STEPS: 534 | return best_valid_loss, best_step 535 | elif num_steps is not None and steps >= num_steps: 536 | assert valid_loader is None 537 | return copy.deepcopy(model) 538 | 539 | def predict(self, x_data): 540 | x_data = torch.FloatTensor(x_data).to(self.device) 541 | y_pred = self.best_model.predict(x_data) 542 | return y_pred 543 | 544 | def refit_model(self, x_train, y_train, ax_train=None): 545 | x_train = torch.FloatTensor(x_train).to(self.device) 546 | y_train = 
torch.FloatTensor(y_train.reshape(-1, 1)).to(self.device) 547 | 548 | if ax_train is not None: 549 | ax_train = torch.FloatTensor(ax_train).to(self.device) 550 | 551 | train_loader = DataLoader(TensorDataset(x_train, y_train), 552 | shuffle=True, 553 | batch_size=self.batch_size, 554 | drop_last=False) 555 | if ax_train is not None: 556 | aux_loader = DataLoader(TensorDataset(ax_train), 557 | shuffle=True, 558 | batch_size=self.batch_size, 559 | drop_last=False) 560 | else: 561 | aux_loader = None 562 | self.best_model = self.fit_model(train_loader, None, aux_loader, **self.best_params) 563 | 564 | 565 | class QuantileSingleNeuralNetwork: 566 | def __init__(self, 567 | num_iters, 568 | cv_split, 569 | quantile_list, 570 | batch_size=64, 571 | rand_seed=111, 572 | device=-1, 573 | **kwargs): 574 | 575 | self.num_iters = num_iters 576 | self.cv_split = cv_split 577 | self.quantile_list = quantile_list 578 | self.rand_seed = rand_seed 579 | 580 | self.input_size = None 581 | self.batch_size = batch_size 582 | 583 | self.best_model = None 584 | self.best_params = None 585 | 586 | if torch.cuda.is_available() and device > -1: 587 | self.device = torch.device("cuda:{}".format(device)) 588 | else: 589 | self.device = torch.device("cpu") 590 | 591 | def fit(self, x_train, y_train, ax_train=None): 592 | # train models 593 | x_train = torch.FloatTensor(x_train).float().to(self.device) 594 | y_train = torch.FloatTensor(y_train.reshape(-1, 1)).float().to(self.device) 595 | 596 | # get input size 597 | self.input_size = x_train.size()[1] 598 | 599 | # build params list 600 | full_param_grid = {**NETWORK_PARAM_GRID, 601 | **QUANTILE_LOSS_PARAM_GRID} 602 | del full_param_grid['weight'] 603 | params_list = list(ParameterSampler(param_distributions=full_param_grid, 604 | n_iter=self.num_iters, 605 | random_state=self.rand_seed)) 606 | 607 | # set data loader 608 | train_loader = DataLoader(TensorDataset(x_train[self.cv_split[0][0]], 609 | y_train[self.cv_split[0][0]]), 610 | shuffle=True, 611 | batch_size=self.batch_size, 612 | drop_last=False) 613 | 614 | valid_loader = DataLoader(TensorDataset(x_train[self.cv_split[0][1]], 615 | y_train[self.cv_split[0][1]]), 616 | shuffle=False, 617 | batch_size=1024, 618 | drop_last=False) 619 | 620 | # for each param 621 | best_eval_loss = np.inf 622 | best_eval_step = 0 623 | for p, params in enumerate(params_list): 624 | # fit model with given data and params 625 | eval_loss, eval_step = self.fit_model(train_loader, valid_loader, **params) 626 | if eval_loss < best_eval_loss: 627 | best_eval_loss = eval_loss 628 | best_eval_step = eval_step 629 | self.best_params = params 630 | self.best_params['num_steps'] = best_eval_step 631 | 632 | # retrain model train split not full data 633 | #train_loader = DataLoader(TensorDataset(x_train, y_train), 634 | # shuffle=True, batch_size=self.batch_size, drop_last=True) 635 | 636 | train_loader = DataLoader(TensorDataset(x_train[self.cv_split[0][0]], 637 | y_train[self.cv_split[0][0]]), 638 | shuffle=True, 639 | batch_size=self.batch_size, 640 | drop_last=False) 641 | 642 | self.best_model = self.fit_model(train_loader, None, **self.best_params) 643 | 644 | def fit_model(self, train_loader, valid_loader=None, 645 | hidden_size=64, num_layers=3, dropout=0.5, activation='elu', 646 | lr=1e-3, wd=1e-5, num_steps=None, alpha=0.01): 647 | # init model 648 | random.seed(self.rand_seed) 649 | if self.device == torch.device("cpu"): 650 | torch.manual_seed(self.rand_seed) 651 | else: 652 | torch.cuda.manual_seed_all(self.rand_seed) 653 | 
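# NeuralSingleQuantileRegressor takes the target quantile level as an extra input feature
# (hence input_size + 1 in its first layer); training below samples a random level per
# example, and predict() later sweeps the fixed quantile_list.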
654 | model = NeuralSingleQuantileRegressor(input_size=self.input_size, 655 | hidden_size=hidden_size, 656 | num_layers=num_layers, 657 | dropout=dropout, 658 | activation=activation) 659 | model = model.to(self.device) 660 | 661 | # init optimizer 662 | optimizer = torch.optim.Adam(params=model.parameters(), lr=lr, weight_decay=wd, amsgrad=True) 663 | 664 | # for each update 665 | steps = 0 666 | best_valid_loss = np.inf 667 | best_step = 0 668 | while True: 669 | # for each batch (update) 670 | for x_batch, y_batch in train_loader: 671 | # sample quantile 672 | batch_size = x_batch.size()[0] 673 | q_batch = torch.rand(batch_size, 1).to(self.device) 674 | q_batch = torch.clamp(q_batch, 0.001, 0.999) 675 | 676 | # compute loss 677 | batch_loss = model.compute_loss(input_data=x_batch, target_data=y_batch, 678 | quantile_data=q_batch, alpha=alpha) 679 | 680 | # backprop and update 681 | optimizer.zero_grad() 682 | batch_loss.backward() 683 | optimizer.step() 684 | 685 | # step up 686 | steps += 1 687 | 688 | # validate 689 | if steps % 100 == 0 and valid_loader is not None: 690 | valid_loss = 0.0 691 | valid_size = 0.0 692 | 693 | for x_batch, y_batch in valid_loader: 694 | batch_size = x_batch.size()[0] 695 | for q in self.quantile_list: 696 | q_batch = q * torch.ones(batch_size, 1).to(self.device) 697 | batch_loss = model.eval_loss(input_data=x_batch, target_data=y_batch, quantile_data=q_batch) 698 | valid_loss += batch_loss * batch_size 699 | valid_size += batch_size 700 | valid_loss /= valid_size 701 | 702 | if best_valid_loss > valid_loss: 703 | best_valid_loss = valid_loss 704 | best_step = steps 705 | elif steps - best_step >= STOP_STEPS: 706 | return best_valid_loss, best_step 707 | elif num_steps is not None and steps >= num_steps: 708 | assert valid_loader is None 709 | return copy.deepcopy(model) 710 | 711 | def predict(self, x_data): 712 | x_data = torch.FloatTensor(x_data).to(self.device) 713 | batch_size = x_data.size()[0] 714 | y_pred_list = [] 715 | for q in self.quantile_list: 716 | q_data = q * torch.ones(batch_size, 1).to(self.device) 717 | y_pred = self.best_model.predict(x_data, q_data) 718 | y_pred_list.append(y_pred) 719 | return np.concatenate(y_pred_list, 1) 720 | 721 | def refit_model(self, x_train, y_train, ax_train=None): 722 | x_train = torch.FloatTensor(x_train).to(self.device) 723 | y_train = torch.FloatTensor(y_train.reshape(-1, 1)).to(self.device) 724 | 725 | train_loader = DataLoader(TensorDataset(x_train, y_train), 726 | shuffle=True, 727 | batch_size=self.batch_size, 728 | drop_last=False) 729 | 730 | self.best_model = self.fit_model(train_loader, None, **self.best_params) 731 | 732 | 733 | class QuantileConditionalGaussianNetwork: 734 | def __init__(self, 735 | num_iters, 736 | cv_split, 737 | quantile_list, 738 | batch_size=64, 739 | rand_seed=111, 740 | device=-1, 741 | **kwargs): 742 | self.num_iters = num_iters 743 | self.cv_split = cv_split 744 | self.quantile_list = quantile_list 745 | self.rand_seed = rand_seed 746 | 747 | self.input_size = None 748 | self.batch_size = batch_size 749 | 750 | self.best_model = None 751 | self.best_params = None 752 | 753 | if torch.cuda.is_available() and device > -1: 754 | self.device = torch.device("cuda:{}".format(device)) 755 | else: 756 | self.device = torch.device("cpu") 757 | 758 | def fit(self, x_train, y_train, ax_train=None): 759 | # train models 760 | x_train = torch.FloatTensor(x_train).to(self.device) 761 | y_train = torch.FloatTensor(y_train.reshape(-1, 1)).to(self.device) 762 | 763 | # get input size 
764 | self.input_size = x_train.size()[1] 765 | 766 | # build params list 767 | full_param_grid = NETWORK_PARAM_GRID 768 | params_list = list(ParameterSampler(param_distributions=full_param_grid, 769 | n_iter=self.num_iters, 770 | random_state=self.rand_seed)) 771 | 772 | # set data loader 773 | train_loader = DataLoader(TensorDataset(x_train[self.cv_split[0][0]], 774 | y_train[self.cv_split[0][0]]), 775 | shuffle=True, 776 | batch_size=self.batch_size, 777 | drop_last=False) 778 | 779 | valid_loader = DataLoader(TensorDataset(x_train[self.cv_split[0][1]], 780 | y_train[self.cv_split[0][1]]), 781 | shuffle=False, 782 | batch_size=1024, 783 | drop_last=False) 784 | 785 | # for each param 786 | best_eval_loss = np.inf 787 | best_eval_step = 0 788 | for p, params in enumerate(params_list): 789 | # fit model with given data and params 790 | eval_loss, eval_step = self.fit_model(train_loader, valid_loader, **params) 791 | if eval_loss < best_eval_loss: 792 | best_eval_loss = eval_loss 793 | best_eval_step = eval_step 794 | self.best_params = params 795 | self.best_params['num_steps'] = best_eval_step 796 | 797 | # retrain model with only train split not full data 798 | # train_loader = DataLoader(TensorDataset(x_train, y_train), 799 | # shuffle=True, batch_size=self.batch_size, drop_last=True) 800 | train_loader = DataLoader(TensorDataset(x_train[self.cv_split[0][0]], 801 | y_train[self.cv_split[0][0]]), 802 | shuffle=True, 803 | batch_size=self.batch_size, 804 | drop_last=False) 805 | 806 | print('best_params:', self.best_params) 807 | self.best_model = self.fit_model(train_loader, None, **self.best_params) 808 | 809 | def fit_model(self, 810 | train_loader, 811 | valid_loader=None, 812 | hidden_size=64, 813 | num_layers=3, 814 | dropout=0.5, 815 | activation='elu', 816 | lr=1e-3, 817 | wd=1e-5, 818 | num_steps=None): 819 | 820 | # init model 821 | random.seed(self.rand_seed) 822 | if self.device == torch.device("cpu"): 823 | torch.manual_seed(self.rand_seed) 824 | else: 825 | torch.cuda.manual_seed_all(self.rand_seed) 826 | 827 | model = NeuralCondtionalGaussian(input_size=self.input_size, 828 | hidden_size=hidden_size, 829 | num_layers=num_layers, 830 | dropout=dropout, 831 | activation=activation) 832 | model = model.to(self.device) 833 | 834 | # init optimizer 835 | optimizer = torch.optim.Adam(params=model.parameters(), lr=lr, weight_decay=wd, amsgrad=True) 836 | 837 | # for each update 838 | steps = 0 839 | best_valid_loss = np.inf 840 | best_step = 0 841 | while True: 842 | # for each batch (update) 843 | for x_batch, y_batch in train_loader: 844 | # compute loss 845 | batch_loss = model.compute_loss(input_data=x_batch, target_data=y_batch) 846 | 847 | # backprop and update 848 | optimizer.zero_grad() 849 | batch_loss.backward() 850 | optimizer.step() 851 | 852 | # step up 853 | steps += 1 854 | 855 | # validate 856 | if steps % 100 == 0 and valid_loader is not None: 857 | valid_loss = 0.0 858 | valid_size = 0.0 859 | for x_batch, y_batch in valid_loader: 860 | batch_size = x_batch.size()[0] 861 | batch_loss = model.eval_loss(input_data=x_batch, target_data=y_batch, 862 | quantile_data=self.quantile_list) 863 | valid_loss += batch_loss * batch_size 864 | valid_size += batch_size 865 | valid_loss /= valid_size 866 | 867 | if best_valid_loss > valid_loss: 868 | best_valid_loss = valid_loss 869 | best_step = steps 870 | elif steps - best_step >= STOP_STEPS: 871 | return best_valid_loss, best_step 872 | elif num_steps is not None and steps >= num_steps: 873 | assert valid_loader is None 874 
| return copy.deepcopy(model) 875 | 876 | def predict(self, x_data): 877 | x_data = torch.FloatTensor(x_data).to(self.device) 878 | return self.best_model.predict(x_data, self.quantile_list) 879 | 880 | def refit_model(self, x_train, y_train, ax_train=None): 881 | x_train = torch.FloatTensor(x_train).to(self.device) 882 | y_train = torch.FloatTensor(y_train.reshape(-1, 1)).to(self.device) 883 | train_loader = DataLoader(TensorDataset(x_train, y_train), 884 | shuffle=True, 885 | batch_size=self.batch_size, 886 | drop_last=False) 887 | 888 | self.best_model = self.fit_model(train_loader, None, **self.best_params) 889 | 890 | -------------------------------------------------------------------------------- /model/neural_aggregator.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import torch 3 | import numpy as np 4 | import torch.nn as nn 5 | from util.metric import pinball_loss, huber_loss, margin_loss 6 | from util.misc import fix_crossing, pava_forward, sort_forward 7 | from torch.utils.data import DataLoader, TensorDataset 8 | from sklearn.model_selection import ParameterSampler 9 | EVAL_STEPS = 10 10 | STOP_STEPS = EVAL_STEPS * 50 11 | QUANTILE_LOSS_PARAM_GRID = {'alpha': [0.0], 12 | 'lr': [1e-3, 5e-4], 13 | 'wd': [1e-7], 14 | 'margin_weight': [0.5, 1.0, 2.0, 5.0, 10.0], 15 | 'margin_scale': [1e-1, 5e-2, 1e-2, 1e-3, 1e-4]} 16 | 17 | NETWORK_PARAM_GRID = {'hidden_size': [64, 128], 18 | 'num_layers': [2, 3], 19 | 'dropout': [0.0, 0.05, 0.1] 20 | } 21 | 22 | QUANTILE_LOSS_PARAM_GRID_FIX = {'alpha': [0.0], 23 | 'lr': [1e-3], 24 | 'wd': [1e-7], 25 | 'margin_weight': [0.5, 1.0, 2.0, 5.0, 10.0], 26 | 'margin_delta': [0.0001, 0.001, 0.005, 0.0075, 0.0099] 27 | } 28 | 29 | class QuantileGlobalAggregator(nn.Module): 30 | def __init__(self, 31 | num_models, # number of base models 32 | quantile_list, # list of quantile levels 33 | normalize=True, # normalize weights 34 | margin_list=None, # using margin 35 | trans_type=None, # apply non-crossing 36 | use_grad=True, # using non-crossing training 37 | share=False, # share between base models 38 | cross=False, # cross between quantile levels 39 | margin_type='', 40 | regularization_strength=1.0, 41 | ): 42 | super(QuantileGlobalAggregator, self).__init__() 43 | # model size 44 | self.num_models = num_models 45 | 46 | # quantile list to handle 47 | self.num_quantiles = len(quantile_list) 48 | self.register_buffer('quantiles', torch.FloatTensor(quantile_list)) 49 | 50 | # normalize weights 51 | self.normalize = normalize 52 | 53 | # post-process (monotnoizer) 54 | self.trans_type = trans_type 55 | self.use_grad = use_grad 56 | self.margin_list = margin_list 57 | self.margin_type = margin_type 58 | self.regularization_strength = regularization_strength 59 | 60 | # set weight 61 | self.share = share 62 | self.cross = cross 63 | self.model_type = None 64 | if self.share: 65 | self.weights = nn.Parameter(torch.zeros([1, self.num_models, 1])) 66 | self.model_type = 'Coarse' 67 | else: 68 | if self.cross: 69 | self.weights = nn.Parameter(torch.zeros([1, self.num_models * self.num_quantiles, self.num_quantiles])) 70 | self.model_type = 'Fine' 71 | else: 72 | self.weights = nn.Parameter(torch.zeros([1, self.num_models, self.num_quantiles])) 73 | self.model_type = 'Medium' 74 | 75 | # aggregate estimates 76 | def forward(self, input_data): 77 | # get convex weight (normalize over weights) 78 | if self.normalize: 79 | convex_weights = self.weights.softmax(1) 80 | else: 81 | convex_weights = self.weights 82 | 
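# Broadcasting sketch for the weighted sum below (illustrative names: batch size B,
# M base models, Q quantile levels): 'Coarse' applies weights of shape (1, M, 1) to
# inputs (B, M, Q); 'Medium' applies (1, M, Q); 'Fine' reshapes inputs to (B, M*Q, 1)
# against weights (1, M*Q, Q). Summing over dim 1 yields the aggregated (B, Q) quantiles.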
83 | # weight sum 84 | if self.share: 85 | output_data = input_data * convex_weights 86 | else: 87 | if self.cross: 88 | output_data = input_data.reshape(-1, self.num_models * self.num_quantiles, 1) * convex_weights 89 | else: 90 | output_data = input_data * convex_weights 91 | 92 | # aggregate 93 | output_data = torch.sum(output_data, 1) 94 | return output_data 95 | 96 | def compute_loss(self, 97 | input_data, 98 | target_data, 99 | aux_data=None, 100 | margin_weight=0.0, 101 | margin_scale=0.0, 102 | alpha=0.0, 103 | margin_delta=0): 104 | # train mode 105 | self.train() 106 | # get prediction and margin loss 107 | if margin_weight > 0.0 and (self.margin_list is not None or self.margin_type == 'single'): 108 | if aux_data is not None: 109 | batch_size = input_data.size()[0] 110 | full_input_data = torch.cat([input_data, aux_data], 0) 111 | predict_data = self(full_input_data) 112 | m_loss = margin_weight * margin_loss(predict_data, self.margin_list * margin_scale) 113 | predict_data = predict_data[:batch_size].contiguous() 114 | 115 | else: 116 | predict_data = self(input_data) 117 | if self.margin_type == 'single': 118 | m_loss = margin_weight * margin_loss(predict_data, margin_delta) 119 | 120 | else: 121 | m_loss = margin_weight * margin_loss(predict_data, self.margin_list * margin_scale) 122 | else: 123 | predict_data = self(input_data) 124 | m_loss = 0 125 | 126 | # back-prop through non-crossing 127 | if self.use_grad: 128 | if self.trans_type == 'pava': 129 | predict_data = pava_forward(predict_data) 130 | 131 | elif self.trans_type == 'mono': 132 | predict_data = fix_crossing(predict_data) 133 | 134 | elif self.trans_type == 'sort': 135 | predict_data = sort_forward(predict_data, self.regularization_strength) 136 | 137 | # pinball loss 138 | h_loss = huber_loss(predict_data, target_data, self.quantiles, alpha) 139 | return h_loss + m_loss 140 | 141 | def eval_loss(self, input_data, target_data): 142 | # evaluation mode 143 | self.eval() 144 | 145 | with torch.no_grad(): 146 | # get aggregated prediction 147 | predict_data = self(input_data) 148 | 149 | # monotonize 150 | if self.trans_type == 'pava': 151 | predict_data = pava_forward(predict_data) 152 | 153 | elif self.trans_type == 'mono': 154 | predict_data = fix_crossing(predict_data) 155 | 156 | elif self.trans_type == 'sort': 157 | predict_data = sort_forward(predict_data, self.regularization_strength) 158 | 159 | # compute pinball loss 160 | return pinball_loss(predict_data, target_data, self.quantiles).item() 161 | 162 | def predict(self, input_data): 163 | # evaluation mode 164 | self.eval() 165 | 166 | with torch.no_grad(): 167 | # get aggregated prediction 168 | predict_data = self(input_data) 169 | 170 | # monotonize 171 | if self.trans_type == 'pava': 172 | predict_data = pava_forward(predict_data) 173 | 174 | elif self.trans_type == 'mono': 175 | predict_data = fix_crossing(predict_data) 176 | 177 | elif self.trans_type == 'sort': 178 | predict_data = sort_forward(predict_data, self.regularization_strength) 179 | 180 | return predict_data.data.cpu().numpy() 181 | 182 | 183 | class QuantileLocalAggregator(nn.Module): 184 | def __init__(self, 185 | num_models, # number of base models 186 | quantile_list, # list of quantile levels 187 | input_size, # input feature data size 188 | hidden_size=64, # hidden size 189 | num_layers=3, # number of layers 190 | dropout=0.0, # drop out ratio 191 | activation='elu', # activation 192 | normalize=True, # normalize weights 193 | margin_list=None, # using margin 194 | 
trans_type=None, # apply non-crossing 195 | use_grad=True, # using non-crossing training 196 | share=False, # share between base models 197 | cross=False, # cross between quantile levels 198 | margin_type='', 199 | regularization_strength=1, 200 | ): 201 | 202 | super(QuantileLocalAggregator, self).__init__() 203 | # model size 204 | self.num_models = num_models 205 | 206 | # quantile list to handle 207 | self.num_quantiles = len(quantile_list) 208 | self.register_buffer('quantiles', torch.FloatTensor(quantile_list)) 209 | 210 | # normalize weights 211 | self.normalize = normalize 212 | 213 | # post-process (monotnoizer) 214 | self.trans_type = trans_type 215 | self.use_grad = use_grad 216 | self.margin_list = margin_list 217 | self.margin_type = margin_type 218 | self.regularization_strength = regularization_strength 219 | 220 | # set output size 221 | self.share = share 222 | self.cross = cross 223 | self.model_type = None 224 | if self.share: 225 | num_outputs = self.num_models 226 | self.model_type = 'Coarse' 227 | else: 228 | if self.cross: 229 | num_outputs = self.num_models * self.num_quantiles * self.num_quantiles 230 | self.model_type = 'Fine' 231 | else: 232 | num_outputs = self.num_models * self.num_quantiles 233 | self.model_type = 'Medium' 234 | 235 | # activation 236 | act_fn = nn.ELU() 237 | if activation == 'elu': 238 | act_fn = nn.ELU() 239 | elif activation == 'relu': 240 | act_fn = nn.ReLU() 241 | elif activation == 'tanh': 242 | act_fn = nn.Tanh() 243 | 244 | # network with predicting quantiles 245 | layers = [nn.Linear(input_size, hidden_size), act_fn] 246 | for _ in range(num_layers - 1): 247 | layers.append(nn.Dropout(dropout)) 248 | layers.append(nn.Linear(hidden_size, hidden_size)) 249 | layers.append(act_fn) 250 | 251 | layers.append(nn.Linear(hidden_size, num_outputs)) 252 | self.network = nn.Sequential(*layers) 253 | ''' 254 | if num_layers == 2: 255 | self.network = nn.Sequential( 256 | nn.Linear(input_size, hidden_size), 257 | act_fn, 258 | nn.BatchNorm1d(hidden_size, affine=False), 259 | nn.Linear(hidden_size, hidden_size), 260 | act_fn, 261 | nn.BatchNorm1d(hidden_size, affine=False), 262 | nn.Linear(hidden_size, num_outputs), 263 | ) 264 | 265 | elif num_layers == 3: 266 | self.network = nn.Sequential( 267 | nn.Linear(input_size, hidden_size), 268 | act_fn, 269 | nn.BatchNorm1d(hidden_size, affine=False), 270 | nn.Linear(hidden_size, hidden_size), 271 | act_fn, 272 | nn.BatchNorm1d(hidden_size, affine=False), 273 | nn.Linear(hidden_size, hidden_size), 274 | act_fn, 275 | nn.BatchNorm1d(hidden_size, affine=False), 276 | nn.Linear(hidden_size, num_outputs), 277 | ) 278 | else: 279 | self.network = nn.Sequential( 280 | nn.Linear(input_size, hidden_size), 281 | act_fn, 282 | nn.BatchNorm1d(hidden_size, affine=False), 283 | nn.Linear(hidden_size, hidden_size), 284 | act_fn, 285 | nn.BatchNorm1d(hidden_size, affine=False), 286 | nn.Linear(hidden_size, hidden_size), 287 | act_fn, 288 | nn.BatchNorm1d(hidden_size, affine=False), 289 | nn.Linear(hidden_size, hidden_size), 290 | act_fn, 291 | nn.BatchNorm1d(hidden_size, affine=False), 292 | nn.Linear(hidden_size, num_outputs), 293 | ) 294 | ''' 295 | #print(self.network) 296 | # aggregate estimates 297 | def forward(self, cond_data, input_data): 298 | # combination weight 299 | convex_weights = self.network(cond_data) 300 | 301 | # reshape weights 302 | if self.share: 303 | convex_weights = convex_weights.reshape(-1, self.num_models, 1) 304 | input_data = input_data.reshape(-1, self.num_models, self.num_quantiles) 
305 | else: 306 | if self.cross: 307 | convex_weights = convex_weights.reshape(-1, self.num_models * self.num_quantiles, self.num_quantiles) 308 | input_data = input_data.reshape(-1, self.num_models * self.num_quantiles, 1) 309 | else: 310 | convex_weights = convex_weights.reshape(-1, self.num_models, self.num_quantiles) 311 | input_data = input_data.reshape(-1, self.num_models, self.num_quantiles) 312 | 313 | # normalize (sum to 1) 314 | if self.normalize: 315 | convex_weights = convex_weights.softmax(1) 316 | 317 | # aggregate 318 | output_data = torch.sum(input_data * convex_weights, 1) 319 | return output_data 320 | 321 | def compute_loss(self, 322 | cond_data, 323 | input_data, 324 | target_data, 325 | aux_cond_data=None, 326 | aux_input_data=None, 327 | margin_weight=0.0, 328 | margin_scale=0.0, 329 | alpha=0.0, 330 | margin_delta=0.0): 331 | 332 | # train mode 333 | self.train() 334 | 335 | if margin_weight > 0.0 and (self.margin_list is not None or self.margin_type == 'single'): 336 | if aux_cond_data is not None and aux_input_data is not None: 337 | batch_size = input_data.size()[0] 338 | predict_data = self(torch.cat([cond_data, aux_cond_data], 0), 339 | torch.cat([input_data, aux_input_data], 0)) 340 | m_loss = margin_weight * margin_loss(predict_data, self.margin_list * margin_scale) 341 | predict_data = predict_data[:batch_size].contiguous() 342 | 343 | else: 344 | predict_data = self(cond_data, input_data) 345 | if self.margin_type == 'single': 346 | m_loss = margin_weight * margin_loss(predict_data, margin_delta) 347 | 348 | else: 349 | m_loss = margin_weight * margin_loss(predict_data, self.margin_list * margin_scale) 350 | 351 | else: 352 | predict_data = self(cond_data, input_data) 353 | m_loss = 0 354 | 355 | # back-prop through non-crossing 356 | if self.use_grad: 357 | if self.trans_type == 'pava': 358 | predict_data = pava_forward(predict_data) 359 | 360 | elif self.trans_type == 'mono': 361 | predict_data = fix_crossing(predict_data) 362 | 363 | elif self.trans_type == 'sort': 364 | predict_data = sort_forward(predict_data, self.regularization_strength) 365 | 366 | # pinball loss 367 | h_loss = huber_loss(predict_data, target_data, self.quantiles, alpha) 368 | return h_loss + m_loss 369 | 370 | def eval_loss(self, cond_data, input_data, target_data): 371 | # evaluation mode 372 | self.eval() 373 | 374 | with torch.no_grad(): 375 | # get aggregated prediction 376 | predict_data = self(cond_data, input_data) 377 | 378 | # monotonize 379 | if self.trans_type == 'pava': 380 | predict_data = pava_forward(predict_data) 381 | 382 | elif self.trans_type == 'mono': 383 | predict_data = fix_crossing(predict_data) 384 | 385 | elif self.trans_type == 'sort': 386 | predict_data = sort_forward(predict_data, self.regularization_strength) 387 | 388 | # compute pinball loss 389 | return pinball_loss(predict_data, target_data, self.quantiles).item() 390 | 391 | def predict(self, cond_data, input_data): 392 | # evaluation mode 393 | self.eval() 394 | 395 | with torch.no_grad(): 396 | # get aggregated prediction 397 | predict_data = self(cond_data, input_data) 398 | 399 | # monotonize 400 | if self.trans_type == 'pava': 401 | predict_data = pava_forward(predict_data) 402 | 403 | elif self.trans_type == 'mono': 404 | predict_data = fix_crossing(predict_data) 405 | 406 | elif self.trans_type == 'sort': 407 | predict_data = sort_forward(predict_data, self.regularization_strength) 408 | 409 | return predict_data.data.cpu().numpy() 410 | 411 | class QuantileGlobalAggregatorTrainer: 412 | def 
__init__(self, 413 | num_searches, # number of searching 414 | cv_split, # cross-validation splitting 415 | quantile_list, # list of quantile levels 416 | batch_size=64, # mini batch size 417 | normalize=True, # normalize weights 418 | margin_list=None, # using margin 419 | trans_type=None, # apply non-crossing 420 | use_grad=True, # using non-crossing training 421 | share_weight=False, # share weight over models 422 | cross_weight=False, # cross quantiles 423 | rand_seed=111, # random seed 424 | device=-1, # device id, 425 | margin_type='', 426 | regularization_strength=1, 427 | ): 428 | # training setting 429 | self.num_searches = num_searches 430 | self.cv_split = cv_split 431 | 432 | # model setting 433 | self.quantile_list = quantile_list 434 | self.normalize = normalize 435 | self.margin_list = margin_list 436 | self.trans_type = trans_type 437 | self.use_grad = use_grad 438 | self.share_weight = share_weight 439 | self.cross_weight = cross_weight 440 | self.rand_seed = rand_seed 441 | self.num_models = None 442 | self.num_quantiles = len(quantile_list) 443 | self.batch_size = batch_size 444 | self.margin_type = margin_type 445 | self.regularization_strength = regularization_strength 446 | 447 | # best model after training 448 | self.best_model = None 449 | self.best_params = None 450 | 451 | # set device 452 | if torch.cuda.is_available() and device > -1: 453 | self.device = torch.device("cuda:{}".format(device)) 454 | else: 455 | self.device = torch.device("cpu") 456 | 457 | # fit model by model selection 458 | def fit(self, 459 | x_train, 460 | y_train, 461 | x_val, 462 | y_val, 463 | ax_train=None): 464 | # convert data 465 | x_train = torch.FloatTensor(x_train).to(self.device) 466 | y_train = torch.FloatTensor(y_train.reshape(-1, 1)).to(self.device) 467 | 468 | x_val = torch.FloatTensor(x_val).to(self.device) 469 | y_val = torch.FloatTensor(y_val.reshape(-1, 1)).to(self.device) 470 | 471 | if ax_train is not None: 472 | ax_train = torch.FloatTensor(ax_train).to(self.device) 473 | 474 | # get number of base models 475 | self.num_models = x_train.size()[1] 476 | assert self.num_quantiles == x_train.size()[2] 477 | 478 | # build params list 479 | if self.margin_type == 'single': 480 | full_param_grid = {**QUANTILE_LOSS_PARAM_GRID_FIX} 481 | 482 | else: 483 | full_param_grid = {**QUANTILE_LOSS_PARAM_GRID} 484 | if self.margin_list is None: 485 | del full_param_grid['margin_weight'] 486 | del full_param_grid['margin_scale'] 487 | 488 | 489 | params_list = list(ParameterSampler(param_distributions=full_param_grid, 490 | n_iter=self.num_searches, 491 | random_state=self.rand_seed)) 492 | 493 | # set data loader 494 | train_loader = DataLoader(dataset=TensorDataset(x_train, 495 | y_train), 496 | shuffle=True, 497 | batch_size=self.batch_size, 498 | drop_last=False, 499 | worker_init_fn=np.random.seed(self.rand_seed)) 500 | 501 | valid_loader = DataLoader(dataset=TensorDataset(x_val, 502 | y_val), 503 | shuffle=False, 504 | batch_size=1024, 505 | drop_last=False) 506 | 507 | if ax_train is not None: 508 | aux_loader = DataLoader(dataset=TensorDataset(torch.cat([ax_train, x_train], 0)), 509 | shuffle=True, 510 | batch_size=self.batch_size, 511 | drop_last=False, 512 | worker_init_fn=np.random.seed(self.rand_seed)) 513 | 514 | else: 515 | aux_loader = None 516 | 517 | # starting model selection 518 | best_eval_loss = np.inf 519 | best_eval_step = 0 520 | 521 | # for each param 522 | for p, params in enumerate(params_list): 523 | # fit model with given data and params 524 | print('iter:',p, 
' ', params) 525 | eval_loss, eval_step = self.fit_model(train_loader=train_loader, 526 | valid_loader=valid_loader, 527 | aux_loader=aux_loader, 528 | **params) 529 | 530 | # if best in terms of validation 531 | if eval_loss < best_eval_loss: 532 | best_eval_loss = eval_loss 533 | best_eval_step = eval_step 534 | self.best_params = params 535 | 536 | print('eval_loss : %.4f, best_eval_sofar: %.4f, eval_step: %d' %(eval_loss, best_eval_loss, eval_step)) 537 | print() 538 | self.best_params['num_steps'] = best_eval_step 539 | 540 | # retrain model with full data 541 | train_loader = DataLoader(dataset=TensorDataset(x_train, y_train), 542 | shuffle=True, 543 | batch_size=self.batch_size, 544 | drop_last=False, 545 | worker_init_fn=np.random.seed(self.rand_seed)) 546 | 547 | if ax_train is not None: 548 | aux_loader = DataLoader(dataset=TensorDataset(ax_train), 549 | shuffle=True, 550 | batch_size=self.batch_size, 551 | drop_last=False, 552 | worker_init_fn=np.random.seed(self.rand_seed)) 553 | 554 | else: 555 | aux_loader = None 556 | 557 | # retrain model and set as best model 558 | print('best_params:', self.best_params) 559 | print('best_eval_loss: %.4f' % best_eval_loss) 560 | self.best_model = self.fit_model(train_loader, None, aux_loader, **self.best_params) 561 | 562 | # fit single model based on given params 563 | def fit_model(self, 564 | train_loader, 565 | valid_loader=None, 566 | aux_loader=None, 567 | num_steps=None, 568 | lr=1e-3, 569 | wd=1e-7, 570 | margin_weight=0.0, 571 | margin_scale=0.0, 572 | alpha=0.0, 573 | margin_delta=0.0): 574 | # init model 575 | model = QuantileGlobalAggregator(num_models=self.num_models, 576 | quantile_list=self.quantile_list, 577 | normalize=self.normalize, 578 | margin_list=self.margin_list, 579 | trans_type=self.trans_type, 580 | use_grad=self.use_grad, 581 | share=self.share_weight, 582 | cross=self.cross_weight, 583 | margin_type=self.margin_type, 584 | regularization_strength=self.regularization_strength) 585 | model = model.to(self.device) 586 | 587 | # init optimizer 588 | optimizer = torch.optim.Adam(params=model.parameters(), 589 | lr=lr, weight_decay=wd, amsgrad=True) 590 | 591 | # init results 592 | steps = 0 593 | best_valid_loss = np.inf 594 | best_step = 0 595 | 596 | # init aux_loader 597 | if aux_loader is None: 598 | aux_loader_iterator = None 599 | else: 600 | aux_loader_iterator = iter(aux_loader) 601 | 602 | # for each epoch 603 | while True: 604 | # for each batch (update) 605 | for x_batch, y_batch in train_loader: 606 | # aux data 607 | if aux_loader_iterator is None: 608 | aux_batch = None 609 | else: 610 | try: 611 | aux_batch = next(aux_loader_iterator)[0] 612 | except StopIteration: 613 | aux_loader_iterator = iter(aux_loader) 614 | aux_batch = next(aux_loader_iterator)[0] 615 | 616 | # compute loss 617 | batch_loss = model.compute_loss(input_data=x_batch, 618 | target_data=y_batch, 619 | aux_data=aux_batch, 620 | margin_weight=margin_weight, 621 | margin_scale=margin_scale, 622 | alpha=alpha, 623 | margin_delta=margin_delta) 624 | 625 | # backprop and update 626 | optimizer.zero_grad() 627 | batch_loss.backward() 628 | optimizer.step() 629 | 630 | # step up 631 | steps += 1 632 | 633 | # evaluation over validation set 634 | if steps % 100 == 0 and valid_loader is not None: 635 | valid_loss = 0.0 636 | valid_size = 0.0 637 | 638 | # compute validation loss 639 | for x_batch, y_batch in valid_loader: 640 | batch_size = x_batch.size()[0] 641 | batch_loss = model.eval_loss(input_data=x_batch, target_data=y_batch) 642 | 
valid_loss += batch_loss * batch_size 643 | valid_size += batch_size 644 | valid_loss /= valid_size 645 | 646 | # update best validation loss 647 | if best_valid_loss > valid_loss: 648 | best_valid_loss = valid_loss 649 | best_step = steps 650 | # if no improvement seen 651 | elif steps - best_step >= STOP_STEPS: 652 | return best_valid_loss, best_step 653 | # if number of steps is reached 654 | elif num_steps is not None and steps >= num_steps: 655 | assert valid_loader is None 656 | return copy.deepcopy(model) 657 | 658 | # prediction 659 | def predict(self, x_data): 660 | x_data = torch.FloatTensor(x_data).to(self.device) 661 | y_pred = self.best_model.predict(x_data) 662 | return y_pred 663 | 664 | def refit_model(self, 665 | x_train, y_train, 666 | ax_train=None): 667 | self.best_model = None 668 | 669 | # convert data 670 | x_train = torch.FloatTensor(x_train).to(self.device) 671 | y_train = torch.FloatTensor(y_train.reshape(-1, 1)).to(self.device) 672 | if ax_train is not None: 673 | ax_train = torch.FloatTensor(ax_train).to(self.device) 674 | 675 | # retrain model with full data 676 | train_loader = DataLoader(dataset=TensorDataset(x_train, y_train), 677 | shuffle=True, 678 | batch_size=self.batch_size, 679 | drop_last=False, 680 | worker_init_fn=np.random.seed(self.rand_seed)) 681 | 682 | if ax_train is not None: 683 | aux_loader = DataLoader(dataset=TensorDataset(ax_train), 684 | shuffle=True, 685 | batch_size=self.batch_size, 686 | drop_last=False, 687 | worker_init_fn=np.random.seed(self.rand_seed)) 688 | 689 | else: 690 | aux_loader = None 691 | 692 | print('best_params:', self.best_params) 693 | # retrain model and set as best model 694 | self.best_model = self.fit_model(train_loader, None, aux_loader, **self.best_params) 695 | 696 | 697 | class QuantileLocalAggregatorTrainer: 698 | def __init__(self, 699 | num_searches, # number of searching 700 | cv_split, # cross-validation splitting 701 | quantile_list, # list of quantile levels 702 | batch_size=64, # mini batch size 703 | normalize=True, # normalize weights 704 | margin_list=None, # using margin 705 | trans_type=None, # apply non-crossing 706 | use_grad=True, # using non-crossing training 707 | share_weight=False, # share weight over models 708 | cross_weight=False, # cross quantiles 709 | rand_seed=111, # random seed 710 | device=-1, # device id, 711 | margin_type='', 712 | regularization_strength=1.0, 713 | ): 714 | # training setting 715 | self.num_searches = num_searches 716 | self.cv_split = cv_split 717 | 718 | # model setting 719 | self.quantile_list = quantile_list 720 | self.normalize = normalize 721 | self.margin_list = margin_list 722 | self.trans_type = trans_type 723 | self.use_grad = use_grad 724 | self.share_weight = share_weight 725 | self.cross_weight = cross_weight 726 | self.rand_seed = rand_seed 727 | self.input_size = None 728 | self.num_models = None 729 | self.num_quantiles = len(quantile_list) 730 | self.batch_size = batch_size 731 | self.margin_type = margin_type 732 | self.regularization_strength = regularization_strength 733 | 734 | # best model after training 735 | self.best_model = None 736 | self.best_params = None 737 | 738 | # set device 739 | if torch.cuda.is_available() and device > -1: 740 | self.device = torch.device("cuda:{}".format(device)) 741 | else: 742 | self.device = torch.device("cpu") 743 | 744 | # fit model by model selection 745 | def fit(self, 746 | c_train, 747 | x_train, 748 | y_train, 749 | c_val, 750 | x_val, 751 | y_val, 752 | ac_train=None, 753 | ax_train=None): 754 | 
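# Model selection: sample `num_searches` hyperparameter candidates with ParameterSampler,
# train each with early stopping against (c_val, x_val, y_val), keep the best candidate and
# its best step count, then refit on the full training data for exactly that many steps.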
755 | # convert data 756 | c_train = torch.FloatTensor(c_train).to(self.device) 757 | x_train = torch.FloatTensor(x_train).to(self.device) 758 | y_train = torch.FloatTensor(y_train.reshape(-1, 1)).to(self.device) 759 | 760 | c_val = torch.FloatTensor(c_val).to(self.device) 761 | x_val = torch.FloatTensor(x_val).to(self.device) 762 | y_val = torch.FloatTensor(y_val.reshape(-1, 1)).to(self.device) 763 | 764 | if ac_train is not None and ax_train is not None: 765 | ac_train = torch.FloatTensor(ac_train).to(self.device) 766 | ax_train = torch.FloatTensor(ax_train).to(self.device) 767 | 768 | # get input size and number of base models 769 | self.input_size = c_train.size()[1] 770 | self.num_models = x_train.size()[1] 771 | assert self.num_quantiles == x_train.size()[2] 772 | 773 | # build params list 774 | if self.margin_type == 'single': 775 | full_param_grid = {**NETWORK_PARAM_GRID, 776 | **QUANTILE_LOSS_PARAM_GRID_FIX} 777 | 778 | else: 779 | full_param_grid = {**NETWORK_PARAM_GRID, 780 | **QUANTILE_LOSS_PARAM_GRID} 781 | 782 | if self.margin_list is None: 783 | del full_param_grid['margin_weight'] 784 | del full_param_grid['margin_scale'] 785 | 786 | params_list = list(ParameterSampler(param_distributions=full_param_grid, 787 | n_iter=self.num_searches, 788 | random_state=self.rand_seed)) 789 | 790 | # set data loader 791 | train_loader = DataLoader(dataset=TensorDataset(c_train, 792 | x_train, 793 | y_train), 794 | shuffle=True, 795 | batch_size=self.batch_size, 796 | drop_last=False, 797 | worker_init_fn=np.random.seed(self.rand_seed)) 798 | 799 | valid_loader = DataLoader(dataset=TensorDataset(c_val, 800 | x_val, 801 | y_val), 802 | shuffle=False, 803 | batch_size=1024, 804 | drop_last=False) 805 | 806 | if ac_train is not None and ax_train is not None: 807 | aux_loader = DataLoader(dataset=TensorDataset(torch.cat([ac_train, c_val], 0), 808 | torch.cat([ax_train, x_val], 0)), 809 | shuffle=True, 810 | batch_size=self.batch_size, 811 | drop_last=False, 812 | worker_init_fn=np.random.seed(self.rand_seed)) 813 | 814 | else: 815 | aux_loader = None 816 | 817 | 818 | # starting model selection 819 | best_eval_loss = np.inf 820 | best_eval_step = 0 821 | 822 | # for each param 823 | for p, params in enumerate(params_list): 824 | # fit model with given data and params 825 | print('iter:',p, ' ', params) 826 | eval_loss, eval_step = self.fit_model(train_loader=train_loader, 827 | valid_loader=valid_loader, 828 | aux_loader=aux_loader, 829 | **params) 830 | 831 | # if best in terms of validation 832 | if eval_loss < best_eval_loss: 833 | best_eval_loss = eval_loss 834 | best_eval_step = eval_step 835 | self.best_params = params 836 | 837 | print('eval_loss : %.4f, best_eval_sofar: %.4f, eval_step: %d' %(eval_loss, best_eval_loss, eval_step)) 838 | print() 839 | self.best_params['num_steps'] = best_eval_step 840 | 841 | # retrain model with full data 842 | #train_loader = DataLoader(dataset=TensorDataset(c_train, x_train, y_train), 843 | # shuffle=True, batch_size=self.batch_size, drop_last=True, 844 | # worker_init_fn=np.random.seed(self.rand_seed)) 845 | train_loader = DataLoader(dataset=TensorDataset(c_train, 846 | x_train, 847 | y_train), 848 | shuffle=True, 849 | batch_size=self.batch_size, 850 | drop_last=False, 851 | worker_init_fn=np.random.seed(self.rand_seed)) 852 | 853 | 854 | if ac_train is not None and ax_train is not None: 855 | aux_loader = DataLoader(dataset=TensorDataset(ac_train, ax_train), 856 | shuffle=True, 857 | batch_size=self.batch_size, 858 | drop_last=False, 859 | 
worker_init_fn=np.random.seed(self.rand_seed)) 860 | 861 | else: 862 | aux_loader = None 863 | 864 | print('best_params:', self.best_params) 865 | print('best_eval_loss: %.4f' % best_eval_loss) 866 | # retrain model and set as best model 867 | self.best_model = self.fit_model(train_loader, None, aux_loader, **self.best_params) 868 | 869 | # fit single model based on given params 870 | def fit_model(self, 871 | train_loader, 872 | valid_loader=None, 873 | aux_loader=None, 874 | num_steps=None, 875 | hidden_size=64, 876 | num_layers=3, 877 | dropout=0.1, 878 | activation='elu', 879 | lr=1e-3, 880 | wd=1e-7, 881 | margin_weight=0.0, 882 | margin_scale=0.0, 883 | alpha=0.0, 884 | margin_delta=0.0): 885 | 886 | # init model 887 | model = QuantileLocalAggregator(num_models=self.num_models, 888 | quantile_list=self.quantile_list, 889 | input_size=self.input_size, 890 | hidden_size=hidden_size, 891 | num_layers=num_layers, 892 | dropout=dropout, 893 | activation=activation, 894 | normalize=self.normalize, 895 | margin_list=self.margin_list, 896 | trans_type=self.trans_type, 897 | use_grad=self.use_grad, 898 | share=self.share_weight, 899 | cross=self.cross_weight, 900 | margin_type=self.margin_type, 901 | regularization_strength=self.regularization_strength) 902 | 903 | model = model.to(self.device) 904 | 905 | # init optimizer 906 | optimizer = torch.optim.Adam(params=model.parameters(), 907 | lr=lr, weight_decay=wd, amsgrad=True) 908 | 909 | # init results 910 | steps = 0 911 | best_valid_loss = np.inf 912 | best_step = 0 913 | 914 | # init aux_loader 915 | if aux_loader is None: 916 | aux_loader_iterator = None 917 | else: 918 | aux_loader_iterator = iter(aux_loader) 919 | 920 | # for each epoch 921 | while True: 922 | # for each batch (update) 923 | for c_batch, x_batch, y_batch in train_loader: 924 | # aux data 925 | if aux_loader_iterator is None: 926 | ac_batch, ax_batch = None, None 927 | else: 928 | try: 929 | ac_batch, ax_batch = next(aux_loader_iterator) 930 | except StopIteration: 931 | aux_loader_iterator = iter(aux_loader) 932 | ac_batch, ax_batch = next(aux_loader_iterator) 933 | 934 | # compute loss 935 | batch_loss = model.compute_loss(cond_data=c_batch, 936 | input_data=x_batch, 937 | target_data=y_batch, 938 | aux_cond_data=ac_batch, 939 | aux_input_data=ax_batch, 940 | margin_weight=margin_weight, 941 | margin_scale=margin_scale, 942 | alpha=alpha, 943 | margin_delta=margin_delta) 944 | 945 | # backprop and update 946 | optimizer.zero_grad() 947 | batch_loss.backward() 948 | optimizer.step() 949 | 950 | # step up 951 | steps += 1 952 | 953 | # evaluation over validation set 954 | if steps % 100 == 0 and valid_loader is not None: 955 | valid_loss = 0.0 956 | valid_size = 0.0 957 | 958 | # compute validation loss 959 | for c_batch, x_batch, y_batch in valid_loader: 960 | batch_size = x_batch.size()[0] 961 | batch_loss = model.eval_loss(cond_data=c_batch, input_data=x_batch, target_data=y_batch) 962 | valid_loss += batch_loss * batch_size 963 | valid_size += batch_size 964 | valid_loss /= valid_size 965 | 966 | # update best validation loss 967 | if best_valid_loss > valid_loss: 968 | best_valid_loss = valid_loss 969 | best_step = steps 970 | # if no improvement seen 971 | elif steps - best_step >= STOP_STEPS: 972 | return best_valid_loss, best_step 973 | # if number of steps is reached 974 | elif num_steps is not None and steps >= num_steps: 975 | assert valid_loader is None 976 | return copy.deepcopy(model) 977 | 978 | # prediction 979 | def predict(self, c_data, x_data): 980 | 
c_data = torch.FloatTensor(c_data).to(self.device) 981 | x_data = torch.FloatTensor(x_data).to(self.device) 982 | y_pred = self.best_model.predict(c_data, x_data) 983 | return y_pred 984 | 985 | def refit_model(self, 986 | c_train, 987 | x_train, 988 | y_train, 989 | ac_train=None, 990 | ax_train=None): 991 | 992 | self.best_model = None 993 | 994 | # convert data 995 | c_train = torch.FloatTensor(c_train).to(self.device) 996 | x_train = torch.FloatTensor(x_train).to(self.device) 997 | y_train = torch.FloatTensor(y_train.reshape(-1, 1)).to(self.device) 998 | 999 | if ac_train is not None and ax_train is not None: 1000 | ac_train = torch.FloatTensor(ac_train).to(self.device) 1001 | ax_train = torch.FloatTensor(ax_train).to(self.device) 1002 | 1003 | # retrain model with full data 1004 | train_loader = DataLoader(dataset=TensorDataset(c_train, x_train, y_train), 1005 | shuffle=True, 1006 | batch_size=self.batch_size, 1007 | drop_last=False, 1008 | worker_init_fn=np.random.seed(self.rand_seed)) 1009 | 1010 | if ac_train is not None and ax_train is not None: 1011 | aux_loader = DataLoader(dataset=TensorDataset(ac_train, ax_train), 1012 | shuffle=True, 1013 | batch_size=self.batch_size, 1014 | drop_last=False, 1015 | worker_init_fn=np.random.seed(self.rand_seed)) 1016 | 1017 | else: 1018 | aux_loader = None 1019 | 1020 | print('best_params:', self.best_params) 1021 | # retrain model and set as best model 1022 | self.best_model = self.fit_model(train_loader, None, aux_loader, **self.best_params) 1023 | --------------------------------------------------------------------------------
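A minimal usage sketch for the two trainers defined in model/neural_aggregator.py, assuming the base quantile models have already been fit and their predictions stacked into arrays of shape (num_samples, num_models, num_quantiles). The array and feature names (base_preds_train, base_preds_val, c_train, c_val, y_train, y_val), the random placeholder data, and the hyperparameter values are illustrative only; the class names, constructor arguments, and fit/predict signatures come from the file above.

import numpy as np
from model.neural_aggregator import QuantileGlobalAggregatorTrainer, QuantileLocalAggregatorTrainer

quantile_list = [0.1, 0.5, 0.9]

# Placeholder data: stacked base-model quantile predictions, targets, and
# conditioning features (replace with real base-model outputs).
rng = np.random.RandomState(0)
num_train, num_val, num_models, num_features = 256, 128, 3, 8
base_preds_train = rng.randn(num_train, num_models, len(quantile_list))
base_preds_val = rng.randn(num_val, num_models, len(quantile_list))
y_train, y_val = rng.randn(num_train), rng.randn(num_val)
c_train, c_val = rng.randn(num_train, num_features), rng.randn(num_val, num_features)

# Global aggregator: a single learned convex weighting over base models.
global_trainer = QuantileGlobalAggregatorTrainer(num_searches=5,
                                                 cv_split=None,
                                                 quantile_list=quantile_list,
                                                 trans_type='pava',   # monotonize with PAVA
                                                 device=-1)           # -1 -> CPU
global_trainer.fit(base_preds_train, y_train, base_preds_val, y_val)
global_quantiles = global_trainer.predict(base_preds_val)   # (num_val, num_quantiles)

# Local aggregator: combination weights predicted per example from features c.
local_trainer = QuantileLocalAggregatorTrainer(num_searches=5,
                                               cv_split=None,
                                               quantile_list=quantile_list,
                                               trans_type='pava',
                                               device=-1)
local_trainer.fit(c_train, base_preds_train, y_train, c_val, base_preds_val, y_val)
local_quantiles = local_trainer.predict(c_val, base_preds_val)   # (num_val, num_quantiles)

In both trainers, share_weight/cross_weight select the 'Coarse', 'Medium', or 'Fine' weighting schemes set up in the aggregator constructors, trans_type in {'pava', 'mono', 'sort'} chooses the monotonization applied to the aggregated quantiles, and passing a margin_list (or margin_type='single') enables the margin_loss regularizer from util.metric during hyperparameter search.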