├── LICENSE ├── README.md ├── clf_binning_regressor.py ├── clf_sign_regressor.py ├── cv.py ├── early_stopping_estimators.py ├── era_boost_xgb_estimators.py ├── fear_greedy.py ├── mlflow_artifact_dataset.py ├── mlflow_utils.py ├── my_fit.py ├── my_keras_regressor.py ├── my_keras_regressor2.py ├── nonstationary_feature_remover.py ├── numerai_dataset.py ├── numerai_dataset2.py ├── optuna_bbc_cv.py ├── parquet_dataset.py ├── positive_homogeneous_regressor.py ├── ridge_feature_count_scaler.py └── srdo_regressor.py /LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Legal Code 2 | 3 | CC0 1.0 Universal 4 | 5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE 6 | LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN 7 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS 8 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES 9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS 10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM 11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED 12 | HEREUNDER. 13 | 14 | Statement of Purpose 15 | 16 | The laws of most jurisdictions throughout the world automatically confer 17 | exclusive Copyright and Related Rights (defined below) upon the creator 18 | and subsequent owner(s) (each and all, an "owner") of an original work of 19 | authorship and/or a database (each, a "Work"). 20 | 21 | Certain owners wish to permanently relinquish those rights to a Work for 22 | the purpose of contributing to a commons of creative, cultural and 23 | scientific works ("Commons") that the public can reliably and without fear 24 | of later claims of infringement build upon, modify, incorporate in other 25 | works, reuse and redistribute as freely as possible in any form whatsoever 26 | and for any purposes, including without limitation commercial purposes. 27 | These owners may contribute to the Commons to promote the ideal of a free 28 | culture and the further production of creative, cultural and scientific 29 | works, or to gain reputation or greater distribution for their Work in 30 | part through the use and efforts of others. 31 | 32 | For these and/or other purposes and motivations, and without any 33 | expectation of additional consideration or compensation, the person 34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she 35 | is an owner of Copyright and Related Rights in the Work, voluntarily 36 | elects to apply CC0 to the Work and publicly distribute the Work under its 37 | terms, with knowledge of his or her Copyright and Related Rights in the 38 | Work and the meaning and intended legal effect of CC0 on those rights. 39 | 40 | 1. Copyright and Related Rights. A Work made available under CC0 may be 41 | protected by copyright and related or neighboring rights ("Copyright and 42 | Related Rights"). Copyright and Related Rights include, but are not 43 | limited to, the following: 44 | 45 | i. the right to reproduce, adapt, distribute, perform, display, 46 | communicate, and translate a Work; 47 | ii. moral rights retained by the original author(s) and/or performer(s); 48 | iii. publicity and privacy rights pertaining to a person's image or 49 | likeness depicted in a Work; 50 | iv. rights protecting against unfair competition in regards to a Work, 51 | subject to the limitations in paragraph 4(a), below; 52 | v. rights protecting the extraction, dissemination, use and reuse of data 53 | in a Work; 54 | vi. database rights (such as those arising under Directive 96/9/EC of the 55 | European Parliament and of the Council of 11 March 1996 on the legal 56 | protection of databases, and under any national implementation 57 | thereof, including any amended or successor version of such 58 | directive); and 59 | vii. other similar, equivalent or corresponding rights throughout the 60 | world based on applicable law or treaty, and any national 61 | implementations thereof. 62 | 63 | 2. Waiver. To the greatest extent permitted by, but not in contravention 64 | of, applicable law, Affirmer hereby overtly, fully, permanently, 65 | irrevocably and unconditionally waives, abandons, and surrenders all of 66 | Affirmer's Copyright and Related Rights and associated claims and causes 67 | of action, whether now known or unknown (including existing as well as 68 | future claims and causes of action), in the Work (i) in all territories 69 | worldwide, (ii) for the maximum duration provided by applicable law or 70 | treaty (including future time extensions), (iii) in any current or future 71 | medium and for any number of copies, and (iv) for any purpose whatsoever, 72 | including without limitation commercial, advertising or promotional 73 | purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each 74 | member of the public at large and to the detriment of Affirmer's heirs and 75 | successors, fully intending that such Waiver shall not be subject to 76 | revocation, rescission, cancellation, termination, or any other legal or 77 | equitable action to disrupt the quiet enjoyment of the Work by the public 78 | as contemplated by Affirmer's express Statement of Purpose. 79 | 80 | 3. Public License Fallback. Should any part of the Waiver for any reason 81 | be judged legally invalid or ineffective under applicable law, then the 82 | Waiver shall be preserved to the maximum extent permitted taking into 83 | account Affirmer's express Statement of Purpose. In addition, to the 84 | extent the Waiver is so judged Affirmer hereby grants to each affected 85 | person a royalty-free, non transferable, non sublicensable, non exclusive, 86 | irrevocable and unconditional license to exercise Affirmer's Copyright and 87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the 88 | maximum duration provided by applicable law or treaty (including future 89 | time extensions), (iii) in any current or future medium and for any number 90 | of copies, and (iv) for any purpose whatsoever, including without 91 | limitation commercial, advertising or promotional purposes (the 92 | "License"). The License shall be deemed effective as of the date CC0 was 93 | applied by Affirmer to the Work. Should any part of the License for any 94 | reason be judged legally invalid or ineffective under applicable law, such 95 | partial invalidity or ineffectiveness shall not invalidate the remainder 96 | of the License, and in such case Affirmer hereby affirms that he or she 97 | will not (i) exercise any of his or her remaining Copyright and Related 98 | Rights in the Work or (ii) assert any associated claims and causes of 99 | action with respect to the Work, in either case contrary to Affirmer's 100 | express Statement of Purpose. 101 | 102 | 4. Limitations and Disclaimers. 103 | 104 | a. No trademark or patent rights held by Affirmer are waived, abandoned, 105 | surrendered, licensed or otherwise affected by this document. 106 | b. Affirmer offers the Work as-is and makes no representations or 107 | warranties of any kind concerning the Work, express, implied, 108 | statutory or otherwise, including without limitation warranties of 109 | title, merchantability, fitness for a particular purpose, non 110 | infringement, or the absence of latent or other defects, accuracy, or 111 | the present or absence of errors, whether or not discoverable, all to 112 | the greatest extent permissible under applicable law. 113 | c. Affirmer disclaims responsibility for clearing rights of other persons 114 | that may apply to the Work or any use thereof, including without 115 | limitation any person's Copyright and Related Rights in the Work. 116 | Further, Affirmer disclaims responsibility for obtaining any necessary 117 | consents, permissions or other rights required for any use of the 118 | Work. 119 | d. Affirmer understands and acknowledges that Creative Commons is not a 120 | party to this document and has no duty or obligation with respect to 121 | this CC0 or use of the Work. 122 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # bot_snippets 2 | 3 | ボットに使ってるコード断片 4 | ライブラリにするほど仕様が確定していないもの 5 | -------------------------------------------------------------------------------- /clf_binning_regressor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.base import BaseEstimator, clone 3 | from sklearn.preprocessing import KBinsDiscretizer 4 | from .utils import my_fit 5 | 6 | class ClfBinningRegressor(BaseEstimator): 7 | def __init__(self, classifier=None, n_bins=None): 8 | self.classifier = classifier 9 | self.n_bins = n_bins 10 | 11 | def fit(self, X, y, sample_weight=None, fit_context=None): 12 | self.n_features_in_ = X.shape[1] 13 | self.classifier_ = clone(self.classifier) 14 | self.transformer_ = KBinsDiscretizer(n_bins=self.n_bins, encode='ordinal', strategy='quantile') 15 | 16 | y = self.transformer_.fit_transform(y.reshape(-1, 1)).flatten().astype('int') 17 | 18 | if fit_context is not None: 19 | fit_context = fit_context.copy() 20 | fit_context['y_val'] = self.transformer_.transform(fit_context['y_val'].reshape(-1, 1)).flatten().astype('int') 21 | 22 | my_fit( 23 | self.classifier_, 24 | X, 25 | y, 26 | sample_weight=sample_weight, 27 | fit_context=fit_context, 28 | ) 29 | 30 | self.class_values_ = self.transformer_.inverse_transform(np.array(self.classifier_.classes_).reshape(-1, 1)).flatten() 31 | 32 | return self 33 | 34 | def predict(self, X): 35 | proba = self.classifier_.predict_proba(X) 36 | return np.sum(proba * self.class_values_, axis=1) 37 | -------------------------------------------------------------------------------- /clf_sign_regressor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.base import BaseEstimator, clone 3 | from .utils import my_fit 4 | 5 | class ClfSignRegressor(BaseEstimator): 6 | def __init__(self, classifier=None): 7 | self.classifier = classifier 8 | 9 | def fit(self, X, y, sample_weight=None, fit_context=None): 10 | self.n_features_in_ = X.shape[1] 11 | self.classifier_ = clone(self.classifier) 12 | 13 | sw = np.abs(y) 14 | if sample_weight is not None: 15 | sw *= sample_weight 16 | y = np.sign(y).astype('int') 17 | 18 | if fit_context is not None: 19 | fit_context = fit_context.copy() 20 | sw_val = np.abs(fit_context['y_val']) 21 | if fit_context['sample_weight_val'] is not None: 22 | sw_val *= fit_context['sample_weight_val'] 23 | fit_context['y_val'] = np.sign(fit_context['y_val']).astype('int') 24 | fit_context['sample_weight_val'] = sw_val 25 | 26 | my_fit( 27 | self.classifier_, 28 | X, 29 | y, 30 | sample_weight=sample_weight, 31 | fit_context=fit_context, 32 | ) 33 | 34 | return self 35 | 36 | def predict(self, X): 37 | proba = self.classifier_.predict_proba(X) 38 | return np.sum(proba * np.array(self.classifier_.classes_), axis=1) 39 | -------------------------------------------------------------------------------- /cv.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | 4 | def _purge_idx(train_idx, val_idx, groups, purge): 5 | unique_groups = np.unique(groups[val_idx]) 6 | purged_groups = unique_groups.reshape(1, -1) + np.arange(-purge, purge + 1).reshape(-1, 1) 7 | purged_groups = np.unique(purged_groups) 8 | return train_idx[~np.isin(groups[train_idx], purged_groups)] 9 | 10 | def my_group_kfold(groups, n_splits=5, purge=12): 11 | if hasattr(groups, 'values'): 12 | groups = groups.values 13 | idx = np.arange(groups.size) 14 | g = np.sort(np.unique(groups)) 15 | cv = [] 16 | for i in range(n_splits): 17 | selected = g[i * g.size // n_splits:(i + 1) * g.size // n_splits] 18 | val_idx = np.isin(groups, selected) 19 | cv.append(( 20 | _purge_idx(idx[~val_idx], idx[val_idx], groups, purge), 21 | idx[val_idx], 22 | )) 23 | return cv 24 | 25 | def my_kfold(x, n_splits=5, purge=12): 26 | return my_group_kfold(np.arange(x.shape[0]), n_splits=n_splits, purge=purge) 27 | -------------------------------------------------------------------------------- /early_stopping_estimators.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cvxpy as cp 3 | from sklearn.base import BaseEstimator, clone 4 | from sklearn.utils import check_random_state 5 | from sklearn.ensemble._base import _set_random_states 6 | from .utils import my_fit 7 | 8 | # https://proceedings.neurips.cc/paper/1996/file/f47330643ae134ca204bf6b2481fec47-Paper.pdf 9 | ENSEMBLE_MODE_BALANCING = 'balancing' 10 | 11 | class BaseEarlyStoppingEstimator(BaseEstimator): 12 | def __init__(self, 13 | base_estimator=None, 14 | n_estimators=10, 15 | cv=None, 16 | # max_samples=1.0, 17 | # max_features=1.0, 18 | ensemble_mode=None, 19 | random_state=None, 20 | verbose=0): 21 | 22 | self.base_estimator = base_estimator 23 | self.n_estimators = n_estimators 24 | self.cv = cv 25 | # self.max_samples = max_samples 26 | # self.max_features = max_features 27 | self.ensemble_mode = ensemble_mode 28 | self.random_state = random_state 29 | self.verbose = verbose 30 | 31 | def fit(self, X, y, sample_weight=None): 32 | # n = X.shape[0] 33 | random_state = check_random_state(self.random_state) 34 | # count = round(self.max_samples * n) 35 | # feature_count = round(self.max_features * X.shape[1]) 36 | 37 | self.n_features_in_ = X.shape[1] 38 | self.estimators_ = [] 39 | self.estimators_features_ = [] 40 | if self.ensemble_mode == ENSEMBLE_MODE_BALANCING: 41 | self.val_errors_ = [] 42 | 43 | cv_gen = self.cv.split(X) 44 | 45 | for i in range(self.n_estimators): 46 | train_idx, val_idx = cv_gen.__next__() 47 | 48 | estimator = clone(self.base_estimator) 49 | _set_random_states(estimator, random_state=random_state.randint(np.iinfo(np.int32).max)) 50 | 51 | sw = None if sample_weight is None else sample_weight[train_idx] 52 | 53 | fit_context = { 54 | 'X_val': indexing(X, val_idx), 55 | 'y_val': indexing(y, val_idx), 56 | 'sample_weight_val': None if sample_weight is None else indexing(sample_weight, val_idx), 57 | 'early_stopping_rounds': 100, 58 | } 59 | 60 | my_fit( 61 | estimator, 62 | indexing(X, train_idx), 63 | indexing(y, train_idx), 64 | sample_weight=sw, 65 | fit_context=fit_context, 66 | ) 67 | 68 | if self.ensemble_mode == ENSEMBLE_MODE_BALANCING: 69 | y_val_pred = estimator.predict(X_val) 70 | val_error = np.average((y_val - y_val_pred) ** 2, weights=sw_val) 71 | self.val_errors_.append(val_error) 72 | 73 | # indicies = calc_indicies(n, count, random_state) 74 | # feature_indicies = calc_feature_indicies(X.shape[1], feature_count, random_state) 75 | 76 | feature_indicies = np.arange(X.shape[1]) 77 | 78 | self.estimators_.append(estimator) 79 | self.estimators_features_.append(feature_indicies) 80 | 81 | if self.ensemble_mode == ENSEMBLE_MODE_BALANCING: 82 | self.val_errors_ = np.array(self.val_errors_) 83 | 84 | return self 85 | 86 | class EarlyStoppingRegressor(BaseEarlyStoppingEstimator): 87 | def predict(self, X): 88 | ys = [] 89 | for i, estimator in enumerate(self.estimators_): 90 | ys.append(estimator.predict(indexing2(X, self.estimators_features_[i]))) 91 | ys = np.array(ys) 92 | 93 | if self.ensemble_mode == ENSEMBLE_MODE_BALANCING: 94 | w = cp.Variable((len(self.estimators_), X.shape[0])) 95 | 96 | # 2 * w[i] * val_errors[i] 97 | # - w[i] * y[i] ** 2 98 | # + w[i] * w[j] * y[i] * y[j] -> sum(w[i] * y[i]) ** 2 99 | 100 | objective = cp.Minimize( 101 | 2 * cp.sum(cp.multiply(w, np.repeat(self.val_errors_.reshape(-1, 1), X.shape[0], axis=1))) 102 | - cp.sum(cp.multiply(w, ys ** 2)) 103 | + cp.sum(cp.multiply(w, ys)) ** 2 104 | ) 105 | 106 | constraints = [ 107 | 0 <= w, 108 | cp.sum(w, axis=0) == 1, 109 | ] 110 | 111 | prob = cp.Problem(objective, constraints) 112 | try: 113 | result = prob.solve() 114 | except cp.error.SolverError: 115 | print('cvxpy solve failed. use equal weight') 116 | return np.mean(ys, axis=0) 117 | 118 | return np.sum(ys * w.value, axis=0) 119 | else: 120 | return np.mean(ys, axis=0) 121 | 122 | class EarlyStoppingClassifier(BaseEarlyStoppingEstimator): 123 | def fit(self, X, y, sample_weight=None): 124 | self.classes_ = np.sort(np.unique(y)) 125 | self.n_classes_ = len(self.classes_) 126 | return super().fit(X, y, sample_weight=sample_weight) 127 | 128 | def predict(self, X): 129 | proba = self.predict_proba(X) 130 | return self.classes_.take(np.argmax(proba, axis=1), axis=0) 131 | 132 | def predict_proba(self, X): 133 | class_to_idx = {} 134 | for i, cls in enumerate(self.classes_): 135 | class_to_idx[cls] = i 136 | proba = np.zeros(X.shape[0], self.n_classes_) 137 | 138 | for estimator in self.estimators_: 139 | if hasattr(estimator, "predict_proba"): 140 | p = estimator.predict_proba(X) 141 | for i, cls in enumerate(estimator.classes_): 142 | proba[:, class_to_idx[cls]] += p[:, i] 143 | else: 144 | y_pred = estimator.predict(X) 145 | for i, cls in enumerate(self.classes_): 146 | proba[y_pred == cls, i] += 1 147 | 148 | return proba / self.n_estimators 149 | 150 | def calc_indicies(n, count, random_state): 151 | indicies = random_state.randint(n, size=count) 152 | return np.sort(indicies) 153 | 154 | def calc_feature_indicies(n, count, random_state): 155 | if n == count: 156 | return np.arange(n) 157 | else: 158 | return random_state.choice(np.arange(n), size=count, replace=False) 159 | 160 | def indexing(x, idx): 161 | if hasattr(x, 'iloc'): 162 | return x.iloc[idx] 163 | else: 164 | return x[idx] 165 | 166 | def indexing2(x, idx): 167 | if hasattr(x, 'iloc'): 168 | return x.iloc[:, idx] 169 | else: 170 | return x[:, idx] 171 | -------------------------------------------------------------------------------- /era_boost_xgb_estimators.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | from sklearn.base import BaseEstimator, clone 4 | from sklearn.metrics import r2_score 5 | from .utils import my_fit 6 | 7 | class EraBoostXgbRegressor(BaseEstimator): 8 | def __init__(self, base_estimator=None, num_iterations=3, proportion=0.5, n_estimators=None): 9 | self.base_estimator = base_estimator 10 | self.num_iterations = num_iterations 11 | self.proportion = proportion 12 | self.n_estimators = n_estimators 13 | 14 | def fit(self, X, y, sample_weight=None, fit_context=None): 15 | self.n_features_in_ = X.shape[1] 16 | self.base_estimator_ = clone(self.base_estimator) 17 | 18 | my_fit( 19 | self.base_estimator_, 20 | X, 21 | y, 22 | sample_weight=sample_weight, 23 | fit_context=fit_context, 24 | ) 25 | 26 | for iter in range(self.num_iterations - 1): 27 | y_pred = self.base_estimator_.predict(X) 28 | 29 | era_scores = [] 30 | indicies = [] 31 | n = y_pred.shape[0] 32 | m = 10 33 | for i in range(m): 34 | idx = np.arange(i * n // m, (i + 1) * n // m) 35 | indicies.append(idx) 36 | y_pred2 = indexing(y_pred, idx) 37 | y2 = indexing(y, idx) 38 | era_scores.append(r2_score(y2, y_pred2)) 39 | 40 | score_threshold = np.quantile(era_scores, self.proportion) 41 | idx = [] 42 | for i in range(m): 43 | if era_scores[i] <= score_threshold: 44 | idx.append(indicies[i]) 45 | idx = np.concatenate(idx) 46 | 47 | self.base_estimator_.n_estimators += self.n_estimators 48 | booster = self.base_estimator_.get_booster() 49 | self.base_estimator_.fit(indexing(X, idx), indexing(y, idx), xgb_model=booster) 50 | 51 | return self 52 | 53 | def predict(self, X): 54 | return self.base_estimator_.predict(X) 55 | 56 | def indexing(x, idx): 57 | if hasattr(x, 'iloc'): 58 | return x.iloc[idx] 59 | else: 60 | return x[idx] 61 | -------------------------------------------------------------------------------- /fear_greedy.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import requests 3 | import json 4 | 5 | def fetch_fear_greedy(): 6 | url = 'https://api.alternative.me/fng/?limit=3000' 7 | df = pd.DataFrame(json.loads(requests.get(url).text)['data']) 8 | df = df[df['time_until_update'].isna()] 9 | df = df.drop(columns=['time_until_update', 'value_classification']) 10 | df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s', utc=True) 11 | df['value'] = df['value'].astype('float') 12 | df = df.sort_values('timestamp') 13 | df = df.set_index('timestamp') 14 | df = df.rename(columns={ 'value': 'fear_greedy_index' }) 15 | return df 16 | -------------------------------------------------------------------------------- /mlflow_artifact_dataset.py: -------------------------------------------------------------------------------- 1 | 2 | from kedro.io.core import ( 3 | AbstractDataSet 4 | ) 5 | 6 | import joblib 7 | from mlflow.tracking import MlflowClient 8 | import tempfile 9 | 10 | class MlflowArtifactDataset(AbstractDataSet): 11 | def __init__(self, run_id, artifact_path): 12 | self._run_id = run_id 13 | self._artifact_path = artifact_path 14 | 15 | def _load(self): 16 | with tempfile.TemporaryDirectory() as dest_path: 17 | client = MlflowClient() 18 | path = client.download_artifacts( 19 | run_id=self._run_id, 20 | path=self._artifact_path, 21 | dst_path=dest_path 22 | ) 23 | return joblib.load(path) 24 | 25 | def _describe(self): 26 | return dict(run_id=self._run_id, artifact_path=self._artifact_path) 27 | 28 | def _save(self, data) -> None: 29 | pass 30 | -------------------------------------------------------------------------------- /mlflow_utils.py: -------------------------------------------------------------------------------- 1 | import mlflow 2 | import yaml 3 | import matplotlib.pyplot as plt 4 | import cloudpickle 5 | import tempfile 6 | import lzma 7 | 8 | class MlflowPlot(): 9 | def __init__(self, filename): 10 | self.filename = filename 11 | 12 | def __enter__(self): 13 | plt.figure() 14 | plt.style.use('seaborn-darkgrid') 15 | return None 16 | 17 | def __exit__(self, type, value, traceback): 18 | with tempfile.TemporaryDirectory() as dir: 19 | fname = '{}/{}'.format(dir, self.filename) 20 | plt.savefig(fname, bbox_inches='tight') # tightでlegendが収まるようになる 21 | plt.close('all') 22 | mlflow.log_artifact(fname) 23 | 24 | def mlflow_plot(filename): 25 | return MlflowPlot(filename) 26 | 27 | def mlflow_log_model(model, path): 28 | if not path.endswith('.xz'): 29 | raise Exception('mlflow_log_model path must end with .xz') 30 | 31 | data = cloudpickle.dumps(model) 32 | data = lzma.compress(data) 33 | with tempfile.TemporaryDirectory() as dir: 34 | fname = '{}/{}'.format(dir, path) 35 | with open(fname, 'wb') as f: 36 | f.write(data) 37 | mlflow.log_artifact(fname) 38 | 39 | def mlflow_log_yaml(obj, path): 40 | with tempfile.TemporaryDirectory() as dir: 41 | fname = '{}/{}'.format(dir, path) 42 | with open(fname, "w") as f: 43 | yaml.dump(obj, f) 44 | mlflow.log_artifact(fname) 45 | 46 | def mlflow_log_str(x, path): 47 | with tempfile.TemporaryDirectory() as dir: 48 | fname = '{}/{}'.format(dir, path) 49 | with open(fname, "w") as f: 50 | f.write(str(x)) 51 | mlflow.log_artifact(fname) 52 | -------------------------------------------------------------------------------- /my_fit.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import lightgbm as lgb 3 | import xgboost as xgb 4 | 5 | def my_fit(model, *args, **kwargs): 6 | if kwargs.get('fit_context') is not None: 7 | fit_context = kwargs['fit_context'] 8 | if isinstance(model, lgb.LGBMRegressor) or isinstance(model, lgb.LGBMClassifier): 9 | kwargs['eval_set'] = [(fit_context['X_val'], fit_context['y_val'])] 10 | if 'sample_weight_val' in fit_context and fit_context['sample_weight_val'] is not None: 11 | kwargs['eval_sample_weight'] = [fit_context['sample_weight_val']] 12 | kwargs['early_stopping_rounds'] = fit_context['early_stopping_rounds'] 13 | kwargs['verbose'] = False 14 | del kwargs['fit_context'] 15 | print('early stopping is used lgbm') 16 | 17 | if isinstance(model, xgb.XGBRegressor) or isinstance(model, xgb.XGBClassifier): 18 | kwargs['eval_set'] = [(fit_context['X_val'], fit_context['y_val'])] 19 | if 'sample_weight_val' in fit_context and fit_context['sample_weight_val'] is not None: 20 | kwargs['eval_sample_weight'] = [fit_context['sample_weight_val']] 21 | kwargs['early_stopping_rounds'] = fit_context['early_stopping_rounds'] 22 | kwargs['verbose'] = False 23 | del kwargs['fit_context'] 24 | print('early stopping is used xgb') 25 | 26 | argspec = inspect.getfullargspec(model.fit) 27 | # print(argspec) 28 | if 'fit_context' in kwargs and 'fit_context' not in argspec.args: 29 | del kwargs['fit_context'] 30 | 31 | # print(model) 32 | # print(kwargs.keys()) 33 | # print(argspec.args) 34 | # print(argspec) 35 | # 36 | # if 'sample_weight' in kwargs and 'sample_weight' not in argspec.args: 37 | # del kwargs['sample_weight'] 38 | 39 | return model.fit(*args, **kwargs) 40 | -------------------------------------------------------------------------------- /my_keras_regressor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.keras.wrappers.scikit_learn import KerasRegressor 4 | import marshal 5 | import types 6 | import traceback 7 | 8 | # 追加機能 9 | # mc dropout 10 | # early stopping 11 | 12 | class MyKerasRegressor(KerasRegressor): 13 | def __init__(self, mc_count=None, split=None, fit_params={}, **kwargs): 14 | KerasRegressor.__init__(self, **kwargs) 15 | self.mc_count_ = mc_count 16 | self.split_ = split 17 | self.fit_params_ = fit_params 18 | 19 | def fit(self, X, y, **kwargs): 20 | if self.split_ is None: 21 | return KerasRegressor.fit(self, X, y, **self.fit_params_, **kwargs) 22 | else: 23 | train_idx, val_idx = self.split_(X) 24 | return KerasRegressor.fit( 25 | self, 26 | X[train_idx], 27 | y[train_idx], 28 | validation_data=(X[val_idx], y[val_idx]), 29 | **self.fit_params_, 30 | **kwargs, 31 | ) 32 | 33 | def predict(self, X=None): 34 | if self.mc_count_ is None or self.mc_count_ == 1: 35 | return KerasRegressor.predict(self, X) 36 | 37 | ys = [] 38 | 39 | X = tf.data.Dataset.from_tensor_slices(X) 40 | X = X.batch(65536) 41 | 42 | for i in range(self.mc_count_): 43 | ys.append(KerasRegressor.predict(self, X)) 44 | 45 | return np.mean(ys, axis=0) 46 | 47 | def get_params(self, **params): 48 | res = KerasRegressor.get_params(self, **params) 49 | res.update({ 50 | 'mc_count': self.mc_count_, 51 | 'split': self.split_, 52 | 'fit_params': self.fit_params_, 53 | }) 54 | return res 55 | 56 | def set_params(self, **params): 57 | self.mc_count_ = params['mc_count'] 58 | self.split_ = params['split'] 59 | self.fit_params_ = params['fit_params'] 60 | params = params.copy() 61 | del params['mc_count'] 62 | del params['split'] 63 | del params['fit_params_'] 64 | return KerasRegressor.set_params(self, **params) 65 | 66 | # https://stackoverflow.com/questions/8574742/how-to-pickle-an-object-of-a-class-b-having-many-variables-that-inherits-from 67 | def __getstate__(self): 68 | a_state = KerasRegressor.__getstate__(self) 69 | b_state = { 70 | 'mc_count_': self.mc_count_, 71 | # 'split_': marshal.dumps(self.split_.__code__), 72 | # 'split_': self.split_, 73 | } 74 | return (a_state, b_state) 75 | 76 | def __setstate__(self, state): 77 | a_state, b_state = state 78 | self.mc_count_ = b_state['mc_count_'] 79 | # code = marshal.loads(b_state['split_']) 80 | # self.split_ = types.FunctionType(code, globals(), "some_func_name") 81 | # self.split_ = b_state['split_'] 82 | KerasRegressor.__setstate__(self, a_state) 83 | -------------------------------------------------------------------------------- /my_keras_regressor2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.keras.wrappers.scikit_learn import KerasRegressor 4 | import marshal 5 | import types 6 | import traceback 7 | 8 | # 追加機能 9 | # mc dropout 10 | # early stopping (fit_context版) 11 | 12 | class MyKerasRegressor2(KerasRegressor): 13 | def __init__(self, mc_count=None, fit_params={}, **kwargs): 14 | KerasRegressor.__init__(self, **kwargs) 15 | self.mc_count_ = mc_count 16 | self.fit_params_ = fit_params 17 | 18 | def fit(self, X, y, fit_context=None, **kwargs): 19 | if fit_context is None: 20 | return KerasRegressor.fit(self, X, y, **self.fit_params_, **kwargs) 21 | else: 22 | return KerasRegressor.fit( 23 | self, 24 | X, 25 | y, 26 | validation_data=(fit_context['X_val'], fit_context['y_val']), 27 | **self.fit_params_, 28 | **kwargs, 29 | ) 30 | 31 | def predict(self, X=None): 32 | if self.mc_count_ is None or self.mc_count_ == 1: 33 | return KerasRegressor.predict(self, X) 34 | 35 | ys = [] 36 | 37 | X = tf.data.Dataset.from_tensor_slices(X) 38 | X = X.batch(65536) 39 | 40 | for i in range(self.mc_count_): 41 | ys.append(KerasRegressor.predict(self, X)) 42 | 43 | return np.mean(ys, axis=0) 44 | 45 | def get_params(self, **params): 46 | res = KerasRegressor.get_params(self, **params) 47 | res.update({ 48 | 'mc_count': self.mc_count_, 49 | 'fit_params': self.fit_params_, 50 | }) 51 | return res 52 | 53 | def set_params(self, **params): 54 | self.mc_count_ = params['mc_count'] 55 | self.fit_params_ = params['fit_params'] 56 | params = params.copy() 57 | del params['mc_count'] 58 | del params['fit_params_'] 59 | return KerasRegressor.set_params(self, **params) 60 | 61 | # https://stackoverflow.com/questions/8574742/how-to-pickle-an-object-of-a-class-b-having-many-variables-that-inherits-from 62 | def __getstate__(self): 63 | a_state = KerasRegressor.__getstate__(self) 64 | b_state = { 65 | 'mc_count_': self.mc_count_, 66 | } 67 | return (a_state, b_state) 68 | 69 | def __setstate__(self, state): 70 | a_state, b_state = state 71 | self.mc_count_ = b_state['mc_count_'] 72 | KerasRegressor.__setstate__(self, a_state) 73 | -------------------------------------------------------------------------------- /nonstationary_feature_remover.py: -------------------------------------------------------------------------------- 1 | from sklearn.base import BaseEstimator, TransformerMixin 2 | import lightgbm as lgb 3 | import numpy as np 4 | import pandas as pd 5 | 6 | class NonstationaryFeatureRemover(BaseEstimator, TransformerMixin): 7 | def __init__(self, remove_count=None, remove_ratio=None): 8 | if remove_count and remove_ratio: 9 | raise Exception('remove_count and remove_ratio cannot be set simultaneously') 10 | self.remove_count = remove_count 11 | self.remove_ratio = remove_ratio 12 | 13 | def fit(self, X, y=None): 14 | X = self._validate_data(X) 15 | 16 | model = lgb.LGBMRegressor(n_jobs=-1, random_state=1) 17 | 18 | model.fit(X, np.arange(X.shape[0])) 19 | importances = model.feature_importances_ 20 | 21 | if self.remove_count: 22 | remove_count = self.remove_count 23 | else: 24 | remove_count = int(self.remove_ratio * X.shape[1]) 25 | 26 | features = list(range(X.shape[1])) 27 | feature_imp = pd.DataFrame(zip(importances, features), columns=['value', 'feature']) 28 | feature_imp = feature_imp.sort_values('value') 29 | 30 | for i in range(X.shape[1] - remove_count, X.shape[1]): 31 | features.remove(int(feature_imp['feature'].iloc[i])) 32 | 33 | self.selected_features_ = np.array(features) 34 | 35 | return self 36 | 37 | def transform(self, X, y=None): 38 | X = self._validate_data(X) 39 | 40 | return X[:, self.selected_features_].copy() 41 | 42 | def inverse_transform(self, X, y=None): 43 | raise Exception('inverse_transform not implemented') 44 | -------------------------------------------------------------------------------- /numerai_dataset.py: -------------------------------------------------------------------------------- 1 | 2 | from kedro.io.core import ( 3 | AbstractDataSet 4 | ) 5 | 6 | import tempfile 7 | import pandas as pd 8 | import numerapi 9 | import requests, zipfile 10 | from kedro_work.utils import get_joblib_memory 11 | 12 | memory = get_joblib_memory() 13 | 14 | @memory.cache 15 | def download_url(url): 16 | r = requests.get(url) 17 | return r.content 18 | 19 | class NumeraiDataset(AbstractDataSet): 20 | def __init__(self, is_train): 21 | self._is_train = is_train 22 | self._napi = numerapi.NumerAPI(verbosity="info") 23 | 24 | def _load(self): 25 | url = self._napi.get_dataset_url() 26 | 27 | with tempfile.TemporaryDirectory() as dir: 28 | cache_path = '{}/numerai_cache.zip'.format(dir) 29 | with open(cache_path, 'wb') as f: 30 | f.write(download_url(url)) 31 | z = zipfile.ZipFile(cache_path) 32 | 33 | if self._is_train: 34 | fname = 'numerai_training_data.csv' 35 | else: 36 | fname = 'numerai_tournament_data.csv' 37 | 38 | df = pd.read_csv(z.open(fname), index_col=0) 39 | return df 40 | 41 | def _describe(self): 42 | return dict(is_train=self._is_train) 43 | 44 | def _save(self, data) -> None: 45 | pass 46 | -------------------------------------------------------------------------------- /numerai_dataset2.py: -------------------------------------------------------------------------------- 1 | 2 | from kedro.io.core import ( 3 | AbstractDataSet 4 | ) 5 | 6 | import pandas as pd 7 | import numerapi 8 | 9 | class NumeraiDataset2(AbstractDataSet): 10 | def __init__(self, is_train): 11 | self._is_train = is_train 12 | self._napi = numerapi.NumerAPI(verbosity="info") 13 | 14 | def _load(self): 15 | url = self._get_dataset_url() 16 | df = pd.read_parquet(url) 17 | return df 18 | 19 | def _get_current_round(self): 20 | return self._napi.get_current_round(tournament=8) 21 | 22 | def _get_dataset_url(self): 23 | round = self._get_current_round() 24 | 25 | filename = 'numerai_training_data.parquet' if self._is_train else 'numerai_tournament_data.parquet' 26 | 27 | query = """ 28 | query ($filename: String!) { 29 | dataset(filename: $filename) 30 | } 31 | """ 32 | params = { 33 | 'filename': filename 34 | } 35 | if round: 36 | query = """ 37 | query ($filename: String!, $round: Int) { 38 | dataset(filename: $filename, round: $round) 39 | } 40 | """ 41 | params['round'] = round 42 | return self._napi.raw_query(query, params)['data']['dataset'] 43 | 44 | def _describe(self): 45 | return dict(is_train=self._is_train) 46 | 47 | def _save(self, data) -> None: 48 | pass 49 | -------------------------------------------------------------------------------- /optuna_bbc_cv.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import optuna 3 | from sklearn.base import BaseEstimator 4 | from sklearn.model_selection import cross_val_predict 5 | 6 | # https://arxiv.org/abs/1708.07180 7 | 8 | class OptunaBbcCv(BaseEstimator): 9 | def __init__(self, create_model=None, sampler=None, n_trials=None, cv=None, scoring_y_pred=None, features=None): 10 | self.create_model = create_model 11 | self.sampler = sampler 12 | self.n_trials = n_trials 13 | self.cv = cv 14 | self.scoring_y_pred = scoring_y_pred 15 | self.features = list(features) 16 | 17 | def fit(self, X=None, y=None, sample_weight=None): 18 | cv = list(self.cv.split(X)) 19 | 20 | y_preds = [] 21 | def objective(trial): 22 | model = self.create_model(trial) 23 | 24 | if sample_weight is not None: 25 | y_pred = np.zeros(X.shape[0]) 26 | X_filtered = self._filter_X(X) 27 | for train_idx, val_idx in cv: 28 | model.fit(X_filtered.iloc[train_idx], y.iloc[train_idx], sample_weight=sample_weight.iloc[train_idx]) 29 | y_pred[val_idx] = model.predict(X_filtered.iloc[val_idx]) 30 | else: 31 | y_pred = cross_val_predict(model, self._filter_X(X), y, cv=cv) 32 | 33 | score = self.scoring_y_pred(X, y, y_pred) 34 | y_preds.append(y_pred) 35 | return -score 36 | 37 | study = optuna.create_study(sampler=self._create_sampler()) 38 | study.optimize(objective, n_trials=self.n_trials) 39 | 40 | model = self.create_model(study.best_trial) 41 | model.fit(self._filter_X(X), y, sample_weight=sample_weight) 42 | 43 | y_pred_oos = np.zeros(X.shape[0]) 44 | for train_idx, val_idx in cv: 45 | scores = [] 46 | for y_pred in y_preds: 47 | score = self.scoring_y_pred(X.iloc[train_idx], y.iloc[train_idx], y_pred[train_idx]) 48 | scores.append(score) 49 | scores = np.array(scores) 50 | 51 | n_bests = 1 52 | selected_y_preds = [] 53 | for trial_idx in np.argsort(scores)[-n_bests:]: 54 | selected_y_preds.append(y_preds[trial_idx][val_idx]) 55 | 56 | y_pred_oos[val_idx] = np.mean(selected_y_preds, axis=0) 57 | 58 | self.study_ = study 59 | self.model_ = model 60 | self.y_preds_ = np.array(y_preds) 61 | self.y_pred_oos_ = y_pred_oos 62 | 63 | return self 64 | 65 | def predict(self, X=None): 66 | return self.model_.predict(self._filter_X(X)) 67 | 68 | def _filter_X(self, X): 69 | if self.features is not None: 70 | return X[self.features] 71 | return X 72 | 73 | def _create_sampler(self): 74 | optuna_seed = 1 75 | if self.sampler == 'tpe': 76 | sampler = optuna.samplers.TPESampler(seed=optuna_seed) 77 | elif self.sampler == 'tpe_mv': 78 | sampler = optuna.samplers.TPESampler(multivariate=True, group=True, seed=optuna_seed) 79 | elif self.sampler == 'random': 80 | sampler = optuna.samplers.RandomSampler(seed=optuna_seed) 81 | return sampler 82 | -------------------------------------------------------------------------------- /parquet_dataset.py: -------------------------------------------------------------------------------- 1 | 2 | from kedro.io.core import ( 3 | AbstractDataSet 4 | ) 5 | 6 | import pandas as pd 7 | 8 | class ParquetDataset(AbstractDataSet): 9 | def __init__(self, filepath): 10 | self._filepath = filepath 11 | 12 | def _load(self): 13 | return pd.read_parquet(self._filepath) 14 | 15 | def _describe(self): 16 | return dict(filepath=self._filepath) 17 | 18 | def _save(self, data) -> None: 19 | pass 20 | -------------------------------------------------------------------------------- /positive_homogeneous_regressor.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | from sklearn.base import BaseEstimator, clone 4 | from .utils import my_fit 5 | 6 | # Positive Homogeneous 7 | # https://www.jstage.jst.go.jp/article/pjsai/JSAI2020/0/JSAI2020_4Rin120/_pdf/-char/ja 8 | 9 | class PositiveHomogeneousRegressor(BaseEstimator): 10 | def __init__(self, regressor=None): 11 | self.regressor = regressor 12 | 13 | def fit(self, X, y, sample_weight=None, fit_context=None): 14 | self.n_features_in_ = X.shape[1] 15 | self.regressor_ = clone(self.regressor) 16 | 17 | X_norm = np.sum(X.values ** 2, axis=1) ** 0.5 18 | X = X / (1e-37 + X_norm).reshape(-1, 1) 19 | y = y / (1e-37 + X_norm) 20 | if sample_weight is None: 21 | sample_weight = np.ones(X.shape[0]) 22 | sample_weight *= X_norm ** 2 23 | 24 | if fit_context is not None: 25 | fit_context = fit_context.copy() 26 | X_norm = np.sum(fit_context['X_val'].values ** 2, axis=1) ** 0.5 27 | fit_context['X_val'] = fit_context['X_val'] / (1e-37 + X_norm).reshape(-1, 1) 28 | fit_context['y_val'] = fit_context['y_val'] / (1e-37 + X_norm) 29 | if fit_context['sample_weight_val'] is None: 30 | fit_context['sample_weight_val'] = np.ones(fit_context['X_val'].shape[0]) 31 | fit_context['sample_weight_val'] *= X_norm ** 2 32 | 33 | my_fit( 34 | self.regressor_, 35 | X, 36 | y, 37 | sample_weight=sample_weight, 38 | fit_context=fit_context, 39 | ) 40 | 41 | return self 42 | 43 | def predict(self, X): 44 | X_norm = np.sum(X.values ** 2, axis=1) ** 0.5 45 | X = X / (1e-37 + X_norm).reshape(-1, 1) 46 | 47 | y = self.regressor_.predict(X) 48 | return y * (1e-37 + X_norm) 49 | -------------------------------------------------------------------------------- /ridge_feature_count_scaler.py: -------------------------------------------------------------------------------- 1 | from sklearn.base import BaseEstimator, TransformerMixin 2 | 3 | # 特徴量をコピーしてN倍にしたときに、Ridgeのalphaを変えなくて良いようにスケーリング 4 | class RidgeFeatureCountScaler(BaseEstimator, TransformerMixin): 5 | def __init__(self): 6 | pass 7 | 8 | def fit(self, X, y=None): 9 | return self 10 | 11 | def transform(self, X, y=None): 12 | X = self._validate_data(X) 13 | 14 | return X / (X.shape[1] ** 0.5) 15 | 16 | def inverse_transform(self, X, y=None): 17 | X = self._validate_data(X) 18 | 19 | return X * (X.shape[1] ** 0.5) 20 | -------------------------------------------------------------------------------- /srdo_regressor.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import lightgbm as lgb 4 | from sklearn.base import BaseEstimator, RegressorMixin, clone 5 | from sklearn.utils.validation import check_X_y, check_array, check_is_fitted 6 | 7 | from kedro_work.utils import get_joblib_memory 8 | 9 | memory = get_joblib_memory() 10 | 11 | # https://arxiv.org/abs/1911.12580 12 | 13 | class SrdoRegressor(RegressorMixin, BaseEstimator): 14 | def __init__(self, estimator=None, epsilon=1e-7): 15 | self.estimator = estimator 16 | self.epsilon = epsilon 17 | 18 | def fit(self, X, y): 19 | X, y = check_X_y(X, y) 20 | 21 | memory.reduce_size() 22 | w = calc_decorr_weight(X, self.epsilon) 23 | fitted = clone(self.estimator) 24 | fitted.fit(X, y, sample_weight=w) 25 | self.estimator_ = fitted 26 | 27 | return self 28 | 29 | def predict(self, X): 30 | return self.estimator_.predict(X) 31 | 32 | @memory.cache 33 | def calc_decorr_weight(X, epsilon): 34 | classifier = lgb.LGBMClassifier(n_jobs=-1, random_state=0) 35 | 36 | X_positive = [] 37 | for i in range(X.shape[1]): 38 | X_positive.append(np.random.choice(X[:, i], size=X.shape[0], replace=True)) 39 | X_positive = np.array(X_positive).transpose() 40 | 41 | classifier.fit( 42 | np.vstack([X, X_positive]), 43 | np.concatenate([np.zeros(X.shape[0]), np.ones(X.shape[0])]) 44 | ) 45 | proba = classifier.predict_proba(X) 46 | w = proba[:, 1] / (epsilon + proba[:, 0]) 47 | 48 | return w / np.sum(w) 49 | --------------------------------------------------------------------------------