├── LICENSE
├── README.md
├── clf_binning_regressor.py
├── clf_sign_regressor.py
├── cv.py
├── early_stopping_estimators.py
├── era_boost_xgb_estimators.py
├── fear_greedy.py
├── mlflow_artifact_dataset.py
├── mlflow_utils.py
├── my_fit.py
├── my_keras_regressor.py
├── my_keras_regressor2.py
├── nonstationary_feature_remover.py
├── numerai_dataset.py
├── numerai_dataset2.py
├── optuna_bbc_cv.py
├── parquet_dataset.py
├── positive_homogeneous_regressor.py
├── ridge_feature_count_scaler.py
└── srdo_regressor.py


/LICENSE:
--------------------------------------------------------------------------------
  1 | Creative Commons Legal Code
  2 | 
  3 | CC0 1.0 Universal
  4 | 
  5 |     CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
  6 |     LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
  7 |     ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
  8 |     INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
  9 |     REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
 10 |     PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
 11 |     THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
 12 |     HEREUNDER.
 13 | 
 14 | Statement of Purpose
 15 | 
 16 | The laws of most jurisdictions throughout the world automatically confer
 17 | exclusive Copyright and Related Rights (defined below) upon the creator
 18 | and subsequent owner(s) (each and all, an "owner") of an original work of
 19 | authorship and/or a database (each, a "Work").
 20 | 
 21 | Certain owners wish to permanently relinquish those rights to a Work for
 22 | the purpose of contributing to a commons of creative, cultural and
 23 | scientific works ("Commons") that the public can reliably and without fear
 24 | of later claims of infringement build upon, modify, incorporate in other
 25 | works, reuse and redistribute as freely as possible in any form whatsoever
 26 | and for any purposes, including without limitation commercial purposes.
 27 | These owners may contribute to the Commons to promote the ideal of a free
 28 | culture and the further production of creative, cultural and scientific
 29 | works, or to gain reputation or greater distribution for their Work in
 30 | part through the use and efforts of others.
 31 | 
 32 | For these and/or other purposes and motivations, and without any
 33 | expectation of additional consideration or compensation, the person
 34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she
 35 | is an owner of Copyright and Related Rights in the Work, voluntarily
 36 | elects to apply CC0 to the Work and publicly distribute the Work under its
 37 | terms, with knowledge of his or her Copyright and Related Rights in the
 38 | Work and the meaning and intended legal effect of CC0 on those rights.
 39 | 
 40 | 1. Copyright and Related Rights. A Work made available under CC0 may be
 41 | protected by copyright and related or neighboring rights ("Copyright and
 42 | Related Rights"). Copyright and Related Rights include, but are not
 43 | limited to, the following:
 44 | 
 45 |   i. the right to reproduce, adapt, distribute, perform, display,
 46 |      communicate, and translate a Work;
 47 |  ii. moral rights retained by the original author(s) and/or performer(s);
 48 | iii. publicity and privacy rights pertaining to a person's image or
 49 |      likeness depicted in a Work;
 50 |  iv. rights protecting against unfair competition in regards to a Work,
 51 |      subject to the limitations in paragraph 4(a), below;
 52 |   v. rights protecting the extraction, dissemination, use and reuse of data
 53 |      in a Work;
 54 |  vi. database rights (such as those arising under Directive 96/9/EC of the
 55 |      European Parliament and of the Council of 11 March 1996 on the legal
 56 |      protection of databases, and under any national implementation
 57 |      thereof, including any amended or successor version of such
 58 |      directive); and
 59 | vii. other similar, equivalent or corresponding rights throughout the
 60 |      world based on applicable law or treaty, and any national
 61 |      implementations thereof.
 62 | 
 63 | 2. Waiver. To the greatest extent permitted by, but not in contravention
 64 | of, applicable law, Affirmer hereby overtly, fully, permanently,
 65 | irrevocably and unconditionally waives, abandons, and surrenders all of
 66 | Affirmer's Copyright and Related Rights and associated claims and causes
 67 | of action, whether now known or unknown (including existing as well as
 68 | future claims and causes of action), in the Work (i) in all territories
 69 | worldwide, (ii) for the maximum duration provided by applicable law or
 70 | treaty (including future time extensions), (iii) in any current or future
 71 | medium and for any number of copies, and (iv) for any purpose whatsoever,
 72 | including without limitation commercial, advertising or promotional
 73 | purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
 74 | member of the public at large and to the detriment of Affirmer's heirs and
 75 | successors, fully intending that such Waiver shall not be subject to
 76 | revocation, rescission, cancellation, termination, or any other legal or
 77 | equitable action to disrupt the quiet enjoyment of the Work by the public
 78 | as contemplated by Affirmer's express Statement of Purpose.
 79 | 
 80 | 3. Public License Fallback. Should any part of the Waiver for any reason
 81 | be judged legally invalid or ineffective under applicable law, then the
 82 | Waiver shall be preserved to the maximum extent permitted taking into
 83 | account Affirmer's express Statement of Purpose. In addition, to the
 84 | extent the Waiver is so judged Affirmer hereby grants to each affected
 85 | person a royalty-free, non transferable, non sublicensable, non exclusive,
 86 | irrevocable and unconditional license to exercise Affirmer's Copyright and
 87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the
 88 | maximum duration provided by applicable law or treaty (including future
 89 | time extensions), (iii) in any current or future medium and for any number
 90 | of copies, and (iv) for any purpose whatsoever, including without
 91 | limitation commercial, advertising or promotional purposes (the
 92 | "License"). The License shall be deemed effective as of the date CC0 was
 93 | applied by Affirmer to the Work. Should any part of the License for any
 94 | reason be judged legally invalid or ineffective under applicable law, such
 95 | partial invalidity or ineffectiveness shall not invalidate the remainder
 96 | of the License, and in such case Affirmer hereby affirms that he or she
 97 | will not (i) exercise any of his or her remaining Copyright and Related
 98 | Rights in the Work or (ii) assert any associated claims and causes of
 99 | action with respect to the Work, in either case contrary to Affirmer's
100 | express Statement of Purpose.
101 | 
102 | 4. Limitations and Disclaimers.
103 | 
104 |  a. No trademark or patent rights held by Affirmer are waived, abandoned,
105 |     surrendered, licensed or otherwise affected by this document.
106 |  b. Affirmer offers the Work as-is and makes no representations or
107 |     warranties of any kind concerning the Work, express, implied,
108 |     statutory or otherwise, including without limitation warranties of
109 |     title, merchantability, fitness for a particular purpose, non
110 |     infringement, or the absence of latent or other defects, accuracy, or
111 |     the present or absence of errors, whether or not discoverable, all to
112 |     the greatest extent permissible under applicable law.
113 |  c. Affirmer disclaims responsibility for clearing rights of other persons
114 |     that may apply to the Work or any use thereof, including without
115 |     limitation any person's Copyright and Related Rights in the Work.
116 |     Further, Affirmer disclaims responsibility for obtaining any necessary
117 |     consents, permissions or other rights required for any use of the
118 |     Work.
119 |  d. Affirmer understands and acknowledges that Creative Commons is not a
120 |     party to this document and has no duty or obligation with respect to
121 |     this CC0 or use of the Work.
122 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # bot_snippets
2 | 
3 | ボットに使ってるコード断片
4 | ライブラリにするほど仕様が確定していないもの
5 | 


--------------------------------------------------------------------------------
/clf_binning_regressor.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from sklearn.base import BaseEstimator, clone
 3 | from sklearn.preprocessing import KBinsDiscretizer
 4 | from .utils import my_fit
 5 | 
 6 | class ClfBinningRegressor(BaseEstimator):
 7 |     def __init__(self, classifier=None, n_bins=None):
 8 |         self.classifier = classifier
 9 |         self.n_bins = n_bins
10 | 
11 |     def fit(self, X, y, sample_weight=None, fit_context=None):
12 |         self.n_features_in_ = X.shape[1]
13 |         self.classifier_ = clone(self.classifier)
14 |         self.transformer_ = KBinsDiscretizer(n_bins=self.n_bins, encode='ordinal', strategy='quantile')
15 | 
16 |         y = self.transformer_.fit_transform(y.reshape(-1, 1)).flatten().astype('int')
17 | 
18 |         if fit_context is not None:
19 |             fit_context = fit_context.copy()
20 |             fit_context['y_val'] = self.transformer_.transform(fit_context['y_val'].reshape(-1, 1)).flatten().astype('int')
21 | 
22 |         my_fit(
23 |             self.classifier_,
24 |             X,
25 |             y,
26 |             sample_weight=sample_weight,
27 |             fit_context=fit_context,
28 |         )
29 | 
30 |         self.class_values_ = self.transformer_.inverse_transform(np.array(self.classifier_.classes_).reshape(-1, 1)).flatten()
31 | 
32 |         return self
33 | 
34 |     def predict(self, X):
35 |         proba = self.classifier_.predict_proba(X)
36 |         return np.sum(proba * self.class_values_, axis=1)
37 | 


--------------------------------------------------------------------------------
/clf_sign_regressor.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from sklearn.base import BaseEstimator, clone
 3 | from .utils import my_fit
 4 | 
 5 | class ClfSignRegressor(BaseEstimator):
 6 |     def __init__(self, classifier=None):
 7 |         self.classifier = classifier
 8 | 
 9 |     def fit(self, X, y, sample_weight=None, fit_context=None):
10 |         self.n_features_in_ = X.shape[1]
11 |         self.classifier_ = clone(self.classifier)
12 | 
13 |         sw = np.abs(y)
14 |         if sample_weight is not None:
15 |             sw *= sample_weight
16 |         y = np.sign(y).astype('int')
17 | 
18 |         if fit_context is not None:
19 |             fit_context = fit_context.copy()
20 |             sw_val = np.abs(fit_context['y_val'])
21 |             if fit_context['sample_weight_val'] is not None:
22 |                 sw_val *= fit_context['sample_weight_val']
23 |             fit_context['y_val'] = np.sign(fit_context['y_val']).astype('int')
24 |             fit_context['sample_weight_val'] = sw_val
25 | 
26 |         my_fit(
27 |             self.classifier_,
28 |             X,
29 |             y,
30 |             sample_weight=sample_weight,
31 |             fit_context=fit_context,
32 |         )
33 | 
34 |         return self
35 | 
36 |     def predict(self, X):
37 |         proba = self.classifier_.predict_proba(X)
38 |         return np.sum(proba * np.array(self.classifier_.classes_), axis=1)
39 | 


--------------------------------------------------------------------------------
/cv.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import numpy as np
 3 | 
 4 | def _purge_idx(train_idx, val_idx, groups, purge):
 5 |     unique_groups = np.unique(groups[val_idx])
 6 |     purged_groups = unique_groups.reshape(1, -1) + np.arange(-purge, purge + 1).reshape(-1, 1)
 7 |     purged_groups = np.unique(purged_groups)
 8 |     return train_idx[~np.isin(groups[train_idx], purged_groups)]
 9 | 
def my_group_kfold(groups, n_splits=5, purge=12):
    """Build purged K-fold splits over contiguous blocks of sorted group ids.

    ``groups`` is an array (or an object with ``.values``) of numeric group
    ids, one per row. The sorted unique ids are cut into ``n_splits``
    consecutive blocks; each block in turn becomes the validation fold and
    the remaining rows, minus those within ``purge`` of a validation group,
    become the training fold.

    Returns a list of ``(train_idx, val_idx)`` positional index arrays.
    """
    groups = getattr(groups, 'values', groups)
    positions = np.arange(groups.size)
    ordered = np.sort(np.unique(groups))
    n_groups = ordered.size

    folds = []
    for fold in range(n_splits):
        lo = fold * n_groups // n_splits
        hi = (fold + 1) * n_groups // n_splits
        in_val = np.isin(groups, ordered[lo:hi])
        val_rows = positions[in_val]
        train_rows = _purge_idx(positions[~in_val], val_rows, groups, purge)
        folds.append((train_rows, val_rows))
    return folds
24 | 
def my_kfold(x, n_splits=5, purge=12):
    """Purged K-fold over rows of ``x``, treating each row as its own group."""
    row_ids = np.arange(x.shape[0])
    return my_group_kfold(row_ids, n_splits=n_splits, purge=purge)
27 | 


--------------------------------------------------------------------------------
/early_stopping_estimators.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import cvxpy as cp
  3 | from sklearn.base import BaseEstimator, clone
  4 | from sklearn.utils import check_random_state
  5 | from sklearn.ensemble._base import _set_random_states
  6 | from .utils import my_fit
  7 | 
# Value of ``ensemble_mode`` that makes the estimators below record each
# member's validation error during fit and combine members with per-sample
# weights at predict time instead of a plain mean. Reference:
# https://proceedings.neurips.cc/paper/1996/file/f47330643ae134ca204bf6b2481fec47-Paper.pdf
ENSEMBLE_MODE_BALANCING = 'balancing'
 10 | 
 11 | class BaseEarlyStoppingEstimator(BaseEstimator):
 12 |     def __init__(self,
 13 |                  base_estimator=None,
 14 |                  n_estimators=10,
 15 |                  cv=None,
 16 |                  # max_samples=1.0,
 17 |                  # max_features=1.0,
 18 |                  ensemble_mode=None,
 19 |                  random_state=None,
 20 |                  verbose=0):
 21 | 
 22 |         self.base_estimator = base_estimator
 23 |         self.n_estimators = n_estimators
 24 |         self.cv = cv
 25 |         # self.max_samples = max_samples
 26 |         # self.max_features = max_features
 27 |         self.ensemble_mode = ensemble_mode
 28 |         self.random_state = random_state
 29 |         self.verbose = verbose
 30 | 
 31 |     def fit(self, X, y, sample_weight=None):
 32 |         # n = X.shape[0]
 33 |         random_state = check_random_state(self.random_state)
 34 |         # count = round(self.max_samples * n)
 35 |         # feature_count = round(self.max_features * X.shape[1])
 36 | 
 37 |         self.n_features_in_ = X.shape[1]
 38 |         self.estimators_ = []
 39 |         self.estimators_features_ = []
 40 |         if self.ensemble_mode == ENSEMBLE_MODE_BALANCING:
 41 |             self.val_errors_ = []
 42 | 
 43 |         cv_gen = self.cv.split(X)
 44 | 
 45 |         for i in range(self.n_estimators):
 46 |             train_idx, val_idx = cv_gen.__next__()
 47 | 
 48 |             estimator = clone(self.base_estimator)
 49 |             _set_random_states(estimator, random_state=random_state.randint(np.iinfo(np.int32).max))
 50 | 
 51 |             sw = None if sample_weight is None else sample_weight[train_idx]
 52 | 
 53 |             fit_context = {
 54 |                 'X_val': indexing(X, val_idx),
 55 |                 'y_val': indexing(y, val_idx),
 56 |                 'sample_weight_val': None if sample_weight is None else indexing(sample_weight, val_idx),
 57 |                 'early_stopping_rounds': 100,
 58 |             }
 59 | 
 60 |             my_fit(
 61 |                 estimator,
 62 |                 indexing(X, train_idx),
 63 |                 indexing(y, train_idx),
 64 |                 sample_weight=sw,
 65 |                 fit_context=fit_context,
 66 |             )
 67 | 
 68 |             if self.ensemble_mode == ENSEMBLE_MODE_BALANCING:
 69 |                 y_val_pred = estimator.predict(X_val)
 70 |                 val_error = np.average((y_val - y_val_pred) ** 2, weights=sw_val)
 71 |                 self.val_errors_.append(val_error)
 72 | 
 73 |             # indicies = calc_indicies(n, count, random_state)
 74 |             # feature_indicies = calc_feature_indicies(X.shape[1], feature_count, random_state)
 75 | 
 76 |             feature_indicies = np.arange(X.shape[1])
 77 | 
 78 |             self.estimators_.append(estimator)
 79 |             self.estimators_features_.append(feature_indicies)
 80 | 
 81 |         if self.ensemble_mode == ENSEMBLE_MODE_BALANCING:
 82 |             self.val_errors_ = np.array(self.val_errors_)
 83 | 
 84 |         return self
 85 | 
 86 | class EarlyStoppingRegressor(BaseEarlyStoppingEstimator):
 87 |     def predict(self, X):
 88 |         ys = []
 89 |         for i, estimator in enumerate(self.estimators_):
 90 |             ys.append(estimator.predict(indexing2(X, self.estimators_features_[i])))
 91 |         ys = np.array(ys)
 92 | 
 93 |         if self.ensemble_mode == ENSEMBLE_MODE_BALANCING:
 94 |             w = cp.Variable((len(self.estimators_), X.shape[0]))
 95 | 
 96 |             # 2 * w[i] * val_errors[i]
 97 |             # - w[i] * y[i] ** 2
 98 |             # + w[i] * w[j] * y[i] * y[j] -> sum(w[i] * y[i]) ** 2
 99 | 
100 |             objective = cp.Minimize(
101 |                 2 * cp.sum(cp.multiply(w, np.repeat(self.val_errors_.reshape(-1, 1), X.shape[0], axis=1)))
102 |                 - cp.sum(cp.multiply(w, ys ** 2))
103 |                 + cp.sum(cp.multiply(w, ys)) ** 2
104 |             )
105 | 
106 |             constraints = [
107 |                 0 <= w,
108 |                 cp.sum(w, axis=0) == 1,
109 |             ]
110 | 
111 |             prob = cp.Problem(objective, constraints)
112 |             try:
113 |                 result = prob.solve()
114 |             except cp.error.SolverError:
115 |                 print('cvxpy solve failed. use equal weight')
116 |                 return np.mean(ys, axis=0)
117 | 
118 |             return np.sum(ys * w.value, axis=0)
119 |         else:
120 |             return np.mean(ys, axis=0)
121 | 
class EarlyStoppingClassifier(BaseEarlyStoppingEstimator):
    """Classification ensemble built by ``BaseEarlyStoppingEstimator.fit``."""

    def fit(self, X, y, sample_weight=None):
        """Record the sorted class labels, then fit the members."""
        self.classes_ = np.sort(np.unique(y))
        self.n_classes_ = len(self.classes_)
        return super().fit(X, y, sample_weight=sample_weight)

    def predict(self, X):
        """Return the label with the highest averaged probability."""
        proba = self.predict_proba(X)
        return self.classes_.take(np.argmax(proba, axis=1), axis=0)

    def predict_proba(self, X):
        """Average the members' class probabilities.

        Members without ``predict_proba`` contribute a hard vote (probability
        one for their predicted label). Columns follow ``self.classes_``; a
        member that never saw some class contributes zero for it.
        """
        class_to_idx = {cls: i for i, cls in enumerate(self.classes_)}
        # The shape must be a single tuple; a second positional argument to
        # np.zeros is interpreted as the dtype.
        proba = np.zeros((X.shape[0], self.n_classes_))

        for estimator in self.estimators_:
            if hasattr(estimator, "predict_proba"):
                p = estimator.predict_proba(X)
                # Members may know only a subset of the classes, so map
                # their columns onto the ensemble's column order.
                for i, cls in enumerate(estimator.classes_):
                    proba[:, class_to_idx[cls]] += p[:, i]
            else:
                y_pred = estimator.predict(X)
                for i, cls in enumerate(self.classes_):
                    proba[y_pred == cls, i] += 1

        return proba / len(self.estimators_)
149 | 
def calc_indicies(n, count, random_state):
    """Draw ``count`` row indices from ``[0, n)`` with replacement, sorted ascending."""
    return np.sort(random_state.randint(n, size=count))
153 | 
def calc_feature_indicies(n, count, random_state):
    """Pick ``count`` distinct feature indices out of ``n``.

    When ``count`` equals ``n`` all indices are returned in order and
    ``random_state`` is not consulted.
    """
    all_features = np.arange(n)
    if n != count:
        return random_state.choice(all_features, size=count, replace=False)
    return all_features
159 | 
def indexing(x, idx):
    """Select rows ``idx`` by position, via ``.iloc`` when ``x`` provides it."""
    return x.iloc[idx] if hasattr(x, 'iloc') else x[idx]
165 | 
def indexing2(x, idx):
    """Select columns ``idx`` by position, via ``.iloc`` when ``x`` provides it."""
    return x.iloc[:, idx] if hasattr(x, 'iloc') else x[:, idx]
171 | 


--------------------------------------------------------------------------------
/era_boost_xgb_estimators.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import numpy as np
 3 | from sklearn.base import BaseEstimator, clone
 4 | from sklearn.metrics import r2_score
 5 | from .utils import my_fit
 6 | 
 7 | class EraBoostXgbRegressor(BaseEstimator):
 8 |     def __init__(self, base_estimator=None, num_iterations=3, proportion=0.5, n_estimators=None):
 9 |         self.base_estimator = base_estimator
10 |         self.num_iterations = num_iterations
11 |         self.proportion = proportion
12 |         self.n_estimators = n_estimators
13 | 
14 |     def fit(self, X, y, sample_weight=None, fit_context=None):
15 |         self.n_features_in_ = X.shape[1]
16 |         self.base_estimator_ = clone(self.base_estimator)
17 | 
18 |         my_fit(
19 |             self.base_estimator_,
20 |             X,
21 |             y,
22 |             sample_weight=sample_weight,
23 |             fit_context=fit_context,
24 |         )
25 | 
26 |         for iter in range(self.num_iterations - 1):
27 |             y_pred = self.base_estimator_.predict(X)
28 | 
29 |             era_scores = []
30 |             indicies = []
31 |             n = y_pred.shape[0]
32 |             m = 10
33 |             for i in range(m):
34 |                 idx = np.arange(i * n // m, (i + 1) * n // m)
35 |                 indicies.append(idx)
36 |                 y_pred2 = indexing(y_pred, idx)
37 |                 y2 = indexing(y, idx)
38 |                 era_scores.append(r2_score(y2, y_pred2))
39 | 
40 |             score_threshold = np.quantile(era_scores, self.proportion)
41 |             idx = []
42 |             for i in range(m):
43 |                 if era_scores[i] <= score_threshold:
44 |                     idx.append(indicies[i])
45 |             idx = np.concatenate(idx)
46 | 
47 |             self.base_estimator_.n_estimators += self.n_estimators
48 |             booster = self.base_estimator_.get_booster()
49 |             self.base_estimator_.fit(indexing(X, idx), indexing(y, idx), xgb_model=booster)
50 | 
51 |         return self
52 | 
53 |     def predict(self, X):
54 |         return self.base_estimator_.predict(X)
55 | 
def indexing(x, idx):
    """Select entries ``idx`` by position, via ``.iloc`` when ``x`` provides it."""
    return x.iloc[idx] if hasattr(x, 'iloc') else x[idx]
61 | 


--------------------------------------------------------------------------------
/fear_greedy.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import requests
 3 | import json
 4 | 
 5 | def fetch_fear_greedy():
 6 |     url = 'https://api.alternative.me/fng/?limit=3000'
 7 |     df = pd.DataFrame(json.loads(requests.get(url).text)['data'])
 8 |     df = df[df['time_until_update'].isna()]
 9 |     df = df.drop(columns=['time_until_update', 'value_classification'])
10 |     df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s', utc=True)
11 |     df['value'] = df['value'].astype('float')
12 |     df = df.sort_values('timestamp')
13 |     df = df.set_index('timestamp')
14 |     df = df.rename(columns={ 'value': 'fear_greedy_index' })
15 |     return df
16 | 


--------------------------------------------------------------------------------
/mlflow_artifact_dataset.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from kedro.io.core import (
 3 |     AbstractDataSet
 4 | )
 5 | 
 6 | import joblib
 7 | from mlflow.tracking import MlflowClient
 8 | import tempfile
 9 | 
class MlflowArtifactDataset(AbstractDataSet):
    """Read-only kedro dataset that loads a joblib artifact from an MLflow run.

    Parameters
    ----------
    run_id : str
        MLflow run the artifact belongs to.
    artifact_path : str
        Path of the artifact inside that run.
    """

    def __init__(self, run_id, artifact_path):
        self._run_id, self._artifact_path = run_id, artifact_path

    def _describe(self):
        """Return the parameters that identify this dataset."""
        return {'run_id': self._run_id, 'artifact_path': self._artifact_path}

    def _load(self):
        """Download the artifact into a temporary directory and unpickle it."""
        with tempfile.TemporaryDirectory() as download_dir:
            local_path = MlflowClient().download_artifacts(
                run_id=self._run_id,
                path=self._artifact_path,
                dst_path=download_dir,
            )
            # Load before the temporary directory is removed.
            loaded = joblib.load(local_path)
        return loaded

    def _save(self, data) -> None:
        """Saving is not supported; the call is silently ignored."""
        return None
30 | 


--------------------------------------------------------------------------------
/mlflow_utils.py:
--------------------------------------------------------------------------------
 1 | import mlflow
 2 | import yaml
 3 | import matplotlib.pyplot as plt
 4 | import cloudpickle
 5 | import tempfile
 6 | import lzma
 7 | 
 8 | class MlflowPlot():
 9 |     def __init__(self, filename):
10 |         self.filename = filename
11 | 
12 |     def __enter__(self):
13 |         plt.figure()
14 |         plt.style.use('seaborn-darkgrid')
15 |         return None
16 | 
17 |     def __exit__(self, type, value, traceback):
18 |         with tempfile.TemporaryDirectory() as dir:
19 |             fname = '{}/{}'.format(dir, self.filename)
20 |             plt.savefig(fname, bbox_inches='tight') # tightでlegendが収まるようになる
21 |             plt.close('all')
22 |             mlflow.log_artifact(fname)
23 | 
def mlflow_plot(filename):
    """Return a ``MlflowPlot`` context manager that logs a figure as ``filename``."""
    plot_context = MlflowPlot(filename)
    return plot_context
26 | 
def mlflow_log_model(model, path):
    """Pickle ``model`` with cloudpickle, xz-compress it and log it to MLflow.

    ``path`` is the artifact file name and must end with ``.xz``; otherwise
    an ``Exception`` is raised before anything is serialised.
    """
    if not path.endswith('.xz'):
        raise Exception('mlflow_log_model path must end with .xz')

    payload = lzma.compress(cloudpickle.dumps(model))

    with tempfile.TemporaryDirectory() as tmp_dir:
        target = f'{tmp_dir}/{path}'
        with open(target, 'wb') as out:
            out.write(payload)
        mlflow.log_artifact(target)
38 | 
def mlflow_log_yaml(obj, path):
    """Dump ``obj`` as YAML into a temporary file named ``path`` and log it to MLflow."""
    with tempfile.TemporaryDirectory() as tmp_dir:
        target = f'{tmp_dir}/{path}'
        with open(target, "w") as out:
            yaml.dump(obj, out)
        mlflow.log_artifact(target)
45 | 
def mlflow_log_str(x, path):
    """Write ``str(x)`` into a temporary file named ``path`` and log it to MLflow."""
    text = str(x)
    with tempfile.TemporaryDirectory() as tmp_dir:
        target = f'{tmp_dir}/{path}'
        with open(target, "w") as out:
            out.write(text)
        mlflow.log_artifact(target)
52 | 


--------------------------------------------------------------------------------
/my_fit.py:
--------------------------------------------------------------------------------
 1 | import inspect
 2 | import lightgbm as lgb
 3 | import xgboost as xgb
 4 | 
 5 | def my_fit(model, *args, **kwargs):
 6 |     if kwargs.get('fit_context') is not None:
 7 |         fit_context = kwargs['fit_context']
 8 |         if isinstance(model, lgb.LGBMRegressor) or isinstance(model, lgb.LGBMClassifier):
 9 |             kwargs['eval_set'] = [(fit_context['X_val'], fit_context['y_val'])]
10 |             if 'sample_weight_val' in fit_context and fit_context['sample_weight_val'] is not None:
11 |                 kwargs['eval_sample_weight'] = [fit_context['sample_weight_val']]
12 |             kwargs['early_stopping_rounds'] = fit_context['early_stopping_rounds']
13 |             kwargs['verbose'] = False
14 |             del kwargs['fit_context']
15 |             print('early stopping is used lgbm')
16 | 
17 |         if isinstance(model, xgb.XGBRegressor) or isinstance(model, xgb.XGBClassifier):
18 |             kwargs['eval_set'] = [(fit_context['X_val'], fit_context['y_val'])]
19 |             if 'sample_weight_val' in fit_context and fit_context['sample_weight_val'] is not None:
20 |                 kwargs['eval_sample_weight'] = [fit_context['sample_weight_val']]
21 |             kwargs['early_stopping_rounds'] = fit_context['early_stopping_rounds']
22 |             kwargs['verbose'] = False
23 |             del kwargs['fit_context']
24 |             print('early stopping is used xgb')
25 | 
26 |     argspec = inspect.getfullargspec(model.fit)
27 |     # print(argspec)
28 |     if 'fit_context' in kwargs and 'fit_context' not in argspec.args:
29 |         del kwargs['fit_context']
30 | 
31 |     # print(model)
32 |     # print(kwargs.keys())
33 |     # print(argspec.args)
34 |     # print(argspec)
35 |     #
36 |     # if 'sample_weight' in kwargs and 'sample_weight' not in argspec.args:
37 |     #     del kwargs['sample_weight']
38 | 
39 |     return model.fit(*args, **kwargs)
40 | 


--------------------------------------------------------------------------------
/my_keras_regressor.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import tensorflow as tf
 3 | from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
 4 | import marshal
 5 | import types
 6 | import traceback
 7 | 
 8 | # 追加機能
 9 | # mc dropout
10 | # early stopping
11 | 
12 | class MyKerasRegressor(KerasRegressor):
13 |     def __init__(self, mc_count=None, split=None, fit_params={}, **kwargs):
14 |         KerasRegressor.__init__(self, **kwargs)
15 |         self.mc_count_ = mc_count
16 |         self.split_ = split
17 |         self.fit_params_ = fit_params
18 | 
19 |     def fit(self, X, y, **kwargs):
20 |         if self.split_ is None:
21 |             return KerasRegressor.fit(self, X, y, **self.fit_params_, **kwargs)
22 |         else:
23 |             train_idx, val_idx = self.split_(X)
24 |             return KerasRegressor.fit(
25 |                 self,
26 |                 X[train_idx],
27 |                 y[train_idx],
28 |                 validation_data=(X[val_idx], y[val_idx]),
29 |                 **self.fit_params_,
30 |                 **kwargs,
31 |             )
32 | 
33 |     def predict(self, X=None):
34 |         if self.mc_count_ is None or self.mc_count_ == 1:
35 |             return KerasRegressor.predict(self, X)
36 | 
37 |         ys = []
38 | 
39 |         X = tf.data.Dataset.from_tensor_slices(X)
40 |         X = X.batch(65536)
41 | 
42 |         for i in range(self.mc_count_):
43 |             ys.append(KerasRegressor.predict(self, X))
44 | 
45 |         return np.mean(ys, axis=0)
46 | 
47 |     def get_params(self, **params):
48 |         res = KerasRegressor.get_params(self, **params)
49 |         res.update({
50 |             'mc_count': self.mc_count_,
51 |             'split': self.split_,
52 |             'fit_params': self.fit_params_,
53 |         })
54 |         return res
55 | 
56 |     def set_params(self, **params):
57 |         self.mc_count_ = params['mc_count']
58 |         self.split_ = params['split']
59 |         self.fit_params_ = params['fit_params']
60 |         params = params.copy()
61 |         del params['mc_count']
62 |         del params['split']
63 |         del params['fit_params_']
64 |         return KerasRegressor.set_params(self, **params)
65 | 
66 |     # https://stackoverflow.com/questions/8574742/how-to-pickle-an-object-of-a-class-b-having-many-variables-that-inherits-from
67 |     def __getstate__(self):
68 |         a_state = KerasRegressor.__getstate__(self)
69 |         b_state = {
70 |             'mc_count_': self.mc_count_,
71 |             # 'split_': marshal.dumps(self.split_.__code__),
72 |             # 'split_': self.split_,
73 |         }
74 |         return (a_state, b_state)
75 | 
76 |     def __setstate__(self, state):
77 |         a_state, b_state = state
78 |         self.mc_count_ = b_state['mc_count_']
79 |         # code = marshal.loads(b_state['split_'])
80 |         # self.split_ = types.FunctionType(code, globals(), "some_func_name")
81 |         # self.split_ = b_state['split_']
82 |         KerasRegressor.__setstate__(self, a_state)
83 | 


--------------------------------------------------------------------------------
/my_keras_regressor2.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import tensorflow as tf
 3 | from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
 4 | import marshal
 5 | import types
 6 | import traceback
 7 | 
# Added features:
# mc dropout
# early stopping (fit_context version)
11 | 
class MyKerasRegressor2(KerasRegressor):
    """KerasRegressor with averaged repeated prediction and fit_context support.

    Parameters
    ----------
    mc_count : int or None
        Number of prediction passes averaged in ``predict``.  ``None`` or ``1``
        falls back to a single plain ``KerasRegressor.predict`` call.
        (Presumably used for MC dropout; whether passes differ depends on the
        model built by the caller.)
    fit_params : dict or None
        Extra keyword arguments forwarded to every ``KerasRegressor.fit`` call.
        ``None`` is treated as an empty dict.
    **kwargs
        Forwarded to ``KerasRegressor.__init__``.
    """

    def __init__(self, mc_count=None, fit_params=None, **kwargs):
        KerasRegressor.__init__(self, **kwargs)
        self.mc_count_ = mc_count
        # Avoid a shared mutable default: each instance gets its own dict.
        self.fit_params_ = {} if fit_params is None else fit_params

    def fit(self, X, y, fit_context=None, **kwargs):
        """Fit the model, using ``fit_context`` as validation data when given.

        ``fit_context`` must provide the keys ``'X_val'`` and ``'y_val'``; they are
        passed to Keras as ``validation_data``.
        """
        extra = {}
        if fit_context is not None:
            extra['validation_data'] = (fit_context['X_val'], fit_context['y_val'])
        return KerasRegressor.fit(self, X, y, **extra, **self.fit_params_, **kwargs)

    def predict(self, X=None):
        """Predict, averaging ``mc_count_`` passes when more than one is requested."""
        if self.mc_count_ is None or self.mc_count_ == 1:
            return KerasRegressor.predict(self, X)

        # Wrap the input once in a batched dataset and reuse it for every pass.
        X = tf.data.Dataset.from_tensor_slices(X)
        X = X.batch(65536)

        ys = [KerasRegressor.predict(self, X) for _ in range(self.mc_count_)]

        return np.mean(ys, axis=0)

    def get_params(self, **params):
        """Return the base wrapper's parameters plus ``mc_count`` and ``fit_params``."""
        res = KerasRegressor.get_params(self, **params)
        res.update({
            'mc_count': self.mc_count_,
            'fit_params': self.fit_params_,
        })
        return res

    def set_params(self, **params):
        """Update parameters.

        ``mc_count`` and ``fit_params`` are stored on this instance (each is
        optional); all remaining keywords go to ``KerasRegressor.set_params``,
        whose return value is returned.
        """
        remaining = dict(params)
        if 'mc_count' in remaining:
            self.mc_count_ = remaining.pop('mc_count')
        if 'fit_params' in remaining:
            # The key is 'fit_params' (no trailing underscore), matching get_params.
            self.fit_params_ = remaining.pop('fit_params')
        return KerasRegressor.set_params(self, **remaining)

    # https://stackoverflow.com/questions/8574742/how-to-pickle-an-object-of-a-class-b-having-many-variables-that-inherits-from
    def __getstate__(self):
        """Return ``(base_state, extra_state)``; the extra state holds ``mc_count_`` only."""
        # NOTE(review): fit_params_ is not part of the extra state - confirm the
        # base state covers it if fit() is called after unpickling.
        a_state = KerasRegressor.__getstate__(self)
        b_state = {
            'mc_count_': self.mc_count_,
        }
        return (a_state, b_state)

    def __setstate__(self, state):
        """Restore ``mc_count_`` and then the base wrapper state."""
        a_state, b_state = state
        self.mc_count_ = b_state['mc_count_']
        KerasRegressor.__setstate__(self, a_state)
73 | 


--------------------------------------------------------------------------------
/nonstationary_feature_remover.py:
--------------------------------------------------------------------------------
 1 | from sklearn.base import BaseEstimator, TransformerMixin
 2 | import lightgbm as lgb
 3 | import numpy as np
 4 | import pandas as pd
 5 | 
 6 | class NonstationaryFeatureRemover(BaseEstimator, TransformerMixin):
 7 |     def __init__(self, remove_count=None, remove_ratio=None):
 8 |         if remove_count and remove_ratio:
 9 |             raise Exception('remove_count and remove_ratio cannot be set simultaneously')
10 |         self.remove_count = remove_count
11 |         self.remove_ratio = remove_ratio
12 | 
13 |     def fit(self, X, y=None):
14 |         X = self._validate_data(X)
15 | 
16 |         model = lgb.LGBMRegressor(n_jobs=-1, random_state=1)
17 | 
18 |         model.fit(X, np.arange(X.shape[0]))
19 |         importances = model.feature_importances_
20 | 
21 |         if self.remove_count:
22 |             remove_count = self.remove_count
23 |         else:
24 |             remove_count = int(self.remove_ratio * X.shape[1])
25 | 
26 |         features = list(range(X.shape[1]))
27 |         feature_imp = pd.DataFrame(zip(importances, features), columns=['value', 'feature'])
28 |         feature_imp = feature_imp.sort_values('value')
29 | 
30 |         for i in range(X.shape[1] - remove_count, X.shape[1]):
31 |             features.remove(int(feature_imp['feature'].iloc[i]))
32 | 
33 |         self.selected_features_ = np.array(features)
34 | 
35 |         return self
36 | 
37 |     def transform(self, X, y=None):
38 |         X = self._validate_data(X)
39 | 
40 |         return X[:, self.selected_features_].copy()
41 | 
42 |     def inverse_transform(self, X, y=None):
43 |         raise Exception('inverse_transform not implemented')
44 | 


--------------------------------------------------------------------------------
/numerai_dataset.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from kedro.io.core import (
 3 |     AbstractDataSet
 4 | )
 5 | 
 6 | import tempfile
 7 | import pandas as pd
 8 | import numerapi
 9 | import requests, zipfile
10 | from kedro_work.utils import get_joblib_memory
11 | 
# Cache object from the project helper (presumably a joblib Memory - confirm);
# its `cache` decorator is applied to download_url below.
memory = get_joblib_memory()
13 | 
@memory.cache
def download_url(url):
    """Download ``url`` and return the response body as bytes.

    The call is wrapped by ``memory.cache``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    r = requests.get(url)
    # Fail loudly on HTTP errors so an error page is never returned (and
    # cached) as if it were the requested data.
    r.raise_for_status()
    return r.content
18 | 
class NumeraiDataset(AbstractDataSet):
    """Kedro dataset that loads a Numerai CSV from the downloaded dataset zip.

    Parameters
    ----------
    is_train : bool
        If true, load ``numerai_training_data.csv``; otherwise load
        ``numerai_tournament_data.csv``.
    """

    def __init__(self, is_train):
        self._is_train = is_train
        self._napi = numerapi.NumerAPI(verbosity="info")

    def _load(self):
        """Download the dataset archive and return the selected CSV as a DataFrame."""
        url = self._napi.get_dataset_url()

        if self._is_train:
            fname = 'numerai_training_data.csv'
        else:
            fname = 'numerai_tournament_data.csv'

        with tempfile.TemporaryDirectory() as tmp_dir:
            cache_path = '{}/numerai_cache.zip'.format(tmp_dir)
            with open(cache_path, 'wb') as f:
                f.write(download_url(url))

            # Close both the archive and the member before the temporary
            # directory is removed.
            with zipfile.ZipFile(cache_path) as archive, archive.open(fname) as member:
                df = pd.read_csv(member, index_col=0)
        return df

    def _describe(self):
        """Return the dataset's describing attributes."""
        return dict(is_train=self._is_train)

    def _save(self, data) -> None:
        """Saving is a no-op: ``data`` is discarded."""
        pass
46 | 


--------------------------------------------------------------------------------
/numerai_dataset2.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from kedro.io.core import (
 3 |     AbstractDataSet
 4 | )
 5 | 
 6 | import pandas as pd
 7 | import numerapi
 8 | 
 9 | class NumeraiDataset2(AbstractDataSet):
10 |     def __init__(self, is_train):
11 |         self._is_train = is_train
12 |         self._napi = numerapi.NumerAPI(verbosity="info")
13 | 
14 |     def _load(self):
15 |         url = self._get_dataset_url()
16 |         df = pd.read_parquet(url)
17 |         return df
18 | 
19 |     def _get_current_round(self):
20 |         return self._napi.get_current_round(tournament=8)
21 | 
22 |     def _get_dataset_url(self):
23 |         round = self._get_current_round()
24 | 
25 |         filename = 'numerai_training_data.parquet' if self._is_train else 'numerai_tournament_data.parquet'
26 | 
27 |         query = """
28 |             query ($filename: String!) {
29 |                 dataset(filename: $filename)
30 |             }
31 |             """
32 |         params = {
33 |             'filename': filename
34 |         }
35 |         if round:
36 |             query = """
37 |                         query ($filename: String!, $round: Int) {
38 |                             dataset(filename: $filename, round: $round)
39 |                         }
40 |                         """
41 |             params['round'] = round
42 |         return self._napi.raw_query(query, params)['data']['dataset']
43 | 
44 |     def _describe(self):
45 |         return dict(is_train=self._is_train)
46 | 
47 |     def _save(self, data) -> None:
48 |         pass
49 | 


--------------------------------------------------------------------------------
/optuna_bbc_cv.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import optuna
 3 | from sklearn.base import BaseEstimator
 4 | from sklearn.model_selection import cross_val_predict
 5 | 
 6 | # https://arxiv.org/abs/1708.07180
 7 | 
 8 | class OptunaBbcCv(BaseEstimator):
 9 |     def __init__(self, create_model=None, sampler=None, n_trials=None, cv=None, scoring_y_pred=None, features=None):
10 |         self.create_model = create_model
11 |         self.sampler = sampler
12 |         self.n_trials = n_trials
13 |         self.cv = cv
14 |         self.scoring_y_pred = scoring_y_pred
15 |         self.features = list(features)
16 | 
17 |     def fit(self, X=None, y=None, sample_weight=None):
18 |         cv = list(self.cv.split(X))
19 | 
20 |         y_preds = []
21 |         def objective(trial):
22 |             model = self.create_model(trial)
23 | 
24 |             if sample_weight is not None:
25 |                 y_pred = np.zeros(X.shape[0])
26 |                 X_filtered = self._filter_X(X)
27 |                 for train_idx, val_idx in cv:
28 |                     model.fit(X_filtered.iloc[train_idx], y.iloc[train_idx], sample_weight=sample_weight.iloc[train_idx])
29 |                     y_pred[val_idx] = model.predict(X_filtered.iloc[val_idx])
30 |             else:
31 |                 y_pred = cross_val_predict(model, self._filter_X(X), y, cv=cv)
32 | 
33 |             score = self.scoring_y_pred(X, y, y_pred)
34 |             y_preds.append(y_pred)
35 |             return -score
36 | 
37 |         study = optuna.create_study(sampler=self._create_sampler())
38 |         study.optimize(objective, n_trials=self.n_trials)
39 | 
40 |         model = self.create_model(study.best_trial)
41 |         model.fit(self._filter_X(X), y, sample_weight=sample_weight)
42 | 
43 |         y_pred_oos = np.zeros(X.shape[0])
44 |         for train_idx, val_idx in cv:
45 |             scores = []
46 |             for y_pred in y_preds:
47 |                 score = self.scoring_y_pred(X.iloc[train_idx], y.iloc[train_idx], y_pred[train_idx])
48 |                 scores.append(score)
49 |             scores = np.array(scores)
50 | 
51 |             n_bests = 1
52 |             selected_y_preds = []
53 |             for trial_idx in np.argsort(scores)[-n_bests:]:
54 |                 selected_y_preds.append(y_preds[trial_idx][val_idx])
55 | 
56 |             y_pred_oos[val_idx] = np.mean(selected_y_preds, axis=0)
57 | 
58 |         self.study_ = study
59 |         self.model_ = model
60 |         self.y_preds_ = np.array(y_preds)
61 |         self.y_pred_oos_ = y_pred_oos
62 | 
63 |         return self
64 | 
65 |     def predict(self, X=None):
66 |         return self.model_.predict(self._filter_X(X))
67 | 
68 |     def _filter_X(self, X):
69 |         if self.features is not None:
70 |             return X[self.features]
71 |         return X
72 | 
73 |     def _create_sampler(self):
74 |         optuna_seed = 1
75 |         if self.sampler == 'tpe':
76 |             sampler = optuna.samplers.TPESampler(seed=optuna_seed)
77 |         elif self.sampler == 'tpe_mv':
78 |             sampler = optuna.samplers.TPESampler(multivariate=True, group=True, seed=optuna_seed)
79 |         elif self.sampler == 'random':
80 |             sampler = optuna.samplers.RandomSampler(seed=optuna_seed)
81 |         return sampler
82 | 


--------------------------------------------------------------------------------
/parquet_dataset.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from kedro.io.core import (
 3 |     AbstractDataSet
 4 | )
 5 | 
 6 | import pandas as pd
 7 | 
 8 | class ParquetDataset(AbstractDataSet):
 9 |     def __init__(self, filepath):
10 |         self._filepath = filepath
11 | 
12 |     def _load(self):
13 |         return pd.read_parquet(self._filepath)
14 | 
15 |     def _describe(self):
16 |         return dict(filepath=self._filepath)
17 | 
18 |     def _save(self, data) -> None:
19 |         pass
20 | 


--------------------------------------------------------------------------------
/positive_homogeneous_regressor.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import numpy as np
 3 | from sklearn.base import BaseEstimator, clone
 4 | from .utils import my_fit
 5 | 
 6 | # Positive Homogeneous
 7 | # https://www.jstage.jst.go.jp/article/pjsai/JSAI2020/0/JSAI2020_4Rin120/_pdf/-char/ja
 8 | 
 9 | class PositiveHomogeneousRegressor(BaseEstimator):
10 |     def __init__(self, regressor=None):
11 |         self.regressor = regressor
12 | 
13 |     def fit(self, X, y, sample_weight=None, fit_context=None):
14 |         self.n_features_in_ = X.shape[1]
15 |         self.regressor_ = clone(self.regressor)
16 | 
17 |         X_norm = np.sum(X.values ** 2, axis=1) ** 0.5
18 |         X = X / (1e-37 + X_norm).reshape(-1, 1)
19 |         y = y / (1e-37 + X_norm)
20 |         if sample_weight is None:
21 |             sample_weight = np.ones(X.shape[0])
22 |         sample_weight *= X_norm ** 2
23 | 
24 |         if fit_context is not None:
25 |             fit_context = fit_context.copy()
26 |             X_norm = np.sum(fit_context['X_val'].values ** 2, axis=1) ** 0.5
27 |             fit_context['X_val'] = fit_context['X_val'] / (1e-37 + X_norm).reshape(-1, 1)
28 |             fit_context['y_val'] = fit_context['y_val'] / (1e-37 + X_norm)
29 |             if fit_context['sample_weight_val'] is None:
30 |                 fit_context['sample_weight_val'] = np.ones(fit_context['X_val'].shape[0])
31 |             fit_context['sample_weight_val'] *= X_norm ** 2
32 | 
33 |         my_fit(
34 |             self.regressor_,
35 |             X,
36 |             y,
37 |             sample_weight=sample_weight,
38 |             fit_context=fit_context,
39 |         )
40 | 
41 |         return self
42 | 
43 |     def predict(self, X):
44 |         X_norm = np.sum(X.values ** 2, axis=1) ** 0.5
45 |         X = X / (1e-37 + X_norm).reshape(-1, 1)
46 | 
47 |         y = self.regressor_.predict(X)
48 |         return y * (1e-37 + X_norm)
49 | 


--------------------------------------------------------------------------------
/ridge_feature_count_scaler.py:
--------------------------------------------------------------------------------
 1 | from sklearn.base import BaseEstimator, TransformerMixin
 2 | 
 3 | # 特徴量をコピーしてN倍にしたときに、Ridgeのalphaを変えなくて良いようにスケーリング
 4 | class RidgeFeatureCountScaler(BaseEstimator, TransformerMixin):
 5 |     def __init__(self):
 6 |         pass
 7 | 
 8 |     def fit(self, X, y=None):
 9 |         return self
10 | 
11 |     def transform(self, X, y=None):
12 |         X = self._validate_data(X)
13 | 
14 |         return X / (X.shape[1] ** 0.5)
15 | 
16 |     def inverse_transform(self, X, y=None):
17 |         X = self._validate_data(X)
18 | 
19 |         return X * (X.shape[1] ** 0.5)
20 | 


--------------------------------------------------------------------------------
/srdo_regressor.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import numpy as np
 3 | import lightgbm as lgb
 4 | from sklearn.base import BaseEstimator, RegressorMixin, clone
 5 | from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
 6 | 
 7 | from kedro_work.utils import get_joblib_memory
 8 | 
# Cache object from the project helper (presumably a joblib Memory - confirm);
# used for `memory.reduce_size()` in SrdoRegressor.fit and to cache calc_decorr_weight.
memory = get_joblib_memory()
10 | 
11 | # https://arxiv.org/abs/1911.12580
12 | 
class SrdoRegressor(RegressorMixin, BaseEstimator):
    """Fit a wrapped estimator with sample weights from ``calc_decorr_weight``.

    Parameters
    ----------
    estimator : estimator
        Regressor to clone and fit; its ``fit`` must accept ``sample_weight``.
    epsilon : float
        Passed to ``calc_decorr_weight``.
    """

    def __init__(self, estimator=None, epsilon=1e-7):
        self.estimator = estimator
        self.epsilon = epsilon

    def fit(self, X, y):
        """Compute the weights for ``X`` and fit a clone of ``estimator`` with them."""
        X, y = check_X_y(X, y)

        memory.reduce_size()
        weights = calc_decorr_weight(X, self.epsilon)

        model = clone(self.estimator)
        model.fit(X, y, sample_weight=weights)
        # Only expose the clone once fitting has succeeded.
        self.estimator_ = model

        return self

    def predict(self, X):
        """Predict with the fitted clone."""
        return self.estimator_.predict(X)
31 | 
@memory.cache
def calc_decorr_weight(X, epsilon):
    """Compute normalized sample weights from a real-vs-resampled classifier.

    Every column of ``X`` is resampled independently (with replacement) to
    build a second data set.  A LightGBM classifier is trained to separate the
    original rows (label 0) from the resampled rows (label 1), and each
    original row is weighted by ``p(label 1) / (epsilon + p(label 0))``.

    Returns the weights divided by their sum.
    """
    classifier = lgb.LGBMClassifier(n_jobs=-1, random_state=0)
    n_samples = X.shape[0]

    # One independent draw per column, in column order.
    resampled_columns = [
        np.random.choice(X[:, col], size=n_samples, replace=True)
        for col in range(X.shape[1])
    ]
    X_positive = np.array(resampled_columns).transpose()

    stacked = np.vstack([X, X_positive])
    labels = np.concatenate([np.zeros(n_samples), np.ones(n_samples)])
    classifier.fit(stacked, labels)

    proba = classifier.predict_proba(X)
    odds = proba[:, 1] / (epsilon + proba[:, 0])

    return odds / np.sum(odds)
49 | 


--------------------------------------------------------------------------------