├── __init__.py
├── yunbase.png
├── star-history-2025223.png
├── requirements.txt
├── LICENSE
├── README.md
└── baseline.py
/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/yunbase.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yunsuxiaozi/Yunbase/main/yunbase.png
--------------------------------------------------------------------------------
/star-history-2025223.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yunsuxiaozi/Yunbase/main/star-history-2025223.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 |
2 | polars
3 | pandas
4 | numpy
5 | swifter==1.3.2
6 | tqdm
7 | scikit-learn==1.2.2
8 | lightgbm
9 | catboost>=1.2.8,<1.3.0
10 | xgboost
11 | dill
12 | optuna
13 | colorama
14 | regex
15 | unidecode
16 | gensim==4.3.3
17 | scipy==1.13.0
18 | ftfy
19 | nltk
20 | emoji
21 | pytorch_tabnet
22 | matplotlib==3.7.2
23 | seaborn
24 | cir-model
25 | pyspellchecker
26 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 🚀Yunbase, the first submission of your algorithm competition
2 |
3 |
4 |
5 | In data mining competitions, many operations have to be repeated every time, from data preprocessing to k-fold cross validation. Writing the same code over and over is tedious, so I extracted the common parts of these operations into the Yunbase class. ('Yun' comes from my name, yunsuxiaozi, and 'base' stands for the baseline of a competition.)
6 |
7 |
8 |
9 | ### Get Started Quickly
10 |
11 | 1.git clone
12 |
13 | ```python
14 | !git clone https://github.com/yunsuxiaozi/Yunbase.git
15 | ```
16 |
17 | 2.download the wheels listed in requirements.txt
18 |
19 | ```python
20 | !pip download -r Yunbase/requirements.txt
21 | ```
22 |
23 | 3.install according to requirements.txt
24 |
25 | ```python
26 | !pip install -q --requirement yourpath/Yunbase/requirements.txt \
27 | --no-index --find-links file:yourpath
28 | ```
29 |
30 | 4.import Yunbase
31 |
32 | ```python
33 | from Yunbase.baseline import Yunbase
34 | ```
35 |
36 | 5.create Yunbase.
37 |
38 | All the parameters are listed below; you can choose them flexibly according to your task. A minimal instantiation sketch follows the parameter descriptions.
39 |
40 | ```python
41 | yunbase=Yunbase(num_folds:int=5,
42 | n_repeats:int=1,
43 | models:list[tuple]=[],
44 | FE=None,
45 | CV_sample=None,
46 | group_col=None,
47 | target_col:str='target',
48 | weight_col:str='weight',
49 | kfold_col:str='fold',
50 | drop_cols:list[str]=[],
51 | seed:int=2025,
52 | objective:Literal['binary','multi_class','regression']='regression',
53 | metric:str='mse',
54 | nan_margin:float=0.95,
55 | num_classes=None,
56 | infer_size:int=10000,
57 | save_oof_preds:bool=True,
58 | save_test_preds:bool=True,
59 | device:str='cpu',
60 | one_hot_max:int=50,
61 | one_hot_cols=None,
62 | custom_metric=None,
63 | use_optuna_find_params:int=0,
64 | optuna_direction=None,
65 | early_stop:int=100,
66 | use_pseudo_label:bool=False,
67 | use_high_corr_feat:bool=True,
68 | cross_cols:list[str]=[],
69 | labelencoder_cols:list[str]=[],
70 | list_stat:list[tuple]=[],
71 | word2vec_models:list[tuple]=[],
72 | text_cols:list[str]=[],
73 | plot_feature_importance:bool=False,
74 | log:int=100,
75 | exp_mode:bool=False,
76 | use_reduce_memory:bool=False,
77 | use_data_augmentation:bool=False,
78 | use_oof_as_feature:bool=False,
79 | use_CIR:bool=False,
80 | use_median_as_pred:bool=False,
81 | use_scaler:bool=False,
82 | use_TTA:bool=False,
83 | use_eval_metric:bool=True,
84 | feats_stat:list[tuple]=[],
85 | target_stat:list[tuple]=[],
86 | use_spellchecker:bool=False,
87 | AGGREGATIONS:list=['nunique','count','min','max','first',
88 | 'last', 'mean','median','sum','std','skew',kurtosis],
89 | )
90 | ```
91 |
92 | - `num_folds`:int.the number of folds for k-fold cross validation.
93 |
94 | - `n_repeats`:int.Repeat the k-fold cross validation several times with different random seeds. This parameter is mainly useful for small datasets, to make the model evaluation more stable.
95 |
96 | - `models`:list[tuple].Three GBDTs are built in as the baseline; you can also use custom models, such as
97 |
98 | ```python
99 | models=[(LGBMRegressor(**lgb_params),'lgb')]
100 | ```
101 |
102 |
103 |
104 | - `FE`:function.In addition to the built-in feature engineering, you can also define your own feature engineering function. For example:
105 |
106 | ```python
107 | def FE(df):
108 | return df.drop(['id'],axis=1)
109 | ```
110 |
111 | Currently, both polars and pandas are supported for writing this function.
112 |
113 |
114 | - `CV_sample`:function.You can customize your downsampling and oversampling operations here. To keep the CV score accurate, operations on the validation set should in principle be avoided; however, to cover personalized needs, they are still allowed. Besides sampling operations, related feature engineering can also be customized in this function.
115 |
116 | For example:
117 |
118 | ```python
119 | def CV_sample(X_train,y_train,X_valid,y_valid,
120 | sample_weight_train,sample_weight_valid):
121 | less_idx=list(np.where(y_train==1)[0])
122 | more_idx=list(np.where(y_train==0)[0])
123 | np.random.shuffle(more_idx)
124 | #undersample
125 | more_idx=more_idx[:int(len(more_idx)*0.9)]
126 | #Adversarial learning
127 | X_train_copy=X_train.iloc[less_idx].copy()
128 | y_train_copy=y_train.iloc[less_idx].copy()
129 | y_train_copy[:]=0
130 | sample_weight_train_copy=sample_weight_train.iloc[less_idx].copy()
131 |
132 | X_train=pd.concat((X_train.iloc[more_idx+less_idx],X_train_copy)).reset_index(drop=True)
133 | y_train=pd.concat((y_train.iloc[more_idx+less_idx],y_train_copy)).reset_index(drop=True)
134 | sample_weight_train=pd.concat((sample_weight_train.iloc[more_idx+less_idx],sample_weight_train_copy)).reset_index(drop=True)
135 | return X_train,y_train,X_valid,y_valid,sample_weight_train,sample_weight_valid
136 | ```
137 |
138 |
139 | In purged CV (time series CV), there is no validation set (the goal is to make the training set as close to the test set as possible), so this function becomes:
140 |
141 | ```python
142 | def CV_sample(X_train,y_train,sample_weight_train):
143 | #your code
144 | return X_train,y_train,sample_weight_train
145 | ```
146 |
147 |
148 |
149 |
150 | - `group_col`:str.If you want to use GroupKFold, define this column.
151 |
152 | - `target_col`:str.the column that you want to predict.
153 |
154 | - `weight_col`:str.You can set a weight for each sample before model training. If this column is not provided, every sample defaults to a weight of 1.
155 |
156 | ```python
157 | train['weight']=np.array([0.1,0.3,……,0.2])
158 | ```
159 |
160 |
161 |
162 | - `kfold_col`:str.Allows users to customize the k-fold split. For example,
163 |
164 | ```python
165 | num_folds=5
166 | train['fold']=train.index%num_folds
167 | ```
168 |
169 |
170 |
171 | - `drop_cols`:list.The columns to be deleted after all feature engineering is completed.
172 |
173 | - `seed`:int.random seed.
174 |
175 | - `objective`:str.what task do you want to do?regression,binary or multi_class?
176 |
177 | - `metric`:str.metric to evaluate your model.
178 |
179 | - `nan_margin`:float.When the proportion of missing values in a column is greater than this value, the column is dropped.
180 |
181 | - `num_classes`:int.If the objective is multi_class or binary, you should define this parameter.
182 |
183 | - `infer_size`:int.The test data might be large; predicting in batches of this size avoids memory issues. A sketch of the batching idea follows.
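A minimal sketch of the batching idea (Yunbase does this internally; `model` and `test` below are placeholders, not Yunbase attributes):

```python
import numpy as np

# hypothetical illustration of predicting in batches of infer_size rows
infer_size = 10000
preds = np.concatenate([
    model.predict(test.iloc[i:i + infer_size])  # `model` and `test` are placeholders
    for i in range(0, len(test), infer_size)
])
```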
184 |
185 | - `save_oof_preds`:bool.you can save OOF for your own offline study.
186 |
187 | - `save_test_preds`:bool.you can save test_preds for your own offline study.
188 |
189 | - `device`:str.GBDTs can train on GPU; set this parameter to 'gpu' if you want GPU training.
190 |
191 | - `one_hot_max`:int.If the nunique of a column is less than this value, one-hot encoding is applied to it.
192 |
193 | - `one_hot_cols`:list[str].Customize which columns to one-hot encode.
194 |
195 | - `custom_metric`:function.you can define your own custom_metric.
196 |
197 | ```python
198 | def weighted_MAE(y_true,y_pred,
199 | weight=train['weight'].values):
200 | return np.sum(weight*np.abs(y_true-y_pred))/np.sum(weight)
201 | ```
202 |
203 | 1.custom_metric only receives the parameters y_true and y_pred. For regular cross validation, any extra argument (like the weight parameter above) needs to be bound in advance as a default value. For time series CV, the use_weighted_metric parameter can be used instead, without defining the weight parameter yourself.
204 |
205 | 2.When the objective is multi_class, `y_pred` in `custom_metric(y_true,y_pred)` is a probability matrix (shape: `(len(y_true),num_classes)`).
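For example, a minimal sketch of a multi-class custom metric that accepts such probabilities (this particular metric is only an illustration, not something Yunbase requires):

```python
from sklearn.metrics import log_loss

# illustrative custom metric for a multi_class objective:
# y_pred arrives as probabilities with shape (len(y_true), num_classes)
def my_multi_logloss(y_true, y_pred):
    return log_loss(y_true, y_pred)
```

Remember that whenever a custom metric is used, `optuna_direction` must also be set ('minimize' for a loss like this one).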
206 |
207 | - `use_optuna_find_params`:int.The number of optuna trials used to search for the best parameters; 0 means optuna is not used. Currently only LGBM is supported.
208 |
209 | - `optuna_direction`:str.`minimize` or `maximize`. When you use a custom metric, you must define the direction of optimization.
210 |
211 | - `early_stop`:int.If the validation score does not improve for this many rounds, training stops early.
212 |
213 | - `use_pseudo_label`:bool.Whether to use pseudo labels. When True, after the test predictions are obtained, the test data is added to the training data and the model is trained again. To keep the CV reliable, the test set is concatenated only with the cross-validation training folds and is still evaluated on the untouched validation fold.
214 |
215 | - `use_high_corr_feat`:bool.whether to use high correlation features or not.
216 |
217 | - `cross_cols`:list[str].Brute-force construct cross features from these columns using addition, subtraction, multiplication, and division.
218 |
219 | - `labelencoder_cols`:list.Convert categorical string variables into [1,2,……,n].
220 |
221 | - `list_stat`:list[tuple]=[].Example: `[('step_list',[1,2,4])]`, i.e. `(list_col, list_gap)`. If the data in a column is a list or a str(list), such as `[]` or `'[]'`, this can be used to extract diff and shift features for those list columns.
224 |
225 | - `word2vec_models`:list[tuple].Use models such as tfidf to extract features of string columns.For example:
226 |
227 | ```python
228 | word2vec_models=[(TfidfVectorizer(),col,'tfidf',False)]#(model, col, model_name, use_svd)
229 | ```
230 |
231 |
232 |
233 | - `text_cols`:list[str].Extract word-, sentence-, and paragraph-level features from these text columns.
234 |
235 | - `plot_feature_importance`:bool.Whether to plot feature importance after model training.
236 |
237 | - `log`:int.Output the validation-set score once every this many boosting iterations.
238 |
239 | - `exp_mode`:bool.In regression tasks where the distribution of `target_col` is long-tailed, this parameter can be used to apply a log transform to target_col before training. See the sketch below.
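A rough illustration of the idea, based on the comments in `baseline.py` (the exact implementation may differ; `train` and `target_col` are placeholders):

```python
import numpy as np

# shift the target so its minimum is 0, then train on the log of the shifted target
b = -train[target_col].min()
train[target_col] = np.log1p(train[target_col] + b)
# after prediction, the inverse would be roughly: preds = np.expm1(preds) - b
```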
240 |
241 | - `use_reduce_memory`:bool.When facing large datasets, this function can be used to reduce memory.
242 |
243 | - `use_data_augmentation`:bool.If data augmentation is used, during cross validation the training data will undergo a PCA transformation followed by the inverse transformation. See the function `pca_augmentation` for details, and the sketch below.
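The core idea, as a minimal sketch mirroring `pca_augmentation` in `baseline.py` (`X` is a placeholder DataFrame of training features):

```python
from sklearn.decomposition import PCA
import numpy as np

# project onto ~80% of the components, then reconstruct to obtain slightly perturbed copies
n_components = np.clip(int(X.shape[1] * 0.8), 1, X.shape[1])
pca = PCA(n_components=n_components)
X_aug = pca.inverse_transform(pca.fit_transform(X))
```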
244 |
245 | - `use_oof_as_feature`:bool.For the training data, use the `oof_preds` of the previous model as a feature; for the test data, use the previous model's predictions as a feature for the next model.
246 |
247 | - `use_CIR`:bool.Use `CenteredIsotonicRegression` to fit (oof_preds, target) at the end of training.
248 |
249 | - `use_median_as_pred`:bool.Model ensembles usually use the mean as the prediction; this parameter uses the median instead, which sometimes gives slightly better results.
250 |
251 | - `use_scaler`:bool.Although scaling by itself is not useful for GBDT models, after scaling the data a clip operation can be used to remove outliers. `RobustScaler` is used here; a sketch of the idea is shown below.
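A minimal sketch of that idea (the clip bounds are illustrative assumptions, not Yunbase's exact values; `X` is a placeholder feature matrix):

```python
from sklearn.preprocessing import RobustScaler
import numpy as np

# robust-scale ((x - median) / IQR), then clip extreme values to limit outliers
X_scaled = RobustScaler().fit_transform(X)
X_clipped = np.clip(X_scaled, -5, 5)  # clip bounds chosen only for illustration
```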
252 |
253 | - `use_TTA`:bool.Apply the `data augmentation` operation above to the test set as well, then average the predictions (test-time augmentation).
254 |
255 | - `use_eval_metric`:bool.Use `self.metric` to evaluate models during training with lightgbm and xgboost.
256 |
257 | - `feats_stat`:list[tuple]=[].Construct groupby features. For example, when the training data contains some patients and the test data contains other patients, each with multiple samples, this can be used.
258 |
259 |
260 |
261 | ```python
262 | feats_stat=[('patient_id','year',['max','min','median','mean','std','skew',kurtosis,'(x-mean)/std','max-min','mean/std'])]
264 | ```
265 |
266 | - `target_stat`:list[tuple]=[].For example, if you have 100000 male and female samples and find that the average height of males is 168 and of females is 166, you can use `{'male':168,'female':166}` as a new encoding of the sex column: `target_stat=[('sex','height',['mean'])]`.
267 |
268 | For binary variables the effect may not be significant, but for multi-valued categorical variables this encoding expresses an ordering between the categories.
269 |
270 | Common aggregations can be requested with strings, while custom aggregations need to be implemented as functions. Currently only polars is supported.
271 |
272 | ```python
273 | STATS=['min','mean','std','max','median','sum','skew','count','nunique']
274 |
275 | def qp(percentage):
276 | def q(x):
277 | x=x.to_numpy()
278 |         return np.quantile(x,percentage)#np.quantile expects a fraction in [0,1]
279 | return q
280 | [('q0',qp(0.05)),('q1',qp(0.25)),('q3',qp(0.75)),('q4',qp(0.95))]
281 | ```
282 |
283 |
284 |
285 |
286 | - `use_spellchecker`:bool.This is an immature feature that checks for misspelled words in text and corrects them. The main issue is that it takes too long.
287 | - `AGGREGATIONS:list=['nunique','count','min','max','first',
288 | 'last', 'mean','median','sum','std','skew',kurtosis]`.
289 |
290 |
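For instance, a minimal instantiation for a binary-classification task might look like this (a sketch only; the metric choice is an assumption for illustration):

```python
from Yunbase.baseline import Yunbase

yunbase = Yunbase(num_folds=5,
                  objective='binary',
                  metric='auc',
                  num_classes=2,
                  target_col='target',
                  seed=2025,
                  )
```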
291 |
292 | 6.yunbase training
293 |
294 | At present, it supports reading csv or parquet files from a path, as well as DataFrames that have already been loaded. A minimal call is sketched after the parameter notes below.
295 |
296 | ```python
297 | yunbase.fit(train_path_or_file:str|pd.DataFrame|pl.DataFrame='train.csv',
298 | category_cols:list[str]=[],date_cols:list[str]=[],
299 | target2idx:dict|None=None,pseudo_label_weight:float=0.5,
300 | save_trained_models:bool=True,
301 | )
302 | ```
303 |
304 | - `train_path_or_file`:You can use the file path or pass in the already loaded file.
305 | - `category_cols`:You can specify which columns to convert to 'category' in the training data.
306 | - `date_cols`:If a column consists of dates, for example "2024-04-23", this can be used to construct date features.
307 | - `target2idx`:The label-mapping dictionary for classification tasks; for example, to predict a person's gender you can specify `{'Male':0,'Female':1}`. If you do not specify it, labels are mapped to 0, 1, ..., n in order of how often each target value appears.
308 | - `pseudo_label_weight`:When training with pseudo labels, the weight of the test data relative to the training data. For example, if the training data has weight 2 and this is set to 0.5, the test data is trained with weight 1.
309 | - `save_trained_models`:Whether to save the models generated during training. Note that if you need to separate training and inference, you only need to save the yunbase object; the intermediate models do not need to be saved.
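For example, a minimal training call might look like this (a sketch; 'brand' and 'date' are placeholder column names for your own data):

```python
yunbase.fit(train_path_or_file='train.csv',
            category_cols=['brand'],
            date_cols=['date'],
            save_trained_models=True,
            )
```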
310 |
311 | 7.yunbase inference
312 |
313 | ```python
314 | test_preds=yunbase.predict(test_path_or_file:str|pd.DataFrame|pl.DataFrame='test.csv',weights=np.zeros(0))
315 | test_preds=yunbase.predict_proba(test_path_or_file:str|pd.DataFrame|pl.DataFrame='test.csv',weights=np.zeros(0))
316 | ```
317 |
318 | - `weights`:Sets the weights for the model ensemble. For example, if you specified lgb, xgb, and cat, you can set weights to [3,4,3]; the weights are normalized and applied internally.
319 |
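A sketch of such a call (assuming three models were specified, as in the example above):

```python
import numpy as np

# blend three trained models (lgb, xgb, cat) with weights 3:4:3
test_preds = yunbase.predict('test.csv', weights=np.array([3, 4, 3]))
```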
320 | 8.save test_preds to submission.csv
321 |
322 | ```python
323 | yunbase.submit(submission_path_or_file='submission.csv',test_preds=np.ones(3),save_name='yunbase')
324 | ```
325 |
326 | - `save_name`:If you set it to 'submission', you will get a csv file named `submission.csv`.
327 |
328 | 9.ensemble
329 |
330 | ```python
331 | yunbase.ensemble(solution_paths_or_files:list[str]=[],id_col:str='id',target_col:str='',weights=None)
332 | ```
333 |
334 | - For example:
335 |
336 | ```python
337 | solution_paths_or_files=[
338 | 'submission1.csv',
339 | 'submission2.csv',
340 | 'submission3.csv'
341 | ]
342 | weights=[3,3,4]
343 | ```
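Putting these together, the call might look like this (a sketch; the `id` and `target` column names are assumptions about your submission files):

```python
yunbase.ensemble(solution_paths_or_files=solution_paths_or_files,
                 id_col='id', target_col='target', weights=weights)
```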
344 |
345 | 10.If train and inference need to be separated.
346 |
347 | ```python
348 | #model save
349 | yunbase.pickle_dump(yunbase,'yunbase.model')
350 |
351 | import dill#serialize and deserialize objects (such as saving and loading tree models)
352 | def pickle_load(path):
353 | #open path,binary read
354 | with open(path, mode="rb") as f:
355 | data = dill.load(f)
356 | return data
357 | yunbase=Yunbase()
358 | yunbase=pickle_load("yunbase.model")
359 | yunbase.model_save_path=your_model_save_path
360 | ```
361 |
362 | 11.The processed train and test data can be inspected as below.
363 |
364 | ```python
365 | yunbase.train.head(),yunbase.test.head()
366 | ```
367 |
368 | ##### Here is a static version that can be used in Kaggle competitions. You can refer to this notebook to learn how to use Yunbase.
369 |
370 | ## TimeSeries Purged CV
371 |
372 | ```python
373 | yunbase.purged_cross_validation(train_path_or_file:str|pd.DataFrame|pl.DataFrame='train.csv',
374 | test_path_or_file:str|pd.DataFrame|pl.DataFrame='test.csv',
375 | date_col:str='date',train_gap_each_fold:int=31,#one month
376 | train_test_gap:int=7,#a week
377 | train_date_range:int=0,test_date_range:int=0,
378 | category_cols:list[str]=[],
379 | use_seasonal_features:bool=True,
380 | use_weighted_metric:bool=False,
381 | only_inference:bool=False,
382 | timestep:str='day',
383 | target2idx:dict|None=None,
384 | save_trained_models:bool=True,
385 | )
386 | ```
387 |
388 | - `only_inference`:If you don't need the offline time-series CV scores or want to save time, this trains only the final model used for submission.
389 |
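A minimal call might look like this (a sketch; the arguments simply restate the defaults shown above):

```python
yunbase.purged_cross_validation(train_path_or_file='train.csv',
                                test_path_or_file='test.csv',
                                date_col='date',
                                train_gap_each_fold=31,  # one month
                                train_test_gap=7,        # a week
                                )
```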
390 | Demo notebook:Rohlik Yunbase
391 |
392 |
393 |
394 | ## Adversarial Validation
395 |
396 | Demo notebook
397 |
398 | ### follow-up work
399 |
400 | The code now has a rough framework in place and will keep improving: bugs will be fixed and new functions added on top of it.
401 |
402 | In principle, I will fix bugs as I discover them and add new features as I think of them.
403 |
404 | 1.Support fitting on `np.array` inputs (such as `model.fit(train_X,train_y)` and `model.predict(test_X)`).
407 |
408 | 2.add more common `metric`.
409 |
410 | 3.In addition to kfold, implement `single model` training and inference.
411 |
412 | 4.hill climbing to find `blending` weight.
413 |
414 | 5.Optimize `memory` and `time` to cope with larger datasets.`(pandas->polars)`
415 |
416 | 6.Make the code more beautiful, concise, and easy to understand.
417 |
418 | 7.Add checks that raise informative `error messages` for abnormal inputs.
419 |
420 | Waiting for updates.
421 |
422 | Kaggle:https://www.kaggle.com/yunsuxiaozi
423 |
424 |
425 |
426 | ## Some interesting dataset recommendations:
427 |
428 | #### 1.Competitions with significant differences in data distribution between training and testing sets: suitable for learning adversarial validation: Should I eat this mushroom? TFUG Delhi
429 |
430 | #### 2.Treat regression tasks as classification tasks:Regression with a Mohs Hardness Dataset
431 |
432 | #### 3.Treat classification tasks as regression tasks:Child Mind Institute — Problematic Internet Use
433 |
434 | #### 4.A dataset that is almost all noise with only weak signals, useful for learning TargetEncoder: Backpack Prediction Challenge
435 |
436 |
437 |
438 | Due to the large amount of content in this README, some errors may remain even after updates, and `baseline.py` and `README.md` may not always be updated in sync.
439 |
440 | update time:2025/03/27
441 |
442 |
--------------------------------------------------------------------------------
/baseline.py:
--------------------------------------------------------------------------------
1 | """
2 | @author:yunsuxiaozi
3 | @start_time:2024/09/27
4 | @update_time:2025/12/03
5 | """
6 | import polars as pl#similar to pandas, but with better performance when dealing with large datasets.
7 | import pandas as pd#read csv,parquet
8 | import numpy as np#for scientific computation of matrices
9 | from tqdm import tqdm#progress bar
10 | from scipy.stats import kurtosis#calculate kurt
11 | #powerful plot libraries
12 | import matplotlib.pyplot as plt
13 | import seaborn as sns
14 | import swifter# speed up pandas
15 |
16 | #current supported kfold
17 | from sklearn.model_selection import KFold,StratifiedKFold,StratifiedGroupKFold,GroupKFold
18 | #metrics
19 | from sklearn.metrics import roc_auc_score,f1_score,matthews_corrcoef,precision_recall_curve, auc
20 | #models(lgb,xgb,cat,ridge,lr,tabnet)
21 | from sklearn.linear_model import Ridge,LinearRegression,LogisticRegression,Lasso
22 | #fit(oof_preds,target)
23 | from cir_model import CenteredIsotonicRegression
24 | from lightgbm import LGBMRegressor,LGBMClassifier,log_evaluation,early_stopping
25 | from catboost import CatBoostRegressor,CatBoostClassifier
26 | from xgboost import XGBRegressor,XGBClassifier
27 | from pytorch_tabnet.tab_model import TabNetRegressor,TabNetClassifier
28 | import optuna#automatic hyperparameter optimization framework
29 |
30 | import ast#parse Python list strings transform '[a,b,c]' to [a,b,c]
31 | import copy#copy object
32 | import gc#garbage collection
33 | from typing import Literal#The parameters of a function can only have fixed values.
34 | import dill#serialize and deserialize objects (such as saving and loading tree models)
35 | from colorama import Fore, Style #print colorful text
36 | import os#interact with operation system
37 | #deal with tabm's print
38 | import sys
39 | from contextlib import contextmanager
40 |
41 | #deal with text
42 | import re#python's built-in regular expressions.
43 | from spellchecker import SpellChecker# spelling checker library
44 | from unidecode import unidecode#transform unicode to ASCII.
45 | #gene(topic) similarity
46 | from gensim.models import Word2Vec
47 | from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer#word2vec feature
48 | import ftfy#fixes text for you,correct unicode issues.
49 | import nltk #Natural Language toolkit
50 | from nltk.corpus import stopwords#import english stopwords
51 | import emoji#deal with emoji in natural language
52 | from sklearn.preprocessing import RobustScaler#(x-median)/IQR
53 | from sklearn.decomposition import PCA,TruncatedSVD#Truncated Singular Value Decomposition
54 |
55 | import warnings#avoid some negligible errors
56 | #The filterwarnings () method is used to set warning filters, which can control the output method and level of warning information.
57 | warnings.filterwarnings('ignore')
58 |
59 | import random#provide some function to generate random_seed.
60 | #set random seed to make sure results can be reproduced.
61 | def seed_everything(seed):
62 | np.random.seed(seed)#numpy's random seed
63 | random.seed(seed)#python built-in random seed
64 | seed_everything(seed=2025)
65 |
66 | class Yunbase():
67 | def __init__(self,num_folds:int=5,
68 | n_repeats:int=1,
69 | models:list[tuple]=[],
70 | FE=None,
71 | CV_sample=None,
72 | group_col=None,
73 | target_col:str='target',
74 | weight_col:str='weight',
75 | kfold_col:str='fold',
76 | drop_cols:list[str]=[],
77 | seed:int=2025,
78 | objective:Literal['binary','multi_class','regression']='regression',
79 | metric:str='mse',
80 | nan_margin:float=0.95,
81 | num_classes=None,
82 | infer_size:int=10000,
83 | save_oof_preds:bool=True,
84 | save_test_preds:bool=True,
85 | device:str='cpu',
86 | one_hot_max:int=50,
87 | one_hot_cols=None,
88 | custom_metric=None,
89 | use_optuna_find_params:int=0,
90 | optuna_direction=None,
91 | early_stop:int=100,
92 | use_pseudo_label:bool=False,
93 | use_high_corr_feat:bool=True,
94 | cross_cols:list[str]=[],
95 | labelencoder_cols:list[str]=[],
96 | list_stat:list[tuple]=[],
97 | word2vec_models:list[tuple]=[],
98 | text_cols:list[str]=[],
99 | plot_feature_importance:bool=False,
100 | log:int=100,
101 | exp_mode:bool=False,
102 | use_reduce_memory:bool=False,
103 | use_data_augmentation:bool=False,
104 | use_oof_as_feature:bool=False,
105 | use_CIR:bool=False,
106 | use_median_as_pred:bool=False,
107 | use_scaler:bool=False,
108 | use_TTA:bool=False,
109 | use_eval_metric:bool=True,
110 | feats_stat:list[tuple]=[],
111 | target_stat:list[tuple]=[],
112 | targetencoder_with_kfold:bool=False,
113 | use_spellchecker:bool=False,
114 | AGGREGATIONS:list=['nunique','count','min','max','first',
115 | 'last', 'mean','median','sum','std','skew',kurtosis],
116 | )->None:
117 | """
118 | num_folds :the number of folds for k-fold cross validation.
119 | n_repeats :Here,we will modify the random seed of kfold and models to repeat
120 | the cross validation several times.
121 | models :Built in 3 GBDTs as baseline, you can also use custom models,
122 | such as models=[(LGBMRegressor(**lgb_params),'lgb')]
123 |         FE :In addition to the built-in feature engineering, you can also customize your own feature engineering.
124 | CV_sample :This function is for X_train and y_train,sample_weight in cross validation.
125 | In order to make the evaluation metrics of oof as accurate as possible,
126 | this function is not executed for X_valid and y_valid.
127 | You can perform downsampling, upsampling, taking the first 10000 data
128 | points, and other operations you want here, and
129 | ultimately return any X_train or y_train,sample_weight.
130 | group_col :if you want to use groupkfold,then define this group_col.
131 | target_col :the column that you want to predict.
132 | weight_col :When training the model, give each sample a different weight.
133 | If you don't set it, the weight of each sample will default to 1.
134 | kfold_col :You can add the feature 'fold' to the training data, which allows you to
135 | customize your own kfold. The values in this column are [0,1,...,
136 | num_folds-1].
137 | drop_cols :The column to be deleted after all feature engineering is completed.
138 | seed :random seed.
139 | objective :what task do you want to do?regression,binary or multi_class?
140 | metric :metric to evaluate your model.
141 |         nan_margin :when the proportion of missing values in a column is greater than this value, we delete this column.
142 |         num_classes :if objective is multi_class,you should define this parameter.
143 | infer_size :the test data might be large,we can predict in batches.
144 | save_oof_preds :you can save OOF for offline study.
145 | save_test_preds :you can save test_preds.For multi classification tasks,
146 | the predicted result is the category.If you need to save the probability of the test_data,
147 | you can save test_preds.
148 |         device :GBDT can train on GPU; set this parameter to 'gpu' if you want GPU training.
149 | one_hot_max/one_hot_cols
150 | :Perform onehotencoder on features, one considering the numerical value
151 | of nunique and the other customizing features.
152 |
153 | custom_metric :your custom_metric,when objective is multi_class,y_pred in custom(y_true,y_pred) is probability.
154 | use_optuna_find_params:count of use optuna find best params,0 is not use optuna to find params.
155 | Currently only LGBM is supported.
156 | optuna_direction :'minimize' or 'maximize',when you use custom metric,you need to define
157 | the direction of optimization.
158 | early_stop :Common parameters of GBDT.
159 | use_pseudo_label :Whether to use pseudo labels.When it is true,adding the test data
160 | to the training data and training again after obtaining the predicted
161 | results of the test data.
162 | use_high_corr_feat :whether to use high correlation features or not.
163 | cross_cols :Construct features for adding, subtracting, multiplying, and dividing these columns.
164 | labelencoder_cols :Convert categorical string variables into [1,2,……,n].
165 | list_stat :example:[(list_col:str='step_list',list_gap:list[int]=[1,2,4])].
166 | list_col:If the data in a column is a list or str(list),
167 | such as [] or '[]', this can be used to extract diff and
168 | shift features for list_cols.
169 | word2vec_models :Use models such as tfidf to extract features of string columns.
170 | example:word2vec_models=[(TfidfVectorizer(max_features=250,sublinear_tf=True,
171 | ngram_range=(2,3)),col,model_name,use_svd)],
172 | use_svd:use Truncated Singular value decomposition to word2vec features.
173 | text_cols :extract features of words, sentences, and paragraphs from text here.
174 | plot_feature_importance:after model training,whether print feature importance or not
175 |         log :output a validation set score once every 'log' trees when training the GBDT model.
176 | exp_mode :In regression tasks, the distribution of target_col is a long tail distribution,
177 | and this parameter can be used to perform log transform on the target_col.
178 | use_reduce_memory :if use function reduce_mem_usage(),then set this parameter True.
179 | use_data_augmentation :if use data augmentation,During cross validation, the training data
180 | will undergo PCA transformation followed by inverse transformation.
181 | use_oof_as_feature :Train the next model using the oof_preds obtained from the previous
182 | model as features, and the same applies to inference.
183 | use_CIR :use CenteredIsotonicRegression to fit oof_preds and target.
184 |         use_median_as_pred :use median(axis=0) instead of mean(axis=0) when averaging predictions.
185 | use_scaler :use robust scaler to deal with outlier.
186 | use_eval_metric : use 'eval_metric' when training lightgbm or xgboost.
187 | use_TTA :use 'test time augmentation'.It is to use
188 | data augmentation operations in the inference process
189 | feats_stat : (group_col,feature_col,aggregation_list)
190 | example:feats_stat = [ ('id','up_time', ['min', 'max']) ]
191 | target_stat :We can use target's AGGREGATIONS to encode categorical variables.
192 | In order to obtain a reliable CV, this operation is performed separately
193 | for the training set and validation set in cross validation.
194 | example:target_stat = [ (group_col,target_col, aggregation_list) ]
195 | To make it more versatile, you can also use other variables
196 | besides target to encode categorical variables.
197 | targetencoder_with_kfold:The difference between False and True is whether the
198 | training data (train) in cross validation (full=train+valid) uses
199 | the entire training data (train)'s Target Encoder directly,or is
200 | assigned through cross validation in the training data (train=tr+va).
201 | use_spellchecker :use SpellChecker to correct word in text.
202 | AGGREGATIONS :['nunique','count','min','max','first','last',
203 | 'mean','median','sum','std','skew',kurtosis,q1,q3],
204 | """
205 |
206 |         #currently supported metrics
207 | self.reg_metric=['mae','rmse','mse','medae','rmsle','msle','mape','r2','smape',#regression
208 | ]
209 | self.cla_metric=['auc','pr_auc','logloss','f1_score','mcc',#binary metric
210 | 'accuracy','multi_logloss',#multi_class or classification
211 | ]
212 | self.supported_metrics=['custom_metric']+self.reg_metric+self.cla_metric
213 |
214 | #current supported models
215 | #pytabkit refer to :https://www.kaggle.com/competitions/playground-series-s5e8/writeups/2nd-place-yet-another-ensemble
216 | self.supported_models=['lgb','cat','xgb','ridge','Lasso','LinearRegression','LogisticRegression','tabnet',
217 | 'realmlp(pytabkit need install yourself)','tabm(pytabkit need install yourself)',
218 | 'Word2Vec','tfidfvec','countvec',
219 | ]
220 | #current supported kfold.
221 | self.supported_kfolds=['KFold','GroupKFold','StratifiedKFold','StratifiedGroupKFold','purged_CV','custom_kfold']
222 | #current supported objective.
223 | self.supported_objectives=['binary','multi_class','regression']
224 |
225 | print(f"Currently supported metrics:{self.supported_metrics}")
226 | print(f"Currently supported models:{self.supported_models}")
227 | print(f"Currently supported kfolds:{self.supported_kfolds}")
228 | print(f"Currently supported objectives:{self.supported_objectives}")
229 |
230 | self.num_folds=num_folds
231 | self.n_repeats=n_repeats
232 | self.seed=seed
233 | self.models=models
234 | self.target_col=target_col
235 | self.group_col=group_col
236 |
237 | self.FE=FE
238 | self.CV_sample=CV_sample
239 | self.drop_cols=drop_cols
240 |
241 | self.objective=objective.lower()
242 | #binary multi_class,regression
243 | if self.objective not in self.supported_objectives:
244 | raise ValueError("Wrong or currently unsupported objective.")
245 |
246 | self.custom_metric=custom_metric#function
247 | if self.custom_metric!=None:
248 | self.metric=self.custom_metric.__name__.lower()
249 | else:
250 | self.metric=metric.lower()
251 | if self.metric not in self.supported_metrics and self.custom_metric==None:
252 | raise ValueError("Wrong or currently unsupported metric,You can customize the evaluation metrics using 'custom_metric'.")
253 |
254 | self.nan_margin=nan_margin
255 | if self.nan_margin<0 or self.nan_margin>1:
256 | raise ValueError("nan_margin must be within the range of 0 to 1.")
257 | self.infer_size=infer_size
258 | if self.infer_size<=0 or type(self.infer_size) is not int:
259 | raise ValueError("infer size must be greater than 0 and must be int.")
260 |
261 | self.save_oof_preds=save_oof_preds
262 | self.save_test_preds=save_test_preds
263 |
264 | self.num_classes=num_classes
265 | self.device=device.lower()
266 | if (self.objective=='binary') and self.num_classes!=2:
267 | raise ValueError("num_classes must be 2.")
268 | elif (self.objective=='multi_class') and (self.num_classes==None):
269 | raise ValueError("num_classes must be a number(int).")
270 | self.one_hot_max=one_hot_max
271 | self.one_hot_cols=one_hot_cols
272 |
273 | self.use_optuna_find_params=use_optuna_find_params
274 | self.optuna_direction=optuna_direction
275 | self.direction2metric={
276 | 'maximize':['accuracy','auc','pr_auc','f1_score','mcc',#classification
277 | 'r2'#regression
278 | ],
279 | 'minimize':['medae','mape','mae','rmse','mse','rmsle','msle','smape',#regression
280 | 'logloss','multi_logloss'#classification
281 | ]
282 | }
283 |
284 | if (self.custom_metric!=None) and (self.optuna_direction not in ['minimize','maximize']):
285 | raise ValueError("optuna_direction must be 'minimize' or 'maximize'.")
286 | self.early_stop=early_stop
287 | self.test=None#test data will be replaced when call predict function.
288 | self.use_pseudo_label=use_pseudo_label
289 | self.use_high_corr_feat=use_high_corr_feat
290 | self.cross_cols=cross_cols
291 | self.labelencoder_cols=labelencoder_cols
292 | self.list_stat=list_stat
293 | self.list_cols=[l[0] for l in self.list_stat]
294 |
295 | self.word2vec_models=word2vec_models
296 | for i in range(len(self.word2vec_models)):
297 | #default use_svd=False
298 | if len(self.word2vec_models[i])==3:#(model,col,model_name)
299 | tup=self.word2vec_models[i]
300 | self.word2vec_models[i]=(tup[0],tup[1],tup[2],False)
301 |
302 | self.word2vec_cols=[col for (model,col,model_name,use_svd) in self.word2vec_models]#origin cols that need to use in tfidf model.
303 | self.text_cols=text_cols#extract features of words, sentences, and paragraphs from text here.
304 | #to perform only one clean_text operation
305 | self.param_text=list(set(self.word2vec_cols+self.text_cols))
306 |
307 | self.plot_feature_importance=plot_feature_importance
308 | #Due to the presence of special characters in some column names,
309 | #they cannot be directly passed into the LGB model training, so conversion is required
310 | self.log=log
311 | self.exp_mode=exp_mode
312 | #when log transform, it is necessary to ensure that the minimum value of the target is greater than 0.
313 | #so target=target-min_target. b is -min_target.
314 | self.exp_mode_b=0
315 | if (self.objective!='regression') and (self.exp_mode==True):
316 | raise ValueError("exp_mode must be False in classification task.")
317 | self.use_reduce_memory=use_reduce_memory
318 | self.use_data_augmentation=use_data_augmentation
319 | self.use_oof_as_feature=use_oof_as_feature
320 | self.use_CIR=use_CIR
321 | self.use_median_as_pred=use_median_as_pred
322 | self.use_scaler=use_scaler
323 | self.use_eval_metric=use_eval_metric
324 | self.use_TTA=use_TTA
325 | self.use_spellchecker=use_spellchecker
326 | self.targetencoder_with_kfold=targetencoder_with_kfold
327 |
328 | attritubes=['save_oof_preds','save_test_preds','exp_mode',
329 | 'use_reduce_memory','use_data_augmentation','use_scaler',
330 | 'use_oof_as_feature','use_CIR','use_median_as_pred','use_eval_metric',
331 | 'use_spellchecker','use_TTA','targetencoder_with_kfold'
332 | ]
333 | for attr in attritubes:
334 | if getattr(self,attr) not in [True,False]:
335 | raise ValueError(f"{attr} must be True or False.")
336 |
337 | if self.use_oof_as_feature and self.use_pseudo_label:
338 | raise ValueError(f"use_oof_as_feature and use_pseudo_label cannot be both True at the same time.")
339 |
340 | self.feats_stat=feats_stat
341 | self.target_stat=target_stat
342 | #common AGGREGATIONS
343 | self.AGGREGATIONS = AGGREGATIONS
344 |
345 |         #If every inference batch had to reload the trained models from disk, it would
346 |         #increase the running time, so the loaded objects are cached in these dictionaries.
347 | self.trained_models=[]#trained model
348 | self.trained_CIR=[]#trained CIR model
349 | self.trained_le={}
350 | self.trained_wordvec={}
351 | self.trained_svd={}
352 | self.trained_scaler={}
353 | self.trained_TE={}#TargetEncoder
354 | self.onehot_valuecounts={}
355 | #make folder to save model trained.such as GBDT,word2vec.
356 | self.model_save_path="Yunbase_info/"
357 | if not os.path.exists(self.model_save_path):
358 | os.mkdir(self.model_save_path)
359 |
360 | self.eps=1e-15#clip (eps,1-eps) | divide by zero.
361 | self.category_cols=[]
362 | self.high_corr_cols=[]
363 | #to make sure column's dtype in train.csv is same as the column's dtype in test.csv.
364 | #example:https://www.kaggle.com/competitions/home-credit-credit-risk-model-stability
365 | self.col2dtype={}
366 | self.weight_col=weight_col
367 | self.kfold_col=kfold_col
368 |
369 | def get_params(self,):
370 | params_dict={'num_folds':self.num_folds,'n_repeats':self.n_repeats,'models':self.models,
371 | 'group_col':self.group_col,'target_col':self.target_col,'weight_col':self.weight_col,
372 | 'kfold_col':self.kfold_col,'drop_cols':self.drop_cols,'seed':self.seed,'objective':self.objective,
373 | 'metric':self.metric,'nan_margin':self.nan_margin,'num_classes':self.num_classes,
374 | 'infer_size':self.infer_size,'save_oof_preds':self.save_oof_preds,'save_test_preds':self.save_test_preds,
375 | 'device':self.device,'one_hot_max':self.one_hot_max,'one_hot_cols':self.one_hot_cols,
376 | 'custom_metric':self.custom_metric,
377 | 'use_optuna_find_params':self.use_optuna_find_params,'optuna_direction':self.optuna_direction,
378 | 'early_stop':self.early_stop,'use_pseudo_label':self.use_pseudo_label,
379 | 'use_high_corr_feat':self.use_high_corr_feat,'cross_cols':self.cross_cols,
380 | 'labelencoder_cols':self.labelencoder_cols,'list_stat':self.list_stat,
381 | 'word2vec_models':self.word2vec_models, 'text_cols':self.text_cols,
382 | 'plot_feature_importance':self.plot_feature_importance,'log':self.log,
383 | 'exp_mode':self.exp_mode,'use_reduce_memory':self.use_reduce_memory,
384 | 'use_data_augmentation':self.use_data_augmentation,
385 | 'use_oof_as_feature':self.use_oof_as_feature,'use_CIR':self.use_CIR,
386 | 'use_median_as_pred':self.use_median_as_pred,'use_scaler':self.use_scaler,
387 | 'use_TTA':self.use_TTA,'use_eval_metric':self.use_eval_metric,
388 | 'feats_stat':self.feats_stat,'target_stat':self.target_stat,
389 | 'targetencoder_with_kfold':self.targetencoder_with_kfold,
390 | 'use_spellchecker':self.use_spellchecker,'AGGREGATIONS':self.AGGREGATIONS,
391 | 'category_cols':self.category_cols,
392 | }
393 | return params_dict
394 |
395 | #print colorful text
396 | def PrintColor(self,text:str='',color = Fore.BLUE)->None:
397 | print(color + text + Style.RESET_ALL)
398 |
399 | #save models after training
400 | def pickle_dump(self,obj, path:str)->None:
401 | #open path,binary write
402 | with open(path, mode="wb") as f:
403 | dill.dump(obj, f, protocol=4)
404 | #load models when inference
405 | def pickle_load(self,path:str):
406 |         #open path,binary read
407 | with open(path, mode="rb") as f:
408 | data = dill.load(f)
409 | return data
410 |
411 | #reference:https://www.kaggle.com/code/masayakawamata/mic-tabm-baseline
412 | @contextmanager
413 | def suppress_stdout(self,):
414 | with open(os.devnull, "w") as devnull:
415 | old_stdout = sys.stdout
416 | sys.stdout = devnull
417 | try:
418 | yield
419 | finally:
420 | sys.stdout = old_stdout
421 |
422 | #sample AGGREGATIONS
423 | def q1(self,x):
424 | return x.quantile(0.25)
425 | def q3(self,x):
426 | return x.quantile(0.75)
427 |
428 | #Time data cannot use this augmentation,as features such as year, month, and day are discrete variables.
429 | def pca_augmentation(self,X:pd.DataFrame,y=None,target_col:str=''):
430 | if type(y)!=pd.DataFrame:#y=None
431 | origin_data=X.copy()
432 | else:#
433 | origin_data=pd.concat((X,y),axis=1)
434 | n_components=np.clip( int(origin_data.shape[1]*0.8),1,X.shape[1])
435 | pca=PCA(n_components=n_components)
436 | pca_data=pca.fit_transform(origin_data)
437 | aug_data=pca.inverse_transform(pca_data)
438 | aug_data=pd.DataFrame(aug_data)
439 | if type(y)!=pd.DataFrame:#y=None
440 | aug_data.columns=list(X.columns)
441 | else:
442 | aug_data.columns=list(X.columns)+[target_col]
443 | del origin_data,pca,pca_data
444 | gc.collect()
445 |
446 | return aug_data
447 |
448 | #Traverse all columns of df, modify data types to reduce memory usage
449 | def reduce_mem_usage(self,df:pd.DataFrame, float16_as32:bool=True)->pd.DataFrame:
450 |         #memory_usage() gives each column's memory usage in bytes; sum() totals them, then convert B->KB->MB
451 | start_mem = df.memory_usage().sum() / 1024**2
452 | print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
453 | for col in df.columns:
454 | col_type = df[col].dtype
455 | if col_type != object and str(col_type)!='category':#num_col
456 | c_min,c_max = df[col].min(),df[col].max()
457 |                 if str(col_type)[:3] == 'int':#int column, whether int8, int16, int32 or int64
458 |                     #if the value range fits in int8 (-128 to 127), convert the dtype
459 |                     if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
460 |                         df[col] = df[col].astype(np.int8)
461 |                     #if the value range fits in int16 (-32,768 to 32,767), convert the dtype
462 |                     elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
463 |                         df[col] = df[col].astype(np.int16)
464 |                     #if the value range fits in int32 (-2,147,483,648 to 2,147,483,647), convert the dtype
465 |                     elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
466 |                         df[col] = df[col].astype(np.int32)
467 |                     #if the value range fits in int64 (-9,223,372,036,854,775,808 to 9,223,372,036,854,775,807), convert the dtype
468 |                     elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
469 |                         df[col] = df[col].astype(np.int64)
470 |                 else:#float column
471 |                     #if the values fit in float16; use float32 instead if higher precision is needed
472 |                     if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
473 |                         if float16_as32:#choose float32 when higher precision is needed
474 |                             df[col] = df[col].astype(np.float32)
475 |                         else:
476 |                             df[col] = df[col].astype(np.float16)
477 |                     #if the values fit in float32, convert the dtype
478 |                     elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
479 |                         df[col] = df[col].astype(np.float32)
480 |                     #if the values fit in float64, convert the dtype
481 | else:
482 | df[col] = df[col].astype(np.float64)
483 | #calculate memory after optimization
484 | end_mem = df.memory_usage().sum() / 1024**2
485 | print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
486 | print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
487 | return df
488 |
489 | ############text preprocessor
490 | def clean_text(self,text:str='')->str:
491 | ############################## fix text #######################################################
492 | #transform “你好。” to 'NI HAO.'
493 | text = unidecode(text)
494 | #transform emoji to " "+text+" ".
495 | text=emoji.demojize(text,delimiters=(" ", " "))
496 | #correct unicode issues.
497 | text=ftfy.fix_text(text)
498 | #lower example:'Big' and 'big'
499 | text=text.lower()
500 | ############################## remove meaningless text ########################################
501 |         #remove meaningless html tags
502 | html=re.compile(r'<.*?>')
503 | text=html.sub(r'',text)
504 | #remove urls '\w+':(word character,[a-zA-Z0-9_])
505 | #thanks to https://github.com/yunsuxiaozi/Yunbase/issues/1
506 | text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
507 | #remove @mentions such as @yunsuxiaozi
508 | text=re.sub(r"@\w+",'',text)
509 | #drop single characters surrounded by spaces (e.g. ' a '), they are meaningless
510 | text=re.sub(r"\s[a-z]\s",'',text)
511 | #remove number
512 | #text=re.sub("\d+",'',text)
513 | #drop english stopwords,they are meaningless.
514 | english_stopwords = stopwords.words('english')
515 | text_list=text.split(" ")
516 | text_list=[t for t in text_list if t not in english_stopwords]
517 | text=" ".join(text_list)
518 | #drop space front and end.
519 | text=text.strip()
520 | return text
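# Illustrative example (approximate; the exact output depends on the NLTK
# stopword list and library versions):
#   self.clean_text('<b>Hello</b> WORLD @yunsuxiaozi')  ->  roughly 'hello world'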
521 |
522 | def text2word(self,text:str='hello world!'):
523 | return re.split(r'\.|\?|!|\s|\n|,',text)
524 | def text2sentence(self,text:str='hello world!'):
525 | return re.split(r'\.|\?|\!|\n',text)
526 | def text2paragraph(self,text:str='hello world!'):
527 | return text.split("\n")
528 | #3 text readability indices
529 | def ARI(self,text):
530 | characters=len(text)
531 | words=len(self.text2word(text))
532 | sentence=len(self.text2sentence(text))
533 | ari_score=4.71*(characters/words)+0.5*(words/sentence)-21.43
534 | return ari_score
535 | def McAlpine_EFLAW(self,text):
536 | W=len(self.text2word(text))
537 | S=len(self.text2sentence(text))
538 | mcalpine_eflaw_score=(W+S*W)/S
539 | return mcalpine_eflaw_score
540 | def CLRI(self,text):
541 | characters=len(text)
542 | words=len(self.text2word(text))
543 | sentence=len(self.text2sentence(text))
544 | L=100*characters/words
545 | S=100*sentence/words
546 | clri_score=0.0588*L-0.296*S-15.8
547 | return clri_score
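# Notes on the three indices (standard interpretations, for reference):
#   ARI  (Automated Readability Index)  - approximates the US grade level needed to read the text.
#   McAlpine EFLAW                      - readability for non-native English readers; lower is easier.
#   CLRI (Coleman-Liau style index)     - maps characters-per-word and sentences-per-word to a grade level.
# Word and sentence counts come from the simple regex splitters above, so these
# scores are approximations rather than exact textbook values.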
548 |
549 | def text_correct(self,text:str='hello world!'):
550 | spell = SpellChecker()
551 | words = self.text2word(text)
552 | punctuation=['.','?','!',' ','\n',',']
553 | wordssplit=[text[i] for i in range(len(text)) if text[i] in punctuation]
554 | fixed_words=[spell.correction(word) or word for word in words]#correction() can return None; fall back to the original word
555 | error_cnt=sum([1 for i in range(len(words)) if words[i]!=fixed_words[i]])
556 | fixed_text=[]
557 | for i in range(len(wordssplit)):
558 | fixed_text.append(fixed_words[i])
559 | fixed_text.append(wordssplit[i])
560 | fixed_text="".join(fixed_text)
561 | return error_cnt,fixed_text
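# Note: spell.correction() comes from the pyspellchecker package and is called
# once per word, so this can be slow on long documents; text_FE only runs it
# when self.use_spellchecker is enabled.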
562 |
563 | ############text Feature Engineering
564 | def text_FE(self,df:pd.DataFrame,text_col:str='text'):
565 | df['index']=np.arange(len(df))
566 | #correct text
567 | if self.use_spellchecker:
568 | self.PrintColor(f"-> for column {text_col} text correct",color=Fore.YELLOW)
569 | texts=df[text_col].values
570 | error_cnts=np.zeros(len(texts))
571 | for i in tqdm(range(len(texts))):
572 | error_cnts[i],texts[i]=self.text_correct(texts[i])
573 | df[f'{text_col}_error_cnts']=error_cnts
574 |
575 | df[text_col+"_ARI"]=df[text_col].swifter.allow_dask_on_strings(False).apply(lambda x:self.ARI(x))
576 | df[text_col+"_CLRI"]=df[text_col].swifter.allow_dask_on_strings(False).apply(lambda x:self.CLRI(x))
577 | df[text_col+"_McAlpine_EFLAW"]=df[text_col].swifter.allow_dask_on_strings(False).apply(lambda x:self.McAlpine_EFLAW(x))
578 | #count how many pieces the text splits into for each punctuation symbol in ps
579 | ps='!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
580 | for i in range(len(ps)):
581 | df[text_col+f"split_ps{i}_count"]=df[text_col].swifter.allow_dask_on_strings(False).apply(lambda x:len(x.split(ps[i])))
582 |
583 | self.PrintColor(f"-> for column {text_col} word feature",color=Fore.RED)
584 | text_col_word_df=df[['index',text_col]].copy()
585 | #get word_list [index,tcol,word_list]
586 | text_col_word_df[f'{text_col}_word']=text_col_word_df[text_col].swifter.allow_dask_on_strings(False).apply(lambda x:self.text2word(x))
587 | #[index,single_word]
588 | text_col_word_df=text_col_word_df.explode(f'{text_col}_word')[['index',f'{text_col}_word']]
589 | #[index,single_word,single_word_len]
590 | text_col_word_df[f'{text_col}_word_len'] = text_col_word_df[f'{text_col}_word'].swifter.allow_dask_on_strings(False).apply(len)
591 | #data clean [index,single_word,single_word_len]
592 | text_col_word_df=text_col_word_df[text_col_word_df[f'{text_col}_word_len']!=0]
593 | #for word features, take the length difference between consecutive words within each text.
594 | group_cols=[f'{text_col}_word_len']
595 | for gap in [1]:
596 | for col in [f'{text_col}_word_len']:
597 | text_col_word_df[f'{col}_diff{gap}']=text_col_word_df.groupby(['index'])[col].diff(gap)
598 | group_cols.append(f'{col}_diff{gap}')
599 | text_col_word_agg_df = text_col_word_df[['index']+group_cols].groupby(['index']).agg(self.AGGREGATIONS)
600 | text_col_word_agg_df.columns = ['_'.join(x) for x in text_col_word_agg_df.columns]
601 | df=df.merge(text_col_word_agg_df,on='index',how='left')
602 |
603 | self.PrintColor(f"-> for column {text_col} sentence feature",color=Fore.RED)
604 | text_col_sent_df=df[['index',text_col]].copy()
605 | #get sent_list [index,tcol,sent_list]
606 | text_col_sent_df[f'{text_col}_sent']=text_col_sent_df[text_col].swifter.allow_dask_on_strings(False).apply(lambda x: self.text2sentence(x))
607 | #[index,single_sent]
608 | text_col_sent_df=text_col_sent_df.explode(f'{text_col}_sent')[['index',f'{text_col}_sent']]
609 | #[index,single_sent,single_sent_len]
610 | text_col_sent_df[f'{text_col}_sent_len'] = text_col_sent_df[f'{text_col}_sent'].swifter.allow_dask_on_strings(False).apply(len)
611 | text_col_sent_df[f'{text_col}_sent_word_count'] = text_col_sent_df[f'{text_col}_sent'].swifter.allow_dask_on_strings(False).apply(lambda x:len(re.split('\\ |\\,',x)))
612 | #data clean [index,single_sent,single_sent_len]
613 | group_cols=[f'{text_col}_sent_len',f'{text_col}_sent_word_count']
614 | for gcol in group_cols:
615 | text_col_sent_df=text_col_sent_df[text_col_sent_df[gcol]!=0]
616 | #for sentence features, take the length difference between consecutive sentences within each text.
617 | for gap in [1]:
618 | for col in [f'{text_col}_sent_len',f'{text_col}_sent_word_count']:
619 | text_col_sent_df[f'{col}_diff{gap}']=text_col_sent_df.groupby(['index'])[col].diff(gap)
620 | group_cols.append(f'{col}_diff{gap}')
621 | text_col_sent_agg_df = text_col_sent_df[['index']+group_cols].groupby(['index']).agg(self.AGGREGATIONS)
622 | text_col_sent_agg_df.columns = ['_'.join(x) for x in text_col_sent_agg_df.columns]
623 | df=df.merge(text_col_sent_agg_df,on='index',how='left')
624 |
625 | self.PrintColor(f"-> for column {text_col} paragraph feature",color=Fore.RED)
626 | text_col_para_df=df[['index',text_col]].copy()
627 | #get para_list [index,tcol,para_list]
628 | text_col_para_df[f'{text_col}_para']=text_col_para_df[text_col].swifter.allow_dask_on_strings(False).apply(lambda x: self.text2paragraph(x))
629 | #[index,single_para]
630 | text_col_para_df=text_col_para_df.explode(f'{text_col}_para')[['index',f'{text_col}_para']]
631 | text_col_para_df[f'{text_col}_para_len'] = text_col_para_df[f'{text_col}_para'].swifter.allow_dask_on_strings(False).apply(len)
632 | text_col_para_df[f'{text_col}_para_sent_count'] = text_col_para_df[f'{text_col}_para'].swifter.allow_dask_on_strings(False).apply(lambda x: len(re.split('\\.|\\?|\\!',x)))
633 | text_col_para_df[f'{text_col}_para_word_count'] = text_col_para_df[f'{text_col}_para'].swifter.allow_dask_on_strings(False).apply(lambda x: len(re.split('\\.|\\?|\\!|\\ |\\,',x)))#split on '.', '?', '!', space or comma to count words
634 | #data clean [index,single_para,single_para_len]
635 | group_cols=[f'{text_col}_para_len',f'{text_col}_para_sent_count',f'{text_col}_para_word_count']
636 | for gcol in group_cols:
637 | text_col_para_df=text_col_para_df[text_col_para_df[gcol]!=0]
638 | #for paragraph features, take the length difference between consecutive paragraphs within each text.
639 | for gap in [1]:
640 | for col in [f'{text_col}_para_len',f'{text_col}_para_sent_count',f'{text_col}_para_word_count']:
641 | text_col_para_df[f'{col}_diff{gap}']=text_col_para_df.groupby(['index'])[col].diff(gap)
642 | group_cols.append(f'{col}_diff{gap}')
643 | text_col_para_agg_df = text_col_para_df[['index']+group_cols].groupby(['index']).agg(self.AGGREGATIONS)
644 | text_col_para_agg_df.columns = ['_'.join(x) for x in text_col_para_agg_df.columns]
645 | df=df.merge(text_col_para_agg_df,on='index',how='left')
646 | df.drop(['index'],axis=1,inplace=True)
647 | return df
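# text_FE therefore produces, per text column: an optional spelling-error count,
# three readability scores, per-punctuation split counts, and statistics
# (via self.AGGREGATIONS) over word/sentence/paragraph lengths and their
# consecutive differences, all merged back onto the original rows by 'index'.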
648 |
649 | #basic Feature Engineering. mode is 'train' or 'test'; drop_cols lists any extra columns you want to drop.
650 | def base_FE(self,df:pd.DataFrame,mode:str='train',drop_cols:list[str]=[])->pd.DataFrame:
651 | if self.FE!=None:
652 | #apply your custom feature engineering function first
653 | try:#pandas FE
654 | df=self.FE(df)
655 | except:#polars FE
656 | df=pl.from_pandas(df)
657 | df=self.FE(df)
658 | df=df.to_pandas()
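# A minimal sketch of a custom FE callable (hypothetical column names, for
# illustration only); the try/except above calls it with a pandas DataFrame
# first and retries with a polars DataFrame if that raises:
#   def my_FE(df):
#       df['price_per_area'] = df['price'] / df['area']   # assumed columns
#       return df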
659 |
660 | #clean text
661 | for pt_col in tqdm(self.param_text):
662 | self.PrintColor(f"-> for column {pt_col} text clean",color=Fore.YELLOW)
663 | df[pt_col]=(df[pt_col].fillna('nan'))
664 | if df[pt_col].nunique()>0.5*len(df):
665 | df[pt_col]=df[pt_col].swifter.allow_dask_on_strings(False).apply(lambda x:self.clean_text(x))
666 | else:#use a dict so each unique value is cleaned only once, saving time.
667 | text2clean={}
668 | for text in df[pt_col].unique():
669 | text2clean[text]=self.clean_text(text)
670 | df[pt_col]=df[pt_col].swifter.allow_dask_on_strings(False).apply(lambda x:text2clean.get(x,'nan'))
671 | del text2clean
672 | gc.collect()
673 |
674 | #text feature extraction at word, sentence and paragraph level.
675 | #This has to run early because it can create columns (e.g. with nunique==1, or of object dtype)
676 | #that must be dropped, so it is placed before the code that detects such columns.
677 | if len(self.text_cols):
678 | print("< text column's feature >")
679 | for tcol in self.text_cols:
680 | #category text
681 | if df[tcol].nunique()<0.5*len(df):
682 | text_map_df=pd.DataFrame({tcol:df[tcol].unique()})
683 | text_agg_df=self.text_FE(text_map_df,tcol)
684 | df=df.merge(text_agg_df,on=tcol,how='left')
685 | else:
686 | df=self.text_FE(df,tcol)
687 |
688 | if mode=='train':
689 | #missing value
690 | self.nan_cols=[col for col in df.columns if df[col].isna().mean()>self.nan_margin]
691 |
692 | #nunique=1
693 | self.unique_cols=[]
694 | for col in df.drop(self.drop_cols+self.list_cols+\
695 | [self.weight_col,self.group_col,self.target_col,self.kfold_col],axis=1,errors='ignore').columns:
696 | if(df[col].nunique()<2):#the column holds a single value (or only np.nan)
697 | self.unique_cols.append(col)
698 | #the most frequent value covers almost the entire column
699 | elif len(list(df[col].value_counts().to_dict().items()))>0:
700 | if list(df[col].value_counts().to_dict().items())[0][1]>=len(df)*0.99:
701 | self.unique_cols.append(col)
702 | #numeric columns with a very low coefficient of variation (std/mean < 0.01)
703 | elif (df[col].dtype!=object) and (df[col].std()/df[col].mean()<0.01):
704 | self.unique_cols.append(col)
705 |
706 | #object dtype
707 | self.object_cols=[col for col in df.drop(self.drop_cols+self.category_cols,axis=1,errors='ignore').columns if (df[col].dtype==object) and (col not in [self.group_col,self.target_col])]
708 | ##### one_hot_encoder
709 | if self.one_hot_cols==None:
710 | self.nunique3_cols=[]
711 | self.nunique2_cols=[]
712 | for col in df.drop(
713 | [self.target_col,self.group_col,self.weight_col,self.kfold_col]+\
714 | self.list_cols+self.drop_cols
715 | ,axis=1,errors='ignore'
716 | ).columns:
717 | if (df[col].dtype==object) and not (
718 | #columns such as sin_month/cos_month have already been cyclically encoded, skip them.
719 | col.startswith('sin') or col.startswith('cos') or
720 | #aggregation features don't need one-hot encoding.
721 | col.endswith('_nunique') or col.endswith('_count') or
722 | col.endswith('_min') or col.endswith('_max') or
723 | col.endswith('_first') or col.endswith('_last') or
724 | col.endswith('_mean') or col.endswith('_median') or
725 | col.endswith('_sum') or col.endswith('_std') or col.endswith('_skew') or
726 | #q0:0.05,q1:0.25,q2:0.5,q3:0.75,q4:0.95
727 | col.endswith('_kurtosis') or col.endswith('_q0') or col.endswith('_q1') or
728 | col.endswith('_q2') or col.endswith('_q3') or col.endswith('_q4')
729 | ):
730 | if (df[col].nunique()