├── __init__.py
├── yunbase.png
├── star-history-2025223.png
├── requirements.txt
├── LICENSE
├── README.md
└── baseline.py
/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/yunbase.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yunsuxiaozi/Yunbase/main/yunbase.png
--------------------------------------------------------------------------------
/star-history-2025223.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yunsuxiaozi/Yunbase/main/star-history-2025223.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 |
2 | polars
3 | pandas
4 | numpy
5 | swifter==1.3.2
6 | tqdm
7 | scikit-learn==1.2.2
8 | lightgbm
9 | catboost>=1.2.8,<1.3.0
10 | xgboost
11 | dill
12 | optuna
13 | colorama
14 | regex
15 | unidecode
16 | gensim==4.3.3
17 | scipy==1.13.0
18 | ftfy
19 | nltk
20 | emoji
21 | pytorch_tabnet
22 | matplotlib==3.7.2
23 | seaborn
24 | cir-model
25 | pyspellchecker
26 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 🚀Yunbase, the first submission of your algorithm competition
2 |
3 |
4 |
5 | In data mining competitions, many operations have to be repeated every time, from data preprocessing to k-fold cross validation. Writing the same code over and over is tedious, so I extracted the common parts of these operations into the Yunbase class. ('Yun' comes from my name, yunsuxiaozi, and 'base' stands for the baseline of a competition.)
6 |
7 |
8 |
9 | ### Get Started Quickly
10 |
11 | 1.git clone
12 |
13 | ```python
14 | !git clone https://github.com/yunsuxiaozi/Yunbase.git
15 | ```
16 |
17 | 2.download the wheels listed in requirements.txt
18 |
19 | ```python
20 | !pip download -r Yunbase/requirements.txt
21 | ```
22 |
23 | 3.install according to requirements.txt
24 |
25 | ```python
26 | !pip install -q --requirement yourpath/Yunbase/requirements.txt \
27 | --no-index --find-links file:yourpath
28 | ```
29 |
30 | 4.import Yunbase
31 |
32 | ```python
33 | from Yunbase.baseline import Yunbase
34 | ```
35 |
36 | 5.create Yunbase.
37 |
38 | All the parameters are listed below; you can choose them flexibly according to your task. A minimal instantiation sketch follows the parameter descriptions.
39 |
40 | ```python
41 | yunbase=Yunbase(num_folds:int=5,
42 | n_repeats:int=1,
43 | models:list[tuple]=[],
44 | FE=None,
45 | CV_sample=None,
46 | group_col=None,
47 | target_col:str='target',
48 | weight_col:str='weight',
49 | kfold_col:str='fold',
50 | drop_cols:list[str]=[],
51 | seed:int=2025,
52 | objective:Literal['binary','multi_class','regression']='regression',
53 | metric:str='mse',
54 | nan_margin:float=0.95,
55 | num_classes=None,
56 | infer_size:int=10000,
57 | save_oof_preds:bool=True,
58 | save_test_preds:bool=True,
59 | device:str='cpu',
60 | one_hot_max:int=50,
61 | one_hot_cols=None,
62 | custom_metric=None,
63 | use_optuna_find_params:int=0,
64 | optuna_direction=None,
65 | early_stop:int=100,
66 | use_pseudo_label:bool=False,
67 | use_high_corr_feat:bool=True,
68 | cross_cols:list[str]=[],
69 | labelencoder_cols:list[str]=[],
70 | list_stat:list[tuple]=[],
71 | word2vec_models:list[tuple]=[],
72 | text_cols:list[str]=[],
73 | plot_feature_importance:bool=False,
74 | log:int=100,
75 | exp_mode:bool=False,
76 | use_reduce_memory:bool=False,
77 | use_data_augmentation:bool=False,
78 | use_oof_as_feature:bool=False,
79 | use_CIR:bool=False,
80 | use_median_as_pred:bool=False,
81 | use_scaler:bool=False,
82 | use_TTA:bool=False,
83 | use_eval_metric:bool=True,
84 | feats_stat:list[tuple]=[],
85 | target_stat:list[tuple]=[],
86 | use_spellchecker:bool=False,
87 | AGGREGATIONS:list=['nunique','count','min','max','first',
88 | 'last', 'mean','median','sum','std','skew',kurtosis],
89 | )
90 | ```
91 |
92 | - `num_folds`:int.the number of folds for k-fold cross validation.
93 |
94 | - `n_repeats`:int.Repeat the k-fold cross validation several times with different random seeds. This parameter is mainly useful for small datasets, to make the model evaluation more stable.
95 |
96 | - `models`:list[tuple].Three GBDTs are built in as the baseline; you can also use custom models, such as
97 |
98 | ```python
99 | models=[(LGBMRegressor(**lgb_params),'lgb')]
100 | ```
101 |
102 |
103 |
104 | - `FE`:function.In addition to the built-in feature engineering, you can also define your own feature engineering function. For example:
105 |
106 | ```python
107 | def FE(df):
108 | return df.drop(['id'],axis=1)
109 | ```
110 |
111 | Currently, both polars and pandas are supported for writing this function.
112 |
113 |
114 | - `CV_sample`:function.You can customize your downsampling and oversampling operations here. To keep the CV score accurate, operations on the validation set should in principle be avoided; however, to cover personalized needs, they are still allowed. Besides sampling operations, related feature engineering can also be customized in this function.
115 |
116 | For example:
117 |
118 | ```python
119 | def CV_sample(X_train,y_train,X_valid,y_valid,
120 | sample_weight_train,sample_weight_valid):
121 | less_idx=list(np.where(y_train==1)[0])
122 | more_idx=list(np.where(y_train==0)[0])
123 | np.random.shuffle(more_idx)
124 | #undersample
125 | more_idx=more_idx[:int(len(more_idx)*0.9)]
126 | #Adversarial learning
127 | X_train_copy=X_train.iloc[less_idx].copy()
128 | y_train_copy=y_train.iloc[less_idx].copy()
129 | y_train_copy[:]=0
130 | sample_weight_train_copy=sample_weight_train.iloc[less_idx].copy()
131 |
132 | X_train=pd.concat((X_train.iloc[more_idx+less_idx],X_train_copy)).reset_index(drop=True)
133 | y_train=pd.concat((y_train.iloc[more_idx+less_idx],y_train_copy)).reset_index(drop=True)
134 | sample_weight_train=pd.concat((sample_weight_train.iloc[more_idx+less_idx],sample_weight_train_copy)).reset_index(drop=True)
135 | return X_train,y_train,X_valid,y_valid,sample_weight_train,sample_weight_valid
136 | ```
137 |
138 |
139 | In purged CV (time series CV), there is no validation set (the goal is to make the training set as close to the test set as possible), so this function becomes:
140 |
141 | ```python
142 | def CV_sample(X_train,y_train,sample_weight_train):
143 | #your code
144 | return X_train,y_train,sample_weight_train
145 | ```
146 |
147 |
148 |
149 |
150 | - `group_col`:str.If you want to use GroupKFold, define this column.
151 |
152 | - `target_col`:str.the column that you want to predict.
153 |
154 | - `weight_col`:str.You can set a weight for each sample before model training. If this column is not provided, every sample defaults to a weight of 1.
155 |
156 | ```python
157 | train['weight']=np.array([0.1,0.3,……,0.2])
158 | ```
159 |
160 |
161 |
162 | - `kfold_col`:str.Allows users to customize the k-fold split. For example,
163 |
164 | ```python
165 | num_folds=5
166 | train['fold']=train.index%num_folds
167 | ```
168 |
169 |
170 |
171 | - `drop_cols`:list.The columns to be deleted after all feature engineering is completed.
172 |
173 | - `seed`:int.random seed.
174 |
175 | - `objective`:str.what task do you want to do?regression,binary or multi_class?
176 |
177 | - `metric`:str.metric to evaluate your model.
178 |
179 | - `nan_margin`:float.When the proportion of missing values in a column is greater than this value, the column is dropped.
180 |
181 | - `num_classes`:int.If the objective is multi_class or binary, you should define this parameter.
182 |
183 | - `infer_size`:int.The test data might be large; predicting in batches of this size avoids memory issues. A sketch of the batching idea follows.
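A minimal sketch of the batching idea (Yunbase does this internally; `model` and `test` below are placeholders, not Yunbase attributes):

```python
import numpy as np

# hypothetical illustration of predicting in batches of infer_size rows
infer_size = 10000
preds = np.concatenate([
    model.predict(test.iloc[i:i + infer_size])  # `model` and `test` are placeholders
    for i in range(0, len(test), infer_size)
])
```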
184 |
185 | - `save_oof_preds`:bool.you can save OOF for your own offline study.
186 |
187 | - `save_test_preds`:bool.you can save test_preds for your own offline study.
188 |
189 | - `device`:str.GBDTs can train on GPU; set this parameter to 'gpu' if you want GPU training.
190 |
191 | - `one_hot_max`:int.If the nunique of a column is less than this value, one-hot encoding is applied to it.
192 |
193 | - `one_hot_cols`:list[str].Customize which columns to one-hot encode.
194 |
195 | - `custom_metric`:function.you can define your own custom_metric.
196 |
197 | ```python
198 | def weighted_MAE(y_true,y_pred,
199 | weight=train['weight'].values):
200 | return np.sum(weight*np.abs(y_true-y_pred))/np.sum(weight)
201 | ```
202 |
203 | 1.custom_metric only receives the parameters y_true and y_pred. For regular cross validation, any extra argument (like the weight parameter above) needs to be bound in advance as a default value. For time series CV, the use_weighted_metric parameter can be used instead, without defining the weight parameter yourself.
204 |
205 | 2.When the objective is multi_class, `y_pred` in `custom_metric(y_true,y_pred)` is a probability matrix (shape: `(len(y_true),num_classes)`).
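For example, a minimal sketch of a multi-class custom metric that accepts such probabilities (this particular metric is only an illustration, not something Yunbase requires):

```python
from sklearn.metrics import log_loss

# illustrative custom metric for a multi_class objective:
# y_pred arrives as probabilities with shape (len(y_true), num_classes)
def my_multi_logloss(y_true, y_pred):
    return log_loss(y_true, y_pred)
```

Remember that whenever a custom metric is used, `optuna_direction` must also be set ('minimize' for a loss like this one).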
206 |
207 | - `use_optuna_find_params`:int.The number of optuna trials used to search for the best parameters; 0 means optuna is not used. Currently only LGBM is supported.
208 |
209 | - `optuna_direction`:str.`minimize` or `maximize`. When you use a custom metric, you must define the direction of optimization.
210 |
211 | - `early_stop`:int.If the validation score does not improve for this many rounds, training stops early.
212 |
213 | - `use_pseudo_label`:bool.Whether to use pseudo labels. When True, after the test predictions are obtained, the test data is added to the training data and the model is trained again. To keep the CV reliable, the test set is concatenated only with the cross-validation training folds and is still evaluated on the untouched validation fold.
214 |
215 | - `use_high_corr_feat`:bool.whether to use high correlation features or not.
216 |
217 | - `cross_cols`:list[str].Brute-force construct cross features from these columns using addition, subtraction, multiplication, and division.
218 |
219 | - `labelencoder_cols`:list.Convert categorical string variables into [1,2,……,n].
220 |
221 | - `list_stat`:list[tuple]=[].Example: `[('step_list',[1,2,4])]`, i.e. `(list_col, list_gap)`. If the data in a column is a list or a str(list), such as `[]` or `'[]'`, this can be used to extract diff and shift features for those list columns.
224 |
225 | - `word2vec_models`:list[tuple].Use models such as tfidf to extract features of string columns.For example:
226 |
227 | ```python
228 | word2vec_models=[(TfidfVectorizer(),col,'tfidf',False)]#(model, col, model_name, use_svd)
229 | ```
230 |
231 |
232 |
233 | - `text_cols`:list[str].Extract word-, sentence-, and paragraph-level features from these text columns.
234 |
235 | - `plot_feature_importance`:bool.Whether to plot feature importance after model training.
236 |
237 | - `log`:int.Output the validation-set score once every this many boosting iterations.
238 |
239 | - `exp_mode`:bool.In regression tasks where the distribution of `target_col` is long-tailed, this parameter can be used to apply a log transform to target_col before training. See the sketch below.
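A rough illustration of the idea, based on the comments in `baseline.py` (the exact implementation may differ; `train` and `target_col` are placeholders):

```python
import numpy as np

# shift the target so its minimum is 0, then train on the log of the shifted target
b = -train[target_col].min()
train[target_col] = np.log1p(train[target_col] + b)
# after prediction, the inverse would be roughly: preds = np.expm1(preds) - b
```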
240 |
241 | - `use_reduce_memory`:bool.When facing large datasets, this function can be used to reduce memory.
242 |
243 | - `use_data_augmentation`:bool.If data augmentation is used, during cross validation the training data will undergo a PCA transformation followed by the inverse transformation. See the function `pca_augmentation` for details, and the sketch below.
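The core idea, as a minimal sketch mirroring `pca_augmentation` in `baseline.py` (`X` is a placeholder DataFrame of training features):

```python
from sklearn.decomposition import PCA
import numpy as np

# project onto ~80% of the components, then reconstruct to obtain slightly perturbed copies
n_components = np.clip(int(X.shape[1] * 0.8), 1, X.shape[1])
pca = PCA(n_components=n_components)
X_aug = pca.inverse_transform(pca.fit_transform(X))
```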
244 |
245 | - `use_oof_as_feature`:bool.For the training data, use the `oof_preds` of the previous model as a feature; for the test data, use the previous model's predictions as a feature for the next model.
246 |
247 | - `use_CIR`:bool.Use `CenteredIsotonicRegression` to fit (oof_preds, target) at the end of training.
248 |
249 | - `use_median_as_pred`:bool.Model ensembles usually use the mean as the prediction; this parameter uses the median instead, which sometimes gives slightly better results.
250 |
251 | - `use_scaler`:bool.Although scaling by itself is not useful for GBDT models, after scaling the data a clip operation can be used to remove outliers. `RobustScaler` is used here; a sketch of the idea is shown below.
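A minimal sketch of that idea (the clip bounds are illustrative assumptions, not Yunbase's exact values; `X` is a placeholder feature matrix):

```python
from sklearn.preprocessing import RobustScaler
import numpy as np

# robust-scale ((x - median) / IQR), then clip extreme values to limit outliers
X_scaled = RobustScaler().fit_transform(X)
X_clipped = np.clip(X_scaled, -5, 5)  # clip bounds chosen only for illustration
```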
252 |
253 | - `use_TTA`:bool.Apply the `data augmentation` operation above to the test set as well, then average the predictions (test-time augmentation).
254 |
255 | - `use_eval_metric`:bool.Use `self.metric` to evaluate models during training with lightgbm and xgboost.
256 |
257 | - `feats_stat`:list[tuple]=[].Construct groupby features. For example, when the training data contains some patients and the test data contains other patients, each with multiple samples, this can be used.
258 |
259 |
260 |
261 | ```python
262 | feats_stat=[('patient_id','year',['max','min','median','mean','std','skew',kurtosis,'(x-mean)/std','max-min','mean/std'])]
264 | ```
265 |
266 | - `target_stat`:list[tuple]=[].For example, if you have 100000 male and female samples and find that the average height of males is 168 and of females is 166, you can use `{'male':168,'female':166}` as a new encoding of the sex column: `target_stat=[('sex','height',['mean'])]`.
267 |
268 | For binary variables the effect may not be significant, but for multi-valued categorical variables this encoding expresses an ordering between the categories.
269 |
270 | Common aggregations can be requested with strings, while custom aggregations need to be implemented as functions. Currently only polars is supported.
271 |
272 | ```python
273 | STATS=['min','mean','std','max','median','sum','skew','count','nunique']
274 |
275 | def qp(percentage):
276 | def q(x):
277 | x=x.to_numpy()
278 |         return np.quantile(x,percentage)#np.quantile expects a fraction in [0,1]
279 | return q
280 | [('q0',qp(0.05)),('q1',qp(0.25)),('q3',qp(0.75)),('q4',qp(0.95))]
281 | ```
282 |
283 |
284 |
285 |
286 | - `use_spellchecker`:bool.This is an immature feature that checks for misspelled words in text and corrects them. The main issue is that it takes too long.
287 | - `AGGREGATIONS:list=['nunique','count','min','max','first',
288 | 'last', 'mean','median','sum','std','skew',kurtosis]`.
289 |
290 |
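For instance, a minimal instantiation for a binary-classification task might look like this (a sketch only; the metric choice is an assumption for illustration):

```python
from Yunbase.baseline import Yunbase

yunbase = Yunbase(num_folds=5,
                  objective='binary',
                  metric='auc',
                  num_classes=2,
                  target_col='target',
                  seed=2025,
                  )
```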
291 |
292 | 6.yunbase training
293 |
294 | At present, it supports reading csv or parquet files from a path, as well as DataFrames that have already been loaded. A minimal call is sketched after the parameter notes below.
295 |
296 | ```python
297 | yunbase.fit(train_path_or_file:str|pd.DataFrame|pl.DataFrame='train.csv',
298 | category_cols:list[str]=[],date_cols:list[str]=[],
299 | target2idx:dict|None=None,pseudo_label_weight:float=0.5,
300 | save_trained_models:bool=True,
301 | )
302 | ```
303 |
304 | - `train_path_or_file`:You can use the file path or pass in the already loaded file.
305 | - `category_cols`:You can specify which columns to convert to 'category' in the training data.
306 | - `date_cols`:If a column consists of dates, for example "2024-04-23", this can be used to construct date features.
307 | - `target2idx`:The label-mapping dictionary for classification tasks; for example, to predict a person's gender you can specify `{'Male':0,'Female':1}`. If you do not specify it, labels are mapped to 0, 1, ..., n in order of how often each target value appears.
308 | - `pseudo_label_weight`:When training with pseudo labels, the weight of the test data relative to the training data. For example, if the training data has weight 2 and this is set to 0.5, the test data is trained with weight 1.
309 | - `save_trained_models`:Whether to save the models generated during training. Note that if you need to separate training and inference, you only need to save the yunbase object; the intermediate models do not need to be saved.
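For example, a minimal training call might look like this (a sketch; 'brand' and 'date' are placeholder column names for your own data):

```python
yunbase.fit(train_path_or_file='train.csv',
            category_cols=['brand'],
            date_cols=['date'],
            save_trained_models=True,
            )
```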
310 |
311 | 7.yunbase inference
312 |
313 | ```python
314 | test_preds=yunbase.predict(test_path_or_file:str|pd.DataFrame|pl.DataFrame='test.csv',weights=np.zeros(0))
315 | test_preds=yunbase.predict_proba(test_path_or_file:str|pd.DataFrame|pl.DataFrame='test.csv',weights=np.zeros(0))
316 | ```
317 |
318 | - `weights`:Sets the weights for the model ensemble. For example, if you specified lgb, xgb, and cat, you can set weights to [3,4,3]; the weights are normalized and applied internally.
319 |
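A sketch of such a call (assuming three models were specified, as in the example above):

```python
import numpy as np

# blend three trained models (lgb, xgb, cat) with weights 3:4:3
test_preds = yunbase.predict('test.csv', weights=np.array([3, 4, 3]))
```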
320 | 8.save test_preds to submission.csv
321 |
322 | ```python
323 | yunbase.submit(submission_path_or_file='submission.csv',test_preds=np.ones(3),save_name='yunbase')
324 | ```
325 |
326 | - `save_name`:If you set it to 'submission', you will get a csv file named `submission.csv`.
327 |
328 | 9.ensemble
329 |
330 | ```python
331 | yunbase.ensemble(solution_paths_or_files:list[str]=[],id_col:str='id',target_col:str='',weights=None)
332 | ```
333 |
334 | - For example:
335 |
336 | ```python
337 | solution_paths_or_files=[
338 | 'submission1.csv',
339 | 'submission2.csv',
340 | 'submission3.csv'
341 | ]
342 | weights=[3,3,4]
343 | ```
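Putting these together, the call might look like this (a sketch; the `id` and `target` column names are assumptions about your submission files):

```python
yunbase.ensemble(solution_paths_or_files=solution_paths_or_files,
                 id_col='id', target_col='target', weights=weights)
```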
344 |
345 | 10.If train and inference need to be separated.
346 |
347 | ```python
348 | #model save
349 | yunbase.pickle_dump(yunbase,'yunbase.model')
350 |
351 | import dill#serialize and deserialize objects (such as saving and loading tree models)
352 | def pickle_load(path):
353 | #open path,binary read
354 | with open(path, mode="rb") as f:
355 | data = dill.load(f)
356 | return data
357 | yunbase=Yunbase()
358 | yunbase=pickle_load("yunbase.model")
359 | yunbase.model_save_path=your_model_save_path
360 | ```
361 |
362 | 11.The processed train and test data can be inspected as below.
363 |
364 | ```python
365 | yunbase.train.head(),yunbase.test.head()
366 | ```
367 |
368 | ##### Here is a static version that can be used in Kaggle competitions. You can refer to this notebook to learn how to use Yunbase.
369 |
370 | ## TimeSeries Purged CV
371 |
372 | ```python
373 | yunbase.purged_cross_validation(train_path_or_file:str|pd.DataFrame|pl.DataFrame='train.csv',
374 | test_path_or_file:str|pd.DataFrame|pl.DataFrame='test.csv',
375 | date_col:str='date',train_gap_each_fold:int=31,#one month
376 | train_test_gap:int=7,#a week
377 | train_date_range:int=0,test_date_range:int=0,
378 | category_cols:list[str]=[],
379 | use_seasonal_features:bool=True,
380 | use_weighted_metric:bool=False,
381 | only_inference:bool=False,
382 | timestep:str='day',
383 | target2idx:dict|None=None,
384 | save_trained_models:bool=True,
385 | )
386 | ```
387 |
388 | - `only_inference`:If you don't need the offline time-series CV scores or want to save time, this trains only the final model used for submission.
389 |
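A minimal call might look like this (a sketch; the arguments simply restate the defaults shown above):

```python
yunbase.purged_cross_validation(train_path_or_file='train.csv',
                                test_path_or_file='test.csv',
                                date_col='date',
                                train_gap_each_fold=31,  # one month
                                train_test_gap=7,        # a week
                                )
```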
390 | Demo notebook:Rohlik Yunbase
391 |
392 |
393 |
394 | ## Adversarial Validation
395 |
396 | Demo notebook
397 |
398 | ### follow-up work
399 |
400 | The code now has a rough framework in place and will keep improving: bugs will be fixed and new functions added on top of it.
401 |
402 | In principle, I will fix bugs as I discover them and add new features as I think of them.
403 |
404 | 1.Support fitting on `np.array` inputs (such as `model.fit(train_X,train_y)` and `model.predict(test_X)`).
407 |
408 | 2.add more common `metric`.
409 |
410 | 3.In addition to kfold, implement `single model` training and inference.
411 |
412 | 4.hill climbing to find `blending` weight.
413 |
414 | 5.Optimize `memory` and `time` to cope with larger datasets.`(pandas->polars)`
415 |
416 | 6.Make the code more beautiful, concise, and easy to understand.
417 |
418 | 7.Add checks that raise informative `error messages` for abnormal inputs.
419 |
420 | Waiting for updates.
421 |
422 | Kaggle:https://www.kaggle.com/yunsuxiaozi
423 |
424 |
425 |
426 | ## Some interesting dataset recommendations:
427 |
428 | #### 1.Competitions with significant differences in data distribution between training and testing sets: suitable for learning adversarial validation: Should I eat this mushroom? TFUG Delhi
429 |
430 | #### 2.Treat regression tasks as classification tasks:Regression with a Mohs Hardness Dataset
431 |
432 | #### 3.Treat classification tasks as regression tasks:Child Mind Institute — Problematic Internet Use
433 |
434 | #### 4.A dataset that is almost all noise with only weak signals, useful for learning TargetEncoder: Backpack Prediction Challenge
435 |
436 |
437 |
438 | Due to the large amount of content in this README, some errors may remain even after updates, and `baseline.py` and `README.md` may not always be updated in sync.
439 |
440 | update time:2025/03/27
441 |
442 |
--------------------------------------------------------------------------------
/baseline.py:
--------------------------------------------------------------------------------
1 | """
2 | @author:yunsuxiaozi
3 | @start_time:2024/09/27
4 | @update_time:2025/12/03
5 | """
6 | import polars as pl#similar to pandas, but with better performance when dealing with large datasets.
7 | import pandas as pd#read csv,parquet
8 | import numpy as np#for scientific computation of matrices
9 | from tqdm import tqdm#progress bar
10 | from scipy.stats import kurtosis#calculate kurt
11 | #powerful plot libraries
12 | import matplotlib.pyplot as plt
13 | import seaborn as sns
14 | import swifter# speed up pandas
15 |
16 | #current supported kfold
17 | from sklearn.model_selection import KFold,StratifiedKFold,StratifiedGroupKFold,GroupKFold
18 | #metrics
19 | from sklearn.metrics import roc_auc_score,f1_score,matthews_corrcoef,precision_recall_curve, auc
20 | #models(lgb,xgb,cat,ridge,lr,tabnet)
21 | from sklearn.linear_model import Ridge,LinearRegression,LogisticRegression,Lasso
22 | #fit(oof_preds,target)
23 | from cir_model import CenteredIsotonicRegression
24 | from lightgbm import LGBMRegressor,LGBMClassifier,log_evaluation,early_stopping
25 | from catboost import CatBoostRegressor,CatBoostClassifier
26 | from xgboost import XGBRegressor,XGBClassifier
27 | from pytorch_tabnet.tab_model import TabNetRegressor,TabNetClassifier
28 | import optuna#automatic hyperparameter optimization framework
29 |
30 | import ast#parse Python list strings transform '[a,b,c]' to [a,b,c]
31 | import copy#copy object
32 | import gc#garbage collection
33 | from typing import Literal#The parameters of a function can only have fixed values.
34 | import dill#serialize and deserialize objects (such as saving and loading tree models)
35 | from colorama import Fore, Style #print colorful text
36 | import os#interact with operation system
37 | #deal with tabm's print
38 | import sys
39 | from contextlib import contextmanager
40 |
41 | #deal with text
42 | import re#python's built-in regular expressions.
43 | from spellchecker import SpellChecker# spelling checker library
44 | from unidecode import unidecode#transform unicode to ASCII.
45 | #gene(topic) similarity
46 | from gensim.models import Word2Vec
47 | from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer#word2vec feature
48 | import ftfy#fixes text for you,correct unicode issues.
49 | import nltk #Natural Language toolkit
50 | from nltk.corpus import stopwords#import english stopwords
51 | import emoji#deal with emoji in natural language
52 | from sklearn.preprocessing import RobustScaler#(x-median)/IQR
53 | from sklearn.decomposition import PCA,TruncatedSVD#Truncated Singular Value Decomposition
54 |
55 | import warnings#avoid some negligible errors
56 | #The filterwarnings () method is used to set warning filters, which can control the output method and level of warning information.
57 | warnings.filterwarnings('ignore')
58 |
59 | import random#provide some function to generate random_seed.
60 | #set random seed to make sure results can be reproduced.
61 | def seed_everything(seed):
62 | np.random.seed(seed)#numpy's random seed
63 | random.seed(seed)#python built-in random seed
64 | seed_everything(seed=2025)
65 |
66 | class Yunbase():
67 | def __init__(self,num_folds:int=5,
68 | n_repeats:int=1,
69 | models:list[tuple]=[],
70 | FE=None,
71 | CV_sample=None,
72 | group_col=None,
73 | target_col:str='target',
74 | weight_col:str='weight',
75 | kfold_col:str='fold',
76 | drop_cols:list[str]=[],
77 | seed:int=2025,
78 | objective:Literal['binary','multi_class','regression']='regression',
79 | metric:str='mse',
80 | nan_margin:float=0.95,
81 | num_classes=None,
82 | infer_size:int=10000,
83 | save_oof_preds:bool=True,
84 | save_test_preds:bool=True,
85 | device:str='cpu',
86 | one_hot_max:int=50,
87 | one_hot_cols=None,
88 | custom_metric=None,
89 | use_optuna_find_params:int=0,
90 | optuna_direction=None,
91 | early_stop:int=100,
92 | use_pseudo_label:bool=False,
93 | use_high_corr_feat:bool=True,
94 | cross_cols:list[str]=[],
95 | labelencoder_cols:list[str]=[],
96 | list_stat:list[tuple]=[],
97 | word2vec_models:list[tuple]=[],
98 | text_cols:list[str]=[],
99 | plot_feature_importance:bool=False,
100 | log:int=100,
101 | exp_mode:bool=False,
102 | use_reduce_memory:bool=False,
103 | use_data_augmentation:bool=False,
104 | use_oof_as_feature:bool=False,
105 | use_CIR:bool=False,
106 | use_median_as_pred:bool=False,
107 | use_scaler:bool=False,
108 | use_TTA:bool=False,
109 | use_eval_metric:bool=True,
110 | feats_stat:list[tuple]=[],
111 | target_stat:list[tuple]=[],
112 | targetencoder_with_kfold:bool=False,
113 | use_spellchecker:bool=False,
114 | AGGREGATIONS:list=['nunique','count','min','max','first',
115 | 'last', 'mean','median','sum','std','skew',kurtosis],
116 | )->None:
117 | """
118 | num_folds :the number of folds for k-fold cross validation.
119 | n_repeats :Here,we will modify the random seed of kfold and models to repeat
120 | the cross validation several times.
121 | models :Built in 3 GBDTs as baseline, you can also use custom models,
122 | such as models=[(LGBMRegressor(**lgb_params),'lgb')]
123 |         FE :In addition to the built-in feature engineering, you can also customize your own feature engineering.
124 | CV_sample :This function is for X_train and y_train,sample_weight in cross validation.
125 | In order to make the evaluation metrics of oof as accurate as possible,
126 | this function is not executed for X_valid and y_valid.
127 | You can perform downsampling, upsampling, taking the first 10000 data
128 | points, and other operations you want here, and
129 | ultimately return any X_train or y_train,sample_weight.
130 | group_col :if you want to use groupkfold,then define this group_col.
131 | target_col :the column that you want to predict.
132 | weight_col :When training the model, give each sample a different weight.
133 | If you don't set it, the weight of each sample will default to 1.
134 | kfold_col :You can add the feature 'fold' to the training data, which allows you to
135 | customize your own kfold. The values in this column are [0,1,...,
136 | num_folds-1].
137 | drop_cols :The column to be deleted after all feature engineering is completed.
138 | seed :random seed.
139 | objective :what task do you want to do?regression,binary or multi_class?
140 | metric :metric to evaluate your model.
141 |         nan_margin :when the proportion of missing values in a column is greater than this value, we delete this column.
142 |         num_classes :if objective is multi_class,you should define this parameter.
143 | infer_size :the test data might be large,we can predict in batches.
144 | save_oof_preds :you can save OOF for offline study.
145 | save_test_preds :you can save test_preds.For multi classification tasks,
146 | the predicted result is the category.If you need to save the probability of the test_data,
147 | you can save test_preds.
148 |         device :GBDT can train on GPU; set this parameter to 'gpu' if you want GPU training.
149 | one_hot_max/one_hot_cols
150 | :Perform onehotencoder on features, one considering the numerical value
151 | of nunique and the other customizing features.
152 |
153 | custom_metric :your custom_metric,when objective is multi_class,y_pred in custom(y_true,y_pred) is probability.
154 | use_optuna_find_params:count of use optuna find best params,0 is not use optuna to find params.
155 | Currently only LGBM is supported.
156 | optuna_direction :'minimize' or 'maximize',when you use custom metric,you need to define
157 | the direction of optimization.
158 | early_stop :Common parameters of GBDT.
159 | use_pseudo_label :Whether to use pseudo labels.When it is true,adding the test data
160 | to the training data and training again after obtaining the predicted
161 | results of the test data.
162 | use_high_corr_feat :whether to use high correlation features or not.
163 | cross_cols :Construct features for adding, subtracting, multiplying, and dividing these columns.
164 | labelencoder_cols :Convert categorical string variables into [1,2,……,n].
165 | list_stat :example:[(list_col:str='step_list',list_gap:list[int]=[1,2,4])].
166 | list_col:If the data in a column is a list or str(list),
167 | such as [] or '[]', this can be used to extract diff and
168 | shift features for list_cols.
169 | word2vec_models :Use models such as tfidf to extract features of string columns.
170 | example:word2vec_models=[(TfidfVectorizer(max_features=250,sublinear_tf=True,
171 | ngram_range=(2,3)),col,model_name,use_svd)],
172 | use_svd:use Truncated Singular value decomposition to word2vec features.
173 | text_cols :extract features of words, sentences, and paragraphs from text here.
174 | plot_feature_importance:after model training,whether print feature importance or not
175 |         log :output a validation set score once every 'log' trees when training the GBDT model.
176 | exp_mode :In regression tasks, the distribution of target_col is a long tail distribution,
177 | and this parameter can be used to perform log transform on the target_col.
178 | use_reduce_memory :if use function reduce_mem_usage(),then set this parameter True.
179 | use_data_augmentation :if use data augmentation,During cross validation, the training data
180 | will undergo PCA transformation followed by inverse transformation.
181 | use_oof_as_feature :Train the next model using the oof_preds obtained from the previous
182 | model as features, and the same applies to inference.
183 | use_CIR :use CenteredIsotonicRegression to fit oof_preds and target.
184 |         use_median_as_pred :use median(axis=0) instead of mean(axis=0) when averaging predictions.
185 | use_scaler :use robust scaler to deal with outlier.
186 | use_eval_metric : use 'eval_metric' when training lightgbm or xgboost.
187 | use_TTA :use 'test time augmentation'.It is to use
188 | data augmentation operations in the inference process
189 | feats_stat : (group_col,feature_col,aggregation_list)
190 | example:feats_stat = [ ('id','up_time', ['min', 'max']) ]
191 | target_stat :We can use target's AGGREGATIONS to encode categorical variables.
192 | In order to obtain a reliable CV, this operation is performed separately
193 | for the training set and validation set in cross validation.
194 | example:target_stat = [ (group_col,target_col, aggregation_list) ]
195 | To make it more versatile, you can also use other variables
196 | besides target to encode categorical variables.
197 | targetencoder_with_kfold:The difference between False and True is whether the
198 | training data (train) in cross validation (full=train+valid) uses
199 | the entire training data (train)'s Target Encoder directly,or is
200 | assigned through cross validation in the training data (train=tr+va).
201 | use_spellchecker :use SpellChecker to correct word in text.
202 | AGGREGATIONS :['nunique','count','min','max','first','last',
203 | 'mean','median','sum','std','skew',kurtosis,q1,q3],
204 | """
205 |
206 |         #currently supported metrics
207 | self.reg_metric=['mae','rmse','mse','medae','rmsle','msle','mape','r2','smape',#regression
208 | ]
209 | self.cla_metric=['auc','pr_auc','logloss','f1_score','mcc',#binary metric
210 | 'accuracy','multi_logloss',#multi_class or classification
211 | ]
212 | self.supported_metrics=['custom_metric']+self.reg_metric+self.cla_metric
213 |
214 | #current supported models
215 | #pytabkit refer to :https://www.kaggle.com/competitions/playground-series-s5e8/writeups/2nd-place-yet-another-ensemble
216 | self.supported_models=['lgb','cat','xgb','ridge','Lasso','LinearRegression','LogisticRegression','tabnet',
217 | 'realmlp(pytabkit need install yourself)','tabm(pytabkit need install yourself)',
218 | 'Word2Vec','tfidfvec','countvec',
219 | ]
220 | #current supported kfold.
221 | self.supported_kfolds=['KFold','GroupKFold','StratifiedKFold','StratifiedGroupKFold','purged_CV','custom_kfold']
222 | #current supported objective.
223 | self.supported_objectives=['binary','multi_class','regression']
224 |
225 | print(f"Currently supported metrics:{self.supported_metrics}")
226 | print(f"Currently supported models:{self.supported_models}")
227 | print(f"Currently supported kfolds:{self.supported_kfolds}")
228 | print(f"Currently supported objectives:{self.supported_objectives}")
229 |
230 | self.num_folds=num_folds
231 | self.n_repeats=n_repeats
232 | self.seed=seed
233 | self.models=models
234 | self.target_col=target_col
235 | self.group_col=group_col
236 |
237 | self.FE=FE
238 | self.CV_sample=CV_sample
239 | self.drop_cols=drop_cols
240 |
241 | self.objective=objective.lower()
242 | #binary multi_class,regression
243 | if self.objective not in self.supported_objectives:
244 | raise ValueError("Wrong or currently unsupported objective.")
245 |
246 | self.custom_metric=custom_metric#function
247 | if self.custom_metric!=None:
248 | self.metric=self.custom_metric.__name__.lower()
249 | else:
250 | self.metric=metric.lower()
251 | if self.metric not in self.supported_metrics and self.custom_metric==None:
252 | raise ValueError("Wrong or currently unsupported metric,You can customize the evaluation metrics using 'custom_metric'.")
253 |
254 | self.nan_margin=nan_margin
255 | if self.nan_margin<0 or self.nan_margin>1:
256 | raise ValueError("nan_margin must be within the range of 0 to 1.")
257 | self.infer_size=infer_size
258 | if self.infer_size<=0 or type(self.infer_size) is not int:
259 | raise ValueError("infer size must be greater than 0 and must be int.")
260 |
261 | self.save_oof_preds=save_oof_preds
262 | self.save_test_preds=save_test_preds
263 |
264 | self.num_classes=num_classes
265 | self.device=device.lower()
266 | if (self.objective=='binary') and self.num_classes!=2:
267 | raise ValueError("num_classes must be 2.")
268 | elif (self.objective=='multi_class') and (self.num_classes==None):
269 | raise ValueError("num_classes must be a number(int).")
270 | self.one_hot_max=one_hot_max
271 | self.one_hot_cols=one_hot_cols
272 |
273 | self.use_optuna_find_params=use_optuna_find_params
274 | self.optuna_direction=optuna_direction
275 | self.direction2metric={
276 | 'maximize':['accuracy','auc','pr_auc','f1_score','mcc',#classification
277 | 'r2'#regression
278 | ],
279 | 'minimize':['medae','mape','mae','rmse','mse','rmsle','msle','smape',#regression
280 | 'logloss','multi_logloss'#classification
281 | ]
282 | }
283 |
284 | if (self.custom_metric!=None) and (self.optuna_direction not in ['minimize','maximize']):
285 | raise ValueError("optuna_direction must be 'minimize' or 'maximize'.")
286 | self.early_stop=early_stop
287 | self.test=None#test data will be replaced when call predict function.
288 | self.use_pseudo_label=use_pseudo_label
289 | self.use_high_corr_feat=use_high_corr_feat
290 | self.cross_cols=cross_cols
291 | self.labelencoder_cols=labelencoder_cols
292 | self.list_stat=list_stat
293 | self.list_cols=[l[0] for l in self.list_stat]
294 |
295 | self.word2vec_models=word2vec_models
296 | for i in range(len(self.word2vec_models)):
297 | #default use_svd=False
298 | if len(self.word2vec_models[i])==3:#(model,col,model_name)
299 | tup=self.word2vec_models[i]
300 | self.word2vec_models[i]=(tup[0],tup[1],tup[2],False)
301 |
302 | self.word2vec_cols=[col for (model,col,model_name,use_svd) in self.word2vec_models]#origin cols that need to use in tfidf model.
303 | self.text_cols=text_cols#extract features of words, sentences, and paragraphs from text here.
304 | #to perform only one clean_text operation
305 | self.param_text=list(set(self.word2vec_cols+self.text_cols))
306 |
307 | self.plot_feature_importance=plot_feature_importance
308 | #Due to the presence of special characters in some column names,
309 | #they cannot be directly passed into the LGB model training, so conversion is required
310 | self.log=log
311 | self.exp_mode=exp_mode
312 | #when log transform, it is necessary to ensure that the minimum value of the target is greater than 0.
313 | #so target=target-min_target. b is -min_target.
314 | self.exp_mode_b=0
315 | if (self.objective!='regression') and (self.exp_mode==True):
316 | raise ValueError("exp_mode must be False in classification task.")
317 | self.use_reduce_memory=use_reduce_memory
318 | self.use_data_augmentation=use_data_augmentation
319 | self.use_oof_as_feature=use_oof_as_feature
320 | self.use_CIR=use_CIR
321 | self.use_median_as_pred=use_median_as_pred
322 | self.use_scaler=use_scaler
323 | self.use_eval_metric=use_eval_metric
324 | self.use_TTA=use_TTA
325 | self.use_spellchecker=use_spellchecker
326 | self.targetencoder_with_kfold=targetencoder_with_kfold
327 |
328 | attritubes=['save_oof_preds','save_test_preds','exp_mode',
329 | 'use_reduce_memory','use_data_augmentation','use_scaler',
330 | 'use_oof_as_feature','use_CIR','use_median_as_pred','use_eval_metric',
331 | 'use_spellchecker','use_TTA','targetencoder_with_kfold'
332 | ]
333 | for attr in attritubes:
334 | if getattr(self,attr) not in [True,False]:
335 | raise ValueError(f"{attr} must be True or False.")
336 |
337 | if self.use_oof_as_feature and self.use_pseudo_label:
338 | raise ValueError(f"use_oof_as_feature and use_pseudo_label cannot be both True at the same time.")
339 |
340 | self.feats_stat=feats_stat
341 | self.target_stat=target_stat
342 | #common AGGREGATIONS
343 | self.AGGREGATIONS = AGGREGATIONS
344 |
345 |         #If every inference batch had to reload the trained models from disk, it would
346 |         #increase the running time, so the loaded objects are cached in these dictionaries.
347 | self.trained_models=[]#trained model
348 | self.trained_CIR=[]#trained CIR model
349 | self.trained_le={}
350 | self.trained_wordvec={}
351 | self.trained_svd={}
352 | self.trained_scaler={}
353 | self.trained_TE={}#TargetEncoder
354 | self.onehot_valuecounts={}
355 | #make folder to save model trained.such as GBDT,word2vec.
356 | self.model_save_path="Yunbase_info/"
357 | if not os.path.exists(self.model_save_path):
358 | os.mkdir(self.model_save_path)
359 |
360 | self.eps=1e-15#clip (eps,1-eps) | divide by zero.
361 | self.category_cols=[]
362 | self.high_corr_cols=[]
363 | #to make sure column's dtype in train.csv is same as the column's dtype in test.csv.
364 | #example:https://www.kaggle.com/competitions/home-credit-credit-risk-model-stability
365 | self.col2dtype={}
366 | self.weight_col=weight_col
367 | self.kfold_col=kfold_col
368 |
369 | def get_params(self,):
370 | params_dict={'num_folds':self.num_folds,'n_repeats':self.n_repeats,'models':self.models,
371 | 'group_col':self.group_col,'target_col':self.target_col,'weight_col':self.weight_col,
372 | 'kfold_col':self.kfold_col,'drop_cols':self.drop_cols,'seed':self.seed,'objective':self.objective,
373 | 'metric':self.metric,'nan_margin':self.nan_margin,'num_classes':self.num_classes,
374 | 'infer_size':self.infer_size,'save_oof_preds':self.save_oof_preds,'save_test_preds':self.save_test_preds,
375 | 'device':self.device,'one_hot_max':self.one_hot_max,'one_hot_cols':self.one_hot_cols,
376 | 'custom_metric':self.custom_metric,
377 | 'use_optuna_find_params':self.use_optuna_find_params,'optuna_direction':self.optuna_direction,
378 | 'early_stop':self.early_stop,'use_pseudo_label':self.use_pseudo_label,
379 | 'use_high_corr_feat':self.use_high_corr_feat,'cross_cols':self.cross_cols,
380 | 'labelencoder_cols':self.labelencoder_cols,'list_stat':self.list_stat,
381 | 'word2vec_models':self.word2vec_models, 'text_cols':self.text_cols,
382 | 'plot_feature_importance':self.plot_feature_importance,'log':self.log,
383 | 'exp_mode':self.exp_mode,'use_reduce_memory':self.use_reduce_memory,
384 | 'use_data_augmentation':self.use_data_augmentation,
385 | 'use_oof_as_feature':self.use_oof_as_feature,'use_CIR':self.use_CIR,
386 | 'use_median_as_pred':self.use_median_as_pred,'use_scaler':self.use_scaler,
387 | 'use_TTA':self.use_TTA,'use_eval_metric':self.use_eval_metric,
388 | 'feats_stat':self.feats_stat,'target_stat':self.target_stat,
389 | 'targetencoder_with_kfold':self.targetencoder_with_kfold,
390 | 'use_spellchecker':self.use_spellchecker,'AGGREGATIONS':self.AGGREGATIONS,
391 | 'category_cols':self.category_cols,
392 | }
393 | return params_dict
394 |
395 | #print colorful text
396 | def PrintColor(self,text:str='',color = Fore.BLUE)->None:
397 | print(color + text + Style.RESET_ALL)
398 |
399 | #save models after training
400 | def pickle_dump(self,obj, path:str)->None:
401 | #open path,binary write
402 | with open(path, mode="wb") as f:
403 | dill.dump(obj, f, protocol=4)
404 | #load models when inference
405 | def pickle_load(self,path:str):
406 |         #open path,binary read
407 | with open(path, mode="rb") as f:
408 | data = dill.load(f)
409 | return data
410 |
411 | #reference:https://www.kaggle.com/code/masayakawamata/mic-tabm-baseline
412 | @contextmanager
413 | def suppress_stdout(self,):
414 | with open(os.devnull, "w") as devnull:
415 | old_stdout = sys.stdout
416 | sys.stdout = devnull
417 | try:
418 | yield
419 | finally:
420 | sys.stdout = old_stdout
421 |
422 | #sample AGGREGATIONS
423 | def q1(self,x):
424 | return x.quantile(0.25)
425 | def q3(self,x):
426 | return x.quantile(0.75)
427 |
428 | #Time data cannot use this augmentation,as features such as year, month, and day are discrete variables.
429 | def pca_augmentation(self,X:pd.DataFrame,y=None,target_col:str=''):
430 | if type(y)!=pd.DataFrame:#y=None
431 | origin_data=X.copy()
432 | else:#
433 | origin_data=pd.concat((X,y),axis=1)
434 | n_components=np.clip( int(origin_data.shape[1]*0.8),1,X.shape[1])
435 | pca=PCA(n_components=n_components)
436 | pca_data=pca.fit_transform(origin_data)
437 | aug_data=pca.inverse_transform(pca_data)
438 | aug_data=pd.DataFrame(aug_data)
439 | if type(y)!=pd.DataFrame:#y=None
440 | aug_data.columns=list(X.columns)
441 | else:
442 | aug_data.columns=list(X.columns)+[target_col]
443 | del origin_data,pca,pca_data
444 | gc.collect()
445 |
446 | return aug_data
447 |
448 | #Traverse all columns of df, modify data types to reduce memory usage
449 | def reduce_mem_usage(self,df:pd.DataFrame, float16_as32:bool=True)->pd.DataFrame:
450 |         #memory_usage() gives each column's memory usage in bytes; sum() totals them, then convert B->KB->MB
451 | start_mem = df.memory_usage().sum() / 1024**2
452 | print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
453 | for col in df.columns:
454 | col_type = df[col].dtype
455 | if col_type != object and str(col_type)!='category':#num_col
456 | c_min,c_max = df[col].min(),df[col].max()
457 |                 if str(col_type)[:3] == 'int':#int column, whether int8, int16, int32 or int64
458 |                     #if the value range fits in int8 (-128 to 127), convert the dtype
459 |                     if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
460 |                         df[col] = df[col].astype(np.int8)
461 |                     #if the value range fits in int16 (-32,768 to 32,767), convert the dtype
462 |                     elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
463 |                         df[col] = df[col].astype(np.int16)
464 |                     #if the value range fits in int32 (-2,147,483,648 to 2,147,483,647), convert the dtype
465 |                     elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
466 |                         df[col] = df[col].astype(np.int32)
467 |                     #if the value range fits in int64 (-9,223,372,036,854,775,808 to 9,223,372,036,854,775,807), convert the dtype
468 |                     elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
469 |                         df[col] = df[col].astype(np.int64)
470 |                 else:#float column
471 |                     #if the values fit in float16; use float32 instead if higher precision is needed
472 |                     if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
473 |                         if float16_as32:#choose float32 when higher precision is needed
474 |                             df[col] = df[col].astype(np.float32)
475 |                         else:
476 |                             df[col] = df[col].astype(np.float16)
477 |                     #if the values fit in float32, convert the dtype
478 |                     elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
479 |                         df[col] = df[col].astype(np.float32)
480 |                     #if the values fit in float64, convert the dtype
481 | else:
482 | df[col] = df[col].astype(np.float64)
483 | #calculate memory after optimization
484 | end_mem = df.memory_usage().sum() / 1024**2
485 | print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
486 | print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
487 | return df
488 |
489 | ############text preprocessor
490 | def clean_text(self,text:str='')->str:
491 | ############################## fix text #######################################################
492 | #transform “你好。” to 'NI HAO.'
493 | text = unidecode(text)
494 | #transform emoji to " "+text+" ".
495 | text=emoji.demojize(text,delimiters=(" ", " "))
496 | #correct unicode issues.
497 | text=ftfy.fix_text(text)
498 | #lower example:'Big' and 'big'
499 | text=text.lower()
500 | ############################## remove meaningless text ########################################
501 |         #remove meaningless html tags
502 | html=re.compile(r'<.*?>')
503 | text=html.sub(r'',text)
504 | #remove urls '\w+':(word character,[a-zA-Z0-9_])
505 | #thanks to https://github.com/yunsuxiaozi/Yunbase/issues/1
506 | text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
507 | #remove @mentions such as @yunsuxiaozi
508 | text=re.sub(r"@\w+",'',text)
509 | #drop single characters surrounded by spaces (e.g. ' a '), they are meaningless
510 | text=re.sub(r"\s[a-z]\s",'',text)
511 | #remove number
512 | #text=re.sub("\d+",'',text)
513 | #drop english stopwords,they are meaningless.
514 | english_stopwords = stopwords.words('english')
515 | text_list=text.split(" ")
516 | text_list=[t for t in text_list if t not in english_stopwords]
517 | text=" ".join(text_list)
518 | #drop space front and end.
519 | text=text.strip()
520 | return text
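# Illustrative example (approximate; the exact output depends on the NLTK
# stopword list and library versions):
#   self.clean_text('<b>Hello</b> WORLD @yunsuxiaozi')  ->  roughly 'hello world'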
521 |
522 | def text2word(self,text:str='hello world!'):
523 | return re.split(r'\.|\?|!|\s|\n|,',text)
524 | def text2sentence(self,text:str='hello world!'):
525 | return re.split(r'\.|\?|\!|\n',text)
526 | def text2paragraph(self,text:str='hello world!'):
527 | return text.split("\n")
528 | #3 text readability indices
529 | def ARI(self,text):
530 | characters=len(text)
531 | words=len(self.text2word(text))
532 | sentence=len(self.text2sentence(text))
533 | ari_score=4.71*(characters/words)+0.5*(words/sentence)-21.43
534 | return ari_score
535 | def McAlpine_EFLAW(self,text):
536 | W=len(self.text2word(text))
537 | S=len(self.text2sentence(text))
538 | mcalpine_eflaw_score=(W+S*W)/S
539 | return mcalpine_eflaw_score
540 | def CLRI(self,text):
541 | characters=len(text)
542 | words=len(self.text2word(text))
543 | sentence=len(self.text2sentence(text))
544 | L=100*characters/words
545 | S=100*sentence/words
546 | clri_score=0.0588*L-0.296*S-15.8
547 | return clri_score
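# Notes on the three indices (standard interpretations, for reference):
#   ARI  (Automated Readability Index)  - approximates the US grade level needed to read the text.
#   McAlpine EFLAW                      - readability for non-native English readers; lower is easier.
#   CLRI (Coleman-Liau style index)     - maps characters-per-word and sentences-per-word to a grade level.
# Word and sentence counts come from the simple regex splitters above, so these
# scores are approximations rather than exact textbook values.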
548 |
549 | def text_correct(self,text:str='hello world!'):
550 | spell = SpellChecker()
551 | words = self.text2word(text)
552 | punctuation=['.','?','!',' ','\n',',']
553 | wordssplit=[text[i] for i in range(len(text)) if text[i] in punctuation]
554 | fixed_words=[spell.correction(word) or word for word in words]#correction() can return None; fall back to the original word
555 | error_cnt=sum([1 for i in range(len(words)) if words[i]!=fixed_words[i]])
556 | fixed_text=[]
557 | for i in range(len(wordssplit)):
558 | fixed_text.append(fixed_words[i])
559 | fixed_text.append(wordssplit[i])
560 | fixed_text="".join(fixed_text)
561 | return error_cnt,fixed_text
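# Note: spell.correction() comes from the pyspellchecker package and is called
# once per word, so this can be slow on long documents; text_FE only runs it
# when self.use_spellchecker is enabled.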
562 |
563 | ############text Feature Engineering
564 | def text_FE(self,df:pd.DataFrame,text_col:str='text'):
565 | df['index']=np.arange(len(df))
566 | #correct text
567 | if self.use_spellchecker:
568 | self.PrintColor(f"-> for column {text_col} text correct",color=Fore.YELLOW)
569 | texts=df[text_col].values
570 | error_cnts=np.zeros(len(texts))
571 | for i in tqdm(range(len(texts))):
572 | error_cnts[i],texts[i]=self.text_correct(texts[i])
573 | df[f'{text_col}_error_cnts']=error_cnts
574 |
575 | df[text_col+"_ARI"]=df[text_col].swifter.allow_dask_on_strings(False).apply(lambda x:self.ARI(x))
576 | df[text_col+"_CLRI"]=df[text_col].swifter.allow_dask_on_strings(False).apply(lambda x:self.CLRI(x))
577 | df[text_col+"_McAlpine_EFLAW"]=df[text_col].swifter.allow_dask_on_strings(False).apply(lambda x:self.McAlpine_EFLAW(x))
578 | #count how many pieces the text splits into for each punctuation symbol in ps
579 | ps='!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
580 | for i in range(len(ps)):
581 | df[text_col+f"split_ps{i}_count"]=df[text_col].swifter.allow_dask_on_strings(False).apply(lambda x:len(x.split(ps[i])))
582 |
583 | self.PrintColor(f"-> for column {text_col} word feature",color=Fore.RED)
584 | text_col_word_df=df[['index',text_col]].copy()
585 | #get word_list [index,tcol,word_list]
586 | text_col_word_df[f'{text_col}_word']=text_col_word_df[text_col].swifter.allow_dask_on_strings(False).apply(lambda x:self.text2word(x))
587 | #[index,single_word]
588 | text_col_word_df=text_col_word_df.explode(f'{text_col}_word')[['index',f'{text_col}_word']]
589 | #[index,single_word,single_word_len]
590 | text_col_word_df[f'{text_col}_word_len'] = text_col_word_df[f'{text_col}_word'].swifter.allow_dask_on_strings(False).apply(len)
591 | #data clean [index,single_word,single_word_len]
592 | text_col_word_df=text_col_word_df[text_col_word_df[f'{text_col}_word_len']!=0]
593 | #for word features, take the length difference between consecutive words within each text.
594 | group_cols=[f'{text_col}_word_len']
595 | for gap in [1]:
596 | for col in [f'{text_col}_word_len']:
597 | text_col_word_df[f'{col}_diff{gap}']=text_col_word_df.groupby(['index'])[col].diff(gap)
598 | group_cols.append(f'{col}_diff{gap}')
599 | text_col_word_agg_df = text_col_word_df[['index']+group_cols].groupby(['index']).agg(self.AGGREGATIONS)
600 | text_col_word_agg_df.columns = ['_'.join(x) for x in text_col_word_agg_df.columns]
601 | df=df.merge(text_col_word_agg_df,on='index',how='left')
602 |
603 | self.PrintColor(f"-> for column {text_col} sentence feature",color=Fore.RED)
604 | text_col_sent_df=df[['index',text_col]].copy()
605 | #get sent_list [index,tcol,sent_list]
606 | text_col_sent_df[f'{text_col}_sent']=text_col_sent_df[text_col].swifter.allow_dask_on_strings(False).apply(lambda x: self.text2sentence(x))
607 | #[index,single_sent]
608 | text_col_sent_df=text_col_sent_df.explode(f'{text_col}_sent')[['index',f'{text_col}_sent']]
609 | #[index,single_sent,single_sent_len]
610 | text_col_sent_df[f'{text_col}_sent_len'] = text_col_sent_df[f'{text_col}_sent'].swifter.allow_dask_on_strings(False).apply(len)
611 | text_col_sent_df[f'{text_col}_sent_word_count'] = text_col_sent_df[f'{text_col}_sent'].swifter.allow_dask_on_strings(False).apply(lambda x:len(re.split('\\ |\\,',x)))
612 | #data clean [index,single_sent,single_sent_len]
613 | group_cols=[f'{text_col}_sent_len',f'{text_col}_sent_word_count']
614 | for gcol in group_cols:
615 | text_col_sent_df=text_col_sent_df[text_col_sent_df[gcol]!=0]
616 | #for sentence features, take the length difference between consecutive sentences within each text.
617 | for gap in [1]:
618 | for col in [f'{text_col}_sent_len',f'{text_col}_sent_word_count']:
619 | text_col_sent_df[f'{col}_diff{gap}']=text_col_sent_df.groupby(['index'])[col].diff(gap)
620 | group_cols.append(f'{col}_diff{gap}')
621 | text_col_sent_agg_df = text_col_sent_df[['index']+group_cols].groupby(['index']).agg(self.AGGREGATIONS)
622 | text_col_sent_agg_df.columns = ['_'.join(x) for x in text_col_sent_agg_df.columns]
623 | df=df.merge(text_col_sent_agg_df,on='index',how='left')
624 |
625 | self.PrintColor(f"-> for column {text_col} paragraph feature",color=Fore.RED)
626 | text_col_para_df=df[['index',text_col]].copy()
627 | #get para_list [index,tcol,para_list]
628 | text_col_para_df[f'{text_col}_para']=text_col_para_df[text_col].swifter.allow_dask_on_strings(False).apply(lambda x: self.text2paragraph(x))
629 | #[index,single_para]
630 | text_col_para_df=text_col_para_df.explode(f'{text_col}_para')[['index',f'{text_col}_para']]
631 | text_col_para_df[f'{text_col}_para_len'] = text_col_para_df[f'{text_col}_para'].swifter.allow_dask_on_strings(False).apply(len)
632 | text_col_para_df[f'{text_col}_para_sent_count'] = text_col_para_df[f'{text_col}_para'].swifter.allow_dask_on_strings(False).apply(lambda x: len(re.split('\\.|\\?|\\!',x)))
633 | text_col_para_df[f'{text_col}_para_word_count'] = text_col_para_df[f'{text_col}_para'].swifter.allow_dask_on_strings(False).apply(lambda x: len(re.split('\\.|\\?|\\!|\\ |\\,',x)))#split on '.', '?', '!', space or comma to count words
634 | #data clean [index,single_para,single_para_len]
635 | group_cols=[f'{text_col}_para_len',f'{text_col}_para_sent_count',f'{text_col}_para_word_count']
636 | for gcol in group_cols:
637 | text_col_para_df=text_col_para_df[text_col_para_df[gcol]!=0]
638 | #for paragraph features, take the length difference between consecutive paragraphs within each text.
639 | for gap in [1]:
640 | for col in [f'{text_col}_para_len',f'{text_col}_para_sent_count',f'{text_col}_para_word_count']:
641 | text_col_para_df[f'{col}_diff{gap}']=text_col_para_df.groupby(['index'])[col].diff(gap)
642 | group_cols.append(f'{col}_diff{gap}')
643 | text_col_para_agg_df = text_col_para_df[['index']+group_cols].groupby(['index']).agg(self.AGGREGATIONS)
644 | text_col_para_agg_df.columns = ['_'.join(x) for x in text_col_para_agg_df.columns]
645 | df=df.merge(text_col_para_agg_df,on='index',how='left')
646 | df.drop(['index'],axis=1,inplace=True)
647 | return df
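# text_FE therefore produces, per text column: an optional spelling-error count,
# three readability scores, per-punctuation split counts, and statistics
# (via self.AGGREGATIONS) over word/sentence/paragraph lengths and their
# consecutive differences, all merged back onto the original rows by 'index'.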
648 |
649 | #basic Feature Engineering. mode is 'train' or 'test'; drop_cols lists any extra columns you want to drop.
650 | def base_FE(self,df:pd.DataFrame,mode:str='train',drop_cols:list[str]=[])->pd.DataFrame:
651 | if self.FE!=None:
652 | #apply your custom feature engineering function first
653 | try:#pandas FE
654 | df=self.FE(df)
655 | except:#polars FE
656 | df=pl.from_pandas(df)
657 | df=self.FE(df)
658 | df=df.to_pandas()
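# A minimal sketch of a custom FE callable (hypothetical column names, for
# illustration only); the try/except above calls it with a pandas DataFrame
# first and retries with a polars DataFrame if that raises:
#   def my_FE(df):
#       df['price_per_area'] = df['price'] / df['area']   # assumed columns
#       return df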
659 |
660 | #clean text
661 | for pt_col in tqdm(self.param_text):
662 | self.PrintColor(f"-> for column {pt_col} text clean",color=Fore.YELLOW)
663 | df[pt_col]=(df[pt_col].fillna('nan'))
664 | if df[pt_col].nunique()>0.5*len(df):
665 | df[pt_col]=df[pt_col].swifter.allow_dask_on_strings(False).apply(lambda x:self.clean_text(x))
666 | else:#use a dict so each unique value is cleaned only once, saving time.
667 | text2clean={}
668 | for text in df[pt_col].unique():
669 | text2clean[text]=self.clean_text(text)
670 | df[pt_col]=df[pt_col].swifter.allow_dask_on_strings(False).apply(lambda x:text2clean.get(x,'nan'))
671 | del text2clean
672 | gc.collect()
673 |
674 | #text feature extraction at word, sentence and paragraph level.
675 | #This has to run early because it can create columns (e.g. with nunique==1, or of object dtype)
676 | #that must be dropped, so it is placed before the code that detects such columns.
677 | if len(self.text_cols):
678 | print("< text column's feature >")
679 | for tcol in self.text_cols:
680 | #category text
681 | if df[tcol].nunique()<0.5*len(df):
682 | text_map_df=pd.DataFrame({tcol:df[tcol].unique()})
683 | text_agg_df=self.text_FE(text_map_df,tcol)
684 | df=df.merge(text_agg_df,on=tcol,how='left')
685 | else:
686 | df=self.text_FE(df,tcol)
687 |
688 | if mode=='train':
689 | #missing value
690 | self.nan_cols=[col for col in df.columns if df[col].isna().mean()>self.nan_margin]
691 |
692 | #nunique=1
693 | self.unique_cols=[]
694 | for col in df.drop(self.drop_cols+self.list_cols+\
695 | [self.weight_col,self.group_col,self.target_col,self.kfold_col],axis=1,errors='ignore').columns:
696 | if(df[col].nunique()<2):#the column holds a single value (or only np.nan)
697 | self.unique_cols.append(col)
698 | #the most frequent value covers almost the entire column
699 | elif len(list(df[col].value_counts().to_dict().items()))>0:
700 | if list(df[col].value_counts().to_dict().items())[0][1]>=len(df)*0.99:
701 | self.unique_cols.append(col)
702 | #numeric columns with a very low coefficient of variation (std/mean < 0.01)
703 | elif (df[col].dtype!=object) and (df[col].std()/df[col].mean()<0.01):
704 | self.unique_cols.append(col)
705 |
706 | #object dtype
707 | self.object_cols=[col for col in df.drop(self.drop_cols+self.category_cols,axis=1,errors='ignore').columns if (df[col].dtype==object) and (col not in [self.group_col,self.target_col])]
708 | ##### one_hot_encoder
709 | if self.one_hot_cols==None:
710 | self.nunique3_cols=[]
711 | self.nunique2_cols=[]
712 | for col in df.drop(
713 | [self.target_col,self.group_col,self.weight_col,self.kfold_col]+\
714 | self.list_cols+self.drop_cols
715 | ,axis=1,errors='ignore'
716 | ).columns:
717 | if (df[col].dtype==object) and not (
718 | #columns such as sin_month/cos_month have already been cyclically encoded, skip them.
719 | col.startswith('sin') or col.startswith('cos') or
720 | #aggregation features don't need one-hot encoding.
721 | col.endswith('_nunique') or col.endswith('_count') or
722 | col.endswith('_min') or col.endswith('_max') or
723 | col.endswith('_first') or col.endswith('_last') or
724 | col.endswith('_mean') or col.endswith('_median') or
725 | col.endswith('_sum') or col.endswith('_std') or col.endswith('_skew') or
726 | #q0:0.05,q1:0.25,q2:0.5,q3:0.75,q4:0.95
727 | col.endswith('_kurtosis') or col.endswith('_q0') or col.endswith('_q1') or
728 | col.endswith('_q2') or col.endswith('_q3') or col.endswith('_q4')
729 | ):
730 | if (df[col].nunique()