├── .DS_Store
├── LICENSE
├── README.md
├── img
    ├── 1559738185650.png
    └── README.md
└── solutions
    ├── .DS_Store
    ├── TinyMind人民币面值&冠字号编码识别挑战赛
        ├── .DS_Store
        ├── README.md
        ├── task1
        │   ├── 1_train.ipynb
        │   ├── README.md
        │   └── predict_rmb.py
        └── task2
        │   ├── .DS_Store
        │   ├── 1_train_faster_rcnn.py
        │   ├── 2_predict_faster_rcnn.py
        │   ├── 3_savejson.py
        │   ├── VOC2007.zip
        │   ├── crnn-pytorch
        │       ├── README.md
        │       ├── __init__.py
        │       ├── dataset
        │       │   ├── __init__.py
        │       │   ├── collate_fn.py
        │       │   ├── data_transform.py
        │       │   ├── test_data.py
        │       │   └── text_data.py
        │       ├── fold_tta.pkl
        │       ├── lr_policy.py
        │       ├── models
        │       │   ├── __init__.py
        │       │   ├── crnn.py
        │       │   └── model_loader.py
        │       ├── pb_rcnn_label.csv
        │       ├── submit.py
        │       ├── test.py
        │       ├── test2.py
        │       ├── test2_tta.py
        │       └── train.py
        │   ├── data
        │       └── data.json
        │   └── multi-digit-pytorch
        │       ├── .ipynb_checkpoints
        │           └── 未命名-checkpoint.ipynb
        │       ├── 1_train.py
        │       ├── 2_predict.py
        │       ├── example.log
        │       └── 未命名.ipynb
    ├── kaggle-allstate-claims-severity
        ├── README.md
        ├── XGB_encoding(LB1106.33084).py
        └── nn_bagging_1111.84364.py
    ├── kaggle-quickdraw-doodle-recognition
        ├── 1_save2df.py
        ├── 2_train.py
        ├── EDA.ipynb
        ├── EDA_predict.ipynb
        ├── PlotLoss.ipynb
        ├── README.md
        └── Transform_Example.ipynb
    ├── kaggle-titanic
        ├── EDA.ipynb
        └── README.md
    ├── kaggle-two-sigma-connect-rental-listing-inquiries
        ├── README.md
        └── lgb.py
    ├── tianchi-第三届阿里云安全算法挑战赛
        ├── EDA.ipynb
        ├── GBM_old.ipynb
        ├── LGB_LinuX_0819.py
        ├── README.md
        ├── api.csv
        ├── finetune.ipynb
        └── gbm.py
    └── 点石-Retention Rate of Baidu Hao Kan APP Users
        ├── 1_splitdf.py
        ├── 2_baseline_1128.py
        ├── 2_baseline_1202.py
        ├── 2_baseline_1203_Train0.75989_Test0.75627.py
        ├── 2_baseline_1203_Train0.76103_Test0.75740.py
        ├── 2_baseline_1203_Train0.77218_Test0.76203.py
        ├── README.md
        └── featselect.py


/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DueapeCommon/kaggle/78009876853d2536be895097c289ac35c748beba/.DS_Store


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # DueApe数据科学
 2 | https://www.dueape.com/#/background/match/
 3 | 
 4 | DueApe数据科学，Kaggle代码资源分享
 5 | 
 6 | > 一起来学习数据科学吧！
 7 | 
 8 | ![](img/1559738185650.png)
 9 | 
10 | 想要更加系统的学习Kaggle竞赛，请添加我们的微信或者访问官网。
11 | 
12 | ![](https://www.dueape.com/static/img/ewm.e93e5ce.png)
13 | 
14 | # 版权声明
15 | 
16 | 本开源库仅供个人学习使用，DueApe保留原有文件的版权和使用权。
17 | 开源库中的任何文件不得用于任何商业活动。
18 | 


--------------------------------------------------------------------------------
/img/1559738185650.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DueapeCommon/kaggle/78009876853d2536be895097c289ac35c748beba/img/1559738185650.png


--------------------------------------------------------------------------------
/img/README.md:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/solutions/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DueapeCommon/kaggle/78009876853d2536be895097c289ac35c748beba/solutions/.DS_Store


--------------------------------------------------------------------------------
/solutions/TinyMind人民币面值&冠字号编码识别挑战赛/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DueapeCommon/kaggle/78009876853d2536be895097c289ac35c748beba/solutions/TinyMind人民币面值&冠字号编码识别挑战赛/.DS_Store


--------------------------------------------------------------------------------
/solutions/TinyMind人民币面值&冠字号编码识别挑战赛/README.md:
--------------------------------------------------------------------------------
 1 | # TinyMind人民币面值&冠字号编码识别挑战赛
 2 | 
 3 | https://www.tinymind.cn/competitions/47
 4 | 
 5 | 任务1面值分类100分代码，和任务2编码识别第五名代码。
 6 | 
 7 | - 任务1：直接是一个分类问题；
 8 | - 任务2：可以抽象成一个字符识别问题；
 9 |   - 先用检测模型（Faster R-CNN）进行检测；
10 |   - 再使用识别模型CRNN或者multi-digit CNN进行识别。
11 | 


--------------------------------------------------------------------------------
/solutions/TinyMind人民币面值&冠字号编码识别挑战赛/task1/README.md:
--------------------------------------------------------------------------------
1 | 1. 修改`predict_rmb.py`文件中对应的路径;
2 | 2. `python predict_rmb.py`
3 | 


--------------------------------------------------------------------------------
/solutions/TinyMind人民币面值&冠字号编码识别挑战赛/task1/predict_rmb.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import os, sys, glob, argparse
  3 | import pandas as pd
  4 | import numpy as np
  5 | from tqdm import tqdm
  6 | 
  7 | import time, datetime
  8 | import pdb, traceback
  9 | 
 10 | import cv2
 11 | from PIL import Image
 12 | 
 13 | from sklearn.preprocessing import LabelEncoder
 14 | from sklearn.model_selection import train_test_split, StratifiedKFold
 15 | 
 16 | import torch
 17 | torch.manual_seed(0)
 18 | torch.backends.cudnn.deterministic = False
 19 | torch.backends.cudnn.benchmark = True
 20 | 
 21 | import torchvision.models as models
 22 | import torchvision.transforms as transforms
 23 | import torchvision.datasets as datasets
 24 | import torch.nn as nn
 25 | import torch.nn.functional as F
 26 | import torch.optim as optim
 27 | from torch.autograd import Variable
 28 | from torch.utils.data.dataset import Dataset
 29 | 
 30 | class QRDataset(Dataset):
 31 |     def __init__(self, img_path, img_label, transform=None):
 32 |         self.img_path = img_path
 33 |         self.img_label=img_label
 34 |         
 35 |         if transform is not None:
 36 |             self.transform = transform
 37 |         else:
 38 |             self.transform = None
 39 |     
 40 |     def __getitem__(self, index):
 41 |         start_time = time.time()
 42 |         img = Image.open(self.img_path[index])
 43 |         
 44 |         if self.transform is not None:
 45 |             img = self.transform(img)
 46 |                 
 47 |         return img, torch.from_numpy(np.array([self.img_label[index]]))
 48 |     
 49 |     def __len__(self):
 50 |         return len(self.img_path)
 51 |         
 52 | class VisitNet(nn.Module):
 53 |     def __init__(self):
 54 |         super(VisitNet, self).__init__()
 55 |         model = models.resnet18(False)
 56 |         model.avgpool = nn.AdaptiveAvgPool2d(1)
 57 |         model.fc = nn.Linear(512, 256)
 58 |         self.resnet = model
 59 |         
 60 |     def forward(self, img):
 61 |         out = self.resnet(img)
 62 |         return F.log_softmax(out, dim=1)
 63 |     
 64 | def predict(test_loader, model, tta=10):
 65 |     # switch to evaluate mode
 66 |     model.eval()
 67 |     
 68 |     test_pred_tta = None
 69 |     for _ in range(tta):
 70 |         test_pred = []
 71 |         with torch.no_grad():
 72 |             end = time.time()
 73 |             for i, (input, target) in enumerate(test_loader):
 74 |                 input = input.cuda()
 75 |                 target = target.cuda()
 76 | 
 77 |                 # compute output
 78 |                 output = model(input)
 79 |                 output = output.data.cpu().numpy()
 80 | 
 81 |                 test_pred.append(output)
 82 |         test_pred = np.vstack(test_pred)
 83 |     
 84 |         if test_pred_tta is None:
 85 |             test_pred_tta = test_pred
 86 |         else:
 87 |             test_pred_tta += test_pred
 88 |     
 89 |     return test_pred_tta
 90 | 
 91 | 
 92 | def main():
 93 |     
 94 |     # 修改输入的路径
 95 |     df_train = pd.read_csv('../../input/train_face_value_label.csv', dtype={' label': object, 'name': object})
 96 |     lbl = LabelEncoder()
 97 |     df_train['y'] = lbl.fit_transform(df_train[' label'].values)
 98 |     
 99 |     # 修改输入的路径
100 |     test_path = glob.glob('../../input/public_test_data/*.jpg')
101 |     test_path = np.array(test_path)
102 |     
103 |     test_loader = torch.utils.data.DataLoader(
104 |         QRDataset(test_path, np.zeros(len(test_path)),
105 |                 transforms.Compose([
106 |                             # transforms.Resize((124, 124)),
107 |                             transforms.Resize(280),
108 |                             transforms.RandomCrop((256, 256)),
109 |                             transforms.RandomHorizontalFlip(),
110 |                             transforms.RandomVerticalFlip(),
111 |                             transforms.ToTensor(),
112 |                             transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
113 |             ])
114 |         ), batch_size=10, shuffle=False, num_workers=10, pin_memory=True
115 |     )
116 | 
117 |     model = VisitNet()
118 |     model = model.cuda()
119 |     model.load_state_dict(torch.load('./resnet18_fold0_11_Acc@1100.00(100.00).pt'))
120 |     
121 |     test_pred = predict(test_loader, model, 10)
122 |     test_pred = np.vstack(test_pred)
123 |     test_pred = np.argmax(test_pred, 1)
124 |     
125 |     test_pred = lbl.inverse_transform(test_pred)
126 |     test_csv = pd.DataFrame()
127 |     test_csv['name'] = [x.split('/')[-1] for x in test_path]
128 |     test_csv['label'] = test_pred
129 |     test_csv.sort_values(by='name', inplace=True)
130 |     test_csv.to_csv('tmp_newmodel_resnet18_tta10.csv', index=None, sep=',')
131 | 
132 | if __name__== "__main__":
133 |     main()
134 | 


--------------------------------------------------------------------------------
/solutions/TinyMind人民币面值&冠字号编码识别挑战赛/task2/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DueapeCommon/kaggle/78009876853d2536be895097c289ac35c748beba/solutions/TinyMind人民币面值&冠字号编码识别挑战赛/task2/.DS_Store


--------------------------------------------------------------------------------
/solutions/TinyMind人民币面值&冠字号编码识别挑战赛/task2/2_predict_faster_rcnn.py:
--------------------------------------------------------------------------------
 1 | import cv2
 2 | import os, glob, shutil, codecs
 3 | 
 4 | import mxnet as mx
 5 | from matplotlib import pyplot as plt
 6 | import gluoncv
 7 | from gluoncv import model_zoo, data, utils
 8 | 
 9 | net = model_zoo.get_model('faster_rcnn_resnet50_v1b_voc', ctx=mx.gpu(0), pretrained=False)
10 | net.load_parameters('./faster_rcnn_resnet50_v1b_voc_0002_0.0519.params')
11 | net.classes = ['zipcode']
12 | net.collect_params().reset_ctx(ctx = mx.gpu(0))
13 | 
14 | # MXNET_CUDNN_AUTOTUNE_DEFAULT=0 python 2_predict_faster_rcnn.py
15 | 
16 | with codecs.open('./data/train_data_box.csv', 'w') as up:
17 |     for path in glob.glob('../input/train_data/*.jpg'):
18 |         orig_img_cv2 = cv2.imread(path)
19 |         x, orig_img = data.transforms.presets.rcnn.load_test(path)
20 |         x = x.as_in_context(mx.gpu(0))
21 |         box_ids, scores, bboxes = net(x)
22 |         bboxes = bboxes.asnumpy()[0][0].astype(int)
23 |         
24 |         y1, x1, y2, x2 = bboxes
25 |         x1*=(orig_img_cv2.shape[0]*1.0/orig_img.shape[0])
26 |         x2*=(orig_img_cv2.shape[0]*1.0/orig_img.shape[0])
27 |         
28 |         y1*=(orig_img_cv2.shape[1]*1.0/orig_img.shape[1])
29 |         y2*=(orig_img_cv2.shape[1]*1.0/orig_img.shape[1])
30 |         
31 |         x1, x2 = int(x1), int(x2)
32 |         y1, y2 = int(y1), int(y2)
33 |         
34 |         # x1-=10; x2+=10
35 |         # y1-=10; y2+=10
36 |         
37 |         # plt.imshow(orig_img_cv2[int(x1):int(x2), int(y1):int(y2), :])
38 |         cv2.imwrite('./data/data/'+path.split('/')[-1], orig_img_cv2[int(x1):int(x2), int(y1):int(y2)])
39 |         up.write('{0},{1},{2},{3},{4}\n'.format(path, x1, y1, x2, y2))
40 | 
41 | with codecs.open('./data/public_test_data_box.csv', 'w') as up:
42 |     for path in glob.glob('../input/public_test_data/*.jpg'):
43 |         orig_img_cv2 = cv2.imread(path)
44 |         x, orig_img = data.transforms.presets.rcnn.load_test(path)
45 |         x = x.as_in_context(mx.gpu(0))
46 |         box_ids, scores, bboxes = net(x)
47 |         bboxes = bboxes.asnumpy()[0][0].astype(int)
48 |         
49 |         y1, x1, y2, x2 = bboxes
50 |         x1*=(orig_img_cv2.shape[0]*1.0/orig_img.shape[0])
51 |         x2*=(orig_img_cv2.shape[0]*1.0/orig_img.shape[0])
52 |         
53 |         y1*=(orig_img_cv2.shape[1]*1.0/orig_img.shape[1])
54 |         y2*=(orig_img_cv2.shape[1]*1.0/orig_img.shape[1])
55 |         
56 |         x1, x2 = int(x1), int(x2)
57 |         y1, y2 = int(y1), int(y2)
58 |         
59 |         #x1-=10; x2+=10
60 |         # y1-=10; y2+=10
61 |         
62 |         # plt.imshow(orig_img_cv2[int(x1):int(x2), int(y1):int(y2), :])
63 |         cv2.imwrite('./data/data/'+path.split('/')[-1], orig_img_cv2[int(x1):int(x2), int(y1):int(y2)])
64 |         up.write('{0},{1},{2},{3},{4}\n'.format(path, x1, y1, x2, y2))
65 |         
66 | with codecs.open('./data/private_test_data_box.csv', 'w') as up:
67 |     for path in glob.glob('../input/private_test_data/*.jpg'):
68 |         orig_img_cv2 = cv2.imread(path)
69 |         x, orig_img = data.transforms.presets.rcnn.load_test(path)
70 |         x = x.as_in_context(mx.gpu(0))
71 |         box_ids, scores, bboxes = net(x)
72 |         bboxes = bboxes.asnumpy()[0][0].astype(int)
73 |         
74 |         y1, x1, y2, x2 = bboxes
75 |         x1*=(orig_img_cv2.shape[0]*1.0/orig_img.shape[0])
76 |         x2*=(orig_img_cv2.shape[0]*1.0/orig_img.shape[0])
77 |         
78 |         y1*=(orig_img_cv2.shape[1]*1.0/orig_img.shape[1])
79 |         y2*=(orig_img_cv2.shape[1]*1.0/orig_img.shape[1])
80 |         
81 |         x1, x2 = int(x1), int(x2)
82 |         y1, y2 = int(y1), int(y2)
83 |         
84 |         #x1-=10; x2+=10
85 |         # y1-=10; y2+=10
86 |         
87 |         # plt.imshow(orig_img_cv2[int(x1):int(x2), int(y1):int(y2), :])
88 |         cv2.imwrite('./data/data/'+path.split('/')[-1], orig_img_cv2[int(x1):int(x2), int(y1):int(y2)])
89 |         up.write('{0},{1},{2},{3},{4}\n'.format(path, x1, y1, x2, y2))


--------------------------------------------------------------------------------
/solutions/TinyMind人民币面值&冠字号编码识别挑战赛/task2/3_savejson.py:
--------------------------------------------------------------------------------
 1 | import cv2
 2 | import os, glob, shutil, codecs, json
 3 | from tqdm import tqdm, tqdm_notebook
 4 | # %pylab inline
 5 | 
 6 | 
 7 | 
 8 | desc = {}
 9 | desc['abc'] = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'
10 | 
11 | desc['train'] = []
12 | desc['test'] = []
13 | desc['pb'] = []
14 | 
15 | import pandas as pd
16 | df_train_label = pd.read_csv('../input/train_id_label.csv')
17 | df_submit = pd.read_csv('./crnn-pytorch/pb_rcnn_label.csv')
18 | df_submit['label'] = df_submit['label'].apply(lambda x: ' '+x)
19 | df_submit.columns = ['name', ' label']
20 | 
21 | df_train_label = pd.concat([df_train_label, df_submit], axis=0, ignore_index=True)
22 | print(df_train_label.shape)
23 | 
24 | train_guanzi = df_train_label[' label'].apply(lambda x: x[-4:]).unique()
25 | 
26 | 
def checkImageIsValid(imagePath):
    """Return True when the file at ``imagePath`` decodes to a non-empty image.

    The file must be readable by ``cv2.imread`` and its raw bytes must decode
    through ``cv2.imdecode`` (grayscale) to an image with a non-zero area.

    Args:
        imagePath: Path of the image file to check.

    Returns:
        ``True`` if both decodes succeed and the image has at least one pixel,
        ``False`` otherwise.
    """
    # Local import keeps this helper usable regardless of where (or whether)
    # the module imports numpy; without it, a missing ``np`` used to be
    # swallowed by a bare ``except`` and every image was reported invalid.
    import numpy as np

    if cv2.imread(imagePath) is None:
        return False

    with open(imagePath, 'rb') as f:
        imageBin = f.read()

    if not imageBin:
        return False

    # frombuffer replaces the deprecated np.fromstring for binary data.
    imageBuf = np.frombuffer(imageBin, dtype=np.uint8)
    try:
        img = cv2.imdecode(imageBuf, cv2.IMREAD_GRAYSCALE)
    except cv2.error:
        return False

    # imdecode signals an undecodable buffer by returning None.
    if img is None:
        return False

    imgH, imgW = img.shape[0], img.shape[1]
    return imgH * imgW != 0
47 | 
# Imported before first use: the validation loop below runs code that needs numpy.
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold

# Validate every labelled crop exactly once.
bad_img_path = []
for x in df_train_label['name'].values:
    if not checkImageIsValid('./data/data/'+x):
        bad_img_path.append(x)

# Set copy for O(1) membership tests inside the fold loops.
bad_img_names = set(bad_img_path)


def _fold_entries(frame):
    """Return the desc.json entries for the rows of ``frame`` with a valid crop.

    Every name was already validated above, so rows are filtered by name
    instead of decoding each image again for every fold.
    """
    return [
        {'text': row[' label'].strip(), 'name': row['name']}
        for _, row in frame.iterrows()
        if row['name'] not in bad_img_names
    ]


X = np.zeros((df_train_label['name'].shape[0], 2))
kf = KFold(n_splits=24)

print(kf)
# NOTE(review): KFold ignores the y argument; if stratifying on the first
# label character was intended, StratifiedKFold would be needed -- confirm.
fold_labels = df_train_label[' label'].apply(lambda x:x[1:2])
for fold_idx, (train_index, test_index) in enumerate(kf.split(X, fold_labels)):
    print("TRAIN:", train_index, "TEST:", test_index)

    desc['fold'+str(fold_idx)+'_train'] = _fold_entries(df_train_label.iloc[train_index])
    desc['fold'+str(fold_idx)+'_test'] = _fold_entries(df_train_label.iloc[test_index])

# Private test images get a fixed placeholder text.
for row in glob.glob('../input/private_test_data/*'):
    desc['pb'].append({'text':'QJ69411105', 'name':os.path.basename(row)})

with open('./data/desc.json', 'w') as outfile:
    json.dump(desc, outfile)


--------------------------------------------------------------------------------
/solutions/TinyMind人民币面值&冠字号编码识别挑战赛/task2/VOC2007.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DueapeCommon/kaggle/78009876853d2536be895097c289ac35c748beba/solutions/TinyMind人民币面值&冠字号编码识别挑战赛/task2/VOC2007.zip


--------------------------------------------------------------------------------
/solutions/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/README.md:
--------------------------------------------------------------------------------
 1 | Convolutional Recurrent Neural Network
 2 | ======================================
 3 | 
 4 | This software implements an OCR system using CNN + RNN + CTCLoss, inspired by the CRNN network.
 5 | 
 6 | Usage
 7 | -----
 8 | 
 9 | ```
10 | python ./train.py --help
11 | ```
12 | 
13 | Demo
14 | ----
15 | 
16 | 1. Train a simple OCR model using the synthetic TestDataset data generator.
17 | Train for ~60-100 epochs.
18 | ```
19 | python train.py --test-init True --test-epoch 10 --output-dir <path_to_folder_with_snapshots>
20 | ```
21 | 
22 | 2. Run test for trained model with visualization mode.
23 | ```
24 | python test.py --snapshot <path_to_folder_with_snapshots>/crnn_resnet18_10_best --visualize True
25 | ```
26 | 
27 | Train on custom dataset
28 | -----------------------
29 | 
30 | 1. Create dataset
31 | 
32 | - Structure of dataset:
33 | ```
34 | <root_dataset_dir>
35 | ---- data
36 | -------- <img_filename_0>
37 | ...
38 | -------- <img_filename_1>
39 | ---- desc.json
40 | ```
41 | 
42 | - Structure of desc.json:
43 | ```
44 | {
45 | "abc": <symbols_in_alphabet>,
46 | "train": [
47 | {
48 | "text": <text_on_image>
49 | "name": <img_filename>
50 | },
51 | ...
52 | {
53 | "text": <text_on_image>
54 | "name": <img_filename>
55 | }
56 | ],
57 | "test": [
58 | {
59 | "text": <text_on_image>
60 | "name": <img_filename>
61 | },
62 | ...
63 | {
64 | "text": <text_on_image>
65 | "name": <img_filename>
66 | }
67 | ]
68 | }
69 | ```
70 | 
71 | 2. Train simple OCR using custom dataset.
72 | ```
73 | python train.py --test-init True --test-epoch 10 --output-dir <path_to_folder_with_snapshots> --data-path <path_to_custom_dataset>
74 | ```
75 | 
76 | 3. Run test for trained model with visualization mode.
77 | ```
78 | python test.py --snapshot <path_to_folder_with_snapshots>/crnn_resnet18_10_best --visualize True --data-path <path_to_custom_dataset>
79 | ```
80 | 
81 | 
82 | Dependencies
83 | ------------
84 | * pytorch 0.3.0 +
85 | * [warp-ctc](https://github.com/SeanNaren/warp-ctc)
86 | 
87 | Articles
88 | --------
89 | 
90 | * [An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition](https://arxiv.org/abs/1507.05717)
91 | * [Connectionist Temporal Classification: Labelling Unsegmented Sequence Data with Recurrent Neural Networks](https://dl.acm.org/citation.cfm?id=1143891)
92 | 


--------------------------------------------------------------------------------
/solutions/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DueapeCommon/kaggle/78009876853d2536be895097c289ac35c748beba/solutions/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/__init__.py


--------------------------------------------------------------------------------
/solutions/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/dataset/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DueapeCommon/kaggle/78009876853d2536be895097c289ac35c748beba/solutions/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/dataset/__init__.py


--------------------------------------------------------------------------------
/solutions/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/dataset/collate_fn.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import numpy as np
 3 | 
 4 | def text_collate(batch):
 5 |     img = list()
 6 |     seq = list()
 7 |     seq_len = list()
 8 |     for sample in batch:
 9 |         img.append(torch.from_numpy(sample["img"].transpose((2, 0, 1))).float())
10 |         seq.extend(sample["seq"])
11 |         seq_len.append(sample["seq_len"])
12 |     img = torch.stack(img)
13 |     seq = torch.Tensor(seq).int()
14 |     seq_len = torch.Tensor(seq_len).int()
15 |     batch = {"img": img, "seq": seq, "seq_len": seq_len}
16 |     return batch
17 | 


--------------------------------------------------------------------------------
/solutions/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/dataset/data_transform.py:
--------------------------------------------------------------------------------
  1 | import random
  2 | import numpy as np
  3 | import cv2
  4 | import torch
  5 | import albumentations.augmentations.functional as albumentations
  6 | 
  7 | class ToTensor(object):
  8 |     def __call__(self, sample):
  9 |         sample["img"] = torch.from_numpy(sample["img"].transpose((2, 0, 1))).float()
 10 | #         sample["img"][0] = (sample["img"][0] - 0.485)/0.229
 11 | #         sample["img"][0] = (sample["img"][0] - 0.456)/0.224
 12 | #         sample["img"][0] = (sample["img"][0] - 0.406)/0.225
 13 |         
 14 |         sample["seq"] = torch.Tensor(sample["seq"]).int()
 15 |         return sample
 16 | 
 17 | 
 18 | class Resize(object):
 19 |     def __init__(self, size=(320, 32)):
 20 |         self.size = size
 21 | 
 22 |     def __call__(self, sample):
 23 |         if sample["img"] is None:
 24 |             return np.zeros((320, 32, 3))
 25 |             
 26 |         else:
 27 |             sample["img"] = cv2.resize(sample["img"], self.size)
 28 |             sample["img"] = sample["img"].astype(float)/255.0
 29 |             sample["img"][0] = (sample["img"][0] - 0.485)/0.229
 30 |             sample["img"][0] = (sample["img"][0] - 0.456)/0.224
 31 |             sample["img"][0] = (sample["img"][0] - 0.406)/0.225
 32 |             return sample
 33 | 
 34 | 
 35 | class Rotation(object):
 36 |     def __init__(self, angle=5, fill_value=0, p = 0.5):
 37 |         self.angle = angle
 38 |         self.fill_value = fill_value
 39 |         self.p = p
 40 | 
 41 |     def __call__(self, sample):
 42 |         if np.random.uniform(0.0, 1.0) < self.p:
 43 |             return sample
 44 |         h,w,_ = sample["img"].shape
 45 |         ang_rot = np.random.uniform(self.angle) - self.angle/2
 46 |         transform = cv2.getRotationMatrix2D((w/2, h/2), ang_rot, 1)
 47 |         sample["img"] = cv2.warpAffine(sample["img"], transform, (w,h), borderValue = self.fill_value)
 48 |         return sample
 49 | 
 50 | 
 51 | class Translation(object):
 52 |     def __init__(self, fill_value=0, p = 0.5):
 53 |         self.fill_value = fill_value
 54 |         self.p = p
 55 | 
 56 |     def __call__(self, sample):
 57 |         if np.random.uniform(0.0, 1.0) < self.p:
 58 |             return sample
 59 |         h,w,_ = sample["img"].shape
 60 |         trans_range = [w / 20, h / 20]
 61 |         tr_x = trans_range[0]*np.random.uniform()-trans_range[0]/2
 62 |         tr_y = trans_range[1]*np.random.uniform()-trans_range[1]/2
 63 |         transform = np.float32([[1,0, tr_x], [0,1, tr_y]])
 64 |         sample["img"] = cv2.warpAffine(sample["img"], transform, (w,h), borderValue = self.fill_value)
 65 |         return sample
 66 | 
 67 | 
 68 | class Scale(object):
 69 |     def __init__(self, scale=[0.5, 1.2], fill_value=0, p = 0.5):
 70 |         self.scale = scale
 71 |         self.fill_value = fill_value
 72 |         self.p = p
 73 | 
 74 |     def __call__(self, sample):
 75 |         if np.random.uniform(0.0, 1.0) < self.p:
 76 |             return sample
 77 |         h, w, _ = sample["img"].shape
 78 |         scale = np.random.uniform(self.scale[0], self.scale[1])
 79 |         transform = np.float32([[scale, 0, 0],[0, scale, 0]])
 80 |         sample["img"] = cv2.warpAffine(sample["img"], transform, (w,h), borderValue = self.fill_value)
 81 |         return sample
 82 | 
 83 | # add lyz
 84 | class Snow(object):
 85 |     def __init__(self, p = 0.5):
 86 |         self.p = p
 87 | 
 88 |     def __call__(self, sample):
 89 |         if np.random.uniform(0.0, 1.0) < self.p or not sample["aug"]:
 90 |             return sample
 91 |         h, w, _ = sample["img"].shape
 92 |         sample["img"] = albumentations.add_snow(sample["img"], snow_point=0.5, brightness_coeff=2)
 93 |         return sample
 94 |     
 95 | class Contrast(object):
 96 |     def __init__(self, p = 0.5):
 97 |         self.p = p
 98 | 
 99 |     def __call__(self, sample):
100 |         if np.random.uniform(0.0, 1.0) < self.p:
101 |             return sample
102 |         h, w, _ = sample["img"].shape
103 |         sample["img"] = albumentations.brightness_contrast_adjust(sample["img"], beta=np.random.uniform(0.0, 1.0)+0.1)
104 |         # sample["img"] = cv2.GaussianBlur(sample["img"],(3,3),0)
105 |         return sample
106 |     
class Grid_distortion(object):
    """Randomly warp the sample image and optionally JPEG-compress it.

    When not skipped, the image goes through either a grid distortion or an
    elastic transform (chosen by a second random draw against ``p``), and is
    then JPEG-compressed when a third draw is below ``p - 0.2``.

    Args:
        p: the whole transform is skipped when a uniform draw from [0, 1)
            is below ``p``; ``p`` also drives the two later draws.
    """

    def __init__(self, p = 0.5):
        self.p = p

    def __call__(self, sample):
        """Return ``sample`` with "img" possibly distorted."""
        if np.random.uniform(0.0, 1.0) < self.p:
            return sample
        # Values are unused; the unpack requires a 3-D image.
        height, width, _ = sample["img"].shape

        use_grid = np.random.uniform(0.0, 1.0) < self.p
        if use_grid:
            step_count = 15
            low, high = -0.05, 0.05
            # x multipliers are drawn first, then y, each step_count + 1 values.
            x_steps = [1 + random.uniform(low, high) for _ in range(step_count + 1)]
            y_steps = [1 + random.uniform(low, high) for _ in range(step_count + 1)]
            sample["img"] = albumentations.grid_distortion(sample["img"], 5, x_steps, y_steps)
        else:
            sample["img"] = albumentations.elastic_transform(
                sample["img"],
                alpha=5,
                sigma=1,
                alpha_affine=random.uniform(0,2),
                interpolation=cv2.INTER_LINEAR,
                border_mode=cv2.BORDER_REFLECT_101,
            )

        compress = np.random.uniform(0.0, 1.0) < self.p-0.2
        if compress:
            sample["img"] = albumentations.jpeg_compression(sample["img"], random.randint(20, 100))
        return sample


--------------------------------------------------------------------------------
/solutions/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/dataset/test_data.py:
--------------------------------------------------------------------------------
 1 | import cv2
 2 | import numpy as np
 3 | import torch
 4 | from torch.utils.data import Dataset
 5 | import string
 6 | import random
 7 | 
 8 | class TestDataset(Dataset):
 9 |     def __init__(self,
10 |                  epoch_len = 10000,
11 |                  seq_len = 8,
12 |                  transform=None,
13 |                  abc=string.digits):
14 |         super().__init__()
15 |         self.abc = abc
16 |         self.epoch_len = epoch_len
17 |         self.seq_len = seq_len
18 |         self.transform = transform
19 | 
20 |     def __len__(self):
21 |         return self.epoch_len
22 | 
23 |     def get_abc(self):
24 |         return self.abc
25 | 
26 |     def set_mode(self, mode='train'):
27 |         return
28 | 
29 |     def generate_string(self):
30 |         return ''.join(random.choice(self.abc) for _ in range(self.seq_len))
31 | 
32 |     def get_sample(self):
33 |         h, w = 64, int(self.seq_len * 64 * 2.5)
34 |         pw = int(w / self.seq_len)
35 |         seq = []
36 |         img = np.zeros((h, w), dtype=np.uint8)
37 |         text = self.generate_string()
38 |         for i in range(len(text)):
39 |             c = text[i]
40 |             seq.append(self.abc.find(c) + 1)
41 |             hs, ws = 32, 32
42 |             symb = np.zeros((hs, ws), dtype=np.uint8)
43 |             font = cv2.FONT_HERSHEY_SIMPLEX
44 |             cv2.putText(symb, str(c), (3, 30), font, 1.2, (255), 2, cv2.LINE_AA)
45 |             # Rotation
46 |             angle = 60
47 |             ang_rot = np.random.uniform(angle) - angle/2
48 |             transform = cv2.getRotationMatrix2D((ws/2, hs/2), ang_rot, 1)
49 |             symb = cv2.warpAffine(symb, transform, (ws, hs), borderValue = 0)
50 |             # Scale
51 |             scale = np.random.uniform(0.7, 1.0)
52 |             transform = np.float32([[scale, 0, 0],[0, scale, 0]])
53 |             symb = cv2.warpAffine(symb, transform, (ws, hs), borderValue = 0)
54 |             y = np.random.randint(hs, h)
55 |             x = np.random.randint(i * pw, (i + 1) * pw - ws)
56 |             img[y-hs:y, x:x+ws] = symb
57 |         nw = int(w * 32 / h)
58 |         img = cv2.resize(img, (nw, 32))
59 |         img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
60 |         return img, seq
61 | 
62 |     def __getitem__(self, idx):
63 |         img, seq = self.get_sample()
64 |         sample = {"img": img, "seq": seq, "seq_len": len(seq), "aug": True}
65 |         if self.transform:
66 |             sample = self.transform(sample)
67 |         return sample
68 | 


--------------------------------------------------------------------------------
/solutions/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/dataset/text_data.py:
--------------------------------------------------------------------------------
 1 | from torch.utils.data import Dataset
 2 | import json
 3 | import os
 4 | import cv2
 5 | 
 6 | class TextDataset(Dataset):
 7 |     def __init__(self, data_path, mode="train", transform=None):
 8 |         super(Dataset, self).__init__()
 9 |         self.data_path = data_path
10 |         self.mode = mode
11 |         self.config = json.load(open(os.path.join(data_path, "desc.json")))
12 |         self.transform = transform
13 | 
14 |     def abc_len(self):
15 |         return len(self.config["abc"])
16 | 
17 |     def get_abc(self):
18 |         return self.config["abc"]
19 | 
20 |     def set_mode(self, mode):
21 |         self.mode = mode
22 | 
23 |     def __len__(self):
24 |         if self.mode == "test":
25 |             return len(self.config[self.mode])
26 |         return len(self.config[self.mode])
27 | 
28 |     def __getitem__(self, idx):
29 |         
30 |         name = self.config[self.mode][idx]["name"]
31 |         text = self.config[self.mode][idx]["text"]
32 | 
33 |         img = cv2.imread(os.path.join(self.data_path, "data", name))
34 |         # print(os.path.join(self.data_path, "data", name))
35 |         # img = cv2.imread(os.path.join(self.data_path, name))
36 |         seq = self.text_to_seq(text)
37 |         sample = {"img": img, "seq": seq, "seq_len": len(seq), "aug": self.mode == "train"}
38 |         if self.transform:
39 |             # print('trans')
40 |             sample = self.transform(sample)
41 |         return sample
42 | 
43 |     def text_to_seq(self, text):
44 |         seq = []
45 |         for c in text:
46 |             seq.append(self.config["abc"].find(c) + 1)
47 |         return seq


--------------------------------------------------------------------------------
/solutions/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/fold_tta.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DueapeCommon/kaggle/78009876853d2536be895097c289ac35c748beba/solutions/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/fold_tta.pkl


--------------------------------------------------------------------------------
/solutions/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/lr_policy.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | 
 3 | 
 4 | class StepLR(object):
 5 |     def __init__(self, optimizer, step_size=1000, max_iter=10000):
 6 |         self.optimizer = optimizer
 7 |         self.max_iter = max_iter
 8 |         self.step_size = step_size
 9 |         self.last_iter = -1
10 |         self.base_lrs = list(map(lambda group: group['lr'], optimizer.param_groups))
11 | 
12 |     def get_lr(self):
13 |         return self.optimizer.param_groups[0]['lr']
14 | 
15 |     def step(self, last_iter=None):
16 |         if last_iter is not None:
17 |             self.last_iter = last_iter
18 |         if self.last_iter + 1 == self.max_iter:
19 |             self.last_iter = -1
20 |         self.last_iter = (self.last_iter + 1) % self.max_iter
21 |         for ids, param_group in enumerate(self.optimizer.param_groups):
22 |             param_group['lr'] = self.base_lrs[ids] * 0.8 ** ( self.last_iter // self.step_size )
23 | 


--------------------------------------------------------------------------------
/solutions/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DueapeCommon/kaggle/78009876853d2536be895097c289ac35c748beba/solutions/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/models/__init__.py


--------------------------------------------------------------------------------
/solutions/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/models/crnn.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | from torch.autograd import Variable
  4 | 
  5 | import torchvision.models as models
  6 | import string
  7 | import numpy as np
  8 | import torch.nn.init as init
  9 | 
 10 | def weight_init(m):
 11 |     '''
 12 |     Usage:
 13 |         model = Model()
 14 |         model.apply(weight_init)
 15 |     '''
 16 |     if isinstance(m, nn.Conv1d):
 17 |         init.normal_(m.weight.data)
 18 |         if m.bias is not None:
 19 |             init.normal_(m.bias.data)
 20 |     elif isinstance(m, nn.Conv2d):
 21 |         init.xavier_normal_(m.weight.data)
 22 |         if m.bias is not None:
 23 |             init.normal_(m.bias.data)
 24 |     elif isinstance(m, nn.Conv3d):
 25 |         init.xavier_normal_(m.weight.data)
 26 |         if m.bias is not None:
 27 |             init.normal_(m.bias.data)
 28 |     elif isinstance(m, nn.ConvTranspose1d):
 29 |         init.normal_(m.weight.data)
 30 |         if m.bias is not None:
 31 |             init.normal_(m.bias.data)
 32 |     elif isinstance(m, nn.ConvTranspose2d):
 33 |         init.xavier_normal_(m.weight.data)
 34 |         if m.bias is not None:
 35 |             init.normal_(m.bias.data)
 36 |     elif isinstance(m, nn.ConvTranspose3d):
 37 |         init.xavier_normal_(m.weight.data)
 38 |         if m.bias is not None:
 39 |             init.normal_(m.bias.data)
 40 |     elif isinstance(m, nn.BatchNorm1d):
 41 |         init.normal_(m.weight.data, mean=1, std=0.02)
 42 |         init.constant_(m.bias.data, 0)
 43 |     elif isinstance(m, nn.BatchNorm2d):
 44 |         init.normal_(m.weight.data, mean=1, std=0.02)
 45 |         init.constant_(m.bias.data, 0)
 46 |     elif isinstance(m, nn.BatchNorm3d):
 47 |         init.normal_(m.weight.data, mean=1, std=0.02)
 48 |         init.constant_(m.bias.data, 0)
 49 |     elif isinstance(m, nn.Linear):
 50 |         init.xavier_normal_(m.weight.data)
 51 |         init.normal_(m.bias.data)
 52 |     elif isinstance(m, nn.LSTM):
 53 |         for param in m.parameters():
 54 |             if len(param.shape) >= 2:
 55 |                 init.orthogonal_(param.data)
 56 |             else:
 57 |                 init.normal_(param.data)
 58 |     elif isinstance(m, nn.LSTMCell):
 59 |         for param in m.parameters():
 60 |             if len(param.shape) >= 2:
 61 |                 init.orthogonal_(param.data)
 62 |             else:
 63 |                 init.normal_(param.data)
 64 |     elif isinstance(m, nn.GRU):
 65 |         for param in m.parameters():
 66 |             if len(param.shape) >= 2:
 67 |                 init.orthogonal_(param.data)
 68 |             else:
 69 |                 init.normal_(param.data)
 70 |     elif isinstance(m, nn.GRUCell):
 71 |         for param in m.parameters():
 72 |             if len(param.shape) >= 2:
 73 |                 init.orthogonal_(param.data)
 74 |             else:
 75 |                 init.normal_(param.data)
 76 | 
 77 | class CRNN(nn.Module):
 78 |     def __init__(self,
 79 |                  abc=string.digits,
 80 |                  backend='resnet18',
 81 |                  rnn_hidden_size=64,
 82 |                  rnn_num_layers=2,
 83 |                  rnn_dropout=0,
 84 |                  seq_proj=[0, 0]):
 85 |         super(CRNN, self).__init__()
 86 | 
 87 |         self.abc = abc
 88 |         self.num_classes = len(self.abc)
 89 | 
 90 |         self.feature_extractor = getattr(models, backend)(pretrained=False)
 91 |         self.cnn = nn.Sequential(
 92 |             self.feature_extractor.conv1,
 93 |             self.feature_extractor.bn1,
 94 |             self.feature_extractor.relu,
 95 |             # self.feature_extractor.maxpool,
 96 |             self.feature_extractor.layer1,
 97 |             self.feature_extractor.layer2,
 98 |             self.feature_extractor.layer3,
 99 |             self.feature_extractor.layer4
100 |         )
101 | 
102 |         self.fully_conv = seq_proj[0] == 0
103 |         if not self.fully_conv:
104 |             self.proj = nn.Conv2d(seq_proj[0], seq_proj[1], kernel_size=1)
105 | 
106 |         self.rnn_hidden_size = rnn_hidden_size
107 |         self.rnn_num_layers = rnn_num_layers
108 |         
109 |         if backend in ['resnet18','resnet34']:
110 |             self.block_size = 512
111 |         else:
112 |             self.block_size = 2048
113 |         
114 |         self.rnn = nn.GRU(self.block_size,
115 |                           rnn_hidden_size, rnn_num_layers,
116 |                           batch_first=False,
117 |                           dropout=rnn_dropout, bidirectional=True)
118 |         self.linear = nn.Linear(rnn_hidden_size * 2, self.num_classes + 1)
119 |         self.softmax = nn.LogSoftmax(dim=2)
120 | 
121 |     def forward(self, x, decode=False):
122 |         hidden = self.init_hidden(x.size(0), next(self.parameters()).is_cuda)
123 |         features = self.cnn(x)
124 |         # print(features.shape)
125 |         
126 |         if features.shape[2] != 1:
127 |             # features = features.max(2).reshape(features.shape[0], features.shape[1], 1, features.shape[-1])
128 |             features = features.max(2)[0].reshape(features.shape[0], features.shape[1], 1, features.shape[-1])
129 |         # features = features.max(2).reshape(features.shape[0], features.shape[1], 1, features.shape[-1])
130 |         # print(features.shape)
131 |         # return features
132 |     
133 |         features = self.features_to_sequence(features)
134 |         seq, hidden = self.rnn(features, hidden)
135 |         seq = self.linear(seq)
136 |         if not self.training:
137 |             seq = self.softmax(seq)
138 |             if decode:
139 |                 seq = self.decode(seq)
140 |         return seq
141 | 
142 |     def init_hidden(self, batch_size, gpu=False):
143 |         h0 = Variable(torch.zeros( self.rnn_num_layers * 2,
144 |                                    batch_size,
145 |                                    self.rnn_hidden_size))
146 |         if gpu:
147 |             h0 = h0.cuda()
148 |         return h0
149 | 
150 |     def features_to_sequence(self, features):
151 |         b, c, h, w = features.size()
152 |         assert h == 1, "the height of out must be 1"
153 |         if not self.fully_conv:
154 |             features = features.permute(0, 3, 2, 1)
155 |             features = self.proj(features)
156 |             features = features.permute(1, 0, 2, 3)
157 |         else:
158 |             features = features.permute(3, 0, 2, 1)
159 |         features = features.squeeze(2)
160 |         return features
161 | 
162 |     def get_block_size(self, layer):
163 |         return layer[-1][-1].bn2.weight.size()[0]
164 | 
165 |     def pred_to_string(self, pred):
166 |         seq = []
167 |         for i in range(pred.shape[0]):
168 |             label = np.argmax(pred[i])
169 |             seq.append(label - 1)
170 |         out = []
171 |         for i in range(len(seq)):
172 |             if len(out) == 0:
173 |                 if seq[i] != -1:
174 |                     out.append(seq[i])
175 |             else:
176 |                 if seq[i] != -1 and seq[i] != seq[i - 1]:
177 |                     out.append(seq[i])
178 |         out = ''.join(self.abc[i] for i in out)
179 |         return out
180 | 
181 |     def decode(self, pred):
182 |         pred = pred.permute(1, 0, 2).cpu().data.numpy()
183 |         seq = []
184 |         for i in range(pred.shape[0]):
185 |             seq.append(self.pred_to_string(pred[i]))
186 |         return seq
187 | 


--------------------------------------------------------------------------------
/solutions/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/models/model_loader.py:
--------------------------------------------------------------------------------
 1 | from collections import OrderedDict
 2 | 
 3 | import torch
 4 | from torch import nn
 5 | 
 6 | from .crnn import CRNN
 7 | 
 8 | def load_weights(target, source_state):
 9 |     new_dict = OrderedDict()
10 |     for k, v in target.state_dict().items():
11 |         if k in source_state and v.size() == source_state[k].size():
12 |             new_dict[k] = source_state[k]
13 |         else:
14 |             new_dict[k] = v
15 |     target.load_state_dict(new_dict)
16 | 
def load_model(abc, seq_proj=[0, 0], backend='resnet18', snapshot=None, cuda=True):
    """Build a DataParallel-wrapped CRNN and optionally load a snapshot.

    Args:
        abc: alphabet string passed to CRNN.
        seq_proj: projection spec passed to CRNN (not modified here).
        backend: backbone name passed to CRNN.
        snapshot: path of a saved state dict, or None to keep the freshly
            constructed weights.
        cuda: move the model to the GPU when true.

    Returns:
        The ``nn.DataParallel`` wrapper around the CRNN.
    """
    net = CRNN(abc=abc, seq_proj=seq_proj, backend=backend)
    net = nn.DataParallel(net)
    if snapshot is not None:
        # Without map_location, a snapshot saved from GPU tensors cannot be
        # deserialised on a CPU-only host; remap to CPU when cuda is off.
        map_location = None if cuda else 'cpu'
        # NOTE: torch.load unpickles the file - only load trusted snapshots.
        load_weights(net, torch.load(snapshot, map_location=map_location))
    if cuda:
        net = net.cuda()
    return net
25 | 


--------------------------------------------------------------------------------
/solutions/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/submit.py:
--------------------------------------------------------------------------------
 1 | def check_label(s):
 2 |     if '*' in s:
 3 |         return True
 4 |     if len(s) != 10:
 5 |         return True
 6 |     
 7 |     if len(set(s[3:]) & set(string.ascii_uppercase)) > 0:
 8 |         return True
 9 |     
10 |     if s[0] in string.digits:
11 |         return True
12 |     
13 |     if s[0] in string.ascii_uppercase and s[1] in string.ascii_uppercase and s[2] in string.ascii_uppercase:
14 |         return True
15 |     
16 |     if s[0] in string.ascii_uppercase and s[1] in string.ascii_uppercase:
17 |         return True 
18 |     elif s[0] in string.ascii_uppercase and s[2] in string.ascii_uppercase and s[1] in string.digits:
19 |         return True
20 |     else:
21 |         return False
22 |     
23 | 
24 | import pandas as pd
25 | import string
submit_df1 = pd.read_csv('./tmp_rcnn_tta10_pb.csv')
submit_df2 = pd.read_csv('../multi-digit-pytorch/tmp_rcnn_tta10_cnn.csv')

# Take the label for one specific image from the second prediction file.
# A scalar is assigned on purpose: assigning a Series through .loc aligns on
# the row index, which writes NaN whenever the two files index that image
# differently.
override_name = 'OFTUHPVE.jpg'
override_label = submit_df2.loc[submit_df2['name'] == override_name, 'label']
if not override_label.empty:
    submit_df1.loc[submit_df1['name'] == override_name, 'label'] = override_label.iloc[0]

# Rows for which check_label returns False. Previously this expression was
# evaluated and discarded; printing makes the rows visible when run as a script.
unflagged = submit_df1[~submit_df1['label'].apply(check_label)]
print(unflagged)

submit_df1.to_csv('tmp_rcnn_tta10_pb_submit.csv',index=None)


--------------------------------------------------------------------------------
/solutions/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/test.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import cv2
 3 | import string
 4 | from tqdm import tqdm
 5 | import click
 6 | import numpy as np
 7 | import pandas as pd
 8 | import torch
 9 | from torch.autograd import Variable
10 | from torch.utils.data import DataLoader
11 | 
12 | from dataset.test_data import TestDataset
13 | from dataset.text_data import TextDataset
14 | from dataset.collate_fn import text_collate
15 | from dataset.data_transform import Resize, Rotation, Translation, Scale
16 | from models.model_loader import load_model
17 | from torchvision.transforms import Compose
18 | 
19 | import editdistance
20 | 
def test(net, data, abc, cuda, visualize, batch_size=10):
    """Evaluate ``net`` on ``data`` and collect the misrecognised samples.

    Args:
        net: model called as ``net(imgs, decode=True)``, returning one
            decoded string per image.
        data: dataset whose samples are batched with ``text_collate``.
        abc: alphabet used to turn ground-truth label indices into text.
        cuda: move image batches to the GPU when true.
        visualize: when false, the progress bar shows running metrics.
        batch_size: evaluation batch size (previously accepted but ignored).

    Returns:
        (acc, avg_ed, error_idx): exact-match accuracy, edit distance
        averaged over all samples, and the dataset positions of the
        samples that were not matched exactly. For an empty dataset the
        metrics are 0.0.
    """
    data_loader = DataLoader(data, batch_size=batch_size, num_workers=1, shuffle=False, collate_fn=text_collate)

    error_idx = []
    count = 0
    tp = 0
    total_ed = 0
    iterator = tqdm(data_loader)
    # Evaluation only: no gradients are needed.
    with torch.no_grad():
        for sample in iterator:
            imgs = Variable(sample["img"])
            if cuda:
                imgs = imgs.cuda()
            out = net(imgs, decode=True)
            # Labels are stored shifted by +1; undo that to index into abc.
            gt = (sample["seq"].numpy() - 1).tolist()
            lens = sample["seq_len"].numpy().tolist()
            pos = 0
            for i, pred in enumerate(out):
                # "seq" is the flat concatenation of all label sequences.
                gts = ''.join(abc[c] for c in gt[pos:pos+lens[i]])
                pos += lens[i]

                if gts == pred:
                    tp += 1
                else:
                    error_idx.append(count)
                    total_ed += editdistance.eval(pred, gts)
                count += 1
            if not visualize:
                iterator.set_description("acc: {0:.4f}; avg_ed: {1:.4f}".format(tp / count, total_ed / count))

    if count == 0:
        # Nothing was evaluated; avoid dividing by zero.
        return 0.0, 0.0, error_idx

    acc = tp / count
    avg_ed = total_ed / count
    return acc, avg_ed, error_idx
58 | 
@click.command()
@click.option('--data-path', type=str, default=None, help='Path to dataset')
@click.option('--abc', type=str, default=string.digits+string.ascii_uppercase, help='Alphabet')
@click.option('--seq-proj', type=str, default="10x20", help='Projection of sequence')
@click.option('--backend', type=str, default="resnet18", help='Backend network')
@click.option('--snapshot', type=str, default=None, help='Pre-trained weights')
@click.option('--input-size', type=str, default="320x32", help='Input size')
@click.option('--gpu', type=str, default='0', help='List of GPUs for parallel training, e.g. 0,1,2,3')
@click.option('--visualize', type=bool, default=False, help='Visualize output')
def main(data_path, abc, seq_proj, backend, snapshot, input_size, gpu, visualize):
    """Evaluate a model snapshot and print accuracy and edit distance."""
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu
    # Compare by value: ``is not ''`` tests object identity, not emptiness.
    cuda = gpu != ''

    input_size = [int(x) for x in input_size.split('x')]
    transform = Compose([
        Rotation(),
        Resize(size=(input_size[0], input_size[1]))
    ])
    if data_path is not None:
        data = TextDataset(data_path=data_path, mode="test", transform=transform)
    else:
        # No dataset given: fall back to the synthetic generator.
        data = TestDataset(transform=transform, abc=abc)
    seq_proj = [int(x) for x in seq_proj.split('x')]
    net = load_model(data.get_abc(), seq_proj, backend, snapshot, cuda).eval()
    # test() returns three values; the list of failing indices is not used here.
    acc, avg_ed, _error_idx = test(net, data, data.get_abc(), cuda, visualize)

    print("Accuracy: {}".format(acc))
    print("Edit distance: {}".format(avg_ed))

if __name__ == '__main__':
    main()
92 | 


--------------------------------------------------------------------------------
/solutions/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/test2.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import cv2, glob
  3 | import string
  4 | from tqdm import tqdm
  5 | import click
  6 | import numpy as np
  7 | import pandas as pd
  8 | 
  9 | import torch
 10 | from torch.autograd import Variable
 11 | from torch.utils.data import DataLoader
 12 | 
 13 | from dataset.test_data import TestDataset
 14 | from dataset.text_data import TextDataset
 15 | from dataset.collate_fn import text_collate
 16 | from dataset.data_transform import Resize, Rotation, Translation, Scale, Contrast, Snow, Grid_distortion
 17 | from models.model_loader import load_model
 18 | from torchvision.transforms import Compose
 19 | 
 20 | import editdistance
 21 | 
 22 | def pred_to_string(pred):
 23 |     seq = []
 24 |     for i in range(pred.shape[0]):
 25 |         label = np.argmax(pred[i])
 26 |         seq.append(label - 1)
 27 |     out = []
 28 |     for i in range(len(seq)):
 29 |         if len(out) == 0:
 30 |             if seq[i] != -1:
 31 |                 out.append(seq[i])
 32 |         else:
 33 |             if seq[i] != -1 and seq[i] != seq[i - 1]:
 34 |                 out.append(seq[i])
 35 |     out = ''.join('0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'[i] for i in out)
 36 |     return out
 37 | 
 38 | def decode(pred):
 39 |     seq = []
 40 |     for i in range(pred.shape[0]):
 41 |         seq.append(pred_to_string(pred[i]))
 42 |     return seq
 43 | 
 44 | def test(net, data, abc, cuda, visualize, batch_size=256):
 45 |     data_loader = DataLoader(data, batch_size=batch_size, num_workers=4, shuffle=False, collate_fn=text_collate)
 46 | 
 47 |     count = 0.0
 48 |     tp = 0.0
 49 |     avg_ed = 0.0
 50 |     pred_pb = []
 51 |     iterator = tqdm(data_loader)
 52 |     for sample in iterator:
 53 |         imgs = Variable(sample["img"])
 54 |         if cuda:
 55 |             imgs = imgs.cuda()
 56 |         out = net(imgs, decode=True)
 57 |         gt = (sample["seq"].numpy() - 1).tolist()
 58 |         lens = sample["seq_len"].numpy().tolist()
 59 |         pos = 0
 60 |         key = ''
 61 |         for i in range(len(out)):
 62 |             gts = ''.join(abc[c] for c in gt[pos:pos+lens[i]])
 63 |             pos += lens[i]
 64 |             pred_pb.append(out[i])
 65 |             
 66 |             if gts == out[i]:
 67 |                 tp += 1.0
 68 |             else:
 69 |                 avg_ed += editdistance.eval(out[i], gts)
 70 |             count += 1.0
 71 |         if not visualize:
 72 |             iterator.set_description("acc: {0:.4f}; avg_ed: {0:.4f}".format(tp / count, avg_ed / count))
 73 | 
 74 |     acc = tp / count
 75 |     avg_ed = avg_ed / count
 76 |     return acc, avg_ed, pred_pb
 77 | 
 78 | 
 79 | def test_tta(net, data, abc, cuda, visualize, batch_size=256):
 80 |     pred_pb_tta = None
 81 |     
 82 |     for _ in range(10):
 83 |         data_loader = DataLoader(data, batch_size=batch_size, num_workers=4, shuffle=False, collate_fn=text_collate)
 84 |         iterator = tqdm(data_loader)
 85 |         
 86 |         pred_pb = []
 87 |         for sample in iterator:
 88 |             imgs = Variable(sample["img"])
 89 |             if cuda:
 90 |                 imgs = imgs.cuda()
 91 |             out = net(imgs, decode=False)
 92 |             out = out.permute(1, 0, 2).cpu().data.numpy()
 93 |             
 94 |             pred_pb.append(out)
 95 |         
 96 |         if pred_pb_tta is None:
 97 |             pred_pb_tta = np.concatenate(pred_pb)
 98 |         else:
 99 |             pred_pb_tta += np.concatenate(pred_pb)
100 |     return 0, 0, decode(pred_pb_tta)
101 | 
@click.command()
@click.option('--data-path', type=str, default=None, help='Path to dataset')
@click.option('--abc', type=str, default=string.digits+string.ascii_uppercase, help='Alphabet')
@click.option('--seq-proj', type=str, default="10x20", help='Projection of sequence')
@click.option('--backend', type=str, default="resnet34", help='Backend network')
@click.option('--snapshot', type=str, default=None, help='Pre-trained weights')
@click.option('--input-size', type=str, default="320x32", help='Input size')
@click.option('--gpu', type=str, default='0', help='List of GPUs for parallel training, e.g. 0,1,2,3')
@click.option('--visualize', type=bool, default=False, help='Visualize output')
def main(data_path, abc, seq_proj, backend, snapshot, input_size, gpu, visualize):
    """Predict the "pb" split with test-time augmentation and write a CSV."""
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu
    # Compare by value: `is not ''` tests object identity, which only works by
    # accident of string interning and raises a SyntaxWarning on Python 3.8+.
    cuda = gpu != ''

    input_size = [int(x) for x in input_size.split('x')]
    # Augmentations are applied at inference time so that the repeated passes
    # in `test_tta` go through this pipeline each time.
    transform = Compose([
        Rotation(),
        Translation(),
        # Scale(),
        Contrast(),
        Grid_distortion(),
        Resize(size=(input_size[0], input_size[1]))
    ])
    if data_path is not None:
        data = TextDataset(data_path=data_path, mode="pb", transform=transform)
    else:
        data = TestDataset(transform=transform, abc=abc)
    seq_proj = [int(x) for x in seq_proj.split('x')]
    net = load_model(data.get_abc(), seq_proj, backend, snapshot, cuda).eval()
    # `test_tta` returns placeholder zeros for the two metrics.
    acc, avg_ed, pred_pb = test_tta(net, data, data.get_abc(), cuda, visualize)

    df_submit = pd.DataFrame()
    # NOTE(review): names come from directory-listing order while predictions
    # follow dataset order; confirm the two orders actually match.
    df_submit['name'] = [x.split('/')[-1] for x in glob.glob('../../input/public_test_data/*')]
    df_submit['label'] = pred_pb

    df_submit.to_csv('tmp_rcnn_tta10.csv', index=None)
    print("Accuracy: {}".format(acc))
    print("Edit distance: {}".format(avg_ed))

if __name__ == '__main__':
    main()
142 | 


--------------------------------------------------------------------------------
/solutions/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/test2_tta.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import cv2, glob, json
  3 | import string
  4 | from tqdm import tqdm
  5 | import click
  6 | import numpy as np
  7 | import pandas as pd
  8 | import random
  9 | 
 10 | from collections import Counter
 11 | from sklearn.externals import joblib
 12 | 
 13 | import torch
 14 | from torch.autograd import Variable
 15 | from torch.utils.data import DataLoader
 16 | 
 17 | from dataset.test_data import TestDataset
 18 | from dataset.text_data import TextDataset
 19 | from dataset.collate_fn import text_collate
 20 | from dataset.data_transform import Resize, Rotation, Translation, Scale, Contrast, Snow, Grid_distortion
 21 | from models.model_loader import load_model
 22 | from torchvision.transforms import Compose
 23 | 
 24 | import editdistance
 25 | 
 26 | def pred_to_string(pred):
 27 |     seq = []
 28 |     for i in range(pred.shape[0]):
 29 |         label = np.argmax(pred[i])
 30 |         seq.append(label - 1)
 31 |     out = []
 32 |     for i in range(len(seq)):
 33 |         if len(out) == 0:
 34 |             if seq[i] != -1:
 35 |                 out.append(seq[i])
 36 |         else:
 37 |             if seq[i] != -1 and seq[i] != seq[i - 1]:
 38 |                 out.append(seq[i])
 39 |     out = ''.join('0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'[i] for i in out)
 40 |     return out
 41 | 
 42 | def decode(pred):
 43 |     seq = []
 44 |     for i in range(pred.shape[0]):
 45 |         seq.append(pred_to_string(pred[i]))
 46 |     return seq
 47 | 
 48 | def test(net, data, abc, cuda, visualize, batch_size=256):
 49 |     data_loader = DataLoader(data, batch_size=batch_size, num_workers=4, shuffle=False, collate_fn=text_collate)
 50 | 
 51 |     count = 0.0
 52 |     tp = 0.0
 53 |     avg_ed = 0.0
 54 |     pred_pb = []
 55 |     iterator = tqdm(data_loader)
 56 |     for sample in iterator:
 57 |         imgs = Variable(sample["img"])
 58 |         if cuda:
 59 |             imgs = imgs.cuda()
 60 |         out = net(imgs, decode=True)
 61 |         gt = (sample["seq"].numpy() - 1).tolist()
 62 |         lens = sample["seq_len"].numpy().tolist()
 63 |         pos = 0
 64 |         key = ''
 65 |         for i in range(len(out)):
 66 |             gts = ''.join(abc[c] for c in gt[pos:pos+lens[i]])
 67 |             pos += lens[i]
 68 |             pred_pb.append(out[i])
 69 |             
 70 |             if gts == out[i]:
 71 |                 tp += 1.0
 72 |             else:
 73 |                 avg_ed += editdistance.eval(out[i], gts)
 74 |             count += 1.0
 75 |         if not visualize:
 76 |             iterator.set_description("acc: {0:.4f}; avg_ed: {0:.4f}".format(tp / count, avg_ed / count))
 77 | 
 78 |     acc = tp / count
 79 |     avg_ed = avg_ed / count
 80 |     return acc, avg_ed, pred_pb
 81 | 
 82 | 
 83 | def test_tta(net, data, abc, cuda, visualize, batch_size=128):
 84 |     pred_pb_tta = None
 85 |     
 86 |     for _ in range(7):
 87 |         data_loader = DataLoader(data, batch_size=batch_size, num_workers=10, shuffle=False, collate_fn=text_collate)
 88 |         iterator = tqdm(data_loader)
 89 |         
 90 |         pred_pb = []
 91 |         for sample in iterator:
 92 |             imgs = Variable(sample["img"])
 93 |             if cuda:
 94 |                 imgs = imgs.cuda()
 95 |             out = net(imgs, decode=False)
 96 |             out = out.permute(1, 0, 2).cpu().data.numpy()
 97 |             
 98 |             pred_pb.append(out)
 99 |         
100 |         if pred_pb_tta is None:
101 |             pred_pb_tta = np.concatenate(pred_pb)
102 |         else:
103 |             pred_pb_tta += np.concatenate(pred_pb)
104 |     return 0, 0, decode(pred_pb_tta)
105 | 
def _vote_serial(fold_preds, sample_idx, n_chars=10):
    """Majority-vote one sample's string, position by position, across folds."""
    candidates = [preds[sample_idx] for preds in fold_preds]
    voted = []
    for char_idx in range(n_chars):
        # Folds whose prediction is too short simply do not vote here.
        votes = [text[char_idx] for text in candidates if len(text) > char_idx]
        # '*' marks a position for which no fold produced a character.
        voted.append(Counter(votes).most_common(1)[0][0] if votes else '*')
    return ''.join(voted)


@click.command()
@click.option('--data-path', type=str, default=None, help='Path to dataset')
@click.option('--abc', type=str, default='0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ', help='Alphabet')
@click.option('--seq-proj', type=str, default="20x40", help='Projection of sequence')
@click.option('--snapshot', type=str, default=None, help='Pre-trained weights')
@click.option('--backend', type=str, default="resnet18", help='Backend network')
@click.option('--input-size', type=str, default="320x80", help='Input size')
@click.option('--gpu', type=str, default='1', help='List of GPUs for parallel training, e.g. 0,1,2,3')
@click.option('--visualize', type=bool, default=False, help='Visualize output')

def main(data_path, abc, seq_proj, backend, snapshot, input_size, gpu, visualize):
    """Ensemble every fold snapshot with TTA and write the voted predictions."""
    # Honour --gpu (default '1') instead of a hard-coded device id.
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu
    # Compare by value: `is not ''` tests object identity, which only works by
    # accident of string interning and raises a SyntaxWarning on Python 3.8+.
    cuda = gpu != ''

    input_size = [int(x) for x in input_size.split('x')]
    seq_proj = [int(x) for x in seq_proj.split('x')]

    # NOTE: the --snapshot option is accepted but not used; the snapshots are
    # always discovered from the two directories below.
    snapshot_paths = glob.glob('./tmp/fold*_best') + glob.glob('./tmp2/fold*_best')
    print(snapshot_paths)
    if not snapshot_paths:
        # Without this check the run failed later with a bare IndexError.
        raise click.ClickException(
            "No snapshots matching ./tmp/fold*_best or ./tmp2/fold*_best were found")

    fold_pred_pb_tta = []
    for snapshot_path in snapshot_paths:
        # The former random if/else built the same pipeline in both branches.
        transform = Compose([
            # Rotation(),
            Translation(),
            # Scale(),
            Contrast(),
            # Grid_distortion(),
            Resize(size=(input_size[0], input_size[1]))
        ])

        if data_path is not None:
            data = TextDataset(data_path=data_path, mode="pb", transform=transform)
        else:
            data = TestDataset(transform=transform, abc=abc)
        print(snapshot_path)

        net = load_model(data.get_abc(), seq_proj, backend, snapshot_path, cuda).eval()
        # `test_tta` returns placeholder zeros for the two metrics.
        acc, avg_ed, pred_pb = test_tta(net, data, data.get_abc(), cuda, visualize)
        fold_pred_pb_tta.append(pred_pb)

    with open('../data/desc.json') as up:
        data_json = json.load(up)

    if len(fold_pred_pb_tta) > 1:
        labels = [_vote_serial(fold_pred_pb_tta, test_idx)
                  for test_idx in range(len(fold_pred_pb_tta[0]))]
        # Keep the raw per-fold predictions for later inspection.
        joblib.dump(fold_pred_pb_tta, 'fold_tta.pkl')
    else:
        # A single snapshot needs no voting.
        labels = fold_pred_pb_tta[0]

    df_submit = pd.DataFrame()
    df_submit['name'] = [x['name'] for x in data_json['pb']]
    df_submit['label'] = labels

    df_submit.to_csv('tmp_rcnn_tta10_pb.csv', index=None)
    print("Accuracy: {}".format(acc))
    print("Edit distance: {}".format(avg_ed))

# python test2_tta.py --snapshot tmp/crnn_resnet18_0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_best --visualize False --data-path ../data/
if __name__ == '__main__':
    main()
205 | 


--------------------------------------------------------------------------------
/solutions/TinyMind人民币面值&冠字号编码识别挑战赛/task2/crnn-pytorch/train.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import click
  3 | import string
  4 | import numpy as np
  5 | from tqdm import tqdm
  6 | from models.model_loader import load_model
  7 | from torchvision.transforms import Compose
  8 | from dataset.data_transform import Resize, Rotation, Translation, Scale, Contrast, Snow, Grid_distortion
  9 | from dataset.test_data import TestDataset
 10 | from dataset.text_data import TextDataset
 11 | from dataset.collate_fn import text_collate
 12 | from lr_policy import StepLR
 13 | 
 14 | import torch
 15 | from torch import nn
 16 | from torch import optim
 17 | from torch.autograd import Variable
 18 | from torch import Tensor
 19 | from torch.utils.data import DataLoader
 20 | from warpctc_pytorch import CTCLoss
 21 | 
 22 | from test import test
 23 | 
 24 | import logging
 25 | logging.basicConfig(level=logging.DEBUG, filename='example.log',
 26 |                     format='%(asctime)s - %(filename)s[line:%(lineno)d]: %(message)s')  # 
 27 | 
 28 | 
 29 | @click.command()
 30 | @click.option('--data-path', type=str, default=None, help='Path to dataset')
 31 | @click.option('--abc', type=str, default='0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ', help='Alphabet')
 32 | @click.option('--seq-proj', type=str, default="20x40", help='Projection of sequence')
 33 | @click.option('--backend', type=str, default="resnet18", help='Backend network')
 34 | @click.option('--snapshot', type=str, default=None, help='Pre-trained weights')
 35 | @click.option('--input-size', type=str, default="320x80", help='Input size')
 36 | @click.option('--base-lr', type=float, default=1*1e-3, help='Base learning rate')
 37 | @click.option('--step-size', type=int, default=1500, help='Step size')
 38 | @click.option('--max-iter', type=int, default=6000, help='Max iterations')
 39 | @click.option('--batch-size', type=int, default=100, help='Batch size')
 40 | @click.option('--output-dir', type=str, default=None, help='Path for snapshot')
 41 | @click.option('--test-epoch', type=int, default=1, help='Test epoch')
 42 | @click.option('--test-init', type=bool, default=False, help='Test initialization')
 43 | @click.option('--gpu', type=str, default='0', help='List of GPUs for parallel training, e.g. 0,1,2,3')
 44 | 
 45 | def main(data_path, abc, seq_proj, backend, snapshot, input_size, base_lr, step_size, max_iter, batch_size, output_dir, test_epoch, test_init, gpu):
 46 |     os.environ["CUDA_VISIBLE_DEVICES"] = gpu
 47 |     cuda = True if gpu is not '' else False
 48 | 
 49 |     input_size = [int(x) for x in input_size.split('x')]
 50 |     transform = Compose([
 51 |         Rotation(),
 52 |         Translation(),
 53 |         # Scale(),
 54 |         Contrast(),
 55 |         # Grid_distortion(),
 56 |         Resize(size=(input_size[0], input_size[1]))
 57 |     ])
 58 |     seq_proj = [int(x) for x in seq_proj.split('x')]
 59 |     
 60 |     for fold_idx in range(24):
 61 |         train_mode = 'fold{0}_train'.format(fold_idx)
 62 |         val_mode = 'fold{0}_test'.format(fold_idx)
 63 |         
 64 |         if data_path is not None:
 65 |             data = TextDataset(data_path=data_path, mode=train_mode, transform=transform)
 66 |         else:
 67 |             data = TestDataset(transform=transform, abc=abc)
 68 |         
 69 |         net = load_model(data.get_abc(), seq_proj, backend, snapshot, cuda)
 70 |         optimizer = optim.Adam(net.parameters(), lr = base_lr, weight_decay=0.0001)
 71 |         lr_scheduler = StepLR(optimizer, step_size=step_size)
 72 |         # lr_scheduler = StepLR(optimizer, step_size=len(data)/batch_size*2)
 73 |         loss_function = CTCLoss()
 74 |         
 75 |         print(fold_idx)
 76 |         # continue
 77 |         
 78 |         acc_best = 0
 79 |         epoch_count = 0
 80 |         for epoch_idx in range(15):
 81 |             data_loader = DataLoader(data, batch_size=batch_size, num_workers=10, shuffle=True, collate_fn=text_collate)
 82 |             loss_mean = []
 83 |             iterator = tqdm(data_loader)
 84 |             iter_count = 0
 85 |             for sample in iterator:
 86 |                 # for multi-gpu support
 87 |                 if sample["img"].size(0) % len(gpu.split(',')) != 0:
 88 |                     continue
 89 |                 optimizer.zero_grad()
 90 |                 imgs = Variable(sample["img"])
 91 |                 labels = Variable(sample["seq"]).view(-1)
 92 |                 label_lens = Variable(sample["seq_len"].int())
 93 |                 if cuda:
 94 |                     imgs = imgs.cuda()
 95 |                 preds = net(imgs).cpu()
 96 |                 pred_lens = Variable(Tensor([preds.size(0)] * batch_size).int())
 97 |                 loss = loss_function(preds, labels, pred_lens, label_lens) / batch_size
 98 |                 loss.backward()
 99 |                 # nn.utils.clip_grad_norm(net.parameters(), 10.0)
100 |                 loss_mean.append(loss.data[0])
101 |                 status = "{}/{}; lr: {}; loss_mean: {}; loss: {}".format(epoch_count, lr_scheduler.last_iter, lr_scheduler.get_lr(), np.mean(loss_mean), loss.data[0])
102 |                 iterator.set_description(status)
103 |                 optimizer.step()
104 |                 lr_scheduler.step()
105 |                 iter_count += 1
106 |             
107 |             if True:
108 |                 logging.info("Test phase")
109 |                 
110 |                 net = net.eval()
111 |                 
112 | #                 train_acc, train_avg_ed, error_idx = test(net, data, data.get_abc(), cuda, visualize=False)
113 | #                 if acc > 0.95:
114 | #                     error_name = [data.config[data.mode][idx]["name"] for idx in error_idx]
115 | #                     logging.info('Train: '+','.join(error_name))
116 | #                 logging.info("acc: {}\tacc_best: {}; avg_ed: {}\n\n".format(train_acc, train_avg_ed))
117 | 
118 |                 data.set_mode(val_mode)
119 |                 acc, avg_ed, error_idx = test(net, data, data.get_abc(), cuda, visualize=False)
120 |                 
121 |                 if acc > 0.95:
122 |                     error_name = [data.config[data.mode][idx]["name"] for idx in error_idx]
123 |                     logging.info('Val: '+','.join(error_name))
124 |                 
125 |                 
126 |                 
127 |                 net = net.train()
128 |                 data.set_mode(train_mode)
129 |                 
130 |                 if acc > acc_best:
131 |                     if output_dir is not None:
132 |                         torch.save(net.state_dict(), os.path.join(output_dir, train_mode+"_crnn_" + backend + "_" + str(data.get_abc()) + "_best"))
133 |                     acc_best = acc
134 |                 
135 |                 if acc > 0.985:
136 |                     if output_dir is not None:
137 |                         torch.save(net.state_dict(), os.path.join(output_dir, train_mode+"_crnn_" + backend + "_" + str(data.get_abc()) + "_"+str(acc)))
138 |                 logging.info("train_acc: {}\t; avg_ed: {}\n\n".format(acc, acc_best, avg_ed))
139 |                 
140 |                 
141 |             epoch_count += 1
142 | 
143 | # python train.py --test-init True --test-epoch 10 --output-dir tmp --data-path ../data/
144 | # python test2.py --snapshot tmp/crnn_resnet18_0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_best --visualize False --data-path ../data/
145 | if __name__ == '__main__':
146 |     main()
147 | 


--------------------------------------------------------------------------------
/solutions/TinyMind人民币面值&冠字号编码识别挑战赛/task2/data/data.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DueapeCommon/kaggle/78009876853d2536be895097c289ac35c748beba/solutions/TinyMind人民币面值&冠字号编码识别挑战赛/task2/data/data.json


--------------------------------------------------------------------------------
/solutions/TinyMind人民币面值&冠字号编码识别挑战赛/task2/multi-digit-pytorch/1_train.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import os, sys, glob, argparse, json
  3 | import pandas as pd
  4 | import numpy as np
  5 | from tqdm import tqdm, tqdm_notebook
  6 | # import pretrainedmodels
  7 | 
  8 | import time, datetime
  9 | import pdb, traceback
 10 | 
 11 | import cv2
 12 | # import imagehash
 13 | from PIL import Image
 14 | 
 15 | from sklearn.model_selection import train_test_split, StratifiedKFold
 16 | 
 17 | import torch
 18 | torch.manual_seed(0)
 19 | torch.backends.cudnn.deterministic = False
 20 | torch.backends.cudnn.benchmark = True
 21 | 
 22 | import torchvision.models as models
 23 | import torchvision.transforms as transforms
 24 | import torchvision.datasets as datasets
 25 | import torch.nn as nn
 26 | import torch.nn.functional as F
 27 | import torch.optim as optim
 28 | from torch.autograd import Variable
 29 | from torch.utils.data.dataset import Dataset
 30 | 
 31 | from albumentations import (
 32 |     HorizontalFlip, IAAPerspective, ShiftScaleRotate, CLAHE, RandomRotate90, Resize, Normalize,
 33 |     Transpose, ShiftScaleRotate, Blur, OpticalDistortion, GridDistortion, HueSaturationValue,
 34 |     IAAAdditiveGaussianNoise, GaussNoise, MotionBlur, MedianBlur, RandomBrightnessContrast, IAAPiecewiseAffine,
 35 |     IAASharpen, IAAEmboss, Flip, OneOf, Compose, ElasticTransform
 36 | )
 37 | from albumentations.pytorch import ToTensor
 38 | 
 39 | import logging
 40 | logging.basicConfig(level=logging.DEBUG, filename='example.log',
 41 |                     format='%(asctime)s - %(filename)s[line:%(lineno)d]: %(message)s')  # 
 42 | 
 43 | class QRDataset(Dataset):
 44 |     def __init__(self, img_json, transform=None):
 45 |         self.img_json = img_json
 46 |         
 47 |         if transform is not None:
 48 |             self.transform = transform
 49 |         else:
 50 |             self.transform = None
 51 |     
 52 |     def __getitem__(self, index):
 53 |         start_time = time.time()
 54 |         
 55 |         img = cv2.imread(os.path.join('../data/data/', self.img_json[index]['name']))        
 56 |         img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
 57 |         if self.transform:
 58 |             augmented = self.transform(image=img)
 59 |             img = augmented['image']
 60 |         
 61 |         img_label_idx = self.img_json[index]['text'].strip()
 62 |         label0 = np.array(self.char2idx(img_label_idx[0]))
 63 |         label1 = np.array(self.char2idx(img_label_idx[1]))
 64 |         label2 = np.array(self.char2idx(img_label_idx[2]))
 65 |         label3 = np.array(self.char2idx(img_label_idx[3]))
 66 |         label4 = np.array(self.char2idx(img_label_idx[4]))
 67 |         label5 = np.array(self.char2idx(img_label_idx[5]))
 68 |         label6 = np.array(self.char2idx(img_label_idx[6]))
 69 |         label7 = np.array(self.char2idx(img_label_idx[7]))
 70 |         label8 = np.array(self.char2idx(img_label_idx[8]))
 71 |         label9 = np.array(self.char2idx(img_label_idx[9]))
 72 |         
 73 |         return img, torch.from_numpy(label0), torch.from_numpy(label1), \
 74 |                 torch.from_numpy(label2), torch.from_numpy(label3), torch.from_numpy(label4),\
 75 |                 torch.from_numpy(label5), torch.from_numpy(label6), torch.from_numpy(label7),\
 76 |                 torch.from_numpy(label8), torch.from_numpy(label9)
 77 |     
 78 |     def __len__(self):
 79 |         return len(self.img_json)
 80 |     
 81 |     def char2idx(self, ch):
 82 |         return '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'.find(ch)
 83 |         
 84 | class RMB_Net(nn.Module):
 85 |     def __init__(self):
 86 |         super(RMB_Net, self).__init__()
 87 |         
 88 |         feat_size = 512
 89 |         self.fc0 = nn.Linear(feat_size, 36)
 90 |         self.fc1 = nn.Linear(feat_size, 36)
 91 |         self.fc2 = nn.Linear(feat_size, 36)
 92 |         self.fc3 = nn.Linear(feat_size, 36)
 93 |         self.fc4 = nn.Linear(feat_size, 36)
 94 |         self.fc5 = nn.Linear(feat_size, 36)
 95 |         self.fc6 = nn.Linear(feat_size, 36)
 96 |         self.fc7 = nn.Linear(feat_size, 36)
 97 |         self.fc8 = nn.Linear(feat_size, 36)
 98 |         self.fc9 = nn.Linear(feat_size, 36)
 99 |         
100 |         model = models.resnet18(True)
101 |         model = torch.nn.Sequential(*(list(model.children())[:-1]))
102 |         self.resnet = model
103 |         
104 | #         model_name = 'se_resnet50' # could be fbresnet152 or inceptionresnetv2
105 | #         model = pretrainedmodels.__dict__[model_name](num_classes=1000, pretrained='imagenet')
106 | #         model.avg_pool = nn.AdaptiveAvgPool2d(output_size=(1, 1))
107 | #         model = torch.nn.Sequential(*(list(model.children())[:-1]))
108 |         
109 |         self.resnet = model
110 |         
111 |     def forward(self, img):
112 |         feat = self.resnet(img)
113 |         feat = feat.reshape(feat.size(0), -1)
114 |         
115 |         out0 = self.fc0(feat)
116 |         out1 = self.fc1(feat)
117 |         out2 = self.fc2(feat)
118 |         out3 = self.fc3(feat)
119 |         out4 = self.fc4(feat)
120 |         out5 = self.fc5(feat)
121 |         out6 = self.fc6(feat)
122 |         out7 = self.fc7(feat)
123 |         out8 = self.fc8(feat)
124 |         out9 = self.fc9(feat)
125 |         
126 |         return F.log_softmax(out0, dim=1), F.log_softmax(out1, dim=1), F.log_softmax(out2, dim=1), \
127 |                 F.log_softmax(out3, dim=1), F.log_softmax(out4, dim=1), F.log_softmax(out5, dim=1), \
128 |                 F.log_softmax(out6, dim=1), F.log_softmax(out7, dim=1), F.log_softmax(out8, dim=1), \
129 |                  F.log_softmax(out9, dim=1)
130 |                  
def accuracy(outputs, targets):
    """Score multi-head predictions at whole-sample granularity.

    Args:
        outputs: sequence of per-head score tensors; the top-1 class along
            dim 1 is taken as each head's prediction.
        targets: sequence of per-head target tensors, aligned with `outputs`.

    Returns:
        ``(acc, error_mask)``: the fraction of samples for which every head
        is correct, and a boolean array that is True for each sample with at
        least one wrong head.
    """
    with torch.no_grad():
        # Kept from the original: fails fast (IndexError) if no heads are given.
        _ = outputs[0].size(0)

        predicted_rows = []
        for head_scores in outputs:
            top_idx = head_scores.topk(1, 1, True, True)[1]
            predicted_rows.append(top_idx.t().flatten().data.cpu().numpy())
        target_rows = [head_target.data.cpu().numpy() for head_target in targets]

        # One row per head, one column per sample.
        hits = np.vstack(target_rows) == np.vstack(predicted_rows)
        all_heads_right = hits.all(axis=0)
        return all_heads_right.mean(), ~all_heads_right
146 |     
def train(train_loader, model, criterion, optimizer, epoch):
    """Run one training epoch and return the mean batch loss.

    Args:
        train_loader: iterable yielding an image batch followed by ten
            per-position target batches.
        model: network returning ten outputs, one per label position.
        criterion: loss applied to each (output, target) pair.
        optimizer: optimizer stepped once per batch.
        epoch: unused here; kept for the caller's signature.

    Returns:
        The mean of the per-batch losses as computed by ``np.mean``.
    """
    model.train()

    batch_losses = []
    for batch in tqdm(train_loader):
        optimizer.zero_grad()

        # First element is the image batch; the rest are the targets.
        images = batch[0].cuda(non_blocking=True)
        labels = [target.cuda(non_blocking=True) for target in batch[1:]]

        # compute output
        predictions = model(images)
        head_losses = [criterion(pred, target) for pred, target in zip(predictions, labels)]

        # Average of the ten per-position losses.
        loss = sum(head_losses) / 10.0

        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())

    return np.mean(batch_losses)
197 | 
def validate(val_loader, model, criterion):
    """Evaluate `model` on `val_loader` and report the misclassified samples.

    Each batch is expected to be ``(image, target0, ..., target9)`` and the
    model is expected to return ten outputs, one per character position.

    Returns:
        (mean of the per-batch accuracies, mean of the per-batch losses).

    Side effects:
        Prints the positions flagged as errors, their file names (looked up in
        ``val_loader.dataset.img_json``) and the averaged confidence.
    """
    model.eval()

    batch_accs, batch_losses = [], []
    error_flags, confidences = [], []
    with torch.no_grad():
        for images, *targets in val_loader:
            images = images.cuda(non_blocking=True)
            targets = [t.cuda(non_blocking=True) for t in targets]

            # compute output: one prediction per character position
            outputs = list(model(images))
            head_losses = [criterion(out, tgt) for out, tgt in zip(outputs, targets)]
            loss = sum(head_losses) / 10.0

            # measure accuracy and record loss; `accuracy` is defined elsewhere
            # in this file and also returns a per-sample error indicator.
            acc, error_idx = accuracy(outputs, targets)

            # Outputs are exponentiated, so the per-head maximum is treated as
            # a probability and averaged over the ten heads.
            output_prob = None
            for out in outputs:
                head_conf = np.exp(out.max(1)[0].data.cpu().numpy())
                if output_prob is None:
                    output_prob = head_conf
                else:
                    output_prob += head_conf
            output_prob /= 10

            batch_accs.append(acc)
            batch_losses.append(loss.item())
            error_flags.extend(list(error_idx))
            confidences.extend(list(output_prob))

        wrong_positions = np.where(error_flags)[0]
        print(wrong_positions, np.mean(error_flags))

        # NOTE: the lookup below indexes the dataset by position, so it is only
        # meaningful when the loader iterates the dataset in its stored order.
        names = []
        for idx in wrong_positions:
            sample_name = val_loader.dataset.img_json[idx]['name']
            print(sample_name, confidences[idx])
            names.append(sample_name)

        print(','.join(names))
        return np.mean(batch_accs), np.mean(batch_losses)
261 |         
def main():
    """Train one RMB_Net per cross-validation fold and keep each fold's best weights.

    Reads the fold definitions from ``../data/desc.json`` (keys
    ``fold{k}_train`` / ``fold{k}_test`` for k in 0..14), trains for 10 epochs
    per fold and writes the best-validation-accuracy checkpoint to
    ``tmp/fold{k}_train_best.pt``.
    """
    with open('../data/desc.json') as up:
        data_json = json.load(up)

    # torch.save does not create missing directories.
    os.makedirs('tmp', exist_ok=True)

    for fold_idx in range(15):
        train_mode = 'fold{0}_train'.format(fold_idx)
        val_mode = 'fold{0}_test'.format(fold_idx)

        train_loader = torch.utils.data.DataLoader(
            QRDataset(data_json[train_mode],
                    Compose([
                                # transforms.RandomAffine(5),
                                # transforms.ColorJitter(hue=.05, saturation=.05),
                                Resize(80, 320),
                                # GridDistortion(p=.5, distort_limit=0.15,num_steps=5),
                                RandomBrightnessContrast(),
                                ElasticTransform(alpha=0.1, sigma=5, alpha_affine=2,),
                                Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
                                ToTensor(),
                ])
            ), batch_size=100, shuffle=True, num_workers=20, pin_memory=True
        )

        # shuffle must stay False: validate() maps the positions of wrong
        # predictions back to val_loader.dataset.img_json by index, which is
        # only correct when batches arrive in dataset order.
        val_loader = torch.utils.data.DataLoader(
            QRDataset(data_json[val_mode],
                    Compose([
                                Resize(80, 320),
                                Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
                                ToTensor(),
                ])
            ), batch_size=70, shuffle=False, num_workers=20, pin_memory=True
        )

        model = RMB_Net()
        model = model.cuda()
        # model = nn.DataParallel(model).cuda()
        criterion = nn.CrossEntropyLoss().cuda()
        optimizer = torch.optim.Adam(model.parameters(), 0.001)
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.85)

        best_val_acc = 0.0
        for epoch_idx in range(10):
            train_loss = train(train_loader, model, criterion, optimizer, epoch_idx)
            val_acc, val_loss = validate(val_loader, model, criterion)
            scheduler.step()

            if val_acc > best_val_acc:
                best_val_acc = val_acc
                torch.save(model.state_dict(), os.path.join('tmp', train_mode+'_best.pt'))

            summary = '{0}: Train_{1}, Val_{2}/{3}, best_{4}'.format(
                epoch_idx, train_loss, val_loss, val_acc, best_val_acc)
            print(summary)
            logging.info(summary)
        # break


if __name__ == "__main__":
    main()


--------------------------------------------------------------------------------
/solutions/TinyMind人民币面值&冠字号编码识别挑战赛/task2/multi-digit-pytorch/2_predict.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import os, sys, glob, argparse, json
  3 | import pandas as pd
  4 | import numpy as np
  5 | from tqdm import tqdm, tqdm_notebook
  6 | # import pretrainedmodels
  7 | import time, datetime
  8 | import pdb, traceback
  9 | 
 10 | import cv2
 11 | # import imagehash
 12 | from PIL import Image
 13 | 
 14 | from sklearn.model_selection import train_test_split, StratifiedKFold
 15 | 
 16 | import torch
 17 | torch.manual_seed(0)
 18 | torch.backends.cudnn.deterministic = False
 19 | torch.backends.cudnn.benchmark = True
 20 | 
 21 | import torchvision.models as models
 22 | import torchvision.transforms as transforms
 23 | import torchvision.datasets as datasets
 24 | import torch.nn as nn
 25 | import torch.nn.functional as F
 26 | import torch.optim as optim
 27 | from torch.autograd import Variable
 28 | from torch.utils.data.dataset import Dataset
 29 | 
 30 | from albumentations import (
 31 |     HorizontalFlip, IAAPerspective, ShiftScaleRotate, CLAHE, RandomRotate90, Resize, Normalize,
 32 |     Transpose, ShiftScaleRotate, Blur, OpticalDistortion, GridDistortion, HueSaturationValue,
 33 |     IAAAdditiveGaussianNoise, GaussNoise, MotionBlur, MedianBlur, RandomBrightnessContrast, IAAPiecewiseAffine,
 34 |     IAASharpen, IAAEmboss, Flip, OneOf, Compose, ElasticTransform
 35 | )
 36 | from albumentations.pytorch import ToTensor
 37 | 
 38 | import logging
 39 | # logging.basicConfig(level=logging.DEBUG, filename='example.log',
 40 | #                     format='%(asctime)s - %(filename)s[line:%(lineno)d]: %(message)s')  # 
 41 | 
 42 | class QRDataset(Dataset):
 43 |     def __init__(self, img_json, transform=None):
 44 |         self.img_json = img_json
 45 |         
 46 |         if transform is not None:
 47 |             self.transform = transform
 48 |         else:
 49 |             self.transform = None
 50 |     
 51 |     def __getitem__(self, index):
 52 |         start_time = time.time()
 53 |         
 54 |         img = cv2.imread(os.path.join('../data/data/', self.img_json[index]['name']))        
 55 |         img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
 56 |         if self.transform:
 57 |             augmented = self.transform(image=img)
 58 |             img = augmented['image']
 59 |         
 60 |         img_label_idx = self.img_json[index]['text'].strip()
 61 |         label0 = np.array(self.char2idx(img_label_idx[0]))
 62 |         label1 = np.array(self.char2idx(img_label_idx[1]))
 63 |         label2 = np.array(self.char2idx(img_label_idx[2]))
 64 |         label3 = np.array(self.char2idx(img_label_idx[3]))
 65 |         label4 = np.array(self.char2idx(img_label_idx[4]))
 66 |         label5 = np.array(self.char2idx(img_label_idx[5]))
 67 |         label6 = np.array(self.char2idx(img_label_idx[6]))
 68 |         label7 = np.array(self.char2idx(img_label_idx[7]))
 69 |         label8 = np.array(self.char2idx(img_label_idx[8]))
 70 |         label9 = np.array(self.char2idx(img_label_idx[9]))
 71 |         
 72 |         return img, torch.from_numpy(label0), torch.from_numpy(label1), \
 73 |                 torch.from_numpy(label2), torch.from_numpy(label3), torch.from_numpy(label4),\
 74 |                 torch.from_numpy(label5), torch.from_numpy(label6), torch.from_numpy(label7),\
 75 |                 torch.from_numpy(label8), torch.from_numpy(label9)
 76 |     
 77 |     def __len__(self):
 78 |         return len(self.img_json)
 79 |     
 80 |     def char2idx(self, ch):
 81 |         return '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'.find(ch)
 82 |     
 83 | class RMB_Net(nn.Module):
 84 |     def __init__(self):
 85 |         super(RMB_Net, self).__init__()
 86 |         
 87 |         feat_size = 512
 88 |         self.fc0 = nn.Linear(feat_size, 36)
 89 |         self.fc1 = nn.Linear(feat_size, 36)
 90 |         self.fc2 = nn.Linear(feat_size, 36)
 91 |         self.fc3 = nn.Linear(feat_size, 36)
 92 |         self.fc4 = nn.Linear(feat_size, 36)
 93 |         self.fc5 = nn.Linear(feat_size, 36)
 94 |         self.fc6 = nn.Linear(feat_size, 36)
 95 |         self.fc7 = nn.Linear(feat_size, 36)
 96 |         self.fc8 = nn.Linear(feat_size, 36)
 97 |         self.fc9 = nn.Linear(feat_size, 36)
 98 |         
 99 |         model = models.resnet18(False)
100 |         model = torch.nn.Sequential(*(list(model.children())[:-1]))
101 |         self.resnet = model
102 |         
103 |     def forward(self, img):
104 |         feat = self.resnet(img)
105 |         feat = feat.reshape(feat.size(0), -1)
106 |         
107 |         out0 = self.fc0(feat)
108 |         out1 = self.fc1(feat)
109 |         out2 = self.fc2(feat)
110 |         out3 = self.fc3(feat)
111 |         out4 = self.fc4(feat)
112 |         out5 = self.fc5(feat)
113 |         out6 = self.fc6(feat)
114 |         out7 = self.fc7(feat)
115 |         out8 = self.fc8(feat)
116 |         out9 = self.fc9(feat)
117 |         
118 |         return F.log_softmax(out0, dim=1), F.log_softmax(out1, dim=1), F.log_softmax(out2, dim=1), \
119 |                 F.log_softmax(out3, dim=1), F.log_softmax(out4, dim=1), F.log_softmax(out5, dim=1), \
120 |                 F.log_softmax(out6, dim=1), F.log_softmax(out7, dim=1), F.log_softmax(out8, dim=1), \
121 |                  F.log_softmax(out9, dim=1)
122 | 
def predict(test_loader, model, tta=1):
    """Run `model` over `test_loader` `tta` times and average the outputs.

    Args:
        test_loader: iterable of batches whose first element is the image
            tensor; any further elements (e.g. targets) are ignored.
        model: module returning a sequence of per-position output tensors.
        tta: number of passes over the loader; passes only differ when the
            loader's transform is random.

    Returns:
        numpy array of shape (num_outputs, num_samples, num_classes) holding
        the model outputs averaged over the passes.

    Raises:
        ValueError: if `tta` is smaller than 1.
    """
    if tta < 1:
        raise ValueError('tta must be >= 1, got {0!r}'.format(tta))

    model.eval()

    # Feed inputs to wherever the model lives; fall back to CUDA (the previous
    # hard-coded behaviour) for a model without parameters.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cuda')

    predict_ttas = None
    with torch.no_grad():
        for _ in range(tta):
            pass_outputs = []
            for batch in test_loader:
                images = batch[0].to(device, non_blocking=True)

                # compute output: stack the heads into (num_outputs, batch, classes)
                outputs = model(images)
                pass_outputs.append(
                    np.array([out.detach().cpu().numpy() for out in outputs]))

            # Join the batches along the sample axis.
            predict_tta = np.concatenate(pass_outputs, 1)

            if predict_ttas is None:
                predict_ttas = predict_tta
            else:
                predict_ttas = predict_ttas + predict_tta
    return predict_ttas / tta
161 | 
def main():
    """Ensemble every ``tmp/*_best.pt`` checkpoint and write the submission CSV.

    The outputs of all checkpoints are summed per character position, the
    arg-max class of each position is mapped back to its character, and the
    resulting 10-character strings are written to ``tmp_rcnn_tta10_cnn.csv``
    alongside the image names from ``data_json['pb']``.

    Raises:
        FileNotFoundError: if no checkpoint matches ``tmp/*_best.pt``.
    """
    with open('../data/desc.json') as up:
        data_json = json.load(up)

    # NOTE: the test-time pipeline keeps the random augmentations
    # (RandomBrightnessContrast, ElasticTransform), so every pass over the
    # loader sees slightly different images.
    test_loader = torch.utils.data.DataLoader(
        QRDataset(data_json['pb'],
                Compose([
                                Resize(80, 320),
                                # GridDistortion(p=.5, distort_limit=0.15,num_steps=5),
                                RandomBrightnessContrast(),
                                ElasticTransform(alpha=0.1, sigma=5, alpha_affine=2,),
                                Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
                                ToTensor(),
            ])
        ), batch_size=70, shuffle=False, num_workers=20, pin_memory=True
    )

    # Sorted so the run order (and log output) does not depend on the
    # filesystem's listing order.
    model_paths = sorted(glob.glob('tmp/*_best.pt'))
    if not model_paths:
        raise FileNotFoundError("No checkpoints found matching 'tmp/*_best.pt'")

    test_tta = None
    for model_path in model_paths:
        print(model_path)

        model = RMB_Net()
        model = model.cuda()
        # model = nn.DataParallel(model).cuda()
        model.load_state_dict(torch.load(model_path))

        model_pred = predict(test_loader, model, 1)
        if test_tta is None:
            test_tta = model_pred
        else:
            test_tta = test_tta + model_pred

    charset = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    # test_tta is indexed as [character position, sample, class].
    best_class = np.argmax(test_tta, axis=2)
    submit_lbls = [
        ''.join(charset[cls] for cls in best_class[:, idx])
        for idx in range(best_class.shape[1])
    ]

    df = pd.DataFrame()
    df['name'] = [x['name'] for x in data_json['pb']]
    df['label'] = submit_lbls
    df.to_csv('tmp_rcnn_tta10_cnn.csv', index=False)


if __name__ == "__main__":
    main()
207 | 


--------------------------------------------------------------------------------
/solutions/kaggle-allstate-claims-severity/README.md:
--------------------------------------------------------------------------------
1 | https://www.kaggle.com/c/allstate-claims-severity/
2 | 


--------------------------------------------------------------------------------
/solutions/kaggle-allstate-claims-severity/XGB_encoding(LB1106.33084).py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | import xgboost as xgb
  4 | 
  5 | from datetime import datetime
  6 | from sklearn.metrics import mean_absolute_error
  7 | from sklearn.model_selection import KFold
  8 | from scipy.stats import skew, boxcox
  9 | from sklearn import preprocessing
 10 | from sklearn.preprocessing import StandardScaler
 11 | import itertools
 12 | 
 13 | shift = 200
 14 | COMB_FEATURE = 'cat80,cat87,cat57,cat12,cat79,cat10,cat7,cat89,cat2,cat72,' \
 15 |                'cat81,cat11,cat1,cat13,cat9,cat3,cat16,cat90,cat23,cat36,' \
 16 |                'cat73,cat103,cat40,cat28,cat111,cat6,cat76,cat50,cat5,' \
 17 |                'cat4,cat14,cat38,cat24,cat82,cat25'.split(',')
 18 | 
 19 | def encode(charcode):
 20 |     r = 0
 21 |     ln = len(str(charcode))
 22 |     for i in range(ln):
 23 |         r += (ord(str(charcode)[i]) - ord('A') + 1) * 26 ** (ln - i - 1)
 24 |     return r
 25 | 
 26 | fair_constant = 0.7
 27 | def fair_obj(preds, dtrain):
 28 |     labels = dtrain.get_label()
 29 |     x = (preds - labels)
 30 |     den = abs(x) + fair_constant
 31 |     grad = fair_constant * x / (den)
 32 |     hess = fair_constant * fair_constant / (den * den)
 33 |     return grad, hess
 34 | 
 35 | def xg_eval_mae(yhat, dtrain):
 36 |     y = dtrain.get_label()
 37 |     return 'mae', mean_absolute_error(np.exp(y)-shift,
 38 |                                       np.exp(yhat)-shift)
 39 | def mungeskewed(train, test, numeric_feats):
 40 |     ntrain = train.shape[0]
 41 |     test['loss'] = 0
 42 |     train_test = pd.concat((train, test)).reset_index(drop=True)
 43 |     skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna()))
 44 |     skewed_feats = skewed_feats[skewed_feats > 0.25]
 45 |     skewed_feats = skewed_feats.index
 46 | 
 47 |     for feats in skewed_feats:
 48 |         train_test[feats] = train_test[feats] + 1
 49 |         train_test[feats], lam = boxcox(train_test[feats])
 50 |     return train_test, ntrain
 51 | 
 52 | if __name__ == "__main__":
 53 | 
 54 |     print('\nStarted')
 55 |     directory = '../input/'
 56 |     train = pd.read_csv(directory + 'train.csv')
 57 |     test = pd.read_csv(directory + 'test.csv')
 58 |     
 59 |     # 20161203
 60 |     # drop_id = np.load('./drop_id.npy')
 61 |     # train = train[~train['id'].isin(drop_id)]
 62 |     # 20161201
 63 |     # train.drop(['cat62', 'cat63', 'cat64', 'cat55'], axis = 1, inplace = True)
 64 |     # test.drop(['cat62', 'cat63', 'cat64', 'cat55'], axis = 1, inplace = True)
 65 |     
 66 |     numeric_feats = [x for x in train.columns[1:-1] if 'cont' in x]
 67 |     categorical_feats = [x for x in train.columns[1:-1] if 'cat' in x]
 68 |     train_test, ntrain = mungeskewed(train, test, numeric_feats)
 69 |     
 70 |     # taken from Vladimir's script (https://www.kaggle.com/iglovikov/allstate-claims-severity/xgb-1114)
 71 |     for column in list(train.select_dtypes(include=['object']).columns):
 72 |         if train[column].nunique() != test[column].nunique():
 73 |             set_train = set(train[column].unique())
 74 |             set_test = set(test[column].unique())
 75 |             remove_train = set_train - set_test
 76 |             remove_test = set_test - set_train
 77 | 
 78 |             remove = remove_train.union(remove_test)
 79 | 
 80 |             def filter_cat(x):
 81 |                 if x in remove:
 82 |                     return np.nan
 83 |                 return x
 84 | 
 85 | 
 86 |             train_test[column] = train_test[column].apply(lambda x: filter_cat(x), 1)
 87 | 
 88 |     # taken from Ali's script (https://www.kaggle.com/aliajouz/allstate-claims-severity/singel-model-lb-1117)
 89 |     train_test["cont1"] = np.sqrt(preprocessing.minmax_scale(train_test["cont1"]))
 90 |     train_test["cont4"] = np.sqrt(preprocessing.minmax_scale(train_test["cont4"]))
 91 |     train_test["cont5"] = np.sqrt(preprocessing.minmax_scale(train_test["cont5"]))
 92 |     train_test["cont8"] = np.sqrt(preprocessing.minmax_scale(train_test["cont8"]))
 93 |     train_test["cont10"] = np.sqrt(preprocessing.minmax_scale(train_test["cont10"]))
 94 |     train_test["cont11"] = np.sqrt(preprocessing.minmax_scale(train_test["cont11"]))
 95 |     train_test["cont12"] = np.sqrt(preprocessing.minmax_scale(train_test["cont12"]))
 96 | 
 97 |     train_test["cont6"] = np.log(preprocessing.minmax_scale(train_test["cont6"]) + 0000.1)
 98 |     train_test["cont7"] = np.log(preprocessing.minmax_scale(train_test["cont7"]) + 0000.1)
 99 |     train_test["cont9"] = np.log(preprocessing.minmax_scale(train_test["cont9"]) + 0000.1)
100 |     train_test["cont13"] = np.log(preprocessing.minmax_scale(train_test["cont13"]) + 0000.1)
101 |     train_test["cont14"] = (np.maximum(train_test["cont14"] - 0.179722, 0) / 0.665122) ** 0.25
102 | 
103 |     print('')
104 |     for comb in itertools.combinations(COMB_FEATURE, 2):
105 |         feat = comb[0] + "_" + comb[1]
106 |         train_test[feat] = train_test[comb[0]] + train_test[comb[1]]
107 |         train_test[feat] = train_test[feat].apply(encode)
108 |         print('Combining Columns:', feat)
109 | 
110 |     print('')
111 |     for col in categorical_feats:
112 |         print('Analyzing Column:', col)
113 |         train_test[col] = train_test[col].apply(encode)
114 | 
115 |     print(train_test[categorical_feats])
116 | 
117 |     ss = StandardScaler()
118 |     train_test[numeric_feats] = \
119 |         ss.fit_transform(train_test[numeric_feats].values)
120 | 
121 |     train = train_test.iloc[:ntrain, :].copy()
122 |     test = train_test.iloc[ntrain:, :].copy()
123 | 
124 |     print('\nMedian Loss:', train.loss.median())
125 |     print('Mean Loss:', train.loss.mean())
126 | 
127 |     ids = pd.read_csv('../input/test.csv')['id']
128 |     train_y = np.log(train['loss'] + shift)
129 |     train_x = train.drop(['loss','id'], axis=1)
130 |     test_x = test.drop(['loss','id'], axis=1)
131 | 
132 |     n_folds = 10
133 |     cv_sum = 0
134 |     early_stopping = 200
135 |     fpred = []
136 |     xgb_rounds = []
137 | 
138 |     d_train_full = xgb.DMatrix(train_x, label=train_y)
139 |     d_test = xgb.DMatrix(test_x)
140 | 
141 |     kf = KFold(n_splits = 7)
142 |     for i, (train_index, test_index) in enumerate(kf.split(train_x)):
143 |         print('\n Fold %d' % (i+1))
144 |         X_train, X_val = train_x.iloc[train_index], train_x.iloc[test_index]
145 |         y_train, y_val = train_y.iloc[train_index], train_y.iloc[test_index]
146 | 
147 |         rand_state = 2016
148 | 
149 |         params = {
150 |             'seed': 0,
151 |             'colsample_bytree': 0.7,
152 |             'silent': 1,
153 |             'subsample': 0.7,
154 |             'learning_rate': 0.01,
155 |             'objective': 'reg:linear',
156 |             'max_depth': 20,
157 |             'min_child_weight': 100,
158 |             'booster': 'gbtree'}
159 | 
160 |         d_train = xgb.DMatrix(X_train, label=y_train)
161 |         d_valid = xgb.DMatrix(X_val, label=y_val)
162 |         watchlist = [(d_train, 'train'), (d_valid, 'eval')]
163 | 
164 |         clf = xgb.train(params,
165 |                         d_train,
166 |                         10000,
167 |                         watchlist,
168 |                         early_stopping_rounds = early_stopping,
169 |                         obj = fair_obj,
170 |                         feval=xg_eval_mae)
171 | 
172 |         xgb_rounds.append(clf.best_iteration)
173 |         scores_val = clf.predict(d_valid, ntree_limit=clf.best_ntree_limit)
174 |         cv_score = mean_absolute_error(np.exp(y_val), np.exp(scores_val))
175 |         print('eval-MAE: %.6f' % cv_score)
176 |         y_pred = np.exp(clf.predict(d_test, ntree_limit=clf.best_ntree_limit)) - shift
177 | 
178 |         if i > 0:
179 |             fpred = pred + y_pred
180 |         else:
181 |             fpred = y_pred
182 |         pred = fpred
183 |         cv_sum = cv_sum + cv_score
184 | 
185 |     mpred = pred / n_folds
186 |     score = cv_sum / n_folds
187 |     print('Average eval-MAE: %.6f' % score)
188 |     n_rounds = int(np.mean(xgb_rounds))
189 | 
190 |     print("Writing results")
191 |     result = pd.DataFrame(mpred, columns=['loss'])
192 |     result["id"] = ids
193 |     result = result.set_index("id")
194 |     print("%d-fold average prediction:" % n_folds)
195 | 
196 |     now = datetime.now()
197 |     score = str(round((cv_sum / n_folds), 6))
198 |     sub_file = './submission_5fold-average-xgb_fairobj_' + str(score) + '_' + str(
199 |         now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
200 |     print("Writing submission: %s" % sub_file)
201 |     result.to_csv(sub_file, index=True, index_label='id')
202 | 


--------------------------------------------------------------------------------
/solutions/kaggle-allstate-claims-severity/nn_bagging_1111.84364.py:
--------------------------------------------------------------------------------
  1 | 
  2 | ''' 
  3 | Author: Danijel Kivaranovic 
  4 | Title: Neural network (Keras) with sparse data
  5 | '''
  6 | 
  7 | ## import libraries
  8 | import numpy as np
  9 | np.random.seed(123)
 10 | 
 11 | import pandas as pd
 12 | import subprocess
 13 | from scipy.sparse import csr_matrix, hstack
 14 | from sklearn.metrics import mean_absolute_error
 15 | from sklearn.preprocessing import StandardScaler
 16 | from sklearn.model_selection import KFold
 17 | from keras.models import Sequential
 18 | from keras.layers import Dense, Dropout, Activation
 19 | from keras.layers.normalization import BatchNormalization
 20 | from keras.layers.advanced_activations import PReLU
 21 | 
 22 | ## Batch generators ##################################################################################################################################
 23 | 
 24 | def batch_generator(X, y, batch_size, shuffle):
 25 |     #chenglong code for fiting from generator (https://www.kaggle.com/c/talkingdata-mobile-user-demographics/forums/t/22567/neural-network-for-sparse-matrices)
 26 |     number_of_batches = np.ceil(X.shape[0]/batch_size)
 27 |     counter = 0
 28 |     sample_index = np.arange(X.shape[0])
 29 |     if shuffle:
 30 |         np.random.shuffle(sample_index)
 31 |     while True:
 32 |         batch_index = sample_index[batch_size*counter:batch_size*(counter+1)]
 33 |         X_batch = X[batch_index,:].toarray()
 34 |         y_batch = y[batch_index]
 35 |         counter += 1
 36 |         yield X_batch, y_batch
 37 |         if (counter == number_of_batches):
 38 |             if shuffle:
 39 |                 np.random.shuffle(sample_index)
 40 |             counter = 0
 41 | 
 42 | def batch_generatorp(X, batch_size, shuffle):
 43 |     number_of_batches = X.shape[0] / np.ceil(X.shape[0]/batch_size)
 44 |     counter = 0
 45 |     sample_index = np.arange(X.shape[0])
 46 |     while True:
 47 |         batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)]
 48 |         X_batch = X[batch_index, :].toarray()
 49 |         counter += 1
 50 |         yield X_batch
 51 |         if (counter == number_of_batches):
 52 |             counter = 0
 53 | 
 54 | ########################################################################################################################################################
 55 | 
## read data
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

# Shuffle the training rows in place (uses the numpy RNG seeded at the top of
# the file); the prints show the first ten indices before and after.
index = list(train.index)
print (index[0:10])
np.random.shuffle(index)
print (index[0:10])
train = train.iloc[index]
# The next line is a bare string literal, i.e. a no-op kept from an earlier
# variant of the shuffle.
'train = train.iloc[np.random.permutation(len(train))]'

## set test loss to NaN
test['loss'] = np.nan

## response and IDs
# The target is modelled as log(loss + 200); predictions are mapped back with
# exp(.) - 200 further down.
y = np.log(train['loss'].values+200)
id_train = train['id'].values
id_test = test['id'].values

## stack train test
# Stacking lets the dummy encoding and the scaler see train and test together.
ntrain = train.shape[0]
tr_te = pd.concat((train, test), axis = 0)

## Preprocessing and transforming to sparse data
sparse_data = []

# One-hot encode every column whose name contains 'cat'.
f_cat = [f for f in tr_te.columns if 'cat' in f]
for f in f_cat:
    dummy = pd.get_dummies(tr_te[f].astype('category'))
    tmp = csr_matrix(dummy)
    sparse_data.append(tmp)

# Standardise every column whose name contains 'cont'.
f_num = [f for f in tr_te.columns if 'cont' in f]
scaler = StandardScaler()
tmp = csr_matrix(scaler.fit_transform(tr_te[f_num]))
sparse_data.append(tmp)

# Drop the dense frames; only the sparse pieces are used from here on.
del(tr_te, train, test)

## sparse train and test data
# The first ntrain rows of the stacked matrix are the training rows.
xtr_te = hstack(sparse_data, format = 'csr')
xtrain = xtr_te[:ntrain, :]
xtest = xtr_te[ntrain:, :]

print('Dim train', xtrain.shape)
print('Dim test', xtest.shape)

del(xtr_te, sparse_data, tmp)
104 | 
105 | ## neural net
def nn_model():
    """Build and compile the feed-forward regression network.

    Three hidden blocks (Dense -> PReLU -> BatchNormalization -> Dropout) with
    400/200/50 units and dropout 0.4/0.2/0.2, followed by a single-unit output
    layer. The input width is taken from the module-level ``xtrain``. The model
    is compiled with MAE loss and the adadelta optimizer.
    """
    model = Sequential()

    # (units, dropout rate) for each hidden block, in order.
    hidden_blocks = [(400, 0.4), (200, 0.2), (50, 0.2)]
    for depth, (units, drop_rate) in enumerate(hidden_blocks):
        if depth == 0:
            # Only the first layer declares the input dimension.
            model.add(Dense(units, input_dim = xtrain.shape[1], init = 'he_normal'))
        else:
            model.add(Dense(units, init = 'he_normal'))
        model.add(PReLU())
        model.add(BatchNormalization())
        model.add(Dropout(drop_rate))

    model.add(Dense(1, init = 'he_normal'))
    model.compile(loss = 'mae', optimizer = 'adadelta')
    return model
127 | 
## cv-folds
nfolds = 10
folds = KFold(n_splits = nfolds, shuffle = True, random_state = 111)

## train models
i = 0                                 # number of folds finished so far (for logging)
nbags = 10                            # models trained per fold
nepochs = 55
pred_oob = np.zeros(xtrain.shape[0])  # out-of-fold prediction for every training row
pred_test = np.zeros(xtest.shape[0])  # running sum of test predictions over all folds and bags

for (inTr, inTe) in folds.split(xtrain):
    xtr = xtrain[inTr]
    ytr = y[inTr]
    xte = xtrain[inTe]
    yte = y[inTe]
    pred = np.zeros(xte.shape[0])
    for j in range(nbags):
        # A fresh model per bag.
        # NOTE(review): nb_epoch / samples_per_epoch / val_samples look like
        # Keras 1.x argument names -- confirm against the installed version.
        model = nn_model()
        fit = model.fit_generator(generator = batch_generator(xtr, ytr, 128, True),
                                  nb_epoch = nepochs,
                                  samples_per_epoch = xtr.shape[0],
                                  verbose = 1)
        # Map predictions back from log(loss + 200) space before summing.
        pred += np.exp(model.predict_generator(generator = batch_generatorp(xte, 800, False), val_samples = xte.shape[0])[:,0])-200
        pred_test += np.exp(model.predict_generator(generator = batch_generatorp(xtest, 800, False), val_samples = xtest.shape[0])[:,0])-200
    # Average the bags for this fold's held-out rows.
    pred /= nbags
    pred_oob[inTe] = pred
    score = mean_absolute_error(np.exp(yte)-200, pred)
    i += 1
    print('Fold ', i, '- MAE:', score)

print('Total - MAE:', mean_absolute_error(np.exp(y)-200, pred_oob))

## train predictions
df = pd.DataFrame({'id': id_train, 'loss': pred_oob})
df.to_csv('preds_oob.csv', index = False)

## test predictions
# pred_test was summed once per bag in every fold, hence nfolds*nbags terms.
pred_test /= (nfolds*nbags)
df = pd.DataFrame({'id': id_test, 'loss': pred_test})
df.to_csv('submission_keras_shift_perm.csv', index = False)


--------------------------------------------------------------------------------
/solutions/kaggle-quickdraw-doodle-recognition/1_save2df.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | import os, sys, codecs, glob
 5 | import numpy as np
 6 | import pandas as pd
 7 | import cv2
 8 | 
 9 | from sklearn.preprocessing import LabelEncoder
10 | from sklearn.cross_validation import train_test_split
11 | 
def read_df(path, nrows):
    """Read a single CSV file, parsing its ``timestamp`` column as dates.

    `nrows` is a string: when it consists only of digits, that many rows are
    read; any other value (e.g. ``'all'``) reads the whole file.
    """
    print('Reading...', path)
    read_options = {'parse_dates': ['timestamp']}
    if nrows.isdigit():
        read_options['nrows'] = int(nrows)
    return pd.read_csv(path, **read_options)
19 | 
def contcat_df(paths, nrows):
    """Read several CSV files with `read_df` and stack them into one frame.

    The frames are concatenated row-wise in the order of `paths` and the
    result gets a fresh 0..n-1 index.
    """
    frames = [read_df(csv_path, nrows) for csv_path in paths]
    return pd.concat(frames, axis=0, ignore_index=True)
26 | 
def main():
    """Build shuffled train/validation pickles from the per-class CSV files.

    Reads every ``../input/train_simplified/*.csv`` (row limit per file taken
    from the module-level ``number``), shuffles the combined rows,
    label-encodes the ``word`` column, splits off a validation set and writes
    ``./data/train_<number>.pkl`` and ``./data/val_<number>.pkl``.
    """
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs('./data', exist_ok=True)

    CLASSES_CSV = glob.glob('../input/train_simplified/*.csv')

    print('Reading data...')
    df = contcat_df(CLASSES_CSV, number)
    # Shuffle the rows so that the tail slice taken below is a random sample.
    df = df.reindex(np.random.permutation(df.index))

    # Replace class names with integer ids.
    lbl = LabelEncoder().fit(df['word'])
    df['word'] = lbl.transform(df['word'])

    # When 5% of the rows is fewer than 120000, hold out a random 5%;
    # otherwise hold out the last 500000 (already shuffled) rows.
    if df.shape[0] * 0.05 < 120000:
        df_train, df_val = train_test_split(df, test_size=0.05)
    else:
        df_train, df_val = df.iloc[:-500000], df.iloc[-500000:]

    print('Train:', df_train.shape[0], 'Val', df_val.shape[0])
    print('Save data...')
    df_train.to_pickle(os.path.join('./data/', 'train_' + str(number) + '.pkl'))
    df_val.to_pickle(os.path.join('./data/', 'val_' + str(number) + '.pkl'))
50 | 
# Usage:
#   python 1_save2df.py 50000   -> read at most 50000 rows from each class CSV
#   python 1_save2df.py all     -> read every row
if __name__ == "__main__":
    # Fail with a usage line instead of a bare IndexError when the argument
    # is missing.
    if len(sys.argv) < 2:
        sys.exit('usage: python 1_save2df.py <rows-per-class | all>')
    # main() reads this module-level value for the row limit and file names.
    number = str(sys.argv[1])
    main()


--------------------------------------------------------------------------------
/solutions/kaggle-quickdraw-doodle-recognition/2_train.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | import os, sys, codecs, glob
  5 | from PIL import Image, ImageDraw
  6 | 
  7 | import numpy as np
  8 | import pandas as pd
  9 | import cv2
 10 | 
 11 | import torch
 12 | torch.backends.cudnn.benchmark = False
 13 | # torch.backends.cudnn.enabled = False
 14 | 
 15 | import torchvision.models as models
 16 | import torchvision.transforms as transforms
 17 | import torchvision.datasets as datasets
 18 | import torch.nn as nn
 19 | import torch.nn.functional as F
 20 | import torch.optim as optim
 21 | from torch.autograd import Variable
 22 | from torch.utils.data.dataset import Dataset
 23 | 
 24 | import logging
 25 | logging.basicConfig(level=logging.DEBUG, filename='example.log',
 26 |                     format='%(asctime)s - %(filename)s[line:%(lineno)d]: %(message)s')  # 
 27 | 
 28 | def draw_cv2(raw_strokes, size=256, lw=6, time_color=True):
 29 |     BASE_SIZE = 299
 30 |     img = np.zeros((BASE_SIZE, BASE_SIZE), np.uint8)
 31 |     for t, stroke in enumerate(eval(raw_strokes)):
 32 |         
 33 |         str_len = len(stroke[0])
 34 |         for i in range(len(stroke[0]) - 1):
 35 |             
 36 |             # dot dropout
 37 |             if np.random.uniform() > 0.95:
 38 |                 continue
 39 |             
 40 |             color = 255 - min(t, 10) * 13 if time_color else 255
 41 |             _ = cv2.line(img, (stroke[0][i] + 22, stroke[1][i]  + 22),
 42 |                          (stroke[0][i + 1] + 22, stroke[1][i + 1] + 22), color, lw)
 43 |     
 44 |     if size != BASE_SIZE:
 45 |         return cv2.resize(img, (size, size))
 46 |     else:
 47 |         return img
 48 | 
 49 | class QRDataset(Dataset):
 50 |     def __init__(self, img_drawing, img_label, img_size, transform=None):
 51 |         self.img_drawing = img_drawing
 52 |         self.img_label = img_label
 53 |         self.img_size = img_size
 54 |         self.transform = transform
 55 | 
 56 |     def __getitem__(self, index):
 57 |         img = np.zeros((self.img_size, self.img_size, 3))
 58 |         img[:, :, 0] = draw_cv2(self.img_drawing[index], self.img_size)
 59 |         img[:, :, 1] = img[:, :, 0]
 60 |         img[:, :, 2] = img[:, :, 0]
 61 |         img = Image.fromarray(np.uint8(img))
 62 |         
 63 |         if self.transform is not None:
 64 |             img = self.transform(img)
 65 |         
 66 |         label = torch.from_numpy(np.array([self.img_label[index]]))
 67 |         return img, label
 68 | 
 69 |     def __len__(self):
 70 |         return len(self.img_drawing)
 71 | 
 72 | def accuracy(output, target, topk=(1,)):
 73 |     """Computes the accuracy over the k top predictions for the specified values of k"""
 74 |     with torch.no_grad():
 75 |         maxk = max(topk)
 76 |         batch_size = target.size(0)
 77 | 
 78 |         _, pred = output.topk(maxk, 1, True, True)
 79 |         pred = pred.t()
 80 |         correct = pred.eq(target.view(1, -1).expand_as(pred))
 81 | 
 82 |         res = []
 83 |         for k in topk:
 84 |             correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
 85 |             res.append(correct_k.mul_(100.0 / batch_size))
 86 |         return res
 87 |     
 88 | def get_resnet18():
 89 |     model = models.resnet18(True)
 90 |     model.avgpool = nn.AdaptiveAvgPool2d(1)
 91 |     model.fc = nn.Linear(512, 340)
 92 |     return model
 93 | 
 94 | def get_resnet34():
 95 |     model = models.resnet34(True)
 96 |     model.avgpool = nn.AdaptiveAvgPool2d(1)
 97 |     model.fc = nn.Linear(512, 340)
 98 |     return model
 99 | 
def get_resnet50(num_classes=340, pretrained=True):
    """Build a ResNet-50 with adaptive pooling and a new classification head.

    Args:
        num_classes: Output size of the new final linear layer. Defaults to
            340, the value that used to be hard-coded.
        pretrained: Forwarded positionally to ``models.resnet50``. Defaults
            to ``True`` as before.

    Returns:
        The modified torchvision model.
    """
    model = models.resnet50(pretrained)
    # Pool to a 1x1 map regardless of the input resolution.
    model.avgpool = nn.AdaptiveAvgPool2d(1)
    # Take the head width from the existing layer instead of hard-coding 2048.
    model.fc = nn.Linear(model.fc.in_features, num_classes)
    return model
105 | 
def get_resnet101(num_classes=340, pretrained=True):
    """Build a ResNet-101 with adaptive pooling and a new classification head.

    Args:
        num_classes: Output size of the new final linear layer. Defaults to
            340, the value that used to be hard-coded.
        pretrained: Forwarded positionally to ``models.resnet101``. Defaults
            to ``True`` as before.

    Returns:
        The modified torchvision model.
    """
    model = models.resnet101(pretrained)
    # Pool to a 1x1 map regardless of the input resolution.
    model.avgpool = nn.AdaptiveAvgPool2d(1)
    # Take the head width from the existing layer instead of hard-coding 2048.
    model.fc = nn.Linear(model.fc.in_features, num_classes)
    return model
111 | 
def main():
    """Train the selected ResNet on the pickled drawings with periodic validation.

    Reads the module-level ``modelname``, ``dataset`` and ``imgsize`` values.
    Loads ``./data/train_<dataset>.pkl`` and ``./data/val_<dataset>.pkl``,
    restores weights from a hard-coded checkpoint, then trains for 50 epochs
    on GPU 0. About ten times per epoch it evaluates on the validation set,
    prints a summary line and saves a checkpoint named
    ``<modelname>_<imgsize>_<epoch>_<batch>.pt``.

    Raises:
        ValueError: If ``modelname`` is not one of the supported ResNets.
    """
    df_train = pd.read_pickle(os.path.join('./data', 'train_' + dataset + '.pkl'))
    df_val = pd.read_pickle(os.path.join('./data', 'val_' + dataset + '.pkl'))

    train_loader = torch.utils.data.DataLoader(
        QRDataset(df_train['drawing'].values, df_train['word'].values, imgsize,
                  transforms.Compose([
                      transforms.RandomHorizontalFlip(),
                      transforms.RandomVerticalFlip(),
                      # transforms.RandomAffine(5, scale=[0.95, 1.05]),
                      transforms.ToTensor(),
                      # transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
                  ])),
        batch_size=1000, shuffle=True, num_workers=5,
    )

    # NOTE(review): validation also applies random flips, so validation
    # metrics are not deterministic -- confirm this is intended.
    val_loader = torch.utils.data.DataLoader(
        QRDataset(df_val['drawing'].values, df_val['word'].values, imgsize,
                  transforms.Compose([
                      transforms.RandomHorizontalFlip(),
                      transforms.RandomVerticalFlip(),
                      transforms.ToTensor(),
                      # transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
                  ])),
        batch_size=1000, shuffle=False, num_workers=5,
    )

    if modelname == 'resnet18':
        model = get_resnet18()
    elif modelname == 'resnet34':
        model = get_resnet34()
    elif modelname == 'resnet50':
        model = get_resnet50()
    elif modelname == 'resnet101':
        model = get_resnet101()
    else:
        # Previously an unknown name fell through to a NameError on `model`.
        raise ValueError('unsupported model name: {0}'.format(modelname))

    # model = nn.DataParallel(model).cuda()
    # NOTE(review): the checkpoint path is hard-coded and its name suggests
    # resnet50 weights; loading it into another architecture will presumably
    # fail -- confirm before running with a different modelname.
    model.load_state_dict(torch.load('./resnet50_64_7_0.pt'))
    # model.load_state_dict(torch.load('./resnet34_256_1_3280(82.7529_93.9964).pt'))

    model = model.cuda(0)

    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    # Evaluate, checkpoint and decay the learning rate roughly ten times per
    # epoch. Clamp to at least 1: with fewer than ten batches
    # int(len / 10) is 0 and `i % 0` raises ZeroDivisionError. An integer
    # is also what StepLR expects for step_size.
    eval_interval = max(1, len(train_loader) // 10)
    # scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[2, 3, 5, 7, 8], gamma=0.1)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=eval_interval, gamma=0.95)

    print('Train:', df_train.shape[0], 'Val', df_val.shape[0])
    print('Epoch/Batch\t\tTrain: loss/Top1/Top3\t\tTest: loss/Top1/Top3')

    for epoch in range(50):
        train_losss, train_acc1s, train_acc3s = [], [], []
        for i, data in enumerate(train_loader):
            # NOTE(review): recent PyTorch versions expect scheduler.step()
            # after optimizer.step(); order left unchanged to keep the
            # existing learning-rate schedule.
            scheduler.step()
            # Validation below switches to eval mode, so restore train mode
            # at the start of every batch.
            model = model.train()
            train_img, train_label = data
            optimizer.zero_grad()

            # TODO: data parallel
            train_img = Variable(train_img).cuda(0)
            # Labels arrive as (batch, 1); CrossEntropyLoss is given (batch,).
            train_label = Variable(train_label.view(-1)).cuda(0)

            output = model(train_img)
            train_loss = loss_fn(output, train_label)

            train_loss.backward()
            optimizer.step()

            train_losss.append(train_loss.item())
            if i % 5 == 0:
                logging.info('{0}/{1}:\t{2}\t{3}.'.format(epoch, i, optimizer.param_groups[0]['lr'], train_losss[-1]))

            if i % eval_interval == 0:
                val_losss, val_acc1s, val_acc3s = [], [], []

                # Evaluate in eval mode so batch-norm uses (and does not
                # update) its running statistics on validation data.
                model.eval()
                with torch.no_grad():
                    train_acc1, train_acc3 = accuracy(output, train_label, topk=(1, 3))
                    train_acc1s.append(train_acc1.item())
                    train_acc3s.append(train_acc3.item())

                    for val_data in val_loader:
                        val_images, val_labels = val_data

                        val_images = Variable(val_images).cuda(0)
                        val_labels = Variable(val_labels.view(-1)).cuda(0)

                        val_output = model(val_images)
                        val_loss = loss_fn(val_output, val_labels)
                        val_acc1, val_acc3 = accuracy(val_output, val_labels, topk=(1, 3))

                        val_losss.append(val_loss.item())
                        val_acc1s.append(val_acc1.item())
                        val_acc3s.append(val_acc3.item())

                        # On the first batch of an epoch only one validation
                        # batch is evaluated.
                        if i == 0:
                            break

                logstr = '{0:2s}/{1:6s}\t\t{2:.4f}/{3:.4f}/{4:.4f}\t\t{5:.4f}/{6:.4f}/{7:.4f}'.format(
                    str(epoch), str(i),
                    np.mean(train_losss, 0), np.mean(train_acc1s, 0), np.mean(train_acc3s, 0),
                    np.mean(val_losss, 0), np.mean(val_acc1s, 0), np.mean(val_acc3s, 0),
                )
                torch.save(model.state_dict(), '{0}_{1}_{2}_{3}.pt'.format(modelname, imgsize, epoch, i))
                print(logstr)
227 |     
# Usage:   python 2_train.py <model> <dataset-size> <image-size>
# Example: python 2_train.py resnet18 5000 64
if __name__ == "__main__":
    # Fail with a usage line instead of a bare IndexError when arguments
    # are missing.
    if len(sys.argv) < 4:
        sys.exit('usage: python 2_train.py <resnet18|resnet34|resnet50|resnet101> <dataset> <imgsize>')
    # main() reads these module-level values.
    modelname = str(sys.argv[1])
    dataset = str(sys.argv[2])
    imgsize = int(sys.argv[3])
    main()


--------------------------------------------------------------------------------
/solutions/kaggle-quickdraw-doodle-recognition/README.md:
--------------------------------------------------------------------------------
1 | https://quickdraw.withgoogle.com/
2 | 
3 | https://www.kaggle.com/c/quickdraw-doodle-recognition/
4 | 


--------------------------------------------------------------------------------
/solutions/kaggle-titanic/README.md:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/solutions/kaggle-two-sigma-connect-rental-listing-inquiries/README.md:
--------------------------------------------------------------------------------
1 | https://www.kaggle.com/c/two-sigma-connect-rental-listing-inquiries/
2 | 


--------------------------------------------------------------------------------
/solutions/kaggle-two-sigma-connect-rental-listing-inquiries/lgb.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | from scipy import sparse
  4 | import xgboost as xgb
  5 | import lightgbm as lgb
  6 | 
  7 | import random
  8 | from sklearn.preprocessing import LabelEncoder
  9 | from sklearn.metrics import log_loss
 10 | from sklearn.feature_extraction.text import CountVectorizer
 11 | 
# Load the train and test listings; the paths are relative to the
# working directory the script is started from.
train_df = pd.read_json("../input/train.json")
test_df = pd.read_json("../input/test.json")
 14 | 
 15 | def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=321, num_rounds=1800):
 16 |     params = {
 17 |         'learning_rate': 0.03,
 18 |         'min_child_samples': 4,
 19 |         'max_depth': 6, 
 20 |         'lambda_l1': 1.5,
 21 |         'boosting': 'gbdt', 
 22 |         'objective': 'multiclass', 
 23 |         'metric': 'multi_logloss',
 24 |         'num_class': 3,
 25 |         # 'feature_fraction': .85,
 26 |         # 'bagging_fraction': .7,
 27 |         'seed': 99,
 28 |         'num_threads': 10,
 29 |         'verbose': 0
 30 |     }
 31 | 
 32 |     # plst = list(param.items())
 33 |     clf = lgb.train(params, lgb.Dataset(train_X, label=train_y), 2000,)
 34 | 
 35 |     pred_test_y = clf.predict(test_X)
 36 |     return pred_test_y, clf
 37 | 
 38 | test_df["bathrooms"].loc[19671] = 1.5
 39 | test_df["bathrooms"].loc[22977] = 2.0
 40 | test_df["bathrooms"].loc[63719] = 2.0
 41 | train_df["price"] = train_df["price"].clip(upper=13000)
 42 | 
 43 | train_df["logprice"] = np.log(train_df["price"])
 44 | test_df["logprice"] = np.log(test_df["price"])
 45 | 
 46 | train_df['half_bathrooms'] = train_df["bathrooms"] - train_df["bathrooms"].apply(int)
 47 | test_df['half_bathrooms'] = test_df["bathrooms"] - test_df["bathrooms"].apply(int)
 48 | 
 49 | train_df["price_t"] =train_df["price"]/train_df["bedrooms"]
 50 | test_df["price_t"] = test_df["price"]/test_df["bedrooms"] 
 51 | 
 52 | train_df["room_sum"] = train_df["bedrooms"]+train_df["bathrooms"] 
 53 | test_df["room_sum"] = test_df["bedrooms"]+test_df["bathrooms"] 
 54 | 
 55 | train_df['price_per_room'] = train_df['price']/train_df['room_sum']
 56 | test_df['price_per_room'] = test_df['price']/test_df['room_sum']
 57 | 
 58 | train_df["num_photos"] = train_df["photos"].apply(len)
 59 | test_df["num_photos"] = test_df["photos"].apply(len)
 60 | 
 61 | train_df["num_features"] = train_df["features"].apply(len)
 62 | test_df["num_features"] = test_df["features"].apply(len)
 63 | 
 64 | train_df["num_description_words"] = train_df["description"].apply(lambda x: len(x.split(" ")))
 65 | test_df["num_description_words"] = test_df["description"].apply(lambda x: len(x.split(" ")))
 66 | 
 67 | train_df["created"] = pd.to_datetime(train_df["created"])
 68 | test_df["created"] = pd.to_datetime(test_df["created"])
 69 | train_df["created_year"] = train_df["created"].dt.year
 70 | test_df["created_year"] = test_df["created"].dt.year
 71 | train_df["created_month"] = train_df["created"].dt.month
 72 | test_df["created_month"] = test_df["created"].dt.month
 73 | train_df["created_day"] = train_df["created"].dt.day
 74 | test_df["created_day"] = test_df["created"].dt.day
 75 | train_df["created_hour"] = train_df["created"].dt.hour
 76 | test_df["created_hour"] = test_df["created"].dt.hour
 77 | 
 78 | train_df["created_weekday"] = train_df["created"].dt.weekday
 79 | test_df["created_weekday"] = test_df["created"].dt.weekday
 80 | train_df["created_week"] = train_df["created"].dt.week
 81 | test_df["created_week"] = test_df["created"].dt.week
 82 | 
 83 | train_df["pos"] = train_df.longitude.round(3).astype(str) + '_' + train_df.latitude.round(3).astype(str)
 84 | test_df["pos"] = test_df.longitude.round(3).astype(str) + '_' + test_df.latitude.round(3).astype(str)
 85 | 
 86 | vals = train_df['pos'].value_counts()
 87 | dvals = vals.to_dict()
 88 | train_df["density"] = train_df['pos'].apply(lambda x: dvals.get(x, vals.min()))
 89 | test_df["density"] = test_df['pos'].apply(lambda x: dvals.get(x, vals.min()))
 90 | 
 91 | features_to_use=["bathrooms", "bedrooms", "latitude", "longitude", "price","price_t","price_per_room", "logprice", "density", "half_bathrooms",
 92 | "num_photos", "num_features", "num_description_words","listing_id", "created_year", "created_month", "created_day", "created_hour", "created_week", "created_weekday"]
 93 | 
 94 | index=list(range(train_df.shape[0]))
 95 | random.shuffle(index)
 96 | a=[np.nan]*len(train_df)
 97 | b=[np.nan]*len(train_df)
 98 | c=[np.nan]*len(train_df)
 99 | 
100 | for i in range(5):
101 |     building_level={}
102 |     for j in train_df['manager_id'].values:
103 |         building_level[j]=[0,0,0]
104 |     
105 |     test_index=index[int((i*train_df.shape[0])/5):int(((i+1)*train_df.shape[0])/5)]
106 |     train_index=list(set(index).difference(test_index))
107 |     
108 |     for j in train_index:
109 |         temp=train_df.iloc[j]
110 |         if temp['interest_level']=='low':
111 |             building_level[temp['manager_id']][0]+=1
112 |         if temp['interest_level']=='medium':
113 |             building_level[temp['manager_id']][1]+=1
114 |         if temp['interest_level']=='high':
115 |             building_level[temp['manager_id']][2]+=1
116 |             
117 |     for j in test_index:
118 |         temp=train_df.iloc[j]
119 |         if sum(building_level[temp['manager_id']])!=0:
120 |             a[j]=building_level[temp['manager_id']][0]*1.0/sum(building_level[temp['manager_id']])
121 |             b[j]=building_level[temp['manager_id']][1]*1.0/sum(building_level[temp['manager_id']])
122 |             c[j]=building_level[temp['manager_id']][2]*1.0/sum(building_level[temp['manager_id']])
123 |             
124 | train_df['manager_level_low']=a
125 | train_df['manager_level_medium']=b
126 | train_df['manager_level_high']=c
127 | 
# Manager statistics for the test rows, counted over the whole training set.
# Managers that never occur in training get NaN.
building_level = {manager: [0, 0, 0] for manager in train_df['manager_id'].values}
level_position = {'low': 0, 'medium': 1, 'high': 2}
for manager, level in zip(train_df['manager_id'].values, train_df['interest_level'].values):
    position = level_position.get(level)
    if position is not None:
        building_level[manager][position] += 1

a, b, c = [], [], []
for manager in test_df['manager_id'].values:
    counts = building_level.get(manager)
    if counts is None:
        a.append(np.nan)
        b.append(np.nan)
        c.append(np.nan)
        continue
    total = sum(counts)
    a.append(counts[0] * 1.0 / total)
    b.append(counts[1] * 1.0 / total)
    c.append(counts[2] * 1.0 / total)

test_df['manager_level_low'] = a
test_df['manager_level_medium'] = b
test_df['manager_level_high'] = c

features_to_use.extend(['manager_level_low', 'manager_level_medium', 'manager_level_high'])
160 | 
# Integer-encode the high-cardinality string columns. The encoder is fitted
# on train and test values together so every test value has a code.
categorical = ["street_address", "display_address", "manager_id", "building_id"]
for column in categorical:
    if train_df[column].dtype == 'object':
        encoder = LabelEncoder()
        train_values = list(train_df[column].values)
        test_values = list(test_df[column].values)
        encoder.fit(train_values + test_values)
        train_df[column] = encoder.transform(train_values)
        test_df[column] = encoder.transform(test_values)
        features_to_use.append(column)


def _as_token_string(feature_list):
    # Make each feature phrase a single token by replacing spaces with "_",
    # then join the phrases with spaces.
    return " ".join([phrase.replace(" ", "_") for phrase in feature_list])


train_df['features'] = train_df["features"].apply(_as_token_string)
test_df['features'] = test_df["features"].apply(_as_token_string)

# Bag-of-words counts over the 200 most frequent feature tokens, with the
# vocabulary learned on the training set only.
vectorizer = CountVectorizer(stop_words='english', max_features=200)
train_tokens = vectorizer.fit_transform(train_df["features"])
test_tokens = vectorizer.transform(test_df["features"])

# Final design matrices: selected columns followed by the token counts.
train_X = sparse.hstack([train_df[features_to_use], train_tokens]).tocsr()
test_X = sparse.hstack([test_df[features_to_use], test_tokens]).tocsr()
179 | 
# Class indices; the submission columns below follow the same order.
target_num_map = {'high': 0, 'medium': 1, 'low': 2}
encoded_levels = train_df['interest_level'].apply(lambda level: target_num_map[level])
train_y = np.array(encoded_levels)

# Train on the full training set and write the class probabilities for the
# test listings.
preds, model = runXGB(train_X, train_y, test_X, num_rounds=1800)
out_df = pd.DataFrame(preds, columns=["high", "medium", "low"])
out_df["listing_id"] = test_df.listing_id.values
out_df.to_csv("cz.csv", index=False)


--------------------------------------------------------------------------------
/solutions/tianchi-第三届阿里云安全算法挑战赛/README.md:
--------------------------------------------------------------------------------
1 | https://tianchi.aliyun.com/competition/entrance/231668/information
2 | 


--------------------------------------------------------------------------------
/solutions/tianchi-第三届阿里云安全算法挑战赛/api.csv:
--------------------------------------------------------------------------------
  1 | GetSystemTimeAsFileTime
  2 | NtAllocateVirtualMemory
  3 | NtFreeVirtualMemory
  4 | SetUnhandledExceptionFilter
  5 | LdrLoadDll
  6 | LdrGetProcedureAddress
  7 | LdrUnloadDll
  8 | NtCreateMutant
  9 | NtCreateSection
 10 | NtMapViewOfSection
 11 | CoInitializeEx
 12 | RegOpenKeyExW
 13 | CoUninitialize
 14 | NtUnmapViewOfSection
 15 | NtClose
 16 | LdrGetDllHandle
 17 | NtTerminateProcess
 18 | NtOpenKey
 19 | NtQueryValueKey
 20 | __exception__
 21 | SetErrorMode
 22 | RegQueryValueExW
 23 | RegCloseKey
 24 | NtCreateFile
 25 | NtWriteFile
 26 | CreateProcessInternalW
 27 | NtProtectVirtualMemory
 28 | RegOpenKeyExA
 29 | NtQueryAttributesFile
 30 | LoadStringA
 31 | GetSystemMetrics
 32 | RegQueryValueExA
 33 | FindResourceExW
 34 | LoadResource
 35 | GetSystemWindowsDirectoryW
 36 | FindResourceA
 37 | SizeofResource
 38 | GetFileVersionInfoSizeW
 39 | GetFileVersionInfoW
 40 | DrawTextExA
 41 | WSAStartup
 42 | socket
 43 | setsockopt
 44 | closesocket
 45 | bind
 46 | NtSetInformationFile
 47 | NtDeviceIoControlFile
 48 | CreateThread
 49 | NtOpenFile
 50 | GetSystemDirectoryW
 51 | NtOpenMutant
 52 | NtOpenSection
 53 | RegEnumKeyExW
 54 | LoadStringW
 55 | GetCursorPos
 56 | EnumWindows
 57 | GetKeyState
 58 | NtQuerySystemInformation
 59 | FindFirstFileExW
 60 | NtOpenDirectoryObject
 61 | GetVolumePathNameW
 62 | CreateDirectoryW
 63 | GetFileAttributesW
 64 | DeleteFileW
 65 | CopyFileA
 66 | CreateToolhelp32Snapshot
 67 | Thread32First
 68 | Thread32Next
 69 | NtDuplicateObject
 70 | GetSystemInfo
 71 | NtOpenKeyEx
 72 | GetTempPathW
 73 | SetFilePointer
 74 | NtReadFile
 75 | GetFileType
 76 | GetTimeZoneInformation
 77 | SetWindowsHookExA
 78 | NtEnumerateKey
 79 | NtQueryInformationFile
 80 | listen
 81 | connect
 82 | gethostbyname
 83 | NtOpenProcess
 84 | WriteProcessMemory
 85 | RtlAddVectoredExceptionHandler
 86 | ReadProcessMemory
 87 | FindWindowA
 88 | SHGetFolderPathW
 89 | CreateActCtxW
 90 | FindResourceW
 91 | SetWindowsHookExW
 92 | GetForegroundWindow
 93 | RegQueryInfoKeyW
 94 | RegEnumValueW
 95 | GetFileSizeEx
 96 | DrawTextExW
 97 | Process32FirstW
 98 | Process32NextW
 99 | NtReadVirtualMemory
100 | OutputDebugStringA
101 | SearchPathW
102 | OleInitialize
103 | CryptAcquireContextW
104 | GetFileSize
105 | SetEndOfFile
106 | GlobalMemoryStatus
107 | CoGetClassObject
108 | CoCreateInstance
109 | NtQueryKey
110 | NtSetValueKey
111 | NtDelayExecution
112 | RegEnumKeyW
113 | NtQueryDirectoryFile
114 | GetFileInformationByHandleEx
115 | NtEnumerateValueKey
116 | GetUserNameExW
117 | GetComputerNameW
118 | GetUserNameW
119 | DeviceIoControl
120 | FindWindowW
121 | RegCreateKeyExW
122 | SendNotifyMessageW
123 | RegSetValueExW
124 | GetFileAttributesExW
125 | GetFileInformationByHandle
126 | SetFileTime
127 | LookupAccountSidW
128 | IsDebuggerPresent
129 | NtResumeThread
130 | GlobalMemoryStatusEx
131 | GetShortPathNameW
132 | NtCreateKey
133 | CoInitializeSecurity
134 | UuidCreate
135 | NtCreateThreadEx
136 | RtlAddVectoredContinueHandler
137 | LookupPrivilegeValueW
138 | NtOpenThread
139 | Module32FirstW
140 | Module32NextW
141 | GetKeyboardState
142 | WriteConsoleA
143 | GetVolumeNameForVolumeMountPointW
144 | NtQueryFullAttributesFile
145 | SetFilePointerEx
146 | GetVolumePathNamesForVolumeNameW
147 | system
148 | WriteConsoleW
149 | RemoveDirectoryA
150 | GetNativeSystemInfo
151 | GetSystemDirectoryA
152 | CopyFileW
153 | GetAdaptersInfo
154 | RegEnumValueA
155 | RegDeleteValueW
156 | RegCreateKeyExA
157 | GetUserNameA
158 | SetFileAttributesW
159 | RegEnumKeyExA
160 | OpenSCManagerA
161 | OpenServiceA
162 | RegSetValueExA
163 | RegDeleteValueA
164 | InternetCrackUrlA
165 | InternetSetOptionA
166 | InternetGetConnectedState
167 | InternetOpenW
168 | InternetSetStatusCallback
169 | InternetConnectW
170 | HttpOpenRequestW
171 | InternetQueryOptionA
172 | HttpSendRequestW
173 | HttpQueryInfoA
174 | InternetCloseHandle
175 | getaddrinfo
176 | GetAdaptersAddresses
177 | getsockname
178 | select
179 | CryptProtectMemory
180 | CryptUnprotectMemory
181 | GetComputerNameA
182 | GetFileVersionInfoSizeExW
183 | GetFileVersionInfoExW
184 | InternetCrackUrlW
185 | SHGetSpecialFolderLocation
186 | CryptHashData
187 | NetUserGetInfo
188 | shutdown
189 | CreateServiceA
190 | StartServiceA
191 | ShellExecuteExW
192 | SetStdHandle
193 | NtQueryMultipleValueKey
194 | CreateJobObjectW
195 | SetInformationJobObject
196 | GetSystemWindowsDirectoryA
197 | FindResourceExA
198 | RemoveDirectoryW
199 | GetDiskFreeSpaceExW
200 | MoveFileWithProgressW
201 | NetShareEnum
202 | RegDeleteKeyW
203 | GetDiskFreeSpaceW
204 | RegQueryInfoKeyA
205 | OpenSCManagerW
206 | OpenServiceW
207 | CryptAcquireContextA
208 | GetAddrInfoW
209 | NtTerminateThread
210 | CreateServiceW
211 | NtDeleteKey
212 | GetBestInterfaceEx
213 | timeGetTime
214 | InternetOpenA
215 | CryptEncrypt
216 | InternetConnectA
217 | HttpOpenRequestA
218 | HttpSendRequestA
219 | StartServiceW
220 | ControlService
221 | DeleteService
222 | CryptExportKey
223 | CryptCreateHash
224 | WSASocketW
225 | NtSuspendThread
226 | NtGetContextThread
227 | UnhookWindowsHookEx
228 | CertOpenStore
229 | CryptDecodeObjectEx
230 | CertControlStore
231 | NtDeleteValueKey
232 | GetAsyncKeyState
233 | EnumServicesStatusW
234 | DnsQuery_W
235 | FindWindowExW
236 | FindFirstFileExA
237 | RegDeleteKeyA
238 | FindWindowExA
239 | InternetOpenUrlA
240 | SendNotifyMessageA
241 | CoCreateInstanceEx
242 | IWbemServices_ExecQuery
243 | WSASocketA
244 | URLDownloadToFileW
245 | accept
246 | NtCreateDirectoryObject
247 | CertCreateCertificateContext
248 | AssignProcessToJobObject
249 | SetFileInformationByHandle
250 | NetGetJoinInformation
251 | InternetReadFile
252 | RtlRemoveVectoredExceptionHandler
253 | CryptGenKey
254 | MessageBoxTimeoutA
255 | NetUserGetLocalGroups
256 | DeleteUrlCacheEntryW
257 | send
258 | recv
259 | ioctlsocket
260 | WSARecv
261 | WSASend
262 | sendto
263 | CopyFileExW
264 | RegisterHotKey
265 | MessageBoxTimeoutW
266 | CreateRemoteThread
267 | GetUserNameExA
268 | EnumServicesStatusA
269 | NtQueueApcThread
270 | RtlCreateUserThread
271 | InternetOpenUrlW
272 | CryptProtectData
273 | WSAConnect
274 | CryptDecrypt
275 | CreateDirectoryExW
276 | IWbemServices_ExecMethod
277 | recvfrom
278 | ObtainUserAgentString
279 | DnsQuery_A
280 | ReadCabinetState
281 | NtSetContextThread
282 | WSARecvFrom
283 | WSASendTo
284 | NtLoadKey
285 | NtLoadDriver
286 | DeleteUrlCacheEntryA
287 | GetInterfaceInfo
288 | NtWriteVirtualMemory
289 | RtlCompressBuffer
290 | NtShutdownSystem
291 | TaskDialog
292 | NtDeleteFile
293 | InternetGetConnectedStateExW
294 | CryptUnprotectData
295 | InternetGetConnectedStateExA
296 | NtSaveKeyEx
297 | NtSaveKey
298 | CertOpenSystemStoreA
299 | PRF
300 | ExitWindowsEx
301 | WSAAccept
302 | CreateRemoteThreadEx
303 | CertOpenSystemStoreW
304 | NtUnloadDriver
305 | NtCreateThread
306 | NtLoadKeyEx
307 | InternetWriteFile
308 | RtlDecompressBuffer
309 | 


--------------------------------------------------------------------------------
/solutions/tianchi-第三届阿里云安全算法挑战赛/finetune.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Fine-tune with Pretrained Models\n",
  8 |     "\n",
  9 |     "Many of the exciting deep learning algorithms for computer vision require\n",
 10 |     "massive datasets for training. The most popular benchmark dataset,\n",
 11 |     "[ImageNet](http://www.image-net.org/), for example, contains one million images\n",
 12 |     "from one thousand categories. But for any practical problem, we typically have\n",
 13 |     "access to comparatively small datasets. In these cases, if we were to train a\n",
 14 |     "neural network's weights from scratch, starting from randomly initialized\n",
 15 |     "parameters, we would overfit the training set badly.\n",
 16 |     "\n",
 17 |     "One approach to get around this problem is to first pretrain a deep net on a\n",
 18 |     "large-scale dataset, like ImageNet. Then, given a new dataset, we can start\n",
 19 |     "with these pretrained weights when training on our new task. This process is\n",
 20 |     "commonly called _fine-tuning_. There are a number of variations of fine-tuning.\n",
 21 |     "Sometimes, the initial neural network is used only as a _feature extractor_.\n",
 22 |     "That means that we freeze every layer prior to the output layer and simply learn\n",
 23 |     "a new output layer. In [another document](https://github.com/dmlc/mxnet-notebooks/blob/master/python/how_to/predict.ipynb), we explained how to\n",
 24 |     "do this kind of feature extraction. Another approach is to update all of\n",
 25 |     "the network's weights for the new task, and that's the approach we demonstrate in\n",
 26 |     "this document.\n",
 27 |     "\n",
 28 |     "To fine-tune a network, we must first replace the last fully-connected layer\n",
 29 |     "with a new one that outputs the desired number of classes. We initialize its\n",
 30 |     "weights randomly. Then we continue training as normal. Sometimes it's common to\n",
 31 |     "use a smaller learning rate based on the intuition that we may already be close\n",
 32 |     "to a good result.\n",
 33 |     "\n",
 34 |     "In this demonstration, we'll fine-tune a model pretrained on ImageNet to the\n",
 35 |     "smaller caltech-256 dataset. Following this example, you can fine-tune to other\n",
 36 |     "datasets, even for strikingly different applications such as face\n",
 37 |     "identification.\n",
 38 |     "\n",
 39 |     "We will show that, even with simple hyper-parameter settings, we can match and\n",
 40 |     "even outperform state-of-the-art results on caltech-256.\n",
 41 |     "\n",
 42 |     "```eval_rst\n",
 43 |     ".. list-table::\n",
 44 |     "   :header-rows: 1\n",
 45 |     "\n",
 46 |     "   * - Network \n",
 47 |     "     - Accuracy \n",
 48 |     "   * - Resnet-50 \n",
 49 |     "     - 77.4% \n",
 50 |     "   * - Resnet-152 \n",
 51 |     "     - 86.4% \n",
 52 |     "```\n",
 53 |     "\n",
 54 |     "## Prepare data\n",
 55 |     "\n",
 56 |     "We follow the standard protocol to sample 60 images from each class as the\n",
 57 |     "training set, and the rest for the validation set. We resize images into 256x256\n",
 58 |     "size and pack them into the rec file. The script to prepare the data is as\n",
 59 |     "follows.\n",
 60 |     "\n",
 61 |     "> In order to successfully run the following bash script on Windows please use https://cygwin.com/install.html .\n",
 62 |     "\n",
 63 |     "```sh\n",
 64 |     "wget http://www.vision.caltech.edu/Image_Datasets/Caltech256/256_ObjectCategories.tar\n",
 65 |     "tar -xf 256_ObjectCategories.tar\n",
 66 |     "\n",
 67 |     "mkdir -p caltech_256_train_60\n",
 68 |     "for i in 256_ObjectCategories/*; do\n",
 69 |     "    c=`basename $i`\n",
 70 |     "    mkdir -p caltech_256_train_60/$c\n",
 71 |     "    for j in `ls $i/*.jpg | shuf | head -n 60`; do\n",
 72 |     "        mv $j caltech_256_train_60/$c/\n",
 73 |     "    done\n",
 74 |     "done\n",
 75 |     "\n",
 76 |     "python ~/mxnet/tools/im2rec.py --list --recursive caltech-256-60-train caltech_256_train_60/\n",
 77 |     "python ~/mxnet/tools/im2rec.py --list --recursive caltech-256-60-val 256_ObjectCategories/\n",
 78 |     "python ~/mxnet/tools/im2rec.py --resize 256 --quality 90 --num-thread 16 caltech-256-60-val 256_ObjectCategories/\n",
 79 |     "python ~/mxnet/tools/im2rec.py --resize 256 --quality 90 --num-thread 16 caltech-256-60-train caltech_256_train_60/\n",
 80 |     "```\n",
 81 |     "\n",
 82 |     "The following code downloads the pregenerated rec files. It may take a few minutes."
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "code",
 87 |    "execution_count": null,
 88 |    "metadata": {},
 89 |    "outputs": [],
 90 |    "source": [
 91 |     "import os, sys\n",
 92 |     "\n",
 93 |     "if sys.version_info[0] >= 3:\n",
 94 |     "    from urllib.request import urlretrieve\n",
 95 |     "else:\n",
 96 |     "    from urllib import urlretrieve\n",
 97 |     "\n",
 98 |     "def download(url):\n",
 99 |     "    filename = url.split(\"/\")[-1]\n",
100 |     "    if not os.path.exists(filename):\n",
101 |     "        urlretrieve(url, filename)\n",
102 |     "download('http://data.mxnet.io/data/caltech-256/caltech-256-60-train.rec')\n",
103 |     "download('http://data.mxnet.io/data/caltech-256/caltech-256-60-val.rec')"
104 |    ]
105 |   },
106 |   {
107 |    "cell_type": "markdown",
108 |    "metadata": {},
109 |    "source": [
110 |     "Next, we define the function which returns the data iterators:"
111 |    ]
112 |   },
113 |   {
114 |    "cell_type": "code",
115 |    "execution_count": null,
116 |    "metadata": {},
117 |    "outputs": [],
118 |    "source": [
119 |     "import mxnet as mx\n",
120 |     "\n",
121 |     "def get_iterators(batch_size, data_shape=(3, 224, 224)):\n",
122 |     "    train = mx.io.ImageRecordIter(\n",
123 |     "        path_imgrec         = './caltech-256-60-train.rec',\n",
124 |     "        data_name           = 'data',\n",
125 |     "        label_name          = 'softmax_label',\n",
126 |     "        batch_size          = batch_size,\n",
127 |     "        data_shape          = data_shape,\n",
128 |     "        shuffle             = True,\n",
129 |     "        rand_crop           = True,\n",
130 |     "        rand_mirror         = True)\n",
131 |     "    val = mx.io.ImageRecordIter(\n",
132 |     "        path_imgrec         = './caltech-256-60-val.rec',\n",
133 |     "        data_name           = 'data',\n",
134 |     "        label_name          = 'softmax_label',\n",
135 |     "        batch_size          = batch_size,\n",
136 |     "        data_shape          = data_shape,\n",
137 |     "        rand_crop           = False,\n",
138 |     "        rand_mirror         = False)\n",
139 |     "    return (train, val)"
140 |    ]
141 |   },
142 |   {
143 |    "cell_type": "markdown",
144 |    "metadata": {},
145 |    "source": [
146 |     "We then download a pretrained 50-layer ResNet model and load it into memory. Note\n",
147 |     "that if `load_checkpoint` reports an error, we can remove the downloaded files\n",
148 |     "and try `get_model` again."
149 |    ]
150 |   },
151 |   {
152 |    "cell_type": "code",
153 |    "execution_count": null,
154 |    "metadata": {},
155 |    "outputs": [],
156 |    "source": [
157 |     "def get_model(prefix, epoch):\n",
158 |     "    download(prefix+'-symbol.json')\n",
159 |     "    download(prefix+'-%04d.params' % (epoch,))\n",
160 |     "\n",
161 |     "get_model('http://data.mxnet.io/models/imagenet/resnet/50-layers/resnet-50', 0)\n",
162 |     "sym, arg_params, aux_params = mx.model.load_checkpoint('resnet-50', 0)"
163 |    ]
164 |   },
165 |   {
166 |    "cell_type": "markdown",
167 |    "metadata": {},
168 |    "source": [
169 |     "## Train\n",
170 |     "\n",
171 |     "We first define a function which replaces the last fully-connected layer for a given network."
172 |    ]
173 |   },
174 |   {
175 |    "cell_type": "code",
176 |    "execution_count": null,
177 |    "metadata": {},
178 |    "outputs": [],
179 |    "source": [
180 |     "def get_fine_tune_model(symbol, arg_params, num_classes, layer_name='flatten0'):\n",
181 |     "    \"\"\"\n",
182 |     "    symbol: the pretrained network symbol\n",
183 |     "    arg_params: the argument parameters of the pretrained model\n",
184 |     "    num_classes: the number of classes for the fine-tune datasets\n",
185 |     "    layer_name: the layer name before the last fully-connected layer\n",
186 |     "    \"\"\"\n",
187 |     "    all_layers = symbol.get_internals()\n",
188 |     "    net = all_layers[layer_name+'_output']\n",
189 |     "    net = mx.symbol.FullyConnected(data=net, num_hidden=num_classes, name='fc1')\n",
190 |     "    net = mx.symbol.SoftmaxOutput(data=net, name='softmax')\n",
191 |     "    new_args = dict({k:arg_params[k] for k in arg_params if 'fc1' not in k})\n",
192 |     "    return (net, new_args)"
193 |    ]
194 |   },
195 |   {
196 |    "cell_type": "markdown",
197 |    "metadata": {},
198 |    "source": [
199 |     "Now we create a module. Note we pass the existing parameters from the loaded model via the `arg_params` argument.\n",
200 |     "The parameters of the last fully-connected layer will be randomly initialized by the `initializer`."
201 |    ]
202 |   },
203 |   {
204 |    "cell_type": "code",
205 |    "execution_count": null,
206 |    "metadata": {},
207 |    "outputs": [],
208 |    "source": [
209 |     "import logging\n",
210 |     "head = '%(asctime)-15s %(message)s'\n",
211 |     "logging.basicConfig(level=logging.DEBUG, format=head)\n",
212 |     "\n",
213 |     "def fit(symbol, arg_params, aux_params, train, val, batch_size, num_gpus):\n",
214 |     "    devs = [mx.gpu(i) for i in range(num_gpus)]\n",
215 |     "    mod = mx.mod.Module(symbol=symbol, context=devs)\n",
216 |     "    mod.fit(train, val,\n",
217 |     "        num_epoch=8,\n",
218 |     "        arg_params=arg_params,\n",
219 |     "        aux_params=aux_params,\n",
220 |     "        allow_missing=True,\n",
221 |     "        batch_end_callback = mx.callback.Speedometer(batch_size, 10),\n",
222 |     "        kvstore='device',\n",
223 |     "        optimizer='sgd',\n",
224 |     "        optimizer_params={'learning_rate':0.01},\n",
225 |     "        initializer=mx.init.Xavier(rnd_type='gaussian', factor_type=\"in\", magnitude=2),\n",
226 |     "        eval_metric='acc')\n",
227 |     "    metric = mx.metric.Accuracy()\n",
228 |     "    return mod.score(val, metric)"
229 |    ]
230 |   },
231 |   {
232 |    "cell_type": "markdown",
233 |    "metadata": {},
234 |    "source": [
235 |     "Then we can start training. We use AWS EC2 g2.8xlarge, which has 8 GPUs."
236 |    ]
237 |   },
238 |   {
239 |    "cell_type": "code",
240 |    "execution_count": null,
241 |    "metadata": {},
242 |    "outputs": [],
243 |    "source": [
244 |     "num_classes = 256\n",
245 |     "batch_per_gpu = 16\n",
246 |     "num_gpus = 8\n",
247 |     "\n",
248 |     "(new_sym, new_args) = get_fine_tune_model(sym, arg_params, num_classes)\n",
249 |     "\n",
250 |     "batch_size = batch_per_gpu * num_gpus\n",
251 |     "(train, val) = get_iterators(batch_size)\n",
252 |     "mod_score = fit(new_sym, new_args, aux_params, train, val, batch_size, num_gpus)\n",
253 |     "assert mod_score > 0.77, \"Low training accuracy.\""
254 |    ]
255 |   },
256 |   {
257 |    "cell_type": "markdown",
258 |    "metadata": {},
259 |    "source": [
260 |     "You will see that, after only 8 epochs, we can get 78% validation accuracy. This\n",
261 |     "matches the state-of-the-art results training on caltech-256 alone,\n",
262 |     "e.g. [VGG](http://www.robots.ox.ac.uk/~vgg/research/deep_eval/).\n",
263 |     "\n",
264 |     "Next, we try to use another pretrained model. This model was trained on the\n",
265 |     "complete Imagenet dataset, which is 10x larger than the Imagenet 1K classes\n",
266 |     "version, and uses a 3x deeper Resnet architecture."
267 |    ]
268 |   },
269 |   {
270 |    "cell_type": "code",
271 |    "execution_count": null,
272 |    "metadata": {},
273 |    "outputs": [],
274 |    "source": [
275 |     "get_model('http://data.mxnet.io/models/imagenet-11k/resnet-152/resnet-152', 0)\n",
276 |     "sym, arg_params, aux_params = mx.model.load_checkpoint('resnet-152', 0)\n",
277 |     "(new_sym, new_args) = get_fine_tune_model(sym, arg_params, num_classes)\n",
278 |     "mod_score = fit(new_sym, new_args, aux_params, train, val, batch_size, num_gpus)\n",
279 |     "assert mod_score > 0.86, \"Low training accuracy.\""
280 |    ]
281 |   },
282 |   {
283 |    "cell_type": "markdown",
284 |    "metadata": {},
285 |    "source": [
286 |     "\n",
287 |     "\n",
288 |     "As can be seen, even for a single data epoch, it reaches 83% validation\n",
289 |     "accuracy. After 8 epochs, the validation accuracy increases to 86.4%.\n",
290 |     "\n",
291 |     "<!-- INSERT SOURCE DOWNLOAD BUTTONS -->\n",
292 |     "\n"
293 |    ]
294 |   }
295 |  ],
296 |  "metadata": {
297 |   "display_name": "",
298 |   "kernelspec": {
299 |    "display_name": "Python 2",
300 |    "language": "python",
301 |    "name": "python2"
302 |   },
303 |   "language_info": {
304 |    "codemirror_mode": {
305 |     "name": "ipython",
306 |     "version": 2
307 |    },
308 |    "file_extension": ".py",
309 |    "mimetype": "text/x-python",
310 |    "name": "python",
311 |    "nbconvert_exporter": "python",
312 |    "pygments_lexer": "ipython2",
313 |    "version": "2.7.12"
314 |   },
315 |   "name": ""
316 |  },
317 |  "nbformat": 4,
318 |  "nbformat_minor": 2
319 | }
320 | 


--------------------------------------------------------------------------------
/solutions/tianchi-第三届阿里云安全算法挑战赛/gbm.py:
--------------------------------------------------------------------------------
  1 | def train_feature(id, path, part):
  2 |     df = pd.read_hdf(path + str(id) + '.hdf', part)
  3 | 
  4 |     apidict = apidict2.copy()
  5 |     apidict['file_id'] = id
  6 | 
  7 |     # api 序列合并
  8 |     apidict['api'] = ' '.join(df['api'])
  9 | 
 10 |     # api 个数统计 词袋 CountVectorizer
 11 |     for rows in df['api'].value_counts().reset_index().iterrows():
 12 |         apidict[rows[1]['index']] = rows[1]['api']
 13 | 
 14 |     # api 重复次数比例
 15 |     apidict['api_dpulicate_single'] = sum(df['api'].value_counts().reset_index()['api'] - 1) / df.shape[0]
 16 |     # api 连续重复比例
 17 |     apidict['api_dpulicate_2ngram'] = sum(df['api'].iloc[:-1].values == df['api'].iloc[1:].values) / df.shape[0]
 18 | 
 19 |     # index 重复统计
 20 |     apidict['index_dpulicate_flag'] = df.groupby(['tid', 'index'])['api'].nunique().max()
 21 |     apidict['index_dpulicate_radio'] = sum(df.groupby(['tid', 'index'])['api'].nunique() - 1) / df.shape[0]
 22 | 
 23 |     # api 整体统计
 24 |     apidict['api_count'] = df['api'].nunique()
 25 |     apidict['api_count_maxratio'] = df['api'].value_counts()[0] / df.shape[0]
 26 | 
 27 |     # tid 整体统计
 28 |     apidict['tid_count'] = df['tid'].nunique()
 29 |     apidict['tid_max_length'] = df.groupby('tid')['index'].count().max()
 30 | 
 31 |     # return_value 统计
 32 |     apidict['return_value_count'] = df['return_value'].nunique()
 33 |     apidict['return_value=0'] = sum(df['return_value'] == 0) / df.shape[0]
 34 |     apidict['return_value!=0'] = sum(df['return_value'] != 0) / df.shape[0]
 35 |     apidict['return_value==1'] = sum(df['return_value'] == 1) / df.shape[0]
 36 |     apidict['return_value=-1'] = sum(df['return_value'] == -1) / df.shape[0]
 37 | 
 38 |     apidict['tid_first_value'] = df.groupby('tid').first()['return_value'].mean()
 39 |     apidict['tid_first_value!=0'] = sum(df.groupby('tid').first()['return_value'] != 0)
 40 |     apidict['tid_last_value'] = df.groupby('tid').last()['return_value'].mean()
 41 |     apidict['tid_last_value!=0'] = sum(df.groupby('tid').last()['return_value'] != 0)
 42 | 
 43 |     # Behaviour: File, Process, Memory, Register, Network, Service, Other.
 44 | 
 45 |     # 注册表信息，注册表修改信息
 46 |     reg_cols = ['RegOpenKeyExW', 'RegQueryValueExW', 'RegCloseKey', 'RegOpenKeyExA', 'RegQueryValueExA',
 47 |                'RegEnumKeyExW', 'RegQueryInfoKeyW', 'RegEnumValueW', 'RegEnumKeyW', 'RegCreateKeyExW',
 48 |                'RegSetValueExW', 'RegEnumValueA', 'RegDeleteValueW', 'RegCreateKeyExA', 'RegEnumKeyExA',
 49 |                'RegSetValueExA', 'RegDeleteValueA', 'RegDeleteKeyW', 'RegQueryInfoKeyA', 'RegDeleteKeyA']
 50 | 
 51 |     regalter_cols = ['RegCreateKeyExW', 'RegSetValueExW', 'RegDeleteValueW', 'RegCreateKeyExA',
 52 |                'RegSetValueExA', 'RegDeleteValueA', 'RegDeleteKeyW', 'RegDeleteKeyA']
 53 | 
 54 |     apidict['reg_info'] = int(df[df['api'].isin(reg_cols)].shape[0] > 1)
 55 |     apidict['reg_info_ratio'] = df[df['api'].isin(reg_cols)].shape[0] / df.shape[0]
 56 | 
 57 |     apidict['reg_infoalter'] = int(df[df['api'].isin(regalter_cols)].shape[0] > 1)
 58 |     apidict['reg_infoalter_ratio'] = df[df['api'].isin(regalter_cols)].shape[0] / df.shape[0]
 59 |     apidict['reg_infoalter_ratio2'] = df[df['api'].isin(regalter_cols)].shape[0] / (df[df['api'].isin(reg_cols)].shape[0]+1)
 60 | 
 61 |     # 网络信息
 62 |     network_cols = ['InternetCrackUrlA', 'InternetSetOptionA', 'InternetGetConnectedState', 'InternetOpenW',
 63 |                    'InternetSetStatusCallback', 'InternetConnectW', 'InternetQueryOptionA', 'InternetCloseHandle',
 64 |                    'InternetOpenA', 'InternetConnectA', 'InternetOpenUrlA', 'InternetReadFile',
 65 |                    'InternetGetConnectedStateExW', 'InternetGetConnectedStateExA', 'InternetWriteFile']
 66 |     apidict['network_info'] = int(df[df['api'].isin(network_cols)].shape[0] > 1)
 67 |     apidict['network_ratio'] = df[df['api'].isin(network_cols)].shape[0] / df.shape[0]
 68 | 
 69 |     # 内存信息
 70 |     memory_cols = ['NtAllocateVirtualMemory', 'NtFreeVirtualMemory', 'NtProtectVirtualMemory', 'WriteProcessMemory',
 71 |                   'ReadProcessMemory', 'NtReadVirtualMemory', 'CryptProtectMemory', 'CryptUnprotectMemory', 'NtWriteVirtualMemory']
 72 |     apidict['memory_info'] = int(df[df['api'].isin(memory_cols)].shape[0] > 1)
 73 |     apidict['memory_ratio'] = df[df['api'].isin(memory_cols)].shape[0] / df.shape[0]
 74 | 
 75 |     # 文件信息
 76 |     file_cols = ['NtCreateFile', 'NtWriteFile', 'NtQueryAttributesFile', 'GetFileVersionInfoSizeW', 'GetFileVersionInfoW',
 77 |                 'NtSetInformationFile', 'NtDeviceIoControlFile', 'NtOpenFile', 'FindFirstFileExW', 'GetFileAttributesW',
 78 |                 'DeleteFileW', 'CopyFileA', 'SetFilePointer', 'NtReadFile', 'GetFileType', 'SetFileTime',
 79 |                 'CopyFileW', 'MoveFileWithProgressW', 'CopyFileExW', 'NtDeleteFile']
 80 |     filealter_cols = ['NtCreateFile', 'NtWriteFile',
 81 |                 'DeleteFileW', 'CopyFileA', 'SetFilePointer', 'SetFileTime',
 82 |                 'CopyFileW', 'MoveFileWithProgressW', 'CopyFileExW', 'NtDeleteFile']
 83 | 
 84 |     apidict['file_info'] = int(df[df['api'].isin(file_cols)].shape[0] > 1)
 85 |     apidict['file_info_ratio'] = df[df['api'].isin(file_cols)].shape[0] / df.shape[0]
 86 | 
 87 |     apidict['file_alter_info'] = int(df[df['api'].isin(filealter_cols)].shape[0] > 1)
 88 |     apidict['file_alter_info_ratio'] = df[df['api'].isin(filealter_cols)].shape[0] / df.shape[0]
 89 |     apidict['file_alter_info_ratio2'] = apidict['file_alter_info_ratio'] / (apidict['file_info_ratio'] + 0.01)
 90 | 
 91 |     # 进程信息
 92 |     thread_cols = ['CreateThread', 'Thread32First', 'Thread32Next', 'NtResumeThread', 'NtCreateThreadEx',
 93 |                    'NtOpenThread', 'NtTerminateThread', 'NtSuspendThread', 'NtGetContextThread'
 94 |                    'CreateRemoteThread', 'NtQueueApcThread', 'RtlCreateUserThread', 'NtSetContextThread',
 95 |                    'CreateRemoteThreadEx', 'NtCreateThread']
 96 |     apidict['thread_info'] = int(df[df['api'].isin(thread_cols)].shape[0] > 1)
 97 |     apidict['thread_ratio'] = df[df['api'].isin(thread_cols)].shape[0] / df.shape[0]
 98 |     apidict['thread_last10_ratio'] = df['api'].isin(thread_cols).iloc[-10:].sum() / 10
 99 |     apidict['Thread32Next_ratio'] = df[df['api'].isin(['Thread32Next'])].shape[0] / df.shape[0]
100 | 
101 |     # 服务信息
102 |     # TODO: 成功创建服务的返回值?
103 |     service_cols = ['OpenServiceA', 'CreateServiceA', 'StartServiceA', 'CreateServiceW', 'StartServiceW',
104 |                     'ControlService', 'DeleteService']
105 |     apidict['service_info'] = int(df[df['api'].isin(reg_cols)].shape[0] > 1)
106 |     apidict['service_ratio'] = df[df['api'].isin(service_cols)].shape[0] / df.shape[0]
107 | 
108 |     # DLL信息
109 |     dll_cols = ['LdrLoadDll', 'LdrUnloadDll', 'LdrGetDllHandle']
110 |     apidict['dll_info'] = int(df[df['api'].isin(dll_cols)].shape[0] > 1)
111 |     apidict['dll_ratio'] = df[df['api'].isin(dll_cols)].shape[0] / df.shape[0]
112 | 
113 |     # 加密信息
114 |     crypt_cols = ['CryptAcquireContextW', 'CryptProtectMemory', 'CryptUnprotectMemory', 'CryptHashData',
115 |                  'CryptAcquireContextA', 'CryptEncrypt', 'CryptExportKey', 'CryptCreateHash', 'CryptDecodeObjectEx',
116 |                  'CryptProtectData', 'CryptDecrypt', 'CryptUnprotectData']
117 |     apidict['crypt_info'] = int(df[df['api'].isin(crypt_cols)].shape[0] > 1)
118 | 
119 |     # 证书信息
120 |     cert_cols = ['CertCreateCertificateContext', 'CertOpenSystemStoreA', 'CertOpenSystemStoreW', 'CertOpenStore',
121 |                 'CertControlStore']
122 |     apidict['cert_info'] = int(df[df['api'].isin(cert_cols)].shape[0] > 1)
123 | 
124 |     # COM信息
125 |     com_cols = ['CoCreateInstance', 'CoCreateInstanceEx', 'CoGetClassObject', 'CoInitializeEx', 'CoInitializeSecurity',
126 |                'CoUninitialize', 'ControlService']
127 |     apidict['com_info'] = int(df[df['api'].isin(com_cols)].shape[0] > 1)
128 | 
129 |     # Find信息
130 |     find_cols = ['FindResourceExW', 'FindResourceA', 'FindFirstFileExW', 'FindWindowA', 'FindResourceW',
131 |                 'FindWindowW', 'FindResourceExA', 'FindWindowExW', 'FindFirstFileExA', 'FindWindowExA']
132 |     apidict['find_info'] = int(df[df['api'].isin(find_cols)].shape[0] > 1)
133 | 
134 |     # Console 信息
135 |     console_cols = ['WriteConsoleA', 'WriteConsoleW']
136 |     apidict['console_cols'] = int(df[df['api'].isin(console_cols)].shape[0] > 1)
137 | 
138 |     # Control 信息
139 |     control_cols = ['NtDeviceIoControlFile', 'DeviceIoControl', 'ControlService', 'CertControlStore']
140 |     apidict['control_cols'] = int(df[df['api'].isin(control_cols)].shape[0] > 1)
141 | 
142 |     # Socket 信息
143 |     socket_cols = ['socket', 'setsockopt', 'closesocket', 'getsockname', 'WSASocketW', 'WSASocketA', 'ioctlsocket']
144 |     apidict['socket_cols'] = int(df[df['api'].isin(socket_cols)].shape[0] > 1)
145 | 
146 |     # Ldr 信息
147 |     ldr_cols  = ['LdrLoadDll', 'LdrGetProcedureAddress', 'LdrUnloadDll', 'LdrGetDllHandle']
148 |     apidict['ldr_info'] = int(df[df['api'].isin(ldr_cols)].shape[0] > 1)
149 |     apidict['ldr_ratio'] = df['api'].isin(ldr_cols).iloc[-10:].sum() / 10
150 | 
151 |     # Resource 信息
152 |     res_cols = ['FindResourceExW', 'LoadResource', 'FindResourceA', 'SizeofResource', 'FindResourceExA']
153 |     apidict['resource_info'] = int(df[df['api'].isin(res_cols)].shape[0] > 1)
154 | 
155 |     # Hook 信息
156 |     hook_cols = ['SetWindowsHookExA', 'SetWindowsHookExW', 'UnhookWindowsHookEx']
157 |     apidict['hook_info'] = int(df[df['api'].isin(hook_cols)].shape[0] > 1)
158 | 
159 |     # Information 信息
160 |     information_cols = ['NtSetInformationFile', 'NtQuerySystemInformation', 'GetTimeZoneInformation',
161 |                        'NtQueryInformationFile', 'GetFileInformationByHandleEx', 'GetFileInformationByHandle',
162 |                        'SetInformationJobObject', 'SetFileInformationByHandle', 'NetGetJoinInformation']
163 |     apidict['information_info'] = int(df[df['api'].isin(information_cols)].shape[0] > 1)
164 | 
165 |     # Nt 信息
166 |     # TODO
167 | 
168 |     # Attributes 信息
169 |     attr_cols = ['NtQueryAttributesFile', 'GetFileAttributesW', 'NtQueryFullAttributesFile',
170 |                 'SetFileAttributesW']
171 |     apidict['attr_info'] = int(df[df['api'].isin(attr_cols)].shape[0] > 1)
172 | 
173 |     # Buffer 信息
174 |     buffer_cols = ['RtlCompressBuffer', 'RtlDecompressBuffer']
175 |     apidict['buffer_info'] = int(df[df['api'].isin(buffer_cols)].shape[0] > 1)
176 | 
177 |     # Module 信息
178 |     module_cols = ['Module32FirstW', 'Module32NextW']
179 |     apidict['module_info'] = int(df[df['api'].isin(module_cols)].shape[0] > 1)
180 | 
181 |     # reg_info network_info memory_info file_info thread_info service_info find_info ldr_info resource_info information_info
182 |     apidict['type_info1'] = apidict['reg_info'] * apidict['network_info'] * apidict['memory_info']
183 |     apidict['type_info2'] = apidict['network_info'] * apidict['service_info'] * apidict['reg_info']
184 |     apidict['type_info2'] = apidict['ldr_info'] * apidict['file_info'] * apidict['reg_info']
185 |     apidict['type_info3'] = apidict['ldr_info'] * apidict['thread_info'] * apidict['memory_info']
186 |     apidict['type_info4'] = apidict['crypt_info'] * apidict['find_info'] * apidict['network_info']
187 |     apidict['type_info5'] = apidict['resource_info'] * apidict['information_info'] * apidict['network_info']
188 |     apidict['type_info6'] = apidict['crypt_info'] * apidict['memory_info'] * apidict['network_info']
189 | 
190 |     return apidict
191 | 
def train_feature0816(id, path, part):
    """Unfinished feature builder; currently only seeds the feature dict.

    Reads ``path + str(id) + '.hdf'`` (HDF key ``part``), copies the
    module-level ``apidict2`` and records the sample id.  The features listed
    in the TODO notes below are not implemented yet.

    Args:
        id: Sample identifier; used in the file name and stored as ``'file_id'``.
        path: String prefix (directory) the file name is appended to.
        part: Key of the table inside the HDF file.

    Returns:
        dict: The copied template with ``'file_id'`` set.  The original stub
        fell off the end and returned ``None``; returning the dict matches
        ``train_feature``.
    """
    # Loaded for the planned features below; also fails early on a missing file.
    df = pd.read_hdf(path + str(id) + '.hdf', part)

    apidict = apidict2.copy()
    apidict['file_id'] = id

    # TODO: return-value analysis -- did the API call succeed?
    #   whether the value is zero, and the range of values it takes

    # TODO: use the meaning / purpose of each API to derive new combined features

    # TODO: analysis of malware call sequences

    return apidict
204 | 


--------------------------------------------------------------------------------
/solutions/点石-Retention Rate of Baidu Hao Kan APP Users/1_splitdf.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | 
 4 | import os, sys, time, codecs, glob
 5 | from tqdm import tqdm, tqdm_notebook
 6 | 
 7 | def read_input(debug=True):
 8 |     if debug:
 9 |         nrows = 100000
10 |     else:
11 |         nrows = None
12 | 
13 |     train = pd.read_csv('../input/train', sep='\t', nrows=nrows,
14 |                 names=['user_id', 'user_male', 'user_age', 'user_edu', 'user_district', 'label',  'user_install',
15 |                         'video_id', 'video_class', 'video_tag', 'video_creator', 'video_uptime', 'video_duration',
16 |                         'behavior_show', 'behavior_click', 'behavior_recommend', 'behavior_playback', 'behavior_timestamp',
17 |                         'behavior_comment', 'behavior_like', 'behavior_forard'],
18 |                 dtype={'user_id':object, 'video_tag':object})
19 |     test = pd.read_csv('../input/test', sep='\t', nrows=nrows,
20 |                 names=['user_id', 'user_male', 'user_age', 'user_edu', 'user_district',  'user_install',
21 |                         'video_id', 'video_class', 'video_tag', 'video_creator', 'video_uptime', 'video_duration',
22 |                         'behavior_show', 'behavior_click', 'behavior_recommend', 'behavior_playback', 'behavior_timestamp',
23 |                         'behavior_comment', 'behavior_like', 'behavior_forard'])
24 | 
25 | #     train['video_uptime'] = train['video_uptime'].apply(lambda x: timestamp_datetime(x))
26 | #     train['behavior_timestamp'] = train['behavior_timestamp'].apply(lambda x: timestamp_datetime(x / 1000))
27 | #     train['video_tag'] = train['video_tag'].apply(lambda x: x.split('$'))
28 | #     train.sort_values(by=['user_id', 'behavior_timestamp'], inplace=True)
29 | 
30 | 
31 | #     test['video_uptime'] = test['video_uptime'].apply(lambda x: timestamp_datetime(x))
32 | #     test['behavior_timestamp'] = test['behavior_timestamp'].apply(lambda x: timestamp_datetime(x / 1000))
33 | #     test['video_tag'] = test['video_tag'].apply(lambda x: x.split('$'))
34 | #     test.sort_values(by=['user_id', 'behavior_timestamp'], inplace=True)
35 | 
36 |     return train, test
37 | 
def _split_by_user(frame, out_dir):
    """Write one CSV per ``user_id`` of *frame* to ``<out_dir>/<user_id>.csv``.

    Users are written in order of first appearance and each file keeps the
    original row order.  Grouping replaces the original cumulative-offset
    slicing, which recomputed the start offset on every iteration and was only
    correct when each user's rows were contiguous.
    """
    os.makedirs(out_dir, exist_ok=True)
    per_user = frame.groupby('user_id', sort=False)
    for user_id, user_rows in tqdm(per_user, total=per_user.ngroups):
        user_rows.to_csv(os.path.join(out_dir, '{0}.csv'.format(str(user_id))), index=None)


train, test = read_input(debug=False)

# Splitting the train set is disabled, as in the original script; enable with:
# _split_by_user(train, './train')

_split_by_user(test, './test')
65 | 


--------------------------------------------------------------------------------
/solutions/点石-Retention Rate of Baidu Hao Kan APP Users/2_baseline_1128.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: UTF-8 -*-
  2 | import pandas as pd
  3 | import numpy as np
  4 | import lightgbm as lgb
  5 | 
  6 | import os, sys, time, codecs, glob
  7 | from tqdm import tqdm, tqdm_notebook
  8 | 
  9 | from sklearn.metrics import log_loss, classification_report
 10 | from sklearn.externals.joblib import Parallel, delayed
 11 | 
 12 | def timestamp_datetime(value):
 13 |     value = time.localtime(value)
 14 |     dt = time.strftime('%Y-%m-%d %H:%M:%S', value)
 15 |     return dt
 16 | 
 17 | def feature_agg(i, path):
 18 |     if i % 10000 == 0:
 19 |         print(i, path)
 20 |     # print(path)
 21 |     
 22 |     df = pd.read_hdf(path)
 23 |     df['behavior_timestamp'] = pd.to_datetime(df['behavior_timestamp'].apply(lambda x: timestamp_datetime(x / 1000)))
 24 |     df['video_uptime'] = pd.to_datetime(df['video_uptime'].apply(lambda x: timestamp_datetime(x)))
 25 |     
 26 |     featdict = {}
 27 |     featdict['user_id'] = df['user_id'].iloc[0]
 28 |     
 29 |     # user_male 用户性别
 30 |     featdict['user_male_male'] = 0
 31 |     featdict['user_male_female'] = 0
 32 |     featdict['user_male_nan'] = 0
 33 |     if df['user_male'].value_counts().index[0] == '男':
 34 |         featdict['user_male_male'] = 1
 35 |     elif df['user_male'].value_counts().index[0] == '女':
 36 |         featdict['user_male_female'] = 1
 37 |     else:
 38 |         featdict['user_male_nan'] = 1
 39 |     featdict['user_male_NUNIQUE'] = df['user_male'].nunique()
 40 |     featdict['user_male_COUNT'] = df['user_male'].count()
 41 |     featdict['user_male_NAN'] = sum(df['user_male'] == '-') / df.shape[0]
 42 |     
 43 |     # user_age 用户年龄
 44 |     age_dict = {
 45 |         '-': -1,
 46 |         '18以下': 16,
 47 |         '18-24': 20,
 48 |         '25-34': 27,
 49 |         '35-44': 40,
 50 |         '45-54': 50,
 51 |         '55-64': 60,
 52 |         '65以上': 70,
 53 |     }
 54 |     featdict['user_age'] = age_dict[df['user_age'].value_counts().index[0]]
 55 |     if featdict['user_age'] == -1:
 56 |         featdict['user_age_nan'] = 1
 57 |     else:
 58 |         featdict['user_age_nan'] = 0
 59 |     featdict['user_age_NUNIQUE'] = df['user_age'].nunique()
 60 |     
 61 |     # user_edu 用户教育程度
 62 |     edu_dict = {
 63 |         '-': -1,
 64 |         '高中及以下': 1,
 65 |         '大专': 2,
 66 |         '本科及以上': 3,
 67 |     }
 68 |     featdict['user_edu'] = edu_dict[df['user_edu'].value_counts().index[0]]
 69 |     if featdict['user_edu'] == -1:
 70 |         featdict['user_edu_nan'] = 1
 71 |     else:
 72 |         featdict['user_edu_nan'] = 0
 73 |     featdict['user_edu_NUNIQUE'] = df['user_edu'].nunique()
 74 |     featdict['user_edu_NAN'] = sum(df['user_edu'] == '-') / df.shape[0]
 75 |     
 76 |     # user_install 用户安装渠道
 77 |     install_lbl = ['ctn_1', 'ctn_1005', 'ctn_1018', 'ctn_1029', 'ctn_1042', 'ctn_1043', 
 78 |      'ctn_112', 'ctn_13', 'ctn_14', 'ctn_144', 'ctn_149', 'ctn_15', 'ctn_150',
 79 |      'ctn_151', 'ctn_159', 'ctn_16', 'ctn_160', 'ctn_161', 'ctn_163', 'ctn_17',
 80 |      'ctn_185', 'ctn_188', 'ctn_2', 'ctn_20', 'ctn_202', 'ctn_23', 'ctn_239',
 81 |      'ctn_24', 'ctn_240', 'ctn_27', 'ctn_29', 'ctn_308', 'ctn_341', 'ctn_358',
 82 |      'ctn_368', 'ctn_371', 'ctn_430', 'ctn_484', 'ctn_487', 'ctn_5', 'ctn_55',
 83 |      'ctn_664', 'ctn_666', 'ctn_745', 'ctn_746', 'ctn_772', 'ctn_875', 'ctn_89',
 84 |      'ctn_921', 'ctn_110']
 85 |     ctns = df['user_install'].unique()
 86 |     for ctn in install_lbl:
 87 |         if ctn in ctns:
 88 |             featdict[ctn] = 1
 89 |         else:
 90 |             featdict[ctn] = 0
 91 |     featdict['user_install_NUNIQUE'] = df['user_install'].nunique()
 92 |     
 93 |     # video_id 
 94 |     featdict['video_id_NUNIQUE'] = df['video_id'].nunique()
 95 |     featdict['video_class_NUNIQUE'] = df['video_class'].nunique()
 96 |     featdict['video_duration_NUNIQUE'] = df['video_duration'].nunique()
 97 |     
 98 |     # behavior_show 展现的比例
 99 |     show_counts = df['behavior_show'].value_counts()
100 |     featdict['behavior_show_flag'] = int('1' in show_counts.index)
101 |     if featdict['behavior_show_flag']:
102 |         featdict['behavior_show_ratio'] = show_counts['1'] / show_counts.sum()
103 |     else:
104 |         featdict['behavior_show_ratio'] = 0
105 |     
106 |     # behavior_click 点击的比例
107 |     click_counts = df['behavior_click'].value_counts()
108 |     featdict['behavior_click_flag'] = int('1' in click_counts.index)
109 |     if featdict['behavior_click_flag']:
110 |         featdict['behavior_click_ratio'] = click_counts['1'] / click_counts.sum()
111 |         if '0' in click_counts.index:
112 |             featdict['behavior_show_notclick_ratio'] = click_counts['1'] / (click_counts['1'] + click_counts['0'])
113 | 
114 |         else:
115 |             featdict['behavior_show_notclick_ratio'] = 0
116 |     else:
117 |         featdict['behavior_click_ratio'] = 0
118 |         featdict['behavior_show_notclick_ratio'] = 0
119 |     
120 |     featdict['behavior_recommend_NUNIQUE'] = df['behavior_recommend'].nunique()
121 |     
122 |     df_tmp = df[df['behavior_playback'] != '-']
123 |     if df_tmp.shape[0] == 0:
124 |         featdict['behavior_playback_mean'] = 0
125 |         featdict['behavior_playback_mean2'] = 0
126 |         featdict['behavior_playback_max'] = 0
127 |         featdict['behavior_playback_sum'] = 0
128 |         featdict['behavior_playback_ratio'] = 0
129 |         
130 |         featdict['behavior_comment_ratio'] = 0
131 |         featdict['behavior_like_ratio'] = 0
132 |         featdict['behavior_forard_ratio'] = 0
133 |         
134 |         featdict['behavior_playback_video_mean'] = 0
135 |         featdict['behavior_playback_video_min'] = 0
136 |         featdict['behavior_playback_video_max'] = 0
137 |     else:
138 |         featdict['behavior_playback_mean'] = df_tmp['behavior_playback'].astype(float).mean()
139 |         featdict['behavior_playback_mean2'] = df_tmp[df_tmp['behavior_playback'] != 0]['behavior_playback'].astype(float).mean()
140 |         featdict['behavior_playback_max'] = df_tmp['behavior_playback'].astype(float).max()
141 |         featdict['behavior_playback_sum'] = df_tmp['behavior_playback'].astype(float).sum()
142 |         featdict['behavior_playback_ratio'] = df_tmp[df_tmp['behavior_playback'] != 0].shape[0] / df_tmp.shape[0]
143 |         
144 |         featdict['behavior_comment_ratio'] = df_tmp[df_tmp['behavior_comment'] == 1].shape[0] / df_tmp.shape[0]   
145 |         featdict['behavior_like_ratio'] = df_tmp[df_tmp['behavior_like'] == 1].shape[0] / df_tmp.shape[0]
146 |         featdict['behavior_forard_ratio'] = df_tmp[df_tmp['behavior_forard'] == 1].shape[0] / df_tmp.shape[0]
147 |         
148 |         df_tmp['behavior_playback_div_video_duration'] = df_tmp['behavior_playback'].astype(float) / df_tmp['video_duration']
149 |         featdict['behavior_playback_video_mean'] = df_tmp['behavior_playback_div_video_duration'].mean()
150 |         featdict['behavior_playback_video_min'] = df_tmp['behavior_playback_div_video_duration'].max()
151 |         featdict['behavior_playback_video_max'] = df_tmp['behavior_playback_div_video_duration'].min()
152 |     
153 |     featdict['behavior_timestamp_month_NUNIQUE'] = df['behavior_timestamp'].dt.month.nunique()
154 |     featdict['behavior_timestamp_day_NUNIQUE'] = df['behavior_timestamp'].dt.day.nunique()
155 |     featdict['behavior_timestamp_hour_NUNIQUE'] = df['behavior_timestamp'].dt.hour.nunique()
156 |     featdict['behavior_timestamp_minute_NUNIQUE'] = df['behavior_timestamp'].dt.minute.nunique()
157 |     
158 |     featdict['behavior_playback_mean_day'] = featdict['behavior_playback_sum'] / featdict['behavior_timestamp_day_NUNIQUE']
159 |     featdict['behavior_playback_mean_hour'] = featdict['behavior_playback_sum'] / featdict['behavior_timestamp_hour_NUNIQUE']
160 |     
161 |     hour_unique = df['behavior_timestamp'].dt.hour.unique() 
162 |     for hour in range(24):
163 |         if hour in hour_unique:
164 |             featdict['behavior_timestamp_hour' + str(hour)] = 1
165 |         else:
166 |             featdict['behavior_timestamp_hour' + str(hour)] = 0        
167 |     
168 |     featdict['behavior_timestamp_likeday_NUNIQUE'] = df['behavior_timestamp'].dt.minute.nunique()
169 |     
170 |     # df_tmp = df[df['behavior_click'] != '-']
171 |     # featdict['behavior_timestamp_click_month_NUNIQUE'] = df_tmp['behavior_timestamp'].dt.month.nunique()
172 |     # featdict['behavior_timestamp_click_day_NUNIQUE'] = df_tmp['behavior_timestamp'].dt.day.nunique()
173 |     # featdict['behavior_timestamp_click_hour_NUNIQUE'] = df_tmp['behavior_timestamp'].dt.hour.nunique()
174 |     # featdict['behavior_timestamp_click_minute_NUNIQUE'] = df_tmp['behavior_timestamp'].dt.minute.nunique()
175 |     
176 |     return featdict
177 | 
# Id tables: one row per user. `train_id` is merged back onto the training
# features below to attach its remaining columns (presumably the `label`
# target that the training step reads - confirm against train_id.csv).
train_id = pd.read_csv('./train_id.csv')
test_id = pd.read_csv('./test_id.csv')

# One feature_agg job per user log file, spread over 30 joblib workers.
# Each job returns a flat dict, so the list of dicts becomes one row per user.
train_feat = pd.DataFrame(
    Parallel(n_jobs=30)(
        delayed(feature_agg)(row_no, './train/' + uid + '.hdf')
        for row_no, uid in enumerate(train_id['user_id'])
    )
)
test_feat = pd.DataFrame(
    Parallel(n_jobs=30)(
        delayed(feature_agg)(row_no, './test/' + uid + '.hdf')
        for row_no, uid in enumerate(test_id['user_id'])
    )
)

train_feat = train_feat.merge(train_id, on='user_id', how='left')
187 | 
# First LightGBM configuration. NOTE: `params` is reassigned further down
# before the classifier is built, so this version is only referenced by the
# commented-out lgb.cv / lgb.train experiments that follow it.
params = dict(
    learning_rate=0.01,
    min_child_samples=5,
    max_depth=-1,
    lambda_l1=2,
    boosting='gbdt',
    objective='binary',
    n_estimators=2000,
    metric='auc',
    # num_class=6,
    feature_fraction=.85,
    bagging_fraction=.85,
    seed=99,
    num_threads=20,
    verbose=-1,
)
204 | 
205 | # cv_results1 = lgb.cv(
206 | #         params,
207 | #         lgb.Dataset(train_feat.drop(['user_id', 'label'], axis=1).values, label=train_feat['label'].values),
208 | #         num_boost_round=200,
209 | #         nfold=7, verbose_eval=False,
210 | #         early_stopping_rounds=200,
211 | # )
212 | # print('CV AUC: ', len(cv_results1['auc-mean']), cv_results1['auc-mean'][-1])
213 | 
214 | # clf = lgb.train(
215 | #         params,
216 | #         lgb.Dataset(train_feat.drop(['user_id', 'label'], axis=1).values, label=train_feat['label'].values),
217 | #         num_boost_round=1000)
218 | 
219 | from sklearn.model_selection import StratifiedKFold
220 | from sklearn.metrics import roc_auc_score
221 | 
# Cross-validation setup: number of folds, the stratified splitter and the
# metric used to score each fold.
n_fold = 20
# random_state pins the shuffled fold assignment; without it every run draws
# different folds, so CV scores and the submission are not reproducible.
skf = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=99)
eval_fun = roc_auc_score
225 | 
def run_oof(clf, X_train, y_train, X_test, kf):
    """Fit `clf` on every fold of `kf` and collect out-of-fold / test predictions.

    Parameters
    ----------
    clf : estimator
        Must accept ``fit(X, y, eval_set=..., early_stopping_rounds=...,
        verbose=...)`` and expose ``predict_proba`` returning the positive
        class probability in column 1.
    X_train, y_train : array-like
        Training matrix and labels; both are indexed with the integer index
        arrays yielded by ``kf.split``.
    X_test : array-like
        Test matrix, scored by the model fitted on each fold.
    kf : splitter
        Object whose ``split(X_train, y_train)`` yields
        ``(train_index, test_index)`` pairs.

    Returns
    -------
    (preds_train, preds_test)
        ``preds_train`` holds, for each training row, the prediction made by
        the fold model for which that row was in the validation part;
        ``preds_test`` is the mean of the per-fold test predictions.

    Per-fold scores are computed with the module-level ``eval_fun`` and
    printed as the loop progresses.
    """
    print(clf)
    # Builtin float instead of np.float: that alias was deprecated in
    # NumPy 1.20 and removed in 1.24.
    preds_train = np.zeros(len(X_train), dtype=float)
    preds_test = np.zeros(len(X_test), dtype=float)
    train_loss = []
    test_loss = []

    folds_run = 0
    for fold_no, (train_index, test_index) in enumerate(kf.split(X_train, y_train), start=1):
        x_tr, x_te = X_train[train_index], X_train[test_index]
        y_tr, y_te = y_train[train_index], y_train[test_index]
        # NOTE(review): newer LightGBM releases moved early stopping / verbosity
        # from fit() keyword arguments to callbacks - verify against the
        # installed version before upgrading.
        clf.fit(x_tr, y_tr, eval_set=[(x_te, y_te)], early_stopping_rounds=500, verbose=False)

        # Score the validation part once and reuse it for metric and OOF vector.
        val_pred = clf.predict_proba(x_te)[:, 1]
        train_loss.append(eval_fun(y_tr, clf.predict_proba(x_tr)[:, 1]))
        test_loss.append(eval_fun(y_te, val_pred))

        preds_train[test_index] = val_pred
        preds_test += clf.predict_proba(X_test)[:, 1]

        print('{0}: Train {1:0.7f} Val {2:0.7f}/{3:0.7f}'.format(fold_no, train_loss[-1], test_loss[-1], np.mean(test_loss)))
        print('-' * 50)
        folds_run = fold_no
    print('Train: ', train_loss)
    print('Val: ', test_loss)
    print('-' * 50)
    print('Train{0:0.5f}_Test{1:0.5f}\n\n'.format(np.mean(train_loss), np.mean(test_loss)))
    # Average over the folds that actually ran rather than a module-level
    # constant, so the result stays correct for any splitter passed as `kf`.
    if folds_run:
        preds_test /= folds_run
    return preds_train, preds_test
253 | 
# LightGBM configuration actually used for training: this assignment
# replaces the earlier `params` and is the one unpacked into LGBMClassifier.
params = dict(
    learning_rate=0.01,
    min_child_samples=5,
    max_depth=-1,
    lambda_l1=5,
    boosting='gbdt',
    objective='binary',
    n_estimators=5000,
    metric='auc',
    # num_class=6,
    feature_fraction=.75,
    bagging_fraction=.85,
    seed=99,
    num_threads=20,
    verbose=-1,
)
270 | 
# Cross-validated training: `user_id` (and the target) are dropped so that
# only feature columns reach the model; train and test matrices are passed
# positionally as plain arrays.
oof_inputs = (
    lgb.LGBMClassifier(**params),
    train_feat.drop(['user_id', 'label'], axis=1).values,
    train_feat['label'].values,
    test_feat.drop(['user_id'], axis=1).values,
    skf,
)
train_pred, test_pred = run_oof(*oof_inputs)

# Submission file: user_id + fold-averaged probability, written without
# header or index.
test_feat['label'] = test_pred
submission = test_feat[['user_id', 'label']]
submission.to_csv('baseline.csv', index=None, header=None)


--------------------------------------------------------------------------------
/solutions/点石-Retention Rate of Baidu Hao Kan APP Users/2_baseline_1202.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: UTF-8 -*-
  2 | import pandas as pd
  3 | import numpy as np
  4 | import lightgbm as lgb
  5 | 
  6 | import os, sys, time, codecs, glob
  7 | from tqdm import tqdm, tqdm_notebook
  8 | 
  9 | from sklearn.metrics import log_loss, classification_report
 10 | from sklearn.externals.joblib import Parallel, delayed
 11 | 
 12 | def timestamp_datetime(value):
 13 |     value = time.localtime(value)
 14 |     dt = time.strftime('%Y-%m-%d %H:%M:%S', value)
 15 |     return dt
 16 | 
 17 | def feature_agg(i, path):
 18 |     if i % 10000 == 0:
 19 |         print(i, path)
 20 |     # print(path)
 21 |     
 22 |     df = pd.read_hdf(path)
 23 |     df.reset_index(drop=True, inplace=True)
 24 |     df.sort_values(by='behavior_timestamp', inplace=True)
 25 |     df['behavior_timestamp'] = pd.to_datetime(df['behavior_timestamp'].apply(lambda x: timestamp_datetime(x / 1000)))
 26 |     df['video_uptime'] = pd.to_datetime(df['video_uptime'].apply(lambda x: timestamp_datetime(x)))
 27 |     
 28 |     featdict = {}
 29 |     featdict['user_id'] = df['user_id'].iloc[0]
 30 |     
 31 |     # user_male 用户性别
 32 |     featdict['user_male_male'] = 0
 33 |     featdict['user_male_female'] = 0
 34 |     featdict['user_male_nan'] = 0
 35 |     if df['user_male'].value_counts().index[0] == '男':
 36 |         featdict['user_male_male'] = 1
 37 |     elif df['user_male'].value_counts().index[0] == '女':
 38 |         featdict['user_male_female'] = 1
 39 |     else:
 40 |         featdict['user_male_nan'] = 1
 41 |     featdict['user_male_NUNIQUE'] = df['user_male'].nunique()
 42 |     featdict['user_male_COUNT'] = df['user_male'].count()
 43 |     featdict['user_male_NAN'] = sum(df['user_male'] == '-') / df.shape[0]
 44 |     
 45 |     # user_age 用户年龄
 46 |     age_dict = {
 47 |         '-': -1,
 48 |         '18以下': 16,
 49 |         '18-24': 20,
 50 |         '25-34': 27,
 51 |         '35-44': 40,
 52 |         '45-54': 50,
 53 |         '55-64': 60,
 54 |         '65以上': 70,
 55 |     }
 56 |     featdict['user_age'] = age_dict[df['user_age'].value_counts().index[0]]
 57 |     if featdict['user_age'] == -1:
 58 |         featdict['user_age_nan'] = 1
 59 |     else:
 60 |         featdict['user_age_nan'] = 0
 61 |     featdict['user_age_NUNIQUE'] = df['user_age'].nunique()
 62 |     
 63 |     # user_edu 用户教育程度
 64 |     edu_dict = {
 65 |         '-': -1,
 66 |         '高中及以下': 1,
 67 |         '大专': 2,
 68 |         '本科及以上': 3,
 69 |     }
 70 |     featdict['user_edu'] = edu_dict[df['user_edu'].value_counts().index[0]]
 71 |     if featdict['user_edu'] == -1:
 72 |         featdict['user_edu_nan'] = 1
 73 |     else:
 74 |         featdict['user_edu_nan'] = 0
 75 |     featdict['user_edu_NUNIQUE'] = df['user_edu'].nunique()
 76 |     featdict['user_edu_NAN'] = sum(df['user_edu'] == '-') / df.shape[0]
 77 |     
 78 |     # user_install 用户安装渠道
 79 |     install_lbl = ['ctn_1', 'ctn_1005', 'ctn_1018', 'ctn_1029', 'ctn_1042', 'ctn_1043', 
 80 |      'ctn_112', 'ctn_13', 'ctn_14', 'ctn_144', 'ctn_149', 'ctn_15', 'ctn_150',
 81 |      'ctn_151', 'ctn_159', 'ctn_16', 'ctn_160', 'ctn_161', 'ctn_163', 'ctn_17',
 82 |      'ctn_185', 'ctn_188', 'ctn_2', 'ctn_20', 'ctn_202', 'ctn_23', 'ctn_239',
 83 |      'ctn_24', 'ctn_240', 'ctn_27', 'ctn_29', 'ctn_308', 'ctn_341', 'ctn_358',
 84 |      'ctn_368', 'ctn_371', 'ctn_430', 'ctn_484', 'ctn_487', 'ctn_5', 'ctn_55',
 85 |      'ctn_664', 'ctn_666', 'ctn_745', 'ctn_746', 'ctn_772', 'ctn_875', 'ctn_89',
 86 |      'ctn_921', 'ctn_110']
 87 |     ctns = df['user_install'].unique()
 88 |     for ctn in install_lbl:
 89 |         if ctn in ctns:
 90 |             featdict[ctn] = 1
 91 |         else:
 92 |             featdict[ctn] = 0
 93 |     featdict['user_install_NUNIQUE'] = df['user_install'].nunique()
 94 |     
 95 |     # video_id 
 96 |     featdict['video_id_NUNIQUE'] = df['video_id'].nunique()
 97 |     featdict['video_class_NUNIQUE'] = df['video_class'].nunique()
 98 |     featdict['video_duration_NUNIQUE'] = df['video_duration'].nunique()
 99 |     
100 |     # behavior_show 展现的比例
101 |     # behavior_show 连续出现的比例
102 |     show_counts = df['behavior_show'].value_counts()
103 |     featdict['behavior_show_flag'] = int('1' in show_counts.index)
104 |     featdict['behavior_show_keep'] = sum(pd.Series(df[df['behavior_show'] == '1'].index).diff(1) == 1)
105 |     if featdict['behavior_show_flag']:
106 |         featdict['behavior_show_ratio'] = show_counts['1'] / show_counts.sum()
107 |         featdict['behavior_show_keep_ratio'] = featdict['behavior_show_keep'] / show_counts['1']
108 |     else:
109 |         featdict['behavior_show_ratio'] = 0
110 |         featdict['behavior_show_keep_ratio'] = 0
111 |     
112 |     # 前10/20/50 后10/20/50 behavior_show 展现的比例
113 |     if df.shape[0] < 10:
114 |         featdict['behavior_show_first10_ratio'] = 0
115 |         featdict['behavior_show_last10_ratio'] = 0
116 |     else:
117 |         featdict['behavior_show_first10_ratio'] = int('1' in df.iloc[:10]['behavior_show'].values)
118 |         featdict['behavior_show_last10_ratio'] = int('1' in df.iloc[-10:]['behavior_show'].values)
119 | 
120 |     if df.shape[0] < 20:
121 |         featdict['behavior_show_first20_ratio'] = 0
122 |         featdict['behavior_show_last20_ratio'] = 0
123 |     else:
124 |         featdict['behavior_show_first20_ratio'] = int('1' in df.iloc[:20]['behavior_show'].values)
125 |         featdict['behavior_show_last20_ratio'] = int('1' in df.iloc[-20:]['behavior_show'].values)
126 | 
127 |     if df.shape[0] < 50:
128 |         featdict['behavior_show_first50_ratio'] = 0
129 |         featdict['behavior_show_last50_ratio'] = 0
130 |     else:
131 |         featdict['behavior_show_first50_ratio'] = int('1' in df.iloc[:50]['behavior_show'].values)
132 |         featdict['behavior_show_last50_ratio'] = int('1' in df.iloc[-50:]['behavior_show'].values)    
133 |     
134 |     # behavior_click 点击的比例
135 |     # behavior_click 连续的比例
136 |     click_counts = df['behavior_click'].value_counts()
137 |     featdict['behavior_click_flag'] = int('1' in click_counts.index)
138 |     featdict['behavior_click_keep'] = sum(pd.Series(df[df['behavior_click'] == '1'].index).diff(1) == 1)
139 |     if featdict['behavior_click_flag']:
140 |         featdict['behavior_click_ratio'] = click_counts['1'] / click_counts.sum()
141 |         if '0' in click_counts.index:
142 |             featdict['behavior_show_notclick_ratio'] = click_counts['1'] / (click_counts['1'] + click_counts['0'])
143 |             featdict['behavior_click_keep_ratio'] = featdict['behavior_click_keep']/ click_counts['1']
144 |         else:
145 |             featdict['behavior_show_notclick_ratio'] = 0
146 |             featdict['behavior_click_keep_ratio'] = 0
147 |     else:
148 |         featdict['behavior_click_ratio'] = 0
149 |         featdict['behavior_show_notclick_ratio'] = 0
150 |     
151 |     featdict['behavior_recommend_NUNIQUE'] = df['behavior_recommend'].nunique()
152 |     
153 |     df_tmp = df[df['behavior_playback'] != '-']
154 |     if df_tmp.shape[0] == 0:
155 |         featdict['behavior_playback_mean'] = 0
156 |         featdict['behavior_playback_mean2'] = 0
157 |         featdict['behavior_playback_max'] = 0
158 |         featdict['behavior_playback_sum'] = 0
159 |         featdict['behavior_playback_ratio'] = 0
160 |         
161 |         featdict['behavior_comment_ratio'] = 0
162 |         featdict['behavior_like_ratio'] = 0
163 |         featdict['behavior_forard_ratio'] = 0
164 |         
165 |         featdict['behavior_playback_video_mean'] = 0
166 |         featdict['behavior_playback_video_min'] = 0
167 |         featdict['behavior_playback_video_max'] = 0
168 |     else:
169 |         featdict['behavior_playback_mean'] = df_tmp['behavior_playback'].astype(float).mean()
170 |         featdict['behavior_playback_mean2'] = df_tmp[df_tmp['behavior_playback'] != 0]['behavior_playback'].astype(float).mean()
171 |         featdict['behavior_playback_max'] = df_tmp['behavior_playback'].astype(float).max()
172 |         featdict['behavior_playback_sum'] = df_tmp['behavior_playback'].astype(float).sum()
173 |         featdict['behavior_playback_ratio'] = df_tmp[df_tmp['behavior_playback'] != 0].shape[0] / df_tmp.shape[0]
174 |         
175 |         featdict['behavior_comment_ratio'] = df_tmp[df_tmp['behavior_comment'] == 1].shape[0] / df_tmp.shape[0]   
176 |         featdict['behavior_like_ratio'] = df_tmp[df_tmp['behavior_like'] == 1].shape[0] / df_tmp.shape[0]
177 |         featdict['behavior_forard_ratio'] = df_tmp[df_tmp['behavior_forard'] == 1].shape[0] / df_tmp.shape[0]
178 |         
179 |         df_tmp['behavior_playback_div_video_duration'] = df_tmp['behavior_playback'].astype(float) / df_tmp['video_duration']
180 |         featdict['behavior_playback_video_mean'] = df_tmp['behavior_playback_div_video_duration'].mean()
181 |         featdict['behavior_playback_video_min'] = df_tmp['behavior_playback_div_video_duration'].max()
182 |         featdict['behavior_playback_video_max'] = df_tmp['behavior_playback_div_video_duration'].min()
183 |     
184 |     featdict['behavior_playback_sum_minute'] = featdict['behavior_playback_sum'] % 60
185 |     featdict['behavior_playback_sum_hour'] = featdict['behavior_playback_sum'] % 3600
186 |     
187 |     featdict['behavior_timestamp_month_NUNIQUE'] = df['behavior_timestamp'].dt.month.nunique()
188 |     featdict['behavior_timestamp_day_NUNIQUE'] = df['behavior_timestamp'].dt.day.nunique()
189 |     featdict['behavior_timestamp_hour_NUNIQUE'] = df['behavior_timestamp'].dt.hour.nunique()
190 |     featdict['behavior_timestamp_minute_NUNIQUE'] = df['behavior_timestamp'].dt.minute.nunique()
191 |     
192 |     featdict['behavior_playback_mean_day'] = featdict['behavior_playback_sum'] / featdict['behavior_timestamp_day_NUNIQUE']
193 |     featdict['behavior_playback_mean_hour'] = featdict['behavior_playback_sum'] / featdict['behavior_timestamp_hour_NUNIQUE']
194 |     
195 |     hour_unique = df['behavior_timestamp'].dt.hour.unique() 
196 |     for hour in range(24):
197 |         if hour in hour_unique:
198 |             featdict['behavior_timestamp_hour' + str(hour)] = 1
199 |         else:
200 |             featdict['behavior_timestamp_hour' + str(hour)] = 0        
201 |     
202 |     featdict['behavior_timestamp_likeday_NUNIQUE'] = df['behavior_timestamp'].dt.minute.nunique()
203 |     
204 |     # 使用时间
205 |     behavior_timestamp = df['behavior_timestamp'].iloc[-1] - df['behavior_timestamp'].iloc[0]
206 |     featdict['behavior_timestamp_second'] = behavior_timestamp.seconds
207 |     featdict['behavior_timestamp_minute'] = behavior_timestamp.seconds % 60
208 |     featdict['behavior_timestamp_hour'] = behavior_timestamp.seconds % 3600
209 |     featdict['behavior_timestamp_day'] = behavior_timestamp.seconds % 86400
210 |     if featdict['behavior_timestamp_second'] == 0:
211 |         featdict['behavior_playback_time_ratio'] = 0
212 |     else:
213 |         featdict['behavior_playback_time_ratio'] = featdict['behavior_playback_sum'] / featdict['behavior_timestamp_second']
214 |     
215 |     # df_tmp = df[df['behavior_click'] != '-']
216 |     # featdict['behavior_timestamp_click_month_NUNIQUE'] = df_tmp['behavior_timestamp'].dt.month.nunique()
217 |     # featdict['behavior_timestamp_click_day_NUNIQUE'] = df_tmp['behavior_timestamp'].dt.day.nunique()
218 |     # featdict['behavior_timestamp_click_hour_NUNIQUE'] = df_tmp['behavior_timestamp'].dt.hour.nunique()
219 |     # featdict['behavior_timestamp_click_minute_NUNIQUE'] = df_tmp['behavior_timestamp'].dt.minute.nunique()
220 |     
221 |     return featdict
222 | 
# Id tables: one row per user. They were previously read twice in a row; a
# single read is enough. `train_id` is merged back onto the training features
# below to attach its remaining columns (presumably the `label` target that
# the training step reads - confirm against train_id.csv).
train_id = pd.read_csv('./train_id.csv')
test_id = pd.read_csv('./test_id.csv')

# One feature_agg job per user log file, spread over 30 joblib workers.
# The loop variable is named `uid` so the builtin `id` is not shadowed.
train_feat = Parallel(n_jobs=30)(
    delayed(feature_agg)(i, './train/' + uid + '.hdf')
    for i, uid in enumerate(train_id['user_id'])
)
test_feat = Parallel(n_jobs=30)(
    delayed(feature_agg)(i, './test/' + uid + '.hdf')
    for i, uid in enumerate(test_id['user_id'])
)
# Each job returns a flat dict -> one DataFrame row per user.
train_feat = pd.DataFrame(train_feat)
test_feat = pd.DataFrame(test_feat)

train_feat = pd.merge(train_feat, train_id, on='user_id', how='left')
235 | 
# First LightGBM configuration. NOTE: `params` is reassigned further down
# before the classifier is built, so this version is only referenced by the
# commented-out lgb.cv / lgb.train experiments that follow it.
params = dict(
    learning_rate=0.01,
    min_child_samples=5,
    max_depth=-1,
    lambda_l1=2,
    boosting='gbdt',
    objective='binary',
    n_estimators=2000,
    metric='auc',
    # num_class=6,
    feature_fraction=.85,
    bagging_fraction=.85,
    seed=99,
    num_threads=20,
    verbose=-1,
)
252 | 
253 | # cv_results1 = lgb.cv(
254 | #         params,
255 | #         lgb.Dataset(train_feat.drop(['user_id', 'label'], axis=1).values, label=train_feat['label'].values),
256 | #         num_boost_round=200,
257 | #         nfold=7, verbose_eval=False,
258 | #         early_stopping_rounds=200,
259 | # )
260 | # print('CV AUC: ', len(cv_results1['auc-mean']), cv_results1['auc-mean'][-1])
261 | 
262 | # clf = lgb.train(
263 | #         params,
264 | #         lgb.Dataset(train_feat.drop(['user_id', 'label'], axis=1).values, label=train_feat['label'].values),
265 | #         num_boost_round=1000)
266 | 
267 | from sklearn.model_selection import StratifiedKFold
268 | from sklearn.metrics import roc_auc_score
269 | 
# Cross-validation setup: number of folds, the stratified splitter and the
# metric used to score each fold.
n_fold = 10
# random_state pins the shuffled fold assignment; without it every run draws
# different folds, so CV scores and the submission are not reproducible.
skf = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=99)
eval_fun = roc_auc_score
273 | 
def run_oof(clf, X_train, y_train, X_test, kf):
    """Fit `clf` on every fold of `kf` and collect out-of-fold / test predictions.

    Parameters
    ----------
    clf : estimator
        Must accept ``fit(X, y, eval_set=..., early_stopping_rounds=...,
        verbose=...)`` and expose ``predict_proba`` returning the positive
        class probability in column 1.
    X_train, y_train : array-like
        Training matrix and labels; both are indexed with the integer index
        arrays yielded by ``kf.split``.
    X_test : array-like
        Test matrix, scored by the model fitted on each fold.
    kf : splitter
        Object whose ``split(X_train, y_train)`` yields
        ``(train_index, test_index)`` pairs.

    Returns
    -------
    (preds_train, preds_test)
        ``preds_train`` holds, for each training row, the prediction made by
        the fold model for which that row was in the validation part;
        ``preds_test`` is the mean of the per-fold test predictions.

    Per-fold scores are computed with the module-level ``eval_fun`` and
    printed as the loop progresses.
    """
    print(clf)
    # Builtin float instead of np.float: that alias was deprecated in
    # NumPy 1.20 and removed in 1.24.
    preds_train = np.zeros(len(X_train), dtype=float)
    preds_test = np.zeros(len(X_test), dtype=float)
    train_loss = []
    test_loss = []

    folds_run = 0
    for fold_no, (train_index, test_index) in enumerate(kf.split(X_train, y_train), start=1):
        x_tr, x_te = X_train[train_index], X_train[test_index]
        y_tr, y_te = y_train[train_index], y_train[test_index]
        # NOTE(review): newer LightGBM releases moved early stopping / verbosity
        # from fit() keyword arguments to callbacks - verify against the
        # installed version before upgrading.
        clf.fit(x_tr, y_tr, eval_set=[(x_te, y_te)], early_stopping_rounds=500, verbose=False)

        # Score the validation part once and reuse it for metric and OOF vector.
        val_pred = clf.predict_proba(x_te)[:, 1]
        train_loss.append(eval_fun(y_tr, clf.predict_proba(x_tr)[:, 1]))
        test_loss.append(eval_fun(y_te, val_pred))

        preds_train[test_index] = val_pred
        preds_test += clf.predict_proba(X_test)[:, 1]

        print('{0}: Train {1:0.7f} Val {2:0.7f}/{3:0.7f}'.format(fold_no, train_loss[-1], test_loss[-1], np.mean(test_loss)))
        print('-' * 50)
        folds_run = fold_no
    print('Train: ', train_loss)
    print('Val: ', test_loss)
    print('-' * 50)
    print('Train{0:0.5f}_Test{1:0.5f}\n\n'.format(np.mean(train_loss), np.mean(test_loss)))
    # Average over the folds that actually ran rather than a module-level
    # constant, so the result stays correct for any splitter passed as `kf`.
    if folds_run:
        preds_test /= folds_run
    return preds_train, preds_test
301 | 
# LightGBM configuration actually used for training: this assignment
# replaces the earlier `params` and is the one unpacked into LGBMClassifier.
params = dict(
    learning_rate=0.01,
    min_child_samples=5,
    max_depth=-1,
    lambda_l1=5,
    boosting='gbdt',
    objective='binary',
    n_estimators=5000,
    metric='auc',
    # num_class=6,
    feature_fraction=.75,
    bagging_fraction=.85,
    seed=99,
    num_threads=20,
    verbose=-1,
)
318 | 
# Cross-validated training: `user_id` (and the target) are dropped so that
# only feature columns reach the model; train and test matrices are passed
# positionally as plain arrays.
oof_inputs = (
    lgb.LGBMClassifier(**params),
    train_feat.drop(['user_id', 'label'], axis=1).values,
    train_feat['label'].values,
    test_feat.drop(['user_id'], axis=1).values,
    skf,
)
train_pred, test_pred = run_oof(*oof_inputs)

# Submission file: user_id + fold-averaged probability, written without
# header or index.
test_feat['label'] = test_pred
submission = test_feat[['user_id', 'label']]
submission.to_csv('baseline.csv', index=None, header=None)


--------------------------------------------------------------------------------
/solutions/点石-Retention Rate of Baidu Hao Kan APP Users/2_baseline_1203_Train0.75989_Test0.75627.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: UTF-8 -*-
  2 | from __future__ import absolute_import
  3 | from __future__ import division
  4 | from __future__ import print_function
  5 | 
  6 | import pandas as pd
  7 | import numpy as np
  8 | import lightgbm as lgb
  9 | 
 10 | import os, sys, time, codecs, glob
 11 | from tqdm import tqdm, tqdm_notebook
 12 | 
 13 | from sklearn.metrics import log_loss, classification_report
 14 | from sklearn.externals.joblib import Parallel, delayed
 15 | from collections import Counter
 16 | 
 17 | def timestamp_datetime(value):
 18 |     value = time.localtime(value)
 19 |     dt = time.strftime('%Y-%m-%d %H:%M:%S', value)
 20 |     return dt
 21 | 
 22 | installs = pd.read_csv('./install_counts.csv')
 23 | install_count = np.zeros(installs.shape[0])
 24 | 
 25 | install_count[0:5] = 6
 26 | install_count[5:10] = 5
 27 | install_count[10:30] = 4
 28 | install_count[30:80] = 3
 29 | install_count[80:250] = 2
 30 | install_count[250:500] = 1
 31 | install_count[500:] = 0
 32 | 
 33 | install_mean = pd.read_csv('./install_mean.csv')
 34 | 
 35 | def feature_agg(i, path):
 36 |     if i % 10000 == 0:
 37 |         print(i, path)
 38 | 
 39 |     df = pd.read_csv(path,  dtype={'user_id':object, 'user_male':object, 'user_age':object,
 40 |                             'user_edu':object, 'user_district':object, 'user_install':object,
 41 |                             'video_id':object, 'video_class':object, 'video_tag':object,
 42 |                             'video_creator':object, 'video_uptime':int, 'video_duration':int,
 43 |                             'behavior_show':object, 'behavior_click':object, 'behavior_recommend':object,
 44 |                             'behavior_playback':object, 'behavior_timestamp':int, 'behavior_comment':object,
 45 |                             'behavior_like':object, 'behavior_forard':object})
 46 | 
 47 |     df.reset_index(drop=True, inplace=True)
 48 |     df.sort_values(by='behavior_timestamp', inplace=True)
 49 | 
 50 |     df['behavior_timestamp'] = pd.to_datetime(df['behavior_timestamp'].apply(lambda x: timestamp_datetime(x / 1000)))
 51 |     df['video_uptime'] = pd.to_datetime(df['video_uptime'].apply(lambda x: timestamp_datetime(x)))
 52 | 
 53 |     featdict = {}
 54 |     featdict['user_id'] = df['user_id'].iloc[0]
 55 | 
 56 |     ############################################################################
 57 |     # 用户基本信息
 58 |     ############################################################################
 59 |     # user 用户记录的条数
 60 |     featdict['user_count'] = df['user_male'].count()
 61 | 
 62 |     # user_male 用户性别编码，用出现次数最多的编码
 63 |     featdict['user_male_male'] = 0
 64 |     featdict['user_male_female'] = 0
 65 |     featdict['user_male_nan'] = 0
 66 |     if df['user_male'].value_counts().index[0] == '男':
 67 |         featdict['user_male_male'] = 1
 68 |     elif df['user_male'].value_counts().index[0] == '女':
 69 |         featdict['user_male_female'] = 1
 70 |     else:
 71 |         featdict['user_male_nan'] = 1
 72 | 
 73 |     # user_male 用户性别种类个数
 74 |     featdict['user_male_NUNIQUE'] = df['user_male'].nunique()
 75 |     # user_male 用户性别缺失比例
 76 |     featdict['user_male_NAN'] = sum(df['user_male'] == '-') / df.shape[0]
 77 | 
 78 |     # user_age 用户年龄编码
 79 |     age_dict = {
 80 |         '-': -1,
 81 |         '18以下': 16,
 82 |         '18-24': 20,
 83 |         '25-34': 27,
 84 |         '35-44': 40,
 85 |         '45-54': 50,
 86 |         '55-64': 60,
 87 |         '65以上': 70,
 88 |     }
 89 |     # 使用出现次数最多的最为年龄编码
 90 |     featdict['user_age'] = age_dict[df['user_age'].value_counts().index[0]]
 91 |     # 年龄缺失编码
 92 |     if featdict['user_age'] == -1:
 93 |         featdict['user_age_nan'] = 1
 94 |     else:
 95 |         featdict['user_age_nan'] = 0
 96 |     
 97 |     featdict['user_age_NAN'] = sum(df['user_age'] == '-') / df.shape[0]
 98 |     featdict['user_age_NUNIQUE'] = df['user_age'].nunique()
 99 |     
100 |     df['user_age'] = df['user_age'].apply(lambda x: age_dict[x])
101 |     if featdict['user_age_NAN'] == 1:
102 |         featdict['user_age_MIN'] = 0
103 |         featdict['user_age_MAX'] = 0
104 |         featdict['user_age_MEAN'] = 0
105 |         featdict['user_age_STD'] = 0
106 |         # featdict['user_age_PTP'] = 0
107 |     else:
108 |         featdict['user_age_MIN'] = df[df['user_age'] != -1]['user_age'].mean()
109 |         featdict['user_age_MAX'] = df[df['user_age'] != -1]['user_age'].max()
110 |         featdict['user_age_MEAN'] = df[df['user_age'] != -1]['user_age'].mean()
111 |         featdict['user_age_STD'] = df[df['user_age'] != -1]['user_age'].std()
112 |         # featdict['user_age_PTP'] = df[df['user_age'] != -1]['user_age'].ptp()
113 |     
114 |     # user_age 用户年龄编码 留存率分级
115 |     
116 |     # user_edu 用户教育程度
117 |     edu_dict = {
118 |         '-': -1,
119 |         '高中及以下': 1,
120 |         '大专': 2,
121 |         '本科及以上': 3,
122 |     }
123 |     featdict['user_edu'] = edu_dict[df['user_edu'].value_counts().index[0]]
124 |     if featdict['user_edu'] == -1:
125 |         featdict['user_edu_nan'] = 1
126 |     else:
127 |         featdict['user_edu_nan'] = 0
128 |     
129 |     # user_edu 用户教育程度 留存率分级
130 |     
131 |     featdict['user_age*edu'] = featdict['user_edu'] * featdict['user_age']
132 |     
133 |     featdict['user_edu_NAN'] = sum(df['user_edu'] == '-') / df.shape[0]
134 |     df['user_edu'] = df['user_edu'].apply(lambda x: edu_dict[x])
135 |     featdict['user_edu_NUNIQUE'] = df['user_edu'].nunique()
136 |     featdict['user_edu_MAX'] = df['user_edu'].max()
137 |     
138 |     # user_install 用户安装渠道
139 |     install_lbl = ['ctn_1', 'ctn_1005', 'ctn_1018', 'ctn_1029', 'ctn_1042', 'ctn_1043',
140 |      'ctn_112', 'ctn_13', 'ctn_14', 'ctn_144', 'ctn_149', 'ctn_15', 'ctn_150',
141 |      'ctn_151', 'ctn_159', 'ctn_16', 'ctn_160', 'ctn_161', 'ctn_163', 'ctn_17',
142 |      'ctn_185', 'ctn_188', 'ctn_2', 'ctn_20', 'ctn_202', 'ctn_23', 'ctn_239',
143 |      'ctn_24', 'ctn_240', 'ctn_27', 'ctn_29', 'ctn_308', 'ctn_341', 'ctn_358',
144 |      'ctn_368', 'ctn_371', 'ctn_430', 'ctn_484', 'ctn_487', 'ctn_5', 'ctn_55',
145 |      'ctn_664', 'ctn_666', 'ctn_745', 'ctn_746', 'ctn_772', 'ctn_875', 'ctn_89',
146 |      'ctn_921', 'ctn_110']
147 |     ctns = df['user_install'].unique()
148 |     for ctn in install_lbl:
149 |         if ctn in ctns:
150 |             featdict['user_install_' + ctn] = 1
151 |         else:
152 |             featdict['user_install_' + ctn] = 0
153 |     featdict['user_install_NUNIQUE'] = df['user_install'].nunique()
154 | 
155 |     # user_install 用户安装渠道 留存率分级
156 |     
157 |     if df['user_install'].iloc[0] in install_mean['user_install'].values:
158 |         featdict['install_mean'] = np.where(df['user_install'].iloc[0] == install_mean['user_install'])[0][0]
159 |     else:
160 |         featdict['install_mean'] = 1000
161 |     
162 |     # user_install 用户安装渠道 COUNT分级
163 |     if df['user_install'].iloc[0] in installs['user_install'].values:
164 |         featdict['install_count'] = install_count[np.where(df['user_install'].iloc[0] == installs['user_install'])[0][0]]
165 |     else:
166 |         featdict['install_count'] = 0
167 |         
168 |     ############################################################################
169 |     # 用户（基本信息）与视频的交叉特征
170 |     ############################################################################
171 |     # 用户视频个数
172 |     # featdict['video_NUNIQUE'] = df['video_id'].nunique()
173 |     # 用户视频类别格式
174 |     # featdict['video_class_NUNIQUE'] = df['video_class'].nunique()
175 |     
176 |     featdict['video_show_NUNIQUE'] = df[df['behavior_show'] == '1']['video_id'].nunique()
177 |     featdict['video_show_class_NUNIQUE'] = df[df['behavior_show'] == '1']['video_class'].nunique()
178 | 
179 |     featdict['video_click_NUNIQUE'] = df[df['behavior_click'] == '1']['video_id'].nunique()
180 |     featdict['video_click_class_NUNIQUE'] = df[df['behavior_click'] == '1']['video_class'].nunique()
181 |     
182 |     # featdict['video_duration_NUNIQUE'] = df['video_duration'].nunique()
183 | 
184 |     video_class_dict = ['category_149', 'category_152', 'category_169', 'category_178',
185 |        'category_197', 'category_103', 'category_136', 'category_75',
186 |        'category_109', 'category_158']
187 | 
188 |     for c in video_class_dict:
189 |         if c in df[df['behavior_show'] == '1']['video_class'].values:
190 |             featdict[c + '_show'] = 1
191 |         else:
192 |             featdict[c + '_show'] = 0
193 |     
194 |     # 用户是否观看视频多次
195 |     # featdict['user_videio_>2'] = int(df['video_id'].value_counts().max() >= 2)
196 |     # featdict['user_videio_show_>2'] = int(df[df['behavior_show'] == '1']['video_id'].value_counts().max() >= 2)
197 |     # featdict['user_videio_>2'] = int(df[df['behavior_click'] == '1']['video_id'].value_counts().max() >= 2)
198 |     
199 |     # 用户是否观看同一个作者多次
200 |     # featdict['user_videio_creator_>2'] = int(df['video_creator'].value_counts().max() >= 2)
201 |     # featdict['user_videio_creator_show_>2'] = int(df[df['behavior_show'] == '1']['video_creator'].value_counts().max() >= 2)
202 |     # featdict['user_videio_creator_>2'] = int(df[df['behavior_click'] == '1']['video_creator'].value_counts().max() >= 2)
203 |     
204 |     # 是否观看同种tag视频多次
205 |     tags = '$'.join(df[df['behavior_show'] == '1']['video_tag']).split('$')
206 |     if len(tags) > 2:
207 |         tags = Counter([x for x in tags if x != ''])
208 |         featdict['user_videio_tags'] = int(tags.most_common(1)[0][1] > 1)
209 |     else:
210 |         featdict['user_videio_tags'] = 0
211 |     
212 |     
213 |     featdict['user_video_same_week'] = sum(df['video_uptime'].dt.week == df['behavior_timestamp'].dt.week) / df.shape[0]
214 |     featdict['user_video_same_month'] = sum(df['video_uptime'].dt.month == df['behavior_timestamp'].dt.month) / df.shape[0]
215 |     
216 |     df_tmp = df[df['behavior_show'] == '1']
217 |     featdict['user_video_show_same_week'] = sum(df_tmp['video_uptime'].dt.week == df_tmp['behavior_timestamp'].dt.week) / df.shape[0]
218 |     
219 |     df_tmp = df[df['behavior_click'] == '1']
220 |     featdict['user_video_click_same_week'] = sum(df_tmp['video_uptime'].dt.week == df_tmp['behavior_timestamp'].dt.week) / df.shape[0]
221 |     
222 |     ############################################################################
223 |     # 用户（基本信息）与行为特征
224 |     ############################################################################
225 | 
226 |     # behavior_show 展现的比例
227 |     # behavior_show 连续出现的比例
228 |     show_counts = df['behavior_show'].value_counts()
229 |     featdict['behavior_show_flag'] = int('1' in show_counts.index)
230 |     featdict['behavior_show_keep'] = sum(pd.Series(df[df['behavior_show'] == '1'].index).diff(1) == 1)
231 |     if featdict['behavior_show_flag']:
232 |         # featdict['behavior_show_ratio'] = show_counts['1'] / show_counts.sum()
233 |         featdict['behavior_show_keep_ratio'] = featdict['behavior_show_keep'] / show_counts['1']
234 |     else:
235 |         # featdict['behavior_show_ratio'] = 0
236 |         featdict['behavior_show_keep_ratio'] = 0
237 | 
238 |     # 前10/20/50 后10/20/50 behavior_show 展现的比例
239 |     if df.shape[0] < 10:
240 |         featdict['behavior_show_first10_ratio'] = 0
241 |         featdict['behavior_show_last10_ratio'] = 0
242 |     else:
243 |         featdict['behavior_show_first10_ratio'] = int('1' in df.iloc[:10]['behavior_show'].values)
244 |         featdict['behavior_show_last10_ratio'] = int('1' in df.iloc[-10:]['behavior_show'].values)
245 | 
246 |     if df.shape[0] < 20:
247 |         featdict['behavior_show_first20_ratio'] = 0
248 |         featdict['behavior_show_last20_ratio'] = 0
249 |     else:
250 |         featdict['behavior_show_first20_ratio'] = int('1' in df.iloc[:20]['behavior_show'].values)
251 |         featdict['behavior_show_last20_ratio'] = int('1' in df.iloc[-20:]['behavior_show'].values)
252 | 
253 |     if df.shape[0] < 50:
254 |         featdict['behavior_show_first50_ratio'] = 0
255 |         featdict['behavior_show_last50_ratio'] = 0
256 |     else:
257 |         featdict['behavior_show_first50_ratio'] = int('1' in df.iloc[:50]['behavior_show'].values)
258 |         featdict['behavior_show_last50_ratio'] = int('1' in df.iloc[-50:]['behavior_show'].values)
259 | 
260 |     # behavior_click 点击的比例
261 |     # behavior_click 连续的比例
262 |     click_counts = df['behavior_click'].value_counts()
263 |     featdict['behavior_click_flag'] = int('1' in click_counts.index)
264 |     featdict['behavior_click_keep'] = sum(pd.Series(df[df['behavior_click'] == '1'].index).diff(1) == 1)
265 |     if featdict['behavior_click_flag']:
266 |         # featdict['behavior_click_ratio'] = click_counts['1'] / click_counts.sum()
267 |         if '0' in click_counts.index:
268 |             # featdict['behavior_show_notclick_ratio'] = click_counts['1'] / (click_counts['1'] + click_counts['0'])
269 |             featdict['behavior_click_keep_ratio'] = featdict['behavior_click_keep']/ click_counts['1']
270 |         else:
271 |             # featdict['behavior_show_notclick_ratio'] = 0
272 |             featdict['behavior_click_keep_ratio'] = 0
273 |     else:
274 |         featdict['behavior_click_keep_ratio'] = 0
275 |         # featdict['behavior_click_ratio'] = 0
276 |         # featdict['behavior_show_notclick_ratio'] = 0
277 |     
278 |     # 不同 behavior_recommend 情况下的统计
279 |     featdict['behavior_recommend_NUNIQUE'] = df['behavior_recommend'].nunique()
280 |     
281 |     df_tmp = df[df['behavior_playback'] != '-']
282 |     if df_tmp.shape[0] == 0:
283 |         featdict['behavior_playback_mean'] = 0
284 |         featdict['behavior_playback_mean2'] = 0
285 |         featdict['behavior_playback_max'] = 0
286 |         featdict['behavior_playback_sum'] = 0
287 |         # featdict['behavior_playback_ratio'] = 0
288 | 
289 |         featdict['behavior_comment_ratio'] = 0
290 |         featdict['behavior_like_ratio'] = 0
291 |         featdict['behavior_forard_ratio'] = 0
292 | 
293 |         featdict['behavior_playback_video_mean'] = 0
294 |         # featdict['behavior_playback_video_max'] = 0
295 |         featdict['behavior_playback_video_min'] = 0
296 |         featdict['behavior_playback_video_>1'] = 0
297 |     else:
298 |         featdict['behavior_playback_mean'] = df_tmp['behavior_playback'].astype(float).mean()
299 |         featdict['behavior_playback_mean2'] = df_tmp[df_tmp['behavior_playback'] != 0]['behavior_playback'].astype(float).mean()
300 |         featdict['behavior_playback_max'] = df_tmp['behavior_playback'].astype(float).max()
301 |         featdict['behavior_playback_sum'] = df_tmp['behavior_playback'].astype(float).sum()
302 |         # featdict['behavior_playback_ratio'] = df_tmp[df_tmp['behavior_playback'] != 0].shape[0] / df_tmp.shape[0]
303 | 
304 |         featdict['behavior_comment_ratio'] = df_tmp[df_tmp['behavior_comment'] == 1].shape[0] / df_tmp.shape[0]
305 |         featdict['behavior_like_ratio'] = df_tmp[df_tmp['behavior_like'] == 1].shape[0] / df_tmp.shape[0]
306 |         featdict['behavior_forard_ratio'] = df_tmp[df_tmp['behavior_forard'] == 1].shape[0] / df_tmp.shape[0]
307 | 
308 |         df_tmp['behavior_playback_div_video_duration'] = df_tmp['behavior_playback'].astype(float) / (df_tmp['video_duration'] + 1.0)
309 |         featdict['behavior_playback_video_mean'] = df_tmp['behavior_playback_div_video_duration'].mean()
310 |         featdict['behavior_playback_video_max'] = df_tmp['behavior_playback_div_video_duration'].max()
311 |         featdict['behavior_playback_video_min'] = df_tmp['behavior_playback_div_video_duration'].min()
312 |         featdict['behavior_playback_video_>1'] = int(featdict['behavior_playback_video_max'])
313 |     
314 |     # 用户 behavior_click 对应的时间差
315 |     df_tmp = df[df['behavior_click'] == '1']
316 |     if df_tmp.shape[0] < 2:
317 |         featdict['behavior_click_diff_mean'] = 0
318 |         # featdict['behavior_click_diff_max'] = 0
319 |         featdict['behavior_click_diff_min'] = 0
320 |     else:
321 |         featdict['behavior_click_diff_mean'] = df_tmp['behavior_timestamp'].diff(1).mean().total_seconds()
322 |         # featdict['behavior_click_diff_max'] = df_tmp['behavior_timestamp'].diff(1).max().total_seconds()
323 |         featdict['behavior_click_diff_min'] = df_tmp['behavior_timestamp'].diff(1).min().total_seconds()
324 |     
325 |     featdict['behavior_playback_sum_minute'] = featdict['behavior_playback_sum'] % 60
326 |     featdict['behavior_playback_sum_hour'] = featdict['behavior_playback_sum'] % 3600
327 | 
328 |     featdict['behavior_timestamp_month_NUNIQUE'] = df['behavior_timestamp'].dt.month.nunique()
329 |     featdict['behavior_timestamp_day_NUNIQUE'] = df['behavior_timestamp'].dt.day.nunique()
330 |     featdict['behavior_timestamp_hour_NUNIQUE'] = df['behavior_timestamp'].dt.hour.nunique()
331 |     featdict['behavior_timestamp_minute_NUNIQUE'] = df['behavior_timestamp'].dt.minute.nunique()
332 |     
333 |     # 周中 信息编码
334 |     for day in range(7):
335 |         featdict['behavior_timestamp_weekday' + str(day)] = 0
336 |     for day in df['behavior_timestamp'].dt.weekday.unique():
337 |         featdict['behavior_timestamp_weekday' + str(day)] = 1
338 |     
339 |     featdict['behavior_timestamp_weekday_NUNIQUE'] = df['behavior_timestamp'].dt.weekday.nunique()
340 |     
341 |     featdict['behavior_playback_mean_day'] = featdict['behavior_playback_sum'] / featdict['behavior_timestamp_day_NUNIQUE']
342 |     featdict['behavior_playback_mean_hour'] = featdict['behavior_playback_sum'] / featdict['behavior_timestamp_hour_NUNIQUE']
343 | 
344 |     hour_unique = df['behavior_timestamp'].dt.hour.unique()
345 |     for hour in range(24):
346 |         if hour in hour_unique:
347 |             featdict['behavior_timestamp_hour' + str(hour)] = 1
348 |         else:
349 |             featdict['behavior_timestamp_hour' + str(hour)] = 0
350 |     
351 |     featdict['behavior_timestamp_likeday_NUNIQUE'] = df['behavior_timestamp'].dt.minute.nunique()
352 | 
353 |     # 使用时间
354 |     behavior_timestamp = df['behavior_timestamp'].iloc[-1] - df['behavior_timestamp'].iloc[0]
355 |     featdict['behavior_timestamp_second'] = behavior_timestamp.seconds
356 |     featdict['behavior_timestamp_minute'] = behavior_timestamp.seconds % 60
357 |     # featdict['behavior_timestamp_hour'] = behavior_timestamp.seconds % 3600
358 |     featdict['behavior_timestamp_day'] = behavior_timestamp.seconds % 86400
359 |     if featdict['behavior_timestamp_second'] == 0:
360 |         featdict['behavior_playback_time_ratio'] = 0
361 |     else:
362 |         featdict['behavior_playback_time_ratio'] = featdict['behavior_playback_sum'] / featdict['behavior_timestamp_second']
363 | 
364 |     # df_tmp = df[df['behavior_click'] != '-']
365 |     # featdict['behavior_timestamp_click_month_NUNIQUE'] = df_tmp['behavior_timestamp'].dt.month.nunique()
366 |     # featdict['behavior_timestamp_click_day_NUNIQUE'] = df_tmp['behavior_timestamp'].dt.day.nunique()
367 |     # featdict['behavior_timestamp_click_hour_NUNIQUE'] = df_tmp['behavior_timestamp'].dt.hour.nunique()
368 |     # featdict['behavior_timestamp_click_minute_NUNIQUE'] = df_tmp['behavior_timestamp'].dt.minute.nunique()
369 | 
370 |     return featdict
371 | 
# Per-user id tables. train_id is merged into the training features below;
# presumably it also carries the `label` column used for training (the label
# is read from train_feat after the merge) -- confirm against the CSV.
train_id = pd.read_csv('./train_id.csv')
test_id = pd.read_csv('./test_id.csv')

# Build one feature dict per user by running feature_agg on that user's CSV.
# NOTE(review): only the first 10000 users of each split are processed; this
# looks like a debugging cap -- confirm before generating a full submission.
train_feat = Parallel(n_jobs=50)(
    delayed(feature_agg)(i, './train/' + user_id + '.csv')
    for i, user_id in enumerate(train_id['user_id'].iloc[:10000])
)
test_feat = Parallel(n_jobs=50)(
    delayed(feature_agg)(i, './test/' + user_id + '.csv')
    for i, user_id in enumerate(test_id['user_id'].iloc[:10000])
)

# Each returned dict becomes one row.
train_feat = pd.DataFrame(train_feat)
test_feat = pd.DataFrame(test_feat)

# Attach the remaining train_id columns to the training rows.
train_feat = pd.merge(train_feat, train_id, on='user_id', how='left')

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# Cross-validation setup shared by the out-of-fold run below.
n_fold = 10
# A fixed random_state keeps the shuffled folds identical between runs, so CV
# scores are comparable across experiments (the model itself is seeded too).
skf = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=99)
# Metric used by run_oof to score each fold.
eval_fun = roc_auc_score
420 | 
def run_oof(clf, X_train, y_train, X_test, kf):
    """Fit ``clf`` on every fold of ``kf`` and collect out-of-fold predictions.

    For each split the classifier is refit on the training part, with the
    held-out part passed as ``eval_set`` for early stopping. Column 1 of
    ``predict_proba`` is stored for the held-out rows and accumulated for
    ``X_test``. Fold scores are computed with the module-level ``eval_fun``
    and printed as the run progresses.

    Args:
        clf: Classifier whose ``fit`` accepts ``eval_set``,
            ``early_stopping_rounds`` and ``verbose`` and which provides
            ``predict_proba``. It is refit in place on every fold.
        X_train: Training feature array, indexable by integer index arrays.
        y_train: Training labels aligned with ``X_train``.
        X_test: Test feature array scored by every fold model.
        kf: Splitter exposing ``split(X, y)`` that yields
            ``(train_index, test_index)`` pairs.

    Returns:
        Tuple ``(preds_train, preds_test)``: the out-of-fold predictions for
        ``X_train`` and the fold-averaged predictions for ``X_test``.
    """
    print(clf)
    # Builtin float instead of the np.float alias, which NumPy >= 1.24 removed.
    preds_train = np.zeros(len(X_train), dtype=float)
    preds_test = np.zeros(len(X_test), dtype=float)
    train_scores = []
    val_scores = []

    folds_done = 0
    for fold_no, (train_index, test_index) in enumerate(kf.split(X_train, y_train), start=1):
        x_tr, x_te = X_train[train_index], X_train[test_index]
        y_tr, y_te = y_train[train_index], y_train[test_index]
        clf.fit(x_tr, y_tr, eval_set=[(x_te, y_te)], early_stopping_rounds=500, verbose=False)

        # Score the held-out rows once and reuse the result for both the
        # fold metric and the out-of-fold vector.
        val_pred = clf.predict_proba(x_te)[:, 1]
        train_scores.append(eval_fun(y_tr, clf.predict_proba(x_tr)[:, 1]))
        val_scores.append(eval_fun(y_te, val_pred))

        preds_train[test_index] = val_pred
        preds_test += clf.predict_proba(X_test)[:, 1]
        folds_done = fold_no

        print('{0}: Train {1:0.7f} Val {2:0.7f}/{3:0.7f}'.format(
            fold_no, train_scores[-1], val_scores[-1], np.mean(val_scores)))
        print('-' * 50)

    print('Train: ', train_scores)
    print('Val: ', val_scores)
    print('-' * 50)
    print('Train{0:0.5f}_Test{1:0.5f}\n\n'.format(np.mean(train_scores), np.mean(val_scores)))

    # Average over the folds actually produced by `kf`, not a module-level
    # constant that may disagree with the splitter passed in.
    preds_test /= max(folds_done, 1)
    return preds_train, preds_test
448 | 
# LightGBM hyper-parameters for the final out-of-fold run.
params = dict(
    learning_rate=0.01,
    min_child_samples=5,
    max_depth=4,
    lambda_l1=5,
    boosting='gbdt',
    objective='binary',
    n_estimators=5000,
    metric='auc',
    feature_fraction=0.75,
    bagging_fraction=0.85,
    seed=99,
    num_threads=20,
    verbose=-1,
)

# Train fold by fold; test_pred is the fold-averaged prediction for test_feat.
train_pred, test_pred = run_oof(
    lgb.LGBMClassifier(**params),
    train_feat.drop(['user_id', 'label'], axis=1).values,
    train_feat['label'].values,
    test_feat.drop(['user_id'], axis=1).values,
    skf,
)

# Write the submission as two columns (user_id, prediction), with neither a
# header row nor an index column.
test_feat['label'] = test_pred
submission = test_feat[['user_id', 'label']]
submission.to_csv('baseline.csv', index=False, header=False)


--------------------------------------------------------------------------------
/solutions/点石-Retention Rate of Baidu Hao Kan APP Users/README.md:
--------------------------------------------------------------------------------
1 | https://dianshi.baidu.com/competition/24/rule
2 | 
3 | 比赛数据下载：链接: https://pan.baidu.com/s/1Nw64v5jPAoom3PUxRZqxNw 提取码: w54b
4 | 
5 | 第五名代码
6 | 


--------------------------------------------------------------------------------
/solutions/点石-Retention Rate of Baidu Hao Kan APP Users/featselect.py:
--------------------------------------------------------------------------------
 1 | import os, sys, codecs
 2 | import lightgbm as lgb
 3 | 
 4 | def modelWarpper(clf, data_train, data_label, basescore):
 5 |     params = {
 6 |         'learning_rate': 0.01,
 7 |         'min_child_samples': 5,
 8 |         'max_depth': 4,
 9 |         'lambda_l1': 2,
10 |         'boosting': 'gbdt',
11 |         'objective': 'binary',
12 |         'n_estimators': 2000,
13 |         'metric': 'auc',
14 |         # 'num_class': 6,
15 |         'feature_fraction': .85,
16 |         'bagging_fraction': .85,
17 |         'seed': 99,
18 |         'num_threads': -1,
19 |         'verbose': -1
20 |     }
21 |     for col in data_train.columns:
22 |         cv_results1 = lgb.cv(
23 |                 params,
24 |                 lgb.Dataset(data_train.drop([col], axis=1).values, label=data_label.values),
25 |                 num_boost_round=2000,
26 |                 nfold=7, verbose_eval=False,
27 |                 early_stopping_rounds=200,
28 |         )
29 |         
30 |         if cv_results1['auc-mean'][-1] > basescore:
31 |             print('+', col, 'CV AUC: ', len(cv_results1['auc-mean']), cv_results1['auc-mean'][-1])
32 |         else:
33 |             print('-', col, 'CV AUC: ', len(cv_results1['auc-mean']), cv_results1['auc-mean'][-1])
34 |             
35 |     XX


--------------------------------------------------------------------------------