├── img ├── wns1.png ├── wns2.png ├── wns3.png └── wns4.png ├── README.md ├── Solution - Part 3.ipynb ├── LICENSE └── Solution - Part 1.ipynb /img/wns1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajat5ranjan/AV-WNS-Analytics-Wizard-2019/HEAD/img/wns1.png -------------------------------------------------------------------------------- /img/wns2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajat5ranjan/AV-WNS-Analytics-Wizard-2019/HEAD/img/wns2.png -------------------------------------------------------------------------------- /img/wns3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajat5ranjan/AV-WNS-Analytics-Wizard-2019/HEAD/img/wns3.png -------------------------------------------------------------------------------- /img/wns4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajat5ranjan/AV-WNS-Analytics-Wizard-2019/HEAD/img/wns4.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AV WNS Analytics Wizard 2019 2 | 3 | ![title](img/wns1.png) 4 | ![title](img/wns2.png) 5 | ![title](img/wns3.png) 6 | ![title](img/wns4.png) 7 | 8 | 9 | # Collaboration With 10 | * [Chetan Ambi](https://github.com/chetanambi) 11 | 12 | 13 | # Note 14 | * **Solution - Part 1.ipynb** 15 | This solution produces 3 csv files 16 | *AV_WNS_forkkv2_lgb_folds.csv* 17 | *AV_WNS_forkkv2_cb_folds.csv* 18 | *AV_WNS_forkkv2_CBstack_folds.csv* (final - produced by stacking the above 2 csv files) 19 | 20 | 21 | * **Solution - Part 2.ipynb** 22 | This solution produces 1 csv file 23 | *sub2_10fold_lgbm.csv* 24 | 25 | 26 | * **Solution - Part 3.ipynb** 27 | This is the final solution 
stacks *AV_WNS_forkkv2_CBstack_folds.csv & sub2_10fold_lgbm.csv* 28 | and produces final submission csv file 29 | 30 | 31 | * **Final submission** *ensemble_chetan_rajat_final.csv* 32 | 33 | 34 | # Leaderboard (Nodus Tollens) 35 | 36 | * **[Public LB](https://datahack.analyticsvidhya.com/contest/wns-analytics-wizard-2019/lb)** : **16th/836 Rank** 37 | * **[Private LB](https://datahack.analyticsvidhya.com/contest/wns-analytics-wizard-2019/pvt_lb)** : **16th/836 Rank** 38 | 39 | (6.5K Participants) 40 | -------------------------------------------------------------------------------- /Solution - Part 3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 7, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\nimport pandas as pd" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 8, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "s1 = pd.read_csv('AV_WNS_forkkv2_CBstack_folds.csv')\n", 19 | "s2 = pd.read_csv('sub2_10fold_lgbm.csv')" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 9, 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "data": { 29 | "text/html": [ 30 | "
\n", 31 | "\n", 44 | "\n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | "
impression_idis_click
0a9e7126a585a69a32bc7414e9d0c0ada0.015520
1caac14a5bf2ba283db7708bb348557600.024357
\n", 65 | "
" 66 | ], 67 | "text/plain": [ 68 | " impression_id is_click\n", 69 | "0 a9e7126a585a69a32bc7414e9d0c0ada 0.015520\n", 70 | "1 caac14a5bf2ba283db7708bb34855760 0.024357" 71 | ] 72 | }, 73 | "execution_count": 9, 74 | "metadata": {}, 75 | "output_type": "execute_result" 76 | } 77 | ], 78 | "source": [ 79 | "s1.head(2)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 10, 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "data": { 89 | "text/html": [ 90 | "
\n", 91 | "\n", 104 | "\n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | "
impression_idis_click
0a9e7126a585a69a32bc7414e9d0c0ada0.012703
1caac14a5bf2ba283db7708bb348557600.002386
\n", 125 | "
" 126 | ], 127 | "text/plain": [ 128 | " impression_id is_click\n", 129 | "0 a9e7126a585a69a32bc7414e9d0c0ada 0.012703\n", 130 | "1 caac14a5bf2ba283db7708bb34855760 0.002386" 131 | ] 132 | }, 133 | "execution_count": 10, 134 | "metadata": {}, 135 | "output_type": "execute_result" 136 | } 137 | ], 138 | "source": [ 139 | "s2.head(2)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 11, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "s1_array = np.array(s1['is_click'])\n", 149 | "s2_array = np.array(s2['is_click'])" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 13, 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "data": { 159 | "text/html": [ 160 | "
\n", 161 | "\n", 174 | "\n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | "
impression_idis_click
0a9e7126a585a69a32bc7414e9d0c0ada0.014675
1caac14a5bf2ba283db7708bb348557600.017766
213f10ba306a19ce7bec2f3cae507b6980.067086
339c4b4dc0e9701b55a0a4f072008fb3f0.012717
4bf5a572cca75f5fc67f4b14e58b11d700.145120
\n", 210 | "
" 211 | ], 212 | "text/plain": [ 213 | " impression_id is_click\n", 214 | "0 a9e7126a585a69a32bc7414e9d0c0ada 0.014675\n", 215 | "1 caac14a5bf2ba283db7708bb34855760 0.017766\n", 216 | "2 13f10ba306a19ce7bec2f3cae507b698 0.067086\n", 217 | "3 39c4b4dc0e9701b55a0a4f072008fb3f 0.012717\n", 218 | "4 bf5a572cca75f5fc67f4b14e58b11d70 0.145120" 219 | ] 220 | }, 221 | "execution_count": 13, 222 | "metadata": {}, 223 | "output_type": "execute_result" 224 | } 225 | ], 226 | "source": [ 227 | "s2['is_click'] = s1_array*0.7 + s2_array*0.3\n", 228 | "s2.head()" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 30, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "s2.to_csv('sub_7_ensemble_chetan_rajat_4.csv', index=False)" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [] 253 | } 254 | ], 255 | "metadata": { 256 | "kernelspec": { 257 | "display_name": "Python 3", 258 | "language": "python", 259 | "name": "python3" 260 | }, 261 | "language_info": { 262 | "codemirror_mode": { 263 | "name": "ipython", 264 | "version": 3 265 | }, 266 | "file_extension": ".py", 267 | "mimetype": "text/x-python", 268 | "name": "python", 269 | "nbconvert_exporter": "python", 270 | "pygments_lexer": "ipython3", 271 | "version": "3.7.1" 272 | }, 273 | "toc": { 274 | "base_numbering": 1, 275 | "nav_menu": {}, 276 | "number_sections": true, 277 | "sideBar": true, 278 | "skip_h1_title": false, 279 | "title_cell": "Table of Contents", 280 | "title_sidebar": "Contents", 281 | "toc_cell": false, 282 | "toc_position": {}, 283 | "toc_section_display": true, 284 | "toc_window_display": false 285 | } 286 | }, 287 | "nbformat": 4, 288 | "nbformat_minor": 2 289 | } 290 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Solution - Part 1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", 8 | "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5" 9 | }, 10 | "outputs": [ 11 | { 12 | "name": "stdout", 13 | "output_type": "stream", 14 | "text": [ 15 | "/kaggle/input/av-wns-2019/data_wns/test.csv\n", 16 | "/kaggle/input/av-wns-2019/data_wns/sample_submission.csv\n", 17 | "/kaggle/input/av-wns-2019/data_wns/train_NA17Sgz/train.csv\n", 18 | "/kaggle/input/av-wns-2019/data_wns/train_NA17Sgz/view_log.csv\n", 19 | "/kaggle/input/av-wns-2019/data_wns/train_NA17Sgz/item_data.csv\n" 20 | ] 21 | } 22 | ], 23 | "source": [ 24 | "# This Python 3 environment comes with many helpful analytics libraries installed\n", 25 | "# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python\n", 26 | "# For example, here's several helpful packages to load in \n", 27 | 
"\n", 28 | "import numpy as np # linear algebra\n", 29 | "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", 30 | "\n", 31 | "# Input data files are available in the \"../input/\" directory.\n", 32 | "# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n", 33 | "\n", 34 | "import os\n", 35 | "for dirname, _, filenames in os.walk('/kaggle/input'):\n", 36 | " for filename in filenames:\n", 37 | " print(os.path.join(dirname, filename)) \n", 38 | "\n", 39 | "# Any results you write to the current directory are saved as output." 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 2, 45 | "metadata": { 46 | "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0", 47 | "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a" 48 | }, 49 | "outputs": [], 50 | "source": [ 51 | "train=pd.read_csv('/kaggle/input/av-wns-2019/data_wns/train_NA17Sgz/train.csv')\n", 52 | "test=pd.read_csv('/kaggle/input/av-wns-2019/data_wns/test.csv')\n", 53 | "view=pd.read_csv('/kaggle/input/av-wns-2019/data_wns/train_NA17Sgz/view_log.csv')\n", 54 | "item=pd.read_csv('/kaggle/input/av-wns-2019/data_wns/train_NA17Sgz/item_data.csv')\n", 55 | "s=pd.read_csv('/kaggle/input/av-wns-2019/data_wns/sample_submission.csv')" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 3, 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "data": { 65 | "text/plain": [ 66 | "(90675, 6)" 67 | ] 68 | }, 69 | "execution_count": 3, 70 | "metadata": {}, 71 | "output_type": "execute_result" 72 | } 73 | ], 74 | "source": [ 75 | "test.shape" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 4, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "data": { 85 | "text/html": [ 86 | "
\n", 87 | "\n", 100 | "\n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | "
impression_idimpression_timeuser_idapp_codeos_versionis_4Gis_click
0c4ca4238a0b923820dcc509a6f75849b2018-11-15 00:00:0087862422old00
145c48cce2e2d7fbdea1afc51c7c6ad262018-11-15 00:01:0063410467latest11
270efdf2ec9b086079795c442636b55fb2018-11-15 00:02:0071748259intermediate10
38e296a067a37563370ded05f5a3bf3ec2018-11-15 00:02:0069209244latest10
4182be0c5cdcd5072bb1864cdee4d3d6e2018-11-15 00:02:0062873473latest00
\n", 166 | "
" 167 | ], 168 | "text/plain": [ 169 | " impression_id impression_time user_id app_code \\\n", 170 | "0 c4ca4238a0b923820dcc509a6f75849b 2018-11-15 00:00:00 87862 422 \n", 171 | "1 45c48cce2e2d7fbdea1afc51c7c6ad26 2018-11-15 00:01:00 63410 467 \n", 172 | "2 70efdf2ec9b086079795c442636b55fb 2018-11-15 00:02:00 71748 259 \n", 173 | "3 8e296a067a37563370ded05f5a3bf3ec 2018-11-15 00:02:00 69209 244 \n", 174 | "4 182be0c5cdcd5072bb1864cdee4d3d6e 2018-11-15 00:02:00 62873 473 \n", 175 | "\n", 176 | " os_version is_4G is_click \n", 177 | "0 old 0 0 \n", 178 | "1 latest 1 1 \n", 179 | "2 intermediate 1 0 \n", 180 | "3 latest 1 0 \n", 181 | "4 latest 0 0 " 182 | ] 183 | }, 184 | "execution_count": 4, 185 | "metadata": {}, 186 | "output_type": "execute_result" 187 | } 188 | ], 189 | "source": [ 190 | "train.head()" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 5, 196 | "metadata": {}, 197 | "outputs": [ 198 | { 199 | "data": { 200 | "text/plain": [ 201 | "impression_id 0\n", 202 | "impression_time 0\n", 203 | "user_id 0\n", 204 | "app_code 0\n", 205 | "os_version 0\n", 206 | "is_4G 0\n", 207 | "is_click 0\n", 208 | "dtype: int64" 209 | ] 210 | }, 211 | "execution_count": 5, 212 | "metadata": {}, 213 | "output_type": "execute_result" 214 | } 215 | ], 216 | "source": [ 217 | "train.isnull().sum()" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 6, 223 | "metadata": {}, 224 | "outputs": [ 225 | { 226 | "data": { 227 | "text/plain": [ 228 | "(74723, 34079)" 229 | ] 230 | }, 231 | "execution_count": 6, 232 | "metadata": {}, 233 | "output_type": "execute_result" 234 | } 235 | ], 236 | "source": [ 237 | "train.user_id.nunique(),test.user_id.nunique()" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 7, 243 | "metadata": {}, 244 | "outputs": [ 245 | { 246 | "name": "stdout", 247 | "output_type": "stream", 248 | "text": [ 249 | "34079\n", 250 | "19645\n" 251 | ] 252 | } 253 | ], 254 | "source": [ 
255 | "len(np.intersect1d(train.user_id,view.user_id))\n", 256 | "print(len(np.intersect1d(test.user_id,view.user_id)))\n", 257 | "print(len(np.intersect1d(test.user_id,train.user_id)))" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 8, 263 | "metadata": {}, 264 | "outputs": [ 265 | { 266 | "name": "stdout", 267 | "output_type": "stream", 268 | "text": [ 269 | "(3118622, 5)\n" 270 | ] 271 | }, 272 | { 273 | "data": { 274 | "text/html": [ 275 | "
\n", 276 | "\n", 289 | "\n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | "
server_timedevice_typesession_iduser_iditem_id
02018-10-15 08:58:00android112333455732970
12018-10-15 08:58:00android503590747887640
22018-10-15 08:58:00android57396023628128855
32018-10-15 08:58:00android121691243012774
42018-10-15 08:58:00android2185641922728296
\n", 343 | "
" 344 | ], 345 | "text/plain": [ 346 | " server_time device_type session_id user_id item_id\n", 347 | "0 2018-10-15 08:58:00 android 112333 4557 32970\n", 348 | "1 2018-10-15 08:58:00 android 503590 74788 7640\n", 349 | "2 2018-10-15 08:58:00 android 573960 23628 128855\n", 350 | "3 2018-10-15 08:58:00 android 121691 2430 12774\n", 351 | "4 2018-10-15 08:58:00 android 218564 19227 28296" 352 | ] 353 | }, 354 | "execution_count": 8, 355 | "metadata": {}, 356 | "output_type": "execute_result" 357 | } 358 | ], 359 | "source": [ 360 | "print(view.shape)\n", 361 | "view.head()" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": 9, 367 | "metadata": {}, 368 | "outputs": [ 369 | { 370 | "data": { 371 | "text/plain": [ 372 | "server_time 0\n", 373 | "device_type 0\n", 374 | "session_id 0\n", 375 | "user_id 0\n", 376 | "item_id 0\n", 377 | "dtype: int64" 378 | ] 379 | }, 380 | "execution_count": 9, 381 | "metadata": {}, 382 | "output_type": "execute_result" 383 | } 384 | ], 385 | "source": [ 386 | "view.isnull().sum()" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": 10, 392 | "metadata": {}, 393 | "outputs": [ 394 | { 395 | "data": { 396 | "text/plain": [ 397 | "item_id 0\n", 398 | "item_price 0\n", 399 | "category_1 0\n", 400 | "category_2 0\n", 401 | "category_3 0\n", 402 | "product_type 0\n", 403 | "dtype: int64" 404 | ] 405 | }, 406 | "execution_count": 10, 407 | "metadata": {}, 408 | "output_type": "execute_result" 409 | } 410 | ], 411 | "source": [ 412 | "item.isnull().sum()" 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": 11, 418 | "metadata": {}, 419 | "outputs": [ 420 | { 421 | "name": "stdout", 422 | "output_type": "stream", 423 | "text": [ 424 | "item_id 132761\n", 425 | "item_price 13531\n", 426 | "category_1 17\n", 427 | "category_2 79\n", 428 | "category_3 335\n", 429 | "product_type 7959\n" 430 | ] 431 | }, 432 | { 433 | "data": { 434 | "text/html": [ 435 | "
\n", 436 | "\n", 449 | "\n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | "
item_iditem_pricecategory_1category_2category_3product_type
02688046021135203040
15493935131257856822
2403838251782791619
38777235513581895264
41137051267173915110239
\n", 509 | "
" 510 | ], 511 | "text/plain": [ 512 | " item_id item_price category_1 category_2 category_3 product_type\n", 513 | "0 26880 4602 11 35 20 3040\n", 514 | "1 54939 3513 12 57 85 6822\n", 515 | "2 40383 825 17 8 279 1619\n", 516 | "3 8777 2355 13 58 189 5264\n", 517 | "4 113705 1267 17 39 151 10239" 518 | ] 519 | }, 520 | "execution_count": 11, 521 | "metadata": {}, 522 | "output_type": "execute_result" 523 | } 524 | ], 525 | "source": [ 526 | "for k in item.columns:\n", 527 | " print(k,item[k].nunique())\n", 528 | " \n", 529 | "item.head()" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": 12, 535 | "metadata": {}, 536 | "outputs": [], 537 | "source": [ 538 | "item=pd.get_dummies(item,columns=['category_1','category_2'],drop_first=True)" 539 | ] 540 | }, 541 | { 542 | "cell_type": "code", 543 | "execution_count": 13, 544 | "metadata": {}, 545 | "outputs": [ 546 | { 547 | "data": { 548 | "text/html": [ 549 | "
\n", 550 | "\n", 563 | "\n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | "
server_timedevice_typesession_iduser_iditem_iditem_pricecategory_3product_typecategory_1_1category_1_2...category_2_70category_2_71category_2_72category_2_73category_2_74category_2_75category_2_76category_2_77category_2_78category_2_79
02018-10-15 08:58:00android11233345573297054685.0253.03184.00.00.0...0.00.00.00.00.00.00.00.00.00.0
12018-10-15 08:58:00android5035907478876401376.0228.0545.00.00.0...0.01.00.00.00.00.00.00.00.00.0
22018-10-15 08:58:00android573960236281288554544.062.05609.00.00.0...0.00.00.00.00.00.00.00.00.00.0
32018-10-15 08:58:00android121691243012774904.0252.02740.00.00.0...0.00.00.00.00.00.00.00.00.00.0
42018-10-15 08:58:00android21856419227282962304.054.07422.00.00.0...0.00.00.00.00.00.00.00.00.00.0
\n", 713 | "

5 rows × 102 columns

\n", 714 | "
" 715 | ], 716 | "text/plain": [ 717 | " server_time device_type session_id user_id item_id item_price \\\n", 718 | "0 2018-10-15 08:58:00 android 112333 4557 32970 54685.0 \n", 719 | "1 2018-10-15 08:58:00 android 503590 74788 7640 1376.0 \n", 720 | "2 2018-10-15 08:58:00 android 573960 23628 128855 4544.0 \n", 721 | "3 2018-10-15 08:58:00 android 121691 2430 12774 904.0 \n", 722 | "4 2018-10-15 08:58:00 android 218564 19227 28296 2304.0 \n", 723 | "\n", 724 | " category_3 product_type category_1_1 category_1_2 ... category_2_70 \\\n", 725 | "0 253.0 3184.0 0.0 0.0 ... 0.0 \n", 726 | "1 228.0 545.0 0.0 0.0 ... 0.0 \n", 727 | "2 62.0 5609.0 0.0 0.0 ... 0.0 \n", 728 | "3 252.0 2740.0 0.0 0.0 ... 0.0 \n", 729 | "4 54.0 7422.0 0.0 0.0 ... 0.0 \n", 730 | "\n", 731 | " category_2_71 category_2_72 category_2_73 category_2_74 category_2_75 \\\n", 732 | "0 0.0 0.0 0.0 0.0 0.0 \n", 733 | "1 1.0 0.0 0.0 0.0 0.0 \n", 734 | "2 0.0 0.0 0.0 0.0 0.0 \n", 735 | "3 0.0 0.0 0.0 0.0 0.0 \n", 736 | "4 0.0 0.0 0.0 0.0 0.0 \n", 737 | "\n", 738 | " category_2_76 category_2_77 category_2_78 category_2_79 \n", 739 | "0 0.0 0.0 0.0 0.0 \n", 740 | "1 0.0 0.0 0.0 0.0 \n", 741 | "2 0.0 0.0 0.0 0.0 \n", 742 | "3 0.0 0.0 0.0 0.0 \n", 743 | "4 0.0 0.0 0.0 0.0 \n", 744 | "\n", 745 | "[5 rows x 102 columns]" 746 | ] 747 | }, 748 | "execution_count": 13, 749 | "metadata": {}, 750 | "output_type": "execute_result" 751 | } 752 | ], 753 | "source": [ 754 | "view_item=view.merge(item,on='item_id',how='left')\n", 755 | "view_item.head()" 756 | ] 757 | }, 758 | { 759 | "cell_type": "code", 760 | "execution_count": 15, 761 | "metadata": {}, 762 | "outputs": [ 763 | { 764 | "data": { 765 | "text/plain": [ 766 | "Index(['server_time', 'device_type', 'session_id', 'user_id', 'item_id',\n", 767 | " 'item_price', 'category_3', 'product_type', 'category_1_1',\n", 768 | " 'category_1_2',\n", 769 | " ...\n", 770 | " 'category_2_70', 'category_2_71', 'category_2_72', 'category_2_73',\n", 771 | " 'category_2_74', 
'category_2_75', 'category_2_76', 'category_2_77',\n", 772 | " 'category_2_78', 'category_2_79'],\n", 773 | " dtype='object', length=102)" 774 | ] 775 | }, 776 | "execution_count": 15, 777 | "metadata": {}, 778 | "output_type": "execute_result" 779 | } 780 | ], 781 | "source": [ 782 | "view_item.columns" 783 | ] 784 | }, 785 | { 786 | "cell_type": "code", 787 | "execution_count": 16, 788 | "metadata": {}, 789 | "outputs": [], 790 | "source": [ 791 | "view_item['server_time']=pd.to_datetime(view_item['server_time'],format='%Y-%m-%d %H:%M:%S')\n", 792 | "\n", 793 | "view_item.sort_values(['user_id',\"server_time\"],ascending=True,inplace=True)\n", 794 | "view_item['cumcount_1']=view_item.groupby(\"user_id\")[\"session_id\"].cumcount() + 1\n", 795 | "\n", 796 | "view_item.sort_values(['user_id','item_id',\"server_time\"],ascending=True,inplace=True)\n", 797 | "view_item['cumcount_2']=view_item.groupby([\"user_id\",'item_id'])[\"session_id\"].cumcount() + 1\n", 798 | "\n", 799 | "view_item.sort_values(['user_id','session_id','item_id',\"server_time\"],ascending=True,inplace=True)\n", 800 | "view_item['cumcount_3']=view_item.groupby([\"user_id\",'session_id','item_id'])[\"session_id\"].cumcount() + 1\n", 801 | "\n", 802 | "view_item['device_type']=view_item['device_type'].astype('category')\n", 803 | "view_item['session_id']=view_item['session_id'].astype('category')\n", 804 | "view_item['item_id']=view_item['item_id'].astype('category')\n", 805 | "view_item['device_type']=view_item['device_type'].astype('category')\n", 806 | "\n", 807 | "view_item.drop(['server_time'],axis=1,inplace=True)" 808 | ] 809 | }, 810 | { 811 | "cell_type": "code", 812 | "execution_count": 17, 813 | "metadata": {}, 814 | "outputs": [ 815 | { 816 | "data": { 817 | "text/plain": [ 818 | "Index(['device_type', 'session_id', 'user_id', 'item_id', 'item_price',\n", 819 | " 'category_3', 'product_type', 'category_1_1', 'category_1_2',\n", 820 | " 'category_1_4',\n", 821 | " ...\n", 822 | " 
'category_2_73', 'category_2_74', 'category_2_75', 'category_2_76',\n", 823 | " 'category_2_77', 'category_2_78', 'category_2_79', 'cumcount_1',\n", 824 | " 'cumcount_2', 'cumcount_3'],\n", 825 | " dtype='object', length=104)" 826 | ] 827 | }, 828 | "execution_count": 17, 829 | "metadata": {}, 830 | "output_type": "execute_result" 831 | } 832 | ], 833 | "source": [ 834 | "view_item.columns" 835 | ] 836 | }, 837 | { 838 | "cell_type": "code", 839 | "execution_count": 18, 840 | "metadata": {}, 841 | "outputs": [ 842 | { 843 | "data": { 844 | "text/html": [ 845 | "
\n", 846 | "\n", 859 | "\n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | "
device_typesession_iduser_iditem_iditem_pricecategory_3product_typecategory_1_1category_1_2category_1_4...category_2_73category_2_74category_2_75category_2_76category_2_77category_2_78category_2_79cumcount_1cumcount_2cumcount_3
2627295android8637301160734181.0151.0124.00.00.00.0...0.00.00.00.00.00.00.03341
1355387android143955027151601.0159.07093.00.00.00.0...0.00.00.00.00.00.00.01811
1355488android143955027151601.0159.07093.00.00.00.0...0.00.00.00.00.00.00.01922
1355056android1439550103791238.0272.060.00.00.00.0...0.00.00.00.00.00.00.01211
1355084android1439550103791238.0272.060.00.00.00.0...0.00.00.00.00.00.00.01322
\n", 1009 | "

5 rows × 104 columns

\n", 1010 | "
" 1011 | ], 1012 | "text/plain": [ 1013 | " device_type session_id user_id item_id item_price category_3 \\\n", 1014 | "2627295 android 86373 0 116073 4181.0 151.0 \n", 1015 | "1355387 android 143955 0 2715 1601.0 159.0 \n", 1016 | "1355488 android 143955 0 2715 1601.0 159.0 \n", 1017 | "1355056 android 143955 0 10379 1238.0 272.0 \n", 1018 | "1355084 android 143955 0 10379 1238.0 272.0 \n", 1019 | "\n", 1020 | " product_type category_1_1 category_1_2 category_1_4 ... \\\n", 1021 | "2627295 124.0 0.0 0.0 0.0 ... \n", 1022 | "1355387 7093.0 0.0 0.0 0.0 ... \n", 1023 | "1355488 7093.0 0.0 0.0 0.0 ... \n", 1024 | "1355056 60.0 0.0 0.0 0.0 ... \n", 1025 | "1355084 60.0 0.0 0.0 0.0 ... \n", 1026 | "\n", 1027 | " category_2_73 category_2_74 category_2_75 category_2_76 \\\n", 1028 | "2627295 0.0 0.0 0.0 0.0 \n", 1029 | "1355387 0.0 0.0 0.0 0.0 \n", 1030 | "1355488 0.0 0.0 0.0 0.0 \n", 1031 | "1355056 0.0 0.0 0.0 0.0 \n", 1032 | "1355084 0.0 0.0 0.0 0.0 \n", 1033 | "\n", 1034 | " category_2_77 category_2_78 category_2_79 cumcount_1 cumcount_2 \\\n", 1035 | "2627295 0.0 0.0 0.0 33 4 \n", 1036 | "1355387 0.0 0.0 0.0 18 1 \n", 1037 | "1355488 0.0 0.0 0.0 19 2 \n", 1038 | "1355056 0.0 0.0 0.0 12 1 \n", 1039 | "1355084 0.0 0.0 0.0 13 2 \n", 1040 | "\n", 1041 | " cumcount_3 \n", 1042 | "2627295 1 \n", 1043 | "1355387 1 \n", 1044 | "1355488 2 \n", 1045 | "1355056 1 \n", 1046 | "1355084 2 \n", 1047 | "\n", 1048 | "[5 rows x 104 columns]" 1049 | ] 1050 | }, 1051 | "execution_count": 18, 1052 | "metadata": {}, 1053 | "output_type": "execute_result" 1054 | } 1055 | ], 1056 | "source": [ 1057 | "view_item.head()" 1058 | ] 1059 | }, 1060 | { 1061 | "cell_type": "code", 1062 | "execution_count": 19, 1063 | "metadata": {}, 1064 | "outputs": [ 1065 | { 1066 | "data": { 1067 | "text/html": [ 1068 | "
\n", 1069 | "\n", 1082 | "\n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | " \n", 1124 | " \n", 1125 | " \n", 1126 | " \n", 1127 | " \n", 1128 | " \n", 1129 | " \n", 1130 | " \n", 1131 | " \n", 1132 | " \n", 1133 | " \n", 1134 | " \n", 1135 | " \n", 1136 | " \n", 1137 | " \n", 1138 | " \n", 1139 | " \n", 1140 | " \n", 1141 | " \n", 1142 | " \n", 1143 | " \n", 1144 | " \n", 1145 | " \n", 1146 | " \n", 1147 | " \n", 1148 | " \n", 1149 | " \n", 1150 | " \n", 1151 | " \n", 1152 | " \n", 1153 | " \n", 1154 | " \n", 1155 | " \n", 1156 | " \n", 1157 | " \n", 1158 | " \n", 1159 | " \n", 1160 | " \n", 1161 | " \n", 1162 | " \n", 1163 | " \n", 1164 | " \n", 1165 | " \n", 1166 | " \n", 1167 | " \n", 1168 | " \n", 1169 | " \n", 1170 | " \n", 1171 | " \n", 1172 | " \n", 1173 | " \n", 1174 | " \n", 1175 | " \n", 1176 | " \n", 1177 | " \n", 1178 | " \n", 1179 | " \n", 1180 | " \n", 1181 | " \n", 1182 | " \n", 1183 | " \n", 1184 | " \n", 1185 | " \n", 1186 | " \n", 1187 | " \n", 1188 | " \n", 1189 | " \n", 1190 | " \n", 1191 | " \n", 1192 | " \n", 1193 | " \n", 1194 | " \n", 1195 | " \n", 1196 | " \n", 1197 | " \n", 1198 | " \n", 1199 | " \n", 1200 | " \n", 1201 | " \n", 1202 | " \n", 1203 | " \n", 1204 | " \n", 1205 | " \n", 1206 | " \n", 1207 | " \n", 1208 | " \n", 1209 | " \n", 1210 | " \n", 1211 | " \n", 1212 | " \n", 1213 | " \n", 1214 | " \n", 1215 | " \n", 1216 | " \n", 1217 | " \n", 1218 | " \n", 1219 | " \n", 1220 | " \n", 1221 | " \n", 1222 | " \n", 1223 | " 
\n", 1224 | " \n", 1225 | " \n", 1226 | " \n", 1227 | " \n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | "
device_typesession_iduser_iditem_iditem_pricecategory_3product_typecategory_1_1category_1_2category_1_4...category_2_73category_2_74category_2_75category_2_76category_2_77category_2_78category_2_79cumcount_1cumcount_2cumcount_3
1591211android67746236281403441.0170.0734.00.00.00.0...0.00.00.00.00.00.00.03321
1591213android67746236281403441.0170.0734.00.00.00.0...0.00.00.00.00.00.00.03432
19692android7953323628861272067.062.05609.00.00.01.0...0.00.00.00.00.00.00.0311
19814android7953323628861272067.062.05609.00.00.01.0...0.00.00.00.00.00.00.0422
20124android7953323628861272067.062.05609.00.00.01.0...0.00.00.00.00.00.00.0533
\n", 1232 | "

5 rows × 104 columns

\n", 1233 | "
" 1234 | ], 1235 | "text/plain": [ 1236 | " device_type session_id user_id item_id item_price category_3 \\\n", 1237 | "1591211 android 67746 23628 1403 441.0 170.0 \n", 1238 | "1591213 android 67746 23628 1403 441.0 170.0 \n", 1239 | "19692 android 79533 23628 86127 2067.0 62.0 \n", 1240 | "19814 android 79533 23628 86127 2067.0 62.0 \n", 1241 | "20124 android 79533 23628 86127 2067.0 62.0 \n", 1242 | "\n", 1243 | " product_type category_1_1 category_1_2 category_1_4 ... \\\n", 1244 | "1591211 734.0 0.0 0.0 0.0 ... \n", 1245 | "1591213 734.0 0.0 0.0 0.0 ... \n", 1246 | "19692 5609.0 0.0 0.0 1.0 ... \n", 1247 | "19814 5609.0 0.0 0.0 1.0 ... \n", 1248 | "20124 5609.0 0.0 0.0 1.0 ... \n", 1249 | "\n", 1250 | " category_2_73 category_2_74 category_2_75 category_2_76 \\\n", 1251 | "1591211 0.0 0.0 0.0 0.0 \n", 1252 | "1591213 0.0 0.0 0.0 0.0 \n", 1253 | "19692 0.0 0.0 0.0 0.0 \n", 1254 | "19814 0.0 0.0 0.0 0.0 \n", 1255 | "20124 0.0 0.0 0.0 0.0 \n", 1256 | "\n", 1257 | " category_2_77 category_2_78 category_2_79 cumcount_1 cumcount_2 \\\n", 1258 | "1591211 0.0 0.0 0.0 33 2 \n", 1259 | "1591213 0.0 0.0 0.0 34 3 \n", 1260 | "19692 0.0 0.0 0.0 3 1 \n", 1261 | "19814 0.0 0.0 0.0 4 2 \n", 1262 | "20124 0.0 0.0 0.0 5 3 \n", 1263 | "\n", 1264 | " cumcount_3 \n", 1265 | "1591211 1 \n", 1266 | "1591213 2 \n", 1267 | "19692 1 \n", 1268 | "19814 2 \n", 1269 | "20124 3 \n", 1270 | "\n", 1271 | "[5 rows x 104 columns]" 1272 | ] 1273 | }, 1274 | "execution_count": 19, 1275 | "metadata": {}, 1276 | "output_type": "execute_result" 1277 | } 1278 | ], 1279 | "source": [ 1280 | "view_item[view_item.user_id==23628].head()" 1281 | ] 1282 | }, 1283 | { 1284 | "cell_type": "code", 1285 | "execution_count": 20, 1286 | "metadata": {}, 1287 | "outputs": [ 1288 | { 1289 | "data": { 1290 | "text/plain": [ 1291 | "Index(['device_type', 'session_id', 'user_id', 'item_id', 'item_price',\n", 1292 | " 'category_3', 'product_type', 'category_1_1', 'category_1_2',\n", 1293 | " 'category_1_4',\n", 1294 | 
" ...\n", 1295 | " 'category_2_73', 'category_2_74', 'category_2_75', 'category_2_76',\n", 1296 | " 'category_2_77', 'category_2_78', 'category_2_79', 'cumcount_1',\n", 1297 | " 'cumcount_2', 'cumcount_3'],\n", 1298 | " dtype='object', length=104)" 1299 | ] 1300 | }, 1301 | "execution_count": 20, 1302 | "metadata": {}, 1303 | "output_type": "execute_result" 1304 | } 1305 | ], 1306 | "source": [ 1307 | "view_item.columns" 1308 | ] 1309 | }, 1310 | { 1311 | "cell_type": "code", 1312 | "execution_count": 21, 1313 | "metadata": {}, 1314 | "outputs": [ 1315 | { 1316 | "data": { 1317 | "text/plain": [ 1318 | "{'device_type': ['count', 'nunique'],\n", 1319 | " 'session_id': ['count', 'nunique'],\n", 1320 | " 'item_id': ['count', 'nunique'],\n", 1321 | " 'item_price': ['min', 'mean', 'max', 'sum'],\n", 1322 | " 'category_3': ['count', 'nunique', 'mean'],\n", 1323 | " 'product_type': ['count', 'nunique', 'mean'],\n", 1324 | " 'category_1_1': ['sum', 'mean'],\n", 1325 | " 'category_1_2': ['sum', 'mean'],\n", 1326 | " 'category_1_4': ['sum', 'mean'],\n", 1327 | " 'category_1_5': ['sum', 'mean'],\n", 1328 | " 'category_1_6': ['sum', 'mean'],\n", 1329 | " 'category_1_7': ['sum', 'mean'],\n", 1330 | " 'category_1_8': ['sum', 'mean'],\n", 1331 | " 'category_1_9': ['sum', 'mean'],\n", 1332 | " 'category_1_10': ['sum', 'mean'],\n", 1333 | " 'category_1_11': ['sum', 'mean'],\n", 1334 | " 'category_1_12': ['sum', 'mean'],\n", 1335 | " 'category_1_13': ['sum', 'mean'],\n", 1336 | " 'category_1_14': ['sum', 'mean'],\n", 1337 | " 'category_1_15': ['sum', 'mean'],\n", 1338 | " 'category_1_16': ['sum', 'mean'],\n", 1339 | " 'category_1_17': ['sum', 'mean'],\n", 1340 | " 'category_2_1': ['sum', 'mean'],\n", 1341 | " 'category_2_2': ['sum', 'mean'],\n", 1342 | " 'category_2_3': ['sum', 'mean'],\n", 1343 | " 'category_2_4': ['sum', 'mean'],\n", 1344 | " 'category_2_5': ['sum', 'mean'],\n", 1345 | " 'category_2_6': ['sum', 'mean'],\n", 1346 | " 'category_2_7': ['sum', 'mean'],\n", 1347 | " 
'category_2_8': ['sum', 'mean'],\n", 1348 | " 'category_2_9': ['sum', 'mean'],\n", 1349 | " 'category_2_10': ['sum', 'mean'],\n", 1350 | " 'category_2_11': ['sum', 'mean'],\n", 1351 | " 'category_2_12': ['sum', 'mean'],\n", 1352 | " 'category_2_13': ['sum', 'mean'],\n", 1353 | " 'category_2_14': ['sum', 'mean'],\n", 1354 | " 'category_2_15': ['sum', 'mean'],\n", 1355 | " 'category_2_16': ['sum', 'mean'],\n", 1356 | " 'category_2_17': ['sum', 'mean'],\n", 1357 | " 'category_2_18': ['sum', 'mean'],\n", 1358 | " 'category_2_19': ['sum', 'mean'],\n", 1359 | " 'category_2_20': ['sum', 'mean'],\n", 1360 | " 'category_2_21': ['sum', 'mean'],\n", 1361 | " 'category_2_22': ['sum', 'mean'],\n", 1362 | " 'category_2_23': ['sum', 'mean'],\n", 1363 | " 'category_2_24': ['sum', 'mean'],\n", 1364 | " 'category_2_25': ['sum', 'mean'],\n", 1365 | " 'category_2_26': ['sum', 'mean'],\n", 1366 | " 'category_2_27': ['sum', 'mean'],\n", 1367 | " 'category_2_28': ['sum', 'mean'],\n", 1368 | " 'category_2_29': ['sum', 'mean'],\n", 1369 | " 'category_2_30': ['sum', 'mean'],\n", 1370 | " 'category_2_31': ['sum', 'mean'],\n", 1371 | " 'category_2_33': ['sum', 'mean'],\n", 1372 | " 'category_2_34': ['sum', 'mean'],\n", 1373 | " 'category_2_35': ['sum', 'mean'],\n", 1374 | " 'category_2_36': ['sum', 'mean'],\n", 1375 | " 'category_2_37': ['sum', 'mean'],\n", 1376 | " 'category_2_38': ['sum', 'mean'],\n", 1377 | " 'category_2_39': ['sum', 'mean'],\n", 1378 | " 'category_2_40': ['sum', 'mean'],\n", 1379 | " 'category_2_41': ['sum', 'mean'],\n", 1380 | " 'category_2_42': ['sum', 'mean'],\n", 1381 | " 'category_2_43': ['sum', 'mean'],\n", 1382 | " 'category_2_44': ['sum', 'mean'],\n", 1383 | " 'category_2_45': ['sum', 'mean'],\n", 1384 | " 'category_2_46': ['sum', 'mean'],\n", 1385 | " 'category_2_47': ['sum', 'mean'],\n", 1386 | " 'category_2_48': ['sum', 'mean'],\n", 1387 | " 'category_2_49': ['sum', 'mean'],\n", 1388 | " 'category_2_50': ['sum', 'mean'],\n", 1389 | " 'category_2_51': ['sum', 
'mean'],\n", 1390 | " 'category_2_52': ['sum', 'mean'],\n", 1391 | " 'category_2_53': ['sum', 'mean'],\n", 1392 | " 'category_2_54': ['sum', 'mean'],\n", 1393 | " 'category_2_55': ['sum', 'mean'],\n", 1394 | " 'category_2_56': ['sum', 'mean'],\n", 1395 | " 'category_2_57': ['sum', 'mean'],\n", 1396 | " 'category_2_58': ['sum', 'mean'],\n", 1397 | " 'category_2_59': ['sum', 'mean'],\n", 1398 | " 'category_2_60': ['sum', 'mean'],\n", 1399 | " 'category_2_61': ['sum', 'mean'],\n", 1400 | " 'category_2_62': ['sum', 'mean'],\n", 1401 | " 'category_2_63': ['sum', 'mean'],\n", 1402 | " 'category_2_64': ['sum', 'mean'],\n", 1403 | " 'category_2_65': ['sum', 'mean'],\n", 1404 | " 'category_2_66': ['sum', 'mean'],\n", 1405 | " 'category_2_67': ['sum', 'mean'],\n", 1406 | " 'category_2_68': ['sum', 'mean'],\n", 1407 | " 'category_2_69': ['sum', 'mean'],\n", 1408 | " 'category_2_70': ['sum', 'mean'],\n", 1409 | " 'category_2_71': ['sum', 'mean'],\n", 1410 | " 'category_2_72': ['sum', 'mean'],\n", 1411 | " 'category_2_73': ['sum', 'mean'],\n", 1412 | " 'category_2_74': ['sum', 'mean'],\n", 1413 | " 'category_2_75': ['sum', 'mean'],\n", 1414 | " 'category_2_76': ['sum', 'mean'],\n", 1415 | " 'category_2_77': ['sum', 'mean'],\n", 1416 | " 'category_2_78': ['sum', 'mean'],\n", 1417 | " 'category_2_79': ['sum', 'mean'],\n", 1418 | " 'cumcount_1': ['min', 'mean', 'max', 'sum'],\n", 1419 | " 'cumcount_2': ['min', 'mean', 'max', 'sum'],\n", 1420 | " 'cumcount_3': ['min', 'mean', 'max', 'sum']}" 1421 | ] 1422 | }, 1423 | "execution_count": 21, 1424 | "metadata": {}, 1425 | "output_type": "execute_result" 1426 | } 1427 | ], 1428 | "source": [ 1429 | "cat_agg=['count','nunique']\n", 1430 | "num_agg=['min','mean','max','sum']\n", 1431 | "agg_col={\n", 1432 | " 'device_type':cat_agg, 'session_id':cat_agg, 'item_id':cat_agg,'item_price':num_agg,\n", 1433 | " 'category_3':['count','nunique','mean'], 'product_type':['count','nunique','mean']\n", 1434 | "}\n", 1435 | "\n", 1436 | "for k in 
view_item.columns:\n", 1437 | " if k.startswith('category_1') or k.startswith('category_2'):\n", 1438 | " agg_col[k]=['sum','mean']\n", 1439 | " elif k.startswith('server'):\n", 1440 | " agg_col[k]=cat_agg\n", 1441 | " elif k.startswith('cumcount'):\n", 1442 | " agg_col[k]=num_agg\n", 1443 | "agg_col" 1444 | ] 1445 | }, 1446 | { 1447 | "cell_type": "code", 1448 | "execution_count": 22, 1449 | "metadata": {}, 1450 | "outputs": [], 1451 | "source": [ 1452 | "view_item1=view_item.groupby('user_id').agg(agg_col)" 1453 | ] 1454 | }, 1455 | { 1456 | "cell_type": "code", 1457 | "execution_count": 23, 1458 | "metadata": {}, 1459 | "outputs": [ 1460 | { 1461 | "data": { 1462 | "text/html": [ 1463 | "
\n", 1464 | "\n", 1481 | "\n", 1482 | " \n", 1483 | " \n", 1484 | " \n", 1485 | " \n", 1486 | " \n", 1487 | " \n", 1488 | " \n", 1489 | " \n", 1490 | " \n", 1491 | " \n", 1492 | " \n", 1493 | " \n", 1494 | " \n", 1495 | " \n", 1496 | " \n", 1497 | " \n", 1498 | " \n", 1499 | " \n", 1500 | " \n", 1501 | " \n", 1502 | " \n", 1503 | " \n", 1504 | " \n", 1505 | " \n", 1506 | " \n", 1507 | " \n", 1508 | " \n", 1509 | " \n", 1510 | " \n", 1511 | " \n", 1512 | " \n", 1513 | " \n", 1514 | " \n", 1515 | " \n", 1516 | " \n", 1517 | " \n", 1518 | " \n", 1519 | " \n", 1520 | " \n", 1521 | " \n", 1522 | " \n", 1523 | " \n", 1524 | " \n", 1525 | " \n", 1526 | " \n", 1527 | " \n", 1528 | " \n", 1529 | " \n", 1530 | " \n", 1531 | " \n", 1532 | " \n", 1533 | " \n", 1534 | " \n", 1535 | " \n", 1536 | " \n", 1537 | " \n", 1538 | " \n", 1539 | " \n", 1540 | " \n", 1541 | " \n", 1542 | " \n", 1543 | " \n", 1544 | " \n", 1545 | " \n", 1546 | " \n", 1547 | " \n", 1548 | " \n", 1549 | " \n", 1550 | " \n", 1551 | " \n", 1552 | " \n", 1553 | " \n", 1554 | " \n", 1555 | " \n", 1556 | " \n", 1557 | " \n", 1558 | " \n", 1559 | " \n", 1560 | " \n", 1561 | " \n", 1562 | " \n", 1563 | " \n", 1564 | " \n", 1565 | " \n", 1566 | " \n", 1567 | " \n", 1568 | " \n", 1569 | " \n", 1570 | " \n", 1571 | " \n", 1572 | " \n", 1573 | " \n", 1574 | " \n", 1575 | " \n", 1576 | " \n", 1577 | " \n", 1578 | " \n", 1579 | " \n", 1580 | " \n", 1581 | " \n", 1582 | " \n", 1583 | " \n", 1584 | " \n", 1585 | " \n", 1586 | " \n", 1587 | " \n", 1588 | " \n", 1589 | " \n", 1590 | " \n", 1591 | " \n", 1592 | " \n", 1593 | " \n", 1594 | " \n", 1595 | " \n", 1596 | " \n", 1597 | " \n", 1598 | " \n", 1599 | " \n", 1600 | " \n", 1601 | " \n", 1602 | " \n", 1603 | " \n", 1604 | " \n", 1605 | " \n", 1606 | " \n", 1607 | " \n", 1608 | " \n", 1609 | " \n", 1610 | " \n", 1611 | " \n", 1612 | " \n", 1613 | " \n", 1614 | " \n", 1615 | " \n", 1616 | " \n", 1617 | " \n", 1618 | " \n", 1619 | " \n", 1620 | " \n", 1621 | " \n", 1622 | " 
\n", 1623 | " \n", 1624 | " \n", 1625 | " \n", 1626 | " \n", 1627 | " \n", 1628 | " \n", 1629 | " \n", 1630 | " \n", 1631 | " \n", 1632 | " \n", 1633 | " \n", 1634 | " \n", 1635 | " \n", 1636 | " \n", 1637 | " \n", 1638 | " \n", 1639 | " \n", 1640 | " \n", 1641 | " \n", 1642 | " \n", 1643 | " \n", 1644 | " \n", 1645 | " \n", 1646 | " \n", 1647 | " \n", 1648 | " \n", 1649 | " \n", 1650 | " \n", 1651 | " \n", 1652 | " \n", 1653 | " \n", 1654 | " \n", 1655 | " \n", 1656 | " \n", 1657 | " \n", 1658 | " \n", 1659 | " \n", 1660 | " \n", 1661 | " \n", 1662 | " \n", 1663 | " \n", 1664 | " \n", 1665 | "
device_typesession_iditem_iditem_price...cumcount_1cumcount_2cumcount_3
countnuniquecountnuniquecountnuniqueminmeanmaxsum...maxsumminmeanmaxsumminmeanmaxsum
user_id
042142114218332.09395.66666792160.0394618.0...4290312.19047669211.690476671
1818388383.03946.75000012595.031574.0...83611.0000001811.00000018
2165116537165130128.014809.509091281536.02443569.0...1651369511.333333622011.0727274177
3818183537.07257.37500016640.058059.0...83611.87500031511.875000315
42121221977.030114.50000058252.060229.0...2311.0000001211.00000012
\n", 1666 | "

5 rows × 216 columns

\n", 1667 | "
" 1668 | ], 1669 | "text/plain": [ 1670 | " device_type session_id item_id item_price \\\n", 1671 | " count nunique count nunique count nunique min \n", 1672 | "user_id \n", 1673 | "0 42 1 42 11 42 18 332.0 \n", 1674 | "1 8 1 8 3 8 8 383.0 \n", 1675 | "2 165 1 165 37 165 130 128.0 \n", 1676 | "3 8 1 8 1 8 3 537.0 \n", 1677 | "4 2 1 2 1 2 2 1977.0 \n", 1678 | "\n", 1679 | " ... cumcount_1 cumcount_2 \\\n", 1680 | " mean max sum ... max sum min \n", 1681 | "user_id ... \n", 1682 | "0 9395.666667 92160.0 394618.0 ... 42 903 1 \n", 1683 | "1 3946.750000 12595.0 31574.0 ... 8 36 1 \n", 1684 | "2 14809.509091 281536.0 2443569.0 ... 165 13695 1 \n", 1685 | "3 7257.375000 16640.0 58059.0 ... 8 36 1 \n", 1686 | "4 30114.500000 58252.0 60229.0 ... 2 3 1 \n", 1687 | "\n", 1688 | " cumcount_3 \n", 1689 | " mean max sum min mean max sum \n", 1690 | "user_id \n", 1691 | "0 2.190476 6 92 1 1.690476 6 71 \n", 1692 | "1 1.000000 1 8 1 1.000000 1 8 \n", 1693 | "2 1.333333 6 220 1 1.072727 4 177 \n", 1694 | "3 1.875000 3 15 1 1.875000 3 15 \n", 1695 | "4 1.000000 1 2 1 1.000000 1 2 \n", 1696 | "\n", 1697 | "[5 rows x 216 columns]" 1698 | ] 1699 | }, 1700 | "execution_count": 23, 1701 | "metadata": {}, 1702 | "output_type": "execute_result" 1703 | } 1704 | ], 1705 | "source": [ 1706 | "view_item1.head()" 1707 | ] 1708 | }, 1709 | { 1710 | "cell_type": "code", 1711 | "execution_count": 24, 1712 | "metadata": {}, 1713 | "outputs": [ 1714 | { 1715 | "data": { 1716 | "text/html": [ 1717 | "
\n", 1718 | "\n", 1731 | "\n", 1732 | " \n", 1733 | " \n", 1734 | " \n", 1735 | " \n", 1736 | " \n", 1737 | " \n", 1738 | " \n", 1739 | " \n", 1740 | " \n", 1741 | " \n", 1742 | " \n", 1743 | " \n", 1744 | " \n", 1745 | " \n", 1746 | " \n", 1747 | " \n", 1748 | " \n", 1749 | " \n", 1750 | " \n", 1751 | " \n", 1752 | " \n", 1753 | " \n", 1754 | " \n", 1755 | " \n", 1756 | " \n", 1757 | " \n", 1758 | " \n", 1759 | " \n", 1760 | " \n", 1761 | " \n", 1762 | " \n", 1763 | " \n", 1764 | " \n", 1765 | " \n", 1766 | " \n", 1767 | " \n", 1768 | " \n", 1769 | " \n", 1770 | " \n", 1771 | " \n", 1772 | " \n", 1773 | " \n", 1774 | " \n", 1775 | " \n", 1776 | " \n", 1777 | " \n", 1778 | " \n", 1779 | " \n", 1780 | " \n", 1781 | " \n", 1782 | " \n", 1783 | " \n", 1784 | " \n", 1785 | " \n", 1786 | " \n", 1787 | " \n", 1788 | " \n", 1789 | " \n", 1790 | " \n", 1791 | " \n", 1792 | " \n", 1793 | " \n", 1794 | " \n", 1795 | " \n", 1796 | " \n", 1797 | " \n", 1798 | " \n", 1799 | " \n", 1800 | " \n", 1801 | " \n", 1802 | " \n", 1803 | " \n", 1804 | " \n", 1805 | " \n", 1806 | " \n", 1807 | " \n", 1808 | " \n", 1809 | " \n", 1810 | " \n", 1811 | " \n", 1812 | " \n", 1813 | " \n", 1814 | " \n", 1815 | " \n", 1816 | " \n", 1817 | " \n", 1818 | " \n", 1819 | " \n", 1820 | " \n", 1821 | " \n", 1822 | " \n", 1823 | " \n", 1824 | " \n", 1825 | " \n", 1826 | " \n", 1827 | " \n", 1828 | " \n", 1829 | " \n", 1830 | " \n", 1831 | " \n", 1832 | " \n", 1833 | " \n", 1834 | " \n", 1835 | " \n", 1836 | " \n", 1837 | " \n", 1838 | " \n", 1839 | " \n", 1840 | " \n", 1841 | " \n", 1842 | " \n", 1843 | " \n", 1844 | " \n", 1845 | " \n", 1846 | " \n", 1847 | " \n", 1848 | " \n", 1849 | " \n", 1850 | " \n", 1851 | " \n", 1852 | " \n", 1853 | " \n", 1854 | " \n", 1855 | " \n", 1856 | " \n", 1857 | " \n", 1858 | " \n", 1859 | " \n", 1860 | " \n", 1861 | " \n", 1862 | " \n", 1863 | " \n", 1864 | " \n", 1865 | " \n", 1866 | " \n", 1867 | " \n", 1868 | " \n", 1869 | " \n", 1870 | " \n", 1871 | " \n", 1872 | " 
\n", 1873 | " \n", 1874 | " \n", 1875 | " \n", 1876 | " \n", 1877 | " \n", 1878 | " \n", 1879 | " \n", 1880 | "
user_idJ_device_type_countJ_device_type_nuniqueJ_session_id_countJ_session_id_nuniqueJ_item_id_countJ_item_id_nuniqueJ_item_price_minJ_item_price_meanJ_item_price_max...J_cumcount_1_maxJ_cumcount_1_sumJ_cumcount_2_minJ_cumcount_2_meanJ_cumcount_2_maxJ_cumcount_2_sumJ_cumcount_3_minJ_cumcount_3_meanJ_cumcount_3_maxJ_cumcount_3_sum
0042142114218332.09395.66666792160.0...4290312.19047669211.690476671
11818388383.03946.75000012595.0...83611.0000001811.00000018
22165116537165130128.014809.509091281536.0...1651369511.333333622011.0727274177
33818183537.07257.37500016640.0...83611.87500031511.875000315
442121221977.030114.50000058252.0...2311.0000001211.00000012
\n", 1881 | "

5 rows × 217 columns

\n", 1882 | "
" 1883 | ], 1884 | "text/plain": [ 1885 | " user_id J_device_type_count J_device_type_nunique J_session_id_count \\\n", 1886 | "0 0 42 1 42 \n", 1887 | "1 1 8 1 8 \n", 1888 | "2 2 165 1 165 \n", 1889 | "3 3 8 1 8 \n", 1890 | "4 4 2 1 2 \n", 1891 | "\n", 1892 | " J_session_id_nunique J_item_id_count J_item_id_nunique J_item_price_min \\\n", 1893 | "0 11 42 18 332.0 \n", 1894 | "1 3 8 8 383.0 \n", 1895 | "2 37 165 130 128.0 \n", 1896 | "3 1 8 3 537.0 \n", 1897 | "4 1 2 2 1977.0 \n", 1898 | "\n", 1899 | " J_item_price_mean J_item_price_max ... J_cumcount_1_max \\\n", 1900 | "0 9395.666667 92160.0 ... 42 \n", 1901 | "1 3946.750000 12595.0 ... 8 \n", 1902 | "2 14809.509091 281536.0 ... 165 \n", 1903 | "3 7257.375000 16640.0 ... 8 \n", 1904 | "4 30114.500000 58252.0 ... 2 \n", 1905 | "\n", 1906 | " J_cumcount_1_sum J_cumcount_2_min J_cumcount_2_mean J_cumcount_2_max \\\n", 1907 | "0 903 1 2.190476 6 \n", 1908 | "1 36 1 1.000000 1 \n", 1909 | "2 13695 1 1.333333 6 \n", 1910 | "3 36 1 1.875000 3 \n", 1911 | "4 3 1 1.000000 1 \n", 1912 | "\n", 1913 | " J_cumcount_2_sum J_cumcount_3_min J_cumcount_3_mean J_cumcount_3_max \\\n", 1914 | "0 92 1 1.690476 6 \n", 1915 | "1 8 1 1.000000 1 \n", 1916 | "2 220 1 1.072727 4 \n", 1917 | "3 15 1 1.875000 3 \n", 1918 | "4 2 1 1.000000 1 \n", 1919 | "\n", 1920 | " J_cumcount_3_sum \n", 1921 | "0 71 \n", 1922 | "1 8 \n", 1923 | "2 177 \n", 1924 | "3 15 \n", 1925 | "4 2 \n", 1926 | "\n", 1927 | "[5 rows x 217 columns]" 1928 | ] 1929 | }, 1930 | "execution_count": 24, 1931 | "metadata": {}, 1932 | "output_type": "execute_result" 1933 | } 1934 | ], 1935 | "source": [ 1936 | "view_item1.columns=['J_' + '_'.join(col).strip() for col in view_item1.columns.values]\n", 1937 | "view_item1.reset_index(inplace=True)\n", 1938 | "view_item1.head()" 1939 | ] 1940 | }, 1941 | { 1942 | "cell_type": "code", 1943 | "execution_count": 25, 1944 | "metadata": {}, 1945 | "outputs": [ 1946 | { 1947 | "data": { 1948 | "text/plain": [ 1949 | "(132761, (89157, 
217))" 1950 | ] 1951 | }, 1952 | "execution_count": 25, 1953 | "metadata": {}, 1954 | "output_type": "execute_result" 1955 | } 1956 | ], 1957 | "source": [ 1958 | "item.item_id.nunique(),view_item1.shape" 1959 | ] 1960 | }, 1961 | { 1962 | "cell_type": "code", 1963 | "execution_count": 26, 1964 | "metadata": {}, 1965 | "outputs": [ 1966 | { 1967 | "data": { 1968 | "text/plain": [ 1969 | "67" 1970 | ] 1971 | }, 1972 | "execution_count": 26, 1973 | "metadata": {}, 1974 | "output_type": "execute_result" 1975 | } 1976 | ], 1977 | "source": [ 1978 | "del view_item\n", 1979 | "import gc \n", 1980 | "gc.collect()" 1981 | ] 1982 | }, 1983 | { 1984 | "cell_type": "code", 1985 | "execution_count": 27, 1986 | "metadata": {}, 1987 | "outputs": [ 1988 | { 1989 | "data": { 1990 | "text/html": [ 1991 | "
\n", 1992 | "\n", 2005 | "\n", 2006 | " \n", 2007 | " \n", 2008 | " \n", 2009 | " \n", 2010 | " \n", 2011 | " \n", 2012 | " \n", 2013 | " \n", 2014 | " \n", 2015 | " \n", 2016 | " \n", 2017 | " \n", 2018 | " \n", 2019 | " \n", 2020 | " \n", 2021 | " \n", 2022 | " \n", 2023 | " \n", 2024 | " \n", 2025 | " \n", 2026 | " \n", 2027 | " \n", 2028 | " \n", 2029 | " \n", 2030 | " \n", 2031 | " \n", 2032 | " \n", 2033 | " \n", 2034 | " \n", 2035 | " \n", 2036 | " \n", 2037 | " \n", 2038 | " \n", 2039 | " \n", 2040 | "
impression_idis_click
0a9e7126a585a69a32bc7414e9d0c0ada0
1caac14a5bf2ba283db7708bb348557600
213f10ba306a19ce7bec2f3cae507b6980
339c4b4dc0e9701b55a0a4f072008fb3f0
4bf5a572cca75f5fc67f4b14e58b11d700
\n", 2041 | "
" 2042 | ], 2043 | "text/plain": [ 2044 | " impression_id is_click\n", 2045 | "0 a9e7126a585a69a32bc7414e9d0c0ada 0\n", 2046 | "1 caac14a5bf2ba283db7708bb34855760 0\n", 2047 | "2 13f10ba306a19ce7bec2f3cae507b698 0\n", 2048 | "3 39c4b4dc0e9701b55a0a4f072008fb3f 0\n", 2049 | "4 bf5a572cca75f5fc67f4b14e58b11d70 0" 2050 | ] 2051 | }, 2052 | "execution_count": 27, 2053 | "metadata": {}, 2054 | "output_type": "execute_result" 2055 | } 2056 | ], 2057 | "source": [ 2058 | "s.head()" 2059 | ] 2060 | }, 2061 | { 2062 | "cell_type": "code", 2063 | "execution_count": 28, 2064 | "metadata": {}, 2065 | "outputs": [ 2066 | { 2067 | "name": "stderr", 2068 | "output_type": "stream", 2069 | "text": [ 2070 | "/opt/conda/lib/python3.6/site-packages/pandas/core/frame.py:7116: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n", 2071 | "of pandas will change to not sort by default.\n", 2072 | "\n", 2073 | "To accept the future behavior, pass 'sort=False'.\n", 2074 | "\n", 2075 | "To retain the current behavior and silence the warning, pass 'sort=True'.\n", 2076 | "\n", 2077 | " sort=sort,\n" 2078 | ] 2079 | }, 2080 | { 2081 | "data": { 2082 | "text/html": [ 2083 | "
\n", 2084 | "\n", 2097 | "\n", 2098 | " \n", 2099 | " \n", 2100 | " \n", 2101 | " \n", 2102 | " \n", 2103 | " \n", 2104 | " \n", 2105 | " \n", 2106 | " \n", 2107 | " \n", 2108 | " \n", 2109 | " \n", 2110 | " \n", 2111 | " \n", 2112 | " \n", 2113 | " \n", 2114 | " \n", 2115 | " \n", 2116 | " \n", 2117 | " \n", 2118 | " \n", 2119 | " \n", 2120 | " \n", 2121 | " \n", 2122 | " \n", 2123 | " \n", 2124 | " \n", 2125 | " \n", 2126 | " \n", 2127 | " \n", 2128 | " \n", 2129 | " \n", 2130 | " \n", 2131 | " \n", 2132 | " \n", 2133 | " \n", 2134 | " \n", 2135 | " \n", 2136 | " \n", 2137 | " \n", 2138 | " \n", 2139 | " \n", 2140 | " \n", 2141 | " \n", 2142 | " \n", 2143 | " \n", 2144 | " \n", 2145 | " \n", 2146 | " \n", 2147 | " \n", 2148 | " \n", 2149 | " \n", 2150 | " \n", 2151 | " \n", 2152 | " \n", 2153 | " \n", 2154 | " \n", 2155 | " \n", 2156 | " \n", 2157 | " \n", 2158 | " \n", 2159 | " \n", 2160 | " \n", 2161 | " \n", 2162 | "
app_codeimpression_idimpression_timeis_4Gis_clickos_versionuser_id
0422c4ca4238a0b923820dcc509a6f75849b2018-11-15 00:00:0000.0old87862
146745c48cce2e2d7fbdea1afc51c7c6ad262018-11-15 00:01:0011.0latest63410
225970efdf2ec9b086079795c442636b55fb2018-11-15 00:02:0010.0intermediate71748
32448e296a067a37563370ded05f5a3bf3ec2018-11-15 00:02:0010.0latest69209
4473182be0c5cdcd5072bb1864cdee4d3d6e2018-11-15 00:02:0000.0latest62873
\n", 2163 | "
" 2164 | ], 2165 | "text/plain": [ 2166 | " app_code impression_id impression_time is_4G \\\n", 2167 | "0 422 c4ca4238a0b923820dcc509a6f75849b 2018-11-15 00:00:00 0 \n", 2168 | "1 467 45c48cce2e2d7fbdea1afc51c7c6ad26 2018-11-15 00:01:00 1 \n", 2169 | "2 259 70efdf2ec9b086079795c442636b55fb 2018-11-15 00:02:00 1 \n", 2170 | "3 244 8e296a067a37563370ded05f5a3bf3ec 2018-11-15 00:02:00 1 \n", 2171 | "4 473 182be0c5cdcd5072bb1864cdee4d3d6e 2018-11-15 00:02:00 0 \n", 2172 | "\n", 2173 | " is_click os_version user_id \n", 2174 | "0 0.0 old 87862 \n", 2175 | "1 1.0 latest 63410 \n", 2176 | "2 0.0 intermediate 71748 \n", 2177 | "3 0.0 latest 69209 \n", 2178 | "4 0.0 latest 62873 " 2179 | ] 2180 | }, 2181 | "execution_count": 28, 2182 | "metadata": {}, 2183 | "output_type": "execute_result" 2184 | } 2185 | ], 2186 | "source": [ 2187 | "df=train.append(test,ignore_index=True)\n", 2188 | "df.shape\n", 2189 | "df.head()" 2190 | ] 2191 | }, 2192 | { 2193 | "cell_type": "code", 2194 | "execution_count": 29, 2195 | "metadata": {}, 2196 | "outputs": [ 2197 | { 2198 | "data": { 2199 | "text/html": [ 2200 | "
\n", 2201 | "\n", 2214 | "\n", 2215 | " \n", 2216 | " \n", 2217 | " \n", 2218 | " \n", 2219 | " \n", 2220 | " \n", 2221 | " \n", 2222 | " \n", 2223 | " \n", 2224 | " \n", 2225 | " \n", 2226 | " \n", 2227 | " \n", 2228 | " \n", 2229 | " \n", 2230 | " \n", 2231 | " \n", 2232 | " \n", 2233 | " \n", 2234 | " \n", 2235 | " \n", 2236 | " \n", 2237 | " \n", 2238 | " \n", 2239 | " \n", 2240 | " \n", 2241 | " \n", 2242 | " \n", 2243 | " \n", 2244 | " \n", 2245 | " \n", 2246 | " \n", 2247 | " \n", 2248 | " \n", 2249 | " \n", 2250 | " \n", 2251 | " \n", 2252 | " \n", 2253 | " \n", 2254 | " \n", 2255 | " \n", 2256 | " \n", 2257 | " \n", 2258 | " \n", 2259 | " \n", 2260 | " \n", 2261 | " \n", 2262 | " \n", 2263 | " \n", 2264 | " \n", 2265 | " \n", 2266 | " \n", 2267 | " \n", 2268 | " \n", 2269 | " \n", 2270 | " \n", 2271 | " \n", 2272 | " \n", 2273 | " \n", 2274 | " \n", 2275 | " \n", 2276 | " \n", 2277 | " \n", 2278 | " \n", 2279 | "
app_codeimpression_idimpression_timeis_4Gis_clickos_versionuser_id
0422c4ca4238a0b923820dcc509a6f75849b2018-11-15 00:00:0000.0old87862
146745c48cce2e2d7fbdea1afc51c7c6ad262018-11-15 00:01:0011.0latest63410
225970efdf2ec9b086079795c442636b55fb2018-11-15 00:02:0010.0intermediate71748
32448e296a067a37563370ded05f5a3bf3ec2018-11-15 00:02:0010.0latest69209
4473182be0c5cdcd5072bb1864cdee4d3d6e2018-11-15 00:02:0000.0latest62873
\n", 2280 | "
" 2281 | ], 2282 | "text/plain": [ 2283 | " app_code impression_id impression_time is_4G \\\n", 2284 | "0 422 c4ca4238a0b923820dcc509a6f75849b 2018-11-15 00:00:00 0 \n", 2285 | "1 467 45c48cce2e2d7fbdea1afc51c7c6ad26 2018-11-15 00:01:00 1 \n", 2286 | "2 259 70efdf2ec9b086079795c442636b55fb 2018-11-15 00:02:00 1 \n", 2287 | "3 244 8e296a067a37563370ded05f5a3bf3ec 2018-11-15 00:02:00 1 \n", 2288 | "4 473 182be0c5cdcd5072bb1864cdee4d3d6e 2018-11-15 00:02:00 0 \n", 2289 | "\n", 2290 | " is_click os_version user_id \n", 2291 | "0 0.0 old 87862 \n", 2292 | "1 1.0 latest 63410 \n", 2293 | "2 0.0 intermediate 71748 \n", 2294 | "3 0.0 latest 69209 \n", 2295 | "4 0.0 latest 62873 " 2296 | ] 2297 | }, 2298 | "execution_count": 29, 2299 | "metadata": {}, 2300 | "output_type": "execute_result" 2301 | } 2302 | ], 2303 | "source": [ 2304 | "df.head()" 2305 | ] 2306 | }, 2307 | { 2308 | "cell_type": "code", 2309 | "execution_count": 30, 2310 | "metadata": {}, 2311 | "outputs": [], 2312 | "source": [ 2313 | "df=pd.get_dummies(df,columns=['os_version'],drop_first=True)" 2314 | ] 2315 | }, 2316 | { 2317 | "cell_type": "code", 2318 | "execution_count": 31, 2319 | "metadata": {}, 2320 | "outputs": [ 2321 | { 2322 | "data": { 2323 | "text/html": [ 2324 | "
\n", 2325 | "\n", 2338 | "\n", 2339 | " \n", 2340 | " \n", 2341 | " \n", 2342 | " \n", 2343 | " \n", 2344 | " \n", 2345 | " \n", 2346 | " \n", 2347 | " \n", 2348 | " \n", 2349 | " \n", 2350 | " \n", 2351 | " \n", 2352 | " \n", 2353 | " \n", 2354 | " \n", 2355 | " \n", 2356 | " \n", 2357 | " \n", 2358 | " \n", 2359 | " \n", 2360 | " \n", 2361 | " \n", 2362 | " \n", 2363 | " \n", 2364 | " \n", 2365 | " \n", 2366 | " \n", 2367 | " \n", 2368 | " \n", 2369 | " \n", 2370 | " \n", 2371 | " \n", 2372 | " \n", 2373 | " \n", 2374 | " \n", 2375 | " \n", 2376 | " \n", 2377 | " \n", 2378 | " \n", 2379 | " \n", 2380 | " \n", 2381 | " \n", 2382 | " \n", 2383 | " \n", 2384 | " \n", 2385 | " \n", 2386 | " \n", 2387 | " \n", 2388 | " \n", 2389 | " \n", 2390 | " \n", 2391 | " \n", 2392 | " \n", 2393 | " \n", 2394 | " \n", 2395 | " \n", 2396 | " \n", 2397 | " \n", 2398 | " \n", 2399 | " \n", 2400 | " \n", 2401 | " \n", 2402 | " \n", 2403 | "
app_codeimpression_idis_4Gis_clickuser_idos_version_latestos_version_old
0422c4ca4238a0b923820dcc509a6f75849b00.08786201
146745c48cce2e2d7fbdea1afc51c7c6ad2611.06341010
225970efdf2ec9b086079795c442636b55fb10.07174800
32448e296a067a37563370ded05f5a3bf3ec10.06920910
4473182be0c5cdcd5072bb1864cdee4d3d6e00.06287310
\n", 2404 | "
" 2405 | ], 2406 | "text/plain": [ 2407 | " app_code impression_id is_4G is_click user_id \\\n", 2408 | "0 422 c4ca4238a0b923820dcc509a6f75849b 0 0.0 87862 \n", 2409 | "1 467 45c48cce2e2d7fbdea1afc51c7c6ad26 1 1.0 63410 \n", 2410 | "2 259 70efdf2ec9b086079795c442636b55fb 1 0.0 71748 \n", 2411 | "3 244 8e296a067a37563370ded05f5a3bf3ec 1 0.0 69209 \n", 2412 | "4 473 182be0c5cdcd5072bb1864cdee4d3d6e 0 0.0 62873 \n", 2413 | "\n", 2414 | " os_version_latest os_version_old \n", 2415 | "0 0 1 \n", 2416 | "1 1 0 \n", 2417 | "2 0 0 \n", 2418 | "3 1 0 \n", 2419 | "4 1 0 " 2420 | ] 2421 | }, 2422 | "execution_count": 31, 2423 | "metadata": {}, 2424 | "output_type": "execute_result" 2425 | } 2426 | ], 2427 | "source": [ 2428 | "#preprocessing for df\n", 2429 | "df['impression_time']=pd.to_datetime(df['impression_time'],format='%Y-%m-%d %H:%M:%S')\n", 2430 | "df['is_4G']=df['is_4G'].astype('category')\n", 2431 | "df['app_code']=df['app_code'].astype('category')\n", 2432 | "\n", 2433 | "df.drop(['impression_time'],axis=1,inplace=True)\n", 2434 | "\n", 2435 | "df.head()" 2436 | ] 2437 | }, 2438 | { 2439 | "cell_type": "code", 2440 | "execution_count": 32, 2441 | "metadata": {}, 2442 | "outputs": [ 2443 | { 2444 | "data": { 2445 | "text/html": [ 2446 | "
\n", 2447 | "\n", 2460 | "\n", 2461 | " \n", 2462 | " \n", 2463 | " \n", 2464 | " \n", 2465 | " \n", 2466 | " \n", 2467 | " \n", 2468 | " \n", 2469 | " \n", 2470 | " \n", 2471 | " \n", 2472 | " \n", 2473 | " \n", 2474 | " \n", 2475 | " \n", 2476 | " \n", 2477 | " \n", 2478 | " \n", 2479 | " \n", 2480 | " \n", 2481 | " \n", 2482 | " \n", 2483 | " \n", 2484 | " \n", 2485 | " \n", 2486 | " \n", 2487 | " \n", 2488 | " \n", 2489 | " \n", 2490 | " \n", 2491 | " \n", 2492 | " \n", 2493 | " \n", 2494 | " \n", 2495 | " \n", 2496 | " \n", 2497 | " \n", 2498 | " \n", 2499 | " \n", 2500 | " \n", 2501 | " \n", 2502 | " \n", 2503 | " \n", 2504 | " \n", 2505 | " \n", 2506 | " \n", 2507 | " \n", 2508 | " \n", 2509 | " \n", 2510 | " \n", 2511 | " \n", 2512 | " \n", 2513 | " \n", 2514 | " \n", 2515 | " \n", 2516 | " \n", 2517 | " \n", 2518 | " \n", 2519 | " \n", 2520 | " \n", 2521 | " \n", 2522 | " \n", 2523 | " \n", 2524 | " \n", 2525 | " \n", 2526 | " \n", 2527 | " \n", 2528 | " \n", 2529 | " \n", 2530 | " \n", 2531 | " \n", 2532 | " \n", 2533 | " \n", 2534 | " \n", 2535 | " \n", 2536 | " \n", 2537 | " \n", 2538 | " \n", 2539 | " \n", 2540 | " \n", 2541 | " \n", 2542 | " \n", 2543 | " \n", 2544 | " \n", 2545 | " \n", 2546 | " \n", 2547 | " \n", 2548 | " \n", 2549 | " \n", 2550 | " \n", 2551 | " \n", 2552 | " \n", 2553 | " \n", 2554 | " \n", 2555 | " \n", 2556 | " \n", 2557 | " \n", 2558 | " \n", 2559 | " \n", 2560 | " \n", 2561 | " \n", 2562 | " \n", 2563 | " \n", 2564 | " \n", 2565 | " \n", 2566 | " \n", 2567 | " \n", 2568 | " \n", 2569 | " \n", 2570 | " \n", 2571 | " \n", 2572 | " \n", 2573 | " \n", 2574 | " \n", 2575 | " \n", 2576 | " \n", 2577 | " \n", 2578 | " \n", 2579 | " \n", 2580 | " \n", 2581 | " \n", 2582 | " \n", 2583 | " \n", 2584 | " \n", 2585 | " \n", 2586 | " \n", 2587 | " \n", 2588 | " \n", 2589 | " \n", 2590 | " \n", 2591 | " \n", 2592 | " \n", 2593 | " \n", 2594 | " \n", 2595 | " \n", 2596 | " \n", 2597 | " \n", 2598 | " \n", 2599 | " \n", 2600 | " \n", 2601 | " 
\n", 2602 | " \n", 2603 | " \n", 2604 | " \n", 2605 | " \n", 2606 | " \n", 2607 | " \n", 2608 | " \n", 2609 | "
app_codeimpression_idis_4Gis_clickuser_idos_version_latestos_version_oldJ_device_type_countJ_device_type_nuniqueJ_session_id_count...J_cumcount_1_maxJ_cumcount_1_sumJ_cumcount_2_minJ_cumcount_2_meanJ_cumcount_2_maxJ_cumcount_2_sumJ_cumcount_3_minJ_cumcount_3_meanJ_cumcount_3_maxJ_cumcount_3_sum
0422c4ca4238a0b923820dcc509a6f75849b00.08786201111...1111.0000001111.00000011
146745c48cce2e2d7fbdea1afc51c7c6ad2611.0634101012112...127811.91666752311.166667214
225970efdf2ec9b086079795c442636b55fb10.07174800212...2311.0000001211.00000012
32448e296a067a37563370ded05f5a3bf3ec10.0692091018118...1817111.38888932511.277778323
4473182be0c5cdcd5072bb1864cdee4d3d6e00.0628731046146...46108111.45652256711.130435252
\n", 2610 | "

5 rows × 223 columns

\n", 2611 | "
" 2612 | ], 2613 | "text/plain": [ 2614 | " app_code impression_id is_4G is_click user_id \\\n", 2615 | "0 422 c4ca4238a0b923820dcc509a6f75849b 0 0.0 87862 \n", 2616 | "1 467 45c48cce2e2d7fbdea1afc51c7c6ad26 1 1.0 63410 \n", 2617 | "2 259 70efdf2ec9b086079795c442636b55fb 1 0.0 71748 \n", 2618 | "3 244 8e296a067a37563370ded05f5a3bf3ec 1 0.0 69209 \n", 2619 | "4 473 182be0c5cdcd5072bb1864cdee4d3d6e 0 0.0 62873 \n", 2620 | "\n", 2621 | " os_version_latest os_version_old J_device_type_count \\\n", 2622 | "0 0 1 1 \n", 2623 | "1 1 0 12 \n", 2624 | "2 0 0 2 \n", 2625 | "3 1 0 18 \n", 2626 | "4 1 0 46 \n", 2627 | "\n", 2628 | " J_device_type_nunique J_session_id_count ... J_cumcount_1_max \\\n", 2629 | "0 1 1 ... 1 \n", 2630 | "1 1 12 ... 12 \n", 2631 | "2 1 2 ... 2 \n", 2632 | "3 1 18 ... 18 \n", 2633 | "4 1 46 ... 46 \n", 2634 | "\n", 2635 | " J_cumcount_1_sum J_cumcount_2_min J_cumcount_2_mean J_cumcount_2_max \\\n", 2636 | "0 1 1 1.000000 1 \n", 2637 | "1 78 1 1.916667 5 \n", 2638 | "2 3 1 1.000000 1 \n", 2639 | "3 171 1 1.388889 3 \n", 2640 | "4 1081 1 1.456522 5 \n", 2641 | "\n", 2642 | " J_cumcount_2_sum J_cumcount_3_min J_cumcount_3_mean J_cumcount_3_max \\\n", 2643 | "0 1 1 1.000000 1 \n", 2644 | "1 23 1 1.166667 2 \n", 2645 | "2 2 1 1.000000 1 \n", 2646 | "3 25 1 1.277778 3 \n", 2647 | "4 67 1 1.130435 2 \n", 2648 | "\n", 2649 | " J_cumcount_3_sum \n", 2650 | "0 1 \n", 2651 | "1 14 \n", 2652 | "2 2 \n", 2653 | "3 23 \n", 2654 | "4 52 \n", 2655 | "\n", 2656 | "[5 rows x 223 columns]" 2657 | ] 2658 | }, 2659 | "execution_count": 32, 2660 | "metadata": {}, 2661 | "output_type": "execute_result" 2662 | } 2663 | ], 2664 | "source": [ 2665 | "df=df.merge(view_item1,on='user_id',how='left')\n", 2666 | "\n", 2667 | "df.shape\n", 2668 | "df.head()" 2669 | ] 2670 | }, 2671 | { 2672 | "cell_type": "code", 2673 | "execution_count": 33, 2674 | "metadata": {}, 2675 | "outputs": [], 2676 | "source": [ 2677 | "df['user_id']=df['user_id'].astype('category')\n", 2678 | 
"df['is_4G']=df['is_4G'].astype('category')" 2679 | ] 2680 | }, 2681 | { 2682 | "cell_type": "code", 2683 | "execution_count": 34, 2684 | "metadata": {}, 2685 | "outputs": [ 2686 | { 2687 | "data": { 2688 | "text/plain": [ 2689 | "app_code 0\n", 2690 | "impression_id 0\n", 2691 | "is_4G 0\n", 2692 | "is_click 90675\n", 2693 | "user_id 0\n", 2694 | " ... \n", 2695 | "J_cumcount_2_sum 0\n", 2696 | "J_cumcount_3_min 0\n", 2697 | "J_cumcount_3_mean 0\n", 2698 | "J_cumcount_3_max 0\n", 2699 | "J_cumcount_3_sum 0\n", 2700 | "Length: 223, dtype: int64" 2701 | ] 2702 | }, 2703 | "execution_count": 34, 2704 | "metadata": {}, 2705 | "output_type": "execute_result" 2706 | } 2707 | ], 2708 | "source": [ 2709 | "df.isnull().sum()" 2710 | ] 2711 | }, 2712 | { 2713 | "cell_type": "code", 2714 | "execution_count": 35, 2715 | "metadata": {}, 2716 | "outputs": [ 2717 | { 2718 | "name": "stdout", 2719 | "output_type": "stream", 2720 | "text": [ 2721 | "(237609, 223) (90675, 223)\n" 2722 | ] 2723 | } 2724 | ], 2725 | "source": [ 2726 | "df_train=df[df['is_click'].isnull()==False].copy()\n", 2727 | "df_test=df[df['is_click'].isnull()==True].copy()\n", 2728 | "\n", 2729 | "print(df_train.shape,df_test.shape)" 2730 | ] 2731 | }, 2732 | { 2733 | "cell_type": "code", 2734 | "execution_count": 37, 2735 | "metadata": {}, 2736 | "outputs": [ 2737 | { 2738 | "data": { 2739 | "text/html": [ 2740 | "
\n", 2741 | "\n", 2754 | "\n", 2755 | " \n", 2756 | " \n", 2757 | " \n", 2758 | " \n", 2759 | " \n", 2760 | " \n", 2761 | " \n", 2762 | " \n", 2763 | " \n", 2764 | " \n", 2765 | " \n", 2766 | " \n", 2767 | " \n", 2768 | " \n", 2769 | " \n", 2770 | " \n", 2771 | " \n", 2772 | " \n", 2773 | " \n", 2774 | " \n", 2775 | " \n", 2776 | " \n", 2777 | " \n", 2778 | " \n", 2779 | " \n", 2780 | " \n", 2781 | " \n", 2782 | " \n", 2783 | " \n", 2784 | " \n", 2785 | " \n", 2786 | " \n", 2787 | " \n", 2788 | " \n", 2789 | " \n", 2790 | " \n", 2791 | " \n", 2792 | " \n", 2793 | " \n", 2794 | " \n", 2795 | " \n", 2796 | " \n", 2797 | " \n", 2798 | " \n", 2799 | " \n", 2800 | " \n", 2801 | " \n", 2802 | " \n", 2803 | " \n", 2804 | " \n", 2805 | " \n", 2806 | " \n", 2807 | " \n", 2808 | " \n", 2809 | " \n", 2810 | " \n", 2811 | " \n", 2812 | " \n", 2813 | " \n", 2814 | " \n", 2815 | " \n", 2816 | " \n", 2817 | " \n", 2818 | " \n", 2819 | " \n", 2820 | " \n", 2821 | " \n", 2822 | " \n", 2823 | " \n", 2824 | " \n", 2825 | " \n", 2826 | " \n", 2827 | " \n", 2828 | " \n", 2829 | " \n", 2830 | " \n", 2831 | " \n", 2832 | " \n", 2833 | " \n", 2834 | " \n", 2835 | " \n", 2836 | " \n", 2837 | " \n", 2838 | " \n", 2839 | " \n", 2840 | " \n", 2841 | " \n", 2842 | " \n", 2843 | " \n", 2844 | " \n", 2845 | " \n", 2846 | " \n", 2847 | " \n", 2848 | " \n", 2849 | " \n", 2850 | " \n", 2851 | " \n", 2852 | " \n", 2853 | " \n", 2854 | " \n", 2855 | " \n", 2856 | " \n", 2857 | " \n", 2858 | " \n", 2859 | " \n", 2860 | " \n", 2861 | " \n", 2862 | " \n", 2863 | " \n", 2864 | " \n", 2865 | " \n", 2866 | " \n", 2867 | " \n", 2868 | " \n", 2869 | " \n", 2870 | " \n", 2871 | " \n", 2872 | " \n", 2873 | " \n", 2874 | " \n", 2875 | " \n", 2876 | " \n", 2877 | " \n", 2878 | " \n", 2879 | " \n", 2880 | " \n", 2881 | " \n", 2882 | " \n", 2883 | " \n", 2884 | " \n", 2885 | " \n", 2886 | " \n", 2887 | " \n", 2888 | " \n", 2889 | " \n", 2890 | " \n", 2891 | " \n", 2892 | " \n", 2893 | " \n", 2894 | " \n", 2895 | " 
\n", 2896 | " \n", 2897 | " \n", 2898 | " \n", 2899 | " \n", 2900 | " \n", 2901 | " \n", 2902 | " \n", 2903 | "
app_codeimpression_idis_4Gis_clickuser_idos_version_latestos_version_oldJ_device_type_countJ_device_type_nuniqueJ_session_id_count...J_cumcount_1_maxJ_cumcount_1_sumJ_cumcount_2_minJ_cumcount_2_meanJ_cumcount_2_maxJ_cumcount_2_sumJ_cumcount_3_minJ_cumcount_3_meanJ_cumcount_3_maxJ_cumcount_3_sum
0422c4ca4238a0b923820dcc509a6f75849b00.08786201111...1111.0000001111.00000011
146745c48cce2e2d7fbdea1afc51c7c6ad2611.0634101012112...127811.91666752311.166667214
225970efdf2ec9b086079795c442636b55fb10.07174800212...2311.0000001211.00000012
32448e296a067a37563370ded05f5a3bf3ec10.0692091018118...1817111.38888932511.277778323
4473182be0c5cdcd5072bb1864cdee4d3d6e00.0628731046146...46108111.45652256711.130435252
\n", 2904 | "

5 rows × 223 columns

\n", 2905 | "
" 2906 | ], 2907 | "text/plain": [ 2908 | " app_code impression_id is_4G is_click user_id \\\n", 2909 | "0 422 c4ca4238a0b923820dcc509a6f75849b 0 0.0 87862 \n", 2910 | "1 467 45c48cce2e2d7fbdea1afc51c7c6ad26 1 1.0 63410 \n", 2911 | "2 259 70efdf2ec9b086079795c442636b55fb 1 0.0 71748 \n", 2912 | "3 244 8e296a067a37563370ded05f5a3bf3ec 1 0.0 69209 \n", 2913 | "4 473 182be0c5cdcd5072bb1864cdee4d3d6e 0 0.0 62873 \n", 2914 | "\n", 2915 | " os_version_latest os_version_old J_device_type_count \\\n", 2916 | "0 0 1 1 \n", 2917 | "1 1 0 12 \n", 2918 | "2 0 0 2 \n", 2919 | "3 1 0 18 \n", 2920 | "4 1 0 46 \n", 2921 | "\n", 2922 | " J_device_type_nunique J_session_id_count ... J_cumcount_1_max \\\n", 2923 | "0 1 1 ... 1 \n", 2924 | "1 1 12 ... 12 \n", 2925 | "2 1 2 ... 2 \n", 2926 | "3 1 18 ... 18 \n", 2927 | "4 1 46 ... 46 \n", 2928 | "\n", 2929 | " J_cumcount_1_sum J_cumcount_2_min J_cumcount_2_mean J_cumcount_2_max \\\n", 2930 | "0 1 1 1.000000 1 \n", 2931 | "1 78 1 1.916667 5 \n", 2932 | "2 3 1 1.000000 1 \n", 2933 | "3 171 1 1.388889 3 \n", 2934 | "4 1081 1 1.456522 5 \n", 2935 | "\n", 2936 | " J_cumcount_2_sum J_cumcount_3_min J_cumcount_3_mean J_cumcount_3_max \\\n", 2937 | "0 1 1 1.000000 1 \n", 2938 | "1 23 1 1.166667 2 \n", 2939 | "2 2 1 1.000000 1 \n", 2940 | "3 25 1 1.277778 3 \n", 2941 | "4 67 1 1.130435 2 \n", 2942 | "\n", 2943 | " J_cumcount_3_sum \n", 2944 | "0 1 \n", 2945 | "1 14 \n", 2946 | "2 2 \n", 2947 | "3 23 \n", 2948 | "4 52 \n", 2949 | "\n", 2950 | "[5 rows x 223 columns]" 2951 | ] 2952 | }, 2953 | "execution_count": 37, 2954 | "metadata": {}, 2955 | "output_type": "execute_result" 2956 | } 2957 | ], 2958 | "source": [ 2959 | "# del df\n", 2960 | "gc.collect()\n", 2961 | "\n", 2962 | "df_train.head()" 2963 | ] 2964 | }, 2965 | { 2966 | "cell_type": "code", 2967 | "execution_count": 40, 2968 | "metadata": {}, 2969 | "outputs": [], 2970 | "source": [ 2971 | 
"df_train=df_train.merge(df_train.groupby(['user_id']).count()[['impression_id']].reset_index().rename(columns={'impression_id':'user_count'}),on='user_id',how='left')\n", 2972 | "df_test=df_test.merge(df_test.groupby(['user_id']).count()[['impression_id']].reset_index().rename(columns={'impression_id':'user_count'}),on='user_id',how='left')" 2973 | ] 2974 | }, 2975 | { 2976 | "cell_type": "code", 2977 | "execution_count": 41, 2978 | "metadata": {}, 2979 | "outputs": [], 2980 | "source": [ 2981 | "from catboost import CatBoostClassifier,Pool, cv\n", 2982 | "from lightgbm import LGBMClassifier\n", 2983 | "from sklearn.model_selection import StratifiedKFold,train_test_split\n", 2984 | "from sklearn.linear_model import LogisticRegression\n", 2985 | "from sklearn.ensemble import RandomForestClassifier\n", 2986 | "from sklearn.metrics import accuracy_score,confusion_matrix,roc_auc_score" 2987 | ] 2988 | }, 2989 | { 2990 | "cell_type": "code", 2991 | "execution_count": 42, 2992 | "metadata": {}, 2993 | "outputs": [ 2994 | { 2995 | "name": "stdout", 2996 | "output_type": "stream", 2997 | "text": [ 2998 | "(237609, 222) (90675, 222)\n" 2999 | ] 3000 | } 3001 | ], 3002 | "source": [ 3003 | "X,y=df_train.drop(['impression_id','is_click'],axis=1),df_train['is_click']\n", 3004 | "Xtest=df_test.drop(['impression_id','is_click'],axis=1)\n", 3005 | "\n", 3006 | "print(X.shape,Xtest.shape)\n", 3007 | "X_train,X_val,y_train,y_val = train_test_split(X,y,test_size=0.25,random_state = 1994,stratify=y)" 3008 | ] 3009 | }, 3010 | { 3011 | "cell_type": "code", 3012 | "execution_count": 43, 3013 | "metadata": {}, 3014 | "outputs": [ 3015 | { 3016 | "data": { 3017 | "text/plain": [ 3018 | "0 0.954286\n", 3019 | "1 0.045714\n", 3020 | "Name: is_click, dtype: float64" 3021 | ] 3022 | }, 3023 | "execution_count": 43, 3024 | "metadata": {}, 3025 | "output_type": "execute_result" 3026 | } 3027 | ], 3028 | "source": [ 3029 | "train['is_click'].value_counts()/train.shape[0]" 3030 | ] 3031 | }, 
3032 | { 3033 | "cell_type": "code", 3034 | "execution_count": 46, 3035 | "metadata": {}, 3036 | "outputs": [ 3037 | { 3038 | "data": { 3039 | "text/plain": [ 3040 | "app_code 0\n", 3041 | "is_4G 0\n", 3042 | "user_id 0\n", 3043 | "os_version_latest 0\n", 3044 | "os_version_old 0\n", 3045 | " ..\n", 3046 | "J_cumcount_3_min 0\n", 3047 | "J_cumcount_3_mean 0\n", 3048 | "J_cumcount_3_max 0\n", 3049 | "J_cumcount_3_sum 0\n", 3050 | "user_count 0\n", 3051 | "Length: 222, dtype: int64" 3052 | ] 3053 | }, 3054 | "execution_count": 46, 3055 | "metadata": {}, 3056 | "output_type": "execute_result" 3057 | } 3058 | ], 3059 | "source": [ 3060 | "# Xtest.head()\n", 3061 | "X.isnull().sum()" 3062 | ] 3063 | }, 3064 | { 3065 | "cell_type": "code", 3066 | "execution_count": 49, 3067 | "metadata": {}, 3068 | "outputs": [ 3069 | { 3070 | "name": "stdout", 3071 | "output_type": "stream", 3072 | "text": [ 3073 | "Training until validation scores don't improve for 200 rounds.\n", 3074 | "[200]\tvalid_0's binary_logloss: 0.168698\tvalid_0's auc: 0.746438\n", 3075 | "[400]\tvalid_0's binary_logloss: 0.167574\tvalid_0's auc: 0.749888\n", 3076 | "[600]\tvalid_0's binary_logloss: 0.167415\tvalid_0's auc: 0.750645\n", 3077 | "Early stopping, best iteration is:\n", 3078 | "[525]\tvalid_0's binary_logloss: 0.167256\tvalid_0's auc: 0.751311\n", 3079 | "err: 0.7513112264925058\n", 3080 | "Training until validation scores don't improve for 200 rounds.\n", 3081 | "[200]\tvalid_0's binary_logloss: 0.170478\tvalid_0's auc: 0.735446\n", 3082 | "[400]\tvalid_0's binary_logloss: 0.169739\tvalid_0's auc: 0.738919\n", 3083 | "[600]\tvalid_0's binary_logloss: 0.169722\tvalid_0's auc: 0.740304\n", 3084 | "Early stopping, best iteration is:\n", 3085 | "[528]\tvalid_0's binary_logloss: 0.169609\tvalid_0's auc: 0.740465\n", 3086 | "err: 0.7404650733485545\n", 3087 | "Training until validation scores don't improve for 200 rounds.\n", 3088 | "[200]\tvalid_0's binary_logloss: 0.168564\tvalid_0's auc: 
0.745053\n", 3089 | "[400]\tvalid_0's binary_logloss: 0.167726\tvalid_0's auc: 0.747611\n", 3090 | "[600]\tvalid_0's binary_logloss: 0.167639\tvalid_0's auc: 0.749005\n", 3091 | "Early stopping, best iteration is:\n", 3092 | "[502]\tvalid_0's binary_logloss: 0.167526\tvalid_0's auc: 0.749219\n", 3093 | "err: 0.7492190472709699\n", 3094 | "Training until validation scores don't improve for 200 rounds.\n", 3095 | "[200]\tvalid_0's binary_logloss: 0.168293\tvalid_0's auc: 0.746348\n", 3096 | "[400]\tvalid_0's binary_logloss: 0.167559\tvalid_0's auc: 0.746954\n", 3097 | "[600]\tvalid_0's binary_logloss: 0.167745\tvalid_0's auc: 0.746265\n", 3098 | "Early stopping, best iteration is:\n", 3099 | "[508]\tvalid_0's binary_logloss: 0.167476\tvalid_0's auc: 0.747519\n", 3100 | "err: 0.7475194365087583\n", 3101 | "Training until validation scores don't improve for 200 rounds.\n", 3102 | "[200]\tvalid_0's binary_logloss: 0.168911\tvalid_0's auc: 0.747868\n", 3103 | "[400]\tvalid_0's binary_logloss: 0.167786\tvalid_0's auc: 0.75152\n", 3104 | "[600]\tvalid_0's binary_logloss: 0.167698\tvalid_0's auc: 0.751676\n", 3105 | "Early stopping, best iteration is:\n", 3106 | "[484]\tvalid_0's binary_logloss: 0.167638\tvalid_0's auc: 0.751853\n", 3107 | "err: 0.7518532754248215\n", 3108 | "Training until validation scores don't improve for 200 rounds.\n", 3109 | "[200]\tvalid_0's binary_logloss: 0.16677\tvalid_0's auc: 0.755928\n", 3110 | "[400]\tvalid_0's binary_logloss: 0.165211\tvalid_0's auc: 0.76145\n", 3111 | "[600]\tvalid_0's binary_logloss: 0.164819\tvalid_0's auc: 0.763055\n", 3112 | "[800]\tvalid_0's binary_logloss: 0.164882\tvalid_0's auc: 0.763161\n", 3113 | "Early stopping, best iteration is:\n", 3114 | "[613]\tvalid_0's binary_logloss: 0.164762\tvalid_0's auc: 0.763393\n", 3115 | "err: 0.7633934144296153\n", 3116 | "Training until validation scores don't improve for 200 rounds.\n", 3117 | "[200]\tvalid_0's binary_logloss: 0.169565\tvalid_0's auc: 0.738628\n", 3118 | 
"[400]\tvalid_0's binary_logloss: 0.168572\tvalid_0's auc: 0.743103\n", 3119 | "[600]\tvalid_0's binary_logloss: 0.16851\tvalid_0's auc: 0.74435\n", 3120 | "Early stopping, best iteration is:\n", 3121 | "[484]\tvalid_0's binary_logloss: 0.16848\tvalid_0's auc: 0.743688\n", 3122 | "err: 0.7436882158614906\n", 3123 | "Training until validation scores don't improve for 200 rounds.\n", 3124 | "[200]\tvalid_0's binary_logloss: 0.168709\tvalid_0's auc: 0.748216\n", 3125 | "[400]\tvalid_0's binary_logloss: 0.167448\tvalid_0's auc: 0.752274\n", 3126 | "[600]\tvalid_0's binary_logloss: 0.167042\tvalid_0's auc: 0.754599\n", 3127 | "Early stopping, best iteration is:\n", 3128 | "[543]\tvalid_0's binary_logloss: 0.167006\tvalid_0's auc: 0.754556\n", 3129 | "err: 0.7545564150434918\n", 3130 | "Training until validation scores don't improve for 200 rounds.\n", 3131 | "[200]\tvalid_0's binary_logloss: 0.169281\tvalid_0's auc: 0.740119\n", 3132 | "[400]\tvalid_0's binary_logloss: 0.168729\tvalid_0's auc: 0.740322\n", 3133 | "Early stopping, best iteration is:\n", 3134 | "[293]\tvalid_0's binary_logloss: 0.168745\tvalid_0's auc: 0.741119\n", 3135 | "err: 0.7411194842552564\n", 3136 | "Training until validation scores don't improve for 200 rounds.\n", 3137 | "[200]\tvalid_0's binary_logloss: 0.169126\tvalid_0's auc: 0.74353\n", 3138 | "[400]\tvalid_0's binary_logloss: 0.168223\tvalid_0's auc: 0.747395\n", 3139 | "[600]\tvalid_0's binary_logloss: 0.16821\tvalid_0's auc: 0.748223\n", 3140 | "Early stopping, best iteration is:\n", 3141 | "[516]\tvalid_0's binary_logloss: 0.167996\tvalid_0's auc: 0.749049\n", 3142 | "err: 0.7490488330798404\n" 3143 | ] 3144 | } 3145 | ], 3146 | "source": [ 3147 | "err=[]\n", 3148 | "y_pred_tot=[]\n", 3149 | "from sklearn.model_selection import KFold,StratifiedKFold\n", 3150 | "fold=StratifiedKFold(n_splits=10,shuffle=True,random_state=1994)\n", 3151 | "i=1\n", 3152 | "for train_index, test_index in fold.split(X,y):\n", 3153 | " X_train, X_test = 
X.iloc[train_index], X.iloc[test_index]\n", 3154 | " y_train, y_test = y[train_index], y[test_index]\n", 3155 | " m=LGBMClassifier(n_estimators=3000,random_state=1994,learning_rate=0.03,colsample_bytree=0.2,objective='binary',scale_pos_weight=1)\n", 3156 | " m.fit(X_train,y_train,eval_set=[(X_test, y_test)],eval_metric='auc', early_stopping_rounds=200,verbose=200)\n", 3157 | " preds=m.predict_proba(X_test)[:,-1]\n", 3158 | " print(\"err: \",roc_auc_score(y_test,preds))\n", 3159 | " err.append(roc_auc_score(y_test,preds))\n", 3160 | " p = m.predict_proba(Xtest)[:,-1]\n", 3161 | " i=i+1\n", 3162 | " y_pred_tot.append(p)" 3163 | ] 3164 | }, 3165 | { 3166 | "cell_type": "code", 3167 | "execution_count": 51, 3168 | "metadata": {}, 3169 | "outputs": [ 3170 | { 3171 | "data": { 3172 | "text/plain": [ 3173 | "0.7492174421715305" 3174 | ] 3175 | }, 3176 | "execution_count": 51, 3177 | "metadata": {}, 3178 | "output_type": "execute_result" 3179 | } 3180 | ], 3181 | "source": [ 3182 | "np.mean(err,0)" 3183 | ] 3184 | }, 3185 | { 3186 | "cell_type": "code", 3187 | "execution_count": 52, 3188 | "metadata": {}, 3189 | "outputs": [ 3190 | { 3191 | "data": { 3192 | "text/html": [ 3193 | "
\n", 3194 | "\n", 3207 | "\n", 3208 | " \n", 3209 | " \n", 3210 | " \n", 3211 | " \n", 3212 | " \n", 3213 | " \n", 3214 | " \n", 3215 | " \n", 3216 | " \n", 3217 | " \n", 3218 | " \n", 3219 | " \n", 3220 | " \n", 3221 | " \n", 3222 | " \n", 3223 | " \n", 3224 | " \n", 3225 | " \n", 3226 | " \n", 3227 | " \n", 3228 | " \n", 3229 | " \n", 3230 | " \n", 3231 | " \n", 3232 | " \n", 3233 | " \n", 3234 | " \n", 3235 | " \n", 3236 | " \n", 3237 | " \n", 3238 | " \n", 3239 | " \n", 3240 | " \n", 3241 | " \n", 3242 | "
impression_idis_click
0a9e7126a585a69a32bc7414e9d0c0ada0.013531
1caac14a5bf2ba283db7708bb348557600.010222
213f10ba306a19ce7bec2f3cae507b6980.062582
339c4b4dc0e9701b55a0a4f072008fb3f0.014803
4bf5a572cca75f5fc67f4b14e58b11d700.190164
\n", 3243 | "
" 3244 | ], 3245 | "text/plain": [ 3246 | " impression_id is_click\n", 3247 | "0 a9e7126a585a69a32bc7414e9d0c0ada 0.013531\n", 3248 | "1 caac14a5bf2ba283db7708bb34855760 0.010222\n", 3249 | "2 13f10ba306a19ce7bec2f3cae507b698 0.062582\n", 3250 | "3 39c4b4dc0e9701b55a0a4f072008fb3f 0.014803\n", 3251 | "4 bf5a572cca75f5fc67f4b14e58b11d70 0.190164" 3252 | ] 3253 | }, 3254 | "execution_count": 52, 3255 | "metadata": {}, 3256 | "output_type": "execute_result" 3257 | } 3258 | ], 3259 | "source": [ 3260 | "s['is_click']=np.mean(y_pred_tot,0)\n", 3261 | "s.head()" 3262 | ] 3263 | }, 3264 | { 3265 | "cell_type": "code", 3266 | "execution_count": 53, 3267 | "metadata": {}, 3268 | "outputs": [ 3269 | { 3270 | "name": "stderr", 3271 | "output_type": "stream", 3272 | "text": [ 3273 | "/opt/conda/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:516: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 3274 | " _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n", 3275 | "/opt/conda/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:517: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 3276 | " _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n", 3277 | "/opt/conda/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:518: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 3278 | " _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n", 3279 | "/opt/conda/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:519: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 3280 | " 
_np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n", 3281 | "/opt/conda/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:520: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 3282 | " _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n", 3283 | "/opt/conda/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:525: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 3284 | " np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n", 3285 | "/opt/conda/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:541: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 3286 | " _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n", 3287 | "/opt/conda/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:542: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 3288 | " _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n", 3289 | "/opt/conda/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:543: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 3290 | " _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n", 3291 | "/opt/conda/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:544: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 3292 | " _np_quint16 = np.dtype([(\"quint16\", np.uint16, 
1)])\n", 3293 | "/opt/conda/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:545: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 3294 | " _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n", 3295 | "/opt/conda/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:550: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n", 3296 | " np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n" 3297 | ] 3298 | }, 3299 | { 3300 | "data": { 3301 | "text/plain": [ 3302 | "(90675, 2)" 3303 | ] 3304 | }, 3305 | "execution_count": 53, 3306 | "metadata": {}, 3307 | "output_type": "execute_result" 3308 | } 3309 | ], 3310 | "source": [ 3311 | "s.to_csv('AV_WNS_forkkv2_lgb_folds.csv',index=False)\n", 3312 | "s.shape" 3313 | ] 3314 | }, 3315 | { 3316 | "cell_type": "code", 3317 | "execution_count": 54, 3318 | "metadata": {}, 3319 | "outputs": [ 3320 | { 3321 | "name": "stdout", 3322 | "output_type": "stream", 3323 | "text": [ 3324 | "(237609, 222) (90675, 222)\n" 3325 | ] 3326 | }, 3327 | { 3328 | "data": { 3329 | "text/plain": [ 3330 | "array([0, 1, 2])" 3331 | ] 3332 | }, 3333 | "execution_count": 54, 3334 | "metadata": {}, 3335 | "output_type": "execute_result" 3336 | } 3337 | ], 3338 | "source": [ 3339 | "print(X.shape,Xtest.shape)\n", 3340 | "X_train,X_val,y_train,y_val = train_test_split(X,y,test_size=0.25,random_state = 1994,stratify=y)\n", 3341 | "categorical_features_indices = np.where(X_train.dtypes =='category')[0]\n", 3342 | "categorical_features_indices" 3343 | ] 3344 | }, 3345 | { 3346 | "cell_type": "code", 3347 | "execution_count": 55, 3348 | "metadata": {}, 3349 | "outputs": [ 3350 | { 3351 | "name": "stdout", 3352 | "output_type": "stream", 3353 | "text": [ 3354 | "0:\ttest: 0.5418666\tbest: 
0.5418666 (0)\ttotal: 370ms\tremaining: 15m 23s\n", 3355 | "200:\ttest: 0.7364161\tbest: 0.7364161 (200)\ttotal: 1m\tremaining: 11m 29s\n", 3356 | "400:\ttest: 0.7390786\tbest: 0.7390786 (400)\ttotal: 2m\tremaining: 10m 30s\n", 3357 | "600:\ttest: 0.7398497\tbest: 0.7399070 (589)\ttotal: 2m 58s\tremaining: 9m 25s\n", 3358 | "800:\ttest: 0.7405001\tbest: 0.7405083 (799)\ttotal: 3m 57s\tremaining: 8m 24s\n", 3359 | "1000:\ttest: 0.7414331\tbest: 0.7414480 (995)\ttotal: 4m 57s\tremaining: 7m 25s\n", 3360 | "1200:\ttest: 0.7416168\tbest: 0.7416217 (1194)\ttotal: 5m 56s\tremaining: 6m 25s\n", 3361 | "1400:\ttest: 0.7415737\tbest: 0.7416946 (1277)\ttotal: 6m 55s\tremaining: 5m 25s\n", 3362 | "1600:\ttest: 0.7416015\tbest: 0.7417861 (1448)\ttotal: 7m 55s\tremaining: 4m 26s\n", 3363 | "Stopped by overfitting detector (200 iterations wait)\n", 3364 | "\n", 3365 | "bestTest = 0.7417861135\n", 3366 | "bestIteration = 1448\n", 3367 | "\n", 3368 | "Shrink model to first 1449 iterations.\n", 3369 | "0.7417861135403558\n" 3370 | ] 3371 | } 3372 | ], 3373 | "source": [ 3374 | "m=CatBoostClassifier(n_estimators=2500,random_state=1994,learning_rate=0.03,eval_metric='AUC')\n", 3375 | "m.fit(X_train,y_train,eval_set=[(X_val, y_val.values)], early_stopping_rounds=200,verbose=200,cat_features=categorical_features_indices)\n", 3376 | "p=m.predict_proba(X_val)[:,-1]\n", 3377 | "print(roc_auc_score(y_val,p))" 3378 | ] 3379 | }, 3380 | { 3381 | "cell_type": "code", 3382 | "execution_count": 58, 3383 | "metadata": {}, 3384 | "outputs": [ 3385 | { 3386 | "name": "stdout", 3387 | "output_type": "stream", 3388 | "text": [ 3389 | "0:\ttest: 0.5144795\tbest: 0.5144795 (0)\ttotal: 332ms\tremaining: 13m 49s\n", 3390 | "200:\ttest: 0.7436090\tbest: 0.7436090 (200)\ttotal: 1m 7s\tremaining: 12m 47s\n", 3391 | "400:\ttest: 0.7464037\tbest: 0.7464189 (397)\ttotal: 2m 14s\tremaining: 11m 42s\n", 3392 | "600:\ttest: 0.7471002\tbest: 0.7471002 (600)\ttotal: 3m 19s\tremaining: 10m 29s\n", 3393 | 
"800:\ttest: 0.7484396\tbest: 0.7484861 (797)\ttotal: 4m 24s\tremaining: 9m 21s\n", 3394 | "1000:\ttest: 0.7489092\tbest: 0.7490326 (885)\ttotal: 5m 30s\tremaining: 8m 14s\n", 3395 | "Stopped by overfitting detector (200 iterations wait)\n", 3396 | "\n", 3397 | "bestTest = 0.7490325983\n", 3398 | "bestIteration = 885\n", 3399 | "\n", 3400 | "Shrink model to first 886 iterations.\n", 3401 | "err_cb: 0.7490325983432549\n", 3402 | "0:\ttest: 0.5255802\tbest: 0.5255802 (0)\ttotal: 353ms\tremaining: 14m 41s\n", 3403 | "200:\ttest: 0.7349605\tbest: 0.7349727 (196)\ttotal: 1m 11s\tremaining: 13m 41s\n", 3404 | "400:\ttest: 0.7367486\tbest: 0.7367954 (398)\ttotal: 2m 22s\tremaining: 12m 27s\n", 3405 | "600:\ttest: 0.7380834\tbest: 0.7381583 (588)\ttotal: 3m 31s\tremaining: 11m 9s\n", 3406 | "800:\ttest: 0.7385614\tbest: 0.7385799 (798)\ttotal: 4m 41s\tremaining: 9m 56s\n", 3407 | "1000:\ttest: 0.7390868\tbest: 0.7390968 (998)\ttotal: 5m 51s\tremaining: 8m 46s\n", 3408 | "1200:\ttest: 0.7395140\tbest: 0.7395239 (1192)\ttotal: 7m 2s\tremaining: 7m 36s\n", 3409 | "1400:\ttest: 0.7399729\tbest: 0.7400785 (1373)\ttotal: 8m 11s\tremaining: 6m 25s\n", 3410 | "1600:\ttest: 0.7401355\tbest: 0.7401405 (1580)\ttotal: 9m 22s\tremaining: 5m 15s\n", 3411 | "1800:\ttest: 0.7406562\tbest: 0.7406607 (1799)\ttotal: 10m 31s\tremaining: 4m 5s\n", 3412 | "2000:\ttest: 0.7406412\tbest: 0.7407321 (1977)\ttotal: 11m 42s\tremaining: 2m 55s\n", 3413 | "2200:\ttest: 0.7407028\tbest: 0.7408802 (2079)\ttotal: 12m 53s\tremaining: 1m 45s\n", 3414 | "Stopped by overfitting detector (200 iterations wait)\n", 3415 | "\n", 3416 | "bestTest = 0.7408802435\n", 3417 | "bestIteration = 2079\n", 3418 | "\n", 3419 | "Shrink model to first 2080 iterations.\n", 3420 | "err_cb: 0.740880243511318\n", 3421 | "0:\ttest: 0.5699077\tbest: 0.5699077 (0)\ttotal: 406ms\tremaining: 16m 53s\n", 3422 | "200:\ttest: 0.7484476\tbest: 0.7485515 (195)\ttotal: 1m 9s\tremaining: 13m 14s\n", 3423 | "400:\ttest: 0.7499270\tbest: 
0.7499463 (398)\ttotal: 2m 18s\tremaining: 12m 3s\n", 3424 | "600:\ttest: 0.7511684\tbest: 0.7511878 (579)\ttotal: 3m 26s\tremaining: 10m 52s\n", 3425 | "800:\ttest: 0.7522715\tbest: 0.7523015 (797)\ttotal: 4m 34s\tremaining: 9m 41s\n", 3426 | "1000:\ttest: 0.7530256\tbest: 0.7531171 (955)\ttotal: 5m 42s\tremaining: 8m 32s\n", 3427 | "1200:\ttest: 0.7532913\tbest: 0.7534078 (1133)\ttotal: 6m 50s\tremaining: 7m 23s\n", 3428 | "1400:\ttest: 0.7535164\tbest: 0.7535470 (1321)\ttotal: 7m 59s\tremaining: 6m 15s\n", 3429 | "1600:\ttest: 0.7538200\tbest: 0.7538463 (1594)\ttotal: 9m 7s\tremaining: 5m 7s\n", 3430 | "1800:\ttest: 0.7540321\tbest: 0.7540744 (1795)\ttotal: 10m 15s\tremaining: 3m 58s\n", 3431 | "2000:\ttest: 0.7545968\tbest: 0.7546311 (1984)\ttotal: 11m 23s\tremaining: 2m 50s\n", 3432 | "2200:\ttest: 0.7549419\tbest: 0.7549912 (2189)\ttotal: 12m 30s\tremaining: 1m 41s\n", 3433 | "2400:\ttest: 0.7545905\tbest: 0.7550024 (2223)\ttotal: 13m 39s\tremaining: 33.8s\n", 3434 | "Stopped by overfitting detector (200 iterations wait)\n", 3435 | "\n", 3436 | "bestTest = 0.7550023655\n", 3437 | "bestIteration = 2223\n", 3438 | "\n", 3439 | "Shrink model to first 2224 iterations.\n", 3440 | "err_cb: 0.7550023654774305\n", 3441 | "0:\ttest: 0.5525951\tbest: 0.5525951 (0)\ttotal: 351ms\tremaining: 14m 38s\n", 3442 | "200:\ttest: 0.7439602\tbest: 0.7439894 (199)\ttotal: 1m 7s\tremaining: 12m 53s\n", 3443 | "400:\ttest: 0.7458887\tbest: 0.7458887 (400)\ttotal: 2m 13s\tremaining: 11m 38s\n", 3444 | "600:\ttest: 0.7467823\tbest: 0.7468110 (599)\ttotal: 3m 18s\tremaining: 10m 25s\n", 3445 | "800:\ttest: 0.7475678\tbest: 0.7476593 (786)\ttotal: 4m 23s\tremaining: 9m 19s\n", 3446 | "1000:\ttest: 0.7481178\tbest: 0.7481585 (998)\ttotal: 5m 29s\tremaining: 8m 13s\n", 3447 | "1200:\ttest: 0.7483658\tbest: 0.7483658 (1200)\ttotal: 6m 34s\tremaining: 7m 7s\n", 3448 | "1400:\ttest: 0.7480768\tbest: 0.7483658 (1200)\ttotal: 7m 40s\tremaining: 6m\n", 3449 | "Stopped by overfitting detector 
(200 iterations wait)\n", 3450 | "\n", 3451 | "bestTest = 0.7483658104\n", 3452 | "bestIteration = 1200\n", 3453 | "\n", 3454 | "Shrink model to first 1201 iterations.\n", 3455 | "err_cb: 0.7483658104247505\n", 3456 | "0:\ttest: 0.5295316\tbest: 0.5295316 (0)\ttotal: 341ms\tremaining: 14m 12s\n", 3457 | "200:\ttest: 0.7419186\tbest: 0.7419224 (198)\ttotal: 1m 7s\tremaining: 12m 51s\n", 3458 | "400:\ttest: 0.7447588\tbest: 0.7447588 (400)\ttotal: 2m 13s\tremaining: 11m 40s\n", 3459 | "600:\ttest: 0.7464010\tbest: 0.7464042 (599)\ttotal: 3m 19s\tremaining: 10m 29s\n", 3460 | "800:\ttest: 0.7470388\tbest: 0.7470816 (786)\ttotal: 4m 23s\tremaining: 9m 19s\n", 3461 | "1000:\ttest: 0.7479356\tbest: 0.7479492 (982)\ttotal: 5m 28s\tremaining: 8m 11s\n", 3462 | "1200:\ttest: 0.7482608\tbest: 0.7483751 (1151)\ttotal: 6m 33s\tremaining: 7m 5s\n", 3463 | "1400:\ttest: 0.7484390\tbest: 0.7486519 (1269)\ttotal: 7m 38s\tremaining: 5m 59s\n", 3464 | "1600:\ttest: 0.7486441\tbest: 0.7486764 (1572)\ttotal: 8m 43s\tremaining: 4m 54s\n", 3465 | "1800:\ttest: 0.7489762\tbest: 0.7490161 (1799)\ttotal: 9m 49s\tremaining: 3m 48s\n", 3466 | "2000:\ttest: 0.7493574\tbest: 0.7494478 (1927)\ttotal: 10m 55s\tremaining: 2m 43s\n", 3467 | "2200:\ttest: 0.7493936\tbest: 0.7494826 (2110)\ttotal: 11m 59s\tremaining: 1m 37s\n", 3468 | "2400:\ttest: 0.7498119\tbest: 0.7498972 (2358)\ttotal: 13m 5s\tremaining: 32.4s\n", 3469 | "2499:\ttest: 0.7498882\tbest: 0.7499641 (2478)\ttotal: 13m 37s\tremaining: 0us\n", 3470 | "\n", 3471 | "bestTest = 0.7499641016\n", 3472 | "bestIteration = 2478\n", 3473 | "\n", 3474 | "Shrink model to first 2479 iterations.\n", 3475 | "err_cb: 0.7499641015957328\n", 3476 | "0:\ttest: 0.5570438\tbest: 0.5570438 (0)\ttotal: 403ms\tremaining: 16m 46s\n", 3477 | "200:\ttest: 0.7486009\tbest: 0.7486074 (199)\ttotal: 1m 7s\tremaining: 12m 46s\n", 3478 | "400:\ttest: 0.7506760\tbest: 0.7506760 (400)\ttotal: 2m 15s\tremaining: 11m 48s\n", 3479 | "600:\ttest: 0.7515125\tbest: 0.7516139 
(568)\ttotal: 3m 20s\tremaining: 10m 33s\n", 3480 | "800:\ttest: 0.7532311\tbest: 0.7532333 (798)\ttotal: 4m 25s\tremaining: 9m 23s\n", 3481 | "1000:\ttest: 0.7537186\tbest: 0.7537824 (947)\ttotal: 5m 30s\tremaining: 8m 15s\n", 3482 | "1200:\ttest: 0.7541483\tbest: 0.7541535 (1199)\ttotal: 6m 37s\tremaining: 7m 9s\n", 3483 | "1400:\ttest: 0.7544343\tbest: 0.7546010 (1256)\ttotal: 7m 43s\tremaining: 6m 3s\n", 3484 | "Stopped by overfitting detector (200 iterations wait)\n", 3485 | "\n", 3486 | "bestTest = 0.7546009653\n", 3487 | "bestIteration = 1256\n", 3488 | "\n", 3489 | "Shrink model to first 1257 iterations.\n", 3490 | "err_cb: 0.7546009652772279\n", 3491 | "0:\ttest: 0.5590386\tbest: 0.5590386 (0)\ttotal: 399ms\tremaining: 16m 37s\n", 3492 | "200:\ttest: 0.7360658\tbest: 0.7360749 (199)\ttotal: 1m 6s\tremaining: 12m 45s\n", 3493 | "400:\ttest: 0.7385525\tbest: 0.7385525 (400)\ttotal: 2m 11s\tremaining: 11m 30s\n", 3494 | "600:\ttest: 0.7402099\tbest: 0.7402099 (600)\ttotal: 3m 16s\tremaining: 10m 19s\n", 3495 | "800:\ttest: 0.7408576\tbest: 0.7409951 (774)\ttotal: 4m 19s\tremaining: 9m 11s\n", 3496 | "1000:\ttest: 0.7412622\tbest: 0.7412838 (996)\ttotal: 5m 23s\tremaining: 8m 5s\n", 3497 | "1200:\ttest: 0.7419062\tbest: 0.7419062 (1200)\ttotal: 6m 28s\tremaining: 7m\n", 3498 | "1400:\ttest: 0.7420229\tbest: 0.7420326 (1392)\ttotal: 7m 33s\tremaining: 5m 55s\n", 3499 | "1600:\ttest: 0.7426657\tbest: 0.7427121 (1593)\ttotal: 8m 39s\tremaining: 4m 51s\n", 3500 | "1800:\ttest: 0.7429967\tbest: 0.7430180 (1799)\ttotal: 9m 43s\tremaining: 3m 46s\n", 3501 | "2000:\ttest: 0.7434697\tbest: 0.7434927 (1992)\ttotal: 10m 47s\tremaining: 2m 41s\n", 3502 | "2200:\ttest: 0.7436493\tbest: 0.7437120 (2135)\ttotal: 11m 52s\tremaining: 1m 36s\n", 3503 | "2400:\ttest: 0.7438011\tbest: 0.7438741 (2282)\ttotal: 12m 58s\tremaining: 32.1s\n", 3504 | "2499:\ttest: 0.7437049\tbest: 0.7438786 (2432)\ttotal: 13m 30s\tremaining: 0us\n", 3505 | "\n", 3506 | "bestTest = 0.7438786114\n", 
3507 | "bestIteration = 2432\n", 3508 | "\n", 3509 | "Shrink model to first 2433 iterations.\n", 3510 | "err_cb: 0.7438786114139869\n", 3511 | "0:\ttest: 0.5587621\tbest: 0.5587621 (0)\ttotal: 365ms\tremaining: 15m 12s\n", 3512 | "200:\ttest: 0.7479608\tbest: 0.7479945 (192)\ttotal: 1m 6s\tremaining: 12m 37s\n", 3513 | "400:\ttest: 0.7507254\tbest: 0.7507254 (400)\ttotal: 2m 12s\tremaining: 11m 31s\n", 3514 | "600:\ttest: 0.7519246\tbest: 0.7520294 (593)\ttotal: 3m 16s\tremaining: 10m 20s\n", 3515 | "800:\ttest: 0.7521744\tbest: 0.7522218 (739)\ttotal: 4m 21s\tremaining: 9m 14s\n", 3516 | "1000:\ttest: 0.7532134\tbest: 0.7533053 (975)\ttotal: 5m 27s\tremaining: 8m 10s\n", 3517 | "Stopped by overfitting detector (200 iterations wait)\n", 3518 | "\n", 3519 | "bestTest = 0.7533052964\n", 3520 | "bestIteration = 975\n", 3521 | "\n", 3522 | "Shrink model to first 976 iterations.\n", 3523 | "err_cb: 0.7533052964177498\n", 3524 | "0:\ttest: 0.5604210\tbest: 0.5604210 (0)\ttotal: 404ms\tremaining: 16m 49s\n", 3525 | "200:\ttest: 0.7361066\tbest: 0.7361066 (200)\ttotal: 1m 9s\tremaining: 13m 11s\n", 3526 | "400:\ttest: 0.7385754\tbest: 0.7385830 (399)\ttotal: 2m 18s\tremaining: 12m 6s\n", 3527 | "600:\ttest: 0.7403040\tbest: 0.7403500 (590)\ttotal: 3m 27s\tremaining: 10m 55s\n", 3528 | "800:\ttest: 0.7417078\tbest: 0.7417528 (792)\ttotal: 4m 36s\tremaining: 9m 46s\n", 3529 | "1000:\ttest: 0.7421358\tbest: 0.7421992 (975)\ttotal: 5m 45s\tremaining: 8m 37s\n", 3530 | "1200:\ttest: 0.7423636\tbest: 0.7425127 (1158)\ttotal: 6m 53s\tremaining: 7m 27s\n", 3531 | "1400:\ttest: 0.7427913\tbest: 0.7429467 (1315)\ttotal: 8m 4s\tremaining: 6m 20s\n", 3532 | "1600:\ttest: 0.7431347\tbest: 0.7431689 (1578)\ttotal: 9m 14s\tremaining: 5m 11s\n", 3533 | "1800:\ttest: 0.7431388\tbest: 0.7432008 (1626)\ttotal: 10m 23s\tremaining: 4m 2s\n", 3534 | "Stopped by overfitting detector (200 iterations wait)\n", 3535 | "\n", 3536 | "bestTest = 0.7432008104\n", 3537 | "bestIteration = 1626\n", 3538 | 
"\n", 3539 | "Shrink model to first 1627 iterations.\n", 3540 | "err_cb: 0.743200810397546\n", 3541 | "0:\ttest: 0.5633069\tbest: 0.5633069 (0)\ttotal: 369ms\tremaining: 15m 22s\n", 3542 | "200:\ttest: 0.7405152\tbest: 0.7405219 (199)\ttotal: 1m 8s\tremaining: 12m 58s\n", 3543 | "400:\ttest: 0.7428690\tbest: 0.7428690 (400)\ttotal: 2m 15s\tremaining: 11m 48s\n", 3544 | "600:\ttest: 0.7445805\tbest: 0.7446532 (583)\ttotal: 3m 19s\tremaining: 10m 31s\n", 3545 | "800:\ttest: 0.7454955\tbest: 0.7455142 (796)\ttotal: 4m 24s\tremaining: 9m 21s\n", 3546 | "1000:\ttest: 0.7462670\tbest: 0.7462670 (1000)\ttotal: 5m 29s\tremaining: 8m 13s\n", 3547 | "1200:\ttest: 0.7461326\tbest: 0.7462814 (1009)\ttotal: 6m 35s\tremaining: 7m 7s\n", 3548 | "Stopped by overfitting detector (200 iterations wait)\n", 3549 | "\n", 3550 | "bestTest = 0.7462813867\n", 3551 | "bestIteration = 1009\n", 3552 | "\n", 3553 | "Shrink model to first 1010 iterations.\n", 3554 | "err_cb: 0.7462813867011826\n" 3555 | ] 3556 | } 3557 | ], 3558 | "source": [ 3559 | "errCB=[]\n", 3560 | "y_pred_tot_cb=[]\n", 3561 | "from sklearn.model_selection import KFold,StratifiedKFold\n", 3562 | "fold=StratifiedKFold(n_splits=10,shuffle=True,random_state=1994)\n", 3563 | "i=1\n", 3564 | "for train_index, test_index in fold.split(X,y):\n", 3565 | " X_train, X_test = X.iloc[train_index], X.iloc[test_index]\n", 3566 | " y_train, y_test = y[train_index], y[test_index]\n", 3567 | " m=CatBoostClassifier(n_estimators=2500,random_state=1994,eval_metric='AUC',learning_rate=0.03)\n", 3568 | " m.fit(X_train,y_train,eval_set=[(X_test, y_test)], early_stopping_rounds=200,verbose=200,cat_features=categorical_features_indices)\n", 3569 | " preds=m.predict_proba(X_test)[:,-1]\n", 3570 | " print(\"err_cb: \",roc_auc_score(y_test,preds))\n", 3571 | " errCB.append(roc_auc_score(y_test,preds))\n", 3572 | " p = m.predict_proba(Xtest)[:,-1]\n", 3573 | " i=i+1\n", 3574 | " y_pred_tot_cb.append(p)" 3575 | ] 3576 | }, 3577 | { 3578 | "cell_type": 
"code", 3579 | "execution_count": 59, 3580 | "metadata": {}, 3581 | "outputs": [ 3582 | { 3583 | "data": { 3584 | "text/plain": [ 3585 | "0.7484512189560181" 3586 | ] 3587 | }, 3588 | "execution_count": 59, 3589 | "metadata": {}, 3590 | "output_type": "execute_result" 3591 | } 3592 | ], 3593 | "source": [ 3594 | "np.mean(errCB,0)" 3595 | ] 3596 | }, 3597 | { 3598 | "cell_type": "code", 3599 | "execution_count": 60, 3600 | "metadata": {}, 3601 | "outputs": [ 3602 | { 3603 | "data": { 3604 | "text/html": [ 3605 | "
\n", 3606 | "\n", 3619 | "\n", 3620 | " \n", 3621 | " \n", 3622 | " \n", 3623 | " \n", 3624 | " \n", 3625 | " \n", 3626 | " \n", 3627 | " \n", 3628 | " \n", 3629 | " \n", 3630 | " \n", 3631 | " \n", 3632 | " \n", 3633 | " \n", 3634 | " \n", 3635 | " \n", 3636 | " \n", 3637 | " \n", 3638 | " \n", 3639 | " \n", 3640 | " \n", 3641 | " \n", 3642 | " \n", 3643 | " \n", 3644 | " \n", 3645 | " \n", 3646 | " \n", 3647 | " \n", 3648 | " \n", 3649 | " \n", 3650 | " \n", 3651 | " \n", 3652 | " \n", 3653 | " \n", 3654 | "
impression_idis_click
0a9e7126a585a69a32bc7414e9d0c0ada0.016591
1caac14a5bf2ba283db7708bb348557600.031968
213f10ba306a19ce7bec2f3cae507b6980.077346
339c4b4dc0e9701b55a0a4f072008fb3f0.012104
4bf5a572cca75f5fc67f4b14e58b11d700.141255
\n", 3655 | "
" 3656 | ], 3657 | "text/plain": [ 3658 | " impression_id is_click\n", 3659 | "0 a9e7126a585a69a32bc7414e9d0c0ada 0.016591\n", 3660 | "1 caac14a5bf2ba283db7708bb34855760 0.031968\n", 3661 | "2 13f10ba306a19ce7bec2f3cae507b698 0.077346\n", 3662 | "3 39c4b4dc0e9701b55a0a4f072008fb3f 0.012104\n", 3663 | "4 bf5a572cca75f5fc67f4b14e58b11d70 0.141255" 3664 | ] 3665 | }, 3666 | "execution_count": 60, 3667 | "metadata": {}, 3668 | "output_type": "execute_result" 3669 | } 3670 | ], 3671 | "source": [ 3672 | "s['is_click']=np.mean(y_pred_tot_cb,0)\n", 3673 | "s.head()" 3674 | ] 3675 | }, 3676 | { 3677 | "cell_type": "code", 3678 | "execution_count": 61, 3679 | "metadata": {}, 3680 | "outputs": [ 3681 | { 3682 | "data": { 3683 | "text/plain": [ 3684 | "(90675, 2)" 3685 | ] 3686 | }, 3687 | "execution_count": 61, 3688 | "metadata": {}, 3689 | "output_type": "execute_result" 3690 | } 3691 | ], 3692 | "source": [ 3693 | "s.to_csv('AV_WNS_forkkv2_cb_folds.csv',index=False)\n", 3694 | "s.shape" 3695 | ] 3696 | }, 3697 | { 3698 | "cell_type": "code", 3699 | "execution_count": 64, 3700 | "metadata": {}, 3701 | "outputs": [ 3702 | { 3703 | "data": { 3704 | "text/html": [ 3705 | "
\n", 3706 | "\n", 3719 | "\n", 3720 | " \n", 3721 | " \n", 3722 | " \n", 3723 | " \n", 3724 | " \n", 3725 | " \n", 3726 | " \n", 3727 | " \n", 3728 | " \n", 3729 | " \n", 3730 | " \n", 3731 | " \n", 3732 | " \n", 3733 | " \n", 3734 | " \n", 3735 | " \n", 3736 | " \n", 3737 | " \n", 3738 | " \n", 3739 | " \n", 3740 | " \n", 3741 | " \n", 3742 | " \n", 3743 | " \n", 3744 | " \n", 3745 | " \n", 3746 | " \n", 3747 | " \n", 3748 | " \n", 3749 | " \n", 3750 | " \n", 3751 | " \n", 3752 | " \n", 3753 | " \n", 3754 | "
impression_idis_click
0a9e7126a585a69a32bc7414e9d0c0ada0.015520
1caac14a5bf2ba283db7708bb348557600.024357
213f10ba306a19ce7bec2f3cae507b6980.072179
339c4b4dc0e9701b55a0a4f072008fb3f0.013049
4bf5a572cca75f5fc67f4b14e58b11d700.158373
\n", 3755 | "
" 3756 | ], 3757 | "text/plain": [ 3758 | " impression_id is_click\n", 3759 | "0 a9e7126a585a69a32bc7414e9d0c0ada 0.015520\n", 3760 | "1 caac14a5bf2ba283db7708bb34855760 0.024357\n", 3761 | "2 13f10ba306a19ce7bec2f3cae507b698 0.072179\n", 3762 | "3 39c4b4dc0e9701b55a0a4f072008fb3f 0.013049\n", 3763 | "4 bf5a572cca75f5fc67f4b14e58b11d70 0.158373" 3764 | ] 3765 | }, 3766 | "execution_count": 64, 3767 | "metadata": {}, 3768 | "output_type": "execute_result" 3769 | } 3770 | ], 3771 | "source": [ 3772 | "s['is_click']=np.mean(y_pred_tot_cb,0)*0.65 + np.mean(y_pred_tot,0)*0.35\n", 3773 | "s.head()" 3774 | ] 3775 | }, 3776 | { 3777 | "cell_type": "code", 3778 | "execution_count": 65, 3779 | "metadata": {}, 3780 | "outputs": [ 3781 | { 3782 | "data": { 3783 | "text/plain": [ 3784 | "(90675, 2)" 3785 | ] 3786 | }, 3787 | "execution_count": 65, 3788 | "metadata": {}, 3789 | "output_type": "execute_result" 3790 | } 3791 | ], 3792 | "source": [ 3793 | "s.to_csv('AV_WNS_forkkv2_CBstack_folds.csv',index=False)\n", 3794 | "s.shape" 3795 | ] 3796 | } 3797 | ], 3798 | "metadata": { 3799 | "kernelspec": { 3800 | "display_name": "Python 3", 3801 | "language": "python", 3802 | "name": "python3" 3803 | }, 3804 | "language_info": { 3805 | "codemirror_mode": { 3806 | "name": "ipython", 3807 | "version": 3 3808 | }, 3809 | "file_extension": ".py", 3810 | "mimetype": "text/x-python", 3811 | "name": "python", 3812 | "nbconvert_exporter": "python", 3813 | "pygments_lexer": "ipython3", 3814 | "version": "3.7.1" 3815 | }, 3816 | "toc": { 3817 | "base_numbering": 1, 3818 | "nav_menu": {}, 3819 | "number_sections": true, 3820 | "sideBar": true, 3821 | "skip_h1_title": false, 3822 | "title_cell": "Table of Contents", 3823 | "title_sidebar": "Contents", 3824 | "toc_cell": false, 3825 | "toc_position": {}, 3826 | "toc_section_display": true, 3827 | "toc_window_display": false 3828 | } 3829 | }, 3830 | "nbformat": 4, 3831 | "nbformat_minor": 1 3832 | } 3833 | 
--------------------------------------------------------------------------------