├── .gitignore
├── LICENSE
├── Makefile
├── Pipfile
├── Pipfile.lock
├── README.md
├── models
    └── .gitkeep
├── notebooks
    ├── 01_cnn_vectors_for_sample.ipynb
    ├── 02_ train_products_distribution.ipynb
    ├── 03_vgg16_finetuning_models.ipynb
    └── 04_resnet50_finetuning_models.ipynb
├── src
    ├── __init__.py
    ├── data
    │   ├── big_sample.py
    │   ├── category_idx.py
    │   ├── product_info.py
    │   ├── top_categories_sample.py
    │   └── train_split.py
    ├── heng_cherkeng
    │   ├── excited_inception_v3.py
    │   ├── inception_v3.py
    │   ├── resnet101.py
    │   └── xception.py
    └── model
    │   ├── __init__.py
    │   ├── bcolz_iterator.py
    │   ├── bcolz_to_memmap.py
    │   ├── bson_iterator.py
    │   ├── combine_iterator.py
    │   ├── ensemble_fixed_weights.py
    │   ├── form_submission.py
    │   ├── form_submission_mul.py
    │   ├── form_submission_sum.py
    │   ├── heng_models.py
    │   ├── memmap_iterator.py
    │   ├── multi_memmap_iterator.py
    │   ├── predict_ensemble_nn.py
    │   ├── pseudo_label_prod_info.py
    │   ├── resnet50_vecs.py
    │   ├── sngl_preds_to_avg.py
    │   ├── train_ensemble_nn.py
    │   ├── tune_avg_resnet50_vecs.py
    │   ├── tune_avg_vgg16_vecs.py
    │   ├── tune_pl_avg_resnet50_vecs.py
    │   ├── tune_resnet50_memmap_vecs.py
    │   ├── tune_resnet50_vecs.py
    │   ├── tune_vgg16_memmap_vecs.py
    │   ├── tune_vgg16_vecs.py
    │   └── vgg16_vecs.py
├── test_environment.py
└── tox.ini


/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | /data
3 | /models
4 | .env
5 | .ipynb_checkpoints
6 | nohup.out
7 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | 
 2 | The MIT License (MIT)
 3 | Copyright (c) 2017, NighTurs
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 6 | 
 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 8 | 
 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
10 | 
11 | 


--------------------------------------------------------------------------------
/Pipfile:
--------------------------------------------------------------------------------
 1 | [[source]]
 2 | 
 3 | url = "https://pypi.python.org/simple"
 4 | verify_ssl = true
 5 | name = "pypi"
 6 | 
 7 | 
 8 | [packages]
 9 | 
10 | keras = "*"
11 | pandas = "*"
12 | numpy = "*"
13 | jupyter = "*"
14 | theano = "*"
15 | scikit-learn = "*"
16 | jupyter-contrib-nbextensions = "*"
17 | pillow = "*"
18 | pymongo = "*"
19 | matplotlib = "*"
20 | seaborn = "*"
21 | "h5py" = "*"
22 | bcolz = "*"
23 | cookiecutter = "*"
24 | tqdm = "*"
25 | opencv-python = "*"
26 | "bb297d5" = {file = "http://download.pytorch.org/whl/cu80/torch-0.2.0.post3-cp36-cp36m-manylinux1_x86_64.whl"}
27 | 
28 | 
29 | [dev-packages]
30 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | kaggle-cdiscount-image-classification
2 | ==============================
3 | 
4 | Solution to the Kaggle competition "Cdiscount’s Image Classification Challenge".


--------------------------------------------------------------------------------
/models/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NighTurs/kaggle-cdiscount-image-classification/3646ee4dc7a0e35dfe6fb4cdaadcf2fb7b30d3a5/models/.gitkeep


--------------------------------------------------------------------------------
/notebooks/02_ train_products_distribution.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import pandas as pd\n",
 10 |     "import numpy as np\n",
 11 |     "import scipy.stats as ss"
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "code",
 16 |    "execution_count": 2,
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "prod_info = pd.read_csv('../data/interim/train_product_info.csv')"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "code",
 25 |    "execution_count": 3,
 26 |    "metadata": {},
 27 |    "outputs": [
 28 |     {
 29 |      "data": {
 30 |       "text/html": [
 31 |        "<div>\n",
 32 |        "<style>\n",
 33 |        "    .dataframe thead tr:only-child th {\n",
 34 |        "        text-align: right;\n",
 35 |        "    }\n",
 36 |        "\n",
 37 |        "    .dataframe thead th {\n",
 38 |        "        text-align: left;\n",
 39 |        "    }\n",
 40 |        "\n",
 41 |        "    .dataframe tbody tr th {\n",
 42 |        "        vertical-align: top;\n",
 43 |        "    }\n",
 44 |        "</style>\n",
 45 |        "<table border=\"1\" class=\"dataframe\">\n",
 46 |        "  <thead>\n",
 47 |        "    <tr style=\"text-align: right;\">\n",
 48 |        "      <th></th>\n",
 49 |        "      <th>product_id</th>\n",
 50 |        "      <th>num_imgs</th>\n",
 51 |        "      <th>offset</th>\n",
 52 |        "      <th>length</th>\n",
 53 |        "      <th>category_id</th>\n",
 54 |        "    </tr>\n",
 55 |        "  </thead>\n",
 56 |        "  <tbody>\n",
 57 |        "    <tr>\n",
 58 |        "      <th>0</th>\n",
 59 |        "      <td>0</td>\n",
 60 |        "      <td>1</td>\n",
 61 |        "      <td>0</td>\n",
 62 |        "      <td>6979</td>\n",
 63 |        "      <td>1000010653</td>\n",
 64 |        "    </tr>\n",
 65 |        "    <tr>\n",
 66 |        "      <th>1</th>\n",
 67 |        "      <td>1</td>\n",
 68 |        "      <td>1</td>\n",
 69 |        "      <td>6979</td>\n",
 70 |        "      <td>7318</td>\n",
 71 |        "      <td>1000010653</td>\n",
 72 |        "    </tr>\n",
 73 |        "    <tr>\n",
 74 |        "      <th>2</th>\n",
 75 |        "      <td>2</td>\n",
 76 |        "      <td>1</td>\n",
 77 |        "      <td>14297</td>\n",
 78 |        "      <td>5455</td>\n",
 79 |        "      <td>1000004079</td>\n",
 80 |        "    </tr>\n",
 81 |        "    <tr>\n",
 82 |        "      <th>3</th>\n",
 83 |        "      <td>3</td>\n",
 84 |        "      <td>1</td>\n",
 85 |        "      <td>19752</td>\n",
 86 |        "      <td>4580</td>\n",
 87 |        "      <td>1000004141</td>\n",
 88 |        "    </tr>\n",
 89 |        "    <tr>\n",
 90 |        "      <th>4</th>\n",
 91 |        "      <td>4</td>\n",
 92 |        "      <td>1</td>\n",
 93 |        "      <td>24332</td>\n",
 94 |        "      <td>6346</td>\n",
 95 |        "      <td>1000015539</td>\n",
 96 |        "    </tr>\n",
 97 |        "  </tbody>\n",
 98 |        "</table>\n",
 99 |        "</div>"
100 |       ],
101 |       "text/plain": [
102 |        "   product_id  num_imgs  offset  length  category_id\n",
103 |        "0           0         1       0    6979   1000010653\n",
104 |        "1           1         1    6979    7318   1000010653\n",
105 |        "2           2         1   14297    5455   1000004079\n",
106 |        "3           3         1   19752    4580   1000004141\n",
107 |        "4           4         1   24332    6346   1000015539"
108 |       ]
109 |      },
110 |      "execution_count": 3,
111 |      "metadata": {},
112 |      "output_type": "execute_result"
113 |     }
114 |    ],
115 |    "source": [
116 |     "prod_info.head()"
117 |    ]
118 |   },
119 |   {
120 |    "cell_type": "code",
121 |    "execution_count": 4,
122 |    "metadata": {},
123 |    "outputs": [],
124 |    "source": [
125 |     "categories = prod_info.category_id.sample(10, random_state=123).as_matrix()"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "code",
130 |    "execution_count": 5,
131 |    "metadata": {},
132 |    "outputs": [],
133 |    "source": [
134 |     "max_index = np.max(prod_info.index.values)"
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "code",
139 |    "execution_count": 6,
140 |    "metadata": {},
141 |    "outputs": [],
142 |    "source": [
143 |     "prod_info = prod_info[prod_info.category_id.isin(categories)]"
144 |    ]
145 |   },
146 |   {
147 |    "cell_type": "code",
148 |    "execution_count": 7,
149 |    "metadata": {},
150 |    "outputs": [
151 |     {
152 |      "data": {
153 |       "text/plain": [
154 |        "(212367, 5)"
155 |       ]
156 |      },
157 |      "execution_count": 7,
158 |      "metadata": {},
159 |      "output_type": "execute_result"
160 |     }
161 |    ],
162 |    "source": [
163 |     "prod_info.shape"
164 |    ]
165 |   },
166 |   {
167 |    "cell_type": "code",
168 |    "execution_count": 8,
169 |    "metadata": {},
170 |    "outputs": [],
171 |    "source": [
172 |     "def dist_check(x):\n",
173 |     "    return ss.kstest(x.index.values, 'uniform', args=(0,max_index))"
174 |    ]
175 |   },
176 |   {
177 |    "cell_type": "code",
178 |    "execution_count": 9,
179 |    "metadata": {},
180 |    "outputs": [
181 |     {
182 |      "data": {
183 |       "text/plain": [
184 |        "category_id\n",
185 |        "1000003796    (0.0585434648794, 6.62093945672e-133)\n",
186 |        "1000005890     (0.0282060871844, 0.000225859584689)\n",
187 |        "1000009368        (0.0556134773972, 0.690667941194)\n",
188 |        "1000010722                      (0.2076794447, 0.0)\n",
189 |        "1000011345     (0.0518859397384, 4.02373456585e-05)\n",
190 |        "1000012993                    (0.212172712988, 0.0)\n",
191 |        "1000013645         (0.122406384056, 0.116693063939)\n",
192 |        "1000014217      (0.0255374263699, 2.0049038596e-11)\n",
193 |        "1000018294      (0.0219118239728, 1.6525548444e-24)\n",
194 |        "Name: offset, dtype: object"
195 |       ]
196 |      },
197 |      "execution_count": 9,
198 |      "metadata": {},
199 |      "output_type": "execute_result"
200 |     }
201 |    ],
202 |    "source": [
203 |     "prod_info.groupby('category_id').offset.agg(dist_check)"
204 |    ]
205 |   },
206 |   {
207 |    "cell_type": "code",
208 |    "execution_count": 85,
209 |    "metadata": {},
210 |    "outputs": [],
211 |    "source": [
212 |     "prod_info_shuffled = prod_info.reindex(np.random.permutation(prod_info.index))"
213 |    ]
214 |   },
215 |   {
216 |    "cell_type": "code",
217 |    "execution_count": 88,
218 |    "metadata": {},
219 |    "outputs": [
220 |     {
221 |      "data": {
222 |       "text/html": [
223 |        "<div>\n",
224 |        "<style>\n",
225 |        "    .dataframe thead tr:only-child th {\n",
226 |        "        text-align: right;\n",
227 |        "    }\n",
228 |        "\n",
229 |        "    .dataframe thead th {\n",
230 |        "        text-align: left;\n",
231 |        "    }\n",
232 |        "\n",
233 |        "    .dataframe tbody tr th {\n",
234 |        "        vertical-align: top;\n",
235 |        "    }\n",
236 |        "</style>\n",
237 |        "<table border=\"1\" class=\"dataframe\">\n",
238 |        "  <thead>\n",
239 |        "    <tr style=\"text-align: right;\">\n",
240 |        "      <th></th>\n",
241 |        "      <th>product_id</th>\n",
242 |        "      <th>num_imgs</th>\n",
243 |        "      <th>offset</th>\n",
244 |        "      <th>length</th>\n",
245 |        "      <th>category_id</th>\n",
246 |        "    </tr>\n",
247 |        "  </thead>\n",
248 |        "  <tbody>\n",
249 |        "    <tr>\n",
250 |        "      <th>0</th>\n",
251 |        "      <td>18534597</td>\n",
252 |        "      <td>2</td>\n",
253 |        "      <td>50143773053</td>\n",
254 |        "      <td>8313</td>\n",
255 |        "      <td>1000014217</td>\n",
256 |        "    </tr>\n",
257 |        "    <tr>\n",
258 |        "      <th>1</th>\n",
259 |        "      <td>2234391</td>\n",
260 |        "      <td>1</td>\n",
261 |        "      <td>7226758382</td>\n",
262 |        "      <td>5642</td>\n",
263 |        "      <td>1000010722</td>\n",
264 |        "    </tr>\n",
265 |        "    <tr>\n",
266 |        "      <th>2</th>\n",
267 |        "      <td>17857665</td>\n",
268 |        "      <td>1</td>\n",
269 |        "      <td>48480651683</td>\n",
270 |        "      <td>5456</td>\n",
271 |        "      <td>1000018294</td>\n",
272 |        "    </tr>\n",
273 |        "    <tr>\n",
274 |        "      <th>3</th>\n",
275 |        "      <td>19248064</td>\n",
276 |        "      <td>1</td>\n",
277 |        "      <td>51904004751</td>\n",
278 |        "      <td>7465</td>\n",
279 |        "      <td>1000014217</td>\n",
280 |        "    </tr>\n",
281 |        "    <tr>\n",
282 |        "      <th>4</th>\n",
283 |        "      <td>5783542</td>\n",
284 |        "      <td>1</td>\n",
285 |        "      <td>15554491425</td>\n",
286 |        "      <td>6956</td>\n",
287 |        "      <td>1000018294</td>\n",
288 |        "    </tr>\n",
289 |        "  </tbody>\n",
290 |        "</table>\n",
291 |        "</div>"
292 |       ],
293 |       "text/plain": [
294 |        "   product_id  num_imgs       offset  length  category_id\n",
295 |        "0    18534597         2  50143773053    8313   1000014217\n",
296 |        "1     2234391         1   7226758382    5642   1000010722\n",
297 |        "2    17857665         1  48480651683    5456   1000018294\n",
298 |        "3    19248064         1  51904004751    7465   1000014217\n",
299 |        "4     5783542         1  15554491425    6956   1000018294"
300 |       ]
301 |      },
302 |      "execution_count": 88,
303 |      "metadata": {},
304 |      "output_type": "execute_result"
305 |     }
306 |    ],
307 |    "source": [
308 |     "prod_info_shuffled.reset_index(inplace=True, drop=True)\n",
309 |     "prod_info_shuffled.head()"
310 |    ]
311 |   },
312 |   {
313 |    "cell_type": "code",
314 |    "execution_count": 91,
315 |    "metadata": {},
316 |    "outputs": [],
317 |    "source": [
318 |     "max_index = np.max(prod_info_shuffled.index.values)"
319 |    ]
320 |   },
321 |   {
322 |    "cell_type": "code",
323 |    "execution_count": 92,
324 |    "metadata": {},
325 |    "outputs": [],
326 |    "source": [
327 |     "def dist_check(x):\n",
328 |     "    return ss.kstest(x.index.values, 'uniform', args=(0,max_index))"
329 |    ]
330 |   },
331 |   {
332 |    "cell_type": "code",
333 |    "execution_count": 93,
334 |    "metadata": {},
335 |    "outputs": [
336 |     {
337 |      "data": {
338 |       "text/plain": [
339 |        "category_id\n",
340 |        "1000003796     (0.0029641280778, 0.829121438545)\n",
341 |        "1000005890     (0.0107517857166, 0.523791313126)\n",
342 |        "1000009368     (0.0644712714092, 0.482267109646)\n",
343 |        "1000010722     (0.00406073851587, 0.53500273456)\n",
344 |        "1000011345     (0.0201039563847, 0.393696555104)\n",
345 |        "1000012993    (0.00389849966882, 0.524969145571)\n",
346 |        "1000013645      (0.070996176003, 0.755455088588)\n",
347 |        "1000014217    (0.0101867510116, 0.0355555653105)\n",
348 |        "1000018294     (0.00226298289825, 0.92889410781)\n",
349 |        "Name: offset, dtype: object"
350 |       ]
351 |      },
352 |      "execution_count": 93,
353 |      "metadata": {},
354 |      "output_type": "execute_result"
355 |     }
356 |    ],
357 |    "source": [
358 |     "prod_info_shuffled.groupby('category_id').offset.agg(dist_check)"
359 |    ]
360 |   },
361 |   {
362 |    "cell_type": "code",
363 |    "execution_count": 61,
364 |    "metadata": {},
365 |    "outputs": [
366 |     {
367 |      "data": {
368 |       "text/plain": [
369 |        "KstestResult(statistic=0.50000544788818035, pvalue=0.0)"
370 |       ]
371 |      },
372 |      "execution_count": 61,
373 |      "metadata": {},
374 |      "output_type": "execute_result"
375 |     }
376 |    ],
377 |    "source": [
378 |     "ss.kstest(np.random.uniform(0, 0.5, 10000), 'uniform', args=(0,1))"
379 |    ]
380 |   },
381 |   {
382 |    "cell_type": "code",
383 |    "execution_count": 28,
384 |    "metadata": {},
385 |    "outputs": [
386 |     {
387 |      "data": {
388 |       "text/plain": [
389 |        "77.10547314768715"
390 |       ]
391 |      },
392 |      "execution_count": 28,
393 |      "metadata": {},
394 |      "output_type": "execute_result"
395 |     }
396 |    ],
397 |    "source": [
398 |     "np.random.uniform(100)"
399 |    ]
400 |   },
401 |   {
402 |    "cell_type": "code",
403 |    "execution_count": 60,
404 |    "metadata": {},
405 |    "outputs": [
406 |     {
407 |      "data": {
408 |       "text/plain": [
409 |        "KstestResult(statistic=0.92043693500882817, pvalue=0.0)"
410 |       ]
411 |      },
412 |      "execution_count": 60,
413 |      "metadata": {},
414 |      "output_type": "execute_result"
415 |     }
416 |    ],
417 |    "source": [
418 |     "ss.kstest(np.random.normal(0.5, 0.1, 10000), 'uniform', args=(0,10))"
419 |    ]
420 |   }
421 |  ],
422 |  "metadata": {
423 |   "kernelspec": {
424 |    "display_name": "Python 3",
425 |    "language": "python",
426 |    "name": "python3"
427 |   },
428 |   "language_info": {
429 |    "codemirror_mode": {
430 |     "name": "ipython",
431 |     "version": 3
432 |    },
433 |    "file_extension": ".py",
434 |    "mimetype": "text/x-python",
435 |    "name": "python",
436 |    "nbconvert_exporter": "python",
437 |    "pygments_lexer": "ipython3",
438 |    "version": "3.6.2"
439 |   }
440 |  },
441 |  "nbformat": 4,
442 |  "nbformat_minor": 2
443 | }
444 | 


--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NighTurs/kaggle-cdiscount-image-classification/3646ee4dc7a0e35dfe6fb4cdaadcf2fb7b30d3a5/src/__init__.py


--------------------------------------------------------------------------------
/src/data/big_sample.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | import argparse
 4 | 
 5 | 
 6 | def create_big_sample(prod_info_csv):
 7 |     prod_info = pd.read_csv(prod_info_csv)
 8 |     category_stats = prod_info.groupby(by='category_id').size()
 9 |     category_stats.sort_values(ascending=False, inplace=True)
10 |     categories = category_stats[:2000].index.values
11 |     categories_set = set(categories)
12 | 
13 |     np.random.seed(123)
14 |     chunks = []
15 |     for category, prods in prod_info.groupby(by='category_id'):
16 |         if category not in categories_set:
17 |             continue
18 |         chunks.append(prods.sample(100 if prods.shape[0] >= 100 else prods.shape[0]))
19 |     sample = pd.concat(chunks)
20 |     sample = sample.reset_index(drop=True)
21 |     return sample
22 | 
23 | 
def save_big_sample(big_sample, output_file):
    """Write the sample DataFrame to ``output_file`` as CSV without the index column."""
    big_sample.to_csv(path_or_buf=output_file, index=False)
26 | 
27 | 
if __name__ == '__main__':
    # Script entry point: sample products from a product-info CSV and save them.
    cli = argparse.ArgumentParser()
    cli.add_argument('--prod_info_csv', required=True, help='Path to prod info csv')
    cli.add_argument('--output_file', required=True, help='File to save sample into')
    options = cli.parse_args()

    save_big_sample(create_big_sample(options.prod_info_csv), options.output_file)
36 | 


--------------------------------------------------------------------------------
/src/data/category_idx.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import argparse
 3 | import numpy as np
 4 | 
 5 | 
 6 | def create_category_idx(prod_info):
 7 |     category_stats = prod_info.groupby(by='category_id').size()
 8 |     category_stats.sort_values(ascending=False, inplace=True)
 9 |     category_idx = pd.DataFrame([(i, v) for i, v in enumerate(category_stats.index.values)],
10 |                                 columns=['category_idx', 'category_id'])
11 |     return category_idx
12 | 
13 | 
def category_to_index_dict(category_idx):
    """Return a dict mapping each ``category_id`` to its ``category_idx``."""
    return dict(zip(category_idx['category_id'], category_idx['category_idx']))
16 | 
17 | 
def index_to_category_dict(category_idx):
    """Return a dict mapping each ``category_idx`` back to its ``category_id``."""
    return dict(zip(category_idx['category_idx'], category_idx['category_id']))
20 | 
21 | 
def map_categories(category_idx, categories):
    """Translate an iterable of category ids into an array of category indexes.

    Args:
        category_idx: DataFrame with ``category_idx`` and ``category_id`` columns.
        categories: Iterable of category ids; each must be present in
            ``category_idx`` (an unknown id raises ``KeyError``).

    Returns:
        ``numpy.ndarray`` of indexes, in the same order as ``categories``.
    """
    lookup = category_to_index_dict(category_idx)
    indexes = []
    for category in categories:
        indexes.append(lookup[category])
    return np.array(indexes)
25 | 
26 | 
if __name__ == '__main__':
    # Script entry point: build the category index table from a product-info CSV.
    cli = argparse.ArgumentParser()
    cli.add_argument('--prod_info_csv', required=True, help='Path to prod info csv')
    cli.add_argument('--output_file', required=True, help='File to save indexes into')
    options = cli.parse_args()

    products = pd.read_csv(options.prod_info_csv)
    create_category_idx(products).to_csv(options.output_file, index=False)
35 | 


--------------------------------------------------------------------------------
/src/data/product_info.py:
--------------------------------------------------------------------------------
 1 | import struct
 2 | import bson
 3 | import pandas as pd
 4 | from tqdm import tqdm
 5 | import argparse
 6 | 
 7 | 
 8 | def product_info(bson_path, with_categories):
 9 |     rows = {}
10 |     with open(bson_path, "rb") as f, tqdm() as pbar:
11 |         offset = 0
12 |         while True:
13 |             item_length_bytes = f.read(4)
14 |             if len(item_length_bytes) == 0:
15 |                 break
16 | 
17 |             length = struct.unpack("<i", item_length_bytes)[0]
18 | 
19 |             f.seek(offset)
20 |             item_data = f.read(length)
21 |             assert len(item_data) == length
22 | 
23 |             item = bson.BSON.decode(item_data)
24 |             product_id = item["_id"]
25 |             num_imgs = len(item["imgs"])
26 | 
27 |             row = [num_imgs, offset, length]
28 |             if with_categories:
29 |                 row += [item["category_id"]]
30 |             rows[product_id] = row
31 | 
32 |             offset += length
33 |             f.seek(offset)
34 |             pbar.update()
35 | 
36 |     columns = ["num_imgs", "offset", "length"]
37 |     if with_categories:
38 |         columns += ["category_id"]
39 | 
40 |     df = pd.DataFrame.from_dict(rows, orient="index")
41 |     df.index.name = "product_id"
42 |     df.columns = columns
43 |     df.sort_index(inplace=True)
44 |     return df
45 | 
46 | 
def save_product_info(product_info, file):
    """Write the product-info DataFrame, index column included, to ``file`` as CSV."""
    product_info.to_csv(path_or_buf=file)
49 | 
50 | 
if __name__ == '__main__':
    # Script entry point: index a products BSON dump and save the table as CSV.
    parser = argparse.ArgumentParser()
    parser.add_argument('--bson', required=True, help='Path to bson with products')
    parser.add_argument('--without_categories', dest='with_categories', action='store_false',
                        help="Products don't have category_id field?")
    parser.set_defaults(with_categories=True)
    parser.add_argument('--output_file', required=True, help='File to save products info into')

    args = parser.parse_args()
    # Use a distinct name for the result so the product_info function is not
    # rebound to a DataFrame.
    info = product_info(args.bson, args.with_categories)
    save_product_info(info, args.output_file)
62 | 


--------------------------------------------------------------------------------
/src/data/top_categories_sample.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import argparse
 3 | 
 4 | 
 5 | def top_categories_sample(prod_info, num_categories):
 6 |     category_stats = prod_info.groupby(by='category_id').size()
 7 |     category_stats.sort_values(ascending=False, inplace=True)
 8 |     categories = category_stats[:num_categories].index.values
 9 |     categories_set = set(categories)
10 |     chunks = []
11 |     for category, prods in prod_info.groupby(by='category_id'):
12 |         if category not in categories_set:
13 |             continue
14 |         chunks.append(prods)
15 |     sample = pd.concat(chunks)
16 |     sample = sample.reset_index(drop=True)
17 |     return sample
18 | 
19 | 
if __name__ == '__main__':
    # Script entry point: restrict a product-info CSV to its largest categories.
    cli = argparse.ArgumentParser()
    cli.add_argument('--prod_info_csv', required=True, help='Path to prod info csv')
    cli.add_argument('--output_file', required=True, help='File to save sample into')
    cli.add_argument('--num_categories', type=int, required=True, help='Number of categories to leave')
    options = cli.parse_args()

    products = pd.read_csv(options.prod_info_csv)
    top_categories_sample(products, options.num_categories).to_csv(options.output_file, index=False)
30 | 


--------------------------------------------------------------------------------
/src/data/train_split.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | import argparse
 4 | 
 5 | 
 6 | def train_slit(prod_info, split_size=100000):
 7 |     n = prod_info.shape[0]
 8 |     idx = np.arange(n)
 9 |     np.random.seed(321)
10 |     np.random.shuffle(idx)
11 |     split = np.zeros(n, dtype=np.bool)
12 |     split[idx[:-split_size]] = True
13 |     return prod_info.assign(train=split)[['product_id', 'train']]
14 | 
15 | 
if __name__ == '__main__':
    # Script entry point: compute the train/hold-out flags and save them as CSV.
    cli = argparse.ArgumentParser()
    cli.add_argument('--prod_info_csv', required=True,
                     help='Path to training prod info csv')
    cli.add_argument('--output_file', required=True,
                     help='Path to save split into')
    options = cli.parse_args()

    products = pd.read_csv(options.prod_info_csv)
    train_slit(products).to_csv(options.output_file, index=False)
26 | 


--------------------------------------------------------------------------------
/src/heng_cherkeng/excited_inception_v3.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from torch.autograd import Variable
  3 | import torch.nn.functional as F
  4 | 
  5 | 
  6 | import torch
  7 | import torch.nn as nn
  8 | import torch.nn.functional as F
  9 | import torch.utils.model_zoo as model_zoo
 10 | 
11 | # SE-Inception v3: Inception v3 with squeeze-and-excitation (SE) layers. See:
 12 | # https://github.com/moskomule/senet.pytorch/blob/master/se_inception.py
 13 | 
 14 | 
 15 | class SELayer(nn.Module):
 16 |     def __init__(self, channel, reduction=16):
 17 |         super(SELayer, self).__init__()
 18 |         self.avg_pool = nn.AdaptiveAvgPool2d(1)
 19 |         self.fc = nn.Sequential(
 20 |                 nn.Linear(channel, reduction), ##0
 21 |                 nn.ReLU(inplace=True),
 22 |                 nn.Linear(reduction, channel), ##2
 23 |                 nn.Sigmoid()
 24 |         )
 25 |         self.init_parameters(channel, reduction)
 26 | 
 27 |     def forward(self, x):
 28 |         b, c, _, _ = x.size()
 29 |         y = self.avg_pool(x).view(b, c)
 30 |         y = self.fc(y).view(b, c, 1, 1)
 31 |         return x * y
 32 | 
 33 |     # https://discuss.pytorch.org/t/weight-initilzation/157/41
 34 |     # https://discuss.pytorch.org/t/initalize-the-weights-of-nn-convtranspose2d/946
 35 |     def init_parameters(self, channel, reduction):
 36 |         self.fc._modules['0'].weight.data.normal_(0, 1)
 37 |         self.fc._modules['0'].bias.data.zero_()
 38 | 
 39 |         self.fc._modules['2'].weight.data.normal_(0, 1)
 40 |         self.fc._modules['2'].bias.data.zero_()
 41 | 
 42 | 
 43 | 
 44 | 
 45 | class SEInception3(nn.Module):
 46 | 
 47 |     def load_pretrain_pytorch_file(self,pytorch_file, skip=[]):
 48 |         pytorch_state_dict = torch.load(pytorch_file)
 49 |         state_dict = self.state_dict()
 50 | 
 51 |         keys = list(state_dict.keys())
 52 |         for key in keys:
 53 |             if key in skip:
 54 |                 continue
 55 |             if 'SELayer' in key:
 56 |                 continue
 57 |             state_dict[key] = pytorch_state_dict[key]
 58 | 
 59 |         self.load_state_dict(state_dict)
 60 |         pass
 61 | 
 62 |     #-----------------------------------------------------------------------
 63 |     def __init__(self, in_shape=(3,128,128), num_classes=1000 ):
 64 |         super(SEInception3, self).__init__()
 65 |         in_channels, height, width = in_shape
 66 |         self.num_classes=num_classes
 67 |         assert(in_channels==3)
 68 | 
 69 |         self.Conv2d_1a_3x3 = BasicConv2d(in_channels, 32, kernel_size=3, stride=2)
 70 |         self.Conv2d_2a_3x3 = BasicConv2d(32, 32, kernel_size=3)
 71 |         self.Conv2d_2b_3x3 = BasicConv2d(32, 64, kernel_size=3, padding=1)
 72 |         self.Conv2d_3b_1x1 = BasicConv2d(64, 80, kernel_size=1)
 73 |         self.Conv2d_4a_3x3 = BasicConv2d(80, 192, kernel_size=3)
 74 |         self.Mixed_5b = InceptionA(192, pool_features=32)
 75 |         self.Mixed_5c = InceptionA(256, pool_features=64)
 76 |         self.Mixed_5d = InceptionA(288, pool_features=64)
 77 |         self.Mixed_6a = InceptionB(288)
 78 |         self.Mixed_6b = InceptionC(768, channels_7x7=128)
 79 |         self.Mixed_6c = InceptionC(768, channels_7x7=160)
 80 |         self.Mixed_6d = InceptionC(768, channels_7x7=160)
 81 |         self.Mixed_6e = InceptionC(768, channels_7x7=192)
 82 |         self.Mixed_7a = InceptionD(768)
 83 |         self.Mixed_7b = InceptionE(1280)
 84 |         self.Mixed_7c = InceptionE(2048)
 85 | 
 86 |         ##------------------------------------------------------------
 87 |         ## add se
 88 |         self.Mixed_5b.add_module("SELayer", SELayer(192))
 89 |         self.Mixed_5c.add_module("SELayer", SELayer(256))
 90 |         self.Mixed_5d.add_module("SELayer", SELayer(288))
 91 |         self.Mixed_6a.add_module("SELayer", SELayer(288))
 92 |         self.Mixed_6b.add_module("SELayer", SELayer(768))
 93 |         self.Mixed_6c.add_module("SELayer", SELayer(768))
 94 |         self.Mixed_6d.add_module("SELayer", SELayer(768))
 95 |         self.Mixed_6e.add_module("SELayer", SELayer(768))
 96 |         self.Mixed_7a.add_module("SELayer", SELayer(768))
 97 |         self.Mixed_7b.add_module("SELayer", SELayer(1280))
 98 |         self.Mixed_7c.add_module("SELayer", SELayer(2048))
 99 | 
100 |         ##------------------------------------------------------------
101 |         self.fc = nn.Linear(2048, num_classes)
102 | 
103 |         for m in self.modules():
104 |             if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
105 |                 import scipy.stats as stats
106 |                 stddev = m.stddev if hasattr(m, 'stddev') else 0.1
107 |                 X = stats.truncnorm(-2, 2, scale=stddev)
108 |                 values = torch.Tensor(X.rvs(m.weight.data.numel()))
109 |                 values = values.view(m.weight.data.size())
110 |                 m.weight.data.copy_(values)
111 |             elif isinstance(m, nn.BatchNorm2d):
112 |                 m.weight.data.fill_(1)
113 |                 m.bias.data.zero_()
114 | 
115 |     def forward(self, x):
116 | 
117 |         # if self.transform_input:
118 |         #     x = x.clone()
119 |         #     x[:, 0] = x[:, 0] * (0.229 / 0.5) + (0.485 - 0.5) / 0.5
120 |         #     x[:, 1] = x[:, 1] * (0.224 / 0.5) + (0.456 - 0.5) / 0.5
121 |         #     x[:, 2] = x[:, 2] * (0.225 / 0.5) + (0.406 - 0.5) / 0.5
122 | 
123 | 
124 |         # 299 x 299 x 3
125 |         x = self.Conv2d_1a_3x3(x)
126 |         # 149 x 149 x 32
127 |         x = self.Conv2d_2a_3x3(x)
128 |         # 147 x 147 x 32
129 |         x = self.Conv2d_2b_3x3(x)
130 |         # 147 x 147 x 64
131 |         x = F.max_pool2d(x, kernel_size=3, stride=2)
132 |         # 73 x 73 x 64
133 |         x = self.Conv2d_3b_1x1(x)
134 |         # 73 x 73 x 80
135 |         x = self.Conv2d_4a_3x3(x)
136 |         # 71 x 71 x 192
137 |         x = F.max_pool2d(x, kernel_size=3, stride=2)
138 |         # 35 x 35 x 192
139 |         x = self.Mixed_5b(x)
140 |         # 35 x 35 x 256
141 |         x = self.Mixed_5c(x)
142 |         # 35 x 35 x 288
143 |         x = self.Mixed_5d(x)
144 |         # 35 x 35 x 288
145 |         x = self.Mixed_6a(x)
146 |         # 17 x 17 x 768
147 |         x = self.Mixed_6b(x)
148 |         # 17 x 17 x 768
149 |         x = self.Mixed_6c(x)
150 |         # 17 x 17 x 768
151 |         x = self.Mixed_6d(x)
152 |         # 17 x 17 x 768
153 |         x = self.Mixed_6e(x)
154 |         # 17 x 17 x 768
155 | 
156 |         # if self.training and self.aux_logits:
157 |         #     aux = self.AuxLogits(x)
158 | 
159 |         # 17 x 17 x 768
160 |         x = self.Mixed_7a(x)
161 |         # 8 x 8 x 1280
162 |         x = self.Mixed_7b(x)
163 |         # 8 x 8 x 2048
164 |         x = self.Mixed_7c(x)
165 |         # 8 x 8 x 2048
166 | 
167 |         #x = F.avg_pool2d(x, kernel_size=8)
168 |         x = F.adaptive_avg_pool2d(x, output_size=1)
169 | 
170 |         # 1 x 1 x 2048
171 |         x = F.dropout(x, training=self.training)
172 |         # 1 x 1 x 2048
173 |         x = x.view(x.size(0), -1)
174 |         # 2048
175 |         x = self.fc(x)
176 |         # 1000 (num_classes)
177 | 
178 |         # if self.training and self.aux_logits:
179 |         #     return x, aux
180 | 
181 |         return x
182 | 
183 | 
class InceptionA(nn.Module):
    """Inception block built from four parallel, size-preserving branches.

    The branches are a 1x1 convolution, a 1x1 -> 5x5 pair, a
    1x1 -> 3x3 -> 3x3 chain, and a 3x3 average pool followed by a 1x1
    convolution.  Their outputs are concatenated along dim 1, giving
    64 + 64 + 96 + pool_features output channels.
    """

    def __init__(self, in_channels, pool_features):
        super(InceptionA, self).__init__()
        # Attribute names and creation order are kept unchanged: they define
        # the state_dict keys used when loading pretrained weights.

        # 1x1 branch
        self.branch1x1 = BasicConv2d(in_channels, 64, kernel_size=1)

        # 1x1 -> 5x5 branch
        self.branch5x5_1 = BasicConv2d(in_channels, 48, kernel_size=1)
        self.branch5x5_2 = BasicConv2d(48, 64, kernel_size=5, padding=2)

        # 1x1 -> 3x3 -> 3x3 branch
        self.branch3x3dbl_1 = BasicConv2d(in_channels, 64, kernel_size=1)
        self.branch3x3dbl_2 = BasicConv2d(64, 96, kernel_size=3, padding=1)
        self.branch3x3dbl_3 = BasicConv2d(96, 96, kernel_size=3, padding=1)

        # 1x1 convolution applied after average pooling
        self.branch_pool = BasicConv2d(in_channels, pool_features, kernel_size=1)

    def forward(self, x):
        out_1x1 = self.branch1x1(x)

        out_5x5 = self.branch5x5_2(self.branch5x5_1(x))

        out_dbl = x
        for conv in (self.branch3x3dbl_1, self.branch3x3dbl_2, self.branch3x3dbl_3):
            out_dbl = conv(out_dbl)

        pooled = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1)
        out_pool = self.branch_pool(pooled)

        # Stack the branch outputs on the channel axis.
        return torch.cat([out_1x1, out_5x5, out_dbl, out_pool], 1)
214 | 
215 | 
class InceptionB(nn.Module):
    """Grid-reduction block with three stride-2 branches.

    A stride-2 3x3 convolution, a 1x1 -> 3x3 -> stride-2 3x3 chain and a
    stride-2 3x3 max pool are concatenated along dim 1, giving
    384 + 96 + in_channels output channels.
    """

    def __init__(self, in_channels):
        super(InceptionB, self).__init__()
        # Attribute names and creation order are kept unchanged: they define
        # the state_dict keys used when loading pretrained weights.

        # single stride-2 3x3 convolution
        self.branch3x3 = BasicConv2d(in_channels, 384, kernel_size=3, stride=2)

        # 1x1 -> 3x3 -> stride-2 3x3 chain
        self.branch3x3dbl_1 = BasicConv2d(in_channels, 64, kernel_size=1)
        self.branch3x3dbl_2 = BasicConv2d(64, 96, kernel_size=3, padding=1)
        self.branch3x3dbl_3 = BasicConv2d(96, 96, kernel_size=3, stride=2)

    def forward(self, x):
        out_3x3 = self.branch3x3(x)

        out_dbl = x
        for conv in (self.branch3x3dbl_1, self.branch3x3dbl_2, self.branch3x3dbl_3):
            out_dbl = conv(out_dbl)

        # The pooled branch has no learnable layer of its own.
        out_pool = F.max_pool2d(x, kernel_size=3, stride=2)

        return torch.cat([out_3x3, out_dbl, out_pool], 1)
237 | 
238 | 
class InceptionC(nn.Module):
    """Inception block using factorised 7x7 convolutions (1x7 and 7x1 pairs).

    Four size-preserving branches, each ending in 192 channels, are
    concatenated along dim 1 for 768 output channels.  channels_7x7 sets the
    width of the intermediate layers in the two 7x7 branches.
    """

    def __init__(self, in_channels, channels_7x7):
        super(InceptionC, self).__init__()
        # Attribute names and creation order are kept unchanged: they define
        # the state_dict keys used when loading pretrained weights.
        mid = channels_7x7

        # 1x1 branch
        self.branch1x1 = BasicConv2d(in_channels, 192, kernel_size=1)

        # 1x1 -> 1x7 -> 7x1 branch
        self.branch7x7_1 = BasicConv2d(in_channels, mid, kernel_size=1)
        self.branch7x7_2 = BasicConv2d(mid, mid, kernel_size=(1, 7), padding=(0, 3))
        self.branch7x7_3 = BasicConv2d(mid, 192, kernel_size=(7, 1), padding=(3, 0))

        # 1x1 -> 7x1 -> 1x7 -> 7x1 -> 1x7 branch
        self.branch7x7dbl_1 = BasicConv2d(in_channels, mid, kernel_size=1)
        self.branch7x7dbl_2 = BasicConv2d(mid, mid, kernel_size=(7, 1), padding=(3, 0))
        self.branch7x7dbl_3 = BasicConv2d(mid, mid, kernel_size=(1, 7), padding=(0, 3))
        self.branch7x7dbl_4 = BasicConv2d(mid, mid, kernel_size=(7, 1), padding=(3, 0))
        self.branch7x7dbl_5 = BasicConv2d(mid, 192, kernel_size=(1, 7), padding=(0, 3))

        # 1x1 convolution applied after average pooling
        self.branch_pool = BasicConv2d(in_channels, 192, kernel_size=1)

    def forward(self, x):
        out_1x1 = self.branch1x1(x)

        out_7x7 = x
        for conv in (self.branch7x7_1, self.branch7x7_2, self.branch7x7_3):
            out_7x7 = conv(out_7x7)

        out_7x7dbl = x
        for conv in (self.branch7x7dbl_1, self.branch7x7dbl_2, self.branch7x7dbl_3,
                     self.branch7x7dbl_4, self.branch7x7dbl_5):
            out_7x7dbl = conv(out_7x7dbl)

        pooled = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1)
        out_pool = self.branch_pool(pooled)

        return torch.cat([out_1x1, out_7x7, out_7x7dbl, out_pool], 1)
276 | 
277 | 
class InceptionD(nn.Module):
    """Grid-reduction block combining a 3x3 branch, a 7x7x3 branch and a max pool.

    Every branch ends with a stride-2 operation; the outputs are concatenated
    along dim 1, giving 320 + 192 + in_channels output channels.
    """

    def __init__(self, in_channels):
        super(InceptionD, self).__init__()
        # Attribute names and creation order are kept unchanged: they define
        # the state_dict keys used when loading pretrained weights.

        # 1x1 -> stride-2 3x3 branch
        self.branch3x3_1 = BasicConv2d(in_channels, 192, kernel_size=1)
        self.branch3x3_2 = BasicConv2d(192, 320, kernel_size=3, stride=2)

        # 1x1 -> 1x7 -> 7x1 -> stride-2 3x3 branch
        self.branch7x7x3_1 = BasicConv2d(in_channels, 192, kernel_size=1)
        self.branch7x7x3_2 = BasicConv2d(192, 192, kernel_size=(1, 7), padding=(0, 3))
        self.branch7x7x3_3 = BasicConv2d(192, 192, kernel_size=(7, 1), padding=(3, 0))
        self.branch7x7x3_4 = BasicConv2d(192, 192, kernel_size=3, stride=2)

    def forward(self, x):
        out_3x3 = self.branch3x3_2(self.branch3x3_1(x))

        out_7x7x3 = x
        for conv in (self.branch7x7x3_1, self.branch7x7x3_2,
                     self.branch7x7x3_3, self.branch7x7x3_4):
            out_7x7x3 = conv(out_7x7x3)

        # The pooled branch has no learnable layer of its own.
        out_pool = F.max_pool2d(x, kernel_size=3, stride=2)

        return torch.cat([out_3x3, out_7x7x3, out_pool], 1)
302 | 
303 | 
class InceptionE(nn.Module):
    """Inception block whose 3x3 branches fan out into parallel 1x3 and 3x1 convs.

    All branches preserve the spatial size.  Concatenated along dim 1 they
    give 320 + (384 + 384) + (384 + 384) + 192 = 2048 output channels.
    """

    def __init__(self, in_channels):
        super(InceptionE, self).__init__()
        # Attribute names and creation order are kept unchanged: they define
        # the state_dict keys used when loading pretrained weights.

        # 1x1 branch
        self.branch1x1 = BasicConv2d(in_channels, 320, kernel_size=1)

        # 1x1 followed by parallel 1x3 / 3x1 convolutions
        self.branch3x3_1 = BasicConv2d(in_channels, 384, kernel_size=1)
        self.branch3x3_2a = BasicConv2d(384, 384, kernel_size=(1, 3), padding=(0, 1))
        self.branch3x3_2b = BasicConv2d(384, 384, kernel_size=(3, 1), padding=(1, 0))

        # 1x1 -> 3x3 followed by parallel 1x3 / 3x1 convolutions
        self.branch3x3dbl_1 = BasicConv2d(in_channels, 448, kernel_size=1)
        self.branch3x3dbl_2 = BasicConv2d(448, 384, kernel_size=3, padding=1)
        self.branch3x3dbl_3a = BasicConv2d(384, 384, kernel_size=(1, 3), padding=(0, 1))
        self.branch3x3dbl_3b = BasicConv2d(384, 384, kernel_size=(3, 1), padding=(1, 0))

        # 1x1 convolution applied after average pooling
        self.branch_pool = BasicConv2d(in_channels, 192, kernel_size=1)

    def forward(self, x):
        out_1x1 = self.branch1x1(x)

        # Shared 1x1 stem, then the "a" and "b" heads side by side.
        stem_3x3 = self.branch3x3_1(x)
        out_3x3 = torch.cat(
            [self.branch3x3_2a(stem_3x3), self.branch3x3_2b(stem_3x3)], 1)

        stem_dbl = self.branch3x3dbl_2(self.branch3x3dbl_1(x))
        out_dbl = torch.cat(
            [self.branch3x3dbl_3a(stem_dbl), self.branch3x3dbl_3b(stem_dbl)], 1)

        pooled = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1)
        out_pool = self.branch_pool(pooled)

        return torch.cat([out_1x1, out_3x3, out_dbl, out_pool], 1)
344 | 
345 | 
class InceptionAux(nn.Module):
    """Auxiliary classifier head: average pool, two convolutions, linear layer.

    conv1 and fc carry a `stddev` attribute (0.01 and 0.001); the network
    constructors in this file look for such an attribute when initialising
    Conv2d / Linear weights.
    """

    def __init__(self, in_channels, num_classes):
        super(InceptionAux, self).__init__()
        self.conv0 = BasicConv2d(in_channels, 128, kernel_size=1)

        self.conv1 = BasicConv2d(128, 768, kernel_size=5)
        self.conv1.stddev = 0.01

        self.fc = nn.Linear(768, num_classes)
        self.fc.stddev = 0.001

    def forward(self, x):
        # 17 x 17 x 768 -> 5 x 5 x 768
        pooled = F.avg_pool2d(x, kernel_size=5, stride=3)
        # -> 5 x 5 x 128 -> 1 x 1 x 768
        features = self.conv1(self.conv0(pooled))
        # Flatten to (batch, 768) and classify.
        flat = features.view(features.size(0), -1)
        return self.fc(flat)
369 | 
370 | 
class BasicConv2d(nn.Module):
    """Bias-free 2-D convolution followed by batch normalisation and ReLU.

    Any extra keyword arguments (kernel_size, stride, padding, ...) are
    forwarded unchanged to nn.Conv2d.
    """

    def __init__(self, in_channels, out_channels, **kwargs):
        super(BasicConv2d, self).__init__()
        # The convolution has no bias; the batch norm (eps=0.001) follows it.
        self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
        self.bn = nn.BatchNorm2d(out_channels, eps=0.001)

    def forward(self, x):
        normalised = self.bn(self.conv(x))
        # ReLU overwrites the freshly created batch-norm output in place.
        return F.relu(normalised, inplace=True)
382 | 
383 | 
384 | #####################################################################################################3
385 | 
386 | 
def run_check_net():
    """Build the network, load pretrained weights and run one forward/backward pass.

    Requires a CUDA device and the checkpoint file at the hard-coded path.
    The network type, its structure and the softmax probabilities are printed.
    """
    # https://discuss.pytorch.org/t/print-autograd-graph/692/8
    n_images, n_classes = 1, 5000
    channels, height, width = 3, 128, 128

    # Random stand-ins for an image batch and its targets.
    images = torch.randn(n_images, channels, height, width)
    targets = torch.randn(n_images, n_classes)

    net = SEInception3(in_shape=images.size()[1:], num_classes=n_classes)
    # The classifier layer is skipped: its size depends on n_classes.
    net.load_pretrain_pytorch_file(
        '/root/share/data/models/pytorch/imagenet/inception/inception_v3_google-1a9a5a14.pth',
        skip=['fc.weight', 'fc.bias'],
    )
    net.cuda()
    net.train()

    logits = net.forward(Variable(images).cuda())
    probs = F.softmax(logits)

    loss = F.binary_cross_entropy_with_logits(logits, Variable(targets).cuda())
    loss.backward()

    for item in (type(net), net, 'probs', probs):
        print(item)

    #merging
    # net.eval()
    # net.merge_bn()
423 | 
424 | 
425 | ########################################################################################
# Script entry point: print this file's base name, then call run_check_net().
if __name__ == '__main__':
    print( '%s: calling main function ... ' % os.path.basename(__file__))

    run_check_net()


--------------------------------------------------------------------------------
/src/heng_cherkeng/inception_v3.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from torch.autograd import Variable
  3 | import torch.nn.functional as F
  4 | 
  5 | 
  6 | import torch
  7 | import torch.nn as nn
  8 | import torch.nn.functional as F
  9 | import torch.utils.model_zoo as model_zoo
 10 | 
 11 | 
 12 | # __all__ = ['Inception3', 'inception_v3']
 13 | #
 14 | #
 15 | # model_urls = {
 16 | #     # Inception v3 ported from TensorFlow
 17 | #     'inception_v3_google': 'https://download.pytorch.org/models/inception_v3_google-1a9a5a14.pth',
 18 | # }
 19 | 
 20 | 
 21 | # def inception_v3(pretrained=False, **kwargs):
 22 | #     r"""Inception v3 model architecture from
 23 | #     `"Rethinking the Inception Architecture for Computer Vision" <http://arxiv.org/abs/1512.00567>`_.
 24 | #     Args:
 25 | #         pretrained (bool): If True, returns a model pre-trained on ImageNet
 26 | #     """
 27 | #     if pretrained:
 28 | #         if 'transform_input' not in kwargs:
 29 | #             kwargs['transform_input'] = True
 30 | #         model = Inception3(**kwargs)
 31 | #         model.load_state_dict(model_zoo.load_url(model_urls['inception_v3_google']))
 32 | #         return model
 33 | #
 34 | #     return Inception3(**kwargs)
 35 | 
 36 | 
 37 | class Inception3(nn.Module):
 38 | 
 39 |     def load_pretrain_pytorch_file(self,pytorch_file, skip=[]):
 40 |         pytorch_state_dict = torch.load(pytorch_file)
 41 |         state_dict = self.state_dict()
 42 | 
 43 |         keys = list(state_dict.keys())
 44 |         for key in keys:
 45 |             if key in skip: continue
 46 |             state_dict[key] = pytorch_state_dict[key]
 47 | 
 48 |         self.load_state_dict(state_dict)
 49 |         pass
 50 | 
 51 |     #-----------------------------------------------------------------------
 52 | 
 53 |     def __init__(self, in_shape=(3,128,128), num_classes=1000 ):
 54 |         super(Inception3, self).__init__()
 55 |         in_channels, height, width = in_shape
 56 |         self.num_classes=num_classes
 57 |         assert(in_channels==3)
 58 | 
 59 |         # self.aux_logits = aux_logits
 60 |         # self.transform_input = transform_input
 61 |         self.Conv2d_1a_3x3 = BasicConv2d(in_channels, 32, kernel_size=3, stride=2)
 62 |         self.Conv2d_2a_3x3 = BasicConv2d(32, 32, kernel_size=3)
 63 |         self.Conv2d_2b_3x3 = BasicConv2d(32, 64, kernel_size=3, padding=1)
 64 |         self.Conv2d_3b_1x1 = BasicConv2d(64, 80, kernel_size=1)
 65 |         self.Conv2d_4a_3x3 = BasicConv2d(80, 192, kernel_size=3)
 66 |         self.Mixed_5b = InceptionA(192, pool_features=32)
 67 |         self.Mixed_5c = InceptionA(256, pool_features=64)
 68 |         self.Mixed_5d = InceptionA(288, pool_features=64)
 69 |         self.Mixed_6a = InceptionB(288)
 70 |         self.Mixed_6b = InceptionC(768, channels_7x7=128)
 71 |         self.Mixed_6c = InceptionC(768, channels_7x7=160)
 72 |         self.Mixed_6d = InceptionC(768, channels_7x7=160)
 73 |         self.Mixed_6e = InceptionC(768, channels_7x7=192)
 74 |         # if aux_logits:
 75 |         #     self.AuxLogits = InceptionAux(768, num_classes)
 76 |         self.Mixed_7a = InceptionD(768)
 77 |         self.Mixed_7b = InceptionE(1280)
 78 |         self.Mixed_7c = InceptionE(2048)
 79 |         self.fc = nn.Linear(2048, num_classes)
 80 | 
 81 |         for m in self.modules():
 82 |             if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
 83 |                 import scipy.stats as stats
 84 |                 stddev = m.stddev if hasattr(m, 'stddev') else 0.1
 85 |                 X = stats.truncnorm(-2, 2, scale=stddev)
 86 |                 values = torch.Tensor(X.rvs(m.weight.data.numel()))
 87 |                 values = values.view(m.weight.data.size())
 88 |                 m.weight.data.copy_(values)
 89 |             elif isinstance(m, nn.BatchNorm2d):
 90 |                 m.weight.data.fill_(1)
 91 |                 m.bias.data.zero_()
 92 | 
 93 |     def forward(self, x):
 94 | 
 95 |         # if self.transform_input:
 96 |         #     x = x.clone()
 97 |         #     x[:, 0] = x[:, 0] * (0.229 / 0.5) + (0.485 - 0.5) / 0.5
 98 |         #     x[:, 1] = x[:, 1] * (0.224 / 0.5) + (0.456 - 0.5) / 0.5
 99 |         #     x[:, 2] = x[:, 2] * (0.225 / 0.5) + (0.406 - 0.5) / 0.5
100 | 
101 | 
102 |         # 299 x 299 x 3
103 |         x = self.Conv2d_1a_3x3(x)
104 |         # 149 x 149 x 32
105 |         x = self.Conv2d_2a_3x3(x)
106 |         # 147 x 147 x 32
107 |         x = self.Conv2d_2b_3x3(x)
108 |         # 147 x 147 x 64
109 |         x = F.max_pool2d(x, kernel_size=3, stride=2)
110 |         # 73 x 73 x 64
111 |         x = self.Conv2d_3b_1x1(x)
112 |         # 73 x 73 x 80
113 |         x = self.Conv2d_4a_3x3(x)
114 |         # 71 x 71 x 192
115 |         x = F.max_pool2d(x, kernel_size=3, stride=2)
116 |         # 35 x 35 x 192
117 |         x = self.Mixed_5b(x)
118 |         # 35 x 35 x 256
119 |         x = self.Mixed_5c(x)
120 |         # 35 x 35 x 288
121 |         x = self.Mixed_5d(x)
122 |         # 35 x 35 x 288
123 |         x = self.Mixed_6a(x)
124 |         # 17 x 17 x 768
125 |         x = self.Mixed_6b(x)
126 |         # 17 x 17 x 768
127 |         x = self.Mixed_6c(x)
128 |         # 17 x 17 x 768
129 |         x = self.Mixed_6d(x)
130 |         # 17 x 17 x 768
131 |         x = self.Mixed_6e(x)
132 |         # 17 x 17 x 768
133 | 
134 |         # if self.training and self.aux_logits:
135 |         #     aux = self.AuxLogits(x)
136 | 
137 |         # 17 x 17 x 768
138 |         x = self.Mixed_7a(x)
139 |         # 8 x 8 x 1280
140 |         x = self.Mixed_7b(x)
141 |         # 8 x 8 x 2048
142 |         x = self.Mixed_7c(x)
143 |         # 8 x 8 x 2048
144 | 
145 |         #x = F.avg_pool2d(x, kernel_size=8)
146 |         x = F.adaptive_avg_pool2d(x, output_size=1)
147 | 
148 |         # 1 x 1 x 2048
149 |         #x = F.dropout(x, training=self.training)
150 |         # 1 x 1 x 2048
151 |         x = x.view(x.size(0), -1)
152 |         # 2048
153 |         x = self.fc(x)
154 |         # 1000 (num_classes)
155 | 
156 |         # if self.training and self.aux_logits:
157 |         #     return x, aux
158 | 
159 |         return x
160 | 
161 | 
class InceptionA(nn.Module):
    """Inception block built from four parallel, size-preserving branches.

    The branches are a 1x1 convolution, a 1x1 -> 5x5 pair, a
    1x1 -> 3x3 -> 3x3 chain, and a 3x3 average pool followed by a 1x1
    convolution.  Their outputs are concatenated along dim 1, giving
    64 + 64 + 96 + pool_features output channels.
    """

    def __init__(self, in_channels, pool_features):
        super(InceptionA, self).__init__()
        # Attribute names and creation order are kept unchanged: they define
        # the state_dict keys used when loading pretrained weights.

        # 1x1 branch
        self.branch1x1 = BasicConv2d(in_channels, 64, kernel_size=1)

        # 1x1 -> 5x5 branch
        self.branch5x5_1 = BasicConv2d(in_channels, 48, kernel_size=1)
        self.branch5x5_2 = BasicConv2d(48, 64, kernel_size=5, padding=2)

        # 1x1 -> 3x3 -> 3x3 branch
        self.branch3x3dbl_1 = BasicConv2d(in_channels, 64, kernel_size=1)
        self.branch3x3dbl_2 = BasicConv2d(64, 96, kernel_size=3, padding=1)
        self.branch3x3dbl_3 = BasicConv2d(96, 96, kernel_size=3, padding=1)

        # 1x1 convolution applied after average pooling
        self.branch_pool = BasicConv2d(in_channels, pool_features, kernel_size=1)

    def forward(self, x):
        out_1x1 = self.branch1x1(x)

        out_5x5 = self.branch5x5_2(self.branch5x5_1(x))

        out_dbl = x
        for conv in (self.branch3x3dbl_1, self.branch3x3dbl_2, self.branch3x3dbl_3):
            out_dbl = conv(out_dbl)

        pooled = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1)
        out_pool = self.branch_pool(pooled)

        # Stack the branch outputs on the channel axis.
        return torch.cat([out_1x1, out_5x5, out_dbl, out_pool], 1)
192 | 
193 | 
class InceptionB(nn.Module):
    """Grid-reduction block with three stride-2 branches.

    A stride-2 3x3 convolution, a 1x1 -> 3x3 -> stride-2 3x3 chain and a
    stride-2 3x3 max pool are concatenated along dim 1, giving
    384 + 96 + in_channels output channels.
    """

    def __init__(self, in_channels):
        super(InceptionB, self).__init__()
        # Attribute names and creation order are kept unchanged: they define
        # the state_dict keys used when loading pretrained weights.

        # single stride-2 3x3 convolution
        self.branch3x3 = BasicConv2d(in_channels, 384, kernel_size=3, stride=2)

        # 1x1 -> 3x3 -> stride-2 3x3 chain
        self.branch3x3dbl_1 = BasicConv2d(in_channels, 64, kernel_size=1)
        self.branch3x3dbl_2 = BasicConv2d(64, 96, kernel_size=3, padding=1)
        self.branch3x3dbl_3 = BasicConv2d(96, 96, kernel_size=3, stride=2)

    def forward(self, x):
        out_3x3 = self.branch3x3(x)

        out_dbl = x
        for conv in (self.branch3x3dbl_1, self.branch3x3dbl_2, self.branch3x3dbl_3):
            out_dbl = conv(out_dbl)

        # The pooled branch has no learnable layer of its own.
        out_pool = F.max_pool2d(x, kernel_size=3, stride=2)

        return torch.cat([out_3x3, out_dbl, out_pool], 1)
215 | 
216 | 
class InceptionC(nn.Module):
    """Inception block using factorised 7x7 convolutions (1x7 and 7x1 pairs).

    Four size-preserving branches, each ending in 192 channels, are
    concatenated along dim 1 for 768 output channels.  channels_7x7 sets the
    width of the intermediate layers in the two 7x7 branches.
    """

    def __init__(self, in_channels, channels_7x7):
        super(InceptionC, self).__init__()
        # Attribute names and creation order are kept unchanged: they define
        # the state_dict keys used when loading pretrained weights.
        mid = channels_7x7

        # 1x1 branch
        self.branch1x1 = BasicConv2d(in_channels, 192, kernel_size=1)

        # 1x1 -> 1x7 -> 7x1 branch
        self.branch7x7_1 = BasicConv2d(in_channels, mid, kernel_size=1)
        self.branch7x7_2 = BasicConv2d(mid, mid, kernel_size=(1, 7), padding=(0, 3))
        self.branch7x7_3 = BasicConv2d(mid, 192, kernel_size=(7, 1), padding=(3, 0))

        # 1x1 -> 7x1 -> 1x7 -> 7x1 -> 1x7 branch
        self.branch7x7dbl_1 = BasicConv2d(in_channels, mid, kernel_size=1)
        self.branch7x7dbl_2 = BasicConv2d(mid, mid, kernel_size=(7, 1), padding=(3, 0))
        self.branch7x7dbl_3 = BasicConv2d(mid, mid, kernel_size=(1, 7), padding=(0, 3))
        self.branch7x7dbl_4 = BasicConv2d(mid, mid, kernel_size=(7, 1), padding=(3, 0))
        self.branch7x7dbl_5 = BasicConv2d(mid, 192, kernel_size=(1, 7), padding=(0, 3))

        # 1x1 convolution applied after average pooling
        self.branch_pool = BasicConv2d(in_channels, 192, kernel_size=1)

    def forward(self, x):
        out_1x1 = self.branch1x1(x)

        out_7x7 = x
        for conv in (self.branch7x7_1, self.branch7x7_2, self.branch7x7_3):
            out_7x7 = conv(out_7x7)

        out_7x7dbl = x
        for conv in (self.branch7x7dbl_1, self.branch7x7dbl_2, self.branch7x7dbl_3,
                     self.branch7x7dbl_4, self.branch7x7dbl_5):
            out_7x7dbl = conv(out_7x7dbl)

        pooled = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1)
        out_pool = self.branch_pool(pooled)

        return torch.cat([out_1x1, out_7x7, out_7x7dbl, out_pool], 1)
254 | 
255 | 
class InceptionD(nn.Module):
    """Grid-reduction block combining a 3x3 branch, a 7x7x3 branch and a max pool.

    Every branch ends with a stride-2 operation; the outputs are concatenated
    along dim 1, giving 320 + 192 + in_channels output channels.
    """

    def __init__(self, in_channels):
        super(InceptionD, self).__init__()
        # Attribute names and creation order are kept unchanged: they define
        # the state_dict keys used when loading pretrained weights.

        # 1x1 -> stride-2 3x3 branch
        self.branch3x3_1 = BasicConv2d(in_channels, 192, kernel_size=1)
        self.branch3x3_2 = BasicConv2d(192, 320, kernel_size=3, stride=2)

        # 1x1 -> 1x7 -> 7x1 -> stride-2 3x3 branch
        self.branch7x7x3_1 = BasicConv2d(in_channels, 192, kernel_size=1)
        self.branch7x7x3_2 = BasicConv2d(192, 192, kernel_size=(1, 7), padding=(0, 3))
        self.branch7x7x3_3 = BasicConv2d(192, 192, kernel_size=(7, 1), padding=(3, 0))
        self.branch7x7x3_4 = BasicConv2d(192, 192, kernel_size=3, stride=2)

    def forward(self, x):
        out_3x3 = self.branch3x3_2(self.branch3x3_1(x))

        out_7x7x3 = x
        for conv in (self.branch7x7x3_1, self.branch7x7x3_2,
                     self.branch7x7x3_3, self.branch7x7x3_4):
            out_7x7x3 = conv(out_7x7x3)

        # The pooled branch has no learnable layer of its own.
        out_pool = F.max_pool2d(x, kernel_size=3, stride=2)

        return torch.cat([out_3x3, out_7x7x3, out_pool], 1)
280 | 
281 | 
class InceptionE(nn.Module):
    """Inception block whose 3x3 branches fan out into parallel 1x3 and 3x1 convs.

    All branches preserve the spatial size.  Concatenated along dim 1 they
    give 320 + (384 + 384) + (384 + 384) + 192 = 2048 output channels.
    """

    def __init__(self, in_channels):
        super(InceptionE, self).__init__()
        # Attribute names and creation order are kept unchanged: they define
        # the state_dict keys used when loading pretrained weights.

        # 1x1 branch
        self.branch1x1 = BasicConv2d(in_channels, 320, kernel_size=1)

        # 1x1 followed by parallel 1x3 / 3x1 convolutions
        self.branch3x3_1 = BasicConv2d(in_channels, 384, kernel_size=1)
        self.branch3x3_2a = BasicConv2d(384, 384, kernel_size=(1, 3), padding=(0, 1))
        self.branch3x3_2b = BasicConv2d(384, 384, kernel_size=(3, 1), padding=(1, 0))

        # 1x1 -> 3x3 followed by parallel 1x3 / 3x1 convolutions
        self.branch3x3dbl_1 = BasicConv2d(in_channels, 448, kernel_size=1)
        self.branch3x3dbl_2 = BasicConv2d(448, 384, kernel_size=3, padding=1)
        self.branch3x3dbl_3a = BasicConv2d(384, 384, kernel_size=(1, 3), padding=(0, 1))
        self.branch3x3dbl_3b = BasicConv2d(384, 384, kernel_size=(3, 1), padding=(1, 0))

        # 1x1 convolution applied after average pooling
        self.branch_pool = BasicConv2d(in_channels, 192, kernel_size=1)

    def forward(self, x):
        out_1x1 = self.branch1x1(x)

        # Shared 1x1 stem, then the "a" and "b" heads side by side.
        stem_3x3 = self.branch3x3_1(x)
        out_3x3 = torch.cat(
            [self.branch3x3_2a(stem_3x3), self.branch3x3_2b(stem_3x3)], 1)

        stem_dbl = self.branch3x3dbl_2(self.branch3x3dbl_1(x))
        out_dbl = torch.cat(
            [self.branch3x3dbl_3a(stem_dbl), self.branch3x3dbl_3b(stem_dbl)], 1)

        pooled = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1)
        out_pool = self.branch_pool(pooled)

        return torch.cat([out_1x1, out_3x3, out_dbl, out_pool], 1)
322 | 
323 | 
class InceptionAux(nn.Module):
    """Auxiliary classifier head: average pool, two convolutions, linear layer.

    conv1 and fc carry a `stddev` attribute (0.01 and 0.001); the network
    constructor in this file looks for such an attribute when initialising
    Conv2d / Linear weights.
    """

    def __init__(self, in_channels, num_classes):
        super(InceptionAux, self).__init__()
        self.conv0 = BasicConv2d(in_channels, 128, kernel_size=1)

        self.conv1 = BasicConv2d(128, 768, kernel_size=5)
        self.conv1.stddev = 0.01

        self.fc = nn.Linear(768, num_classes)
        self.fc.stddev = 0.001

    def forward(self, x):
        # 17 x 17 x 768 -> 5 x 5 x 768
        pooled = F.avg_pool2d(x, kernel_size=5, stride=3)
        # -> 5 x 5 x 128 -> 1 x 1 x 768
        features = self.conv1(self.conv0(pooled))
        # Flatten to (batch, 768) and classify.
        flat = features.view(features.size(0), -1)
        return self.fc(flat)
347 | 
348 | 
class BasicConv2d(nn.Module):
    """Bias-free 2-D convolution followed by batch normalisation and ReLU.

    Any extra keyword arguments (kernel_size, stride, padding, ...) are
    forwarded unchanged to nn.Conv2d.
    """

    def __init__(self, in_channels, out_channels, **kwargs):
        super(BasicConv2d, self).__init__()
        # The convolution has no bias; the batch norm (eps=0.001) follows it.
        self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
        self.bn = nn.BatchNorm2d(out_channels, eps=0.001)

    def forward(self, x):
        normalised = self.bn(self.conv(x))
        # ReLU overwrites the freshly created batch-norm output in place.
        return F.relu(normalised, inplace=True)
360 | 
361 | 
362 | #####################################################################################################3
363 | 
364 | 
def run_check_net():
    """Build the network, load pretrained weights and run one forward/backward pass.

    Requires a CUDA device and the checkpoint file at the hard-coded path.
    The network type, its structure and the softmax probabilities are printed.
    """
    # https://discuss.pytorch.org/t/print-autograd-graph/692/8
    n_images, n_classes = 1, 5000
    channels, height, width = 3, 128, 128

    # Random stand-ins for an image batch and its targets.
    images = torch.randn(n_images, channels, height, width)
    targets = torch.randn(n_images, n_classes)

    net = Inception3(in_shape=images.size()[1:], num_classes=n_classes)
    # The classifier layer is skipped: its size depends on n_classes.
    net.load_pretrain_pytorch_file(
        '/root/share/data/models/pytorch/imagenet/inception/inception_v3_google-1a9a5a14.pth',
        skip=['fc.weight', 'fc.bias'],
    )
    net.cuda()
    net.train()

    logits = net.forward(Variable(images).cuda())
    probs = F.softmax(logits)

    loss = F.binary_cross_entropy_with_logits(logits, Variable(targets).cuda())
    loss.backward()

    for item in (type(net), net, 'probs', probs):
        print(item)

    #merging
    # net.eval()
    # net.merge_bn()
401 | 
402 | 
403 | ########################################################################################
# Script entry point: print this file's base name, then call run_check_net().
if __name__ == '__main__':
    print( '%s: calling main function ... ' % os.path.basename(__file__))

    run_check_net()
408 | 
409 | 
410 | 
411 | 
412 | 
413 | 
414 | 
415 | 
416 | 
417 | 
418 | 
419 | 
420 | 


--------------------------------------------------------------------------------
/src/heng_cherkeng/resnet101.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from torch.autograd import Variable
  3 | import torch
  4 | import torch.nn as nn
  5 | import torch.nn.functional as F
  6 | import cv2
  7 | import numpy as np
  8 | 
  9 | #https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py
 10 | 
 11 | #----- helper functions ------------------------------
 12 | BN_EPS = 1e-5  #1e-4  #1e-5
 13 | 
 14 | 
 15 | class ConvBn2d(nn.Module):
 16 | 
 17 |     def merge_bn(self):
 18 |         #raise NotImplementedError
 19 |         assert(self.conv.bias==None)
 20 |         conv_weight     = self.conv.weight.data
 21 |         bn_weight       = self.bn.weight.data
 22 |         bn_bias         = self.bn.bias.data
 23 |         bn_running_mean = self.bn.running_mean
 24 |         bn_running_var  = self.bn.running_var
 25 |         bn_eps          = self.bn.eps
 26 | 
 27 |         #https://github.com/sanghoon/pva-faster-rcnn/issues/5
 28 |         #https://github.com/sanghoon/pva-faster-rcnn/commit/39570aab8c6513f0e76e5ab5dba8dfbf63e9c68c
 29 | 
 30 |         N,C,KH,KW = conv_weight.size()
 31 |         std = 1/(torch.sqrt(bn_running_var+bn_eps))
 32 |         std_bn_weight =(std*bn_weight).repeat(C*KH*KW,1).t().contiguous().view(N,C,KH,KW )
 33 |         conv_weight_hat = std_bn_weight*conv_weight
 34 |         conv_bias_hat   = (bn_bias - bn_weight*std*bn_running_mean)
 35 | 
 36 |         self.is_bn = False
 37 |         self.bn = None
 38 |         self.conv = nn.Conv2d(in_channels=self.conv.in_channels, out_channels=self.conv.out_channels, kernel_size=self.conv.kernel_size,
 39 |                               padding=self.conv.padding, stride=self.conv.stride, dilation=self.conv.dilation, groups=self.conv.groups,
 40 |                               bias=True)
 41 |         self.conv.weight.data = conv_weight_hat #fill in
 42 |         self.conv.bias.data   = conv_bias_hat
 43 | 
 44 | 
 45 |     def __init__(self, in_channels, out_channels, kernel_size=3, padding=1, dilation=1, stride=1, groups=1):
 46 |         super(ConvBn2d, self).__init__()
 47 |         self.is_bn = True
 48 | 
 49 |         self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, padding=padding, stride=stride, dilation=dilation, groups=groups, bias=False)
 50 |         self.bn = nn.BatchNorm2d(out_channels, eps=BN_EPS)
 51 | 
 52 |     def forward(self,x):
 53 |         x = self.conv(x)
 54 |         if self.is_bn :
 55 |             x = self.bn(x)
 56 | 
 57 |         return x
 58 | 
 59 | 
 60 | 
 61 | class Bottleneck(nn.Module):
 62 |     def __init__(self, in_planes, planes, out_planes, is_downsample=False, stride=1):
 63 |         super(Bottleneck, self).__init__()
 64 |         self.is_downsample = is_downsample
 65 | 
 66 |         self.conv_bn1 = ConvBn2d(in_planes,     planes, kernel_size=1, padding=0, stride=1)
 67 |         self.conv_bn2 = ConvBn2d(   planes,     planes, kernel_size=3, padding=1, stride=stride)
 68 |         self.conv_bn3 = ConvBn2d(   planes, out_planes, kernel_size=1, padding=0, stride=1)
 69 | 
 70 |         if self.is_downsample:
 71 |             self.downsample = ConvBn2d(in_planes, out_planes, kernel_size=1, padding=0, stride=stride)
 72 | 
 73 | 
 74 |     def forward(self, x):
 75 | 
 76 |         z = self.conv_bn1(x)
 77 |         z = F.relu(z,inplace=True)
 78 |         z = self.conv_bn2(z)
 79 |         z = F.relu(z,inplace=True)
 80 |         z = self.conv_bn3(z)
 81 | 
 82 |         if self.is_downsample:
 83 |             z += self.downsample(x)
 84 |         else:
 85 |             z += x
 86 | 
 87 |         z = F.relu(z,inplace=True)
 88 |         return z
 89 | 
 90 | 
 91 | #resnet
 92 | def make_layer(in_planes, planes, out_planes, num_blocks, stride):
 93 |     layers = []
 94 |     layers.append(Bottleneck(in_planes, planes, out_planes, is_downsample=True, stride=stride))
 95 |     for i in range(1, num_blocks):
 96 |         layers.append(Bottleneck(out_planes, planes, out_planes))
 97 | 
 98 |     return nn.Sequential(*layers)
 99 | 
def make_layer0(in_channels, out_planes):
    """Build the network stem: 7x7 stride-2 ``ConvBn2d``, ReLU, 3x3 stride-2 max-pool."""
    stem_conv = ConvBn2d(in_channels, out_planes, kernel_size=7, stride=2, padding=3)
    activation = nn.ReLU(inplace=True)
    pool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
    return nn.Sequential(stem_conv, activation, pool)
107 | 
108 | 
109 | 
110 | ## resenet   ##
class ResNet101(nn.Module):
    """ResNet-101 classifier assembled from ``ConvBn2d``/``Bottleneck`` stages.

    ``layer0`` is the stem built by ``make_layer0``; ``layer1``..``layer4`` are
    bottleneck stages with 3, 4, 23 and 3 blocks built by ``make_layer``. The
    final feature map is globally average-pooled and passed to the linear
    layer ``fc``. ``forward`` returns raw logits.
    """

    # (fragment of this model's parameter name, fragment used in the checkpoint)
    # NOTE(review): the right-hand names look like torchvision's ResNet layout
    # (conv1/bn1, downsample.0/.1) -- confirm against the checkpoint in use.
    _PRETRAIN_KEY_MAP = (
        ('layer0.0.conv.',    'conv1.'),
        ('layer0.0.bn.',      'bn1.'),
        ('.conv_bn1.conv.',   '.conv1.'),
        ('.conv_bn1.bn.',     '.bn1.'),
        ('.conv_bn2.conv.',   '.conv2.'),
        ('.conv_bn2.bn.',     '.bn2.'),
        ('.conv_bn3.conv.',   '.conv3.'),
        ('.conv_bn3.bn.',     '.bn3.'),
        ('.downsample.conv.', '.downsample.0.'),
        ('.downsample.bn.',   '.downsample.1.'),
    )

    def load_pretrain_file(self, pretrain_file, skip=()):
        """Copy weights from a checkpoint file into this model.

        Args:
            pretrain_file: path handed to ``torch.load``; the loaded object
                must be a mapping from parameter name to tensor.
            skip: iterable of substrings. Every entry of this model's state
                dict whose name contains one of them keeps its current value
                (e.g. ``['fc.']`` when the class count differs).

        Raises:
            KeyError: a non-skipped entry has no counterpart in the checkpoint.
        """
        # Map every storage to CPU so a checkpoint saved from a GPU process
        # can be read on any machine; load_state_dict copies values into the
        # existing parameters wherever those live.
        pretrain_state_dict = torch.load(pretrain_file, map_location=lambda storage, loc: storage)
        state_dict = self.state_dict()

        for key in list(state_dict.keys()):
            if any(s in key for s in skip):
                continue

            # Translate this model's naming into the checkpoint's naming.
            pretrain_key = key
            for ours, theirs in self._PRETRAIN_KEY_MAP:
                if ours in key:
                    pretrain_key = key.replace(ours, theirs)

            if pretrain_key not in pretrain_state_dict:
                raise KeyError('%s (needed for %s) is missing from %s'
                               % (pretrain_key, key, pretrain_file))
            state_dict[key] = pretrain_state_dict[pretrain_key]

        self.load_state_dict(state_dict)

    def merge_bn(self):
        """Call ``merge_bn`` on every ``ConvBn2d`` submodule, printing each name."""
        print ('merging bn ....')

        # Snapshot the module list first: merging replaces submodules, and the
        # live named_modules() generator should not be walked while that happens.
        for name, m in list(self.named_modules()):
            if isinstance(m, ConvBn2d):
                print('\t%s'%name)
                m.merge_bn()
        print('')

    #-----------------------------------------------------------------------
    def __init__(self, in_shape=(3,180,180), num_classes=5270 ):
        """Build the network.

        Args:
            in_shape: ``(channels, height, width)``; only the channel count is
                used to size the stem.
            num_classes: width of the final linear layer.
        """
        super(ResNet101, self).__init__()
        in_channels, _height, _width = in_shape
        self.num_classes=num_classes

        self.layer0 = make_layer0(in_channels, 64)
        self.layer1 = make_layer(   64,  64,  256, num_blocks= 3, stride=1)  #out =  64*4 =  256
        self.layer2 = make_layer(  256, 128,  512, num_blocks= 4, stride=2)  #out = 128*4 =  512
        self.layer3 = make_layer(  512, 256, 1024, num_blocks=23, stride=2)  #out = 256*4 = 1024
        self.layer4 = make_layer( 1024, 512, 2048, num_blocks= 3, stride=2)  #out = 512*4 = 2048
        self.fc  = nn.Linear(2048, num_classes)
        # No explicit weight initialisation is applied here; every layer keeps
        # whatever its constructor produced.

    def forward(self, x):
        """Return the logits for the image batch ``x``."""
        x = self.layer0(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = F.adaptive_avg_pool2d(x, output_size=1)
        x = x.view(x.size(0), -1)
        x = self.fc (x)
        return x #logits
186 | 
187 | 
188 | 
189 | ########################################################################################################
190 | 
191 | # test some images
192 | #   https://github.com/soeaver/caffe-model/blob/master/cls/synset.txt
193 | #   https://github.com/ruotianluo/pytorch-resnet/blob/master/synset.py ()
194 | #
195 | #    (441)  810 n02823750 beer glass
196 | #    (  1)  449 n01443537 goldfish, Carassius auratus
197 | #    (  9)  384 n01518878 ostrich, Struthio camelus
198 | #    ( 22)  397 n01614925 bald eagle, American eagle, Haliaeetus leucocephalus
199 | #    (281)  173 n02123045 tabby, tabby cat
200 | 
201 | 
def run_check_net_imagenet():
    """Smoke-test ``ResNet101`` with pretrained weights on one sample image.

    Loads the checkpoint and image from hard-coded paths, applies the
    preprocessing below, runs the network on CPU in eval mode and prints the
    arg-max class index together with its softmax probability.

    Raises:
        FileNotFoundError: the sample image could not be read.
    """
    num_classes = 1000
    C,H,W = 3,224,224
    net = ResNet101(in_shape=(C,H,W), num_classes=num_classes)
    net.load_pretrain_file(
            '/root/share/data/models/reference/imagenet/resnet/resnet101-5d3b4d8f.pth',
            skip=[]
        )
    #net.cuda()
    net.eval()

    # Other samples in the same folder: goldfish.jpg, blad_eagle.jpg,
    # ostrich.jpg, tabby_cat.jpg, bullet_train.jpg
    image_file = '/root/share/data/imagenet/dummy/256x256/beer_glass.jpg'
    image = cv2.imread(image_file)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError('cannot read image: %s' % image_file)

    #pre process ----
    image = cv2.cvtColor(image,cv2.COLOR_BGR2RGB)
    image = cv2.resize(image,(W,H)).astype(np.float32)
    image = image.transpose((2,0,1))   # HWC -> CHW
    image = image/255

    # Per-channel normalisation constants (RGB order after the conversion above).
    mean = [0.485, 0.456, 0.406 ]
    std  = [0.229, 0.224, 0.225 ]
    for c in range(C):
        image[c] = (image[c] - mean[c]) / std[c]
    #pre process ----

    #run net
    logits = net( Variable(torch.from_numpy(image).unsqueeze(0).float() ) )
    probs  = F.softmax(logits,dim=1).data.numpy().reshape(-1)

    #check
    print('results ', np.argmax(probs), ' ', probs[np.argmax(probs)])
243 | 
244 | 
245 | 
def run_check_net():
    """Run one forward/backward pass of ``ResNet101`` on random data using CUDA.

    Loads pretrained weights (everything except ``fc.``) from a hard-coded
    path, evaluates a random batch in training mode, back-propagates a
    binary-cross-entropy loss against random targets and prints the model
    type and the softmax probabilities.
    """
    # https://discuss.pytorch.org/t/print-autograd-graph/692/8
    batch_size, num_classes = 1, 5270
    C,H,W = 3,180,180

    inputs = torch.randn(batch_size,C,H,W)
    labels = torch.randn(batch_size,num_classes)

    net = ResNet101(in_shape=inputs.size()[1:], num_classes=num_classes)
    net.load_pretrain_file(
            '/root/share/data/models/reference/imagenet/resnet/resnet101-5d3b4d8f.pth',
            skip=['fc.']
        )
    net.cuda()
    net.train()

    x = Variable(inputs).cuda()
    y = Variable(labels).cuda()
    logits = net.forward(x)
    probs  = F.softmax(logits,dim=1)

    F.binary_cross_entropy_with_logits(logits, y).backward()

    print(type(net))

    print('probs')
    print(probs)

    # To also exercise batch-norm merging: net.eval(); net.merge_bn()
282 | 
283 | 
284 | 
285 | ########################################################################################
if __name__ == '__main__':
    script_name = os.path.basename(__file__)
    print( '%s: calling main function ... ' % script_name)

    # run_check_net() is the alternative check; it moves the model to CUDA.
    run_check_net_imagenet()
292 | 
293 | 


--------------------------------------------------------------------------------
/src/heng_cherkeng/xception.py:
--------------------------------------------------------------------------------
  1 | # https://arxiv.org/pdf/1610.02357.pdf
  2 | 
  3 | 
  4 | # "Xception: Deep Learning with Depthwise Separable Convolutions" - Francois Chollet (Google, Inc), CVPR 2017
  5 | 
  6 | # separable conv pytorch
  7 | #  https://github.com/szagoruyko/pyinn
  8 | #  https://github.com/pytorch/pytorch/issues/1708
  9 | #  https://discuss.pytorch.org/t/separable-convolutions-in-pytorch/3407/2
 10 | #  https://discuss.pytorch.org/t/depthwise-and-separable-convolutions-in-pytorch/7315/3
 11 | 
 12 | import os
 13 | from torch.autograd import Variable
 14 | import torch
 15 | import torch.nn as nn
 16 | import torch.nn.functional as F
 17 | import pyinn as P
 18 | from pyinn.modules import Conv2dDepthwise
 19 | 
 20 | #----- helper functions ------------------------------
 21 | BN_EPS = 1e-4  #1e-4  #1e-5
 22 | 
 23 | class ConvBn2d(nn.Module):
 24 | 
 25 |     def merge_bn(self):
 26 |         #raise NotImplementedError
 27 |         assert(self.conv.bias==None)
 28 |         conv_weight     = self.conv.weight.data
 29 |         bn_weight       = self.bn.weight.data
 30 |         bn_bias         = self.bn.bias.data
 31 |         bn_running_mean = self.bn.running_mean
 32 |         bn_running_var  = self.bn.running_var
 33 |         bn_eps          = self.bn.eps
 34 | 
 35 |         #https://github.com/sanghoon/pva-faster-rcnn/issues/5
 36 |         #https://github.com/sanghoon/pva-faster-rcnn/commit/39570aab8c6513f0e76e5ab5dba8dfbf63e9c68c
 37 | 
 38 |         N,C,KH,KW = conv_weight.size()
 39 |         std = 1/(torch.sqrt(bn_running_var+bn_eps))
 40 |         std_bn_weight =(std*bn_weight).repeat(C*KH*KW,1).t().contiguous().view(N,C,KH,KW )
 41 |         conv_weight_hat = std_bn_weight*conv_weight
 42 |         conv_bias_hat   = (bn_bias - bn_weight*std*bn_running_mean)
 43 | 
 44 |         self.bn   = None
 45 |         self.conv = nn.Conv2d(in_channels=self.conv.in_channels, out_channels=self.conv.out_channels, kernel_size=self.conv.kernel_size,
 46 |                               padding=self.conv.padding, stride=self.conv.stride, dilation=self.conv.dilation, groups=self.conv.groups,
 47 |                               bias=True)
 48 |         self.conv.weight.data = conv_weight_hat #fill in
 49 |         self.conv.bias.data   = conv_bias_hat
 50 | 
 51 | 
 52 |     def __init__(self, in_channels, out_channels, kernel_size=3, padding=1, dilation=1, stride=1, groups=1, is_bn=True):
 53 |         super(ConvBn2d, self).__init__()
 54 |         self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, padding=padding, stride=stride, dilation=dilation, groups=groups, bias=False)
 55 |         self.bn   = nn.BatchNorm2d(out_channels, eps=BN_EPS)
 56 | 
 57 |         if is_bn is False:
 58 |             self.bn =None
 59 | 
 60 |     def forward(self,x):
 61 |         x = self.conv(x)
 62 |         if self.bn is not None:
 63 |             x = self.bn(x)
 64 |         return x
 65 | 
 66 | 
 67 | # ----
 68 | class SeparableConvBn2d(nn.Module):
 69 | 
 70 |     def __init__(self, in_channels, out_channels, kernel_size=3, padding=1, stride=1, is_bn=True):
 71 |         super(SeparableConvBn2d, self).__init__()
 72 | 
 73 |         #self.conv1 = nn.Conv2d(in_channels, in_channels, kernel_size=kernel_size, padding=padding, stride=stride, groups=in_channels, bias=False)  #depth_wise
 74 |         #self.conv2 = nn.Conv2d(in_channels, out_channels, kernel_size=1, padding=0, stride=1, bias=False) #point_wise
 75 | 
 76 |         self.conv1 = Conv2dDepthwise(in_channels,  kernel_size=kernel_size, padding=padding, stride=stride, bias=False)
 77 |         self.conv2 = nn.Conv2d(in_channels, out_channels, kernel_size=1, padding=0, stride=1, bias=False)
 78 |         self.bn    = nn.BatchNorm2d(out_channels, eps=BN_EPS)
 79 | 
 80 | 
 81 |     def forward(self,x):
 82 |         x = self.conv1(x)
 83 |         x = self.conv2(x)
 84 |         x = self.bn(x)
 85 |         return x
 86 | 
 87 | #
 88 | class SBlock(nn.Module):
 89 | 
 90 |     def __init__(self, in_channels, channels, out_channels, is_first_relu=True):
 91 |         super(SBlock, self).__init__()
 92 |         self.is_first_relu=is_first_relu
 93 | 
 94 |         self.downsample = ConvBn2d(in_channels, out_channels, kernel_size=1, padding=0, stride=2)
 95 |         self.conv1 = SeparableConvBn2d(in_channels,     channels, kernel_size=3, padding=1, stride=1)
 96 |         self.conv2 = SeparableConvBn2d(   channels, out_channels, kernel_size=3, padding=1, stride=1)
 97 | 
 98 |     def forward(self,x):
 99 |         residual = self.downsample(x)
100 |         if self.is_first_relu:
101 |             x = F.relu(x,inplace=False)
102 |         x = self.conv1(x)
103 |         x = F.relu(x,inplace=True)
104 |         x = self.conv2(x)
105 |         x = F.max_pool2d(x, kernel_size=3, padding=1, stride=2)
106 |         x = x + residual
107 | 
108 |         return x
109 | 
110 | 
111 | 
class XBlock(nn.Module):
    """Identity-shortcut residual block of three ReLU + separable-conv stages.

    All three ``SeparableConvBn2d`` layers map ``in_channels`` to
    ``in_channels`` with a 3x3 kernel, so input and output shapes match.
    """

    def __init__(self, in_channels):
        super(XBlock, self).__init__()

        self.conv1 = SeparableConvBn2d(in_channels, in_channels, kernel_size=3, padding=1, stride=1)
        self.conv2 = SeparableConvBn2d(in_channels, in_channels, kernel_size=3, padding=1, stride=1)
        self.conv3 = SeparableConvBn2d(in_channels, in_channels, kernel_size=3, padding=1, stride=1)

    def forward(self,x):
        """Return ``x + conv3(relu(conv2(relu(conv1(relu(x))))))``."""
        residual = x
        # The first ReLU must be out-of-place: ``residual`` is the same tensor
        # as ``x``, so an in-place ReLU would overwrite the skip connection
        # (and the caller's tensor) with relu(x).
        # NOTE(review): checkpoints trained while this ReLU was in-place saw
        # relu(x) on the skip path -- re-validate them after this change.
        x = F.relu(x,inplace=False)
        x = self.conv1(x)
        x = F.relu(x,inplace=True)
        x = self.conv2(x)
        x = F.relu(x,inplace=True)
        x = self.conv3(x)
        x = x + residual

        return x
133 | 
134 | 
135 | 
class EBlock(nn.Module):
    """Two separable convolutions, each followed by a ReLU (no shortcut).

    Args:
        in_channels: channels of the block input.
        channels: channels after the first separable convolution.
        out_channels: channels of the block output.
    """

    def __init__(self, in_channels, channels, out_channels):
        super().__init__()

        self.conv1 = SeparableConvBn2d(in_channels, channels, kernel_size=3, padding=1, stride=1)
        self.conv2 = SeparableConvBn2d(channels, out_channels, kernel_size=3, padding=1, stride=1)

    def forward(self,x):
        """Return ``relu(conv2(relu(conv1(x))))``."""
        hidden = F.relu(self.conv1(x),inplace=True)
        return F.relu(self.conv2(hidden),inplace=True)
153 | 
154 | 
class Xception(nn.Module):
    """Xception-style classifier built from the blocks defined in this file.

    Layout: ``entry0`` (two ``ConvBn2d`` + ReLU), three ``SBlock`` entry
    stages, eight ``XBlock`` middle stages at 728 channels, an ``SBlock`` and
    an ``EBlock`` exit stage, global average pooling, dropout (p=0.2) and the
    linear layer ``fc``. ``forward`` returns raw logits.
    """

    def load_pretrain_pytorch_file(self, pytorch_file, skip=()):
        """Copy weights from a checkpoint whose keys match this model's keys.

        Args:
            pytorch_file: path handed to ``torch.load``; tensors are mapped to
                CPU storage while loading.
            skip: iterable of substrings. Every entry of this model's state
                dict whose name contains one of them keeps its current value.

        Raises:
            KeyError: a non-skipped entry is absent from the checkpoint.
        """
        pytorch_state_dict = torch.load(pytorch_file,map_location=lambda storage, loc: storage)
        state_dict = self.state_dict()

        for key in list(state_dict.keys()):
            if any(s in key for s in skip):
                continue
            if key not in pytorch_state_dict:
                raise KeyError('%s is missing from %s' % (key, pytorch_file))
            state_dict[key] = pytorch_state_dict[key]

        self.load_state_dict(state_dict)

    #-----------------------------------------------------------------------

    def __init__(self, in_shape=(3,128,128), num_classes=5000 ):
        """Build the network.

        Args:
            in_shape: ``(channels, height, width)``; only the channel count is
                used to size the first convolution.
            num_classes: width of the final linear layer.
        """
        super(Xception, self).__init__()
        in_channels, _height, _width = in_shape
        self.num_classes = num_classes

        self.entry0  = nn.Sequential(
            ConvBn2d(in_channels, 32, kernel_size=3, stride=2, padding=1),
            nn.ReLU(inplace=True),
            ConvBn2d(32, 64, kernel_size=3, stride=1, padding=0),
            nn.ReLU(inplace=True),
        )
        self.entry1  = SBlock( 64,128,128,is_first_relu=False)
        self.entry2  = SBlock(128,256,256)
        self.entry3  = SBlock(256,728,728)

        self.middle1 = XBlock(728)
        self.middle2 = XBlock(728)
        self.middle3 = XBlock(728)
        self.middle4 = XBlock(728)
        self.middle5 = XBlock(728)
        self.middle6 = XBlock(728)
        self.middle7 = XBlock(728)
        self.middle8 = XBlock(728)

        self.exit1 = SBlock( 728, 728,1024)
        self.exit2 = EBlock(1024,1536,2048)
        self.fc = nn.Linear(2048, num_classes)

    def forward(self,x):
        """Return the logits for the image batch ``x``."""
        stages = (
            self.entry0, self.entry1, self.entry2, self.entry3,
            self.middle1, self.middle2, self.middle3, self.middle4,
            self.middle5, self.middle6, self.middle7, self.middle8,
            self.exit1, self.exit2,
        )
        for stage in stages:
            x = stage(x)

        x = F.adaptive_avg_pool2d(x, output_size=1)
        x = x.view(x.size(0), -1)
        # Dropout is active only while the module is in training mode.
        x = F.dropout(x, training=self.training, p=0.2)
        x = self.fc (x)
        return x #logits
221 | 
222 | 
223 | ########################################################################################################
224 | 
225 | 
def run_check_net():
    """Run one forward/backward pass of ``Xception`` on random data using CUDA.

    Loads converted weights (everything except ``fc.weight``/``fc.bias``) from
    a hard-coded path, evaluates a random batch in training mode,
    back-propagates a binary-cross-entropy loss against random targets and
    prints the model and the softmax probabilities.
    """
    # https://discuss.pytorch.org/t/print-autograd-graph/692/8
    batch_size  = 1
    num_classes = 5000
    C,H,W = 3,180,180

    inputs = torch.randn(batch_size,C,H,W)
    labels = torch.randn(batch_size,num_classes)
    in_shape = inputs.size()[1:]

    net = Xception(in_shape=in_shape, num_classes=num_classes)
    net.load_pretrain_pytorch_file(
            '/root/share/data/models/reference/imagenet/xception/caffe-model/inception/xception/xception.keras.convert.pth',
            skip=['fc.weight', 'fc.bias']
        )
    net.cuda().train()

    x = Variable(inputs).cuda()
    y = Variable(labels).cuda()
    logits = net.forward(x)
    # Name the class axis explicitly; relying on the implicit softmax
    # dimension is deprecated.
    probs  = F.softmax(logits, dim=1)

    loss = F.binary_cross_entropy_with_logits(logits, y)
    loss.backward()

    print(type(net))
    print(net)

    print('probs')
    print(probs)

    # To also exercise batch-norm merging: net.eval(); net.merge_bn()
262 | 
263 | 
264 | ########################################################################################
if __name__ == '__main__':
    script_name = os.path.basename(__file__)
    print( '%s: calling main function ... ' % script_name)

    # Smoke-test the network definition.
    run_check_net()
269 | 
270 | 


--------------------------------------------------------------------------------
/src/model/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NighTurs/kaggle-cdiscount-image-classification/3646ee4dc7a0e35dfe6fb4cdaadcf2fb7b30d3a5/src/model/__init__.py


--------------------------------------------------------------------------------
/src/model/bcolz_iterator.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import bcolz
 3 | import threading
 4 | from keras.preprocessing.image import Iterator
 5 | 
 6 | CHUNK_SIZE = 100000
 7 | 
 8 | 
 9 | class BcolzIterator():
10 |     def __init__(self, bcolz_root, x_idxs, side_input=None, y=None, num_classes=None, batch_size=32, shuffle=True, seed=None):
11 |         self.x = bcolz.open(bcolz_root)
12 |         self.side_x = side_input
13 |         self.x_idxs = x_idxs
14 |         self.y = y
15 |         self.num_classes = num_classes
16 |         self.samples = len(self.x_idxs)
17 |         self.batch_size = batch_size
18 |         assert CHUNK_SIZE % batch_size == 0
19 |         self.shuffle = shuffle
20 |         if seed:
21 |             np.random.seed(seed)
22 |         self.chunk_idx = -1
23 |         self.next_idx = -1
24 |         self.thread = threading.Thread(target=self.preload, args=(0,))
25 |         self.thread.start()
26 |         self.get_chunk()
27 | 
28 |     def preload(self, idx):
29 |         idxs = self.x_idxs[(CHUNK_SIZE * idx):(CHUNK_SIZE * idx + CHUNK_SIZE)]
30 |         self.preload_x = self.x[idxs]
31 |         if self.side_x is not None:
32 |             self.preload_side_x = self.side_x[idxs]
33 | 
34 |     def get_chunk(self):
35 |         self.chunk_idx += 1
36 |         if CHUNK_SIZE * self.chunk_idx >= len(self.x_idxs):
37 |             self.chunk_idx = 0
38 |         self.next_idx = self.chunk_idx + 1
39 |         if CHUNK_SIZE * self.next_idx >= len(self.x_idxs):
40 |             self.next_idx = 0
41 |         idxs = self.x_idxs[(CHUNK_SIZE * self.chunk_idx):(CHUNK_SIZE * self.chunk_idx + CHUNK_SIZE)]
42 | 
43 |         self.thread.join()
44 |         self.chunk_x = self.preload_x
45 |         if self.side_x is not None:
46 |             self.chunk_side_x = self.preload_side_x
47 |         self.thread = threading.Thread(target=self.preload, args=(self.next_idx,))
48 |         self.thread.start()
49 | 
50 |         if self.y is not None:
51 |             self.chunk_y = self.y[(CHUNK_SIZE * self.chunk_idx):(CHUNK_SIZE * self.chunk_idx + CHUNK_SIZE)]
52 |         self.chunk_seen = 0
53 |         self.it = Iterator(len(idxs), self.batch_size, self.shuffle, None)
54 | 
55 |     def next(self):
56 |         if self.chunk_x.shape[0] <= self.chunk_seen:
57 |             self.get_chunk()
58 |         index_array = next(self.it.index_generator)
59 |         if self.side_x is not None:
60 |             out_x = [self.chunk_x[index_array[0]], self.chunk_side_x[index_array[0]]]
61 |         else:
62 |             out_x = self.chunk_x[index_array[0]]
63 |         if self.y is not None:
64 |             out = out_x, self.chunk_y[index_array[0]]
65 |         else:
66 |             out = out_x
67 |         self.chunk_seen += len(index_array[0])
68 |         return out
69 | 
70 |     def __iter__(self):
71 |         return self
72 | 
73 |     def __next__(self, *args, **kwargs):
74 |         return self.next(*args, **kwargs)
75 | 


--------------------------------------------------------------------------------
/src/model/bcolz_to_memmap.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import bcolz
 3 | import numpy as np
 4 | from tqdm import tqdm
 5 | 
 6 | if __name__ == '__main__':
 7 |     parser = argparse.ArgumentParser()
 8 |     parser.add_argument('--bcolz_path', required=True, help='Path to bcolz with vectors')
 9 |     parser.add_argument('--memmap_path', required=True, help="Write memmap to path")
10 | 
11 |     args = parser.parse_args()
12 | 
13 |     bcolz_path = args.bcolz_path
14 |     memmap_path = args.memmap_path
15 | 
16 |     a = bcolz.open(bcolz_path)
17 |     b = np.memmap(memmap_path, dtype='float32', mode='w+', shape=a.shape)
18 | 
19 |     with tqdm(total=a.shape[0]) as pbar:
20 |         batch_size = 100000
21 |         for i in range(0, a.shape[0], batch_size):
22 |             chunk = a[i:(i + batch_size)]
23 |             b[i:(i + batch_size)] = chunk
24 |             b.flush()
25 |             pbar.update(chunk.shape[0])


--------------------------------------------------------------------------------
/src/model/bson_iterator.py:
--------------------------------------------------------------------------------
 1 | import io
 2 | import numpy as np
 3 | import bson
 4 | from keras.preprocessing.image import Iterator
 5 | from keras.preprocessing.image import load_img, img_to_array
 6 | from keras import backend as K
 7 | 
 8 | 
 9 | class BSONIterator(Iterator):
10 |     def __init__(self, bson_file, images_df, num_class,
11 |                  image_data_generator, lock, target_size=(180, 180),
12 |                  with_labels=True, batch_size=32, shuffle=False, seed=None):
13 | 
14 |         self.file = bson_file
15 |         self.images_df = images_df
16 |         self.with_labels = with_labels
17 |         self.samples = len(images_df)
18 |         self.num_class = num_class
19 |         self.image_data_generator = image_data_generator
20 |         self.target_size = tuple(target_size)
21 |         self.image_shape = (3,) + self.target_size
22 | 
23 |         super(BSONIterator, self).__init__(self.samples, batch_size, shuffle, seed)
24 |         self.lock = lock
25 | 
26 |     def _get_batches_of_transformed_samples(self, index_array):
27 |         batch_x = np.zeros((len(index_array),) + self.image_shape, dtype=K.floatx())
28 |         if self.with_labels:
29 |             batch_y = np.zeros((len(batch_x), self.num_class), dtype=K.floatx())
30 | 
31 |         for i, j in enumerate(index_array):
32 |             # Protect file and dataframe access with a lock.
33 |             with self.lock:
34 |                 image_row = self.images_df.iloc[j]
35 | 
36 |                 # Read this product's data from the BSON file.
37 |                 self.file.seek(image_row["offset"])
38 |                 item_data = self.file.read(image_row["length"])
39 | 
40 |             # Grab the image from the product.
41 |             item = bson.BSON.decode(item_data)
42 |             img_idx = image_row["img_idx"]
43 |             bson_img = item["imgs"][img_idx]["picture"]
44 | 
45 |             # Load the image.
46 |             img = load_img(io.BytesIO(bson_img), target_size=self.target_size)
47 | 
48 |             # Preprocess the image.
49 |             x = img_to_array(img)
50 |             x = self.image_data_generator.random_transform(x)
51 |             x = self.image_data_generator.standardize(x)
52 | 
53 |             # Add the image and the label to the batch (one-hot encoded).
54 |             batch_x[i] = x
55 |             if self.with_labels:
56 |                 batch_y[i, image_row["category_idx"]] = 1
57 | 
58 |         if self.with_labels:
59 |             return batch_x, batch_y
60 |         else:
61 |             return batch_x
62 | 
63 |     def next(self):
64 |         with self.lock:
65 |             index_array = next(self.index_generator)
66 |         return self._get_batches_of_transformed_samples(index_array[0])
67 | 


--------------------------------------------------------------------------------
/src/model/combine_iterator.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | 
 3 | 
 4 | class CombineIterator():
 5 |     def __init__(self, first_iterator, second_iterator):
 6 |         self.first_iterator = first_iterator
 7 |         self.second_iterator = second_iterator
 8 |         self.batch_size = first_iterator.batch_size + second_iterator.batch_size
 9 |         self.samples = first_iterator.samples + second_iterator.samples
10 | 
11 |     def next(self):
12 |         first_out = self.first_iterator.next()
13 |         second_out = self.second_iterator.next()
14 |         if type(first_out[0]) is list:
15 |             x = [np.concatenate((x1, x2)) for x1, x2 in zip(first_out[0], second_out[0])]
16 |         else:
17 |             x = np.concatenate((first_out[0], second_out[0]))
18 |         y = np.concatenate((first_out[1], second_out[1]))
19 |         return x, y
20 | 
21 |     def __iter__(self):
22 |         return self
23 | 
24 |     def __next__(self, *args, **kwargs):
25 |         return self.next(*args, **kwargs)
26 | 


--------------------------------------------------------------------------------
/src/model/ensemble_fixed_weights.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | import os
 4 | import argparse
 5 | import itertools
 6 | from tqdm import tqdm
 7 | from collections import namedtuple
 8 | 
 9 | TOP_PREDS = 10
10 | PREDICTIONS_FILE = 'predictions.csv'
11 | 
12 | if __name__ == '__main__':
13 |     parser = argparse.ArgumentParser()
14 |     parser.add_argument('--preds_csvs', nargs='+', required=True, help='Files with predictions of test dataset')
15 |     parser.add_argument('--weights', nargs='+', type=float, required=True, help='Weight of each model')
16 |     parser.add_argument('--model_dir', required=True, help='Model directory')
17 |     args = parser.parse_args()
18 | 
19 |     if len(args.preds_csvs) != len(args.weights):
20 |         raise ValueError('Count of weights should much count of csvs')
21 | 
22 |     preds_csvs = args.preds_csvs
23 |     weights = args.weights
24 |     model_dir = args.model_dir
25 | 
26 |     all_preds = []
27 |     for i, csv in enumerate(preds_csvs):
28 |         preds = pd.read_csv(csv, dtype={'product_id': np.int32,
29 |                                         'img_idx': np.int8,
30 |                                         'category_idx': np.int16,
31 |                                         'prob': np.float32})
32 |         preds['prob'] = preds['prob'] * weights[i]
33 |         preds.sort_values(['product_id', 'img_idx'], inplace=True)
34 |         all_preds.append(preds)
35 | 
36 |     prev_img = (0, 0)
37 |     prev_cat = 0
38 |     sum_prob = 0
39 |     sum_all_probs = 0
40 |     cat_prob = []
41 | 
42 |     if not os.path.isdir(model_dir):
43 |         os.mkdir(model_dir)
44 | 
45 |     # Can't concatenate and sort all preds simultaneously because of memory problems
46 |     def preds_gen(all_preds):
47 |         iters = [pred.itertuples() for pred in all_preds]
48 |         while True:
49 |             rows = []
50 |             for iter in iters:
51 |                 for i in range(TOP_PREDS):
52 |                     rows.append(next(iter))
53 |             rows.sort(key=lambda x: x.category_idx)
54 |             for row in rows:
55 |                 yield row
56 | 
57 |     with tqdm(total=sum([preds.shape[0] for preds in all_preds])) as pbar, \
58 |             open(os.path.join(model_dir, PREDICTIONS_FILE), 'w') as out:
59 |         out.write('product_id,img_idx,category_idx,prob\n')
60 |         for row in itertools.chain(preds_gen(all_preds),
61 |                                    [namedtuple('Pandas', ['product_id', 'img_idx', 'category_idx', 'prob'])(0, 0, 0,
62 |                                                                                                             0)]):
63 |             product_id = row.product_id
64 |             img_idx = row.img_idx
65 |             category_idx = row.category_idx
66 |             prob = row.prob
67 |             if prev_img == (product_id, img_idx):
68 |                 if prev_cat == category_idx:
69 |                     sum_prob += prob
70 |                 else:
71 |                     cat_prob.append((prev_cat, sum_prob))
72 |                     sum_all_probs += sum_prob
73 |                     prev_cat = category_idx
74 |                     sum_prob = prob
75 |             else:
76 |                 cat_prob.append((prev_cat, sum_prob))
77 |                 sum_all_probs += sum_prob
78 |                 cat_prob = sorted(cat_prob, key=lambda x: x[1], reverse=True)[:TOP_PREDS]
79 |                 if prev_img != (0, 0):
80 |                     for cat in cat_prob:
81 |                         out.write('{},{},{},{}\n'.format(prev_img[0], prev_img[1], cat[0], cat[1] / sum_all_probs))
82 |                 prev_img = (product_id, img_idx)
83 |                 prev_cat = category_idx
84 |                 sum_prob = prob
85 |                 sum_all_probs = 0
86 |                 cat_prob = []
87 |             pbar.update()


--------------------------------------------------------------------------------
/src/model/form_submission.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import argparse
 3 | import numpy as np
 4 | from ..data.category_idx import index_to_category_dict
 5 | 
 6 | 
 7 | def max_prob_category(df):
 8 |     i = df.prob.argmax()
 9 |     return df.category_idx.loc[i]
10 | 
11 | 
def form_submission(preds, category_idx):
    """Build a submission series with one category per product.

    *preds* is sorted in place by descending ``prob``; the first (most
    confident) row seen for each ``product_id`` decides its category. Category
    indices are translated through ``index_to_category_dict(category_idx)``.

    Returns a Series named ``category_id`` whose index is named ``_id``.
    """
    preds.sort_values('prob', ascending=False, inplace=True)
    seen_products = set()
    idx_to_category = index_to_category_dict(category_idx)
    product_ids, category_ids = [], []
    for product_id, cat_idx in zip(preds.product_id, preds.category_idx):
        if product_id not in seen_products:
            seen_products.add(product_id)
            product_ids.append(product_id)
            category_ids.append(idx_to_category[cat_idx])
    result = pd.Series(category_ids, index=product_ids)
    result.rename('category_id', inplace=True)
    result.index.rename(name='_id', inplace=True)
    return result
28 | 
29 | 
if __name__ == '__main__':
    # Command-line entry point: read predictions and the category mapping,
    # build the submission and write it out as csv.
    parser = argparse.ArgumentParser()
    parser.add_argument('--preds_csv', required=True, help='File with predictions')
    parser.add_argument('--category_idx_csv', required=True, help='Path to categories to index mapping csv')
    parser.add_argument('--output_file', required=True, help='File to save submission into')
    args = parser.parse_args()

    # Compact dtypes keep the prediction table small in memory.
    column_types = {
        'category_idx': np.int16,
        'prob': np.float32,
        'product_id': np.int32,
        'img_idx': np.int8,
    }
    preds = pd.read_csv(args.preds_csv, dtype=column_types)
    category_idx = pd.read_csv(args.category_idx_csv)
    submission = form_submission(preds, category_idx)
    submission.to_csv(args.output_file, header=True)
42 | 


--------------------------------------------------------------------------------
/src/model/form_submission_mul.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import argparse
 3 | import numpy as np
 4 | import itertools
 5 | from tqdm import tqdm
 6 | from collections import namedtuple
 7 | from ..data.category_idx import index_to_category_dict
 8 | 
# Stand-in probability used by form_submission: when two categories of the
# same product were accumulated over a different number of rows, the shorter
# product is multiplied by this value once per missing row.
DEFAULT_PROB = 0.001
10 | 
def max_prob_category(df):
    """Return the ``category_idx`` of the row in *df* with the largest ``prob``.

    ``idxmax`` is used because the result is passed to the label-based ``.loc``
    indexer: it always returns an index label, whereas ``Series.argmax``
    returns a position in current pandas releases and therefore breaks on any
    frame whose index is not 0..n-1.
    """
    return df.category_idx.loc[df.prob.idxmax()]
14 | 
15 | 
def form_submission(preds, category_idx):
    """Pick one category per product by multiplying per-row probabilities.

    *preds* is sorted in place (descending ``product_id``, then
    ``category_idx``) so that all rows of one (product, category) pair are
    adjacent. For every pair the ``prob`` values are multiplied together; pairs
    backed by fewer rows than the best pair seen so far for the product are
    padded with ``DEFAULT_PROB`` (and vice versa) before being compared. The
    category with the largest padded product wins.

    Category indices are translated through
    ``index_to_category_dict(category_idx)``.

    Returns a Series named ``category_id`` whose index is named ``_id``.
    """
    preds.sort_values(['product_id', 'category_idx'], ascending=False, inplace=True)
    d = index_to_category_dict(category_idx)
    # (product_id, category_idx) of the run being accumulated; (0, 0) doubles
    # as the "nothing accumulated yet" marker.
    cur = (0, 0)
    acc = 1  # running product of probs for the current run
    imgs = 0  # number of rows multiplied into acc
    max_acc = 0  # best product seen so far for the current product
    max_imgs = 1  # number of factors max_acc is currently expressed in
    max_cat = 0  # category_idx that produced max_acc
    products = []
    category_id = []
    with tqdm(total=preds.shape[0]) as pbar:
        # The extra all-zero row forces the last real run to be flushed.
        for row in itertools.chain(preds.itertuples(),
                                   [namedtuple('Pandas', ['product_id', 'img_idx', 'category_idx', 'prob'])(0, 0, 0, 0)]):
            if cur == (row.product_id, row.category_idx):
                acc *= row.prob
                imgs += 1
            else:
                # Bring both products to the same number of factors so they
                # are comparable: pad whichever side has fewer rows.
                while imgs > max_imgs:
                    max_imgs += 1
                    max_acc *= DEFAULT_PROB
                while max_imgs > imgs:
                    imgs += 1
                    acc *= DEFAULT_PROB
                max_imgs = imgs
                # Strict comparison: on a tie the earlier run is kept.
                if max_acc < acc:
                    max_acc = acc
                    max_cat = cur[1]
                if row.product_id != cur[0]:
                    # Product finished: emit its winner unless this is the
                    # initial (0, 0) placeholder.
                    if cur != (0, 0):
                        products.append(cur[0])
                        category_id.append(d[max_cat])
                    max_acc = 0
                    max_imgs = 1
                cur = (row.product_id, row.category_idx)
                acc = row.prob
                imgs = 1
            pbar.update(1)
    submission = pd.Series(category_id, index=products)
    submission.rename('category_id', inplace=True)
    submission.index.rename(name='_id', inplace=True)
    return submission
58 | 
59 | 
if __name__ == '__main__':
    # Script entry point: predictions csv + category mapping csv -> submission csv.
    parser = argparse.ArgumentParser()
    parser.add_argument('--preds_csv', required=True, help='File with predictions')
    parser.add_argument('--category_idx_csv', required=True, help='Path to categories to index mapping csv')
    parser.add_argument('--output_file', required=True, help='File to save submission into')
    args = parser.parse_args()

    preds = pd.read_csv(
        args.preds_csv,
        dtype={
            'category_idx': np.int16,
            'prob': np.float32,
            'product_id': np.int32,
            'img_idx': np.int8,
        },
    )
    category_idx = pd.read_csv(args.category_idx_csv)

    submission = form_submission(preds, category_idx)
    submission.to_csv(args.output_file, header=True)
72 | 


--------------------------------------------------------------------------------
/src/model/form_submission_sum.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import argparse
 3 | import numpy as np
 4 | import itertools
 5 | from tqdm import tqdm
 6 | from collections import namedtuple
 7 | from ..data.category_idx import index_to_category_dict
 8 | 
# NOTE(review): not referenced by any code in this module; presumably kept
# for parity with form_submission_mul.py - confirm before removing.
DEFAULT_PROB = 0.001
10 | 
11 | 
def max_prob_category(df):
    """Return the ``category_idx`` of the row in *df* with the largest ``prob``.

    ``idxmax`` is used because the result is passed to the label-based ``.loc``
    indexer: it always returns an index label, whereas ``Series.argmax``
    returns a position in current pandas releases and therefore breaks on any
    frame whose index is not 0..n-1.
    """
    return df.category_idx.loc[df.prob.idxmax()]
15 | 
16 | 
def form_submission(preds, category_idx):
    """Pick one category per product by summing per-row probabilities.

    *preds* is sorted in place (descending ``product_id``, then
    ``category_idx``) so rows of one (product, category) pair are adjacent.
    The ``prob`` values of each pair are added up and the category with the
    strictly largest sum is chosen for the product. Category indices are
    translated through ``index_to_category_dict(category_idx)``.

    Returns a Series named ``category_id`` whose index is named ``_id``.
    """
    preds.sort_values(['product_id', 'category_idx'], ascending=False, inplace=True)
    idx_to_category = index_to_category_dict(category_idx)

    # All-zero trailing row that forces the last real group to be flushed.
    row_type = namedtuple('Pandas', ['product_id', 'img_idx', 'category_idx', 'prob'])
    flush_row = row_type(0, 0, 0, 0)

    group = (0, 0)  # (product_id, category_idx) being summed; (0, 0) = none yet
    group_sum = 0
    best_sum = 0
    best_cat = 0
    product_ids = []
    category_ids = []
    with tqdm(total=preds.shape[0]) as pbar:
        for row in itertools.chain(preds.itertuples(), [flush_row]):
            key = (row.product_id, row.category_idx)
            if key != group:
                # Group finished: keep it if it beats the best one so far.
                if group_sum > best_sum:
                    best_sum = group_sum
                    best_cat = group[1]
                if key[0] != group[0]:
                    # Product finished: emit its winner (skip the placeholder).
                    if group != (0, 0):
                        product_ids.append(group[0])
                        category_ids.append(idx_to_category[best_cat])
                    best_sum = 0
                group = key
                group_sum = row.prob
            else:
                group_sum += row.prob
            pbar.update(1)

    result = pd.Series(category_ids, index=product_ids)
    result.rename('category_id', inplace=True)
    result.index.rename(name='_id', inplace=True)
    return result
48 | 
49 | 
if __name__ == '__main__':
    # Script entry point: turn a predictions csv into a submission csv.
    parser = argparse.ArgumentParser()
    parser.add_argument('--preds_csv', required=True, help='File with predictions')
    parser.add_argument('--category_idx_csv', required=True, help='Path to categories to index mapping csv')
    parser.add_argument('--output_file', required=True, help='File to save submission into')
    args = parser.parse_args()

    preds_dtypes = dict(category_idx=np.int16, prob=np.float32, product_id=np.int32, img_idx=np.int8)
    preds = pd.read_csv(args.preds_csv, dtype=preds_dtypes)
    category_idx = pd.read_csv(args.category_idx_csv)
    submission = form_submission(preds, category_idx)
    submission.to_csv(args.output_file, header=True)
62 | 


--------------------------------------------------------------------------------
/src/model/heng_models.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import numpy as np
  3 | import pandas as pd
  4 | import torch
  5 | import cv2
  6 | import os
  7 | import bson
  8 | import itertools
  9 | from tqdm import tqdm
 10 | from torch.autograd import Variable
 11 | import torch.nn.functional as F
 12 | from src.heng_cherkeng.inception_v3 import Inception3
 13 | from src.heng_cherkeng.excited_inception_v3 import SEInception3
 14 | from src.heng_cherkeng.xception import Xception
 15 | from src.heng_cherkeng.resnet101 import ResNet101
 16 | from src.data.category_idx import category_to_index_dict
 17 | 
# Passed as num_classes to every network constructed in model_predict.
CDISCOUNT_NUM_CLASSES = 5270
# Image height/width in pixels: used for the networks' in_shape, the batch
# buffer in model_predict and the test-time-augmentation crop/rotation.
CDISCOUNT_HEIGHT = 180
CDISCOUNT_WIDTH = 180
 21 | 
 22 | 
 23 | def read_label_to_category_id(file):
 24 |     with open(file, 'r') as file:
 25 |         d = eval(file.read())
 26 |     return d
 27 | 
 28 | 
 29 | def read_train_ids(file):
 30 |     with open(file, 'r') as file:
 31 |         lines = file.readlines()
 32 |     return {int(line) for line in lines}
 33 | 
 34 | 
 35 | def pytorch_image_to_tensor_transform(image):
 36 |     mean = [0.485, 0.456, 0.406]
 37 |     std = [0.229, 0.224, 0.225]
 38 |     image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
 39 |     image = image.transpose((2, 0, 1))
 40 |     tensor = torch.from_numpy(image).float().div(255)
 41 |     tensor[0] = (tensor[0] - mean[0]) / std[0]
 42 |     tensor[1] = (tensor[1] - mean[1]) / std[1]
 43 |     tensor[2] = (tensor[2] - mean[2]) / std[2]
 44 |     return tensor
 45 | 
 46 | 
 47 | def image_to_tensor_transform(image):
 48 |     tensor = pytorch_image_to_tensor_transform(image)
 49 |     tensor[0] = tensor[0] * (0.229 / 0.5) + (0.485 - 0.5) / 0.5
 50 |     tensor[1] = tensor[1] * (0.224 / 0.5) + (0.456 - 0.5) / 0.5
 51 |     tensor[2] = tensor[2] * (0.225 / 0.5) + (0.406 - 0.5) / 0.5
 52 |     return tensor
 53 | 
 54 | 
 55 | def doit(net, vecs, ids, dfs, label_to_category_id, category_dict, top_k=10, single_prediction=False):
 56 |     x = Variable(vecs, volatile=True).cuda()
 57 |     logits = net(x)
 58 |     preds = F.softmax(logits)
 59 |     preds = preds.cpu().data.numpy()
 60 | 
 61 |     if single_prediction:
 62 |         product_start = 0
 63 |         prev_product_id = 0
 64 |         chunk = []
 65 |         for i, tuple in enumerate(itertools.chain(ids, [(1, 0)])):
 66 |             if prev_product_id != 0 and prev_product_id != tuple[0]:
 67 |                 prods = preds[product_start:i].prod(axis=-2)
 68 |                 prods = prods / prods.sum()
 69 |                 top_k_preds = np.argpartition(prods, -top_k)[-top_k:]
 70 |                 for pred_idx in range(top_k):
 71 |                     chunk.append((prev_product_id, 0, category_dict[label_to_category_id[top_k_preds[pred_idx]]],
 72 |                                   prods[top_k_preds[pred_idx]]))
 73 |                 product_start = i
 74 |             prev_product_id = tuple[0]
 75 |     else:
 76 |         top_k_preds = np.argpartition(preds, -top_k)[:, -top_k:]
 77 |         chunk = []
 78 |         for i in range(len(ids)):
 79 |             product_id = ids[i][0]
 80 |             img_idx = ids[i][1]
 81 |             for pred_idx in range(top_k):
 82 |                 chunk.append(
 83 |                     (product_id, img_idx, category_dict[label_to_category_id[top_k_preds[i, pred_idx]]],
 84 |                      preds[i, top_k_preds[i, pred_idx]]))
 85 |     chunk_df = pd.DataFrame(chunk, columns=['product_id', 'img_idx', 'category_idx', 'prob'])
 86 |     dfs.append(chunk_df)
 87 | 
 88 | 
 89 | def model_predict(bson_file, model_name, model_dir, label_to_category_id_file, batch_size, category_idx, is_pred_valid,
 90 |                   train_ids_file, single_prediction=False, test_time_augmentation=False, tta_seed=123, crop_range=20,
 91 |                   rotation_max=0):
 92 |     category_dict = category_to_index_dict(category_idx)
 93 | 
 94 |     if model_name == 'inception':
 95 |         net = Inception3(in_shape=(3, CDISCOUNT_HEIGHT, CDISCOUNT_WIDTH), num_classes=CDISCOUNT_NUM_CLASSES)
 96 |     elif model_name == 'seinception':
 97 |         net = SEInception3(in_shape=(3, CDISCOUNT_HEIGHT, CDISCOUNT_WIDTH), num_classes=CDISCOUNT_NUM_CLASSES)
 98 |     elif model_name == 'xception':
 99 |         net = Xception(in_shape=(3, CDISCOUNT_HEIGHT, CDISCOUNT_WIDTH), num_classes=CDISCOUNT_NUM_CLASSES)
100 |     elif model_name == 'resnet101':
101 |         net = ResNet101(in_shape=(3, CDISCOUNT_HEIGHT, CDISCOUNT_WIDTH), num_classes=CDISCOUNT_NUM_CLASSES)
102 |     else:
103 |         raise ValueError('Unknown model name ' + model_name)
104 | 
105 |     net.load_state_dict(torch.load(os.path.join(model_dir, 'model.pth')))
106 |     net.cuda().eval()
107 | 
108 |     label_to_category_id = read_label_to_category_id(label_to_category_id_file)
109 |     if is_pred_valid:
110 |         train_ids = read_train_ids(train_ids_file)
111 | 
112 |     bson_iter = bson.decode_file_iter(open(bson_file, 'rb'))
113 |     batch_size = batch_size
114 | 
115 |     rnd = np.random.RandomState(tta_seed)
116 | 
117 |     dfs = []
118 |     with tqdm() as pbar:
119 |         v = torch.from_numpy(np.zeros((batch_size + 3, 3, CDISCOUNT_HEIGHT, CDISCOUNT_WIDTH), dtype=np.float32))
120 |         ids = []
121 |         for d in bson_iter:
122 |             product_id = d['_id']
123 |             # noinspection PyUnboundLocalVariable
124 |             if is_pred_valid and product_id in train_ids:
125 |                 continue
126 |             for e, pic in enumerate(d['imgs']):
127 |                 image = cv2.imdecode(np.fromstring(pic['picture'], np.uint8), 1)
128 |                 if test_time_augmentation:
129 |                     if rotation_max != 0:
130 |                         angle = rnd.rand(1)[0] * rotation_max * 2 - rotation_max
131 |                         M = cv2.getRotationMatrix2D((CDISCOUNT_HEIGHT / 2, CDISCOUNT_WIDTH / 2), angle, 1)
132 |                         image = cv2.warpAffine(image, M, (CDISCOUNT_HEIGHT, CDISCOUNT_WIDTH),
133 |                                                borderMode=cv2.BORDER_REPLICATE)
134 |                     if crop_range != 0:
135 |                         image = cv2.resize(image, (CDISCOUNT_HEIGHT + crop_range, CDISCOUNT_WIDTH + crop_range))
136 |                         crop = rnd.randint(0, crop_range, 2)
137 |                         image = image[crop[0]:(crop[0] + CDISCOUNT_HEIGHT), crop[1]:(crop[1] + CDISCOUNT_WIDTH)]
138 |                 x = image_to_tensor_transform(image)
139 |                 v[len(ids)] = x
140 |                 ids.append((product_id, e))
141 |             if len(ids) >= batch_size:
142 |                 doit(net, v, ids, dfs, label_to_category_id, category_dict, single_prediction=single_prediction)
143 |                 pbar.update(len(ids))
144 |                 ids = []
145 |         if len(ids) > 0:
146 |             doit(net, v, ids, dfs, label_to_category_id, category_dict, single_prediction=single_prediction)
147 |             pbar.update(len(ids))
148 | 
149 |     return pd.concat(dfs)
150 | 
151 | 
if __name__ == '__main__':
    # Command-line entry point: predict with one network and store the
    # predictions as csv inside the model directory.
    parser = argparse.ArgumentParser()
    parser.add_argument('--bson', required=True, help='Path to bson with products')
    parser.add_argument('--model_name', required=True, help='Model name: inception or seinception')
    parser.add_argument('--model_dir', required=True, help='Output directory for vectors')
    parser.add_argument('--label_to_category_id_file', required=True, help='Hengs label to category mappings file')
    parser.add_argument('--batch_size', type=int, required=False, default=256, help='Batch size')
    parser.add_argument('--category_idx_csv', required=True, help='Path to categories to index mapping csv')
    parser.add_argument('--predict_valid', action='store_true', required=False, dest='is_predict_valid')
    parser.set_defaults(is_predict_valid=False)
    parser.add_argument('--train_ids_file', required=False, help='Path to Hengs with train ids')
    parser.add_argument('--single_prediction', action='store_true', required=False, dest='single_prediction')
    parser.set_defaults(single_prediction=False)
    parser.add_argument('--test_time_augmentation', action='store_true', required=False, dest='test_time_augmentation')
    parser.set_defaults(test_time_augmentation=False)
    parser.add_argument('--tta_seed', type=int, required=False, default=123)
    parser.add_argument('--csv_suffix', required=False, default='')
    parser.add_argument('--crop_range', type=int, required=False, default=20)
    parser.add_argument('--rotation_max', type=int, required=False, default=0)
    args = parser.parse_args()

    category_idx = pd.read_csv(args.category_idx_csv)
    preds = model_predict(
        args.bson,
        args.model_name,
        args.model_dir,
        args.label_to_category_id_file,
        args.batch_size,
        category_idx,
        args.is_predict_valid,
        args.train_ids_file,
        args.single_prediction,
        args.test_time_augmentation,
        args.tta_seed,
        args.crop_range,
        args.rotation_max,
    )

    # Output file name template, keyed by (validation split?, single prediction?).
    csv_templates = {
        (True, True): 'valid_single_predictions{}.csv',
        (True, False): 'valid_predictions{}.csv',
        (False, True): 'single_predictions{}.csv',
        (False, False): 'predictions{}.csv',
    }
    csv_name = csv_templates[(bool(args.is_predict_valid), bool(args.single_prediction))]
    csv_name = csv_name.format(args.csv_suffix)
    preds.to_csv(os.path.join(args.model_dir, csv_name), index=False)
192 | 


--------------------------------------------------------------------------------
/src/model/memmap_iterator.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from keras.preprocessing.image import Iterator
 3 | from queue import Queue
 4 | from threading import Thread
 5 | import time
 6 | 
 7 | 
 8 | class MemmapIterator():
 9 |     def __init__(self, memmap_path, memmap_shape, images_df, num_classes=None, batch_size=32, shuffle=True, seed=None,
10 |                  pool_wrokers=4, use_side_input=False):
11 |         if seed:
12 |             np.random.seed(seed)
13 |         self.x = np.memmap(memmap_path, dtype=np.float32, mode='r', shape=memmap_shape)
14 |         self.images_df = images_df
15 |         self.images_df_index = np.copy(self.images_df.index.values)
16 |         self.images_df_num_imgs = np.copy(self.images_df.num_imgs.as_matrix())
17 |         self.images_df_img_idx = np.copy(self.images_df.img_idx.as_matrix())
18 |         self.has_y = 'category_idx' in images_df.columns
19 |         if self.has_y:
20 |             self.images_df_category_idx = np.copy(self.images_df.category_idx.as_matrix())
21 |         del self.images_df
22 |         self.num_classes = num_classes
23 |         self.batch_size = batch_size
24 |         self.shuffle = shuffle
25 |         self.use_side_input = use_side_input
26 |         self.samples = len(self.images_df_index)
27 |         self.it = Iterator(self.samples, self.batch_size, self.shuffle, seed)
28 |         self.queue = Queue(maxsize=40)
29 |         self.stop_flag = False
30 |         self.threads = []
31 |         for i in range(pool_wrokers):
32 |             thread = Thread(target=self.read_batches)
33 |             thread.start()
34 |             self.threads.append(thread)
35 | 
36 |     def read_batches(self):
37 |         while True:
38 |             if self.stop_flag == True:
39 |                 return
40 |             with self.it.lock:
41 |                 index_array = next(self.it.index_generator)[0]
42 |             m1 = np.zeros((len(index_array), *self.x.shape[1:]), dtype=np.float32)
43 |             if self.use_side_input:
44 |                 m2 = np.zeros((len(index_array), 8), dtype=np.float32)
45 | 
46 |             if self.has_y:
47 |                 p = np.zeros(len(index_array), dtype=np.float32)
48 | 
49 |             for bi, i in enumerate(index_array):
50 |                 m1[bi] = self.x[self.images_df_index[i]]
51 |                 if self.use_side_input:
52 |                     m2[bi, self.images_df_num_imgs[i] - 1] = 1
53 |                     m2[bi, 4 + self.images_df_img_idx[i]] = 1
54 |                 if self.has_y:
55 |                     # noinspection PyUnboundLocalVariable
56 |                     p[bi] = self.images_df_category_idx[i]
57 |             if self.use_side_input:
58 |                 inputs = [m1, m2]
59 |             else:
60 |                 inputs = m1
61 | 
62 |             if self.has_y:
63 |                 self.queue.put((inputs, p))
64 |             else:
65 |                 self.queue.put(inputs)
66 | 
67 |     def next(self):
68 |         return self.queue.get()
69 | 
70 |     def terminate(self):
71 |         self.stop_flag = True
72 |         while True:
73 |             try:
74 |                 while True:
75 |                     self.queue.get(block=False)
76 |             except:
77 |                 pass
78 |             live_threads = 0
79 |             for thread in self.threads:
80 |                 live_threads += 1 if thread.is_alive() else 0
81 |             if live_threads == 0:
82 |                 return
83 |             print('Threads running ', live_threads)
84 |             for thread in self.threads:
85 |                 thread.join(timeout=5)
86 | 
87 |     def __iter__(self):
88 |         return self
89 | 
90 |     def __next__(self, *args, **kwargs):
91 |         return self.next(*args, **kwargs)
92 | 


--------------------------------------------------------------------------------
/src/model/multi_memmap_iterator.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import itertools
  3 | from collections import namedtuple
  4 | from keras.preprocessing.image import Iterator
  5 | from queue import Queue
  6 | from threading import Thread
  7 | 
  8 | class MultiMemmapIterator():
  9 |     def __init__(self, memmap_path, memmap_shape, images_df, num_classes=None, batch_size=32, shuffle=True, seed=None,
 10 |                  pool_wrokers=4, only_single=False, include_singles=True, max_images=2, use_side_input=True):
 11 |         if seed:
 12 |             np.random.seed(seed)
 13 |         self.x = np.memmap(memmap_path, dtype=np.float32, mode='r', shape=memmap_shape)
 14 |         self.images_df = images_df.sort_values('product_id')
 15 |         self.images_df_index = np.copy(self.images_df.index.values)
 16 |         self.images_df_num_imgs = np.copy(self.images_df.num_imgs.as_matrix())
 17 |         self.images_df_img_idx = np.copy(self.images_df.img_idx.as_matrix())
 18 |         self.has_y = 'category_idx' in images_df.columns
 19 |         if self.has_y:
 20 |             self.images_df_category_idx = np.copy(self.images_df.category_idx.as_matrix())
 21 |         self.num_classes = num_classes
 22 |         self.batch_size = batch_size
 23 |         self.shuffle = shuffle
 24 |         self.max_images = max_images
 25 |         self.use_side_input = use_side_input
 26 | 
 27 |         self.smpls = []
 28 |         cur_index = []
 29 |         prev_product_id = -1
 30 |         for i, row in enumerate(
 31 |                 itertools.chain(self.images_df.itertuples(), [namedtuple('Pandas', ['Index', 'product_id'])(0, 0)])):
 32 |             if prev_product_id != -1 and row.product_id != prev_product_id:
 33 |                 if include_singles or len(cur_index) == 1:
 34 |                     self.smpls.extend([[idx] for idx in cur_index])
 35 |                 if len(cur_index) > 1 and not only_single:
 36 |                     self.smpls.append(cur_index)
 37 |                 cur_index = []
 38 |             prev_product_id = row.product_id
 39 |             cur_index.append(i)
 40 |         del self.images_df
 41 | 
 42 |         self.samples = len(self.smpls)
 43 |         self.rnd = np.random.RandomState(seed)
 44 |         self.it = Iterator(self.samples, self.batch_size, self.shuffle, seed)
 45 |         self.queue = Queue(maxsize=40)
 46 |         self.stop_flag = False
 47 |         self.threads = []
 48 |         for i in range(pool_wrokers):
 49 |             thread = Thread(target=self.read_batches)
 50 |             thread.start()
 51 |             self.threads.append(thread)
 52 | 
 53 |     def read_batches(self):
 54 |         while True:
 55 |             if self.stop_flag == True:
 56 |                 return
 57 |             with self.it.lock:
 58 |                 index_array = next(self.it.index_generator)[0]
 59 |             m1 = np.zeros((len(index_array), self.max_images, *self.x.shape[1:]), dtype=np.float32)
 60 |             if self.use_side_input:
 61 |                 m2 = np.zeros((len(index_array), self.max_images, 8), dtype=np.float32)
 62 | 
 63 |             if self.has_y:
 64 |                 p = np.zeros(len(index_array), dtype=np.float32)
 65 | 
 66 |             bi = 0
 67 |             for smpl_idx in index_array:
 68 |                 smpl = self.smpls[smpl_idx]
 69 | 
 70 |                 for i in smpl:
 71 |                     cur_idx = 3 - self.images_df_img_idx[i]
 72 |                     m1[bi, cur_idx] = self.x[self.images_df_index[i]]
 73 |                     if self.use_side_input:
 74 |                         m2[bi, cur_idx, self.images_df_num_imgs[i] - 1] = 1
 75 |                         m2[bi, cur_idx, 4 + self.images_df_img_idx[i]] = 1
 76 | 
 77 |                 if self.has_y:
 78 |                     # noinspection PyUnboundLocalVariable
 79 |                     p[bi] = self.images_df_category_idx[smpl[0]]
 80 |                 bi += 1
 81 |             if self.use_side_input:
 82 |                 inputs = [m1, m2]
 83 |             else:
 84 |                 inputs = m1
 85 | 
 86 |             if self.has_y:
 87 |                 self.queue.put((inputs, p))
 88 |             else:
 89 |                 self.queue.put(inputs)
 90 | 
 91 |     def next(self):
 92 |         return self.queue.get()
 93 | 
 94 |     def terminate(self):
 95 |         self.stop_flag = True
 96 |         while True:
 97 |             try:
 98 |                 while True:
 99 |                     self.queue.get(block=False)
100 |             except:
101 |                 pass
102 |             live_threads = 0
103 |             for thread in self.threads:
104 |                 live_threads += 1 if thread.is_alive() else 0
105 |             if live_threads == 0:
106 |                 return
107 |             print('Threads running ', live_threads)
108 |             for thread in self.threads:
109 |                 thread.join(timeout=5)
110 | 
111 |     def __iter__(self):
112 |         return self
113 | 
114 |     def __next__(self, *args, **kwargs):
115 |         return self.next(*args, **kwargs)
116 | 


--------------------------------------------------------------------------------
/src/model/predict_ensemble_nn.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import argparse
 3 | import pandas as pd
 4 | from tqdm import tqdm
 5 | from keras.models import load_model
 6 | 
# File name of the trained ensemble model inside --model_dir.
MODEL_FILE = 'model.h5'
# Number of rows kept per (product_id, img_idx) in the output; also a factor
# of the csv chunk size.
TOP_PREDS = 10
# Second factor of the csv chunk size: each read takes TOP_PREDS * PRODS_BATCH rows.
PRODS_BATCH = 100000
# Rows with category_idx below this value are scaled by the 'embedding_1'
# weight, all others by the 'embedding_2' weight.
CATEGORIES_SPLIT = 2000
# Name of the csv written into --model_dir.
PREDICTIONS_FILE = 'predictions.csv'
12 | 
if __name__ == '__main__':
    # Command-line entry point: blend several prediction csvs using the
    # per-file weights stored in the trained model's two embedding layers.
    parser = argparse.ArgumentParser()
    parser.add_argument('--preds_csvs', nargs='+', required=True, help='Files with predictions of test dataset')
    parser.add_argument('--model_dir', required=True, help='Model directory')
    parser.add_argument('--total_records', type=int, default=30950800,
                        help='Total number of records in prediction files')
    args = parser.parse_args()

    model = load_model(os.path.join(args.model_dir, MODEL_FILE))
    weights_left = model.get_layer('embedding_1').get_weights()
    weights_right = model.get_layer('embedding_2').get_weights()

    chunk_rows = TOP_PREDS * PRODS_BATCH
    image_key = ['product_id', 'img_idx']
    all_columns = image_key + ['category_idx', 'prob']

    whole = []
    with tqdm(total=args.total_records) as pbar:
        # Row 0 of each csv is the header, hence the offsets start at 1.
        for skiprows in range(1, args.total_records + 1, chunk_rows):
            scaled = []
            for i, csv in enumerate(args.preds_csvs):
                weight_left = weights_left[0][i][0]
                weight_right = weights_right[0][i][0]
                preds = pd.read_csv(csv, skiprows=skiprows, nrows=chunk_rows, names=all_columns)
                preds.fillna(0, inplace=True)
                is_left = preds.category_idx < CATEGORIES_SPLIT
                is_right = preds.category_idx >= CATEGORIES_SPLIT
                preds.loc[is_left, 'prob'] = preds.loc[is_left, 'prob'] * weight_left
                preds.loc[is_right, 'prob'] = preds.loc[is_right, 'prob'] * weight_right
                scaled.append(preds)

            # Add up the weighted probabilities of identical (image, category) rows.
            merged = pd.concat(scaled)
            merged = merged.groupby(image_key + ['category_idx'], as_index=False).sum()

            # Renormalise so that the probabilities of each image sum to one.
            totals = merged[image_key + ['prob']].groupby(image_key, as_index=False).sum()
            totals = totals.rename(columns={'prob': 'prob_sum'})
            merged = merged.merge(totals, on=image_key, how='left')
            merged['prob'] = merged['prob'] / merged['prob_sum']
            merged = merged[all_columns]

            # Keep the TOP_PREDS most probable categories per image.
            merged.sort_values('prob', inplace=True, ascending=False)
            whole.append(merged.groupby(image_key, as_index=False).head(TOP_PREDS))

            pbar.update(chunk_rows)

    pd.concat(whole).to_csv(os.path.join(args.model_dir, PREDICTIONS_FILE), index=False)
57 | 


--------------------------------------------------------------------------------
/src/model/pseudo_label_prod_info.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import argparse
 3 | from src.data.category_idx import index_to_category_dict
 4 | 
 5 | 
 6 | def pick_top_category(preds, category_map):
 7 |     ordered_preds = preds.sort_values('prob', ascending=False)
 8 |     ordered_preds.fillna(0, inplace=True)
 9 |     taken_products = set()
10 |     tuples = []
11 |     for row in ordered_preds.itertuples():
12 |         if row.product_id in taken_products:
13 |             continue
14 |         taken_products.add(row.product_id)
15 |         tuples.append((row.product_id, category_map[row.category_idx]))
16 |     return pd.DataFrame(tuples, columns=['product_id', 'category_id'], dtype='int64').sort_values('product_id')
17 | 
18 | 
def create_pl_prod_infos(train_prod_info_csv, test_prod_info_csv, valid_preds_csv, test_preds_csv, pl_train_prod_info,
                         pl_test_prod_info, category_idx_csv):
    """Write pseudo-labelled copies of the train and test product info files.

    Test products get a ``category_id`` column holding their top predicted
    category. Train products that appear in the validation predictions have
    their ``category_id`` overwritten with the top predicted category.

    :param train_prod_info_csv: path to the train product info csv.
    :param test_prod_info_csv: path to the test product info csv.
    :param valid_preds_csv: path to the validation split predictions csv.
    :param test_preds_csv: path to the test predictions csv.
    :param pl_train_prod_info: output path for the pseudo-labelled train product info.
    :param pl_test_prod_info: output path for the pseudo-labelled test product info.
    :param category_idx_csv: path to the category-to-index mapping csv.
    """
    train_prod_info = pd.read_csv(train_prod_info_csv)
    test_prod_info = pd.read_csv(test_prod_info_csv)
    valid_preds = pd.read_csv(valid_preds_csv)
    test_preds = pd.read_csv(test_preds_csv)
    category_idx = pd.read_csv(category_idx_csv)

    category_map = index_to_category_dict(category_idx)

    test_preds = pick_top_category(test_preds, category_map)
    test_prod_info = test_prod_info.merge(test_preds, on='product_id', how='left')
    test_prod_info.to_csv(pl_test_prod_info, index=False)

    valid_preds = pick_top_category(valid_preds, category_map)
    # Align labels by product_id rather than by position: a positional assignment
    # silently mislabels rows when the train csv is not ordered by product_id and
    # fails when some predicted products are missing from the train csv.
    pseudo_labels = valid_preds.set_index('product_id')['category_id']
    relabel = train_prod_info.product_id.isin(pseudo_labels.index)
    train_prod_info.loc[relabel, 'category_id'] = \
        train_prod_info.loc[relabel, 'product_id'].map(pseudo_labels).values
    train_prod_info.to_csv(pl_train_prod_info, index=False)
37 | 
38 | 
if __name__ == '__main__':
    # Every option is a mandatory path; (flag, help) pairs are listed in CLI order.
    required_paths = (
        ('--train_prod_info', 'Train product info'),
        ('--test_prod_info', 'Test product info'),
        ('--valid_preds', 'Valid split predictions'),
        ('--test_preds', 'Test predictions'),
        ('--pl_train_prod_info', 'Pseudo labeling train product info output file'),
        ('--pl_test_prod_info', 'Pseudo labeling test product info output file'),
        ('--category_idx_csv', 'Path to categories to index mapping csv'),
    )
    parser = argparse.ArgumentParser()
    for flag, description in required_paths:
        parser.add_argument(flag, required=True, help=description)

    args = parser.parse_args()
    create_pl_prod_infos(train_prod_info_csv=args.train_prod_info,
                         test_prod_info_csv=args.test_prod_info,
                         valid_preds_csv=args.valid_preds,
                         test_preds_csv=args.test_preds,
                         pl_train_prod_info=args.pl_train_prod_info,
                         pl_test_prod_info=args.pl_test_prod_info,
                         category_idx_csv=args.category_idx_csv)
52 | 


--------------------------------------------------------------------------------
/src/model/resnet50_vecs.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | import argparse
 4 | import os
 5 | import bcolz
 6 | from tqdm import tqdm
 7 | from .bson_iterator import BSONIterator
 8 | from keras.applications.resnet50 import ResNet50
 9 | from keras.applications.resnet50 import preprocess_input
10 | from keras.preprocessing.image import ImageDataGenerator
11 | from keras.layers import Flatten
12 | from keras.models import Model
13 | import threading
14 | 
15 | 
def compute_vgg16_vecs(bson_path, images_df, vecs_output_dir, save_step=100000):
    """Compute flattened ResNet50 (no top) outputs for images and store them with bcolz.

    Despite its name, the network instantiated here is ResNet50. Images are
    processed ``save_step`` rows at a time and every chunk is flushed to
    ``vecs_output_dir``. If that directory already exists, the stored array is
    opened and processing resumes after the rows it already holds.

    :param bson_path: path to the bson file the images are read from.
    :param images_df: DataFrame describing the images, one row per image.
    :param vecs_output_dir: bcolz root directory for the resulting vectors.
    :param save_step: number of images processed between flushes to disk.
    """
    resnet_model = ResNet50(include_top=False, input_shape=(3, 197, 197))
    # Build the feature extractor once; creating it inside the loop only kept
    # adding Flatten layers and new Model wrappers for every chunk.
    model = Model(resnet_model.input, Flatten()(resnet_model.output))

    if os.path.isdir(vecs_output_dir):
        vecs = bcolz.open(rootdir=vecs_output_dir)
        offset = vecs.shape[0]
    else:
        vecs = None
        offset = 0

    lock = threading.Lock()
    total_images = images_df.shape[0]

    with open(bson_path, "rb") as train_bson_file, \
            tqdm(total=total_images, initial=offset) as pbar:
        for i in range(offset, total_images, save_step):
            chunk_df = images_df[i:(i + save_step)]
            gen = ImageDataGenerator(preprocessing_function=preprocess_input)
            batches = BSONIterator(bson_file=train_bson_file,
                                   images_df=chunk_df,
                                   num_class=0,  # doesn't matter here
                                   image_data_generator=gen,
                                   lock=lock,
                                   target_size=(197, 197),
                                   batch_size=220,
                                   shuffle=False,
                                   with_labels=False)

            out_vecs = model.predict_generator(batches,
                                               steps=batches.samples / batches.batch_size,
                                               verbose=1)
            # Explicit None check: "no array yet" must not depend on container truthiness.
            if vecs is None:
                vecs = bcolz.carray(out_vecs, rootdir=vecs_output_dir, mode='w')
            else:
                vecs.append(out_vecs)
            vecs.flush()
            # The last chunk may be shorter than save_step.
            pbar.update(chunk_df.shape[0])
54 | 
55 | 
def create_images_df(product_info, only_first_image=False):
    """Expand product rows into one row per image.

    :param product_info: DataFrame with ``product_id``, ``num_imgs``, ``offset``
        and ``length`` columns.
    :param only_first_image: when true, keep only rows with ``img_idx == 0``
        and renumber the index from 0.
    :return: DataFrame with ``product_id``, ``img_idx``, ``offset`` and ``length`` columns.
    """
    records = [[product.product_id, img_idx, product.offset, product.length]
               for product in product_info.itertuples()
               for img_idx in range(product.num_imgs)]
    frame = pd.DataFrame(records, columns=['product_id', 'img_idx', 'offset', 'length'])
    if not only_first_image:
        return frame
    return frame[frame.img_idx == 0].reset_index(drop=True)
67 | 
68 | 
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # Mandatory path options, declared in CLI order.
    for flag, description in (('--bson', 'Path to bson with products'),
                              ('--prod_info_csv', 'Path to prod info csv'),
                              ('--output_dir', 'Output directory for vectors')):
        parser.add_argument(flag, required=True, help=description)
    parser.add_argument('--save_step', type=int, required=True, help='Save computed vectors to disk each N steps')
    parser.add_argument('--only_first_image', dest='only_first_image', action='store_true',
                        help="Include only first image from each product")
    parser.add_argument('--shuffle', type=int, default=None, required=False,
                        help='If products should be shuffled, provide seed')
    parser.set_defaults(only_first_image=False)
    args = parser.parse_args()

    images = create_images_df(pd.read_csv(args.prod_info_csv), args.only_first_image)
    # Truthiness test: an omitted seed and a seed of 0 both leave the rows unshuffled.
    if args.shuffle:
        np.random.seed(args.shuffle)
        images = images.reindex(np.random.permutation(images.shape[0]))

    compute_vgg16_vecs(args.bson, images, args.output_dir, args.save_step)
91 | 


--------------------------------------------------------------------------------
/src/model/sngl_preds_to_avg.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import argparse
 3 | import numpy as np
 4 | import itertools
 5 | from tqdm import tqdm
 6 | from collections import namedtuple
 7 | 
# Maximum number of categories emitted per product by sngl_preds_to_avg.
TOP_K = 10
 9 | 
def sngl_preds_to_avg(preds):
    """Combine per-image predictions into per-product predictions.

    For every product the probabilities of each category are summed over all
    rows, the sums are divided by the product's total, and the ``TOP_K``
    largest categories are returned. Ties keep ascending ``category_idx`` order.

    The previous sentinel-based loop treated ``(0, 0)`` as "no group yet" and
    a ``product_id`` of 0 as the end marker, so a real product with id 0 was
    corrupted or dropped. Grouping explicitly removes those special cases.

    :param preds: DataFrame with ``product_id``, ``category_idx`` and ``prob``
        columns. It is sorted in place by ``product_id`` and ``category_idx``.
    :return: DataFrame with ``product_id``, ``img_idx`` (always 0),
        ``category_idx`` and ``prob`` columns.
    """
    preds.sort_values(['product_id', 'category_idx'], inplace=True)
    chunks = []

    with tqdm(total=preds.shape[0]) as pbar:
        rows = zip(preds['product_id'], preds['category_idx'], preds['prob'])
        # Rows are sorted, so consecutive grouping yields one group per product
        # and, inside it, one group per category.
        for product_id, product_rows in itertools.groupby(rows, key=lambda r: r[0]):
            prod_cats = []
            for category_idx, category_rows in itertools.groupby(product_rows, key=lambda r: r[1]):
                probs = [r[2] for r in category_rows]
                prod_cats.append((category_idx, sum(probs)))
                pbar.update(len(probs))

            prod_cats.sort(key=lambda x: x[1], reverse=True)
            s = sum(x[1] for x in prod_cats)
            for category_idx, total in prod_cats[:TOP_K]:
                # A zero total means every probability is 0; keep it instead of dividing by zero.
                chunks.append((product_id, 0, category_idx, total / s if s else total))

    return pd.DataFrame(chunks, columns=['product_id', 'img_idx', 'category_idx', 'prob'])
37 | 
38 | 
if __name__ == '__main__':
    cli = argparse.ArgumentParser()
    for flag, description in (('--preds_csv', 'File with predictions'),
                              ('--output_file', 'File to save submission into')):
        cli.add_argument(flag, required=True, help=description)
    args = cli.parse_args()

    # Compact dtypes for the prediction columns read from the csv.
    column_types = {
        'category_idx': np.int16,
        'prob': np.float32,
        'product_id': np.int32,
        'img_idx': np.int8,
    }
    averaged = sngl_preds_to_avg(pd.read_csv(args.preds_csv, dtype=column_types))
    averaged.to_csv(args.output_file, header=True)


--------------------------------------------------------------------------------
/src/model/train_ensemble_nn.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import gc
  3 | import argparse
  4 | import pandas as pd
  5 | import numpy as np
  6 | import keras.backend as K
  7 | from keras.preprocessing.image import Iterator
  8 | from src.data.category_idx import map_categories
  9 | from keras.layers.embeddings import Embedding
 10 | from keras.layers import Flatten
 11 | from keras.layers import Input
 12 | from keras.layers import merge
 13 | from keras.models import Model
 14 | from keras.initializers import Ones
 15 | from keras.optimizers import Adam
 16 | from keras.models import load_model
 17 | from keras.constraints import non_neg
 18 | 
# Size of the category axis of the ensemble input and output.
N_CATEGORIES = 5270
# Category indexes below this value use the first weight set, the rest use the second.
CATEGORIES_SPLIT = 2000
# File name of the saved Keras model inside the model directory.
MODEL_FILE = 'model.h5'
# File name of the validation predictions written by predict_valid.
VALID_PREDICTIONS_FILE = 'valid_predictions.csv'
# Number of highest-scoring categories written per (product_id, img_idx).
TOP_K = 10
 24 | 
 25 | 
 26 | class SpecialIterator(Iterator):
 27 |     def __init__(self, images, categories, n_models, batch_size=32, shuffle=True, seed=None):
 28 |         self.x = images
 29 |         self.products = images[['product_id', 'img_idx']].drop_duplicates().sort_values(['product_id', 'img_idx'])
 30 |         self.categories = categories.sort_index()
 31 |         self.num_classes = N_CATEGORIES
 32 |         self.samples = self.products.shape[0]
 33 |         self.n_models = n_models
 34 |         super(SpecialIterator, self).__init__(self.samples, batch_size, shuffle, seed)
 35 | 
 36 |     def next(self):
 37 |         index_array = next(self.index_generator)[0]
 38 |         prods = self.products.iloc[index_array]
 39 |         pd = {(row.product_id, row.img_idx): i for i, row in enumerate(prods.itertuples())}
 40 |         cats = self.categories.loc[prods.product_id]
 41 |         images = prods.merge(self.x, on=['product_id', 'img_idx'], how='left')
 42 |         p = np.zeros((len(index_array), self.num_classes, self.n_models), dtype=np.float32)
 43 |         for row in images.itertuples():
 44 |             p[pd[(row.product_id, row.img_idx)], row.category_idx, row.model] = 0 if np.isnan(row.prob) else row.prob
 45 | 
 46 |         return [np.repeat(np.arange(self.n_models).reshape(1, self.n_models), len(index_array), axis=0),
 47 |                 p[:, :CATEGORIES_SPLIT, :], p[:, CATEGORIES_SPLIT:, :]], cats['category_idx'].as_matrix()
 48 | 
 49 | 
def train_ensemble_nn(preds_csv_files, prod_info_csv, category_idx_csv, model_dir, lr, seed, batch_size, epochs):
    """Train per-model ensemble weights and save the model to ``model_dir``.

    Two non-negative weight sets are learned, one per side of the
    ``CATEGORIES_SPLIT`` boundary. The network output is the weighted sum of
    the models' probabilities, normalized over categories.

    :param preds_csv_files: prediction csv files, one per model; the position
        in the list becomes the model index.
    :param prod_info_csv: path to the product info csv with true categories.
    :param category_idx_csv: path to the category-to-index mapping csv.
    :param model_dir: directory the model is loaded from (if present) and saved to.
    :param lr: Adam learning rate.
    :param seed: seed for the batch iterator and for numpy.
    :param batch_size: number of samples per batch.
    :param epochs: number of training epochs.
    """
    prod_info = pd.read_csv(prod_info_csv)
    category_idx = pd.read_csv(category_idx_csv)

    # Tag every prediction row with the index of the model it came from.
    all_preds = []
    model_inx = {}
    for i, csv in enumerate(preds_csv_files):
        preds = pd.read_csv(csv)
        preds['model'] = i
        model_inx[i] = csv
        all_preds.append(preds)
    print('Assigned indexes to models: ', model_inx)
    all_preds = pd.concat(all_preds)

    n_models = len(preds_csv_files)

    # True category index of every product that has predictions, indexed by product_id.
    categories = prod_info[prod_info.product_id.isin(all_preds.product_id.unique())][['product_id', 'category_id']]
    categories['category_idx'] = map_categories(category_idx, categories.category_id)
    categories = categories[['product_id', 'category_idx']]
    categories = categories.set_index('product_id')

    it = SpecialIterator(all_preds, categories, n_models, batch_size=batch_size, seed=seed, shuffle=True)

    model_file = os.path.join(model_dir, MODEL_FILE)
    if os.path.exists(model_file):
        # Continue training a previously saved model.
        model = load_model(model_file)
    else:
        # Model indexes 0..n_models-1, used to look up one weight per model.
        model_inp = Input(shape=(n_models,), dtype='int32')

        preds_cat1_inp = Input((CATEGORIES_SPLIT, n_models))
        preds_cat2_inp = Input((N_CATEGORIES - CATEGORIES_SPLIT, n_models))

        # One scalar weight per model, initialized to 1 and constrained to be non-negative.
        mul_cat1 = Embedding(n_models, 1, input_length=n_models, embeddings_initializer=Ones(),
                             embeddings_constraint=non_neg())(model_inp)
        mul_cat1 = Flatten()(mul_cat1)

        mul_cat2 = Embedding(n_models, 1, input_length=n_models, embeddings_initializer=Ones(),
                             embeddings_constraint=non_neg())(model_inp)
        mul_cat2 = Flatten()(mul_cat2)

        def op(x):
            # x = [left preds, left weights, right preds, right weights].
            # NOTE(review): dimshuffle is a Theano tensor method, so this presumably
            # requires the Theano backend - confirm.
            # Move categories to the front so the (batch, model) weights broadcast.
            z_left = x[0].dimshuffle(1, 0, 2) * x[1]
            z_right = x[2].dimshuffle(1, 0, 2) * x[3]
            z = K.concatenate([z_left, z_right], axis=0)
            # Sum over models, then normalize by the sum over categories.
            v = K.sum(z, axis=-1)
            p = K.sum(v, axis=-2)
            return (v / p).dimshuffle(1, 0)

        x = merge([preds_cat1_inp, mul_cat1, preds_cat2_inp, mul_cat2], mode=op, output_shape=(N_CATEGORIES,))

        model = Model([model_inp, preds_cat1_inp, preds_cat2_inp], x)
    np.random.seed(seed)
    model.compile(optimizer=Adam(lr=lr), loss='sparse_categorical_crossentropy',
                  metrics=['sparse_categorical_accuracy'])

    model.fit_generator(it, steps_per_epoch=it.samples / it.batch_size, epochs=epochs)

    # NOTE(review): 'embedding_1' / 'embedding_2' are automatically generated layer
    # names; this assumes the two Embedding layers above are the first ones
    # created in the process - confirm.
    print('First {} categories model weights:'.format(CATEGORIES_SPLIT))
    print(model.get_layer('embedding_1').get_weights())
    # The string below has no placeholder, so CATEGORIES_SPLIT is not printed here.
    print('Left categories model weights:'.format(CATEGORIES_SPLIT))
    print(model.get_layer('embedding_2').get_weights())

    if not os.path.isdir(model_dir):
        os.mkdir(model_dir)
    model.save(os.path.join(model_dir, MODEL_FILE))
115 | 
116 | 
def predict_valid(preds_csv_files, prod_info_csv, category_idx_csv, model_dir, batch_size):
    """Apply the saved ensemble model and write its top predictions to a csv.

    The output file ``VALID_PREDICTIONS_FILE`` is created inside ``model_dir``
    and holds ``TOP_K`` rows per ``(product_id, img_idx)`` pair.

    :param preds_csv_files: prediction csv files, one per model; the position
        in the list becomes the model index.
    :param prod_info_csv: path to the product info csv.
    :param category_idx_csv: path to the category-to-index mapping csv.
    :param model_dir: directory holding the saved model; also the output directory.
    :param batch_size: number of samples per prediction batch.
    :raises ValueError: if no saved model exists in ``model_dir``.
    """
    model_file = os.path.join(model_dir, MODEL_FILE)
    if os.path.exists(model_file):
        model = load_model(model_file)
    else:
        raise ValueError("Model doesn't exist")

    prod_info = pd.read_csv(prod_info_csv)
    category_idx = pd.read_csv(category_idx_csv)

    all_preds = []
    model_inx = {}
    for i, csv in enumerate(preds_csv_files):
        preds = pd.read_csv(csv)
        preds['model'] = i
        model_inx[i] = csv
        all_preds.append(preds)
    print('Assigned indexes to models: ', model_inx)
    all_preds = pd.concat(all_preds)
    all_preds.sort_values(['product_id', 'img_idx'], inplace=True)

    n_models = len(preds_csv_files)

    categories = prod_info[prod_info.product_id.isin(all_preds.product_id.unique())][['product_id', 'category_id']]
    del prod_info

    categories['category_idx'] = map_categories(category_idx, categories.category_id)
    categories = categories[['product_id', 'category_idx']]
    categories = categories.set_index('product_id')

    del category_idx
    # NOTE(review): chunk boundaries fall between samples only if every
    # (product_id, img_idx) pair has exactly n_models * TOP_K rows - confirm.
    chunk_size = 50000 * n_models * TOP_K
    # Use the model_dir parameter; the previous code read the global ``args``,
    # which only exists when this module is run as a script.
    with open(os.path.join(model_dir, VALID_PREDICTIONS_FILE), 'w') as f:
        f.write('product_id,img_idx,category_idx,prob\n')
        for start_i in range(0, all_preds.shape[0], chunk_size):
            end_i = min(all_preds.shape[0], start_i + chunk_size)
            it = SpecialIterator(all_preds[start_i:end_i], categories, n_models, batch_size=batch_size, shuffle=False)

            preds = model.predict_generator(it, it.samples / batch_size,
                                            verbose=1, max_queue_size=10)
            del it
            gc.collect()
            # Indexes of the TOP_K largest scores per row (not ordered among themselves).
            top_k_preds = np.argpartition(preds, -TOP_K)[:, -TOP_K:]
            products = all_preds[start_i:end_i][['product_id', 'img_idx']].drop_duplicates()
            for i, row in enumerate(products.itertuples()):
                for pred_idx in range(TOP_K):
                    f.write('{},{},{},{}\n'.format(row.product_id, row.img_idx, top_k_preds[i, pred_idx],
                                                   preds[i, top_k_preds[i, pred_idx]]))
            # Free the chunk's arrays before the next chunk is allocated.
            del top_k_preds
            del preds
            del products
            gc.collect()
            f.flush()
170 | 
171 | 
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--predict_valid', action='store_true', dest='is_predict_valid')
    parser.set_defaults(is_predict_valid=False)
    parser.add_argument('--preds_csvs', nargs='+', required=True, help='Files with predictions of valid split')
    # Mandatory single-value path options.
    for flag, description in (('--prod_info_csv', 'Path to prod info csv'),
                              ('--category_idx_csv', 'Path to categories to index mapping csv'),
                              ('--model_dir', 'Model directory')):
        parser.add_argument(flag, required=True, help=description)
    # Optional numeric hyper-parameters: (flag, type, default, help).
    for flag, value_type, default_value, description in (('--lr', float, 0.01, 'Learning rate'),
                                                         ('--seed', int, 456, 'Learning seed'),
                                                         ('--epochs', int, 1, 'Epochs'),
                                                         ('--batch_size', int, 2000, 'Batch size')):
        parser.add_argument(flag, type=value_type, default=default_value, required=False, help=description)

    args = parser.parse_args()
    if not args.is_predict_valid:
        train_ensemble_nn(preds_csv_files=args.preds_csvs,
                          prod_info_csv=args.prod_info_csv,
                          category_idx_csv=args.category_idx_csv,
                          model_dir=args.model_dir,
                          lr=args.lr,
                          seed=args.seed,
                          batch_size=args.batch_size,
                          epochs=args.epochs)
    else:
        predict_valid(preds_csv_files=args.preds_csvs,
                      prod_info_csv=args.prod_info_csv,
                      category_idx_csv=args.category_idx_csv,
                      model_dir=args.model_dir,
                      batch_size=args.batch_size)
192 | 


--------------------------------------------------------------------------------
/src/model/tune_avg_resnet50_vecs.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import argparse
  3 | import pandas as pd
  4 | import numpy as np
  5 | import keras.backend as K
  6 | from keras.models import Model
  7 | from keras.models import load_model
  8 | from keras.layers import Dense
  9 | from keras.layers import Input
 10 | from keras.layers import BatchNormalization
 11 | from keras.layers import TimeDistributed
 12 | from keras.layers import SimpleRNN
 13 | from keras.layers import GRU
 14 | from keras.layers import Lambda
 15 | from keras.layers import concatenate
 16 | from keras.optimizers import Adam
 17 | from keras.callbacks import ModelCheckpoint
 18 | from keras.callbacks import CSVLogger
 19 | from src.data.category_idx import map_categories
 20 | from src.model.multi_memmap_iterator import MultiMemmapIterator
 21 | from src.model.resnet50_vecs import create_images_df
 22 | 
# File name, inside the models directory, that fit_model and predict load the model from.
LOAD_MODEL = 'model.h5'
# File name the ModelCheckpoint callback writes snapshots to (same file as LOAD_MODEL).
SNAPSHOT_MODEL = 'model.h5'
# Training log file; CSVLogger appends to it and fit_model adds evaluation results.
LOG_FILE = 'training.log'
# Presumably the output file names for test and validation predictions - TODO confirm against the script entry point.
PREDICTIONS_FILE = 'single_predictions.csv'
VALID_PREDICTIONS_FILE = 'valid_single_predictions.csv'
# Approximate number of image rows handed to one prediction iterator in predict.
MAX_PREDICTIONS_AT_TIME = 50000
 29 | 
 30 | 
 31 | def train_data(memmap_path, memmap_len, bcolz_prod_info, sample_prod_info, train_split, category_idx, batch_size,
 32 |                shuffle=None, batch_seed=123, max_images=2, only_single=False, use_img_idx=False, include_singles=True):
 33 |     images_df = create_images_df(bcolz_prod_info, False)
 34 |     bcolz_prod_info['category_idx'] = map_categories(category_idx, bcolz_prod_info['category_id'])
 35 |     bcolz_prod_info = bcolz_prod_info.merge(train_split, on='product_id', how='left')
 36 |     images_df = images_df.merge(bcolz_prod_info, on='product_id', how='left')[
 37 |         ['product_id', 'category_idx', 'img_idx', 'num_imgs', 'train']]
 38 |     if shuffle:
 39 |         np.random.seed(shuffle)
 40 |         perm = np.random.permutation(images_df.shape[0])
 41 |         images_df = images_df.reindex(perm)
 42 |         images_df.reset_index(drop=True, inplace=True)
 43 |     images_df = images_df[images_df.product_id.isin(sample_prod_info.product_id)]
 44 |     train_df = images_df[images_df['train']]
 45 |     valid_df = images_df[~images_df['train']]
 46 |     num_classes = np.unique(images_df['category_idx']).size
 47 | 
 48 |     train_it = MultiMemmapIterator(memmap_path=memmap_path,
 49 |                                    memmap_shape=(memmap_len, 2048),
 50 |                                    images_df=train_df,
 51 |                                    num_classes=num_classes,
 52 |                                    seed=batch_seed,
 53 |                                    batch_size=batch_size,
 54 |                                    only_single=only_single,
 55 |                                    include_singles=include_singles,
 56 |                                    max_images=max_images,
 57 |                                    pool_wrokers=4,
 58 |                                    shuffle=True,
 59 |                                    use_side_input=use_img_idx)
 60 |     valid_mul_it = MultiMemmapIterator(memmap_path=memmap_path,
 61 |                                        memmap_shape=(memmap_len, 2048),
 62 |                                        images_df=valid_df,
 63 |                                        num_classes=num_classes,
 64 |                                        seed=batch_seed,
 65 |                                        batch_size=batch_size,
 66 |                                        shuffle=False,
 67 |                                        only_single=False,
 68 |                                        include_singles=False,
 69 |                                        max_images=4,
 70 |                                        pool_wrokers=1,
 71 |                                        use_side_input=use_img_idx)
 72 |     valid_sngl_it = MultiMemmapIterator(memmap_path=memmap_path,
 73 |                                         memmap_shape=(memmap_len, 2048),
 74 |                                         images_df=valid_df,
 75 |                                         num_classes=num_classes,
 76 |                                         seed=batch_seed,
 77 |                                         batch_size=batch_size,
 78 |                                         shuffle=False,
 79 |                                         only_single=True,
 80 |                                         include_singles=True,
 81 |                                         max_images=4,
 82 |                                         pool_wrokers=1,
 83 |                                         use_side_input=use_img_idx)
 84 |     return train_it, valid_mul_it, valid_sngl_it, num_classes
 85 | 
 86 | 
 87 | def fit_model(train_it, valid_mul_it, valid_sngl_it, num_classes, models_dir, lr=0.001, batch_size=64, epochs=1, mode=0,
 88 |               seed=125):
 89 |     model_file = os.path.join(models_dir, LOAD_MODEL)
 90 |     if os.path.exists(model_file):
 91 |         model = load_model(model_file)
 92 |     else:
 93 |         if mode == 0:
 94 |             inp1 = Input((None, 2048))
 95 |             inp2 = Input((None, 8))
 96 |             x = concatenate([inp1, inp2])
 97 |             x = TimeDistributed(Dense(4096, activation='relu'))(x)
 98 |             x = BatchNormalization(axis=-1)(x)
 99 |             x = TimeDistributed(Dense(4096, activation='relu'))(x)
100 |             x = BatchNormalization(axis=-1)(x)
101 |             x = Lambda(lambda x: K.sum(x, axis=-2), output_shape=(4096,))(x)
102 |             x = Dense(num_classes, activation='softmax')(x)
103 |             model = Model([inp1, inp2], x)
104 |         elif mode == 1:
105 |             inp1 = Input((None, 2048))
106 |             inp2 = Input((None, 8))
107 |             x = concatenate([inp1, inp2])
108 |             x = TimeDistributed(Dense(4096, activation='relu'))(x)
109 |             x = BatchNormalization(axis=-1)(x)
110 |             x = SimpleRNN(4096, activation='relu', recurrent_initializer='identity')(x)
111 |             x = BatchNormalization(axis=-1)(x)
112 |             x = Dense(num_classes, activation='softmax')(x)
113 |             model = Model([inp1, inp2], x)
114 |         elif mode == 2:
115 |             inp1 = Input((None, 2048))
116 |             inp2 = Input((None, 8))
117 |             x = concatenate([inp1, inp2])
118 |             x = TimeDistributed(Dense(4096, activation='relu'))(x)
119 |             x = BatchNormalization(axis=-1)(x)
120 |             x = TimeDistributed(Dense(4096, activation='relu'))(x)
121 |             x = BatchNormalization(axis=-1)(x)
122 |             x = GRU(100, activation='relu')(x)
123 |             x = BatchNormalization(axis=-1)(x)
124 |             x = Dense(num_classes, activation='softmax')(x)
125 |             model = Model([inp1, inp2], x)
126 |         elif mode == 3:
127 |             inp1 = Input((None, 2048))
128 |             inp1_com = Lambda(lambda x: K.max(x, axis=-2), output_shape=(2048,))(inp1)
129 |             x = Dense(4096, activation='relu')(inp1_com)
130 |             x = BatchNormalization(axis=-1)(x)
131 |             x = Dense(4096, activation='relu')(x)
132 |             x = BatchNormalization(axis=-1)(x)
133 |             x = Dense(num_classes, activation='softmax')(x)
134 |             model = Model(inp1, x)
135 | 
136 |     model.compile(optimizer=Adam(lr=lr), loss='sparse_categorical_crossentropy',
137 |                   metrics=['sparse_categorical_accuracy'])
138 | 
139 |     np.random.seed(seed)
140 |     checkpointer = ModelCheckpoint(filepath=os.path.join(models_dir, SNAPSHOT_MODEL))
141 |     csv_logger = CSVLogger(os.path.join(models_dir, LOG_FILE), append=True)
142 |     model.fit_generator(train_it,
143 |                         steps_per_epoch=train_it.samples / batch_size,
144 |                         validation_data=valid_mul_it,
145 |                         validation_steps=valid_mul_it.samples / batch_size,
146 |                         epochs=epochs,
147 |                         callbacks=[checkpointer, csv_logger],
148 |                         max_queue_size=10,
149 |                         use_multiprocessing=False)
150 | 
151 |     with open(os.path.join(models_dir, LOG_FILE), "a") as file:
152 |         file.write('Multi {}\n'.format(model.evaluate_generator(valid_mul_it, steps=valid_mul_it.samples / batch_size)))
153 |         file.write(
154 |             'Single {}\n'.format(model.evaluate_generator(valid_sngl_it, steps=valid_sngl_it.samples / batch_size)))
155 | 
156 | 
def predict(memmap_path, memmap_len, prod_info, sample_prod_info, models_dir, batch_size=200,
            shuffle=None, top_k=10, use_img_idx=True):
    """Predict the ``top_k`` categories of every product with the saved model.

    Image rows are processed in slices of roughly ``MAX_PREDICTIONS_AT_TIME``
    rows; a slice is extended so that it never ends in the middle of a product.

    :param memmap_path: path to the memmap with image vectors.
    :param memmap_len: number of rows in the memmap (2048 values per row).
    :param prod_info: product info the vectors were generated from.
    :param sample_prod_info: optional product info restricting the products
        to predict; ``None`` keeps all products.
    :param models_dir: directory holding ``LOAD_MODEL``.
    :param batch_size: prediction batch size.
    :param shuffle: seed for permuting image rows; a falsy value disables it.
    :param top_k: number of categories reported per product.
    :param use_img_idx: passed to the iterator as ``use_side_input``.
    :return: DataFrame with ``product_id``, ``img_idx`` (always 0),
        ``category_idx`` and ``prob`` columns; empty when nothing is selected.
    :raises ValueError: if no saved model exists in ``models_dir``.
    """
    model_file = os.path.join(models_dir, LOAD_MODEL)
    if os.path.exists(model_file):
        model = load_model(model_file)
    else:
        raise ValueError("Model doesn't exist")
    images_df = create_images_df(prod_info, False)
    images_df = images_df.merge(prod_info, on='product_id', how='left')[
        ['product_id', 'img_idx', 'num_imgs']]
    if shuffle:
        np.random.seed(shuffle)
        perm = np.random.permutation(images_df.shape[0])
        images_df = images_df.reindex(perm)
        images_df.reset_index(drop=True, inplace=True)
    if sample_prod_info is not None:
        images_df = images_df[images_df.product_id.isin(sample_prod_info.product_id)]
    images_df.sort_values('product_id', inplace=True)
    dfs = []
    offset = 0
    total_rows = images_df.shape[0]
    while offset < total_rows:
        end_idx = min(total_rows, offset + MAX_PREDICTIONS_AT_TIME - 5)
        # Extend the slice until it ends on a product boundary.
        while end_idx < total_rows:
            if images_df.iloc[end_idx - 1].product_id == images_df.iloc[end_idx].product_id:
                end_idx += 1
            else:
                break
        it = MultiMemmapIterator(memmap_path=memmap_path,
                                 memmap_shape=(memmap_len, 2048),
                                 images_df=images_df[offset:end_idx],
                                 batch_size=batch_size,
                                 pool_wrokers=1,
                                 only_single=False,
                                 include_singles=False,
                                 max_images=4,
                                 shuffle=False,
                                 use_side_input=use_img_idx)

        try:
            preds = model.predict_generator(it, it.samples / batch_size,
                                            verbose=1, max_queue_size=10)
        finally:
            # Always shut the iterator down, even when prediction fails.
            it.terminate()
            del it
        chunk = []
        # NOTE(review): assumes the iterator yields one sample per product, in the
        # order of first appearance in the slice - confirm against MultiMemmapIterator.
        for i, product_id in enumerate(images_df[offset:end_idx].product_id.unique()):
            top_k_preds = np.argpartition(preds[i], -top_k)[-top_k:]
            for pred_idx in range(top_k):
                chunk.append((product_id, 0, top_k_preds[pred_idx], preds[i, top_k_preds[pred_idx]]))

        chunk_df = pd.DataFrame(chunk, columns=['product_id', 'img_idx', 'category_idx', 'prob'])
        dfs.append(chunk_df)
        offset = end_idx
        del preds
        del chunk
    if not dfs:
        # pd.concat raises on an empty list; return an empty frame with the same columns.
        return pd.DataFrame(columns=['product_id', 'img_idx', 'category_idx', 'prob'])
    return pd.concat(dfs)
211 | 
212 | 
if __name__ == '__main__':
    # Command-line entry point. The first of --fit / --predict / --predict_valid
    # that is set decides what the script does; if none is given nothing is run.
    parser = argparse.ArgumentParser()
    parser.add_argument('--fit', action='store_true', dest='is_fit')
    parser.add_argument('--predict', action='store_true', dest='is_predict')
    parser.add_argument('--predict_valid', action='store_true', dest='is_predict_valid')
    parser.add_argument('--bcolz_root', required=True, help='VGG16 vecs bcolz root path')
    parser.add_argument('--bcolz_prod_info_csv', required=True,
                        help='Path to prod info csv with which VGG16 were generated')
    parser.add_argument('--sample_prod_info_csv', required=True, help='Path to sample prod info csv')
    parser.add_argument('--category_idx_csv', required=True, help='Path to categories to index mapping csv')
    parser.add_argument('--train_split_csv', required=True, help='Train split csv')
    parser.add_argument('--models_dir', required=True, help='Output directory for models snapshots')
    parser.add_argument('--lr', type=float, default=0.001, required=False, help='Learning rate')
    parser.add_argument('--batch_size', type=int, default=64, required=False, help='Batch size')
    parser.add_argument('--epochs', type=int, default=1, required=False, help='Number of epochs')
    parser.add_argument('--only_first_image', dest='only_first_image', action='store_true',
                        help="Include only first image from each product")
    parser.add_argument('--shuffle', type=int, default=None, required=False,
                        help='If products should be shuffled, provide seed')
    parser.set_defaults(only_first_image=False)
    parser.add_argument('--mode', type=int, default=0, required=False, help='Mode')
    parser.add_argument('--batch_seed', type=int, default=123, required=False, help='Batch seed')
    parser.add_argument('--dont_use_img_idx', action='store_false', dest='use_img_idx')
    parser.set_defaults(use_img_idx=True)
    parser.add_argument('--memmap_len', type=int, required=True, help='Number of rows in memmap')
    parser.set_defaults(two_outs=False)
    parser.add_argument('--max_images', type=int, default=2, required=False, help='Max images in train record')
    parser.add_argument('--only_single', action='store_true', dest='only_single')
    parser.set_defaults(only_single=False)
    parser.add_argument('--dont_include_singles', action='store_false', dest='include_singles')
    parser.set_defaults(include_singles=True)

    args = parser.parse_args()
    # makedirs (unlike mkdir) also creates missing parent directories and does
    # not fail if the directory appears between the check and the creation.
    os.makedirs(args.models_dir, exist_ok=True)

    bcolz_prod_info = pd.read_csv(args.bcolz_prod_info_csv)
    sample_prod_info = pd.read_csv(args.sample_prod_info_csv)
    train_split = pd.read_csv(args.train_split_csv)
    category_idx = pd.read_csv(args.category_idx_csv)

    if args.is_fit:
        train_it, valid_mul_it, valid_sngl_it, num_classes = train_data(args.bcolz_root,
                                                                        args.memmap_len,
                                                                        bcolz_prod_info,
                                                                        sample_prod_info,
                                                                        train_split,
                                                                        category_idx,
                                                                        args.batch_size,
                                                                        args.shuffle,
                                                                        args.batch_seed,
                                                                        args.max_images,
                                                                        args.only_single,
                                                                        args.use_img_idx,
                                                                        args.include_singles)
        try:
            fit_model(train_it, valid_mul_it, valid_sngl_it, num_classes, args.models_dir, args.lr, args.batch_size,
                      args.epochs,
                      args.mode,
                      args.batch_seed)
        finally:
            # Shut the iterators down even when training fails, so that whatever
            # background resources they hold are not left behind.
            train_it.terminate()
            valid_mul_it.terminate()
            valid_sngl_it.terminate()
    elif args.is_predict:
        out_df = predict(args.bcolz_root, args.memmap_len, bcolz_prod_info, sample_prod_info, args.models_dir,
                         use_img_idx=args.use_img_idx)
        out_df.to_csv(os.path.join(args.models_dir, PREDICTIONS_FILE), index=False)
    elif args.is_predict_valid:
        # Restrict prediction to the products flagged as validation in the train split.
        only_valids = bcolz_prod_info[
            bcolz_prod_info.product_id.isin(train_split[train_split.train == False].product_id)]
        out_df = predict(args.bcolz_root, args.memmap_len, bcolz_prod_info, only_valids, args.models_dir,
                         shuffle=args.shuffle, use_img_idx=args.use_img_idx)
        out_df.to_csv(os.path.join(args.models_dir, VALID_PREDICTIONS_FILE), index=False)
285 | 


--------------------------------------------------------------------------------
/src/model/tune_avg_vgg16_vecs.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import argparse
  3 | import pandas as pd
  4 | import numpy as np
  5 | import keras.backend as K
  6 | from keras.models import Model
  7 | from keras.models import load_model
  8 | from keras.layers import Dense
  9 | from keras.layers import Input
 10 | from keras.layers import BatchNormalization
 11 | from keras.layers import TimeDistributed
 12 | from keras.layers import Flatten
 13 | from keras.layers import Lambda
 14 | from keras.optimizers import Adam
 15 | from keras.callbacks import ModelCheckpoint
 16 | from keras.callbacks import CSVLogger
 17 | from src.data.category_idx import map_categories
 18 | from src.model.multi_memmap_iterator import MultiMemmapIterator
 19 | from src.model.resnet50_vecs import create_images_df
 20 | 
# File name (inside models_dir) from which an existing model is loaded.
LOAD_MODEL = 'model.h5'
# File name (inside models_dir) that ModelCheckpoint writes snapshots to.
# Same file as LOAD_MODEL, so a later run resumes from the latest snapshot.
SNAPSHOT_MODEL = 'model.h5'
# Training log (inside models_dir): CSVLogger output plus the final evaluation lines.
LOG_FILE = 'training.log'
# CSV written (inside models_dir) by the --predict mode.
PREDICTIONS_FILE = 'single_predictions.csv'
# CSV written (inside models_dir) by the --predict_valid mode.
VALID_PREDICTIONS_FILE = 'valid_single_predictions.csv'
# Approximate number of image rows given to one prediction iterator at a time.
MAX_PREDICTIONS_AT_TIME = 50000
 27 | 
 28 | 
 29 | def train_data(memmap_path, memmap_len, bcolz_prod_info, sample_prod_info, train_split, category_idx, batch_size,
 30 |                shuffle=None, batch_seed=123, max_images=2, only_single=False, use_img_idx=False):
 31 |     images_df = create_images_df(bcolz_prod_info, False)
 32 |     bcolz_prod_info['category_idx'] = map_categories(category_idx, bcolz_prod_info['category_id'])
 33 |     bcolz_prod_info = bcolz_prod_info.merge(train_split, on='product_id', how='left')
 34 |     images_df = images_df.merge(bcolz_prod_info, on='product_id', how='left')[
 35 |         ['product_id', 'category_idx', 'img_idx', 'num_imgs', 'train']]
 36 |     if shuffle:
 37 |         np.random.seed(shuffle)
 38 |         perm = np.random.permutation(images_df.shape[0])
 39 |         images_df = images_df.reindex(perm)
 40 |         images_df.reset_index(drop=True, inplace=True)
 41 |     images_df = images_df[images_df.product_id.isin(sample_prod_info.product_id)]
 42 |     train_df = images_df[images_df['train']]
 43 |     valid_df = images_df[~images_df['train']]
 44 |     num_classes = np.unique(images_df['category_idx']).size
 45 | 
 46 |     train_it = MultiMemmapIterator(memmap_path=memmap_path,
 47 |                                    memmap_shape=(memmap_len, 512, 2, 2),
 48 |                                    images_df=train_df,
 49 |                                    num_classes=num_classes,
 50 |                                    seed=batch_seed,
 51 |                                    batch_size=batch_size,
 52 |                                    only_single=only_single,
 53 |                                    include_singles=False,
 54 |                                    max_images=max_images,
 55 |                                    pool_wrokers=4,
 56 |                                    shuffle=True,
 57 |                                    use_side_input=use_img_idx)
 58 |     valid_mul_it = MultiMemmapIterator(memmap_path=memmap_path,
 59 |                                        memmap_shape=(memmap_len, 512, 2, 2),
 60 |                                        images_df=valid_df,
 61 |                                        num_classes=num_classes,
 62 |                                        seed=batch_seed,
 63 |                                        batch_size=batch_size,
 64 |                                        shuffle=False,
 65 |                                        only_single=False,
 66 |                                        include_singles=False,
 67 |                                        max_images=4,
 68 |                                        pool_wrokers=1,
 69 |                                        use_side_input=use_img_idx)
 70 |     valid_sngl_it = MultiMemmapIterator(memmap_path=memmap_path,
 71 |                                         memmap_shape=(memmap_len, 512, 2, 2),
 72 |                                         images_df=valid_df,
 73 |                                         num_classes=num_classes,
 74 |                                         seed=batch_seed,
 75 |                                         batch_size=batch_size,
 76 |                                         shuffle=False,
 77 |                                         only_single=True,
 78 |                                         include_singles=True,
 79 |                                         max_images=4,
 80 |                                         pool_wrokers=1,
 81 |                                         use_side_input=use_img_idx)
 82 |     return train_it, valid_mul_it, valid_sngl_it, num_classes
 83 | 
 84 | 
 85 | def fit_model(train_it, valid_mul_it, valid_sngl_it, num_classes, models_dir, lr=0.001, batch_size=64, epochs=1, mode=0,
 86 |               seed=125):
 87 |     model_file = os.path.join(models_dir, LOAD_MODEL)
 88 |     if os.path.exists(model_file):
 89 |         model = load_model(model_file)
 90 |     else:
 91 |         if mode == 0:
 92 |             inp1 = Input((None, 512, 2, 2))
 93 |             inp1_flat = TimeDistributed(Flatten())(inp1)
 94 |             inp1_flat = Lambda(lambda x: K.max(x, axis=-2), output_shape=(2048,))(inp1_flat)
 95 |             x = Dense(4096, activation='relu')(inp1_flat)
 96 |             x = BatchNormalization(axis=-1)(x)
 97 |             x = Dense(4096, activation='relu')(x)
 98 |             x = BatchNormalization(axis=-1)(x)
 99 |             x = Dense(num_classes, activation='softmax')(x)
100 |             model = Model(inp1, x)
101 | 
102 |     model.compile(optimizer=Adam(lr=lr), loss='sparse_categorical_crossentropy',
103 |                   metrics=['sparse_categorical_accuracy'])
104 | 
105 |     np.random.seed(seed)
106 |     checkpointer = ModelCheckpoint(filepath=os.path.join(models_dir, SNAPSHOT_MODEL))
107 |     csv_logger = CSVLogger(os.path.join(models_dir, LOG_FILE), append=True)
108 |     model.fit_generator(train_it,
109 |                         steps_per_epoch=train_it.samples / batch_size,
110 |                         validation_data=valid_mul_it,
111 |                         validation_steps=valid_mul_it.samples / batch_size,
112 |                         epochs=epochs,
113 |                         callbacks=[checkpointer, csv_logger],
114 |                         max_queue_size=10,
115 |                         use_multiprocessing=False)
116 | 
117 |     with open(os.path.join(models_dir, LOG_FILE), "a") as file:
118 |         file.write('Multi {}\n'.format(model.evaluate_generator(valid_mul_it, steps=valid_mul_it.samples / batch_size)))
119 |         file.write(
120 |             'Single {}\n'.format(model.evaluate_generator(valid_sngl_it, steps=valid_sngl_it.samples / batch_size)))
121 | 
122 | 
def predict(memmap_path, memmap_len, prod_info, sample_prod_info, models_dir, batch_size=200,
            shuffle=None, top_k=10, use_img_idx=False):
    """Predict the ``top_k`` most probable categories for every product.

    Images are processed in chunks of roughly ``MAX_PREDICTIONS_AT_TIME`` rows;
    a chunk is always extended so that all images of a product stay together.

    Args:
        memmap_path: Path handed to ``MultiMemmapIterator``.
        memmap_len: Number of rows in the memmap; the shape used is
            ``(memmap_len, 512, 2, 2)``.
        prod_info: Product info DataFrame the image table is built from.
        sample_prod_info: If not ``None``, only products whose ``product_id``
            occurs in this DataFrame are predicted.
        models_dir: Directory containing the saved model.
        batch_size: Prediction batch size.
        shuffle: Seed for a row permutation applied before the final sort by
            ``product_id``. A falsy value (``None`` or ``0``) skips it.
        top_k: Number of categories kept per product (capped at the number of
            model outputs).
        use_img_idx: Passed to the iterator as ``use_side_input``.

    Returns:
        DataFrame with columns ``product_id``, ``img_idx`` (always 0),
        ``category_idx`` and ``prob``. The rows of one product are not ordered
        by probability. Empty (with the same columns) if there is nothing to
        predict.

    Raises:
        ValueError: If the model file does not exist.
    """
    model_file = os.path.join(models_dir, LOAD_MODEL)
    if not os.path.exists(model_file):
        raise ValueError("Model doesn't exist")
    model = load_model(model_file)

    images_df = create_images_df(prod_info, False)
    images_df = images_df.merge(prod_info, on='product_id', how='left')[
        ['product_id', 'img_idx', 'num_imgs']]
    if shuffle:
        np.random.seed(shuffle)
        perm = np.random.permutation(images_df.shape[0])
        images_df = images_df.reindex(perm)
        images_df.reset_index(drop=True, inplace=True)
    if sample_prod_info is not None:
        images_df = images_df[images_df.product_id.isin(sample_prod_info.product_id)]
    # Non in-place sort: avoids modifying a filtered slice of another frame.
    images_df = images_df.sort_values('product_id')

    columns = ['product_id', 'img_idx', 'category_idx', 'prob']
    total = images_df.shape[0]
    # Plain array of ids: cheap positional lookups while searching chunk boundaries.
    sorted_ids = images_df.product_id.values
    dfs = []
    offset = 0
    while offset < total:
        end_idx = min(total, offset + MAX_PREDICTIONS_AT_TIME - 5)
        # Move the boundary forward until it no longer splits a product.
        while end_idx < total and sorted_ids[end_idx - 1] == sorted_ids[end_idx]:
            end_idx += 1
        chunk_images = images_df[offset:end_idx]
        it = MultiMemmapIterator(memmap_path=memmap_path,
                                 memmap_shape=(memmap_len, 512, 2, 2),
                                 images_df=chunk_images,
                                 batch_size=batch_size,
                                 pool_wrokers=1,
                                 only_single=False,
                                 include_singles=False,
                                 max_images=4,
                                 shuffle=False,
                                 use_side_input=use_img_idx)
        try:
            # Integer step count; rounding up consumes the same number of
            # batches as the fractional value did.
            preds = model.predict_generator(it, int(np.ceil(it.samples / batch_size)),
                                            verbose=1, max_queue_size=10)
        finally:
            # Release the iterator even if prediction fails.
            it.terminate()
        del it

        # Never ask argpartition for more entries than the model has outputs.
        k = min(top_k, preds.shape[1])
        chunk = []
        # Row i of preds is taken to belong to the i-th distinct product of the chunk.
        for i, product_id in enumerate(chunk_images.product_id.unique()):
            top_k_preds = np.argpartition(preds[i], -k)[-k:]
            for category in top_k_preds:
                chunk.append((product_id, 0, category, preds[i, category]))

        dfs.append(pd.DataFrame(chunk, columns=columns))
        offset = end_idx
        del preds
        del chunk
    if not dfs:
        # pd.concat raises on an empty list; return an empty, correctly shaped frame instead.
        return pd.DataFrame(columns=columns)
    return pd.concat(dfs)
177 | 
if __name__ == '__main__':
    # Command-line entry point. The first of --fit / --predict / --predict_valid
    # that is set decides what the script does; if none is given nothing is run.
    parser = argparse.ArgumentParser()
    parser.add_argument('--fit', action='store_true', dest='is_fit')
    parser.add_argument('--predict', action='store_true', dest='is_predict')
    parser.add_argument('--predict_valid', action='store_true', dest='is_predict_valid')
    parser.add_argument('--memmap_path', required=True, help='VGG16 vecs memmap path')
    parser.add_argument('--prod_info_csv', required=True,
                        help='Path to prod info csv with which VGG16 were generated')
    parser.add_argument('--sample_prod_info_csv', required=True, help='Path to sample prod info csv')
    parser.add_argument('--category_idx_csv', required=True, help='Path to categories to index mapping csv')
    parser.add_argument('--train_split_csv', required=True, help='Train split csv')
    parser.add_argument('--models_dir', required=True, help='Output directory for models snapshots')
    parser.add_argument('--lr', type=float, default=0.001, required=False, help='Learning rate')
    parser.add_argument('--batch_size', type=int, default=64, required=False, help='Batch size')
    parser.add_argument('--epochs', type=int, default=1, required=False, help='Number of epochs')
    parser.add_argument('--only_first_image', dest='only_first_image', action='store_true',
                        help="Include only first image from each product")
    parser.add_argument('--shuffle', type=int, default=None, required=False,
                        help='If products should be shuffled, provide seed')
    parser.set_defaults(only_first_image=False)
    parser.add_argument('--mode', type=int, default=0, required=False, help='Mode')
    parser.add_argument('--batch_seed', type=int, default=123, required=False, help='Batch seed')
    parser.add_argument('--use_img_idx', action='store_true', dest='use_img_idx')
    parser.set_defaults(use_img_idx=False)
    parser.add_argument('--memmap_len', type=int, required=True, help='Number of rows in memmap')
    parser.set_defaults(two_outs=False)
    parser.add_argument('--max_images', type=int, default=2, required=False, help='Max images in train record')
    parser.add_argument('--only_single', action='store_true', dest='only_single')
    parser.set_defaults(only_single=False)

    args = parser.parse_args()
    # makedirs (unlike mkdir) also creates missing parent directories and does
    # not fail if the directory appears between the check and the creation.
    os.makedirs(args.models_dir, exist_ok=True)

    prod_info = pd.read_csv(args.prod_info_csv)
    sample_prod_info = pd.read_csv(args.sample_prod_info_csv)
    train_split = pd.read_csv(args.train_split_csv)
    category_idx = pd.read_csv(args.category_idx_csv)

    if args.is_fit:
        train_it, valid_mul_it, valid_sngl_it, num_classes = train_data(args.memmap_path,
                                                                        args.memmap_len,
                                                                        prod_info,
                                                                        sample_prod_info,
                                                                        train_split,
                                                                        category_idx,
                                                                        args.batch_size,
                                                                        args.shuffle,
                                                                        args.batch_seed,
                                                                        args.max_images,
                                                                        args.only_single,
                                                                        args.use_img_idx)
        try:
            fit_model(train_it, valid_mul_it, valid_sngl_it, num_classes, args.models_dir, args.lr, args.batch_size,
                      args.epochs,
                      args.mode,
                      args.batch_seed)
        finally:
            # Shut the iterators down even when training fails, so that whatever
            # background resources they hold are not left behind.
            train_it.terminate()
            valid_mul_it.terminate()
            valid_sngl_it.terminate()
    elif args.is_predict:
        out_df = predict(args.memmap_path, args.memmap_len, prod_info, sample_prod_info, args.models_dir,
                         use_img_idx=args.use_img_idx)
        out_df.to_csv(os.path.join(args.models_dir, PREDICTIONS_FILE), index=False)
    elif args.is_predict_valid:
        # Restrict prediction to the products flagged as validation in the train split.
        only_valids = prod_info[
            prod_info.product_id.isin(train_split[train_split.train == False].product_id)]
        out_df = predict(args.memmap_path, args.memmap_len, prod_info, only_valids, args.models_dir,
                         shuffle=args.shuffle, use_img_idx=args.use_img_idx)
        out_df.to_csv(os.path.join(args.models_dir, VALID_PREDICTIONS_FILE), index=False)
247 | 


--------------------------------------------------------------------------------
/src/model/tune_pl_avg_resnet50_vecs.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import argparse
  3 | import pandas as pd
  4 | import numpy as np
  5 | import keras.backend as K
  6 | from keras.models import Model
  7 | from keras.models import load_model
  8 | from keras.layers import Dense
  9 | from keras.layers import Input
 10 | from keras.layers import BatchNormalization
 11 | from keras.layers import TimeDistributed
 12 | from keras.layers import SimpleRNN
 13 | from keras.layers import GRU
 14 | from keras.layers import Lambda
 15 | from keras.layers import concatenate
 16 | from keras.optimizers import Adam
 17 | from keras.callbacks import ModelCheckpoint
 18 | from keras.callbacks import CSVLogger
 19 | from src.data.category_idx import map_categories
 20 | from src.model.multi_memmap_iterator import MultiMemmapIterator
 21 | from src.model.combine_iterator import CombineIterator
 22 | from src.model.resnet50_vecs import create_images_df
 23 | 
# File name (inside models_dir) from which an existing model is loaded.
LOAD_MODEL = 'model.h5'
# File name (inside models_dir) that ModelCheckpoint writes snapshots to.
# Same file as LOAD_MODEL, so a later run resumes from the latest snapshot.
SNAPSHOT_MODEL = 'model.h5'
# Training log (inside models_dir) written by CSVLogger.
LOG_FILE = 'training.log'
# Presumably the CSV written by the --predict mode; usage is in the entry point — confirm there.
PREDICTIONS_FILE = 'single_predictions.csv'
# Presumably the CSV written by the --predict_valid mode; usage is in the entry point — confirm there.
VALID_PREDICTIONS_FILE = 'valid_single_predictions.csv'
# Approximate number of image rows given to one prediction iterator at a time.
MAX_PREDICTIONS_AT_TIME = 50000
 30 | 
 31 | 
 32 | def create_train_df(prod_info, category_idx, shuffle=None):
 33 |     images_df = create_images_df(prod_info, False)
 34 |     prod_info['category_idx'] = map_categories(category_idx, prod_info['category_id'])
 35 |     images_df = images_df.merge(prod_info, on='product_id', how='left')[
 36 |         ['product_id', 'category_idx', 'img_idx', 'num_imgs']]
 37 |     if shuffle:
 38 |         np.random.seed(shuffle)
 39 |         perm = np.random.permutation(images_df.shape[0])
 40 |         images_df = images_df.reindex(perm)
 41 |         images_df.reset_index(drop=True, inplace=True)
 42 |     return images_df
 43 | 
 44 | 
 45 | def train_data(train_memmap_path,
 46 |                train_memmap_len,
 47 |                test_memmap_path,
 48 |                test_memmap_len,
 49 |                train_prod_info,
 50 |                train_pl_prod_info,
 51 |                test_pl_prod_info,
 52 |                train_split,
 53 |                category_idx,
 54 |                batch_size,
 55 |                shuffle=None,
 56 |                batch_seed=123,
 57 |                max_images=2,
 58 |                only_single=False,
 59 |                use_img_idx=False,
 60 |                include_singles=True):
 61 |     true_train_df = create_train_df(train_prod_info, category_idx, shuffle=shuffle)
 62 |     true_train_df = true_train_df.merge(train_split, on='product_id', how='left')
 63 |     num_classes = np.unique(true_train_df['category_idx']).size
 64 |     valid_df = true_train_df[~true_train_df['train']]
 65 |     del true_train_df
 66 | 
 67 |     pl_train_df = create_train_df(train_pl_prod_info, category_idx, shuffle=shuffle)
 68 |     pl_test_df = create_train_df(test_pl_prod_info, category_idx, shuffle=None)
 69 | 
 70 |     test_batch_size = int(batch_size * 0.25)
 71 |     train_batch_size = int(batch_size * 0.75)
 72 |     train_batch_size += 1 if test_batch_size + train_batch_size < batch_size else 0
 73 | 
 74 |     train_train_it = MultiMemmapIterator(memmap_path=train_memmap_path,
 75 |                                          memmap_shape=(train_memmap_len, 2048),
 76 |                                          images_df=pl_train_df,
 77 |                                          num_classes=num_classes,
 78 |                                          seed=batch_seed,
 79 |                                          batch_size=train_batch_size,
 80 |                                          only_single=only_single,
 81 |                                          include_singles=include_singles,
 82 |                                          max_images=max_images,
 83 |                                          pool_wrokers=4,
 84 |                                          shuffle=True,
 85 |                                          use_side_input=use_img_idx)
 86 | 
 87 |     train_test_it = MultiMemmapIterator(memmap_path=test_memmap_path,
 88 |                                         memmap_shape=(test_memmap_len, 2048),
 89 |                                         images_df=pl_test_df,
 90 |                                         num_classes=num_classes,
 91 |                                         seed=batch_seed,
 92 |                                         batch_size=test_batch_size,
 93 |                                         only_single=only_single,
 94 |                                         include_singles=include_singles,
 95 |                                         max_images=max_images,
 96 |                                         pool_wrokers=4,
 97 |                                         shuffle=True,
 98 |                                         use_side_input=use_img_idx)
 99 | 
100 |     train_it = CombineIterator(train_train_it, train_test_it)
101 | 
102 |     valid_mul_it = MultiMemmapIterator(memmap_path=train_memmap_path,
103 |                                        memmap_shape=(train_memmap_len, 2048),
104 |                                        images_df=valid_df,
105 |                                        num_classes=num_classes,
106 |                                        seed=batch_seed,
107 |                                        batch_size=batch_size,
108 |                                        shuffle=False,
109 |                                        only_single=False,
110 |                                        include_singles=False,
111 |                                        max_images=4,
112 |                                        pool_wrokers=1,
113 |                                        use_side_input=use_img_idx)
114 |     return train_it, valid_mul_it, num_classes
115 | 
116 | 
def _build_model(mode, num_classes):
    """Build an untrained network for the given architecture ``mode``.

    Modes 0-2 take two inputs (2048-d image vectors and an 8-d side input per
    image) and share the same first dense block; mode 3 takes only the image
    vectors.

    Raises:
        ValueError: If ``mode`` is not one of 0, 1, 2, 3.
    """
    if mode in (0, 1, 2):
        inp1 = Input((None, 2048))
        inp2 = Input((None, 8))
        x = concatenate([inp1, inp2])
        x = TimeDistributed(Dense(4096, activation='relu'))(x)
        x = BatchNormalization(axis=-1)(x)
        if mode == 0:
            # Second per-image dense block, then sum over the images of a record.
            x = TimeDistributed(Dense(4096, activation='relu'))(x)
            x = BatchNormalization(axis=-1)(x)
            x = Lambda(lambda x: K.sum(x, axis=-2), output_shape=(4096,))(x)
        elif mode == 1:
            # Combine the images of a record with an identity-initialised RNN.
            x = SimpleRNN(4096, activation='relu', recurrent_initializer='identity')(x)
            x = BatchNormalization(axis=-1)(x)
        else:
            # Second per-image dense block, then a small GRU over the images.
            x = TimeDistributed(Dense(4096, activation='relu'))(x)
            x = BatchNormalization(axis=-1)(x)
            x = GRU(100, activation='relu')(x)
            x = BatchNormalization(axis=-1)(x)
        x = Dense(num_classes, activation='softmax')(x)
        return Model([inp1, inp2], x)
    if mode == 3:
        # Element-wise maximum over the images of a record, then two dense layers.
        inp1 = Input((None, 2048))
        inp1_com = Lambda(lambda x: K.max(x, axis=-2), output_shape=(2048,))(inp1)
        x = Dense(4096, activation='relu')(inp1_com)
        x = BatchNormalization(axis=-1)(x)
        x = Dense(4096, activation='relu')(x)
        x = BatchNormalization(axis=-1)(x)
        x = Dense(num_classes, activation='softmax')(x)
        return Model(inp1, x)
    raise ValueError('Unsupported mode: {}'.format(mode))


def fit_model(train_it, valid_mul_it, num_classes, models_dir, lr=0.001, batch_size=64, epochs=1, mode=0,
              seed=125):
    """Train (or continue training) the pseudo-label classifier.

    If ``models_dir`` already contains a snapshot it is loaded and training
    continues from it; otherwise a fresh network is built for ``mode``.
    The model is (re)compiled in both cases, so ``lr`` always takes effect.

    Args:
        train_it: Training iterator; must expose ``samples`` and ``batch_size``.
        valid_mul_it: Validation iterator; must expose ``samples`` and
            ``batch_size``.
        num_classes: Size of the softmax output layer.
        models_dir: Directory holding the model snapshot and the training log.
        lr: Adam learning rate.
        batch_size: Not used here; step counts come from the iterators' own
            batch sizes. Kept so existing callers keep working.
        epochs: Number of epochs to train.
        mode: Architecture selector (0-3).
        seed: Seed given to ``np.random.seed`` right before training.

    Raises:
        ValueError: If no snapshot exists and ``mode`` is not supported
            (previously this surfaced as an UnboundLocalError).
    """
    model_file = os.path.join(models_dir, LOAD_MODEL)
    if os.path.exists(model_file):
        model = load_model(model_file)
    else:
        model = _build_model(mode, num_classes)

    model.compile(optimizer=Adam(lr=lr), loss='sparse_categorical_crossentropy',
                  metrics=['sparse_categorical_accuracy'])

    # Keras documents the step arguments as integers. Rounding up runs the
    # same number of batches as the fractional value did.
    train_steps = int(np.ceil(train_it.samples / train_it.batch_size))
    valid_steps = int(np.ceil(valid_mul_it.samples / valid_mul_it.batch_size))

    np.random.seed(seed)
    checkpointer = ModelCheckpoint(filepath=os.path.join(models_dir, SNAPSHOT_MODEL))
    csv_logger = CSVLogger(os.path.join(models_dir, LOG_FILE), append=True)
    model.fit_generator(train_it,
                        steps_per_epoch=train_steps,
                        validation_data=valid_mul_it,
                        validation_steps=valid_steps,
                        epochs=epochs,
                        callbacks=[checkpointer, csv_logger],
                        max_queue_size=10,
                        use_multiprocessing=False)
180 | 
def predict(memmap_path, memmap_len, prod_info, sample_prod_info, models_dir, batch_size=200,
            shuffle=None, top_k=10, use_img_idx=True):
    """Predict the ``top_k`` most probable categories for every product.

    Images are processed in chunks of roughly ``MAX_PREDICTIONS_AT_TIME`` rows;
    a chunk is always extended so that all images of a product stay together.

    Args:
        memmap_path: Path handed to ``MultiMemmapIterator``.
        memmap_len: Number of rows in the memmap; the shape used is
            ``(memmap_len, 2048)``.
        prod_info: Product info DataFrame the image table is built from.
        sample_prod_info: If not ``None``, only products whose ``product_id``
            occurs in this DataFrame are predicted.
        models_dir: Directory containing the saved model.
        batch_size: Prediction batch size.
        shuffle: Seed for a row permutation applied before the final sort by
            ``product_id``. A falsy value (``None`` or ``0``) skips it.
        top_k: Number of categories kept per product (capped at the number of
            model outputs).
        use_img_idx: Passed to the iterator as ``use_side_input``.

    Returns:
        DataFrame with columns ``product_id``, ``img_idx`` (always 0),
        ``category_idx`` and ``prob``. The rows of one product are not ordered
        by probability. Empty (with the same columns) if there is nothing to
        predict.

    Raises:
        ValueError: If the model file does not exist.
    """
    model_file = os.path.join(models_dir, LOAD_MODEL)
    if not os.path.exists(model_file):
        raise ValueError("Model doesn't exist")
    model = load_model(model_file)

    images_df = create_images_df(prod_info, False)
    images_df = images_df.merge(prod_info, on='product_id', how='left')[
        ['product_id', 'img_idx', 'num_imgs']]
    if shuffle:
        np.random.seed(shuffle)
        perm = np.random.permutation(images_df.shape[0])
        images_df = images_df.reindex(perm)
        images_df.reset_index(drop=True, inplace=True)
    if sample_prod_info is not None:
        images_df = images_df[images_df.product_id.isin(sample_prod_info.product_id)]
    # Non in-place sort: avoids modifying a filtered slice of another frame.
    images_df = images_df.sort_values('product_id')

    columns = ['product_id', 'img_idx', 'category_idx', 'prob']
    total = images_df.shape[0]
    # Plain array of ids: cheap positional lookups while searching chunk boundaries.
    sorted_ids = images_df.product_id.values
    dfs = []
    offset = 0
    while offset < total:
        end_idx = min(total, offset + MAX_PREDICTIONS_AT_TIME - 5)
        # Move the boundary forward until it no longer splits a product.
        while end_idx < total and sorted_ids[end_idx - 1] == sorted_ids[end_idx]:
            end_idx += 1
        chunk_images = images_df[offset:end_idx]
        it = MultiMemmapIterator(memmap_path=memmap_path,
                                 memmap_shape=(memmap_len, 2048),
                                 images_df=chunk_images,
                                 batch_size=batch_size,
                                 pool_wrokers=1,
                                 only_single=False,
                                 include_singles=False,
                                 max_images=4,
                                 shuffle=False,
                                 use_side_input=use_img_idx)
        try:
            # Integer step count; rounding up consumes the same number of
            # batches as the fractional value did.
            preds = model.predict_generator(it, int(np.ceil(it.samples / batch_size)),
                                            verbose=1, max_queue_size=10)
        finally:
            # Release the iterator even if prediction fails.
            it.terminate()
        del it

        # Never ask argpartition for more entries than the model has outputs.
        k = min(top_k, preds.shape[1])
        chunk = []
        # Row i of preds is taken to belong to the i-th distinct product of the chunk.
        for i, product_id in enumerate(chunk_images.product_id.unique()):
            top_k_preds = np.argpartition(preds[i], -k)[-k:]
            for category in top_k_preds:
                chunk.append((product_id, 0, category, preds[i, category]))

        dfs.append(pd.DataFrame(chunk, columns=columns))
        offset = end_idx
        del preds
        del chunk
    if not dfs:
        # pd.concat raises on an empty list; return an empty, correctly shaped frame instead.
        return pd.DataFrame(columns=columns)
    return pd.concat(dfs)
235 | 
236 | 
if __name__ == '__main__':
    # Command-line entry point. Exactly one of --fit / --predict / --predict_valid is
    # acted upon; when several are given the first one in that order wins.
    parser = argparse.ArgumentParser()
    parser.add_argument('--fit', action='store_true', dest='is_fit')
    parser.add_argument('--predict', action='store_true', dest='is_predict')
    parser.add_argument('--predict_valid', action='store_true', dest='is_predict_valid')
    parser.add_argument('--memmap_path_train', required=True)
    parser.add_argument('--memmap_train_len', type=int, required=True)
    parser.add_argument('--memmap_path_test', required=True)
    parser.add_argument('--memmap_test_len', type=int, required=True)
    parser.add_argument('--train_prod_info_csv', required=True)
    parser.add_argument('--train_pl_prod_info_csv', required=True)
    parser.add_argument('--test_pl_prod_info_csv', required=True)
    parser.add_argument('--category_idx_csv', required=True, help='Path to categories to index mapping csv')
    parser.add_argument('--train_split_csv', required=True, help='Train split csv')
    parser.add_argument('--models_dir', required=True, help='Output directory for models snapshots')
    parser.add_argument('--lr', type=float, default=0.001, required=False, help='Learning rate')
    parser.add_argument('--batch_size', type=int, default=64, required=False, help='Batch size')
    parser.add_argument('--epochs', type=int, default=1, required=False, help='Number of epochs')
    parser.add_argument('--only_first_image', dest='only_first_image', action='store_true',
                        help="Include only first image from each product")
    parser.add_argument('--shuffle', type=int, default=None, required=False,
                        help='If products should be shuffled, provide seed')
    parser.set_defaults(only_first_image=False)
    parser.add_argument('--mode', type=int, default=0, required=False, help='Mode')
    parser.add_argument('--batch_seed', type=int, default=123, required=False, help='Batch seed')
    parser.add_argument('--dont_use_img_idx', action='store_false', dest='use_img_idx')
    parser.set_defaults(use_img_idx=True)
    parser.set_defaults(two_outs=False)
    parser.add_argument('--max_images', type=int, default=2, required=False, help='Max images in train record')
    parser.add_argument('--only_single', action='store_true', dest='only_single')
    parser.set_defaults(only_single=False)
    parser.add_argument('--dont_include_singles', action='store_false', dest='include_singles')
    parser.set_defaults(include_singles=True)

    args = parser.parse_args()
    # makedirs also creates missing parent directories and tolerates an existing directory.
    os.makedirs(args.models_dir, exist_ok=True)

    train_prod_info = pd.read_csv(args.train_prod_info_csv)
    train_pl_prod_info = pd.read_csv(args.train_pl_prod_info_csv)
    test_pl_prod_info = pd.read_csv(args.test_pl_prod_info_csv)
    train_split = pd.read_csv(args.train_split_csv)
    category_idx = pd.read_csv(args.category_idx_csv)

    if args.is_fit:
        train_it, valid_mul_it, num_classes = train_data(args.memmap_path_train,
                                                         args.memmap_train_len,
                                                         args.memmap_path_test,
                                                         args.memmap_test_len,
                                                         train_prod_info,
                                                         train_pl_prod_info,
                                                         test_pl_prod_info,
                                                         train_split,
                                                         category_idx,
                                                         args.batch_size,
                                                         args.shuffle,
                                                         args.batch_seed,
                                                         args.max_images,
                                                         args.only_single,
                                                         args.use_img_idx,
                                                         args.include_singles)
        try:
            fit_model(train_it, valid_mul_it, num_classes, args.models_dir, args.lr, args.batch_size,
                      args.epochs,
                      args.mode,
                      args.batch_seed)
        finally:
            # Shut the iterators down even when training fails or is interrupted.
            train_it.first_iterator.terminate()
            train_it.second_iterator.terminate()
            valid_mul_it.terminate()
    elif args.is_predict:
        # The axis is passed by keyword: newer pandas releases no longer accept it positionally.
        test_prod_info = test_pl_prod_info.drop('category_id', axis=1)
        out_df = predict(args.memmap_path_test, args.memmap_test_len, test_prod_info, test_prod_info, args.models_dir,
                         use_img_idx=args.use_img_idx)
        out_df.to_csv(os.path.join(args.models_dir, PREDICTIONS_FILE), index=False)
    elif args.is_predict_valid:
        # Restrict prediction to the products marked as not-train in the split file.
        only_valids = train_prod_info[
            train_prod_info.product_id.isin(train_split[train_split.train == False].product_id)]
        out_df = predict(args.memmap_path_train, args.memmap_train_len, train_prod_info, only_valids, args.models_dir,
                         shuffle=args.shuffle, use_img_idx=args.use_img_idx)
        out_df.to_csv(os.path.join(args.models_dir, VALID_PREDICTIONS_FILE), index=False)
316 | 


--------------------------------------------------------------------------------
/src/model/tune_resnet50_memmap_vecs.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import argparse
  3 | import pandas as pd
  4 | import numpy as np
  5 | import itertools
  6 | from collections import namedtuple
  7 | from keras.models import Model
  8 | from keras.models import load_model
  9 | from keras.layers import Dense
 10 | from keras.layers import Input
 11 | from keras.layers import BatchNormalization
 12 | from keras.layers import concatenate
 13 | from keras.optimizers import Adam
 14 | from keras.callbacks import ModelCheckpoint
 15 | from keras.callbacks import CSVLogger
 16 | from src.data.category_idx import map_categories
 17 | from src.model.resnet50_vecs import create_images_df
 18 | from src.model.memmap_iterator import MemmapIterator
 19 | 
# File name (inside models_dir) of the saved model that fit_model() and predict() try to load.
LOAD_MODEL = 'model.h5'
# File name (inside models_dir) that the ModelCheckpoint callback writes to; same file as LOAD_MODEL.
SNAPSHOT_MODEL = 'model.h5'
# File name (inside models_dir) of the CSVLogger training log, opened in append mode.
LOG_FILE = 'training.log'
# File name (inside models_dir) of the CSV written by the --predict command.
PREDICTIONS_FILE = 'single_predictions.csv'
# File name (inside models_dir) of the CSV written by the --predict_valid command.
VALID_PREDICTIONS_FILE = 'valid_single_predictions.csv'
# Approximate number of image rows predict() handles per chunk (a chunk is extended so that
# it never ends in the middle of a product).
MAX_PREDICTIONS_AT_TIME = 50000
 26 | 
 27 | 
 28 | def train_data(memmap_path, memmap_len, prod_info, sample_prod_info, train_split, category_idx, batch_size,
 29 |                shuffle=None, batch_seed=123, use_side_input=False):
 30 |     images_df = create_images_df(prod_info, False)
 31 |     prod_info['category_idx'] = map_categories(category_idx, prod_info['category_id'])
 32 |     prod_info = prod_info.merge(train_split, on='product_id', how='left')
 33 |     images_df = images_df.merge(prod_info, on='product_id', how='left')[
 34 |         ['product_id', 'category_idx', 'img_idx', 'num_imgs', 'train']]
 35 |     if shuffle:
 36 |         np.random.seed(shuffle)
 37 |         perm = np.random.permutation(images_df.shape[0])
 38 |         images_df = images_df.reindex(perm)
 39 |         images_df.reset_index(drop=True, inplace=True)
 40 |     images_df = images_df[images_df.product_id.isin(sample_prod_info.product_id)]
 41 |     train_df = images_df[images_df['train']]
 42 |     valid_df = images_df[~images_df['train']]
 43 |     num_classes = np.unique(images_df['category_idx']).size
 44 | 
 45 |     train_it = MemmapIterator(memmap_path=memmap_path,
 46 |                               memmap_shape=(memmap_len, 2048),
 47 |                               images_df=train_df,
 48 |                               num_classes=num_classes,
 49 |                               seed=batch_seed,
 50 |                               batch_size=batch_size,
 51 |                               pool_wrokers=4,
 52 |                               use_side_input=use_side_input,
 53 |                               shuffle=True)
 54 |     valid_it = MemmapIterator(memmap_path=memmap_path,
 55 |                               memmap_shape=(memmap_len, 2048),
 56 |                               images_df=valid_df,
 57 |                               num_classes=num_classes,
 58 |                               seed=batch_seed,
 59 |                               batch_size=batch_size,
 60 |                               pool_wrokers=4,
 61 |                               use_side_input=use_side_input,
 62 |                               shuffle=False)
 63 |     return train_it, valid_it, num_classes
 64 | 
 65 | 
 66 | def fit_model(train_it, valid_it, num_classes, models_dir, lr=0.001, batch_size=64, epochs=1, mode=0, seed=125):
 67 |     model_file = os.path.join(models_dir, LOAD_MODEL)
 68 |     if os.path.exists(model_file):
 69 |         model = load_model(model_file)
 70 |     else:
 71 |         if mode == 0:
 72 |             inp = Input((2048,))
 73 |             x = Dense(num_classes, activation='softmax')(inp)
 74 |             model = Model(inp, x)
 75 |         elif mode == 1:
 76 |             inp = Input((2048,))
 77 |             x = Dense(4096, activation='relu')(inp)
 78 |             x = BatchNormalization(axis=-1)(x)
 79 |             x = Dense(4096, activation='relu')(x)
 80 |             x = BatchNormalization(axis=-1)(x)
 81 |             x = Dense(num_classes, activation='softmax')(x)
 82 |             model = Model(inp, x)
 83 |         elif mode == 2:
 84 |             inp = Input((2048,))
 85 |             x = Dense(4096, activation='relu')(inp)
 86 |             x = BatchNormalization(axis=-1)(x)
 87 |             x = Dense(num_classes, activation='softmax')(x)
 88 |             model = Model(inp, x)
 89 |         elif mode == 3:
 90 |             inp = Input((2048,))
 91 |             x = Dense(2048, activation='relu')(inp)
 92 |             x = BatchNormalization(axis=-1)(x)
 93 |             x = Dense(2048, activation='relu')(x)
 94 |             x = BatchNormalization(axis=-1)(x)
 95 |             x = Dense(num_classes, activation='softmax')(x)
 96 |             model = Model(inp, x)
 97 |         elif mode == 4:
 98 |             inp = Input((2048,))
 99 |             x = Dense(1024, activation='relu')(inp)
100 |             x = BatchNormalization(axis=-1)(x)
101 |             x = Dense(1024, activation='relu')(x)
102 |             x = BatchNormalization(axis=-1)(x)
103 |             x = Dense(num_classes, activation='softmax')(x)
104 |             model = Model(inp, x)
105 |         elif mode == 5:
106 |             inp = Input((2048,))
107 |             x = Dense(6144, activation='relu')(inp)
108 |             x = BatchNormalization(axis=-1)(x)
109 |             x = Dense(6144, activation='relu')(x)
110 |             x = BatchNormalization(axis=-1)(x)
111 |             x = Dense(num_classes, activation='softmax')(x)
112 |             model = Model(inp, x)
113 |         elif mode == 6:
114 |             inp_vec = Input((2048,))
115 |             img_idx_inp = Input((8,))
116 |             x = concatenate([inp_vec, img_idx_inp])
117 |             x = Dense(4096, activation='relu')(x)
118 |             x = BatchNormalization(axis=-1)(x)
119 |             x = Dense(4096, activation='relu')(x)
120 |             x = BatchNormalization(axis=-1)(x)
121 |             x = Dense(num_classes, activation='softmax')(x)
122 |             model = Model([inp_vec, img_idx_inp], x)
123 |         elif mode == 7:
124 |             inp_vec = Input((2048,))
125 |             img_idx_inp = Input((8,))
126 |             x = concatenate([inp_vec, img_idx_inp])
127 |             x = Dense(2048, activation='relu')(x)
128 |             x = BatchNormalization(axis=-1)(x)
129 |             x = Dense(2048, activation='relu')(x)
130 |             x = BatchNormalization(axis=-1)(x)
131 |             x = Dense(num_classes, activation='softmax')(x)
132 |             model = Model([inp_vec, img_idx_inp], x)
133 |         elif mode == 8:
134 |             inp_vec = Input((2048,))
135 |             img_idx_inp = Input((8,))
136 |             x = concatenate([inp_vec, img_idx_inp])
137 |             x = Dense(6144, activation='relu')(x)
138 |             x = BatchNormalization(axis=-1)(x)
139 |             x = Dense(6144, activation='relu')(x)
140 |             x = BatchNormalization(axis=-1)(x)
141 |             x = Dense(num_classes, activation='softmax')(x)
142 |             model = Model([inp_vec, img_idx_inp], x)
143 | 
144 |     model.compile(optimizer=Adam(lr=lr), loss='sparse_categorical_crossentropy',
145 |                   metrics=['sparse_categorical_accuracy'])
146 | 
147 |     np.random.seed(seed)
148 |     checkpointer = ModelCheckpoint(filepath=os.path.join(models_dir, SNAPSHOT_MODEL))
149 |     csv_logger = CSVLogger(os.path.join(models_dir, LOG_FILE), append=True)
150 |     model.fit_generator(train_it,
151 |                         steps_per_epoch=train_it.samples / batch_size,
152 |                         validation_data=valid_it,
153 |                         validation_steps=valid_it.samples / batch_size,
154 |                         epochs=epochs,
155 |                         callbacks=[checkpointer, csv_logger])
156 | 
157 | 
def predict(memmap_path, memmap_len, prod_info, sample_prod_info, models_dir, use_side_input=False, batch_size=200,
            shuffle=None, top_k=10):
    """Predict per-product category probabilities from per-image vectors.

    Images are sorted by product and processed in chunks of roughly
    MAX_PREDICTIONS_AT_TIME rows; a chunk is extended so it never splits a product.
    For each product the per-image probability vectors are multiplied element-wise
    and renormalised to sum to one, and the top_k categories of the result are kept
    (in no particular order).

    Args:
        memmap_path: Path of the memmap file; it is opened with shape (memmap_len, 2048).
        memmap_len: Number of rows in the memmap.
        prod_info: Product info frame used to enumerate images.
        sample_prod_info: Optional frame whose 'product_id' values restrict the
            products to predict; None means all products.
        models_dir: Directory containing LOAD_MODEL.
        use_side_input: Forwarded to MemmapIterator.
        batch_size: Prediction batch size.
        shuffle: Seed for the row permutation applied before filtering; a falsy
            value (None or 0) disables it.
        top_k: Number of categories kept per product.

    Returns:
        DataFrame with columns product_id, img_idx (always 0), category_idx, prob.
        It is empty (with these columns) when there is nothing to predict.

    Raises:
        ValueError: If the saved model does not exist.
        RuntimeError: If the model returns no predictions for a non-empty chunk.
    """
    model_file = os.path.join(models_dir, LOAD_MODEL)
    if not os.path.exists(model_file):
        raise ValueError("Model doesn't exist")
    model = load_model(model_file)

    images_df = create_images_df(prod_info, False)
    images_df = images_df.merge(prod_info, on='product_id', how='left')[
        ['product_id', 'img_idx', 'num_imgs']]
    if shuffle:
        np.random.seed(shuffle)
        perm = np.random.permutation(images_df.shape[0])
        images_df = images_df.reindex(perm)
        images_df.reset_index(drop=True, inplace=True)
    if sample_prod_info is not None:
        images_df = images_df[images_df.product_id.isin(sample_prod_info.product_id)]
    # Non-inplace sort: images_df may be a filtered slice at this point.
    images_df = images_df.sort_values('product_id')

    columns = ['product_id', 'img_idx', 'category_idx', 'prob']
    product_ids = images_df['product_id'].values
    total = images_df.shape[0]
    dfs = []
    offset = 0
    while offset < total:
        end_idx = min(total, offset + MAX_PREDICTIONS_AT_TIME - 5)
        # Grow the chunk until it ends on a product boundary.
        while end_idx < total and product_ids[end_idx - 1] == product_ids[end_idx]:
            end_idx += 1
        it = MemmapIterator(memmap_path=memmap_path,
                            memmap_shape=(memmap_len, 2048),
                            images_df=images_df[offset:end_idx],
                            batch_size=batch_size,
                            pool_wrokers=1,
                            use_side_input=use_side_input,
                            shuffle=False)
        try:
            preds = model.predict_generator(it, it.samples / batch_size,
                                            verbose=1, max_queue_size=10)
        finally:
            # Release the iterator even if prediction fails.
            it.terminate()
        del it
        if preds.shape[0] == 0:
            # Without progress the loop would never terminate.
            raise RuntimeError('Model returned no predictions for rows {}:{}'.format(offset, end_idx))

        chunk_ids = product_ids[offset:offset + preds.shape[0]]
        # Row positions where a new product starts. Detecting boundaries directly avoids
        # sentinel ids, which mishandled products whose id equals the sentinel value.
        starts = np.concatenate(([0], np.flatnonzero(chunk_ids[1:] != chunk_ids[:-1]) + 1))
        ends = np.concatenate((starts[1:], [chunk_ids.shape[0]]))
        chunk = []
        for start, end in zip(starts, ends):
            prods = preds[start:end].prod(axis=-2)
            prods = prods / prods.sum()
            top_k_preds = np.argpartition(prods, -top_k)[-top_k:]
            product_id = chunk_ids[start]
            for category in top_k_preds:
                chunk.append((product_id, 0, category, prods[category]))
        dfs.append(pd.DataFrame(chunk, columns=columns))
        offset += preds.shape[0]
        del preds
        del chunk
    if not dfs:
        # pd.concat raises on an empty list; return an empty frame with the usual schema.
        return pd.DataFrame(columns=columns)
    return pd.concat(dfs)
216 | 
217 | 
if __name__ == '__main__':
    # Command-line entry point. Exactly one of --fit / --predict / --predict_valid is
    # acted upon; when several are given the first one in that order wins.
    parser = argparse.ArgumentParser()
    parser.add_argument('--fit', action='store_true', dest='is_fit')
    parser.add_argument('--predict', action='store_true', dest='is_predict')
    parser.add_argument('--predict_valid', action='store_true', dest='is_predict_valid')
    parser.add_argument('--memmap_path', required=True, help='ResNet50 vecs memmap path')
    parser.add_argument('--memmap_len', type=int, required=True, help='Length of memmap')
    parser.add_argument('--prod_info_csv', required=True,
                        help='Path to prod info csv with which ResNet50 vecs were generated')
    parser.add_argument('--sample_prod_info_csv', required=True, help='Path to sample prod info csv')
    parser.add_argument('--category_idx_csv', required=True, help='Path to categories to index mapping csv')
    parser.add_argument('--train_split_csv', required=True, help='Train split csv')
    parser.add_argument('--models_dir', required=True, help='Output directory for models snapshots')
    parser.add_argument('--lr', type=float, default=0.001, required=False, help='Learning rate')
    parser.add_argument('--batch_size', type=int, default=64, required=False, help='Batch size')
    parser.add_argument('--epochs', type=int, default=1, required=False, help='Number of epochs')
    parser.add_argument('--shuffle', type=int, default=None, required=False,
                        help='If products should be shuffled, provide seed')
    parser.add_argument('--mode', type=int, default=0, required=False, help='Mode')
    parser.add_argument('--batch_seed', type=int, default=123, required=False, help='Batch seed')
    parser.add_argument('--use_img_idx', action='store_true', dest='use_img_idx')
    parser.set_defaults(use_img_idx=False)

    args = parser.parse_args()
    # makedirs also creates missing parent directories and tolerates an existing directory.
    os.makedirs(args.models_dir, exist_ok=True)

    prod_info = pd.read_csv(args.prod_info_csv)
    sample_prod_info = pd.read_csv(args.sample_prod_info_csv)
    train_split = pd.read_csv(args.train_split_csv)
    category_idx = pd.read_csv(args.category_idx_csv)

    if args.is_fit:
        train_it, valid_it, num_classes = train_data(memmap_path=args.memmap_path,
                                                     memmap_len=args.memmap_len,
                                                     prod_info=prod_info,
                                                     sample_prod_info=sample_prod_info,
                                                     train_split=train_split,
                                                     category_idx=category_idx,
                                                     batch_size=args.batch_size,
                                                     shuffle=args.shuffle,
                                                     batch_seed=args.batch_seed,
                                                     use_side_input=args.use_img_idx)
        try:
            fit_model(train_it, valid_it, num_classes, args.models_dir, args.lr, args.batch_size, args.epochs,
                      args.mode, args.batch_seed)
        finally:
            # Shut the iterators down even when training fails or is interrupted.
            train_it.terminate()
            valid_it.terminate()
    elif args.is_predict:
        out_df = predict(args.memmap_path, args.memmap_len, prod_info, sample_prod_info, args.models_dir,
                         use_side_input=args.use_img_idx)
        out_df.to_csv(os.path.join(args.models_dir, PREDICTIONS_FILE), index=False)
    elif args.is_predict_valid:
        # Restrict prediction to the products marked as not-train in the split file.
        only_valids = prod_info[
            prod_info.product_id.isin(train_split[train_split.train == False].product_id)]
        out_df = predict(args.memmap_path, args.memmap_len, prod_info, only_valids, args.models_dir,
                         shuffle=args.shuffle, use_side_input=args.use_img_idx)
        out_df.to_csv(os.path.join(args.models_dir, VALID_PREDICTIONS_FILE), index=False)
275 | 


--------------------------------------------------------------------------------
/src/model/tune_resnet50_vecs.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import argparse
  3 | import pandas as pd
  4 | import numpy as np
  5 | from keras.models import Model
  6 | from keras.models import load_model
  7 | from keras.layers import Dense
  8 | from keras.layers import Input
  9 | from keras.layers import BatchNormalization
 10 | from keras.layers import concatenate
 11 | from keras.optimizers import Adam
 12 | from keras.callbacks import ModelCheckpoint
 13 | from keras.callbacks import CSVLogger
 14 | from keras.utils import to_categorical
 15 | from ..data.category_idx import map_categories
 16 | from .bcolz_iterator import BcolzIterator
 17 | from .resnet50_vecs import create_images_df
 18 | 
# File name (inside models_dir) of the saved model that fit_model() and predict() try to load.
LOAD_MODEL = 'model.h5'
# File name (inside models_dir) that the ModelCheckpoint callback writes to; same file as LOAD_MODEL.
SNAPSHOT_MODEL = 'model.h5'
# File name (inside models_dir) of the CSVLogger training log, opened in append mode.
LOG_FILE = 'training.log'
# Output CSV file names; presumably written by the script's __main__ section - confirm there.
PREDICTIONS_FILE = 'predictions.csv'
VALID_PREDICTIONS_FILE = 'valid_predictions.csv'
# predict() runs at most MAX_PREDICTIONS_AT_TIME // batch_size batches per predict_generator call.
MAX_PREDICTIONS_AT_TIME = 50000
 25 | 
 26 | def form_side_input(df):
 27 |     return np.hstack([to_categorical(df.num_imgs - 1, num_classes=4),
 28 |                                to_categorical(df.img_idx, num_classes=4)])
 29 | 
 30 | def train_data(bcolz_root, bcolz_prod_info, sample_prod_info, train_split, category_idx, only_first_image, batch_size,
 31 |                shuffle=None, batch_seed=123, use_img_idx=False):
 32 |     images_df = create_images_df(bcolz_prod_info, only_first_image)
 33 |     bcolz_prod_info['category_idx'] = map_categories(category_idx, bcolz_prod_info['category_id'])
 34 |     bcolz_prod_info = bcolz_prod_info.merge(train_split, on='product_id', how='left')
 35 |     cat_idxs = images_df.merge(bcolz_prod_info, on='product_id', how='left')[
 36 |         ['product_id', 'category_idx', 'img_idx', 'num_imgs', 'train']]
 37 |     del images_df
 38 |     if shuffle:
 39 |         np.random.seed(shuffle)
 40 |         perm = np.random.permutation(cat_idxs.shape[0])
 41 |         cat_idxs = cat_idxs.reindex(perm)
 42 |         cat_idxs.reset_index(drop=True, inplace=True)
 43 |     cat_idxs_smpl = cat_idxs[cat_idxs.product_id.isin(sample_prod_info.product_id)]
 44 |     idxs = cat_idxs_smpl.index.values
 45 |     train_idxs = idxs[cat_idxs_smpl['train']]
 46 |     valid_idxs = idxs[~cat_idxs_smpl['train']]
 47 |     num_classes = np.unique(cat_idxs_smpl['category_idx']).size
 48 | 
 49 |     if use_img_idx:
 50 |         side_input = form_side_input(cat_idxs)
 51 |     else:
 52 |         side_input = None
 53 | 
 54 |     train_it = BcolzIterator(bcolz_root=bcolz_root, x_idxs=train_idxs,
 55 |                              side_input=side_input,
 56 |                              y=cat_idxs_smpl['category_idx'].loc[train_idxs].as_matrix(),
 57 |                              num_classes=num_classes, seed=batch_seed, batch_size=batch_size, shuffle=True)
 58 |     valid_it = BcolzIterator(bcolz_root=bcolz_root, x_idxs=valid_idxs,
 59 |                              side_input=side_input,
 60 |                              y=cat_idxs_smpl['category_idx'].loc[valid_idxs].as_matrix(),
 61 |                              num_classes=num_classes, batch_size=batch_size, shuffle=False)
 62 |     return train_it, valid_it, num_classes
 63 | 
 64 | 
 65 | def fit_model(train_it, valid_it, num_classes, models_dir, lr=0.001, batch_size=64, epochs=1, mode=0, seed=125):
 66 |     model_file = os.path.join(models_dir, LOAD_MODEL)
 67 |     if os.path.exists(model_file):
 68 |         model = load_model(model_file)
 69 |     else:
 70 |         if mode == 0:
 71 |             inp = Input((2048,))
 72 |             x = Dense(num_classes, activation='softmax')(inp)
 73 |             model = Model(inp, x)
 74 |         elif mode == 1:
 75 |             inp = Input((2048,))
 76 |             x = Dense(4096, activation='relu')(inp)
 77 |             x = BatchNormalization(axis=-1)(x)
 78 |             x = Dense(4096, activation='relu')(x)
 79 |             x = BatchNormalization(axis=-1)(x)
 80 |             x = Dense(num_classes, activation='softmax')(x)
 81 |             model = Model(inp, x)
 82 |         elif mode == 2:
 83 |             inp = Input((2048,))
 84 |             x = Dense(4096, activation='relu')(inp)
 85 |             x = BatchNormalization(axis=-1)(x)
 86 |             x = Dense(num_classes, activation='softmax')(x)
 87 |             model = Model(inp, x)
 88 |         elif mode == 3:
 89 |             inp = Input((2048,))
 90 |             x = Dense(2048, activation='relu')(inp)
 91 |             x = BatchNormalization(axis=-1)(x)
 92 |             x = Dense(2048, activation='relu')(x)
 93 |             x = BatchNormalization(axis=-1)(x)
 94 |             x = Dense(num_classes, activation='softmax')(x)
 95 |             model = Model(inp, x)
 96 |         elif mode == 4:
 97 |             inp = Input((2048,))
 98 |             x = Dense(1024, activation='relu')(inp)
 99 |             x = BatchNormalization(axis=-1)(x)
100 |             x = Dense(1024, activation='relu')(x)
101 |             x = BatchNormalization(axis=-1)(x)
102 |             x = Dense(num_classes, activation='softmax')(x)
103 |             model = Model(inp, x)
104 |         elif mode == 5:
105 |             inp = Input((2048,))
106 |             x = Dense(6144, activation='relu')(inp)
107 |             x = BatchNormalization(axis=-1)(x)
108 |             x = Dense(6144, activation='relu')(x)
109 |             x = BatchNormalization(axis=-1)(x)
110 |             x = Dense(num_classes, activation='softmax')(x)
111 |             model = Model(inp, x)
112 |         elif mode == 6:
113 |             inp_vec = Input((2048,))
114 |             img_idx_inp = Input((8,))
115 |             x = concatenate([inp_vec, img_idx_inp])
116 |             x = Dense(4096, activation='relu')(x)
117 |             x = BatchNormalization(axis=-1)(x)
118 |             x = Dense(4096, activation='relu')(x)
119 |             x = BatchNormalization(axis=-1)(x)
120 |             x = Dense(num_classes, activation='softmax')(x)
121 |             model = Model([inp_vec, img_idx_inp], x)
122 |         elif mode == 7:
123 |             inp_vec = Input((2048,))
124 |             img_idx_inp = Input((8,))
125 |             x = concatenate([inp_vec, img_idx_inp])
126 |             x = Dense(2048, activation='relu')(x)
127 |             x = BatchNormalization(axis=-1)(x)
128 |             x = Dense(2048, activation='relu')(x)
129 |             x = BatchNormalization(axis=-1)(x)
130 |             x = Dense(num_classes, activation='softmax')(x)
131 |             model = Model([inp_vec, img_idx_inp], x)
132 |         elif mode == 8:
133 |             inp_vec = Input((2048,))
134 |             img_idx_inp = Input((8,))
135 |             x = concatenate([inp_vec, img_idx_inp])
136 |             x = Dense(6144, activation='relu')(x)
137 |             x = BatchNormalization(axis=-1)(x)
138 |             x = Dense(6144, activation='relu')(x)
139 |             x = BatchNormalization(axis=-1)(x)
140 |             x = Dense(num_classes, activation='softmax')(x)
141 |             model = Model([inp_vec, img_idx_inp], x)
142 | 
143 |     model.compile(optimizer=Adam(lr=lr), loss='sparse_categorical_crossentropy',
144 |                   metrics=['sparse_categorical_accuracy'])
145 | 
146 |     np.random.seed(seed)
147 |     checkpointer = ModelCheckpoint(filepath=os.path.join(models_dir, SNAPSHOT_MODEL))
148 |     csv_logger = CSVLogger(os.path.join(models_dir, LOG_FILE), append=True)
149 |     model.fit_generator(train_it,
150 |                         steps_per_epoch=train_it.samples / batch_size,
151 |                         validation_data=valid_it,
152 |                         validation_steps=valid_it.samples / batch_size,
153 |                         epochs=epochs,
154 |                         callbacks=[checkpointer, csv_logger])
155 | 
156 | 
def predict(bcolz_root, prod_info, sample_prod_info, models_dir, only_first_image, batch_size=200, shuffle=None,
            top_k=10, use_img_idx=False):
    """Predict the top_k categories for every selected image.

    Images are predicted in chunks of at most MAX_PREDICTIONS_AT_TIME // batch_size
    batches. For each image the top_k categories (in no particular order) and their
    probabilities are emitted.

    Args:
        bcolz_root: Root path of the bcolz array, forwarded to BcolzIterator.
        prod_info: Product info frame used to enumerate images.
        sample_prod_info: Optional frame whose 'product_id' values restrict the
            images to predict; None means all images.
        models_dir: Directory containing LOAD_MODEL.
        only_first_image: Forwarded to create_images_df().
        batch_size: Prediction batch size.
        shuffle: Seed for the row permutation applied before filtering; a falsy
            value (None or 0) disables it.
        top_k: Number of categories kept per image.
        use_img_idx: When true, a side input built by form_side_input() is given
            to the iterator.

    Returns:
        DataFrame with columns product_id, img_idx, category_idx, prob. It is empty
        (with these columns) when there is nothing to predict.

    Raises:
        ValueError: If the saved model does not exist.
        RuntimeError: If the model returns no predictions for a non-empty chunk.
    """
    model_file = os.path.join(models_dir, LOAD_MODEL)
    if not os.path.exists(model_file):
        raise ValueError("Model doesn't exist")
    model = load_model(model_file)

    images_df = create_images_df(prod_info, only_first_image)
    images_df = images_df.merge(prod_info, on='product_id', how='left')[
        ['product_id',  'img_idx', 'num_imgs']]
    if shuffle:
        np.random.seed(shuffle)
        perm = np.random.permutation(images_df.shape[0])
        images_df = images_df.reindex(perm)
        images_df.reset_index(drop=True, inplace=True)
    if sample_prod_info is not None:
        images_df_smpl = images_df[images_df.product_id.isin(sample_prod_info.product_id)]
    else:
        images_df_smpl = images_df

    idxs = images_df_smpl.index.values
    # Pull the columns out once; a DataFrame.iloc row lookup per prediction is needlessly slow.
    product_ids = images_df_smpl['product_id'].values
    img_idxs = images_df_smpl['img_idx'].values
    total = images_df_smpl.shape[0]
    columns = ['product_id', 'img_idx', 'category_idx', 'prob']
    dfs = []
    steps = MAX_PREDICTIONS_AT_TIME // batch_size
    offset = 0
    while offset < total:
        # NOTE(review): the side input is sliced from the full images_df by `offset`, while
        # x_idxs holds index labels of the sampled rows; train_data() passes the unsliced
        # matrix instead. Whether these stay aligned for offset > 0 or for a sampled subset
        # depends on how BcolzIterator indexes side_input - confirm against that class.
        side_input = form_side_input(images_df[offset:]) if use_img_idx else None
        it = BcolzIterator(bcolz_root=bcolz_root,
                           x_idxs=idxs[offset:],
                           side_input=side_input,
                           batch_size=batch_size,
                           shuffle=False)
        preds = model.predict_generator(it, min(steps, (total - offset) / batch_size),
                                        verbose=1, max_queue_size=5)
        top_k_preds = np.argpartition(preds, -top_k)[:, -top_k:]
        if top_k_preds.shape[0] == 0:
            # Without progress the loop would never terminate.
            raise RuntimeError('Model returned no predictions at offset {}'.format(offset))
        chunk = []
        for i, row_top_k in enumerate(top_k_preds):
            product_id = product_ids[offset + i]
            img_idx = img_idxs[offset + i]
            for category in row_top_k:
                chunk.append((product_id, img_idx, category, preds[i, category]))
        dfs.append(pd.DataFrame(chunk, columns=columns))
        offset += top_k_preds.shape[0]
        del top_k_preds
        del preds
        del chunk
    if not dfs:
        # pd.concat raises on an empty list; return an empty frame with the usual schema.
        return pd.DataFrame(columns=columns)
    return pd.concat(dfs)
202 | 
203 | 
if __name__ == '__main__':
    # Command-line entry point. The first of --fit / --predict / --predict_valid that is set decides
    # which action runs; the remaining options configure that action.
    parser = argparse.ArgumentParser()
    parser.add_argument('--fit', action='store_true', dest='is_fit')
    parser.add_argument('--predict', action='store_true', dest='is_predict')
    parser.add_argument('--predict_valid', action='store_true', dest='is_predict_valid')
    parser.add_argument('--bcolz_root', required=True, help='VGG16 vecs bcolz root path')
    parser.add_argument('--bcolz_prod_info_csv', required=True,
                        help='Path to prod info csv with which VGG16 were generated')
    parser.add_argument('--sample_prod_info_csv', required=True, help='Path to sample prod info csv')
    parser.add_argument('--category_idx_csv', required=True, help='Path to categories to index mapping csv')
    parser.add_argument('--train_split_csv', required=True, help='Train split csv')
    parser.add_argument('--models_dir', required=True, help='Output directory for models snapshots')
    parser.add_argument('--lr', type=float, default=0.001, required=False, help='Learning rate')
    parser.add_argument('--batch_size', type=int, default=64, required=False, help='Batch size')
    parser.add_argument('--epochs', type=int, default=1, required=False, help='Number of epochs')
    parser.add_argument('--only_first_image', dest='only_first_image', action='store_true',
                        help="Include only first image from each product")
    parser.add_argument('--shuffle', type=int, default=None, required=False,
                        help='If products should be shuffled, provide seed')
    parser.set_defaults(only_first_image=False)
    parser.add_argument('--mode', type=int, default=0, required=False, help='Mode')
    parser.add_argument('--batch_seed', type=int, default=123, required=False, help='Batch seed')
    parser.add_argument('--use_img_idx', action='store_true', dest='use_img_idx')
    parser.set_defaults(use_img_idx=False)

    args = parser.parse_args()
    # makedirs(exist_ok=True) also creates missing parent directories and tolerates a directory that
    # already exists, which a separate isdir() check followed by mkdir() does not guarantee.
    os.makedirs(args.models_dir, exist_ok=True)

    bcolz_prod_info = pd.read_csv(args.bcolz_prod_info_csv)
    sample_prod_info = pd.read_csv(args.sample_prod_info_csv)
    train_split = pd.read_csv(args.train_split_csv)
    category_idx = pd.read_csv(args.category_idx_csv)

    if args.is_fit:
        train_it, valid_it, num_classes = train_data(args.bcolz_root, bcolz_prod_info, sample_prod_info,
                                                     train_split,
                                                     category_idx,
                                                     args.only_first_image,
                                                     args.batch_size, args.shuffle,
                                                     args.batch_seed,
                                                     args.use_img_idx)
        fit_model(train_it, valid_it, num_classes, args.models_dir, args.lr, args.batch_size, args.epochs, args.mode,
                  args.batch_seed)
    elif args.is_predict:
        # use_img_idx is passed by keyword, exactly like in the --predict_valid branch, so it cannot end
        # up bound to a different positional parameter of predict().
        out_df = predict(args.bcolz_root, bcolz_prod_info, sample_prod_info, args.models_dir, args.only_first_image,
                         use_img_idx=args.use_img_idx)
        out_df.to_csv(os.path.join(args.models_dir, PREDICTIONS_FILE), index=False)
    elif args.is_predict_valid:
        # Restrict prediction to the products flagged as non-train in the split file.
        only_valids = bcolz_prod_info[
            bcolz_prod_info.product_id.isin(train_split[train_split.train == False].product_id)]
        out_df = predict(args.bcolz_root, bcolz_prod_info, only_valids, args.models_dir, args.only_first_image,
                         shuffle=args.shuffle, use_img_idx=args.use_img_idx)
        out_df.to_csv(os.path.join(args.models_dir, VALID_PREDICTIONS_FILE), index=False)
258 | 


--------------------------------------------------------------------------------
/src/model/tune_vgg16_vecs.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import argparse
  3 | import pandas as pd
  4 | import numpy as np
  5 | from keras.models import Model
  6 | from keras.models import load_model
  7 | from keras.layers import Dense
  8 | from keras.layers import Flatten
  9 | from keras.layers import Input
 10 | from keras.layers import Dropout
 11 | from keras.layers.normalization import BatchNormalization
 12 | from keras.optimizers import Adam
 13 | from keras.optimizers import SGD
 14 | from keras.callbacks import ModelCheckpoint
 15 | from keras.callbacks import CSVLogger
 16 | from ..data.category_idx import map_categories
 17 | from .bcolz_iterator import BcolzIterator
 18 | from .vgg16_vecs import create_images_df
 19 | 
# File (inside models_dir) from which fit_model() and predict() load an existing model.
LOAD_MODEL = 'model.h5'
# File (inside models_dir) written by the ModelCheckpoint callback; it is the same name as
# LOAD_MODEL, so a later run picks up the latest snapshot.
SNAPSHOT_MODEL = 'model.h5'
# CSVLogger output (inside models_dir); opened in append mode by fit_model().
LOG_FILE = 'training.log'
# Output csv of the --predict action.
PREDICTIONS_FILE = 'predictions.csv'
# Output csv of the --predict_valid action.
VALID_PREDICTIONS_FILE = 'valid_predictions.csv'
# Upper bound on the number of images predicted per chunk in predict()
# (converted there to a number of batches via integer division by batch_size).
MAX_PREDICTIONS_AT_TIME = 50000
 26 | 
 27 | 
 28 | def train_data(bcolz_root, bcolz_prod_info, sample_prod_info, train_split, category_idx, only_first_image, batch_size,
 29 |                shuffle=None, batch_seed=123):
 30 |     images_df = create_images_df(bcolz_prod_info, only_first_image)
 31 |     bcolz_prod_info['category_idx'] = map_categories(category_idx, bcolz_prod_info['category_id'])
 32 |     bcolz_prod_info = bcolz_prod_info.merge(train_split, on='product_id', how='left')
 33 |     cat_idxs = images_df.merge(bcolz_prod_info, on='product_id', how='left')[['product_id', 'category_idx', 'train']]
 34 |     if shuffle:
 35 |         np.random.seed(shuffle)
 36 |         perm = np.random.permutation(cat_idxs.shape[0])
 37 |         cat_idxs = cat_idxs.reindex(perm)
 38 |         cat_idxs.reset_index(drop=True, inplace=True)
 39 |     cat_idxs = cat_idxs[cat_idxs.product_id.isin(sample_prod_info.product_id)]
 40 |     idxs = cat_idxs.index.values
 41 |     train_idxs = idxs[cat_idxs['train']]
 42 |     valid_idxs = idxs[~cat_idxs['train']]
 43 |     num_classes = np.unique(cat_idxs['category_idx']).size
 44 | 
 45 |     train_it = BcolzIterator(bcolz_root=bcolz_root, x_idxs=train_idxs,
 46 |                              y=cat_idxs['category_idx'].loc[train_idxs].as_matrix(),
 47 |                              num_classes=num_classes, seed=batch_seed, batch_size=batch_size, shuffle=True)
 48 |     valid_it = BcolzIterator(bcolz_root=bcolz_root, x_idxs=valid_idxs,
 49 |                              y=cat_idxs['category_idx'].loc[valid_idxs].as_matrix(),
 50 |                              num_classes=num_classes, batch_size=batch_size, shuffle=False)
 51 |     return train_it, valid_it, num_classes
 52 | 
 53 | 
 54 | def fit_model(train_it, valid_it, num_classes, models_dir, lr=0.001, batch_size=64, epochs=1, mode=0):
 55 |     model_file = os.path.join(models_dir, LOAD_MODEL)
 56 |     if os.path.exists(model_file):
 57 |         model = load_model(model_file)
 58 |     else:
 59 |         if mode == 0:
 60 |             inp = Input((512, 2, 2))
 61 |             x = Flatten()(inp)
 62 |             x = Dense(1024, activation='relu')(x)
 63 |             x = BatchNormalization(axis=-1)(x)
 64 |             x = Dense(num_classes, activation='softmax')(x)
 65 |             model = Model(inp, x)
 66 |         elif mode == 1:
 67 |             inp = Input((512, 2, 2))
 68 |             x = Flatten()(inp)
 69 |             x = Dense(2048, activation='relu')(x)
 70 |             x = BatchNormalization(axis=-1)(x)
 71 |             x = Dense(num_classes, activation='softmax')(x)
 72 |             model = Model(inp, x)
 73 |         elif mode == 2:
 74 |             inp = Input((512, 2, 2))
 75 |             x = Flatten()(inp)
 76 |             x = Dense(512, activation='relu')(x)
 77 |             x = BatchNormalization(axis=-1)(x)
 78 |             x = Dense(1024, activation='relu')(x)
 79 |             x = BatchNormalization(axis=-1)(x)
 80 |             x = Dense(num_classes, activation='softmax')(x)
 81 |             model = Model(inp, x)
 82 |         elif mode == 3:
 83 |             inp = Input((512, 2, 2))
 84 |             x = Flatten()(inp)
 85 |             x = Dense(num_classes, activation='softmax')(x)
 86 |             model = Model(inp, x)
 87 |         elif mode == 4:
 88 |             inp = Input((512, 2, 2))
 89 |             x = Flatten()(inp)
 90 |             x = BatchNormalization(axis=-1)(x)
 91 |             x = Dense(num_classes, activation='softmax')(x)
 92 |             model = Model(inp, x)
 93 |         elif mode == 5:
 94 |             inp = Input((512, 2, 2))
 95 |             x = Flatten()(inp)
 96 |             x = Dense(512, activation='relu')(x)
 97 |             x = BatchNormalization(axis=-1)(x)
 98 |             x = Dense(num_classes, activation='softmax')(x)
 99 |             model = Model(inp, x)
100 |         elif mode == 6:
101 |             inp = Input((512, 2, 2))
102 |             x = Flatten()(inp)
103 |             x = Dense(512, activation='relu')(x)
104 |             x = BatchNormalization(axis=-1)(x)
105 |             x = Dense(1024, activation='relu')(x)
106 |             x = BatchNormalization(axis=-1)(x)
107 |             x = Dense(num_classes, activation='softmax')(x)
108 |             model = Model(inp, x)
109 |         elif mode == 7:
110 |             inp = Input((512, 2, 2))
111 |             x = Flatten()(inp)
112 |             x = Dense(1024, activation='relu')(x)
113 |             x = BatchNormalization(axis=-1)(x)
114 |             x = Dense(2048, activation='relu')(x)
115 |             x = BatchNormalization(axis=-1)(x)
116 |             x = Dense(num_classes, activation='softmax')(x)
117 |             model = Model(inp, x)
118 |         elif mode == 8:
119 |             inp = Input((512, 2, 2))
120 |             x = Flatten()(inp)
121 |             x = Dense(512, activation='relu')(x)
122 |             x = BatchNormalization(axis=-1)(x)
123 |             x = Dense(512, activation='relu')(x)
124 |             x = BatchNormalization(axis=-1)(x)
125 |             x = Dense(1024, activation='relu')(x)
126 |             x = BatchNormalization(axis=-1)(x)
127 |             x = Dense(num_classes, activation='softmax')(x)
128 |             model = Model(inp, x)
129 |         elif mode == 9:
130 |             inp = Input((512, 2, 2))
131 |             x = Flatten()(inp)
132 |             x = BatchNormalization(axis=-1)(x)
133 |             x = Dense(2048, activation='relu')(x)
134 |             x = BatchNormalization(axis=-1)(x)
135 |             x = Dense(num_classes, activation='softmax')(x)
136 |             model = Model(inp, x)
137 |         elif mode == 10:
138 |             inp = Input((512, 2, 2))
139 |             x = Flatten()(inp)
140 |             x = Dense(4096, activation='relu')(x)
141 |             x = BatchNormalization(axis=-1)(x)
142 |             x = Dense(num_classes, activation='softmax')(x)
143 |             model = Model(inp, x)
144 |         elif mode == 11:
145 |             inp = Input((512, 2, 2))
146 |             x = Flatten()(inp)
147 |             x = Dense(4096, activation='relu')(x)
148 |             x = BatchNormalization(axis=-1)(x)
149 |             x = Dropout(0.5)(x)
150 |             x = Dense(num_classes, activation='softmax')(x)
151 |             model = Model(inp, x)
152 |         elif mode == 12:
153 |             inp = Input((512, 2, 2))
154 |             x = Flatten()(inp)
155 |             x = Dense(4096, activation='relu')(x)
156 |             x = BatchNormalization(axis=-1)(x)
157 |             x = Dense(4096, activation='relu')(x)
158 |             x = BatchNormalization(axis=-1)(x)
159 |             x = Dense(num_classes, activation='softmax')(x)
160 |             model = Model(inp, x)
161 |         elif mode == 13:
162 |             inp = Input((512, 2, 2))
163 |             x = Flatten()(inp)
164 |             x = Dense(4096, activation='relu')(x)
165 |             x = BatchNormalization(axis=-1)(x)
166 |             x = Dropout(0.5)(x)
167 |             x = Dense(4096, activation='relu')(x)
168 |             x = BatchNormalization(axis=-1)(x)
169 |             x = Dropout(0.5)(x)
170 |             x = Dense(num_classes, activation='softmax')(x)
171 |             model = Model(inp, x)
172 |         elif mode == 14:
173 |             inp = Input((512, 2, 2))
174 |             x = Flatten()(inp)
175 |             x = Dense(4096, activation='relu')(x)
176 |             x = BatchNormalization(axis=-1)(x)
177 |             x = Dropout(0.2)(x)
178 |             x = Dense(4096, activation='relu')(x)
179 |             x = BatchNormalization(axis=-1)(x)
180 |             x = Dropout(0.4)(x)
181 |             x = Dense(num_classes, activation='softmax')(x)
182 |             model = Model(inp, x)
183 | 
184 |     if mode == 6:
185 |         model.compile(optimizer=SGD(lr=lr), loss='sparse_categorical_crossentropy',
186 |                       metrics=['sparse_categorical_accuracy'])
187 |     else:
188 |         model.compile(optimizer=Adam(lr=lr), loss='sparse_categorical_crossentropy',
189 |                       metrics=['sparse_categorical_accuracy'])
190 | 
191 |     np.random.seed(125)
192 |     checkpointer = ModelCheckpoint(filepath=os.path.join(models_dir, SNAPSHOT_MODEL))
193 |     csv_logger = CSVLogger(os.path.join(models_dir, LOG_FILE), append=True)
194 |     model.fit_generator(train_it,
195 |                         steps_per_epoch=train_it.samples / batch_size,
196 |                         validation_data=valid_it,
197 |                         validation_steps=valid_it.samples / batch_size,
198 |                         epochs=epochs,
199 |                         callbacks=[checkpointer, csv_logger])
200 | 
201 | 
def predict(bcolz_root, prod_info, sample_prod_info, models_dir, only_first_image, batch_size=200, shuffle=None,
            top_k=10):
    """Predict the ``top_k`` most probable categories for every selected image.

    :param bcolz_root: bcolz root path, forwarded to ``BcolzIterator``.
    :param prod_info: product info frame from which the per-image rows are derived.
    :param sample_prod_info: optional frame; when given, only images whose ``product_id`` occurs in it
        are predicted.
    :param models_dir: directory containing the ``LOAD_MODEL`` snapshot.
    :param only_first_image: forwarded to ``create_images_df``.
    :param batch_size: prediction batch size.
    :param shuffle: optional seed; when truthy the image rows are permuted with it before filtering.
    :param top_k: number of categories kept per image.
    :return: DataFrame with columns ``product_id``, ``img_idx``, ``category_idx``, ``prob`` holding
        ``top_k`` rows per image. The rows of one image are not sorted by probability
        (``np.argpartition`` only partitions), and the index restarts at 0 for every chunk.
        An empty frame with these columns is returned when no image is selected.
    :raises ValueError: when the model snapshot does not exist.
    """
    model_file = os.path.join(models_dir, LOAD_MODEL)
    if not os.path.exists(model_file):
        raise ValueError("Model doesn't exist")
    model = load_model(model_file)
    images_df = create_images_df(prod_info, only_first_image)
    if shuffle:
        np.random.seed(shuffle)
        perm = np.random.permutation(images_df.shape[0])
        images_df = images_df.reindex(perm)
        images_df.reset_index(drop=True, inplace=True)
    if sample_prod_info is not None:
        images_df = images_df[images_df.product_id.isin(sample_prod_info.product_id)]

    columns = ['product_id', 'img_idx', 'category_idx', 'prob']
    total = images_df.shape[0]
    if total == 0:
        # pd.concat() raises on an empty list, so answer the "nothing to predict" case explicitly.
        return pd.DataFrame(columns=columns)

    idxs = images_df.index.values
    # Pull the two columns out once; a positional row lookup per image is far more expensive.
    product_ids = images_df['product_id'].values
    img_idxs = images_df['img_idx'].values
    dfs = []
    steps = MAX_PREDICTIONS_AT_TIME // batch_size
    offset = 0
    while offset < total:
        it = BcolzIterator(bcolz_root=bcolz_root, x_idxs=idxs[offset:],
                           batch_size=batch_size,
                           shuffle=False)
        # Integer number of batches needed for the remaining images, capped at one chunk.
        remaining_steps = int(np.ceil((total - offset) / batch_size))
        preds = model.predict_generator(it, min(steps, remaining_steps),
                                        verbose=1, max_queue_size=5)
        top_k_preds = np.argpartition(preds, -top_k)[:, -top_k:]
        chunk = []
        for i in range(top_k_preds.shape[0]):
            product_id = product_ids[offset + i]
            img_idx = img_idxs[offset + i]
            for category in top_k_preds[i]:
                chunk.append((product_id, img_idx, category, preds[i, category]))
        dfs.append(pd.DataFrame(chunk, columns=columns))
        offset += top_k_preds.shape[0]
        # Release the large per-chunk arrays before the next chunk is predicted.
        del top_k_preds
        del preds
        del chunk
    return pd.concat(dfs)
241 | 
242 | 
if __name__ == '__main__':
    # Command-line entry point. The first of --fit / --predict / --predict_valid that is set decides
    # which action runs; the remaining options configure that action.
    parser = argparse.ArgumentParser()
    parser.add_argument('--fit', action='store_true', dest='is_fit')
    parser.add_argument('--predict', action='store_true', dest='is_predict')
    parser.add_argument('--predict_valid', action='store_true', dest='is_predict_valid')
    parser.add_argument('--bcolz_root', required=True, help='VGG16 vecs bcolz root path')
    parser.add_argument('--bcolz_prod_info_csv', required=True,
                        help='Path to prod info csv with which VGG16 were generated')
    parser.add_argument('--sample_prod_info_csv', required=True, help='Path to sample prod info csv')
    parser.add_argument('--category_idx_csv', required=True, help='Path to categories to index mapping csv')
    parser.add_argument('--train_split_csv', required=True, help='Train split csv')
    parser.add_argument('--models_dir', required=True, help='Output directory for models snapshots')
    parser.add_argument('--lr', type=float, default=0.001, required=False, help='Learning rate')
    parser.add_argument('--batch_size', type=int, default=64, required=False, help='Batch size')
    parser.add_argument('--epochs', type=int, default=1, required=False, help='Number of epochs')
    parser.add_argument('--only_first_image', dest='only_first_image', action='store_true',
                        help="Include only first image from each product")
    parser.add_argument('--shuffle', type=int, default=None, required=False,
                        help='If products should be shuffled, provide seed')
    parser.set_defaults(only_first_image=False)
    parser.add_argument('--mode', type=int, default=0, required=False, help='Mode')
    parser.add_argument('--batch_seed', type=int, default=123, required=False, help='Batch seed')

    args = parser.parse_args()
    # makedirs(exist_ok=True) also creates missing parent directories and tolerates a directory that
    # already exists, which a separate isdir() check followed by mkdir() does not guarantee.
    os.makedirs(args.models_dir, exist_ok=True)

    bcolz_prod_info = pd.read_csv(args.bcolz_prod_info_csv)
    sample_prod_info = pd.read_csv(args.sample_prod_info_csv)
    train_split = pd.read_csv(args.train_split_csv)
    category_idx = pd.read_csv(args.category_idx_csv)

    if args.is_fit:
        train_it, valid_it, num_classes = train_data(args.bcolz_root, bcolz_prod_info, sample_prod_info,
                                                     train_split,
                                                     category_idx,
                                                     args.only_first_image,
                                                     args.batch_size, args.shuffle,
                                                     args.batch_seed)
        fit_model(train_it, valid_it, num_classes, args.models_dir, args.lr, args.batch_size, args.epochs, args.mode)
    elif args.is_predict:
        out_df = predict(args.bcolz_root, bcolz_prod_info, sample_prod_info, args.models_dir, args.only_first_image)
        out_df.to_csv(os.path.join(args.models_dir, PREDICTIONS_FILE), index=False)
    elif args.is_predict_valid:
        # Restrict prediction to the products flagged as non-train in the split file.
        only_valids = bcolz_prod_info[bcolz_prod_info.product_id.isin(train_split[train_split.train == False].product_id)]
        out_df = predict(args.bcolz_root, bcolz_prod_info, only_valids, args.models_dir, args.only_first_image,
                         shuffle=args.shuffle)
        out_df.to_csv(os.path.join(args.models_dir, VALID_PREDICTIONS_FILE), index=False)
291 | 


--------------------------------------------------------------------------------
/src/model/vgg16_vecs.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | import argparse
 4 | import os
 5 | import bcolz
 6 | from tqdm import tqdm
 7 | from .bson_iterator import BSONIterator
 8 | from keras.applications.vgg16 import VGG16
 9 | from keras.applications.vgg16 import preprocess_input
10 | from keras.models import Model
11 | from keras.layers import AveragePooling2D
12 | from keras.preprocessing.image import ImageDataGenerator
13 | import threading
14 | 
15 | 
def compute_vgg16_vecs(bson_path, images_df, vecs_output_dir, save_step=100000):
    """Compute average-pooled VGG16 feature vectors for the images and store them in a bcolz array.

    The images are processed in chunks of ``save_step`` rows and the array is flushed to disk after
    every chunk. If ``vecs_output_dir`` already exists, the stored array is opened and processing
    resumes at the row equal to its current length.

    :param bson_path: path of the bson file the images are read from.
    :param images_df: one row per image, in the order the vectors are to be stored.
    :param vecs_output_dir: bcolz root directory of the output array.
    :param save_step: number of images per chunk, i.e. between two flushes.
    """
    vgg_model = VGG16(include_top=False, input_shape=(3, 180, 180))
    # Build the pooled feature extractor once: it does not depend on the chunk, and recreating it in
    # the loop would add a new pooling layer and model on every iteration.
    model = Model(vgg_model.input, AveragePooling2D()(vgg_model.output))

    if os.path.isdir(vecs_output_dir):
        vecs = bcolz.open(rootdir=vecs_output_dir)
        offset = vecs.shape[0]
    else:
        vecs = None
        offset = 0

    lock = threading.Lock()
    total = images_df.shape[0]

    with open(bson_path, "rb") as train_bson_file, \
            tqdm(total=total, initial=offset) as pbar:
        for i in range(offset, total, save_step):
            gen = ImageDataGenerator(preprocessing_function=preprocess_input)
            batches = BSONIterator(bson_file=train_bson_file,
                                   images_df=images_df[i:(i + save_step)],
                                   num_class=0,  # doesn't matter here
                                   image_data_generator=gen,
                                   lock=lock,
                                   target_size=(180, 180),
                                   batch_size=220,
                                   shuffle=False,
                                   with_labels=False)
            # Integer number of batches; rounding up includes the final partial batch.
            steps = int(np.ceil(batches.samples / batches.batch_size))
            out_vecs = model.predict_generator(batches, steps=steps, verbose=1)
            # Explicit None check: "no array yet" must not be confused with the truthiness of an
            # existing array.
            if vecs is None:
                vecs = bcolz.carray(out_vecs, rootdir=vecs_output_dir, mode='w')
            else:
                vecs.append(out_vecs)
            vecs.flush()
            # Advance by what was actually produced so the last, shorter chunk does not overshoot.
            pbar.update(out_vecs.shape[0])
53 | 
54 | 
def create_images_df(product_info, only_first_image=False):
    """Expand a per-product frame into a frame with one row per image.

    Every product contributes ``num_imgs`` rows numbered ``img_idx = 0 .. num_imgs - 1``; the product's
    ``product_id``, ``offset`` and ``length`` are repeated on each of them.

    :param product_info: frame with columns ``product_id``, ``num_imgs``, ``offset`` and ``length``.
    :param only_first_image: keep only the rows with ``img_idx == 0`` and renumber the index.
    :return: DataFrame with columns ``product_id``, ``img_idx``, ``offset``, ``length``.
    """
    records = [
        [product.product_id, img_idx, product.offset, product.length]
        for product in product_info.itertuples()
        for img_idx in range(product.num_imgs)
    ]
    frame = pd.DataFrame(records, columns=['product_id', 'img_idx', 'offset', 'length'])
    if not only_first_image:
        return frame
    first_images = frame[frame['img_idx'] == 0]
    return first_images.reset_index(drop=True)
66 | 
67 | 
if __name__ == '__main__':
    # Command-line entry point: expand the product info csv to one row per image, optionally permute
    # the rows with the given seed, and compute the VGG16 vectors for them.
    cli = argparse.ArgumentParser()
    cli.add_argument('--bson', required=True, help='Path to bson with products')
    cli.add_argument('--prod_info_csv', required=True, help='Path to prod info csv')
    cli.add_argument('--output_dir', required=True, help='Output directory for vectors')
    cli.add_argument('--save_step', type=int, required=True, help='Save computed vectors to disk each N steps')
    cli.add_argument('--only_first_image', dest='only_first_image', action='store_true',
                     help="Include only first image from each product")
    cli.add_argument('--shuffle', type=int, default=None, required=False,
                     help='If products should be shuffled, provide seed')
    cli.set_defaults(only_first_image=False)
    opts = cli.parse_args()

    images_df = create_images_df(pd.read_csv(opts.prod_info_csv), opts.only_first_image)
    seed = opts.shuffle
    if seed:
        np.random.seed(seed)
        order = np.random.permutation(images_df.shape[0])
        images_df = images_df.reindex(order)

    compute_vgg16_vecs(opts.bson, images_df, opts.output_dir, opts.save_step)
90 | 


--------------------------------------------------------------------------------
/test_environment.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | 
 3 | REQUIRED_PYTHON = "python3"
 4 | 
 5 | 
 6 | def main():
 7 |     system_major = sys.version_info.major
 8 |     if REQUIRED_PYTHON == "python":
 9 |         required_major = 2
10 |     elif REQUIRED_PYTHON == "python3":
11 |         required_major = 3
12 |     else:
13 |         raise ValueError("Unrecognized python interpreter: {}".format(
14 |             REQUIRED_PYTHON))
15 | 
16 |     if system_major != required_major:
17 |         raise TypeError(
18 |             "This project requires Python {}. Found: Python {}".format(
19 |                 required_major, sys.version))
20 |     else:
21 |         print(">>> Development environment passes all tests!")
22 | 
23 | 
24 | if __name__ == '__main__':
25 |     main()
26 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 120
3 | max-complexity = 10
4 | 


--------------------------------------------------------------------------------