├── script.py
├── README.md
├── feature_extraction.py
├── preprocessing.py
└── summary.ipynb
/script.py:
--------------------------------------------------------------------------------
1 | from preprocessing import normalize
2 | from feature_extraction import vectorize
3 | import pandas as pd
4 | from gensim.models import FastText
5 | import pickle
6 |
# Load the held-out receipts and the fitted classification pipeline.
test = pd.read_parquet('data/task1_test_for_user.parquet')
# NOTE(review): unpickling executes arbitrary code; 'clf_task1' must be a
# trusted artifact. The context manager closes the handle deterministically.
with open('clf_task1', 'rb') as model_file:
    pipe = pickle.load(model_file)

# Clean the item descriptions, embed them and predict one category per row.
test['item_name'] = normalize(test['item_name'])
X_wv = vectorize(test['item_name'])
pred = pipe.predict(X_wv)

# Pair ids with predictions positionally. Assigning the `id` Series to a
# frame built from `pred` would align on index labels and silently produce
# wrong or missing ids whenever `test` does not carry a default RangeIndex.
res = pd.DataFrame({'id': test['id'].to_numpy(), 'pred': pred})

res.to_csv('answers.csv', index=False)
18 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # GoodsClassifier
2 | Всем привет! В этом проекте я покажу, как я решал первую задачу соревнования [Data Fusion Contest](https://boosters.pro/championship/data_fusion/overview), в которой необходимо было классифицировать товары по данным из чека. Данные представляют собой набор размеченных и неразмеченных чеков. Каждый чек содержит такую информацию, как описание товара, его категория, цена, количество и пр. Для предсказания категории мы будем пользоваться только описанием товара (item_name). [Приятного ресёрча!](https://github.com/gorodion/GoodsClassifier/blob/master/summary.ipynb)
3 |
--------------------------------------------------------------------------------
/feature_extraction.py:
--------------------------------------------------------------------------------
1 | import gensim
2 | import numpy as np
3 | import pandas as pd
4 |
def word_averaging(wv, words):
    """Combine the vectors of ``words`` into a single normalized vector.

    Args:
        wv: Keyed-vectors object exposing ``vector_size`` and ``get_vector``.
        words: Iterable of tokens to look up in ``wv``.

    Returns:
        The element-wise sum of the token vectors after being passed through
        ``gensim.matutils.unitvec``. With no tokens the accumulator stays
        all zeros before that call.
    """
    # Start from a zero vector so an empty token list is still well defined.
    accumulated = np.zeros((wv.vector_size,))
    for token in words:
        accumulated += wv.get_vector(token)
    return gensim.matutils.unitvec(accumulated)
13 |
def word_averaging_list(wv, text_list):
    """Embed every token list in ``text_list`` and stack the results row-wise.

    Args:
        wv: Keyed-vectors object forwarded to ``word_averaging``.
        text_list: Iterable of token lists, one per document.

    Returns:
        A 2-D array whose i-th row is ``word_averaging(wv, text_list[i])``.
    """
    rows = []
    for tokens in text_list:
        rows.append(word_averaging(wv, tokens))
    return np.vstack(rows)
16 |
17 |
def vectorize(test: pd.Series):
    """Turn item-name strings into FastText-based feature vectors.

    Each string is split on whitespace, single-character tokens are dropped,
    and the remaining tokens are embedded with the FastText model stored in
    ``ft.model`` via ``word_averaging_list``.

    Args:
        test: Series of strings (each element must support ``.split()``).

    Returns:
        A 2-D ``float16`` array with one row per element of ``test``.
    """
    # Imported locally: this module only imports the top-level ``gensim``
    # package, so the bare name ``FastText`` was undefined here and the call
    # below raised NameError.
    from gensim.models import FastText

    tokens = test.apply(
        lambda text: [tok for tok in text.split() if len(tok) > 1]
    )
    # mmap='r' opens the stored model arrays read-only instead of copying them.
    model = FastText.load('ft.model', mmap='r')
    features = word_averaging_list(model.wv, tokens)
    # Down-cast to float16, as the original code did.
    return features.astype('float16')
24 |
--------------------------------------------------------------------------------
/preprocessing.py:
--------------------------------------------------------------------------------
1 | import re
2 | import pandas as pd
3 |
def make_trans():
    """Build the translation table that maps Latin letters (and 'ё') to Cyrillic.

    Returns:
        A table usable with ``str.translate``: keys are the code points of the
        source characters, values are the one-character replacement strings.
    """
    latin = 'a b c d e f g h i j k l m n o p q r s t u v w x y z ё'.split()
    cyrillic = 'а в с д е ф г н и ж к л м н о р к р с т у в в х у з е'.split()
    # Pair the letters positionally; several sources share one target
    # (e.g. both 'h' and 'n' map to 'н').
    mapping = {src: dst for src, dst in zip(latin, cyrillic)}
    return str.maketrans(mapping)
10 |
11 | def normalize(ser: pd.Series):
12 | # "СокДобрый" -> "Сок Добрый"
13 | camel_case_pat = re.compile(r'([а-яa-z])([А-ЯA-Z])')
14 | # "lmno" -> "лмно"
15 | trans_table = make_trans()
16 | # "14х15х30" -> "DxDxD"
17 | dxdxd_pat = re.compile(r'((?:\d+\s*[х\*]\s*){2}\d+)')
18 | # "1.2 15,5" -> "1p2 15p5"
19 | digit_pat = re.compile(r'(\d+)[\.,](\d+)')
20 | # "15 мл" -> "15мл"
21 | unit = 'мг|г|гр|кг|мл|л|шт'
22 | unit_pat = re.compile(fr'((?:\d+p)?\d+)\s*({unit})\b')
23 | # "ж/б ст/б" -> "жб стб"
24 | w_w_pat = re.compile(r'\b([а-я]{1,2})/([а-я]{1,2})\b')
25 | # "a b c d" -> "abcd"
26 | glue_pat = re.compile(r'(?<=(?\n",
219 | "\n",
232 | "
\n",
233 | " \n",
234 | " \n",
235 | " \n",
236 | " item_name \n",
237 | " \n",
238 | " \n",
239 | " \n",
240 | " \n",
241 | " 2437976 \n",
242 | " Сапоги школьные д/д Flois-Kids (Размер: 34) \n",
243 | " \n",
244 | " \n",
245 | " 882416 \n",
246 | " Шок.Озера Т/У 100г Молочный /малин шт \n",
247 | " \n",
248 | " \n",
249 | " 2297689 \n",
250 | " Кальмар в масле Щупальца \n",
251 | " \n",
252 | " \n",
253 | " 1742849 \n",
254 | " pellesana масло косметическое 100мл \n",
255 | " \n",
256 | " \n",
257 | " 780364 \n",
258 | " Обезжириватель Полихим 0,5л шт \n",
259 | " \n",
260 | " \n",
261 | " 1745151 \n",
262 | " 1 864 055 Обувь весна-лето-осень ABRICOT \n",
263 | " \n",
264 | " \n",
265 | " 1905276 \n",
266 | " Огурцы Луховицкие Россия 2.456кг*97.00 \n",
267 | " \n",
268 | " \n",
269 | " 1159080 \n",
270 | " Штуцер для шланга VALTEC 3/4 внутр. *20 мм \n",
271 | " \n",
272 | " \n",
273 | " 1733705 \n",
274 | " Лизун № \"Роза\" \n",
275 | " \n",
276 | " \n",
277 | " 583559 \n",
278 | " БЗМЖ Молоко пастеризов 3.7% 1400мл ПЭТ Домик в... \n",
279 | " \n",
280 | " \n",
281 | "
\n",
282 | ""
283 | ],
284 | "text/plain": [
285 | " item_name\n",
286 | "2437976 Сапоги школьные д/д Flois-Kids (Размер: 34)\n",
287 | "882416 Шок.Озера Т/У 100г Молочный /малин шт\n",
288 | "2297689 Кальмар в масле Щупальца\n",
289 | "1742849 pellesana масло косметическое 100мл\n",
290 | "780364 Обезжириватель Полихим 0,5л шт\n",
291 | "1745151 1 864 055 Обувь весна-лето-осень ABRICOT\n",
292 | "1905276 Огурцы Луховицкие Россия 2.456кг*97.00\n",
293 | "1159080 Штуцер для шланга VALTEC 3/4 внутр. *20 мм\n",
294 | "1733705 Лизун № \"Роза\"\n",
295 | "583559 БЗМЖ Молоко пастеризов 3.7% 1400мл ПЭТ Домик в..."
296 | ]
297 | },
298 | "execution_count": 6,
299 | "metadata": {},
300 | "output_type": "execute_result"
301 | }
302 | ],
303 | "source": [
304 | "df_full.sample(10)"
305 | ]
306 | },
307 | {
308 | "cell_type": "markdown",
309 | "id": "interstate-familiar",
310 | "metadata": {},
311 | "source": [
312 | "Посмотрим распределение классов"
313 | ]
314 | },
315 | {
316 | "cell_type": "code",
317 | "execution_count": 7,
318 | "id": "featured-smell",
319 | "metadata": {},
320 | "outputs": [
321 | {
322 | "data": {
323 | "text/plain": [
324 | "84 7070\n",
325 | "71 4760\n",
326 | "78 2866\n",
327 | "83 2856\n",
328 | "0 2352\n",
329 | " ... \n",
330 | "102 19\n",
331 | "101 16\n",
332 | "46 15\n",
333 | "100 14\n",
334 | "97 13\n",
335 | "Name: category_id, Length: 96, dtype: int64"
336 | ]
337 | },
338 | "execution_count": 7,
339 | "metadata": {},
340 | "output_type": "execute_result"
341 | }
342 | ],
343 | "source": [
344 | "df.category_id.value_counts()"
345 | ]
346 | },
347 | {
348 | "cell_type": "code",
349 | "execution_count": 8,
350 | "id": "adult-species",
351 | "metadata": {},
352 | "outputs": [
353 | {
354 | "data": {
355 | "image/png": "iVBORw0KGgoAAAANSUhEUgAABIoAAAERCAYAAAAUiEzSAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Il7ecAAAACXBIWXMAAAsTAAALEwEAmpwYAAA5rklEQVR4nO3de5glVXno/+8LIwoiDJcJIiBjFDUaDZIR8GgikSNXI5zEGDW/CASDiXhJ1ChGEwiKYn5RDxyVhAiIGkViYsArjnhJcgz34SooI4JAuIwMIIo39D1/1Gopamp31+ru3bOH/n6eZz9de9Xaq1etWrWq9rvrEpmJJEmSJEmStNH6roAkSZIkSZImg4EiSZIkSZIkAQaKJEmSJEmSVBgokiRJkiRJEmCgSJIkSZIkSYWBIkmSJEmSJAEGiiRJkiRJklQYKJIkaRGKiOsj4ocR8f2IuC0iPhgRm6/vekmSJGn9MlAkSdLi9duZuTmwG7ACeMt6ro8kSZLWMwNFkiQtcpl5M/A54FcBIuKwiLg6Iu6JiOsi4uXt/BFxUERcGhHfi4hvRcR+Jf0rEfGjcpbS98sZS9e3Pnd9RLwpIr4eEXdGxGkR8bDW/OeVcu+KiK9FxFM7//cjEfGTVtk3teY9NCL+LiK+U86Q+vuI2LQ1f3lEZKtuP4uIl5V5G0XEUWVZ7oiIMyNi687nlnTqcUyZ3qtTjxeW/C9rpf1Rac87I+KciNi5bz301PH7EfHTqf9V8vxxRKyOiLURcXZEPGqGspaU96+IiKsiYptWng9O054nRMSNZR1fHBG/0Zq3cUT8ZWmve8r8ncq8J0fEylK/2yLiL0v67hHxX2Xd3hIR742ITVplZkRc1vkfN7frJEmSFoaBIkmSFrnyJf8AYFVJuh14HrAFcBjwnojYreTdHfgQ8BfAUuA3getbxb0yMzcvZyr9ds+/+wNgX+CxwOMpZzFFxNOAU4GXA9sA/wCcHREPbVcVOK6UvX+n3ONLebsCjwN2AP66NX/qmGfL8vn/aM17FXAw8GzgUcCdwPt66j6tiHgI8FbgllbaQcBfAr8DLCv/92MzFLW01YYfb5X1HOAdwAuB7YEbgDMG1OtFwOuBfTPzjtasjYB3jmjPC2nacmvgo8A/t4J6rwVeTNNntgD+CLg3Ih4BfBH4PE07Pg44t3zmZ8CfA9sCzwD2Bl7R+Z+bRMTTy/SBwN0zLZskSZp/BookSVq8/i0i7gL+E/gq8HaAzPxMZn4rG18FvgBMnVFyOHBqZq7MzJ9n5s2ZeU3F/3xvZt6YmWuB42gCDgBHAP+Qmedn5s8y83Tgx8Cerc9uCvykW2BERPn8n2fm2sy8pyzLi1rZNgF+npk/66nTnwBvzsybMvPHwDHAC9pnEQ30cuB84Judst+RmVdn5n2lXruOOqtoBn9A0/aXlHq+CXhGRCyf5jP7AacA+2dm9+ycTehpT4DM/Ehm3pGZ92Xmu4CHAk8os18GvCUzv1H6yGUlAPU84NbMfFdm/igz78nM80t5F2fmeaW862kCgc/u/NtTStlT/+OUmRpEkiTNPwNFkiQtXgdn5tLM3DkzX5GZPwSIiP0j4rxy+dBdNGeObFs+sxPwrTn8zxtb0zfQnHkCsDPwunJp0l3l/+7Umg/wSGBNT5nLgM2Ai1uf/XxJn7I1zZlCfXYGPtn67NU0Z8Bs18rz3db8F3YLKGfTvAH4q56yT2h9di3NmVE7jKjLdB5F02YAZOb3gTtmKOsDNGd8dYMyME2bRMTry+Vyd5d6b8nMfWBk34iIx0fEpyPi1oj4Hk3AbNtOtk8De0XE42jOmLp4muWSJEljYqBIkiT9QrnU61+AvwO2y8ylwGdpghvQBHoeO4d/sVNr+tHAf7fKPa4ErqZem2Xmx0q9HkJzD6XLWNd3gR8CT259duoSsymP54Fn+rTdSHPGTft/P6zcu2nKtlPzgDN7yvgL4MzMvKGTfiPw
8k7Zm2bm10bUZTr/TRN4AiAiHk5zmd7NIz/RnLH1+8BxEbFjZ15vm5T7Eb2BJiC2VVnmu5m5D9wI/PKIepwEXAPskplb0FyOF5089wGfBD4BfHCaZZIkSWNkoEiSJLVtQnOZ0RrgvojYH9inNf8U4LCI2LvcBHqHiHhiRflHRsSO5WbRb+b+e/D8I/AnEbFHNB4eEQeWM3WguVfSrcBF3QIz8+fl8++JiF8CKPXat0zvBLwG+LcRdfp7mkDKziX/snJvoaEeUep33Iiy3xQRTy5lbxkRv1dRdtvHaNp+1xLQeztwfrmUa5T/yMwrgROBk0sdlkTEnwDdezW1l+c+mj6wJCL+muZeRFM+ALw1InYp6+qp5SbZnwa2j4g/i+bm4o+IiD1aZX4P+H7pL386or4n05zR9U8ztIUkSRoTA0WSJOkXyv19Xk1z1sydwEuAs1vzL6Dc4JrmLJOv0jrLZYCP0tzz6Dqay5TeVsq9CPhj4L3l/64GDgWIiD+guafNY4B7IuL7NE9pe1RE/H0p943lM+eVS5u+yP331DkH+Eqpc58TyjJ+ISLuAc4D9hiRt88WwImZuc5lXJn5SeCdwBmlXley7o2jB8nML9Jc2vYvNDfMfiwPvA/TdI6nCeIcQnOfqcOAg6YuN+w4h+bSvW/SXOr2Ix54yeC7afrHF2iCP6cAm5a+81yam5jfClwL/Fb5zOtp+tI9NEG9j9MjM6/LzBdn5l0Dl0uSJM2zyMz1XQdJkrQIRMT1wMtKwKPmc4cCyzPzmE76jsDbMvPQeaqiJEnSoucZRZIkadL9gObMla77aG4OLUmSpHniGUWSJGlBzPaMIkmSJC0cA0WSJEmSJEkCvPRMkiRJkiRJxZL1XYHpbLvttrl8+fL1XQ1JkiRJkqQHjYsvvvi7mbmsb95EB4qWL1/ORRddtL6rIUmSJEmS9KARETeMmuelZ5IkSZIkSQIMFEmSJEmSJKkwUCRJkiRJkiTAQJEkSZIkSZIKA0WSJEmSJEkCDBRJkiRJkiSpMFAkSZIkSZIkwECRJEmSJEmSihkDRRHxhIi4tPX6XkT8WURsHRErI+La8nerkj8i4sSIWB0Rl0fEbq2yDin5r42IQ8a5YJIkSZIkSaqzZKYMmfkNYFeAiNgYuBn4JHAUcG5mHh8RR5X3bwT2B3Yprz2Ak4A9ImJr4GhgBZDAxRFxdmbeOaSiy4/6TG/69ccfOOTjkiRJkiRJmkHtpWd7A9/KzBuAg4DTS/rpwMFl+iDgQ9k4D1gaEdsD+wIrM3NtCQ6tBPab6wJIkiRJkiRpftQGil4EfKxMb5eZt5TpW4HtyvQOwI2tz9xU0kalP0BEHBERF0XERWvWrKmsniRJkiRJkmZrcKAoIjYBng/8c3deZibN5WRzlpknZ+aKzFyxbNmy+ShSkiRJkiRJA9ScUbQ/cElm3lbe31YuKaP8vb2k3wzs1PrcjiVtVLokSZIkSZImQE2g6MXcf9kZwNnA1JPLDgHOaqW/tDz9bE/g7nKJ2jnAPhGxVXlC2j4lTZIkSZIkSRNgxqeeAUTEw4HnAi9vJR8PnBkRhwM3AC8s6Z8FDgBWA/cChwFk5tqIeCtwYcl3bGaunfMSSJIkSZIkaV4MChRl5g+AbTppd9A8Ba2bN4EjR5RzKnBqfTUlSZIkSZI0brVPPZMkSZIkSdKDlIEiSZIkSZIkAQaKJEmSJEmSVBgokiRJkiRJEmCgSJIkSZIkSYWBIkmSJEmSJAEGiiRJkiRJklQYKJIkSZIkSRJgoEiSJEmSJEmFgSJJkiRJkiQBBookSZIkSZJUGCiSJEmSJEkSYKBIkiRJkiRJhYEiSZIkSZIkAQaKJEmSJEmSVBgokiRJkiRJEmCgSJIkSZIkSYWBIkmSJEmSJAEGiiRJkiRJklQYKJIkSZIkSRIwMFAUEUsj4hMRcU1EXB0Rz4iIrSNiZURcW/5uVfJGRJwYEasj4vKI2K1VziEl/7URcci4FkqSJEmS
JEn1hp5RdALw+cx8IvBrwNXAUcC5mbkLcG55D7A/sEt5HQGcBBARWwNHA3sAuwNHTwWXJEmSJEmStP7NGCiKiC2B3wROAcjMn2TmXcBBwOkl2+nAwWX6IOBD2TgPWBoR2wP7Aiszc21m3gmsBPabx2WRJEmSJEnSHAw5o+gxwBrgtIhYFREfiIiHA9tl5i0lz63AdmV6B+DG1udvKmmj0h8gIo6IiIsi4qI1a9bULY0kSZIkSZJmbUigaAmwG3BSZj4N+AH3X2YGQGYmkPNRocw8OTNXZOaKZcuWzUeRkiRJkiRJGmBIoOgm4KbMPL+8/wRN4Oi2ckkZ5e/tZf7NwE6tz+9Y0kalS5IkSZIkaQLMGCjKzFuBGyPiCSVpb+DrwNnA1JPLDgHOKtNnAy8tTz/bE7i7XKJ2DrBPRGxVbmK9T0mTJEmSJEnSBFgyMN+rgH+KiE2A64DDaIJMZ0bE4cANwAtL3s8CBwCrgXtLXjJzbUS8Fbiw5Ds2M9fOy1JIkiRJkiRpzgYFijLzUmBFz6y9e/ImcOSIck4FTq2onyRJkiRJkhbIkHsUSZIkSZIkaREwUCRJkiRJkiTAQJEkSZIkSZIKA0WSJEmSJEkCDBRJkiRJkiSpMFAkSZIkSZIkwECRJEmSJEmSCgNFkiRJkiRJAgwUSZIkSZIkqTBQJEmSJEmSJMBAkSRJkiRJkgoDRZIkSZIkSQIMFEmSJEmSJKkwUCRJkiRJkiTAQJEkSZIkSZIKA0WSJEmSJEkCDBRJkiRJkiSpMFAkSZIkSZIkwECRJEmSJEmSCgNFkiRJkiRJAgYGiiLi+oi4IiIujYiLStrWEbEyIq4tf7cq6RERJ0bE6oi4PCJ2a5VzSMl/bUQcMp5FkiRJkiRJ0mzUnFH0W5m5a2auKO+PAs7NzF2Ac8t7gP2BXcrrCOAkaAJLwNHAHsDuwNFTwSVJkiRJkiStf3O59Owg4PQyfTpwcCv9Q9k4D1gaEdsD+wIrM3NtZt4JrAT2m8P/lyRJkiRJ0jwaGihK4AsRcXFEHFHStsvMW8r0rcB2ZXoH4MbWZ28qaaPSHyAijoiIiyLiojVr1gysniRJkiRJkuZqycB8z8rMmyPil4CVEXFNe2ZmZkTkfFQoM08GTgZYsWLFvJQpSZIkSZKkmQ06oygzby5/bwc+SXOPodvKJWWUv7eX7DcDO7U+vmNJG5UuSZIkSZKkCTBjoCgiHh4Rj5iaBvYBrgTOBqaeXHYIcFaZPht4aXn62Z7A3eUStXOAfSJiq3IT631KmiRJkiRJkibAkEvPtgM+GRFT+T+amZ+PiAuBMyPicOAG4IUl/2eBA4DVwL3AYQCZuTYi3gpcWPIdm5lr521JJEmSJEmSNCczBooy8zrg13rS7wD27klP4MgRZZ0KnFpfTUmSJEmSJI3b0KeeSZIkSZIk6UFu6FPPNjjLj/pMb/r1xx+4wDWRJEmSJEnaMHhGkSRJkiRJkgADRZIkSZIkSSoMFEmSJEmSJAkwUCRJkiRJkqTCQJEkSZIkSZIAA0WSJEmSJEkqDBRJkiRJkiQJMFAkSZIkSZKkwkCRJEmSJEmSAANFkiRJkiRJKgwUSZIkSZIkCTBQJEmSJEmSpMJAkSRJkiRJkgADRZIkSZIkSSoMFEmSJEmSJAkwUCRJkiRJkqTCQJEkSZIkSZIAA0WSJEmSJEkqBgeKImLjiFgVEZ8u7x8TEedHxOqI+HhEbFLSH1rery7zl7fKeFNJ/0ZE7DvvSyNJkiRJkqRZqzmj6DXA1a337wTek5mPA+4EDi/phwN3lvT3lHxExJOAFwFPBvYD3h8RG8+t+pIkSZIkSZovgwJFEbEjcCDwgfI+gOcAnyhZTgcOLtMHlfeU+XuX/AcBZ2TmjzPz28BqYPd5WAZJkiRJkiTNg6FnFP1v4A3Az8v7bYC7MvO+8v4mYIcyvQNwI0CZf3fJ/4v0ns9IkiRJkiRpPZsxUBQRzwNuz8yLF6A+RMQREXFRRFy0
Zs2ahfiXkiRJkiRJYtgZRc8Enh8R1wNn0FxydgKwNCKWlDw7AjeX6ZuBnQDK/C2BO9rpPZ/5hcw8OTNXZOaKZcuWVS+QJEmSJEmSZmfGQFFmvikzd8zM5TQ3o/5SZv4B8GXgBSXbIcBZZfrs8p4y/0uZmSX9ReWpaI8BdgEumLclkSRJkiRJ0pwsmTnLSG8EzoiItwGrgFNK+inAhyNiNbCWJrhEZl4VEWcCXwfuA47MzJ/N4f9LkiRJkiRpHlUFijLzK8BXyvR19Dy1LDN/BPzeiM8fBxxXW0lJkiRJkiSN39CnnkmSJEmSJOlBzkCRJEmSJEmSAANFkiRJkiRJKgwUSZIkSZIkCTBQJEmSJEmSpMJAkSRJkiRJkgADRZIkSZIkSSoMFEmSJEmSJAkwUCRJkiRJkqTCQJEkSZIkSZIAA0WSJEmSJEkqDBRJkiRJkiQJMFAkSZIkSZKkwkCRJEmSJEmSAANFkiRJkiRJKgwUSZIkSZIkCTBQJEmSJEmSpMJAkSRJkiRJkgADRZIkSZIkSSoMFEmSJEmSJAkwUCRJkiRJkqRixkBRRDwsIi6IiMsi4qqI+JuS/piIOD8iVkfExyNik5L+0PJ+dZm/vFXWm0r6NyJi37EtlSRJkiRJkqoNOaPox8BzMvPXgF2B/SJiT+CdwHsy83HAncDhJf/hwJ0l/T0lHxHxJOBFwJOB/YD3R8TG87gskiRJkiRJmoMZA0XZ+H55+5DySuA5wCdK+unAwWX6oPKeMn/viIiSfkZm/jgzvw2sBnafj4WQJEmSJEnS3A26R1FEbBwRlwK3AyuBbwF3ZeZ9JctNwA5legfgRoAy/25gm3Z6z2fa/+uIiLgoIi5as2ZN9QJJkiRJkiRpdgYFijLzZ5m5K7AjzVlATxxXhTLz5MxckZkrli1bNq5/I0mSJEmSpI6qp55l5l3Al4FnAEsjYkmZtSNwc5m+GdgJoMzfErijnd7zGUmSJEmSJK1nQ556tiwilpbpTYHnAlfTBIxeULIdApxVps8u7ynzv5SZWdJfVJ6K9hhgF+CCeVoOSZIkSZIkzdGSmbOwPXB6eULZRsCZmfnpiPg6cEZEvA1YBZxS8p8CfDgiVgNraZ50RmZeFRFnAl8H7gOOzMyfze/izM7yoz7Tm3798QfOS35JkiRJkqQNwYyBosy8HHhaT/p19Dy1LDN/BPzeiLKOA46rr6YkSZIkSZLGreoeRZIkSZIkSXrwMlAkSZIkSZIkwECRJEmSJEmSCgNFkiRJkiRJAgwUSZIkSZIkqTBQJEmSJEmSJMBAkSRJkiRJkgoDRZIkSZIkSQIMFEmSJEmSJKkwUCRJkiRJkiTAQJEkSZIkSZIKA0WSJEmSJEkCDBRJkiRJkiSpWLK+K7AYLD/qM73p1x9/4ALXRJIkSZIkaTTPKJIkSZIkSRLgGUUTx7OPJEmSJEnS+uIZRZIkSZIkSQIMFEmSJEmSJKkwUCRJkiRJkiTAQJEkSZIkSZKKGQNFEbFTRHw5Ir4eEVdFxGtK+tYRsTIiri1/tyrpEREnRsTqiLg8InZrlXVIyX9tRBwyvsWSJEmSJElSrSFnFN0HvC4znwTsCRwZEU8CjgLOzcxdgHPLe4D9gV3K6wjgJGgCS8DRwB7A7sDRU8ElSZIkSZIkrX8zBooy85bMvKRM3wNcDewAHAScXrKdDhxcpg8CPpSN84ClEbE9sC+wMjPXZuadwEpgv/lcGEmSJEmSJM1e1T2KImI58DTgfGC7zLylzLoV2K5M7wDc2PrYTSVtVHr3fxwRERdFxEVr1qypqZ4kSZIkSZLmYHCgKCI2B/4F+LPM/F57XmYmkPNRocw8OTNXZOaKZcuWzUeRkiRJkiRJGmBQoCgiHkITJPqnzPzXknxbuaSM8vf2kn4zsFPr4zuWtFHpkiRJkiRJmgBDnnoWwCnA1Zn57tass4GpJ5cdApzVSn9pefrZnsDd5RK1c4B9ImKrchPrfUqaJEmSJEmSJsCS
AXmeCfwhcEVEXFrS/hI4HjgzIg4HbgBeWOZ9FjgAWA3cCxwGkJlrI+KtwIUl37GZuXY+FkKSJEmSJElzN2OgKDP/E4gRs/fuyZ/AkSPKOhU4taaCmt7yoz7Tm3798QcucE0kSZIkSdKGruqpZ5IkSZIkSXrwMlAkSZIkSZIkwECRJEmSJEmSCgNFkiRJkiRJAgwUSZIkSZIkqTBQJEmSJEmSJMBAkSRJkiRJkgoDRZIkSZIkSQIMFEmSJEmSJKkwUCRJkiRJkiTAQJEkSZIkSZIKA0WSJEmSJEkCDBRJkiRJkiSpMFAkSZIkSZIkwECRJEmSJEmSCgNFkiRJkiRJAgwUSZIkSZIkqTBQJEmSJEmSJACWrO8KaGEtP+oz66Rdf/yB66EmkiRJkiRp0nhGkSRJkiRJkoABZxRFxKnA84DbM/NXS9rWwMeB5cD1wAsz886ICOAE4ADgXuDQzLykfOYQ4C2l2Ldl5unzuyiab31nH8HoM5Bq8teWLUmSJEmSxm/IGUUfBPbrpB0FnJuZuwDnlvcA+wO7lNcRwEnwi8DS0cAewO7A0RGx1VwrL0mSJEmSpPkzY6AoM/8dWNtJPgiYOiPodODgVvqHsnEesDQitgf2BVZm5trMvBNYybrBJ0mSJEmSJK1Hs72Z9XaZeUuZvhXYrkzvANzYyndTSRuVvo6IOILmbCQe/ehHz7J6erDxUjVJkiRJksZvzjezzswEch7qMlXeyZm5IjNXLFu2bL6KlSRJkiRJ0gxmGyi6rVxSRvl7e0m/GdiplW/HkjYqXZIkSZIkSRNitpeenQ0cAhxf/p7VSn9lRJxBc+PquzPzlog4B3h76wbW+wBvmn21pel5qZokSZIkSfVmDBRFxMeAvYBtI+ImmqeXHQ+cGRGHAzcALyzZPwscAKwG7gUOA8jMtRHxVuDCku/YzOzeIFtaL2qDSgahJEmSJEkPVjMGijLzxSNm7d2TN4EjR5RzKnBqVe0kSZIkSZK0YGZ76ZmkgfrOQPLsI0mSJEnSJJrzU88kSZIkSZL04OAZRdIE8f5HkiRJkqT1yTOKJEmSJEmSBHhGkbRB8wwkSZIkSdJ88owiSZIkSZIkAQaKJEmSJEmSVHjpmbRIeJmaJEmSJGkmBook9aoNLNXkH2fZkiRJkqTZ89IzSZIkSZIkAZ5RJOlBqO8MJM8+kiRJkqSZGSiStKh5GZwkSZIk3c9LzyRJkiRJkgQYKJIkSZIkSVLhpWeSNCZe1iZJkiRpQ2OgSJI2UDWBpXEHrWpuID7OuqyP5ZwuvyRJkrSh8dIzSZIkSZIkAZ5RJEnSWHgWl2dlSZIkbYgMFEmSpIll0EqSJGlhGSiSJEkaYEM9i0uSJKmGgSJJkqQHsUkKWm2ol1hKkrSYLHigKCL2A04ANgY+kJnHL3QdJEmSpKEm6YmKiyU4ZyBPktafBQ0URcTGwPuA5wI3ARdGxNmZ+fWFrIckSZKkB49JClotluDcg60unq0o3W+hzyjaHVidmdcBRMQZwEGAgSJJkiRJ0qJhcG5xLOck1WVowDIyc1DG+RARLwD2y8yXlfd/COyRma9s5TkCOKK8fQLwjZ6itgW+W/Gva/KPs2zrsvBlW5eFL9u6LHzZ1mXhy7YuC1+2dVn4sq3LwpdtXRa+bOuy8GVbl4Uv27osfNkbQl12zsxlvbkzc8FewAto7ks09f4PgffOopyLxpV/nGVbl8W9nJNUl8WynJNUl8WynJNUl8WynJNUl8WynJNUl8WynJNUl8WynJNUl8WynJNUl8WynJNUl8WynJNUl8WynLPJvxEL62Zgp9b7HUuaJEmSJEmS1rOFDhRdCOwSEY+JiE2AFwFnL3AdJEmSJEmS1GNBb2admfdFxCuBc4CNgVMz86pZFHXyGPOPs+za/IulLotlOWvzb6hl1+ZfLHVZLMtZm39DLbs2/2Kpy2JZztr8G2rZtfkXS10Wy3LW5t9Qy67Nv1jqsliWszb/hlp2bf7FUpfF
spzV+Rf0ZtaSJEmSJEmaXAt96ZkkSZIkSZImlIEiSZIkSZIkAQaKJEmSJEmSVBgoGqOIeGJEvDEiTiyvN0bEr6zvem3oIuKX1ncdNnQRsc36roNmr4wte0fE5p30/dZXnTS9DXWbi4gPTTPv1RGx00LWR5oPEbF7RDy9TD8pIl4bEQdMk/+XI+L1EXFCRLw7Iv4kIrZYuBprnCbpuHK6ukTEHlP9LiI2jYi/iYhPRcQ7I2LLhavlg8uGun/ekE1Sm09SXSbNBhkoiohvzvJzn+u8XxERX46Ij0TEThGxMiLujogLI+JpPZ/fLCLeEBF/EREPi4hDI+LsiPjbni9sbwTOAAK4oLwC+FhEHNVT9i9HxKkR8baI2Dwi/jEiroyIf46I5T35l0TEyyPi8xFxeXl9rhy8PKST95KIeEtEPHYWzdb9v4Pulh4Rzx+R/siIOCki3hcR20TEMRFxRUScGRHb9+TfuvPaBrggIraKiK0r6v25nrQtI+L4iLgmItZGxB0RcXVJWzq07BH/b/OIODYirip9ak1EnBcRh47Iv19resuIOKWs049GxHadvE9tTT+krNuzI+LtEbFZT9nHR8S2ZXpFRFwHnB8RN0TEszt5B/erkn/jkv+tEfHMzry3DGut9aN2Ha0vPePWq4GzgFcBV0bEQa3Zb+/5fFV/qazbvI0tPWVvFBF/FBGfiYjLyv86IyL2GpF/8Phc8teMoVtExDsi4sMR8ZLOvPf3lD14mxu3mnVU2qv9+hTwO1Pvez7yVprl+o+IeEVELJuh/MHtWDs+R8QrW23+uIj494i4KyLOj4inzLTs8ykqjy0qy67ti2Nr81nUffCYGxH/GhH/X9+2Ow/1OBo4ETgpIt4BvBd4OHBURLy5J/+rgb8HHgY8HXgosBNwXt94FJXHOZV132D3ufMlBh6HTvP5eTmunA+zqMupwL1l+gRgS+CdJe20nvLHuk33Lc+I9NrvW+M8Ph/b/jnqj1sGH/sP+N8nd96P7fis1ny2eUQc1nk/tu9E4xZjPG6Zt/WfmRP9Au4Bvlde95TXz6bSe/LvNuL168AtnbwXAPsDLwZuBF5Q0vcG/qun7DOBdwHvB86lObj4DeD/Bz7cyftN4CE9ZWwCXNuT/u/AnwJHAVcCr6M5EDkc+FJP/o8BJwF7AjuW154l7eOdvN8G/g74TlnmPwceNU2bbz3itQ1wU0/+3+m8fhe4dep9J+/nab7gHgVcDryxLOergLN6yv55qX/79dPy97rZrvuS/5zy/x/ZSntkSfvCNO2zXavs7UbkOQs4tKyX1wJ/BewCnA68vSf/Ja3pDwBvA3Yu6+rfpsn7LuCDwLOB9wAf6in7itb0l4Gnl+nHAxfNtl+16vpR4M+Ai4F399Wz53PLgKcBTwU2nybfU+dhDNl6PtbRiDJeMSDPkP5SM25dMdVmwHLgIuA15f2qGfrWjP2l5AtgD+7fpvegPCWzk69qbCmfeTSwtFX/FwC/2pPvNOAY4FnA/waOBZ4LfBF4VU/+weNzbV8H/gU4HjgYOLu8f+iofk7FNlfSl86yb8+4HdWsI+AS4CPAXqWP7AXcUqaf3ZN/Fc0PTvsApwBraMb4Q4BH9OQf3I5Ujs/AVa3pzwD/q0zvBfzfnvxbAO8APgy8pDPv/QPbf9TYUnVs0fP5zWm2/3X6xSz64tjavKZNyrzBYy5wM/AJYC3Ntv2/gE2mKXvwvoJmDN0Y2Izm2HKLkr4pcPmo/GV6M+ArZfrR9I+5Vcc5lX2rep8LLOn0rRXTlF+1z2XgeN7KP3TfUnUcWrOcVBxXtso6FrgKuJtmnDsPOLSinXqPFWZRl6tHrW/g0p78c96mW597Yuf9M4GrS7vsAawEvkUz5j2jk7f2+9Z81vtznfdV++cybyNgozK9Cc343Ne3ao9bBh/7124X1H/3eyTN8c/7SnnH0Ix9ZwLbD2jnx9F8/3tSz7zqNp/m/3xnmjac1+9E8/2ic5xG5XFLK++Q
7xXVx+e95YyzQeapUU8EPtRuCODb0+T/GfClsvK7rx928q6apuOt6in70vI3aAIh0Xp/eSfvNcDOPWXsDHyjJ722Lt+cpg2+2Xnf3oh+g+aL1K2lTY4Y0YbX8cAd19T7n/Tk/ynwaZpfOk4rr3vK31MrlvPSnrJfR3PQ9ZSZ1n/Nui/511kP080DdqU5OLiaZtD/YlnP5wG7dfJe1nl/Yfm7EXBNT9ntdXRpZ173fbsNL6UEJPv6YUm/mnIABZzXmXdF5/3gflXSLm9NLwFOBv6V5hfXvn77pNJuq4GfAOeXfvVBYMsR6/RamjMX1tn59OR/S+d/fbOUfz2wxxzX0Ws7r9cB3516P8f+UjNuXdV5vznNNvLubl+ZZX/Zp6yfz9EcuHyglL8a2GeafjtkbDmqrI9rgJeVv6fQHGi+tpO3O6aeV/4+lNbBcnc7YcD4XNvXu+0KvBn4vzQHU31fzgdvcyXtvtI/DmdA0IiK7ahmHdH0/T+nOeDftaSt82Wlr+zy/iHA82mCcGtGraMh7Uj9+PyN1vSF0/WlklYbcKn5UtTe5obsz9/fmn4WzYHdl0vZB8yxL46zzQePtyXP4DF3qp1oAnp/CHyW5gv6aXTGoZJv8L6is35WdeZd2pP/ilbf2IrWlwngyhnKH3KcU9O3ave5hwJ3lHWzP82x3Lml7BfPsR0Hj+clf82+pfY4dPByUnFcWebV/vA3+FhhFnX5Z+CwMn0asKJMP57OuDdqux0yb0T+bl++AHgK8IyyfM8q6bvR+ZJL/ZhYOxbV/NhWu38+GLiN5keTg2j2t+cCNwG/PWr7bJfP6OOWwcf+tdsF9cdntT/kfxnYtkz/Ic229wGa8fJVnby1bX75iNcVwI+n6VuXMo/fiUraU2iO22+kGW+3am8Dc9yGao9bdmX494qq9T+yzjULuL5eNBv6l4BX0xxQTHfgeiWwy4h5N3be/xfNjuv3gBuAg0v6s+n/1ffS1nQ3ANI9ANqP+3eIJ5fX1A5xv56yL6YZ6HenGXCnBv/Hjegs55V6b9RK2wj4feD8Tt5VPZ/fuNTxtJ551wKPHtKGJe3pNIPmn7bSvj3i85e1pt/WmbfOcpb0HWl2ju8GHjFq/des+5L2BeANPDAIuR3N4PjFvvVP/8Hvnj3r/2vcv9N8PnBOa17fTu4m7j+guI7WL2zddinzp87curoz77Kesl9VlvU5NL8QnFD6+N+w7plwg/tVmdcXUDma5otI35lz5wFPKNO7A6eX6T8GPtGTfxXwq8BxZdu5jGYntnzEem4PjJ8B9m/9r6/NcR3dA3wc+OuyjEcDd05Nz7G/1IxbX6J8iW+lLaEJqP+s5/PX0fwaP7S/XN3XvsBjej6/qiffdGPLVTS/2m9T2nNZSX84nS9cNGPiY8v0bsC/t+Z9va+9W9PTjs+1fb20yUadtEPL8twwl22u5L8CeB7wTzRfds4CXgRsOqJPDN6O6A8ejFxHZf7UmPteOgc2M63/1rzNRvStQe1I/fh8HE2g7JeBv6Q542Jn4DDg09P1l/J+poBLzZei2mOL9rj1ZcoBX1mW7lmftX1xnG0+eLwt6YPH3BHrYBvgT+g/y3rwvoLmi95mZbq9/W854v++huZLyj/SHJRPfVFfRmtcauWvOs6p7Fu1+9wrgG1pxu/vcf+Yut2IutS04+DxvNUX+8rp27fUHofWLueg48ru+izvZ/pRqfZYoaYuW9KMc98q/finNPv4rwK/1pO/dps+ccTr/9C5ioMHfkHvrr9uELp2TKytd82PbbX751U0Z9tM9a2pfe/O3bpTf9wy+Ni/drvoroOSNt3xWXt9DglwX9mavhDYpkxv1q37LNr8NpqgyM6d13Lgvzt5x/adqOT/z9JmS4HX04x7U+t4VU/+bqC4HTBe28lbfdzC8O8V1cd+vf1qaMb1/aIZkF8N/Ee3k3TyvYCyEffMO7jz/tdoTm/8HPDE0lnuKp3gf/R8/gP0nN4PPBb4zxF1
3rN03t8t0xuPqNvewDdodqTPovl181rgduCgnvzLaXZEt9NEcb9Zpj8OPKaT94zKtj6Snh1OmbfOqZOtZX0NzaC8O6ODOceOaMPH0RMo6OR5Ps0XpFvnuu5L2lY013ZfQ7MTX1va/530n1K6zkFYa97qzvun0hz83UUzyDy+pC8DXt3z+aM7r6kDrkfSOXWScqYW95+9tV0r77kj6vdbpW+sojmg+ixwBJ3LI1v9ak3pU1N9cJ1+VfJ/hP7A58uAn/akjxzImOHXlvJ+d5qDqZvo/yLSLm9VZ173/dQ6unPgOno0zcHcO7n/S8Z0B3Q1/aVm3NqR1unYnXnP7En7YKuvzNhfyjpf0pO+SU+9a8eWy8vfjUu/an9J6waKnkNzZsVqml/L9mytn7/tKbt2fJ7q60PG0L8F/mdPGfuNWs80pw7PuM319NtNgRfSnCVwB/DRuWxHteuo89kDmeYyzKltpqK8we1I5fhcPnMozZen79J8Wfs6zX27tuzJWxtwWTVNG3fHqb5jiztL2X3baHv9XTxD2VV9cZxtTsV422qX9pg79YVrnTGXngDMDH1r8L6CcnZQTxnb0jqzozPvyTTj9BMH1KXqOKeyb9Xucy9tTXe/YPV9Ea1px8HjeUmr2bdUHYfWLmdr3rTHlSVP7Y9KVccKNXVp5d2ibE+/zohLT0q+2m36Hpr91CE9r+928rYDogd35nX357tSNyZO1fvqUueZ6l37Q/FeNPvnS7h///xy+vfPq6ZZru72MnXcci3NccseJX3UccvRDDz2r90uqD8+my7A3XemzSpghzL9ZeBhZXpjOme+d9p8yDHRKZRtrmfeRzvvT6PiGHcWdekeb/1WWb97dtd/mf8jmjMyu+v2aOCunvyHMfy4peZ7xayP/R5QznwUMs4XzQ7kpZQDHZrT264DXjFihb4a2Glg2YPzlvwP7dTlJTS/uB7ZV5dZLOse3H+t5JNpIpcHjMi7Cc3A/VyaX3P+gOa0snXqspDLCexAcz3rqEBRbV0ekJ/mS1Tv9e+l/dr3GjgW+BTNjmXLeajLiTS/mv4+8D/K6/dL2nvnczkHrJ9Dhq6f2rq0PrdNeX1kln1lnftJ0Hz5/SuaU+3fRTnzg+aSlWkvyeykB/33TLmL5jKST9EEuzZrzevu4PeY6hc0v4D8zXT9pfW5g2h+vX3BqH5e2196Pvssml8g1rnEYj5ejLg3UZn3Jpqd5xvL+nxJmV4FvKmnDWu2uQ/S3F/jLJrLkz5MM3adApw5Yj1vO6Te0y0nPffAaNV/99LPn8mIMXcW23O7XTYr7fLpadplVD/fEjikJ71qO1rIvlW5fgbXhebU6ddNl7esy/Y+9HV967PMrw241Hwpao8tU9vFdOv/Xu4/rf4eyqntND/ADCl72nGL5tfK19N8OXs3zVk5W4zot1NlD+m3dzFwvG2l/wrwP1n3Xg19wY/2+nxS6Suj1ueqEem9+4pJ6uc1fWvE56cbz8+muRfXe2nOuHhXGTeOphXwmE07Uj+eD963jHs5O5/9DeAt06yfqh+VWp8beqzwRJofizendYzbt03Msv89sWKb+xI9P5SXed/uvH8+/WeOPhZ4w4A2n2k8fyzwFzTHUu9hxLhV8tb+UPzLFWWv4v77E+3eSt+4b/ukOTNw0LjV89kZ94nUjYu/6FsD1n1tgHsvmkDfsWW7+1rZ3lYCr5/r+q/s4+3vzzO2eWW7XMa6l/Q/lSZYdEdP/q8Bvz7i/64TtOzJs85ZTa15Vd8rGLjvn+41dQ+HiRUR/0RzWcVmNAclDwc+SbOCIzMP6eS/G/gBzWmZHwP+OTPXjCh7cN4Rddmc5oB9b4DMPHR2S/mLp3DsX8pfSTMQfIUmEHROZh43oi6b0txcb2S7zPNyrtPmlctZW5du/jMz87sj8l5FE2m/r9z9/16am2HuXdJ/Z5qyP1rq0lt26zMH0OwcdyhJNwNnZ+ZnZyj7E5XLOV2/reqHNcsZ/U82eg7N
wQOZ+fxO/tq6LKU5vfJJNIPv8Zl5TzSPdf2VzDyvk/8lmfnRvrqOqP+zO0kXZ+b3y9MjXpCZ72vlreovnf/zcJpTVvfIzN+cJt/+NAeLM/WXCzJz9zL9xzSBtk/SnKr9qcw8fualH1mHqnVaPvMk+vv51zv5are5JTSnnmfJtzvNl4XvAO/LzB/Mtt49+YPml59R+QePubMYt7rt8gOas0RHtcvrM/PvRpXXU/5SBm5H4+xbtWrq0sn7spL330bVu3YfOkM9D8vM0zppz6e55OHeTvpjgd/NzL9tpdWu/507VfjvzPxpeRrKb2bmv05T9kzb3KtpLmv8d+AAmi89d9FcjvqKzPzKHOrdHW8vKf1wnfG2VZdX0JzdsCvNTfjPKvMuyczdWnm763MPml+tRx0TVe0rxql2m6vsW7Xj3Bbl/yfNl7l9aX7B/g7w1sy8pZN/cDvWjOetz/wK/fvE7r5lrsu5H80ZgussZ8/6eQXTjC0ztME6Y0Vn/rTHCmWbOJLmbJldmWabmI3a8qN5atmPun1xrmrbvGbcGvC/H7COIuI1NGfLDio7Ip5Oc0bNjzrpy2nOevlIK632u1x3P/dKph8rBo+LEfGqUt6c+9aofl6OOV5Cc+uUJTRnHp6VmdfMsJzT7s9rzKLNa7eJl9AEervfTx4N/FVm/nEn/Qk0l5itc4wYEdtl5m2t97M5Ph/6vWJ+tqGch0jeOF/cf2rrEpprFqeePDHqJlWrGPgUlpq8s6lL5XLWPoVjcF3GvZw0v3ofT3Pwt5bmcomrS9rSOdalZn3WPg2iqi6V63OcyznObaL2qUdj2ybm6wX80oj0qv4y5rqsak1fyAPv9bDOKb+z6IuD12ll2WNrw9p6zyL/4DF3Ftvzgvet6dpxXH1rnHWprXfN+hxQz5H3ZRr4+XFuF7X7ucFP7Bp3v6XiiY3zuT4X+jXOba52nJun5dlmfbThuJZzPtfP0LFiVBvWbBOzXNaxlU/lsX9Nm9eMW7XraD7KnmF91nyXm1W7DCl/Ptf90H4+zeerlrOy7No2H+s2V1n3qu9cs2mXnEM/z0w2YvJtFBGb0NzgbTOagQmaS14e0pM/M/PnmfmFzDwceBTNJVn70VyyNtu8s6lLjfsy82fZRPG/lZnfKxX8Ic3jM+dSl3Ev55k0p+TulZlbZ+Y2NL/83FnmzaUuNfmvjIjDyvRlEbECICIeT3PDv7mUTUQ8MiJOioj3RcQ2EXFMRFweEWdGxPYLuJzj3CZW0NyM783A3dlEnH+YmV/NzK/2lF1Vl9KG7++04RUj2nBaEfG5nrStuy/ggojYqky31faXcdZlo5K+Dc1Ze2sAsvlF9r6auvT4dSrWaURsERHviIgPR8SLO/Pe38k+zjasqvcs8teMubXbc1W7RMSWEXF8RFwTEWsj4o6IuLqkLZ0m/9UD8o+zb9WqqUttvav2oWXs7ntdQXPT1G7+vvF/1Ng1zu1iNmUvKX8fSnPWJ5n5HdYdo8fab2ku3/h++f/X0xwU7x8R76b5caGtdn1eEhFvieYsnPWtqu9WtmPteN63zx113EL5n9uW6RURcR1wfkTcEJ0zyDr7ipd05nX3FUTEfp1l/kCpy0ejOQttLstZ04a166d2rOhrw/P62pC6bWI25q38nrGo5th/NvuhoeNW9TqqLHvwNsHsvsuNaz9Xte5n0c/Hts1Vqm7zynaZ7fHZkPxV37mi7tgPKvr5SDmHaNVCvGge1XsdzV3yX03zdK1/pImUHd2Tf9U0ZW0227yzqUvlctY+hWNwXca9nFQ8xnIWdalZn1tS9zSI2roMfnTkmJdzbNtEK33oU49q61L7+M3dRrx+nc4jT0v+n/PAx4Z+u/SDb9O5R8As+ss463I99z/i9Dpg+5K+OfN0BkrFOh382PBxt2FNvWexnIPHXOq359p2OadsC49spT2ypH1hLvkXom9V9MHBdamtd836LOmDn6pS8teM/2PbLmZR9msY+MSuBei3
X2LgExtnsT6/DfwdzaVGF9Dsmx61kP17Dn23qh3L/KHjXO0+94rW9Je5/74fj2fdJzwN3ld002gePvA2mm3uz4F/m+NyjmrDo7ptOIv1UztW1LTh4G1iln2xqnzqxqKaY//aNh88btWuo1mUXbM+a8et2napOW6pXfe1/XwijkNm0ea17TK247PW/HGMc1X9fOT/nMvKWagXza+3jyrTS2luWrb7iLyDn8JSk3c2daksdzZP4RhUl3EvJxWPsaytyyzrvgXDngZRW5dVrelpHx057uUc1zbR89kDmeapR7Ooy+A2LGmDH3la8r+O5sD4Ka20b89Tfxl7XXrK2Iyep83N5TXTOu3py29mmseGj7MNa/ti5XIOHnNnuw1VtMvgA+7Z5F+ovjWHPjm4LqPy1qzPkj74qSolbVVresaxq3L9V28XQ8sueZ/MwCd2Vda7tt8OfmLjLNZnOwjxGzRn/N1a2vCIcffhOfbdWW/PA8a5Va3pIfvcqylPJgPO68zrXh56aef9tPuKzjrqfrZ3G6pYzrGNibMYK2rasOopprPoc7VPSa15zHzVI+xr2rzMGzxuzWId1ZRdsz6rv8tV9sWa45badV/bhhNxHFLb5rNolwU7Ppvvca6mn48sdy4rx5evzIRZPMZ4Q3xR+ehIX3NvQyofeVrSpyLz76a5JG7Gx9IOrPvE1GXM66jqseHjbsPF8KLygLs2v695WUdjG/831O1ikvoh/YGJjWkuDz1tfbfV+mrHWexzX1Xq8xyamzCfQHO/jL+h8zSe2n0FzY1uX0vzI8p1cP/TKJn7fT4nqS8ObsNJe9WMRSyeY/8Ndn2OuV0mZpubpOUc83i+4G2+IdyjSBMuM+8ETqO5u/5O2Vyr/CuZ+Uaau88/WJwVEVPXeL5lKjEiHgd8Y73VasNS24bHwMhx6lV9iZl5U2b+Hs1TD1bS/GIxHyapLuP0KZoDol/IzA/SHNz/ZI5lH0NlGy4Sv0/zK/xXy3Xna2n6zNY0TxWaa37N3TjH/2PYMLeLSeqH3+wmZHPfis9n5mF9H5gg42zHqn6bmf8HeDvwcpon6zyH5kvIzTRPS2ur3Vf8I80PJpsDp9P84k9EPBK4tHbBOiamL1a24aQ5hoFj0WI59t/A1+c4Tcw2N2aTdHy24G0eJRolzVqM+fGeG4KY4RGpmlltGw7JHxGbAo/NzCvHuY4mqS7jtL7bcDEax3ah+eV2sa5Jqvck1aXWJPWtmvyTNG5N0vqfpLrU6tbdY/8Ne32O02Jpl8Uwzhko0pyVO+I/IzO/HxHLgU/QnI55QkSsysynrd8ajl9EfCczH72+67Ehq23DceevMUl1GadJasPFYrH0rQ2Z28W6Jqnek1SXWpPUt2ryT9K4NUnrf5LqUqtbd4/9N+z1OU6LpV0Wwzi3ZOYs0owe8KjBiNgL+ERE7Mz8PN5zIkTE5aNm0f/4TXXUtuG489eYpLqM0yS14WKxWPrWhsztYl2TVO9JqkutSepbNfknadyapPU/SXWpVVl3j/0nfH2O02Jpl8U+zhko0ny4LSJ2zcxLAcqvC88DTgWesl5rNr+2A/aluWlfWwBfW/jqbJBq23Dc+WtMUl3GaZLacLFYLH1rQ+Z2sa5Jqvck1aXWJPWtmvyTNG5N0vqfpLrUqqm7x/6Tvz7HabG0y6Ie5wwUaT68FLivnZCZ9wEvjYh/WD9VGotPA5tP7RTbIuIrC16bDVNtG447f41Jqss4TVIbLhaLpW9tyNwu1jVJ9Z6kutSapL5Vk3+Sxq1JWv+TVJdaNXX32H/y1+c4LZZ2WdTjnPcokiRJkiRJEjD6EYiSJEmSJElaZAwUSZIkSZIkCTBQJEmSJEmSpMJAkSRJkiRJkgD4f9y9lGWu9rWUAAAAAElFTkSuQmCC\n",
356 | "text/plain": [
357 | ""
358 | ]
359 | },
360 | "metadata": {
361 | "needs_background": "light"
362 | },
363 | "output_type": "display_data"
364 | }
365 | ],
366 | "source": [
367 | "df.category_id.value_counts().plot.bar(figsize=(20, 4))\n",
368 | "plt.title('Распределение по классам');"
369 | ]
370 | },
371 | {
372 | "cell_type": "markdown",
373 | "id": "freelance-capture",
374 | "metadata": {},
375 | "source": [
376 | "Посмотрим самые частые категории"
377 | ]
378 | },
379 | {
380 | "cell_type": "code",
381 | "execution_count": 9,
382 | "id": "aggressive-anthropology",
383 | "metadata": {},
384 | "outputs": [
385 | {
386 | "data": {
387 | "text/html": [
388 | "\n",
389 | "\n",
402 | "
\n",
403 | " \n",
404 | " \n",
405 | " \n",
406 | " item_name \n",
407 | " category_id \n",
408 | " \n",
409 | " \n",
410 | " \n",
411 | " \n",
412 | " 4 \n",
413 | " Хлеб на СЫВОРОТКЕ 350г \n",
414 | " 84 \n",
415 | " \n",
416 | " \n",
417 | " 5 \n",
418 | " Сосиска в тесте с сыром 1шт ГЕ \n",
419 | " 84 \n",
420 | " \n",
421 | " \n",
422 | " 13 \n",
423 | " Курник 1 шт. \n",
424 | " 84 \n",
425 | " \n",
426 | " \n",
427 | " 19 \n",
428 | " Вафли с топленым молоком вес. 1кг Тортугалия \n",
429 | " 84 \n",
430 | " \n",
431 | " \n",
432 | " 28 \n",
433 | " Кф.Золотой Степ 50г с орехом \n",
434 | " 84 \n",
435 | " \n",
436 | " \n",
437 | "
\n",
438 | "
"
439 | ],
440 | "text/plain": [
441 | " item_name category_id\n",
442 | "4 Хлеб на СЫВОРОТКЕ 350г 84\n",
443 | "5 Сосиска в тесте с сыром 1шт ГЕ 84\n",
444 | "13 Курник 1 шт. 84\n",
445 | "19 Вафли с топленым молоком вес. 1кг Тортугалия 84\n",
446 | "28 Кф.Золотой Степ 50г с орехом 84"
447 | ]
448 | },
449 | "execution_count": 9,
450 | "metadata": {},
451 | "output_type": "execute_result"
452 | }
453 | ],
454 | "source": [
455 | "samples(df, 84).head()"
456 | ]
457 | },
458 | {
459 | "cell_type": "code",
460 | "execution_count": 10,
461 | "id": "binary-velvet",
462 | "metadata": {},
463 | "outputs": [
464 | {
465 | "data": {
466 | "text/html": [
467 | "\n",
468 | "\n",
481 | "
\n",
482 | " \n",
483 | " \n",
484 | " \n",
485 | " item_name \n",
486 | " category_id \n",
487 | " \n",
488 | " \n",
489 | " \n",
490 | " \n",
491 | " 1 \n",
492 | " Компот из изюма, 114 ккал \n",
493 | " 71 \n",
494 | " \n",
495 | " \n",
496 | " 2 \n",
497 | " Макаронные изделия отварные (масло сливочное),... \n",
498 | " 71 \n",
499 | " \n",
500 | " \n",
501 | " 15 \n",
502 | " Филе бедра куриного жареное 3 \n",
503 | " 71 \n",
504 | " \n",
505 | " \n",
506 | " 30 \n",
507 | " МОРС 200 мл \n",
508 | " 71 \n",
509 | " \n",
510 | " \n",
511 | " 41 \n",
512 | " Kотлета Kуриная Домашняя 100 г \n",
513 | " 71 \n",
514 | " \n",
515 | " \n",
516 | "
\n",
517 | "
"
518 | ],
519 | "text/plain": [
520 | " item_name category_id\n",
521 | "1 Компот из изюма, 114 ккал 71\n",
522 | "2 Макаронные изделия отварные (масло сливочное),... 71\n",
523 | "15 Филе бедра куриного жареное 3 71\n",
524 | "30 МОРС 200 мл 71\n",
525 | "41 Kотлета Kуриная Домашняя 100 г 71"
526 | ]
527 | },
528 | "execution_count": 10,
529 | "metadata": {},
530 | "output_type": "execute_result"
531 | }
532 | ],
533 | "source": [
534 | "samples(df, 71).head()"
535 | ]
536 | },
537 | {
538 | "cell_type": "markdown",
539 | "id": "infrared-eight",
540 | "metadata": {},
541 | "source": [
542 | "## Preprocessing"
543 | ]
544 | },
545 | {
546 | "cell_type": "markdown",
547 | "id": "proud-dragon",
548 | "metadata": {},
549 | "source": [
550 | "Препроцессинг включает следующие шаги:\n",
551 | "* CamelCase сплиттинг: \"СокДобрый\" -> \"Сок Добрый\"\n",
552 | "* Нижний регистр\n",
553 | "* Транслитерация: \"lmnoё\" -> \"лмное\"\n",
554 | "* 50x30x45 -> DxDxD\n",
555 | "* [\"№\", \"%\"] -> ['NUM', 'PERC']\n",
556 | "* Замены величин: [\"1.5\", \"1,3 кг\", \"3,5г\"] -> [\"1p5\", \"1p3кг\", \"3p5г\"]\n",
557 | "* сокращения со слешами заменяются: [\"ж/б\", \"тп/р\"] -> [\"жб\", \"тпр\"]\n",
558 | "* Не буквы и цифры заменяются на пробелы\n",
559 | "* Склейка одиночных символов: \"а б в\" -> \"абв\""
560 | ]
561 | },
562 | {
563 | "cell_type": "code",
564 | "execution_count": 11,
565 | "id": "magnetic-burke",
566 | "metadata": {},
567 | "outputs": [],
568 | "source": [
569 | "# функция для создания словаря транслитераций\n",
570 | "def make_trans():\n",
571 | " a = 'a b c d e f g h i j k l m n o p q r s t u v w x y z ё'.split()\n",
572 | " b = 'а в с д е ф г н и ж к л м н о р к р с т у в в х у з е'.split()\n",
573 | " trans_dict = dict(zip(a, b))\n",
574 | " trans_table = ''.join(a).maketrans(trans_dict)\n",
575 | " return trans_table\n",
576 | "\n",
577 | "def normalize(ser: pd.Series):\n",
578 | "# \"СокДобрый\" -> \"Сок Добрый\"\n",
579 | " camel_case_pat = re.compile(r'([а-яa-z])([А-ЯA-Z])')\n",
580 | "# \"lmno\" -> \"лмно\"\n",
581 | " trans_table = make_trans()\n",
582 | "# \"14х15х30\" -> \"DxDxD\"\n",
583 | " dxdxd_pat = re.compile(r'((?:\\d+\\s*[х\\*]\\s*){2}\\d+)')\n",
584 | "# \"1.2 15,5\" -> \"1p2 15p5\" \n",
585 | " digit_pat = re.compile(r'(\\d+)[\\.,](\\d+)')\n",
586 | "# \"15 мл\" -> \"15мл\"\n",
587 | " unit = 'мг|г|гр|кг|мл|л|шт'\n",
588 | " unit_pat = re.compile(fr'((?:\\d+p)?\\d+)\\s*({unit})\\b')\n",
589 | "# \"ж/б ст/б\" -> \"жб стб\"\n",
590 | " w_w_pat = re.compile(r'\\b([а-я]{1,2})/([а-я]{1,2})\\b')\n",
591 | "# \"a b c d\" -> \"abcd\"\n",
592 | " glue_pat = re.compile(r'(?<=(?\n",
638 | "\n",
651 | "\n",
652 | " \n",
653 | " \n",
654 | " \n",
655 | " item_name \n",
656 | " \n",
657 | " \n",
658 | " \n",
659 | " \n",
660 | " 2408405 \n",
661 | " мор эскимо мм 63г \n",
662 | " \n",
663 | " \n",
664 | " 461664 \n",
665 | " эфес пилснер 0p3 \n",
666 | " \n",
667 | " \n",
668 | " 2316442 \n",
669 | " хек тушка 1кг \n",
670 | " \n",
671 | " \n",
672 | " 344862 \n",
673 | " капучино бл \n",
674 | " \n",
675 | " \n",
676 | " 1178767 \n",
677 | " килька черноморская неразделанная в том соусе... \n",
678 | " \n",
679 | " \n",
680 | " 2832448 \n",
681 | " ментос лимонад 1шт \n",
682 | " \n",
683 | " \n",
684 | " 2710457 \n",
685 | " саморез универс 4p5 х20 оцинк \n",
686 | " \n",
687 | " \n",
688 | " 51850 \n",
689 | " фромилид уно таб пролонг дя по плен 500м... \n",
690 | " \n",
691 | " \n",
692 | " 513585 \n",
693 | " пивной нап амстердам навигатор 6p8 PERC 0... \n",
694 | " \n",
695 | " \n",
696 | " 2446381 \n",
697 | " семена укроп мамонт 2г \n",
698 | " \n",
699 | " \n",
700 | "
\n",
701 | ""
702 | ],
703 | "text/plain": [
704 | " item_name\n",
705 | "2408405 мор эскимо мм 63г \n",
706 | "461664 эфес пилснер 0p3 \n",
707 | "2316442 хек тушка 1кг \n",
708 | "344862 капучино бл\n",
709 | "1178767 килька черноморская неразделанная в том соусе...\n",
710 | "2832448 ментос лимонад 1шт \n",
711 | "2710457 саморез универс 4p5 х20 оцинк \n",
712 | "51850 фромилид уно таб пролонг дя по плен 500м...\n",
713 | "513585 пивной нап амстердам навигатор 6p8 PERC 0...\n",
714 | "2446381 семена укроп мамонт 2г "
715 | ]
716 | },
717 | "execution_count": 14,
718 | "metadata": {},
719 | "output_type": "execute_result"
720 | }
721 | ],
722 | "source": [
723 | "df_full.sample(10)"
724 | ]
725 | },
726 | {
727 | "cell_type": "markdown",
728 | "id": "pediatric-accordance",
729 | "metadata": {},
730 | "source": [
731 | "## Fasttext"
732 | ]
733 | },
734 | {
735 | "cell_type": "markdown",
736 | "id": "thermal-preparation",
737 | "metadata": {},
738 | "source": [
739 | "### Fitting"
740 | ]
741 | },
742 | {
743 | "cell_type": "markdown",
744 | "id": "southeast-model",
745 | "metadata": {},
746 | "source": [
747 | "Для извлечения фичей воспользуемся FastText. Изначально на его месте был Word2Vec, но позже, благодаря [@dremovd](https://github.com/dremovd), я осознал, что FastText справится здесь лучше. \n",
748 | "\n",
749 | "Обучать будем на всех неразмеченных данных, чтобы не было лишних утечек. Для начала разобьём каждую строку на список слов, избавившись от однобуквенных слов"
750 | ]
751 | },
752 | {
753 | "cell_type": "code",
754 | "execution_count": 15,
755 | "id": "direct-heavy",
756 | "metadata": {},
757 | "outputs": [],
758 | "source": [
759 | "df_full.item_name = df_full.item_name.apply(lambda x: [i for i in x.split() if len(i) > 1])"
760 | ]
761 | },
762 | {
763 | "cell_type": "markdown",
764 | "id": "stainless-kelly",
765 | "metadata": {},
766 | "source": [
767 | "Метод построения векторов (sg), размер вектора (size) и минимальная частота слова, при которой оно попадает в словарь (min_count), были подобраны эмпирическим путём. Стоит отметить, что sg=1, или режим Skip-gram, когда fasttext (а изначально word2vec) пытается по слову предсказать его контекст, работает значительно лучше, чем sg=0, или режим CBOW. В качестве аргумента window я поставил максимальную длину списка в корпусе. Для демонстрации я поставил количество эпох (iter) равным 10, хотя оптимальным является 30. Такой bucket выставил для экономии памяти на жёстком диске (и чтобы модель вместилась в сабмит). Последний параметр, workers, - это допустимое количество потоков, у меня это число 12. Из-за него, кстати, нет смысла выставлять seed: при нескольких потоках результат не воспроизводится, и повторить решение можно только с workers=1 и seed=КАКОЕ-ТО_ЧИСЛО (конкретно в этой реализации от gensim)."
768 | ]
769 | },
770 | {
771 | "cell_type": "code",
772 | "execution_count": null,
773 | "id": "checked-triumph",
774 | "metadata": {
775 | "scrolled": true,
776 | "tags": []
777 | },
778 | "outputs": [],
779 | "source": [
780 | "model = FastText(df_full.item_name, size=200, window=35, min_count=3, workers=12, iter=10, sg=1, bucket=400_000)"
781 | ]
782 | },
783 | {
784 | "cell_type": "markdown",
785 | "id": "minus-cleaners",
786 | "metadata": {},
787 | "source": [
788 | "Сохраним модель"
789 | ]
790 | },
791 | {
792 | "cell_type": "code",
793 | "execution_count": 18,
794 | "id": "illegal-advocate",
795 | "metadata": {},
796 | "outputs": [
797 | {
798 | "name": "stderr",
799 | "output_type": "stream",
800 | "text": [
801 | "2021-03-22 14:05:18,005 : INFO : saving FastText object under ft.model, separately None\n",
802 | "2021-03-22 14:05:18,006 : INFO : storing np array 'vectors' to ft.model.wv.vectors.npy\n",
803 | "2021-03-22 14:05:18,826 : INFO : storing np array 'vectors_vocab' to ft.model.wv.vectors_vocab.npy\n",
804 | "2021-03-22 14:05:19,498 : INFO : storing np array 'vectors_ngrams' to ft.model.wv.vectors_ngrams.npy\n",
805 | "2021-03-22 14:05:21,723 : INFO : not storing attribute vectors_norm\n",
806 | "2021-03-22 14:05:21,724 : INFO : not storing attribute vectors_vocab_norm\n",
807 | "2021-03-22 14:05:21,724 : INFO : not storing attribute vectors_ngrams_norm\n",
808 | "2021-03-22 14:05:21,725 : INFO : not storing attribute buckets_word\n",
809 | "2021-03-22 14:05:21,726 : INFO : storing np array 'syn1neg' to ft.model.trainables.syn1neg.npy\n",
810 | "2021-03-22 14:05:22,500 : INFO : storing np array 'vectors_vocab_lockf' to ft.model.trainables.vectors_vocab_lockf.npy\n",
811 | "2021-03-22 14:05:23,187 : INFO : storing np array 'vectors_ngrams_lockf' to ft.model.trainables.vectors_ngrams_lockf.npy\n",
812 | "2021-03-22 14:05:25,274 : INFO : saved ft.model\n"
813 | ]
814 | }
815 | ],
816 | "source": [
817 | "model.save('ft.model')"
818 | ]
819 | },
820 | {
821 | "cell_type": "markdown",
822 | "id": "searching-encoding",
823 | "metadata": {},
824 | "source": [
825 | "### Model optimization"
826 | ]
827 | },
828 | {
829 | "cell_type": "markdown",
830 | "id": "brilliant-librarian",
831 | "metadata": {},
832 | "source": [
833 | "Больше всего памяти в fasttext уходит на ngram'ы: на данный момент модель занимает 1Гб (!) на жёстком диске. С этим надо было как-то бороться, и первое, что пришло в голову, - заменить float32, в котором хранятся вектора, на float16. Это сожмёт нашу модель примерно в 2 раза. Можете убедиться: это не повлияет на качество классификатора"
834 | ]
835 | },
836 | {
837 | "cell_type": "markdown",
838 | "id": "figured-davis",
839 | "metadata": {},
840 | "source": [
841 | "Посмотрим на файлы, относящиеся к FastText"
842 | ]
843 | },
844 | {
845 | "cell_type": "code",
846 | "execution_count": 19,
847 | "id": "advanced-holiday",
848 | "metadata": {},
849 | "outputs": [
850 | {
851 | "data": {
852 | "text/plain": [
853 | "['ft.model.trainables.syn1neg.npy',\n",
854 | " 'ft.model.trainables.vectors_ngrams_lockf.npy',\n",
855 | " 'ft.model.trainables.vectors_vocab_lockf.npy',\n",
856 | " 'ft.model.wv.vectors.npy',\n",
857 | " 'ft.model.wv.vectors_ngrams.npy',\n",
858 | " 'ft.model.wv.vectors_vocab.npy']"
859 | ]
860 | },
861 | "execution_count": 19,
862 | "metadata": {},
863 | "output_type": "execute_result"
864 | }
865 | ],
866 | "source": [
867 | "list(map(str,Path('.').glob('*.npy')))"
868 | ]
869 | },
870 | {
871 | "cell_type": "code",
872 | "execution_count": 20,
873 | "id": "annual-hearing",
874 | "metadata": {},
875 | "outputs": [
876 | {
877 | "name": "stderr",
878 | "output_type": "stream",
879 | "text": [
880 | "2021-03-22 14:05:26,489 : INFO : ft.model.trainables.syn1neg.npy\n",
881 | "2021-03-22 14:05:26,565 : INFO : min/max values before: (-2.090169, 2.0418446)\n",
882 | "2021-03-22 14:05:27,033 : INFO : min/max values after: (-2.09, 2.041)\n",
883 | "2021-03-22 14:05:27,322 : INFO : __________________________________\n",
884 | "2021-03-22 14:05:27,323 : INFO : ft.model.trainables.vectors_ngrams_lockf.npy\n",
885 | "2021-03-22 14:05:27,500 : INFO : min/max values before: (1.0, 1.0)\n",
886 | "2021-03-22 14:05:28,249 : INFO : min/max values after: (1.0, 1.0)\n",
887 | "2021-03-22 14:05:29,133 : INFO : __________________________________\n",
888 | "2021-03-22 14:05:29,134 : INFO : ft.model.trainables.vectors_vocab_lockf.npy\n",
889 | "2021-03-22 14:05:29,220 : INFO : min/max values before: (1.0, 1.0)\n",
890 | "2021-03-22 14:05:29,518 : INFO : min/max values after: (1.0, 1.0)\n",
891 | "2021-03-22 14:05:29,820 : INFO : __________________________________\n",
892 | "2021-03-22 14:05:29,820 : INFO : ft.model.wv.vectors.npy\n",
893 | "2021-03-22 14:05:29,899 : INFO : min/max values before: (-2.720382, 2.59036)\n",
894 | "2021-03-22 14:05:30,374 : INFO : min/max values after: (-2.72, 2.59)\n",
895 | "2021-03-22 14:05:30,679 : INFO : __________________________________\n",
896 | "2021-03-22 14:05:30,680 : INFO : ft.model.wv.vectors_ngrams.npy\n",
897 | "2021-03-22 14:05:30,860 : INFO : min/max values before: (-9.1180725, 10.212996)\n",
898 | "2021-03-22 14:05:32,053 : INFO : min/max values after: (-9.12, 10.21)\n",
899 | "2021-03-22 14:05:33,198 : INFO : __________________________________\n",
900 | "2021-03-22 14:05:33,199 : INFO : ft.model.wv.vectors_vocab.npy\n",
901 | "2021-03-22 14:05:33,285 : INFO : min/max values before: (-11.192483, 12.355845)\n",
902 | "2021-03-22 14:05:33,753 : INFO : min/max values after: (-11.195, 12.36)\n",
903 | "2021-03-22 14:05:34,062 : INFO : __________________________________\n"
904 | ]
905 | }
906 | ],
907 | "source": [
908 | "for file in map(str,Path('.').glob('*.npy')):\n",
909 | " logging.info(file)\n",
910 | " spam = np.load(file)\n",
911 | " logging.info(f'min/max values before: {spam.min(), spam.max()}')\n",
912 | " spam = spam.astype('float16')\n",
913 | " logging.info(f'min/max values after: {spam.min(), spam.max()}')\n",
914 | " with open(file, 'wb') as f:\n",
915 | " np.save(f, spam)\n",
916 | " logging.info('__________________________________')"
917 | ]
918 | },
919 | {
920 | "cell_type": "markdown",
921 | "id": "conscious-bristol",
922 | "metadata": {},
923 | "source": [
924 | "После этих манипуляций файлы занимают 500Мб. Но этого всё равно недостаточно: надо будет ещё как-то уместить классификатор\n",
925 | "\n",
926 | "Видим, что у нас есть файлы, целиком состоящие из единичек (минимальное и максимальное значения - это единицы). Взглянем на них разок"
927 | ]
928 | },
929 | {
930 | "cell_type": "code",
931 | "execution_count": 21,
932 | "id": "chief-healthcare",
933 | "metadata": {},
934 | "outputs": [],
935 | "source": [
936 | "spam = np.load('ft.model.trainables.vectors_ngrams_lockf.npy')"
937 | ]
938 | },
939 | {
940 | "cell_type": "code",
941 | "execution_count": 22,
942 | "id": "unlimited-prisoner",
943 | "metadata": {},
944 | "outputs": [
945 | {
946 | "data": {
947 | "text/plain": [
948 | "array([[1., 1., 1., ..., 1., 1., 1.],\n",
949 | " [1., 1., 1., ..., 1., 1., 1.],\n",
950 | " [1., 1., 1., ..., 1., 1., 1.],\n",
951 | " ...,\n",
952 | " [1., 1., 1., ..., 1., 1., 1.],\n",
953 | " [1., 1., 1., ..., 1., 1., 1.],\n",
954 | " [1., 1., 1., ..., 1., 1., 1.]], dtype=float16)"
955 | ]
956 | },
957 | "execution_count": 22,
958 | "metadata": {},
959 | "output_type": "execute_result"
960 | }
961 | ],
962 | "source": [
963 | "spam"
964 | ]
965 | },
966 | {
967 | "cell_type": "code",
968 | "execution_count": 23,
969 | "id": "revised-taylor",
970 | "metadata": {},
971 | "outputs": [
972 | {
973 | "data": {
974 | "text/plain": [
975 | "(400000, 200)"
976 | ]
977 | },
978 | "execution_count": 23,
979 | "metadata": {},
980 | "output_type": "execute_result"
981 | }
982 | ],
983 | "source": [
984 | "spam.shape"
985 | ]
986 | },
987 | {
988 | "cell_type": "markdown",
989 | "id": "postal-contamination",
990 | "metadata": {},
991 | "source": [
992 | "Ого, вы видели, сколько эта вещь занимает в памяти?! Это 400000 x 200 значений float16, то есть около 160Мб одних единиц. К счастью, архиваторы умеют хорошо сжимать такие файлы из повторяющихся элементов, так что можем быть уверены, что в итоге у нас останется место для классификатора"
993 | ]
994 | },
995 | {
996 | "cell_type": "markdown",
997 | "id": "wired-objective",
998 | "metadata": {},
999 | "source": [
1000 | "### Vector averaging"
1001 | ]
1002 | },
1003 | {
1004 | "cell_type": "markdown",
1005 | "id": "residential-kitty",
1006 | "metadata": {},
1007 | "source": [
1008 | "Извлечём фичи. Для этого получим вектора каждого слова из предложения, сложим их и отнормируем полученную сумму, чтобы её длина (норма) была равна 1. "
1009 | ]
1010 | },
1011 | {
1012 | "cell_type": "code",
1013 | "execution_count": 24,
1014 | "id": "encouraging-football",
1015 | "metadata": {},
1016 | "outputs": [
1017 | {
1018 | "name": "stderr",
1019 | "output_type": "stream",
1020 | "text": [
1021 | "2021-03-22 14:05:41,479 : INFO : loading FastText object from ft.model\n",
1022 | "2021-03-22 14:05:41,690 : INFO : loading wv recursively from ft.model.wv.* with mmap=None\n",
1023 | "2021-03-22 14:05:41,691 : INFO : loading vectors from ft.model.wv.vectors.npy with mmap=None\n",
1024 | "2021-03-22 14:05:41,724 : INFO : loading vectors_vocab from ft.model.wv.vectors_vocab.npy with mmap=None\n",
1025 | "2021-03-22 14:05:41,760 : INFO : loading vectors_ngrams from ft.model.wv.vectors_ngrams.npy with mmap=None\n",
1026 | "2021-03-22 14:05:41,857 : INFO : setting ignored attribute vectors_norm to None\n",
1027 | "2021-03-22 14:05:41,857 : INFO : setting ignored attribute vectors_vocab_norm to None\n",
1028 | "2021-03-22 14:05:41,858 : INFO : setting ignored attribute vectors_ngrams_norm to None\n",
1029 | "2021-03-22 14:05:41,859 : INFO : setting ignored attribute buckets_word to None\n",
1030 | "2021-03-22 14:05:41,859 : INFO : loading vocabulary recursively from ft.model.vocabulary.* with mmap=None\n",
1031 | "2021-03-22 14:05:41,860 : INFO : loading trainables recursively from ft.model.trainables.* with mmap=None\n",
1032 | "2021-03-22 14:05:41,861 : INFO : loading syn1neg from ft.model.trainables.syn1neg.npy with mmap=None\n",
1033 | "2021-03-22 14:05:41,895 : INFO : loading vectors_vocab_lockf from ft.model.trainables.vectors_vocab_lockf.npy with mmap=None\n",
1034 | "2021-03-22 14:05:41,929 : INFO : loading vectors_ngrams_lockf from ft.model.trainables.vectors_ngrams_lockf.npy with mmap=None\n",
1035 | "2021-03-22 14:05:41,997 : INFO : loaded ft.model\n"
1036 | ]
1037 | }
1038 | ],
1039 | "source": [
1040 | "model = FastText.load('ft.model')"
1041 | ]
1042 | },
1043 | {
1044 | "cell_type": "code",
1045 | "execution_count": 25,
1046 | "id": "proper-married",
1047 | "metadata": {},
1048 | "outputs": [],
1049 | "source": [
1050 | "def word_averaging(wv, words):\n",
1051 | " mean = np.zeros((wv.vector_size,))\n",
1052 | " \n",
1053 | " for word in words:\n",
1054 | " mean += wv.get_vector(word)\n",
1055 | "\n",
1056 | " mean = gensim.matutils.unitvec(mean)\n",
1057 | " return mean\n",
1058 | "\n",
1059 | "def word_averaging_list(wv, text_list):\n",
1060 | " return np.vstack([word_averaging(wv, review) for review in text_list])"
1061 | ]
1062 | },
1063 | {
1064 | "cell_type": "markdown",
1065 | "id": "proof-disorder",
1066 | "metadata": {},
1067 | "source": [
1068 | "Также для ускорения алгоритмов классификации приведём полученную матрицу к типу float16"
1069 | ]
1070 | },
1071 | {
1072 | "cell_type": "code",
1073 | "execution_count": 31,
1074 | "id": "close-junction",
1075 | "metadata": {},
1076 | "outputs": [],
1077 | "source": [
1078 | "df.item_name = df.item_name.apply(lambda x: [i for i in x.split() if len(i) > 1])\n",
1079 | "X_wv = word_averaging_list(model.wv, df.item_name)\n",
1080 | "X_wv = X_wv.astype('float16')\n",
1081 | "\n",
1082 | "y = df.category_id"
1083 | ]
1084 | },
1085 | {
1086 | "cell_type": "markdown",
1087 | "id": "front-shareware",
1088 | "metadata": {},
1089 | "source": [
1090 | "Огромный плюс fasttext'а в том, что благодаря ngram'ам он может векторизовать даже те слова, которых нет в словаре"
1091 | ]
1092 | },
1093 | {
1094 | "cell_type": "markdown",
1095 | "id": "expected-think",
1096 | "metadata": {},
1097 | "source": [
1098 | "## Classification"
1099 | ]
1100 | },
1101 | {
1102 | "cell_type": "markdown",
1103 | "id": "approximate-palace",
1104 | "metadata": {},
1105 | "source": [
1106 | "Наконец-то перейдём к классификации. Будем использовать модель SVM: она показала здесь лучшее качество. Также не забудем стандартизировать данные (StandardScaler) перед подачей в классификатор: сделаем пайплайн"
1107 | ]
1108 | },
1109 | {
1110 | "cell_type": "code",
1111 | "execution_count": 32,
1112 | "id": "caring-armstrong",
1113 | "metadata": {},
1114 | "outputs": [],
1115 | "source": [
1116 | "pipe = make_pipeline(\n",
1117 | " StandardScaler(),\n",
1118 | " SVC(random_state=0),\n",
1119 | ")"
1120 | ]
1121 | },
1122 | {
1123 | "cell_type": "code",
1124 | "execution_count": 33,
1125 | "id": "editorial-federation",
1126 | "metadata": {},
1127 | "outputs": [
1128 | {
1129 | "name": "stdout",
1130 | "output_type": "stream",
1131 | "text": [
1132 | "Wall time: 5min 56s\n"
1133 | ]
1134 | },
1135 | {
1136 | "data": {
1137 | "text/plain": [
1138 | "array([0.86135208, 0.85232645, 0.85836571, 0.85216471, 0.86253755])"
1139 | ]
1140 | },
1141 | "execution_count": 33,
1142 | "metadata": {},
1143 | "output_type": "execute_result"
1144 | }
1145 | ],
1146 | "source": [
1147 | "%%time\n",
1148 | "cross_val_score(pipe, \n",
1149 | " X_wv, \n",
1150 | " y, \n",
1151 | " scoring='f1_weighted',\n",
1152 | " cv=StratifiedKFold(5, \n",
1153 | " shuffle=True, \n",
1154 | " random_state=0), \n",
1155 | " n_jobs=5\n",
1156 | ")"
1157 | ]
1158 | },
1159 | {
1160 | "cell_type": "markdown",
1161 | "id": "starting-mandate",
1162 | "metadata": {},
1163 | "source": [
1164 | "Итак, получили неплохое качество на кросс-валидации. Обучим на всех данных и сохраним модель"
1165 | ]
1166 | },
1167 | {
1168 | "cell_type": "code",
1169 | "execution_count": null,
1170 | "id": "fifteen-sampling",
1171 | "metadata": {},
1172 | "outputs": [],
1173 | "source": [
1174 | "pipe.fit(X_wv, y)\n",
1175 | "pickle.dump(pipe, open(f'clf_task1', 'wb'))"
1176 | ]
1177 | },
1178 | {
1179 | "cell_type": "markdown",
1180 | "id": "young-speaker",
1181 | "metadata": {},
1182 | "source": [
1183 | "## Conclusion"
1184 | ]
1185 | },
1186 | {
1187 | "cell_type": "markdown",
1188 | "id": "weekly-possibility",
1189 | "metadata": {},
1190 | "source": [
1191 | "На тесте данное решение получило 0.865. Особую благодарность хотелось бы выразить [@dremovd](https://github.com/dremovd), с которым мы были в одной команде. Он направлял меня в нужную сторону, и без него мы бы не вышли в топ-10.\n",
1192 | "\n",
1193 | "Спасибо за ресёрч данного ноутбука: в нём я привёл решение задачи в краткой форме. Ждите расширенного решения с анализом ошибок, стакингом и постпроцессингом :) \n",
1194 | "\n",
1195 | "До новых встреч!"
1196 | ]
1197 | }
1198 | ],
1199 | "metadata": {
1200 | "kernelspec": {
1201 | "display_name": "Python 3",
1202 | "language": "python",
1203 | "name": "python3"
1204 | },
1205 | "language_info": {
1206 | "codemirror_mode": {
1207 | "name": "ipython",
1208 | "version": 3
1209 | },
1210 | "file_extension": ".py",
1211 | "mimetype": "text/x-python",
1212 | "name": "python",
1213 | "nbconvert_exporter": "python",
1214 | "pygments_lexer": "ipython3",
1215 | "version": "3.9.1"
1216 | }
1217 | },
1218 | "nbformat": 4,
1219 | "nbformat_minor": 5
1220 | }
1221 |
--------------------------------------------------------------------------------