├── bot_final_vesion
    ├── train_tfidf.pkl
    ├── vectorizer.pkl
    ├── model.py
    ├── clean_ingred.txt
    ├── rus_clean_ingred.txt
    ├── dish_id_bot.py
    └── en2ru_ing.json
├── pipeline
    ├── ingred_models.py
    ├── inference.py
    ├── utils
    │   ├── matching.py
    │   └── metrics.py
    ├── match_models.py
    └── preprocess.py
├── .gitignore
├── ReadME.txt
└── scripts
    ├── matching_povar.py
    └── povar_scrapping.py


/bot_final_vesion/train_tfidf.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Alenush/dish_id_sirius/HEAD/bot_final_vesion/train_tfidf.pkl


--------------------------------------------------------------------------------
/bot_final_vesion/vectorizer.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Alenush/dish_id_sirius/HEAD/bot_final_vesion/vectorizer.pkl


--------------------------------------------------------------------------------
/pipeline/ingred_models.py:
--------------------------------------------------------------------------------
 1 | from abc import ABC, abstractmethod
 2 | from PIL import Image
 3 | 
 4 | class IngredModel(ABC):
 5 |   @abstractmethod
 6 |   def predict(self, img_path):
 7 |     pass
 8 | 
 9 | 
class DenseChefNet(IngredModel):
    """Ingredient predictor that thresholds the per-class outputs of a network.

    NOTE(review): ``transform_val``, ``device``, ``id2word`` and ``en2ru`` are
    neither defined nor imported in this module; they must exist as module
    globals before :meth:`predict` is called.
    """

    def __init__(self, model, threshold=0.25):
        """
        :param model: callable network; called with a one-image batch
        :param threshold: outputs strictly above this value count as predicted
        """
        self.model = model
        self.threshold = threshold

    def predict(self, img_path):
        """Return the Russian names of the ingredients predicted for an image.

        :param img_path: path to the image file
        :returns: list of ``en2ru`` values for every class above the threshold
        """
        import torch  # this module has no top-level torch import

        # Force three channels so grayscale/RGBA files do not break the
        # transform, and close the file handle once pixels are loaded.
        with Image.open(img_path) as img_file:
            img = img_file.convert('RGB')
        batch = transform_val(img).to(device).unsqueeze(0)

        with torch.no_grad():
            mask = self.model(batch) > self.threshold

        # Column index of every positive cell; ids in id2word are 1-based.
        class_ids = mask.nonzero()[:, 1].tolist()
        return [en2ru[id2word[class_id + 1]] for class_id in class_ids]


--------------------------------------------------------------------------------
/pipeline/inference.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import json
 3 | from torchvision import transforms, models
 4 | from torch.utils.data import Dataset, DataLoader
 5 | from torch.nn.utils.rnn import pack_padded_sequence, pad_sequence
 6 | from collections import defaultdict
 7 | import matplotlib.pyplot as plt
 8 | from PIL import Image
 9 | import pickle
10 | import pandas as pd
11 | import numpy as np
12 | from collections import defaultdict
13 | 
def predict_image(img_path, df, ingred_model, word_model, k=10, threshold=0.25):
    """Predict ingredients for an image and look up the closest recipes.

    :param img_path: path handed to ``ingred_model.predict``
    :param df: recipes table indexed by the labels ``word_model`` returns
    :param ingred_model: object with a ``predict(img_path)`` method
    :param word_model: object with a ``most_similar(ingredients, k)`` method
    :param k: number of recipes requested from ``word_model``
    :param threshold: accepted for compatibility; not used by this function
    :returns: tuple ``(similarities, matching rows of df)``
    """
    ingredients = ingred_model.predict(img_path)
    distances, row_labels = word_model.most_similar(ingredients, k)
    matched_rows = df.loc[row_labels]
    return distances, matched_rows
19 | 
def main(args):
    """Show the query image and print the recipes matched to it.

    :param args: object with an ``img_path`` attribute (path to the image)

    NOTE(review): ``df``, ``chefnet`` and ``elmo_model`` are read as module
    globals but are not defined in this module; they must be set up before
    calling ``main``.
    """
    with Image.open(args.img_path) as img:
        plt.figure(figsize=(12, 8))
        plt.imshow(img)
        plt.axis('off')
        plt.show()

    # The original passed a bare ``img_path`` name that does not exist here.
    _, best_rows = predict_image(args.img_path, df, chefnet, elmo_model)

    for rank, (_, row) in enumerate(best_rows.iterrows(), start=1):
        print(f"{rank}) {row['name']}")
        print('\t' + row['ingreds'])
        print(f"\t{row['url']}")
35 | 


--------------------------------------------------------------------------------
/pipeline/utils/matching.py:
--------------------------------------------------------------------------------
 1 | from scipy.spatial.distance import cosine
 2 | 
pos_morphy_to_vec = {'ADJF':'ADJ'} # maps a pymorphy part-of-speech tag to the tag used in word2vec keys
 4 | 
 5 | 
 6 | def match_recipes(ingred_list, recipes_df, word_model, k=10):
 7 |   """
 8 |   Возвращает k ближайших рецептов по косинусному расстоянию.
 9 |   :param ingred_list: список ингедиенто
10 |   :param recipes_df: датасет для матчинга
11 |   :param word_model: векторная модель (word2vec, fasttext, elmo)
12 |   :param k: кол-во возвращаемых рецептов
13 |   :returns: k рецептов
14 |   """
15 |   vectors = []
16 |   for word in ingred_list:
17 |     normal_word = morph.parse(word)[0].normalized
18 |     pos = normal_word.tag.POS
19 |     if pos in pos_morphy_to_vec:
20 |       pos = pos_morphy_to_vec[normal_word.tag.POS]
21 |     word = normal_word.word
22 |     if pos != None:
23 |       word_pos = f'{word}_{pos}'
24 |       if word_pos in word_model:
25 |         vectors.append(word_model[word_pos])
26 | 
27 |   mean_vector = np.mean(vectors, axis=0)
28 |   cos_sim = recipes_df['word2vec_mean'].apply(lambda v: cosine(v, mean_vector)).sort_values()[:k]
29 |   closest_recipes = recipes_df.loc[cos_sim.index]
30 |   closest_recipes['cos_sim'] = cos_sim
31 |   
32 |   return closest_recipes
33 | 


--------------------------------------------------------------------------------
/pipeline/match_models.py:
--------------------------------------------------------------------------------
 1 | from abc import ABC, abstractmethod
 2 | from scipy.spatial.distance import cosine
 3 | 
 4 | class MatchModel(ABC):
 5 |   def __init__(self, mean_recipes, word_model):
 6 |     self.mean_recipes = mean_recipes
 7 |     self.word_model = word_model
 8 | 
 9 |   @abstractmethod
10 |   def mean_embedding(self, ingred_list):
11 |     pass
12 | 
13 |   def most_similar(self, ingred_list, k):
14 |     mean_vector = self.mean_embedding(ingred_list)
15 |     cos_sim = self.mean_recipes.apply(lambda v: cosine(v, mean_vector)).\
16 |                   sort_values()[:k]
17 |     return cos_sim, cos_sim.index
18 | 
19 | 
class Word2Vec(MatchModel):
    """Matcher that averages word2vec vectors of POS-tagged lemmas.

    NOTE(review): ``tag_ud`` is not imported in this module; it must be made
    available as a module global.
    """

    def __init__(self, mean_recipes, word_model):
        super().__init__(mean_recipes, word_model)

    def mean_embedding(self, ingred_list):
        """Return the mean vector of all tagged words known to the model."""
        import numpy as np  # not imported at module level

        words = ' '.join(tag_ud(ingred_list)).split()
        # Skip out-of-vocabulary words instead of failing with KeyError.
        vectors = [self.word_model[word] for word in words if word in self.word_model]
        return np.mean(vectors, axis=0)
28 | 
class FastText(MatchModel):
    """Matcher that averages fastText vectors of lemmas without POS tags."""

    def __init__(self, mean_recipes, word_model):
        super().__init__(mean_recipes, word_model)

    def mean_embedding(self, ingred_list):
        """Return the mean vector of the lemmas found in the model vocabulary."""
        tagged_lines = tag_ud(ingred_list, keep_pos=False)
        vectors = []
        for token in ' '.join(tagged_lines).split():
            if token in self.word_model.vocab:
                vectors.append(self.word_model[token])
        return np.mean(vectors, axis=0)
37 | 
class Elmo(MatchModel):
    """Matcher that calls the wrapped model on the ingredient list directly."""

    def __init__(self, mean_recipes, word_model):
        super().__init__(mean_recipes, word_model)

    def mean_embedding(self, ingred_list):
        """Return the model output for a batch holding just ``ingred_list``."""
        batch = [ingred_list]
        embeddings = self.word_model(batch)
        return embeddings[0]


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | .idea/
  6 | 
  7 | # C extensions
  8 | *.so
  9 | 
 10 | # Distribution / packaging
 11 | .Python
 12 | build/
 13 | develop-eggs/
 14 | dist/
 15 | downloads/
 16 | eggs/
 17 | .eggs/
 18 | lib/
 19 | lib64/
 20 | parts/
 21 | sdist/
 22 | var/
 23 | wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .coverage
 43 | .coverage.*
 44 | .cache
 45 | nosetests.xml
 46 | coverage.xml
 47 | *.cover
 48 | .hypothesis/
 49 | .pytest_cache/
 50 | 
 51 | # Translations
 52 | *.mo
 53 | *.pot
 54 | 
 55 | # Django stuff:
 56 | *.log
 57 | local_settings.py
 58 | db.sqlite3
 59 | 
 60 | # Flask stuff:
 61 | instance/
 62 | .webassets-cache
 63 | 
 64 | # Scrapy stuff:
 65 | .scrapy
 66 | 
 67 | # Sphinx documentation
 68 | docs/_build/
 69 | 
 70 | # PyBuilder
 71 | target/
 72 | 
 73 | # Jupyter Notebook
 74 | .ipynb_checkpoints
 75 | 
 76 | # pyenv
 77 | .python-version
 78 | 
 79 | # celery beat schedule file
 80 | celerybeat-schedule
 81 | 
 82 | # SageMath parsed files
 83 | *.sage.py
 84 | 
 85 | # Environments
 86 | .env
 87 | .venv
 88 | env/
 89 | venv/
 90 | ENV/
 91 | env.bak/
 92 | venv.bak/
 93 | 
 94 | # Spyder project settings
 95 | .spyderproject
 96 | .spyproject
 97 | 
 98 | # Rope project settings
 99 | .ropeproject
100 | 
101 | # mkdocs documentation
102 | /site
103 | 
104 | # mypy
105 | .mypy_cache/
106 | 
107 | # MAC
108 | .DS_Store
109 | 
110 | 


--------------------------------------------------------------------------------
/bot_final_vesion/model.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | from torchvision import transforms, models
 4 | from collections import defaultdict
 5 | import matplotlib.pyplot as plt
 6 | from PIL import Image
device = torch.device('cpu')  # the model and input tensors in this module are moved to the CPU
 8 | import json
 9 | 
10 | 
# Build the English -> Russian ingredient dictionary from two parallel files
# (line i of one file translates line i of the other) and save it as JSON.
with open('/home/dishid_bot/clean_ingred.txt', 'r') as f:
    en = f.readlines()
with open('/home/dishid_bot/rus_clean_ingred.txt', 'r', encoding='utf-8') as f:
    ru = f.readlines()

en2ru = {e.rstrip('\n'): r.rstrip('\n') for e, r in zip(en, ru)}
with open('/home/dishid_bot/en2ru_ing.json', 'w') as f:
    json.dump(en2ru, f)


# The class list is split on '\n' exactly as before: its length fixes the
# size of the classifier output layer, so it must not change.
with open('/home/dishid_bot/clean_ingred.txt', 'r') as f:
    clean_ingredients = list(f.read().split('\n'))
    print(len(clean_ingredients))

# 1-based class id -> English ingredient name.
id2word = defaultdict()
for i, ingr in enumerate(clean_ingredients):
    id2word[i+1] = ingr

# 1-based class id -> Russian ingredient name. The original filled this table
# from the English list, making it a duplicate of id2word; entries without a
# translation keep their English text.
rus_id2word = defaultdict()
for i, ingr in enumerate(clean_ingredients):
    rus_id2word[i+1] = en2ru.get(ingr, ingr)

# English ingredient name -> 1-based class id.
word2id = {v: k for k, v in id2word.items()}
def words2ids(ingreds):
    """Return the ``word2id`` value for every ingredient name, in order."""
    ids = []
    for name in ingreds:
        ids.append(word2id[name])
    return ids
38 | 
model = models.densenet161(pretrained=True)


def init_model():
    """Replace the classifier head of ``model`` and load the trained weights.

    Freezes the existing parameters, installs a three-layer sigmoid head with
    one output per entry of ``word2id``, loads the checkpoint from disk and
    switches the network to evaluation mode.
    """
    for weight in model.parameters():
        weight.requires_grad = False

    in_features = model.classifier.in_features
    head_layers = [
        nn.Linear(in_features, 1024),
        nn.BatchNorm1d(1024),
        nn.ReLU(),
        nn.Linear(1024, 512),
        nn.BatchNorm1d(512),
        nn.ReLU(),
        nn.Linear(512, len(word2id)),
        nn.Sigmoid(),
    ]
    model.classifier = nn.Sequential(*head_layers)
    model.to(device)

    checkpoint = torch.load("/home/dishid_bot/model_encoder_classifier_best.pth",
                            map_location=torch.device('cpu'))
    model.load_state_dict(checkpoint)
    model.eval()
60 | 
61 | 
def predict_image(path_to_img, transform):
    """Show an image and return its predicted ingredients as indented text.

    :param path_to_img: path to the image file
    :param transform: callable turning the RGB image into a tensor
    :returns: string with one tab-indented Russian ingredient name per line
    """
    img = Image.open(path_to_img).convert('RGB')
    plt.figure(figsize=(12,8))
    plt.imshow(img)
    plt.axis('off')
    plt.show()
    batch = transform(img).to(device).unsqueeze(0)

    # Gradient tracking must be disabled around the forward pass; the
    # original wrapped only the image loading, which has no effect.
    with torch.no_grad():
        ingred_pred = model(batch) > 0.25

    # Column index of every positive cell; ids in id2word are 1-based.
    ingred_pred = ingred_pred.nonzero()[:, 1].tolist()
    ingred_pred = [id2word[ing + 1] for ing in ingred_pred]
    ingred_pred = [en2ru[ing] for ing in ingred_pred]
    return '\t' + '\n\t'.join(ingred_pred)


--------------------------------------------------------------------------------
/bot_final_vesion/clean_ingred.txt:
--------------------------------------------------------------------------------
  1 | chicken
  2 | beef
  3 | turkey
  4 | soft cheese
  5 | hard cheese
  6 | nut
  7 | strawberry
  8 | cherry tomato
  9 | sausage
 10 | asparagus
 11 | cranberry
 12 | gelatin
 13 | tofu
 14 | olive
 15 | thyme
 16 | tuna
 17 | paprika
 18 | cardamom
 19 | red pepper
 20 | shrimp
 21 | tapioca
 22 | pumpkin
 23 | date
 24 | plum
 25 | green bean
 26 | bean
 27 | fish
 28 | lasagna
 29 | lobster
 30 | flour
 31 | octopus
 32 | cookie
 33 | milk
 34 | lemon
 35 | chocolate
 36 | cracker
 37 | clam
 38 | cherry
 39 | soy milk
 40 | pineapple
 41 | cabbage
 42 | pasta
 43 | crab
 44 | bacon
 45 | wasabi
 46 | onion
 47 | patty
 48 | baking powder
 49 | almond
 50 | mango
 51 | mussel
 52 | cheese
 53 | grape
 54 | veal
 55 | honey
 56 | vanilla
 57 | potato
 58 | capers
 59 | mollusk
 60 | raspberry
 61 | tomato
 62 | cinnamon
 63 | tortilla
 64 | lamb
 65 | jam
 66 | mustard
 67 | sherbet
 68 | oyster
 69 | bread
 70 | banana
 71 | spaghetti
 72 | noodle
 73 | egg
 74 | duck
 75 | jalapeno
 76 | mayonnaise
 77 | barley
 78 | okra
 79 | chives
 80 | cucumber
 81 | turnip
 82 | feijoa
 83 | scrambled egg
 84 | melon
 85 | persimmon
 86 | wine
 87 | yogurt
 88 | mushroom
 89 | mutton
 90 | bulgur
 91 | salmon
 92 | butter
 93 | blueberry
 94 | celery
 95 | molasses
 96 | dough
 97 | sugar
 98 | apple
 99 | goose
100 | chili pepper
101 | chips
102 | collard
103 | semolina
104 | miso
105 | macaroni
106 | coffee
107 | water
108 | pickle
109 | leeks
110 | cocoa
111 | peach
112 | cereals
113 | margarine
114 | vegetables
115 | sauce
116 | bran
117 | squash
118 | berry
119 | rice
120 | kale
121 | papaya
122 | pea
123 | ham
124 | syrup
125 | juice
126 | orange
127 | prune
128 | broccoli
129 | sesame
130 | whipped cream
131 | sour cream
132 | ice cream
133 | beet
134 | waffle
135 | crouton
136 | oil
137 | coconut oil
138 | avocado
139 | taco
140 | currant
141 | seaweed
142 | carrot
143 | radish
144 | apricot
145 | raisin
146 | pate
147 | salami
148 | brussels sprout
149 | trout
150 | lime
151 | hummus
152 | lentil
153 | scallop
154 | lettuce
155 | poultry
156 | pomegranate
157 | soy sauce
158 | walnut
159 | cashew
160 | egg yolk
161 | corn
162 | cornmeal
163 | almond milk
164 | peanut butter
165 | chia seeds
166 | bell pepper
167 | cake mix
168 | pudding mix
169 | coconut
170 | crab meat
171 | cream cheese
172 | muffin
173 | jell
174 | ice
175 | cottage cheese
176 | pecan
177 | baking soda
178 | buttermilk
179 | zucchini
180 | eggplant
181 | candy
182 | bun
183 | alcohol
184 | beer
185 | lemonade
186 | ketchup
187 | ginger


--------------------------------------------------------------------------------
/bot_final_vesion/rus_clean_ingred.txt:
--------------------------------------------------------------------------------
  1 | курица
  2 | говядина
  3 | индейка
  4 | мягкий сыр
  5 | твердый сыр
  6 | орехи
  7 | клубника
  8 | томаты черри
  9 | колбаса
 10 | спаржа
 11 | клюква
 12 | желатин
 13 | тофу
 14 | оливки
 15 | тимьян
 16 | тунец
 17 | паприка
 18 | кардамон
 19 | красный перец
 20 | креветка
 21 | тапиока
 22 | тыква
 23 | финик
 24 | слива
 25 | стручковая фасоль
 26 | фасоль
 27 | рыба
 28 | лазанья
 29 | лобстер
 30 | мука
 31 | осьминог
 32 | печенье
 33 | молоко
 34 | лимон
 35 | шоколад
 36 | крекер
 37 | моллюск
 38 | вишня
 39 | соевое молоко
 40 | ананас
 41 | капуста
 42 | спагетти
 43 | краб
 44 | бекон
 45 | васаби
 46 | лук
 47 | пирожок
 48 | разрыхлитель
 49 | миндаль
 50 | манго
 51 | мидия
 52 | сыр
 53 | виноград
 54 | телятина
 55 | мед
 56 | ваниль
 57 | картофель
 58 | каперс
 59 | моллюск
 60 | малина
 61 | помидор
 62 | корица
 63 | тортилья
 64 | ягненок
 65 | варенье
 66 | горчица
 67 | шербет
 68 | устрица
 69 | хлеб
 70 | банан
 71 | спагетти
 72 | лапша
 73 | яйцо
 74 | утка
 75 | халапеньо
 76 | майонез
 77 | ячмень
 78 | окра
 79 | зеленый лук
 80 | огурец
 81 | репа
 82 | фейхоа
 83 | омлет
 84 | дыня
 85 | хурма
 86 | вино
 87 | йогурт
 88 | грибы
 89 | баранина
 90 | булгур
 91 | лосось
 92 | сливочное масло
 93 | черника
 94 | сельдерей
 95 | патока
 96 | тесто
 97 | сахар
 98 | яблоко
 99 | гусь
100 | перец чили
101 | чипсы
102 | зелень
103 | манная крупа
104 | мисо
105 | макароны
106 | кофе
107 | вода
108 | соленый огурец
109 | лук-порей
110 | какао
111 | персик
112 | овсяные хлопья
113 | маргарин
114 | овощи
115 | соус
116 | отруби
117 | тыква
118 | ягоды
119 | рис
120 | листовая капуста
121 | папайя
122 | горох
123 | ветчина
124 | сироп
125 | сок
126 | апельсин
127 | чернослив
128 | брокколи
129 | кунжут
130 | взбитые сливки
131 | сметана
132 | мороженое
133 | свекла
134 | вафли
135 | гренки
136 | масло
137 | кокосовое масло
138 | авокадо
139 | тако
140 | смородина
141 | водоросли
142 | морковь
143 | редис
144 | абрикос
145 | изюм
146 | паштет
147 | салями
148 | брюссельская капуста
149 | форель
150 | лайм
151 | хумус
152 | чечевица
153 | гребешок
154 | салат
155 | домашняя птица
156 | гранат
157 | соевый соус
158 | грецкий орех
159 | кешью
160 | яичный желток
161 | кукуруза
162 | кукурузная мука
163 | миндальное молоко
164 | арахисовое масло
165 | семена чиа
166 | болгарский перец
167 | смесь для кекса
168 | смесь для пудинга
169 | кокос
170 | крабовое мясо
171 | сливочный сыр
172 | маффин
173 | желе
174 | лед
175 | творог
176 | пекан
177 | пищевая сода
178 | кефир
179 | цуккини
180 | баклажан
181 | конфеты
182 | булочка
183 | алкоголь
184 | пиво
185 | лимонад
186 | кетчуп
187 | имбирь


--------------------------------------------------------------------------------
/pipeline/utils/metrics.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import torch
 3 | import torch.nn as nn
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # use the GPU when CUDA is available, otherwise the CPU
 5 | 
 6 | 
 7 | def update_error_types(error_types, y_pred, y_true):
 8 |   error_types['tp_i'] += (y_pred * y_true).sum(0).cpu().data.numpy()
 9 |   error_types['fp_i'] += (y_pred * (1-y_true)).sum(0).cpu().data.numpy()
10 |   error_types['fn_i'] += ((1-y_pred) * y_true).sum(0).cpu().data.numpy()
11 |   error_types['tn_i'] += ((1-y_pred) * (1-y_true)).sum(0).cpu().data.numpy()
12 | 
13 |   error_types['tp_all'] += (y_pred * y_true).sum().item()
14 |   error_types['fp_all'] += (y_pred * (1-y_true)).sum().item()
15 |   error_types['fn_all'] += ((1-y_pred) * y_true).sum().item()
16 | 
def label2onehot(labels, pad_value=None):
    """Turn padded label sequences into multi-hot vectors.

    :param labels: integer tensor indexed as (row, position) holding class ids
    :param pad_value: largest id that can occur; defaults to
                      ``len(word2id) - 1``, resolved at call time so that
                      importing this module does not require ``word2id``
    :returns: float tensor with ``pad_value - 1`` columns per row: ids ``0``
              and ``pad_value`` are dropped and the first remaining column is
              zeroed
    """
    if pad_value is None:
        pad_value = len(word2id) - 1

    # Allocate on the device of ``labels`` so scatter_ never mixes devices.
    one_hot = torch.zeros(labels.size(0), labels.size(1), pad_value + 1,
                          dtype=torch.float, device=labels.device)
    one_hot.scatter_(2, labels.unsqueeze(2), 1)
    # Collapse the sequence dimension: 1 wherever an id occurs at least once.
    one_hot, _ = one_hot.max(dim=1)
    # remove pad and eos position
    one_hot = one_hot[:, 1:-1]
    one_hot[:, 0] = 0

    return one_hot
28 | 
def compute_metrics(ret_metrics, error_types, metric_names, eps=1e-10, weights=None):
    """Append the requested metrics, computed from accumulated counts.

    :param ret_metrics: mapping of metric name to list; must already provide a
                        list for every requested name except
                        ``f1_ingredients``, which is created on demand
    :param error_types: counters filled by ``update_error_types``
    :param metric_names: any of ``accuracy``, ``jaccard``, ``dice``, ``f1``
    :param eps: small constant guarding against division by zero
    :param weights: optional per-class weights for the averaged per-class F1
    """
    if 'accuracy' in metric_names:
        ret_metrics['accuracy'].append(np.mean((error_types['tp_i'] + error_types['tn_i']) / (error_types['tp_i'] + error_types['fp_i'] + error_types['fn_i'] + error_types['tn_i'])))
    if 'jaccard' in metric_names:
        ret_metrics['jaccard'].append(error_types['tp_all'] / (error_types['tp_all'] + error_types['fp_all'] + error_types['fn_all'] + eps))
    if 'dice' in metric_names:
        # Dice = 2TP / (2TP + FP + FN). The original doubled FP and FN as
        # well, which made the value identical to Jaccard.
        ret_metrics['dice'].append(2*error_types['tp_all'] / (2*error_types['tp_all'] + error_types['fp_all'] + error_types['fn_all'] + eps))
    if 'f1' in metric_names:
        # Per-class F1, averaged (optionally weighted) over classes.
        pre = error_types['tp_i'] / (error_types['tp_i'] + error_types['fp_i'] + eps)
        rec = error_types['tp_i'] / (error_types['tp_i'] + error_types['fn_i'] + eps)
        f1_perclass = 2*(pre * rec) / (pre + rec + eps)
        ret_metrics.setdefault('f1_ingredients', []).append(np.average(f1_perclass, weights=weights))

        # F1 over the pooled counts of all classes.
        pre = error_types['tp_all'] / (error_types['tp_all'] + error_types['fp_all'] + eps)
        rec = error_types['tp_all'] / (error_types['tp_all'] + error_types['fn_all'] + eps)
        f1 = 2*(pre * rec) / (pre + rec + eps)
        ret_metrics['f1'].append(f1)


--------------------------------------------------------------------------------
/ReadME.txt:
--------------------------------------------------------------------------------
 1 | DISH ID 2020_final.pptx - презентация с описанием проекта Dish-ID (https://drive.google.com/file/d/1_Fg-ezBY7KnFsHnxjUJ8bHIi7i60mYGM/view?usp=sharing)
 2 | 
 3 | Итого: 1) данные с картинками и ингредиентами к ним
 4 |        2) данные с рецептами
 5 |        3) бот, который крутится на сервере
 6 |        4) pipeline от и до
 7 |        5) обученная модель по выделению ингредиентов на фотографии
 8 |        6) ноутбуки по обучению моделей
 9 |        7) скрипты с матчингом
10 | 
11 | data/ #папка с данными, которые удалось собрать - https://drive.google.com/drive/folders/1_3nfYMJH6fbME6c_bt68woR6Ia8hcRhQ?usp=sharing
12 | 	AllRecipes_images.zip  #данные с сайта allrecipes.com
 13 |                                #для обучения модели по выделению ингредиентов по фотографии
14 |                                #данные содержат изображения и ингредиенты
15 | 
16 | 	recipes_final_povar_ru.pkl #рецепты с сайта povar.ru
17 |                                    #датасет содержит данные о названиях, ингредиентах и рецептах блюд
18 | 
19 | 	eda_all_recipes.csv #рецепты с сайта eda.ru
20 |                                    #датасет содержит данные о названиях, ингредиентах и рецептах блюд
21 | 	
22 | 	recipes_all.csv #объединенные данные с eda.ru и povar.ru
23 |                         #на этом датасете нужно тестировать gold
24 | 
25 | 	eda_vectors.pkl #данные с eda.ru с заранее вычисленными эмбеддингами 
26 |                         #word2vec, fasttext, elmo
27 |                         
28 | 	gold.xlsx #золотой стандарт для проверки матчинга 
29 |                   #если в колонке priority - 10, рецепт нерелевантен
30 | 
31 | 	ingredients.json #словарь перевода ингредиентов с английского на русский
32 | 	dishes.json      #словарь перевода названий блюд с английского на русский
33 | 
34 | models/ #обученные модели по выделению ингредиентов на фотографии
35 |         model_encoder_classifier_best.pth #модель, которая используется в боте 
36 |                                           #лучшая из полученных моделей архитектуры encoder->classifier
37 | 					  # Можно найти по ссылке https://drive.google.com/file/d/1CKTwhkTrEJzU69wI11PdmX047p4ULCND/view?usp=sharing
38 | 
39 | bot_final_version/ #папка с ботом @dish_id_bot
40 |                    #в папке также продублированы веса model_encoder_classifier_best.pth (по ссылке https://drive.google.com/file/d/1CKTwhkTrEJzU69wI11PdmX047p4ULCND/view?usp=sharing),
41 |                    #чтобы было меньше трудностей с путями
42 | 
43 | pipeline/ #папка с pipeline от картинки до предсказания рецептов
44 | 	chefnet_dense.ipynb #ноутбук по обучению модели на данных AllRecipes_images.zip
 45 |         word_models.ipynb   #ноутбук с пайплайном по подбору рецептов
46 |                             #можно использовать веса обученной модели models/model_encoder_classifier_best.pth
47 | 
48 | scripts/ #папка со скриптами
49 |   	matching_povar.py  #алгоритм подбора рецептов по ингредиентам на данных recipes_final_povar_ru.pkl
50 | 	povar_scrapping.py #скрэппер сайта povar.ru
51 | 
52 | 
53 | 
54 | 


--------------------------------------------------------------------------------
/scripts/matching_povar.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | import pickle as pkl
  4 | 
  5 | from sklearn.feature_extraction.text import TfidfVectorizer
  6 | import pymorphy2
  7 | 
orph = pymorphy2.MorphAnalyzer()  # shared morphological analyzer used to get normal forms

FIT_COLUMN = "spl"  # column the TF-IDF vectorizer is fitted on
PREDICT_COLUMN = "not_ingridients"  # column transformed into query vectors
N_BEST = 10  # number of matches kept per query row
 13 | 
 14 | 
 15 | # NOTE: Есть такая штука как docstrings (ко всем функциям добавить)
 16 | # Не стоит писать странные "это нужно", объясни сразу что идет на вход (вместе с типом) функции, а что на выход
 17 | def splinst(inst):
 18 |     """
 19 |     это функция, которая принимает на вход текст инструкции и
 20 |     дает обратно текст с нормальными формами слов.
 21 |     Ну и чистит всякие предлоги, цифры и т.д. по возможности.
 22 | 
 23 |     :param array inst: описание что на вход
 24 |     :return: что возвращает в каком типе данных прописать
 25 |     """
 26 |     I = " ".join(inst)  
 27 |     I = I.split(" ")
 28 | 
 29 |     for i in range(len(I)):
 30 |         I[i] = I[i].lower()
 31 |         if len(I[i]) > 2:
 32 |             if ord(I[i][-1]) < 1072 or ord(I[i][-1]) > 1103:
 33 |                 I[i] = I[i][:-1]
 34 |             if ord(I[i][0]) < 1072 or ord(I[i][0]) > 1103:
 35 |                 I[i] = I[i][1:]
 36 | 
 37 |             p = orph.parse(I[i])[0]
 38 |             I[i] = p.normal_form
 39 |             if I[i] == None:
 40 |                 I[i] = ""
 41 |         else:
 42 |             I[i] = ""
 43 |     return " ".join(I)
 44 | 
 45 | 
 46 | def create_text_for_vectorizer(lst_base):
 47 |     lst = lst_base.copy()
 48 |     for i in range(len(lst_base)):
 49 |         lst.extend([lst_base[i]] * int(max(6 // (0.5 * i + 1), 1)))
 50 |     return " ".join(["".join(elem.split()) for elem in lst])
 51 | 
 52 | 
 53 | def prepare_dataset(df):
 54 |     df = df.copy()
 55 | 
 56 |     df.index = [i for i in range(len(df))]
 57 |     df["splinstr"] = df.instructions.apply(splinst)
 58 | 
 59 |     df["nor_ingridients"] = df.pure_ingridients.apply(
 60 |         lambda x: list(map(lambda y: orph.parse(y)[0].normal_form, x))
 61 |     )
 62 |     df["not_ingridients"] = df.nor_ingridients.apply(
 63 |         lambda x: create_text_for_vectorizer(x)
 64 |     )
 65 | 
 66 |     df["splinctr2"] = df.splinstr.apply(lambda x: x.split(" "))
 67 | 
 68 |     spl = []  # запихнуть все в листы как мудак -- могу умею практикую
 69 |     for i in range(len(df)):
 70 |         dd = df.splinstr[i].split(" ")
 71 |         # for s in df2.splinstr[i]:
 72 |         #    dd = dd + ' ' + s
 73 |         # dd = dd.split(' ')
 74 |         nn = df.nor_ingridients[i]
 75 |         # for a in df2.nor_ingridients[i]:
 76 |         #    nn = nn + a
 77 |         sss = []
 78 |         # print (list(df2.nor_ingridients[i]))
 79 |         for j in range(len(dd)):
 80 |             # print (dd[j])
 81 |             if dd[j] in list(nn):
 82 |                 sss.append(dd[j])
 83 |         spl.append(sss)
 84 |         # df2.splinctr2[i] = sss
 85 |     df["spl"] = spl  # нормализованные данные (по упоминанию слов в тексте рецепта)
 86 | 
 87 | 
 88 |     return df
 89 | 
 90 | 
 91 | # для каждого элемента тестовой выборки берем лучшие мэтчи из трэйна
 92 | def get_best_matches(array, n):
 93 |     #     print(array[0])
 94 |     x = np.argsort(np.array(array))
 95 |     #     print(x)
 96 |     return x[-n:]
 97 | 
 98 | 
 99 | def fit_predict(train, test):
100 |     vectorizer = TfidfVectorizer(min_df=2)
101 | 
102 |     train_tfidf = vectorizer.fit_transform(
103 |         train[FIT_COLUMN].apply(
104 |             lambda x: " ".join(["".join(elem.split()) for elem in x])
105 |         )
106 |     )
107 |     test_tfidf = vectorizer.transform(
108 |         test[PREDICT_COLUMN]
109 |     )  # .apply(lambda x: " ".join(x)))
110 | 
111 |     #     display(test_tfidf.todense())
112 |     #     print(train_tfidf.shape, test_tfidf.shape)
113 |     result = np.array(test_tfidf).dot(np.array(train_tfidf.T))
114 |     result = pd.DataFrame(np.array(result.todense()))
115 |     best_matches = result.apply(lambda x: get_best_matches(x, N_BEST), axis=1)
116 |     return best_matches
117 | 
118 | 
# Script entry point: load the recipes pickle, hold out 100 shuffled rows as
# queries and write their best TF-IDF matches to best_matches.csv.
if __name__ == "__main__":
    # NOTE(review): pickle.load can execute arbitrary code from the file;
    # only load a trusted recipes_final_povar_ru.pkl.
    with open("recipes_final_povar_ru.pkl", "rb") as f:
        df = pkl.load(f)
    df_norm = prepare_dataset(df)

    # frac=1 samples every row, i.e. shuffles the table.
    train_base = df_norm.sample(frac=1).reset_index()

    train = train_base[100:].reset_index()
    test = train_base[:100]

    best_matches = fit_predict(train, test)
    # Replace the matched train positions with the recipe names.
    best_matches = pd.DataFrame(
        best_matches.apply(lambda x: [train["name"][i] for i in x])
    )
    best_matches.index = test.name
    best_matches.columns = ["names"]

    # Spread each list of names over one column per match.
    final_best_matches = pd.DataFrame(
        best_matches.names.tolist(), index=best_matches.index
    )
    final_best_matches.to_csv("best_matches.csv")
140 | 
141 | 
142 | 


--------------------------------------------------------------------------------
/pipeline/preprocess.py:
--------------------------------------------------------------------------------
  1 | def clean_token(token, misc):
  2 |     """
  3 |     :param token:
  4 |     :param misc:
  5 |     :return:
  6 |     """
  7 |     out_token = token.strip().replace(' ', '')
  8 |     if token == 'Файл' and 'SpaceAfter=No' in misc:
  9 |         return None
 10 |     return out_token
 11 | 
 12 | 
 13 | def clean_lemma(lemma, pos, lowercase=True):
 14 |     """
 15 |     :param lemma:
 16 |     :param pos:
 17 |     :return:
 18 |     """
 19 |     out_lemma = lemma.strip().replace(' ', '').replace('_', '')
 20 |     if lowercase:
 21 |         out_lemma = out_lemma.lower()
 22 |     if '|' in out_lemma or out_lemma.endswith('.jpg') or out_lemma.endswith('.png'):
 23 |         return None
 24 |     if pos != 'PUNCT':
 25 |         if out_lemma.startswith('«') or out_lemma.startswith('»'):
 26 |             out_lemma = ''.join(out_lemma[1:])
 27 |         if out_lemma.endswith('«') or out_lemma.endswith('»'):
 28 |             out_lemma = ''.join(out_lemma[:-1])
 29 |         if out_lemma.endswith('!') or out_lemma.endswith('?') or out_lemma.endswith(',') \
 30 |                 or out_lemma.endswith('.'):
 31 |             out_lemma = ''.join(out_lemma[:-1])
 32 |     return out_lemma
 33 | 
 34 | def num_replace(word):
 35 |     newtoken = 'x' * len(word)
 36 |     nw = newtoken + '_NUM'
 37 |     return nw
 38 | 
 39 | def process(pipeline, text='Строка', keep_pos=True, keep_punct=False):
 40 |     entities = {'PROPN'}
 41 |     named = False
 42 |     memory = []
 43 |     mem_case = None
 44 |     mem_number = None
 45 |     tagged_propn = []
 46 | 
 47 |     # обрабатываем текст, получаем результат в формате conllu:
 48 |     processed = pipeline.process(text)
 49 | 
 50 |     # пропускаем строки со служебной информацией:
 51 |     content = [l for l in processed.split('\n') if not l.startswith('#')]
 52 | 
 53 |     # извлекаем из обработанного текста леммы, тэги и морфологические характеристики
 54 |     tagged = [w.split('\t') for w in content if w]
 55 | 
 56 |     for t in tagged:
 57 |         if len(t) != 10:
 58 |             continue
 59 |         (word_id, token, lemma, pos, xpos, feats, head, deprel, deps, misc) = t
 60 |         token = clean_token(token, misc)
 61 |         lemma = clean_lemma(lemma, pos)
 62 |         if not lemma or not token:
 63 |             continue
 64 |         if pos in entities:
 65 |             if '|' not in feats:
 66 |                 tagged_propn.append('%s_%s' % (lemma, pos))
 67 |                 continue
 68 |             morph = {el.split('=')[0]: el.split('=')[1] for el in feats.split('|')}
 69 |             if 'Case' not in morph or 'Number' not in morph:
 70 |                 tagged_propn.append('%s_%s' % (lemma, pos))
 71 |                 continue
 72 |             if not named:
 73 |                 named = True
 74 |                 mem_case = morph['Case']
 75 |                 mem_number = morph['Number']
 76 |             if morph['Case'] == mem_case and morph['Number'] == mem_number:
 77 |                 memory.append(lemma)
 78 |                 if 'SpacesAfter=\\n' in misc or 'SpacesAfter=\s\\n' in misc:
 79 |                     named = False
 80 |                     past_lemma = '::'.join(memory)
 81 |                     memory = []
 82 |                     tagged_propn.append(past_lemma + '_PROPN ')
 83 |             else:
 84 |                 named = False
 85 |                 past_lemma = '::'.join(memory)
 86 |                 memory = []
 87 |                 tagged_propn.append(past_lemma + '_PROPN ')
 88 |                 tagged_propn.append('%s_%s' % (lemma, pos))
 89 |         else:
 90 |             if not named:
 91 |                 if pos == 'NUM' and token.isdigit():  # Заменяем числа на xxxxx той же длины
 92 |                     lemma = num_replace(token)
 93 |                 tagged_propn.append('%s_%s' % (lemma, pos))
 94 |             else:
 95 |                 named = False
 96 |                 past_lemma = '::'.join(memory)
 97 |                 memory = []
 98 |                 tagged_propn.append(past_lemma + '_PROPN ')
 99 |                 tagged_propn.append('%s_%s' % (lemma, pos))
100 | 
101 |     if not keep_punct:
102 |         tagged_propn = [word for word in tagged_propn if word.split('_')[1] != 'PUNCT']
103 |     if not keep_pos:
104 |         tagged_propn = [word.split('_')[0] for word in tagged_propn]
105 |     return tagged_propn
106 | 
107 | from ufal.udpipe import Model, Pipeline
108 | import os
109 | import re
110 | 
def get_pipeline(modelfile):
    """Load a UDPipe model and wrap it into a processing pipeline.

    If ``modelfile`` does not exist, the SynTagRus model is downloaded from
    rusvectores.org and stored under exactly that path, so the load below
    reads the file that was just fetched.

    Args:
        modelfile: path of the UDPipe model file on disk.

    Returns:
        A ``Pipeline`` built with the 'tokenize' input format, default tagger
        and parser settings, and 'conllu' output.

    Raises:
        RuntimeError: if the model file cannot be loaded.
    """
    print('\nLoading the model...', file=sys.stderr)
    if not os.path.isfile(modelfile):
        udpipe_model_url = 'https://rusvectores.org/static/models/udpipe_syntagrus.model'
        print('UDPipe model not found. Downloading...', file=sys.stderr)
        # Save under the requested path; without ``out`` the file lands in the
        # current directory under the URL's basename and the load would miss it.
        wget.download(udpipe_model_url, out=modelfile)
    ufal_model = Model.load(modelfile)
    if ufal_model is None:
        # Model.load signals failure by returning None instead of raising.
        raise RuntimeError('Cannot load UDPipe model from %s' % modelfile)
    return Pipeline(ufal_model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')
121 | 
def tag_ud(text='Текст нужно передать функции в виде строки!', process_pipeline=process_pipeline, keep_pos=True):
    """Run ``process`` on every item of ``text`` and return the joined results.

    ``text`` is iterated (with a tqdm progress bar), so it is expected to be a
    collection of strings; a plain ``str`` would be walked character by
    character.

    Args:
        text: iterable whose items are passed to ``process`` one at a time.
        process_pipeline: pipeline object forwarded to ``process``.
        keep_pos: forwarded to ``process`` unchanged.

    Returns:
        A list with one space-joined string per input item.
    """
    return [
        ' '.join(process(process_pipeline, text=item, keep_pos=keep_pos))
        for item in tqdm(text)
    ]


--------------------------------------------------------------------------------
/scripts/povar_scrapping.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import pickle as pkl
  3 | from selenium.webdriver import Chrome
  4 | from selenium import webdriver
  5 | from selenium.webdriver.common.by import By
  6 | from selenium.webdriver.support.ui import WebDriverWait
  7 | from selenium.webdriver.support import expected_conditions
  8 | # import wget
  9 | 
 10 | #####################
 11 | ##### Поставить путь до драйвера вашего браузера
 12 | driver = Chrome(executable_path="/path/to/chromedriver.exe")
 13 | option = webdriver.ChromeOptions()
 14 | option.add_argument(" — incognito")
 15 | 
 16 | BASE = 'https://povar.ru/list/'
 17 | 
 18 | #####################
 19 | ##### Поставить срез категорий (между 0 и 271), которые будут скрепиться
 20 | ### У меня 1 категория = 3-4 минуты скрепинга (35-40 рецептов)
 21 | ##### Поставить путь до директории сохранения данных
 22 | MIN_CATEGS = 150
 23 | MAX_CATEGS = 271
 24 | DIR_NAME = '/path/to/save/data'
 25 | # 271 - это количество категорий, которые нужно заскреппить
 26 | # NUM_CATEGS = [i.text for i in categories].index('Соленое') + 1
 27 | 
 28 | 
 29 | def main():
 30 |     inds = []
 31 |     titles = []
 32 |     ingridient_lists = []
 33 |     img_urls = []
 34 |     step_lists = []
 35 |     links = []
 36 | 
 37 |     driver.get(BASE)
 38 |     # categories = driver.find_elements_by_css_selector(".ingredientItem div a")
 39 |     # print([i.text for i in categories])
 40 |     for categ_num in range(MIN_CATEGS, MAX_CATEGS):
 41 |         # Чекаем, есть ли вообще ссылки на категории
 42 |         try:
 43 |             WebDriverWait(driver, 8).until(
 44 |                 expected_conditions.presence_of_element_located(
 45 |                 (By.CSS_SELECTOR, ".ingredientItemH2")))
 46 |         except:
 47 |             print('no-category')
 48 |             continue
 49 | 
 50 |         # Переходим по найденной ссылке на категорию
 51 |         try:
 52 |             driver.find_elements_by_css_selector('.ingredientItem div a')[categ_num].click()
 53 |         except:
 54 |             continue
 55 | 
 56 |         # Пытаемся не упасть в ситуации, когда рецептов меньше 40 (не целая страница)
 57 |         try:
 58 |             ttl_recipes = WebDriverWait(driver, 4).until(
 59 |                 expected_conditions.presence_of_element_located(
 60 |                 (By.CSS_SELECTOR, ".total")))
 61 |             num_recipes = int(ttl_recipes.text.split()[-1])
 62 |         except:
 63 |             driver.get(BASE)
 64 |             print('less than 40 recipes')
 65 |             continue
 66 |         print(f"Рецептов в категории: {num_recipes}")
 67 | 
 68 |         categ_url = driver.current_url
 69 |         for page_num in range(2, min(num_recipes // 40 + 1, 11)):
 70 |             try:
 71 |                 driver.get(categ_url + str(page_num))
 72 |             except:
 73 |                 print('page is not clicked')
 74 |                 driver.get(categ_url)
 75 |                 continue
 76 |         # На одной странице максимум 40 рецептов
 77 |             for recipe_num in range(40):
 78 | 
 79 |                 # Ждем открытия страницы и переходим на страничку с рецептом
 80 |                 try:
 81 |                     WebDriverWait(driver, 4).until(
 82 |                         expected_conditions.presence_of_element_located(
 83 |                             (By.CSS_SELECTOR, ".listRecipieTitle")))
 84 |                     driver.find_elements_by_css_selector(".listRecipieTitle")[recipe_num].click()
 85 |                 except:
 86 |                     print('skipped_1')
 87 |                     # Если вдруг оказались на неправильной странице, надо попробовать обратно зайти, и потом уже забить
 88 |                     driver.get(categ_url + str(page_num))
 89 |                     continue
 90 | 
 91 |                 # Качаем название и ингридиенты
 92 |                 try:
 93 |                     title = WebDriverWait(driver, 4).until(
 94 |                         expected_conditions.presence_of_element_located(
 95 |                             (By.CSS_SELECTOR, ".detailed"))).text.strip()
 96 |                 except:
 97 |                     title = None
 98 |                 try:
 99 |                     ingridients = [elem.text.strip() for elem
100 |                                    in driver.find_elements_by_css_selector('.detailed_ingredients li')]
101 |                 except:
102 |                     ingridients = None
103 |                 try:
104 |                     steps = [elem.text.strip() for elem in
105 |                              driver.find_elements_by_css_selector(".detailed_step_description_big")]
106 |                 except:
107 |                     steps = None
108 |                 cur_url = driver.current_url
109 | 
110 | 
111 |                 # Качаем фотку
112 |                 try:
113 |                     img_src = WebDriverWait(driver, 5).until(
114 |                         expected_conditions.presence_of_element_located(
115 |                         (By.CSS_SELECTOR, ".bigImgBox img"))).get_attribute('src')
116 |                 except:
117 |                     img_src = None
118 | 
119 |                 # Тут еще можно саму фотку скачать, но это вроде не особо нужно
120 |                 #     print(img_src)
121 |                 #     try:
122 |                 #         wget.download(img_src, DIR_NAME + f'{categ_num}_{recipe_num}.png')
123 |                 #     except:
124 |                 #         driver.refresh()
125 |                 #         print('refreshing')
126 |                 #         img_src = WebDriverWait(driver, 2).until(
127 |                 #             expected_conditions.presence_of_element_located(
128 |                 #                 (By.CSS_SELECTOR, ".bigImgBox img"))).get_attribute('src')
129 |                 #         print('img_src')
130 |                 #         try:
131 |                 #             wget.download(img_src, DIR_NAME + f'{categ_num}_{recipe_num}.png')
132 |                 #         except:
133 |                 #             pass
134 | 
135 |                 # Сохраняем в базу данные
136 |                 inds.append(f'{categ_num}_{recipe_num + 40 * page_num}')
137 |                 titles.append(title)
138 |                 img_urls.append(img_src)
139 |                 ingridient_lists.append(tuple(ingridients))
140 |                 step_lists.append(tuple(steps))
141 |                 links.append(cur_url)
142 | 
143 |                 driver.get(categ_url + str(page_num))
144 | 
145 |         # Делаем бэкап, вдруг что пойдет не так
146 |         df = pd.DataFrame([inds, titles, ingridient_lists, img_urls, step_lists, links]).transpose().set_index(0)
147 |         if categ_num == 99:
148 |             with open(DIR_NAME + 'backup_100.pkl', 'wb') as f:
149 |                 pkl.dump(df, f, protocol=4)
150 |         else:
151 |             with open(DIR_NAME + 'backup.pkl', 'wb') as f:
152 |                 pkl.dump(df, f, protocol=4)
153 |         driver.get(BASE)
154 | 
155 |     # Закрываем драйвер и фиксируем резы в .pkl
156 |     driver.close()
157 |     df = pd.DataFrame([inds, titles, ingridient_lists, img_urls, step_lists, links]).transpose().set_index(0)
158 |     with open(DIR_NAME + 'backup.pkl', 'wb') as f:
159 |         pkl.dump(df, f, protocol=4)
160 | 
161 | 
if __name__ == '__main__':
    main()
    # 01:19 - scraping start time (author's note).
    # Reload the pickle written by main() and print its dimensions.
    backup_path = DIR_NAME + 'backup.pkl'
    with open(backup_path, 'rb') as backup_file:
        scraped = pkl.load(backup_file)
    print(scraped.shape)


--------------------------------------------------------------------------------
/bot_final_vesion/dish_id_bot.py:
--------------------------------------------------------------------------------
# To-do
# # Move everything into a separate working pipeline
# # Hook up image recognition
# # Hook up ranking
# # Hook up logging
  6 | 
  7 | 
  8 | import aiohttp
  9 | import io
 10 | import logging
 11 | import os
 12 | import time
 13 | import numpy as np
 14 | import pandas as pd
 15 | import pickle as pkl
 16 | import typing as tp
 17 | 
 18 | from PIL import Image
 19 | from aiogram import Bot, types
 20 | from aiogram.utils import executor
 21 | from aiogram.dispatcher import Dispatcher
 22 | from aiogram.utils.helper import Helper, HelperMode, ListItem
 23 | from aiogram.contrib.fsm_storage.memory import MemoryStorage
 24 | from aiogram.types import ReplyKeyboardRemove, \
 25 |     ReplyKeyboardMarkup, KeyboardButton, \
 26 |     InlineKeyboardMarkup, InlineKeyboardButton
 27 | from aiofiles import os as aio_os
 28 | from torchvision import transforms
 29 | from model import init_model, predict_image
 30 | 
 31 | 
 32 | N_BEST = 5
 33 | best_recipes = dict()
 34 | TYPE = 'image'
 35 | 
 36 | 
 37 | # bot = Bot(token=os.environ.get('TOKEN', None),
 38 | TOKEN = 'TOKEN'
 39 | bot = Bot(token=TOKEN)
 40 | dp = Dispatcher(bot, storage=MemoryStorage())
 41 | 
 42 | 
 43 | init_model()
 44 | logging.info("Model was init")
 45 | logging.basicConfig(filename='log.txt',
 46 |                     filemode='a',
 47 |                     format='%(asctime)s, %(msecs) d %(name)s %(levelname) s %(message) s',
 48 |                     datefmt='%H:%M:%S',
 49 |                     level=logging.INFO)
 50 | 
 51 | inline_keyboard_markup = types.InlineKeyboardMarkup()
 52 | inline_keyboard_markup.add(types.InlineKeyboardButton('Применить стиль Сезанна', callback_data='sezanne'))
 53 | 
 54 | 
class TestStates(Helper):
    # Conversation states of the bot. Handlers in this file select them both by
    # position (TestStates.all()[i]) and by name (TEST_STATE_i[0]), so the
    # declaration order below must not change.
    # NOTE(review): assumes all() returns the items in declaration order - confirm.
    mode = HelperMode.snake_case

    # Set by the /start and /help handlers.
    TEST_STATE_0 = ListItem()
    # Set by first_message and after a chosen recipe has been shown.
    TEST_STATE_1 = ListItem()
    # Set by match_recipes once recipes have been proposed.
    TEST_STATE_2 = ListItem()
 61 | 
 62 | 
 63 | button0 = KeyboardButton('/help')
 64 | buttons = [KeyboardButton('1')]
 65 | buttons.append(KeyboardButton('2'))
 66 | buttons.append(KeyboardButton('3'))
 67 | buttons.append(KeyboardButton('4'))
 68 | buttons.append(KeyboardButton('5'))
 69 | markup0 = ReplyKeyboardMarkup(resize_keyboard=True,
 70 |                               one_time_keyboard=True).add(
 71 |     button0)
 72 | markup1 = ReplyKeyboardMarkup(resize_keyboard=True,
 73 |                               one_time_keyboard=True).row(
 74 |     *buttons[:2])
 75 | markup5 = ReplyKeyboardMarkup(resize_keyboard=True,
 76 |                               one_time_keyboard=True).row(
 77 |     *buttons)
 78 | 
 79 | 
 80 | # Обработка 3 команд - старт, хелп и тим
 81 | @dp.message_handler(commands=['start'])
 82 | async def send_welcome(message: types.Message) -> None:
 83 |     state = dp.current_state(user=message.from_user.id)
 84 |     await state.set_state(TestStates.all()[0])
 85 |     await message.answer("Привет, {}!\n".format(message.from_user.first_name) +
 86 |                          "Я бот, который поможет тебе найти рецепт для блюда из фотографии")
 87 | 
 88 | 
 89 | @dp.message_handler(commands=['help'])
 90 | async def send_help(message: types.Message) -> None:
 91 |     logging.info("User asked for help")
 92 |     state = dp.current_state(user=message.from_user.id)
 93 |     await state.set_state(TestStates.all()[0])
 94 |     await message.answer("Нужна помощь? Решение очень простое!\n" +
 95 |                          "Просто отправь фотку блюда и алгоритм подберет оптимальный рецепт")
 96 | 
 97 | 
@dp.message_handler(state=TestStates.all(), commands=['start'])
async def send_welcome_state(message: types.Message) -> None:
    """Handle /start for users in any of the TestStates states by delegating to send_welcome."""
    await send_welcome(message)
101 | 
102 | 
@dp.message_handler(state=TestStates.all(), commands=['help'])
async def send_help_state(message: types.Message) -> None:
    """Handle /help for users in any of the TestStates states by delegating to send_help."""
    await send_help(message)
106 | 
107 | 
@dp.message_handler(state=TestStates.all(), content_types=['photo'])
async def handle_photo(message):
    """Save the received photo to the sender's folder and run recognition on it.

    The last entry of ``message.photo`` is stored as
    ``/home/dishid_bot/photo/<user id>/<TYPE>.jpg``, the user is notified and
    ``apply_model`` is invoked with the same message.
    """
    user_dir = f'/home/dishid_bot/photo/{message.from_user.id}'
    # makedirs(exist_ok=True) also creates a missing parent folder and avoids
    # the check-then-create race of isdir() followed by mkdir().
    os.makedirs(user_dir, exist_ok=True)
    await message.photo[-1].download(f'{user_dir}/{TYPE}.jpg')
    if TYPE == 'image':
        await bot.send_message(message.chat.id, "Отлично, картинка загружена! Сейчас скажу, какие ингредиенты я тут вижу")
    await apply_model(message)
117 | 
118 | 
@dp.message_handler(state=TestStates.TEST_STATE_1[0], commands=['run'])
async def apply_model(message: types.Message):
    """Recognise ingredients on the user's stored photo and start recipe matching.

    The photo is read from the sender's folder (the one the photo handler
    writes to, keyed by ``from_user.id``), passed through ``predict_image``
    and the result is split into one ingredient per line. If nothing was
    recognised the user is asked for a better photo and matching is skipped.
    """
    logging.info("Get image from user {}".format(message.chat.id))
    transform = transforms.Compose([
        transforms.Resize((250, 250)),
        transforms.CenterCrop((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
    ])
    # Use from_user.id: that is the folder the upload handler saves into
    # (chat.id differs from it outside of private chats).
    path_image = f'/home/dishid_bot/photo/{message.from_user.id}/{TYPE}.jpg'
    result = predict_image(path_image, transform)
    # ''.split('\n') yields [''], so blank entries must be dropped before the
    # emptiness check can ever trigger.
    res_list = [item for item in result.replace('\t', '').split('\n') if item.strip()]

    if not res_list:
        logging.info("No ingredients found")
        await message.answer("Я на фото ничего не нашел...\nПопробуй загрузить фото лучшего качества")
        return

    logging.info("Found ingredients: {}".format(", ".join(res_list)))
    await message.answer("Я выделил ингредиенты на фото.\nВот что я тут вижу: {}\n\nСейчас по этому списку подберу рецептики".format(", ".join(res_list)))

    await match_recipes(" ".join(res_list), message)
139 | 
140 | 
@dp.message_handler(state=TestStates.TEST_STATE_1[0])
async def team_1(message: types.Message, redirect: bool = False) -> None:
    """Reply to any other message in TEST_STATE_1 with a prompt to send a dish photo.

    ``redirect`` is accepted but not used by the body.
    """
    prompt = (
        "Отправь фото блюда, чтобы получить релевантный рецепт\n"
        "Если хочешь посмотреть другие рецепты по предыдущей фотографии, просто отправь её ещё раз"
    )
    await message.answer(prompt)
145 | 
146 | 
# Unpickled artifacts keyed by file path, so each file is read once per process.
_PICKLE_CACHE = {}


def _load_pickle(path):
    """Return the object pickled at ``path``, loading it only on first use."""
    if path not in _PICKLE_CACHE:
        with open(path, 'rb') as f:
            _PICKLE_CACHE[path] = pkl.load(f)
    return _PICKLE_CACHE[path]


async def match_recipes(ingridients, message):
    """Propose the N_BEST recipes most similar to the recognised ingredients.

    The ingredient string is vectorised with the stored vectorizer, scored
    against the stored TF-IDF matrix by dot product, and the top ``N_BEST``
    rows of the recipe table are remembered in ``best_recipes`` for the
    sender. The recipe names are sent as a numbered list (best match first)
    and the sender's state is switched to the third TestStates item.

    Args:
        ingridients: space-separated ingredient names.
        message: the Telegram message being answered.
    """
    t0 = time.time()
    logging.info("Matching with ingridients list: {}".format(ingridients))
    vectorizer = _load_pickle('/home/dishid_bot/vectorizer.pkl')
    train_tfidf = _load_pickle('/home/dishid_bot/botrain_tfidf.pkl')
    train = _load_pickle('/home/dishid_bot/eda_povar.pkl')

    test_tfidf = vectorizer.transform([ingridients])
    # Sparse (1 x vocab) times (vocab x recipes) -> one similarity row.
    similarity = test_tfidf.dot(train_tfidf.T)
    scores = np.asarray(similarity.todense())[0]

    # argsort is ascending: take the last N_BEST and reverse them so that the
    # best match is listed as number 1.
    best_idx = np.argsort(scores)[-N_BEST:][::-1]

    columns = ['name', 'ingridients', 'instructions', 'img_url', 'recipe_link']
    short_list = train[columns].iloc[best_idx].reset_index(drop=True)

    logging.info('Recipes matched: {}'.format(list(short_list['name'])))

    best_recipes[message.from_user.id] = short_list

    await message.answer("Выберите номер наиболее понравившегося рецепта")
    await message.answer(
        "\n".join(f"{number}. {name}\n" for number, name in enumerate(short_list['name'], start=1)),
        reply_markup=markup5)
    logging.info("Time spent from receiving the photo till proposing 5 recipes: {}".format(time.time() - t0))
    state = dp.current_state(user=message.from_user.id)
    await state.set_state(TestStates.all()[2])
179 | 
180 | 
# team 2
@dp.message_handler(content_types=['text'], state=TestStates.TEST_STATE_2[0])
async def team_2_txt(message: types.Message) -> None:
    """Show the recipe the user picked from the previously proposed list.

    Accepts a plain digit or its keycap-emoji form. When the sender has no
    stored recipes or the text is not a valid number, the user is asked to
    choose again. Otherwise the recipe is sent, the reply keyboard is removed
    and the sender's state is switched to the second TestStates item.
    """
    recipes = best_recipes.get(message.from_user.id)

    # A keycap emoji is the ASCII digit followed by U+FE0F (variation selector)
    # and/or U+20E3 (combining keycap); dropping both reduces every variant to
    # the bare digit.
    choice_text = message.text.strip().replace('\ufe0f', '').replace('\u20e3', '').strip()

    valid_choices = {}
    if recipes is not None:
        # Only numbers that actually index a stored recipe are accepted.
        valid_choices = {str(number): number for number in range(1, len(recipes) + 1)}

    flag = valid_choices.get(choice_text)
    if flag is None:
        await message.answer(
            "Немного не понял. Выбери число еще раз (кнопки ниже)")
        return
    recipe = recipes.iloc[flag - 1]
    logging.info("User chose {}".format(recipe['name']))
    # 'ingridients' and 'instructions' are sliced and split as the textual
    # form of a sequence of quoted strings.
    # NOTE(review): presumably stored as str(tuple) - verify against the data.
    await message.answer("Название: {}\n\nСсылка: {}\n\nИнгредиенты:\n{}\n\nРецепт: {}".format(
        recipe['name'], recipe['recipe_link'], "\n".join(recipe['ingridients'][2:-2].split("', '")),
        "\n\n".join(recipe['instructions'][2:-2].split("', '"))),
        reply_markup=ReplyKeyboardRemove()
    )
    state = dp.current_state(user=message.from_user.id)
    await state.set_state(TestStates.all()[1])
202 | 
@dp.message_handler()
async def state0(message: types.Message) -> None:
    """Handler registered without any filters; delegates to first_message."""
    await first_message(message)
206 | 
207 | 
@dp.message_handler(state=TestStates.TEST_STATE_0[0])
async def first_message(message: types.Message) -> None:
    """Switch the sender to the second TestStates item and send the project intro without a reply keyboard."""
    await dp.current_state(user=message.from_user.id).set_state(TestStates.all()[1])

    intro = (
        "Ты работаешь с проектом Dish-ID.\n"
        "Просто отправь мне фото блюда, и я подберу лучший рецепт"
    )
    await message.answer(intro, reply_markup=ReplyKeyboardRemove())
216 | 
217 | 
# Script entry point: start polling with the dispatcher defined in this module.
if __name__ == '__main__':
    executor.start_polling(dp)
220 | 


--------------------------------------------------------------------------------
/bot_final_vesion/en2ru_ing.json:
--------------------------------------------------------------------------------
1 | {"chicken": "\u043a\u0443\u0440\u0438\u0446\u0430", "beef": "\u0433\u043e\u0432\u044f\u0434\u0438\u043d\u0430", "turkey": "\u0438\u043d\u0434\u0435\u0439\u043a\u0430", "soft cheese": "\u043c\u044f\u0433\u043a\u0438\u0439 \u0441\u044b\u0440", "hard cheese": "\u0442\u0432\u0435\u0440\u0434\u044b\u0439 \u0441\u044b\u0440", "nut": "\u043e\u0440\u0435\u0445\u0438", "strawberry": "\u043a\u043b\u0443\u0431\u043d\u0438\u043a\u0430", "cherry tomato": "\u0442\u043e\u043c\u0430\u0442\u044b \u0447\u0435\u0440\u0440\u0438", "sausage": "\u043a\u043e\u043b\u0431\u0430\u0441\u0430", "asparagus": "\u0441\u043f\u0430\u0440\u0436\u0430", "cranberry": "\u043a\u043b\u044e\u043a\u0432\u0430", "gelatin": "\u0436\u0435\u043b\u0430\u0442\u0438\u043d", "tofu": "\u0442\u043e\u0444\u0443", "olive": "\u043e\u043b\u0438\u0432\u043a\u0438", "thyme": "\u0442\u0438\u043c\u044c\u044f\u043d", "tuna": "\u0442\u0443\u043d\u0435\u0446", "paprika": "\u043f\u0430\u043f\u0440\u0438\u043a\u0430", "cardamom": "\u043a\u0430\u0440\u0434\u0430\u043c\u043e\u043d", "red pepper": "\u043a\u0440\u0430\u0441\u043d\u044b\u0439 \u043f\u0435\u0440\u0435\u0446", "shrimp": "\u043a\u0440\u0435\u0432\u0435\u0442\u043a\u0430", "tapioca": "\u0442\u0430\u043f\u0438\u043e\u043a\u0430", "pumpkin": "\u0442\u044b\u043a\u0432\u0430", "date": "\u0444\u0438\u043d\u0438\u043a", "plum": "\u0441\u043b\u0438\u0432\u0430", "green bean": "\u0441\u0442\u0440\u0443\u0447\u043a\u043e\u0432\u0430\u044f \u0444\u0430\u0441\u043e\u043b\u044c", "bean": "\u0444\u0430\u0441\u043e\u043b\u044c", "fish": "\u0440\u044b\u0431\u0430", "lasagna": "\u043b\u0430\u0437\u0430\u043d\u044c\u044f", "lobster": "\u043b\u043e\u0431\u0441\u0442\u0435\u0440", "flour": "\u043c\u0443\u043a\u0430", "octopus": "\u043e\u0441\u044c\u043c\u0438\u043d\u043e\u0433", "cookie": "\u043f\u0435\u0447\u0435\u043d\u044c\u0435", "milk": "\u043c\u043e\u043b\u043e\u043a\u043e", "lemon": "\u043b\u0438\u043c\u043e\u043d", "chocolate": "\u0448\u043e\u043a\u043e\u043b\u0430\u0434", 
"cracker": "\u043a\u0440\u0435\u043a\u0435\u0440", "clam": "\u043c\u043e\u043b\u043b\u044e\u0441\u043a", "cherry": "\u0432\u0438\u0448\u043d\u044f", "soy milk": "\u0441\u043e\u0435\u0432\u043e\u0435 \u043c\u043e\u043b\u043e\u043a\u043e", "pineapple": "\u0430\u043d\u0430\u043d\u0430\u0441", "cabbage": "\u043a\u0430\u043f\u0443\u0441\u0442\u0430", "pasta": "\u0441\u043f\u0430\u0433\u0435\u0442\u0442\u0438", "crab": "\u043a\u0440\u0430\u0431", "bacon": "\u0431\u0435\u043a\u043e\u043d", "wasabi": "\u0432\u0430\u0441\u0430\u0431\u0438", "onion": "\u043b\u0443\u043a", "patty": "\u043f\u0438\u0440\u043e\u0436\u043e\u043a", "baking powder": "\u0440\u0430\u0437\u0440\u044b\u0445\u043b\u0438\u0442\u0435\u043b\u044c", "almond": "\u043c\u0438\u043d\u0434\u0430\u043b\u044c", "mango": "\u043c\u0430\u043d\u0433\u043e", "mussel": "\u043c\u0438\u0434\u0438\u044f", "cheese": "\u0441\u044b\u0440", "grape": "\u0432\u0438\u043d\u043e\u0433\u0440\u0430\u0434", "veal": "\u0442\u0435\u043b\u044f\u0442\u0438\u043d\u0430", "honey": "\u043c\u0435\u0434", "vanilla": "\u0432\u0430\u043d\u0438\u043b\u044c", "potato": "\u043a\u0430\u0440\u0442\u043e\u0444\u0435\u043b\u044c", "capers": "\u043a\u0430\u043f\u0435\u0440\u0441", "mollusk": "\u043c\u043e\u043b\u043b\u044e\u0441\u043a", "raspberry": "\u043c\u0430\u043b\u0438\u043d\u0430", "tomato": "\u043f\u043e\u043c\u0438\u0434\u043e\u0440", "cinnamon": "\u043a\u043e\u0440\u0438\u0446\u0430", "tortilla": "\u0442\u043e\u0440\u0442\u0438\u043b\u044c\u044f", "lamb": "\u044f\u0433\u043d\u0435\u043d\u043e\u043a", "jam": "\u0432\u0430\u0440\u0435\u043d\u044c\u0435", "mustard": "\u0433\u043e\u0440\u0447\u0438\u0446\u0430", "sherbet": "\u0448\u0435\u0440\u0431\u0435\u0442", "oyster": "\u0443\u0441\u0442\u0440\u0438\u0446\u0430", "bread": "\u0445\u043b\u0435\u0431", "banana": "\u0431\u0430\u043d\u0430\u043d", "spaghetti": "\u0441\u043f\u0430\u0433\u0435\u0442\u0442\u0438", "noodle": "\u043b\u0430\u043f\u0448\u0430", "egg": "\u044f\u0439\u0446\u043e", "duck": 
"\u0443\u0442\u043a\u0430", "jalapeno": "\u0445\u0430\u043b\u0430\u043f\u0435\u043d\u044c\u043e", "mayonnaise": "\u043c\u0430\u0439\u043e\u043d\u0435\u0437", "barley": "\u044f\u0447\u043c\u0435\u043d\u044c", "okra": "\u043e\u043a\u0440\u0430", "chives": "\u0437\u0435\u043b\u0435\u043d\u044b\u0439 \u043b\u0443\u043a", "cucumber": "\u043e\u0433\u0443\u0440\u0435\u0446", "turnip": "\u0440\u0435\u043f\u0430", "feijoa": "\u0444\u0435\u0439\u0445\u043e\u0430", "scrambled egg": "\u043e\u043c\u043b\u0435\u0442", "melon": "\u0434\u044b\u043d\u044f", "persimmon": "\u0445\u0443\u0440\u043c\u0430", "wine": "\u0432\u0438\u043d\u043e", "yogurt": "\u0439\u043e\u0433\u0443\u0440\u0442", "mushroom": "\u0433\u0440\u0438\u0431\u044b", "mutton": "\u0431\u0430\u0440\u0430\u043d\u0438\u043d\u0430", "bulgur": "\u0431\u0443\u043b\u0433\u0443\u0440", "salmon": "\u043b\u043e\u0441\u043e\u0441\u044c", "butter": "\u0441\u043b\u0438\u0432\u043e\u0447\u043d\u043e\u0435 \u043c\u0430\u0441\u043b\u043e", "blueberry": "\u0447\u0435\u0440\u043d\u0438\u043a\u0430", "celery": "\u0441\u0435\u043b\u044c\u0434\u0435\u0440\u0435\u0439", "molasses": "\u043f\u0430\u0442\u043e\u043a\u0430", "dough": "\u0442\u0435\u0441\u0442\u043e", "sugar": "\u0441\u0430\u0445\u0430\u0440", "apple": "\u044f\u0431\u043b\u043e\u043a\u043e", "goose": "\u0433\u0443\u0441\u044c", "chili pepper": "\u043f\u0435\u0440\u0435\u0446 \u0447\u0438\u043b\u0438", "chips": "\u0447\u0438\u043f\u0441\u044b", "collard": "\u0437\u0435\u043b\u0435\u043d\u044c", "semolina": "\u043c\u0430\u043d\u043d\u0430\u044f \u043a\u0440\u0443\u043f\u0430", "miso": "\u043c\u0438\u0441\u043e", "macaroni": "\u043c\u0430\u043a\u0430\u0440\u043e\u043d\u044b", "coffee": "\u043a\u043e\u0444\u0435", "water": "\u0432\u043e\u0434\u0430", "pickle": "\u0441\u043e\u043b\u0435\u043d\u044b\u0439 \u043e\u0433\u0443\u0440\u0435\u0446", "leeks": "\u043b\u0443\u043a-\u043f\u043e\u0440\u0435\u0439", "cocoa": "\u043a\u0430\u043a\u0430\u043e", "peach": 
"\u043f\u0435\u0440\u0441\u0438\u043a", "cereals": "\u043e\u0432\u0441\u044f\u043d\u044b\u0435 \u0445\u043b\u043e\u043f\u044c\u044f", "margarine": "\u043c\u0430\u0440\u0433\u0430\u0440\u0438\u043d", "vegetables": "\u043e\u0432\u043e\u0449\u0438", "sauce": "\u0441\u043e\u0443\u0441", "bran": "\u043e\u0442\u0440\u0443\u0431\u0438", "squash": "\u0442\u044b\u043a\u0432\u0430", "berry": "\u044f\u0433\u043e\u0434\u044b", "rice": "\u0440\u0438\u0441", "kale": "\u043b\u0438\u0441\u0442\u043e\u0432\u0430\u044f \u043a\u0430\u043f\u0443\u0441\u0442\u0430", "papaya": "\u043f\u0430\u043f\u0430\u0439\u044f", "pea": "\u0433\u043e\u0440\u043e\u0445", "ham": "\u0432\u0435\u0442\u0447\u0438\u043d\u0430", "syrup": "\u0441\u0438\u0440\u043e\u043f", "juice": "\u0441\u043e\u043a", "orange": "\u0430\u043f\u0435\u043b\u044c\u0441\u0438\u043d", "prune": "\u0447\u0435\u0440\u043d\u043e\u0441\u043b\u0438\u0432", "broccoli": "\u0431\u0440\u043e\u043a\u043a\u043e\u043b\u0438", "sesame": "\u043a\u0443\u043d\u0436\u0443\u0442", "whipped cream": "\u0432\u0437\u0431\u0438\u0442\u044b\u0435 \u0441\u043b\u0438\u0432\u043a\u0438", "sour cream": "\u0441\u043c\u0435\u0442\u0430\u043d\u0430", "ice cream": "\u043c\u043e\u0440\u043e\u0436\u0435\u043d\u043e\u0435", "beet": "\u0441\u0432\u0435\u043a\u043b\u0430", "waffle": "\u0432\u0430\u0444\u043b\u0438", "crouton": "\u0433\u0440\u0435\u043d\u043a\u0438", "oil": "\u043c\u0430\u0441\u043b\u043e", "coconut oil": "\u043a\u043e\u043a\u043e\u0441\u043e\u0432\u043e\u0435 \u043c\u0430\u0441\u043b\u043e", "avocado": "\u0430\u0432\u043e\u043a\u0430\u0434\u043e", "taco": "\u0442\u0430\u043a\u043e", "currant": "\u0441\u043c\u043e\u0440\u043e\u0434\u0438\u043d\u0430", "seaweed": "\u0432\u043e\u0434\u043e\u0440\u043e\u0441\u043b\u0438", "carrot": "\u043c\u043e\u0440\u043a\u043e\u0432\u044c", "radish": "\u0440\u0435\u0434\u0438\u0441", "apricot": "\u0430\u0431\u0440\u0438\u043a\u043e\u0441", "raisin": "\u0438\u0437\u044e\u043c", "pate": 
"\u043f\u0430\u0448\u0442\u0435\u0442", "salami": "\u0441\u0430\u043b\u044f\u043c\u0438", "brussels sprout": "\u0431\u0440\u044e\u0441\u0441\u0435\u043b\u044c\u0441\u043a\u0430\u044f \u043a\u0430\u043f\u0443\u0441\u0442\u0430", "trout": "\u0444\u043e\u0440\u0435\u043b\u044c", "lime": "\u043b\u0430\u0439\u043c", "hummus": "\u0445\u0443\u043c\u0443\u0441", "lentil": "\u0447\u0435\u0447\u0435\u0432\u0438\u0446\u0430", "scallop": "\u0433\u0440\u0435\u0431\u0435\u0448\u043e\u043a", "lettuce": "\u0441\u0430\u043b\u0430\u0442", "poultry": "\u0434\u043e\u043c\u0430\u0448\u043d\u044f\u044f \u043f\u0442\u0438\u0446\u0430", "pomegranate": "\u0433\u0440\u0430\u043d\u0430\u0442", "soy sauce": "\u0441\u043e\u0435\u0432\u044b\u0439 \u0441\u043e\u0443\u0441", "walnut": "\u0433\u0440\u0435\u0446\u043a\u0438\u0439 \u043e\u0440\u0435\u0445", "cashew": "\u043a\u0435\u0448\u044c\u044e", "egg yolk": "\u044f\u0438\u0447\u043d\u044b\u0439 \u0436\u0435\u043b\u0442\u043e\u043a", "corn": "\u043a\u0443\u043a\u0443\u0440\u0443\u0437\u0430", "cornmeal": "\u043a\u0443\u043a\u0443\u0440\u0443\u0437\u043d\u0430\u044f \u043c\u0443\u043a\u0430", "almond milk": "\u043c\u0438\u043d\u0434\u0430\u043b\u044c\u043d\u043e\u0435 \u043c\u043e\u043b\u043e\u043a\u043e", "peanut butter": "\u0430\u0440\u0430\u0445\u0438\u0441\u043e\u0432\u043e\u0435 \u043c\u0430\u0441\u043b\u043e", "chia seeds": "\u0441\u0435\u043c\u0435\u043d\u0430 \u0447\u0438\u0430", "bell pepper": "\u0431\u043e\u043b\u0433\u0430\u0440\u0441\u043a\u0438\u0439 \u043f\u0435\u0440\u0435\u0446", "cake mix": "\u0441\u043c\u0435\u0441\u044c \u0434\u043b\u044f \u043a\u0435\u043a\u0441\u0430", "pudding mix": "\u0441\u043c\u0435\u0441\u044c \u0434\u043b\u044f \u043f\u0443\u0434\u0434\u0438\u043d\u0433\u0430", "coconut": "\u043a\u043e\u043a\u043e\u0441", "crab meat": "\u043a\u0440\u0430\u0431\u043e\u0432\u043e\u0435 \u043c\u044f\u0441\u043e", "cream cheese": "\u0441\u043b\u0438\u0432\u043e\u0447\u043d\u044b\u0439 \u0441\u044b\u0440", "muffin": 
"\u043c\u0430\u0444\u0444\u0438\u043d", "jell": "\u0436\u0435\u043b\u0435", "ice": "\u043b\u0435\u0434", "cottage cheese": "\u0442\u0432\u043e\u0440\u043e\u0433", "pecan": "\u043f\u0435\u043a\u0430\u043d", "baking soda": "\u043f\u0438\u0449\u0435\u0432\u0430\u044f \u0441\u043e\u0434\u0430", "buttermilk": "\u043a\u0435\u0444\u0438\u0440", "zucchini": "\u0446\u0443\u043a\u043a\u0438\u043d\u0438", "eggplant": "\u0431\u0430\u043a\u043b\u0430\u0436\u0430\u043d", "candy": "\u043a\u043e\u043d\u0444\u0435\u0442\u044b", "bun": "\u0431\u0443\u043b\u043e\u0447\u043a\u0430", "alcohol": "\u0430\u043b\u043a\u043e\u0433\u043e\u043b\u044c", "beer": "\u043f\u0438\u0432\u043e", "lemonade": "\u043b\u0438\u043c\u043e\u043d\u0430\u0434", "ketchup": "\u043a\u0435\u0442\u0447\u0443\u043f", "ginger": "\u0438\u043c\u0431\u0438\u0440\u044c"}


--------------------------------------------------------------------------------