├── bot_final_vesion ├── train_tfidf.pkl ├── vectorizer.pkl ├── model.py ├── clean_ingred.txt ├── rus_clean_ingred.txt ├── dish_id_bot.py └── en2ru_ing.json ├── pipeline ├── ingred_models.py ├── inference.py ├── utils │ ├── matching.py │ └── metrics.py ├── match_models.py └── preprocess.py ├── .gitignore ├── ReadME.txt └── scripts ├── matching_povar.py └── povar_scrapping.py /bot_final_vesion/train_tfidf.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Alenush/dish_id_sirius/HEAD/bot_final_vesion/train_tfidf.pkl -------------------------------------------------------------------------------- /bot_final_vesion/vectorizer.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Alenush/dish_id_sirius/HEAD/bot_final_vesion/vectorizer.pkl -------------------------------------------------------------------------------- /pipeline/ingred_models.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from PIL import Image 3 | 4 | class IngredModel(ABC): 5 | @abstractmethod 6 | def predict(self, img_path): 7 | pass 8 | 9 | 10 | class DenseChefNet(IngredModel): 11 | def __init__(self, model, threshold=0.25): 12 | self.model = model 13 | self.threshold = threshold 14 | 15 | def predict(self, img_path): 16 | img = Image.open(img_path) 17 | img = transform_val(img).to(device).unsqueeze(0) 18 | 19 | with torch.no_grad(): 20 | ingred_pred = self.model(img) > self.threshold 21 | 22 | ingred_pred = ingred_pred.nonzero()[:, 1].tolist() 23 | ingred_pred = [id2word[ing+1] for ing in ingred_pred] 24 | ingred_pred = [en2ru[ing] for ing in ingred_pred] 25 | 26 | return ingred_pred -------------------------------------------------------------------------------- /pipeline/inference.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from torchvision import transforms, models 4 | from torch.utils.data import Dataset, DataLoader 5 | from torch.nn.utils.rnn import pack_padded_sequence, pad_sequence 6 | from collections import defaultdict 7 | import matplotlib.pyplot as plt 8 | from PIL import Image 9 | import pickle 10 | import pandas as pd 11 | import numpy as np 12 | from collections import defaultdict 13 | 14 | def predict_image(img_path, df, ingred_model, word_model, k=10, threshold=0.25): 15 | ingred_pred = ingred_model.predict(img_path) 16 | cos_sim, best_indices = word_model.most_similar(ingred_pred, k) 17 | 18 | return cos_sim, df.loc[best_indices] 19 | 20 | def main(args): 21 | img = Image.open(args.img_path) 22 | 23 | plt.figure(figsize=(12,8)) 24 | plt.imshow(img) 25 | plt.axis('off') 26 | plt.show() 27 | 28 | cos_sim, best_rows = predict_image(img_path, df, chefnet, elmo_model) 29 | 30 | for i, row in enumerate(best_rows.iterrows()): 31 | row = row[1] 32 | print(f"{i + 1}) {row['name']}") 33 | print('\t' + row['ingreds']) 34 | print(f"\t{row['url']}") 35 | -------------------------------------------------------------------------------- /pipeline/utils/matching.py: -------------------------------------------------------------------------------- 1 | from scipy.spatial.distance import cosine 2 | 3 | pos_morphy_to_vec = {'ADJF':'ADJ'} # перевод части речи из pymorphy в word2vec 4 | 5 | 6 | def match_recipes(ingred_list, recipes_df, word_model, k=10): 7 | """ 8 | Возвращает k ближайших рецептов по косинусному расстоянию. 9 | :param ingred_list: список ингедиенто 10 | :param recipes_df: датасет для матчинга 11 | :param word_model: векторная модель (word2vec, fasttext, elmo) 12 | :param k: кол-во возвращаемых рецептов 13 | :returns: k рецептов 14 | """ 15 | vectors = [] 16 | for word in ingred_list: 17 | normal_word = morph.parse(word)[0].normalized 18 | pos = normal_word.tag.POS 19 | if pos in pos_morphy_to_vec: 20 | pos = pos_morphy_to_vec[normal_word.tag.POS] 21 | word = normal_word.word 22 | if pos != None: 23 | word_pos = f'{word}_{pos}' 24 | if word_pos in word_model: 25 | vectors.append(word_model[word_pos]) 26 | 27 | mean_vector = np.mean(vectors, axis=0) 28 | cos_sim = recipes_df['word2vec_mean'].apply(lambda v: cosine(v, mean_vector)).sort_values()[:k] 29 | closest_recipes = recipes_df.loc[cos_sim.index] 30 | closest_recipes['cos_sim'] = cos_sim 31 | 32 | return closest_recipes 33 | -------------------------------------------------------------------------------- /pipeline/match_models.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from scipy.spatial.distance import cosine 3 | 4 | class MatchModel(ABC): 5 | def __init__(self, mean_recipes, word_model): 6 | self.mean_recipes = mean_recipes 7 | self.word_model = word_model 8 | 9 | @abstractmethod 10 | def mean_embedding(self, ingred_list): 11 | pass 12 | 13 | def most_similar(self, ingred_list, k): 14 | mean_vector = self.mean_embedding(ingred_list) 15 | cos_sim = self.mean_recipes.apply(lambda v: cosine(v, mean_vector)).\ 16 | sort_values()[:k] 17 | return cos_sim, cos_sim.index 18 | 19 | 20 | class Word2Vec(MatchModel): 21 | def __init__(self, mean_recipes, word_model): 22 | super().__init__(mean_recipes, word_model) 23 | 24 | def mean_embedding(self, ingred_list): 25 | words = ' '.join(tag_ud(ingred_list)).split() 26 | vectors = [self.word_model[word] for word in words] 27 | return np.mean(vectors, axis=0) 28 | 29 | class FastText(MatchModel): 30 | def __init__(self, mean_recipes, word_model): 31 | super().__init__(mean_recipes, word_model) 32 | 33 | def mean_embedding(self, ingred_list): 34 | words = ' '.join(tag_ud(ingred_list, keep_pos=False)).split() 35 | vectors = [self.word_model[word] for word in words if word in self.word_model.vocab] 36 | return np.mean(vectors, axis=0) 37 | 38 | class Elmo(MatchModel): 39 | def __init__(self, mean_recipes, word_model): 40 | super().__init__(mean_recipes, word_model) 41 | 42 | def mean_embedding(self, ingred_list): 43 | return self.word_model([ingred_list])[0] -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | .idea/ 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | .pytest_cache/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | db.sqlite3 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # SageMath parsed files 83 | *.sage.py 84 | 85 | # Environments 86 | .env 87 | .venv 88 | env/ 89 | venv/ 90 | ENV/ 91 | env.bak/ 92 | venv.bak/ 93 | 94 | # Spyder project settings 95 | .spyderproject 96 | .spyproject 97 | 98 | # Rope project settings 99 | .ropeproject 100 | 101 | # mkdocs documentation 102 | /site 103 | 104 | # mypy 105 | .mypy_cache/ 106 | 107 | # MAC 108 | .DS_Store 109 | 110 | -------------------------------------------------------------------------------- /bot_final_vesion/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torchvision import transforms, models 4 | from collections import defaultdict 5 | import matplotlib.pyplot as plt 6 | from PIL import Image 7 | device = torch.device('cpu') 8 | import json 9 | 10 | 11 | en2ru = {} 12 | with open('/home/dishid_bot/clean_ingred.txt', 'r') as f: 13 | en=f.readlines() 14 | with open('/home/dishid_bot/rus_clean_ingred.txt', 'r',encoding='utf-8') as f: 15 | ru=f.readlines() 16 | 17 | for e, r in zip(en, ru): 18 | en2ru[e.rstrip('\n')] = r.rstrip('\n') 19 | with open('/home/dishid_bot/en2ru_ing.json', 'w') as f: 20 | json.dump(en2ru, f) 21 | 22 | 23 | with open('/home/dishid_bot/clean_ingred.txt', 'r') as f: 24 | clean_ingredients = list(f.read().split('\n')) 25 | print(len(clean_ingredients)) 26 | 27 | id2word = defaultdict() 28 | for i, ingr in enumerate(clean_ingredients): 29 | id2word[i+1] = ingr 30 | 31 | rus_id2word = defaultdict() 32 | for i, ingr in enumerate(clean_ingredients): 33 | rus_id2word[i+1] = ingr 34 | 35 | word2id = {v:k for k,v in id2word.items()} 36 | def words2ids(ingreds): 37 | return [word2id[ing] for ing in ingreds] 38 | 39 | model = models.densenet161(pretrained=True) 40 | 41 | def init_model(): 42 | 43 | for param in model.parameters(): 44 | param.requires_grad = False 45 | num_feat = model.classifier.in_features 46 | 47 | model.classifier = nn.Sequential( 48 | nn.Linear(num_feat, 1024), 49 | nn.BatchNorm1d(1024), 50 | nn.ReLU(), 51 | nn.Linear(1024, 512), 52 | nn.BatchNorm1d(512), 53 | nn.ReLU(), 54 | nn.Linear(512, len(word2id)), 55 | nn.Sigmoid()) 56 | model.to(device) 57 | 58 | model.load_state_dict(torch.load("/home/dishid_bot/model_encoder_classifier_best.pth",map_location=torch.device('cpu'))) 59 | model.eval() 60 | 61 | 62 | def predict_image(path_to_img, transform): 63 | with torch.no_grad(): 64 | img = Image.open(path_to_img).convert('RGB') 65 | plt.figure(figsize=(12,8)) 66 | plt.imshow(img) 67 | plt.axis('off') 68 | plt.show() 69 | img = transform(img).to(device).unsqueeze(0) 70 | 71 | ingred_pred = model(img) > 0.25 72 | ingred_pred = ingred_pred.nonzero()[:, 1].tolist() 73 | ingred_pred = [id2word[ing + 1] for ing in ingred_pred] 74 | ingred_pred = [en2ru[ing] for ing in ingred_pred] 75 | return '\t' + '\n\t'.join(ingred_pred) -------------------------------------------------------------------------------- /bot_final_vesion/clean_ingred.txt: -------------------------------------------------------------------------------- 1 | chicken 2 | beef 3 | turkey 4 | soft cheese 5 | hard cheese 6 | nut 7 | strawberry 8 | cherry tomato 9 | sausage 10 | asparagus 11 | cranberry 12 | gelatin 13 | tofu 14 | olive 15 | thyme 16 | tuna 17 | paprika 18 | cardamom 19 | red pepper 20 | shrimp 21 | tapioca 22 | pumpkin 23 | date 24 | plum 25 | green bean 26 | bean 27 | fish 28 | lasagna 29 | lobster 30 | flour 31 | octopus 32 | cookie 33 | milk 34 | lemon 35 | chocolate 36 | cracker 37 | clam 38 | cherry 39 | soy milk 40 | pineapple 41 | cabbage 42 | pasta 43 | crab 44 | bacon 45 | wasabi 46 | onion 47 | patty 48 | baking powder 49 | almond 50 | mango 51 | mussel 52 | cheese 53 | grape 54 | veal 55 | honey 56 | vanilla 57 | potato 58 | capers 59 | mollusk 60 | raspberry 61 | tomato 62 | cinnamon 63 | tortilla 64 | lamb 65 | jam 66 | mustard 67 | sherbet 68 | oyster 69 | bread 70 | banana 71 | spaghetti 72 | noodle 73 | egg 74 | duck 75 | jalapeno 76 | mayonnaise 77 | barley 78 | okra 79 | chives 80 | cucumber 81 | turnip 82 | feijoa 83 | scrambled egg 84 | melon 85 | persimmon 86 | wine 87 | yogurt 88 | mushroom 89 | mutton 90 | bulgur 91 | salmon 92 | butter 93 | blueberry 94 | celery 95 | molasses 96 | dough 97 | sugar 98 | apple 99 | goose 100 | chili pepper 101 | chips 102 | collard 103 | semolina 104 | miso 105 | macaroni 106 | coffee 107 | water 108 | pickle 109 | leeks 110 | cocoa 111 | peach 112 | cereals 113 | margarine 114 | vegetables 115 | sauce 116 | bran 117 | squash 118 | berry 119 | rice 120 | kale 121 | papaya 122 | pea 123 | ham 124 | syrup 125 | juice 126 | orange 127 | prune 128 | broccoli 129 | sesame 130 | whipped cream 131 | sour cream 132 | ice cream 133 | beet 134 | waffle 135 | crouton 136 | oil 137 | coconut oil 138 | avocado 139 | taco 140 | currant 141 | seaweed 142 | carrot 143 | radish 144 | apricot 145 | raisin 146 | pate 147 | salami 148 | brussels sprout 149 | trout 150 | lime 151 | hummus 152 | lentil 153 | scallop 154 | lettuce 155 | poultry 156 | pomegranate 157 | soy sauce 158 | walnut 159 | cashew 160 | egg yolk 161 | corn 162 | cornmeal 163 | almond milk 164 | peanut butter 165 | chia seeds 166 | bell pepper 167 | cake mix 168 | pudding mix 169 | coconut 170 | crab meat 171 | cream cheese 172 | muffin 173 | jell 174 | ice 175 | cottage cheese 176 | pecan 177 | baking soda 178 | buttermilk 179 | zucchini 180 | eggplant 181 | candy 182 | bun 183 | alcohol 184 | beer 185 | lemonade 186 | ketchup 187 | ginger -------------------------------------------------------------------------------- /bot_final_vesion/rus_clean_ingred.txt: -------------------------------------------------------------------------------- 1 | курица 2 | говядина 3 | индейка 4 | мягкий сыр 5 | твердый сыр 6 | орехи 7 | клубника 8 | томаты черри 9 | колбаса 10 | спаржа 11 | клюква 12 | желатин 13 | тофу 14 | оливки 15 | тимьян 16 | тунец 17 | паприка 18 | кардамон 19 | красный перец 20 | креветка 21 | тапиока 22 | тыква 23 | финик 24 | слива 25 | стручковая фасоль 26 | фасоль 27 | рыба 28 | лазанья 29 | лобстер 30 | мука 31 | осьминог 32 | печенье 33 | молоко 34 | лимон 35 | шоколад 36 | крекер 37 | моллюск 38 | вишня 39 | соевое молоко 40 | ананас 41 | капуста 42 | спагетти 43 | краб 44 | бекон 45 | васаби 46 | лук 47 | пирожок 48 | разрыхлитель 49 | миндаль 50 | манго 51 | мидия 52 | сыр 53 | виноград 54 | телятина 55 | мед 56 | ваниль 57 | картофель 58 | каперс 59 | моллюск 60 | малина 61 | помидор 62 | корица 63 | тортилья 64 | ягненок 65 | варенье 66 | горчица 67 | шербет 68 | устрица 69 | хлеб 70 | банан 71 | спагетти 72 | лапша 73 | яйцо 74 | утка 75 | халапеньо 76 | майонез 77 | ячмень 78 | окра 79 | зеленый лук 80 | огурец 81 | репа 82 | фейхоа 83 | омлет 84 | дыня 85 | хурма 86 | вино 87 | йогурт 88 | грибы 89 | баранина 90 | булгур 91 | лосось 92 | сливочное масло 93 | черника 94 | сельдерей 95 | патока 96 | тесто 97 | сахар 98 | яблоко 99 | гусь 100 | перец чили 101 | чипсы 102 | зелень 103 | манная крупа 104 | мисо 105 | макароны 106 | кофе 107 | вода 108 | соленый огурец 109 | лук-порей 110 | какао 111 | персик 112 | овсяные хлопья 113 | маргарин 114 | овощи 115 | соус 116 | отруби 117 | тыква 118 | ягоды 119 | рис 120 | листовая капуста 121 | папайя 122 | горох 123 | ветчина 124 | сироп 125 | сок 126 | апельсин 127 | чернослив 128 | брокколи 129 | кунжут 130 | взбитые сливки 131 | сметана 132 | мороженое 133 | свекла 134 | вафли 135 | гренки 136 | масло 137 | кокосовое масло 138 | авокадо 139 | тако 140 | смородина 141 | водоросли 142 | морковь 143 | редис 144 | абрикос 145 | изюм 146 | паштет 147 | салями 148 | брюссельская капуста 149 | форель 150 | лайм 151 | хумус 152 | чечевица 153 | гребешок 154 | салат 155 | домашняя птица 156 | гранат 157 | соевый соус 158 | грецкий орех 159 | кешью 160 | яичный желток 161 | кукуруза 162 | кукурузная мука 163 | миндальное молоко 164 | арахисовое масло 165 | семена чиа 166 | болгарский перец 167 | смесь для кекса 168 | смесь для пуддинга 169 | кокос 170 | крабовое мясо 171 | сливочный сыр 172 | маффин 173 | желе 174 | лед 175 | творог 176 | пекан 177 | пищевая сода 178 | кефир 179 | цуккини 180 | баклажан 181 | конфеты 182 | булочка 183 | алкоголь 184 | пиво 185 | лимонад 186 | кетчуп 187 | имбирь -------------------------------------------------------------------------------- /pipeline/utils/metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 5 | 6 | 7 | def update_error_types(error_types, y_pred, y_true): 8 | error_types['tp_i'] += (y_pred * y_true).sum(0).cpu().data.numpy() 9 | error_types['fp_i'] += (y_pred * (1-y_true)).sum(0).cpu().data.numpy() 10 | error_types['fn_i'] += ((1-y_pred) * y_true).sum(0).cpu().data.numpy() 11 | error_types['tn_i'] += ((1-y_pred) * (1-y_true)).sum(0).cpu().data.numpy() 12 | 13 | error_types['tp_all'] += (y_pred * y_true).sum().item() 14 | error_types['fp_all'] += (y_pred * (1-y_true)).sum().item() 15 | error_types['fn_all'] += ((1-y_pred) * y_true).sum().item() 16 | 17 | def label2onehot(labels, pad_value=len(word2id)-1): 18 | # input labels to one hot vector 19 | inp_ = torch.unsqueeze(labels, 2) 20 | one_hot = torch.FloatTensor(labels.size(0), labels.size(1), pad_value + 1).zero_().to(device) 21 | one_hot.scatter_(2, inp_, 1) 22 | one_hot, _ = one_hot.max(dim=1) 23 | # remove pad and eos position 24 | one_hot = one_hot[:, 1:-1] 25 | one_hot[:, 0] = 0 26 | 27 | return one_hot 28 | 29 | def compute_metrics(ret_metrics, error_types, metric_names, eps=1e-10, weights=None): 30 | if 'accuracy' in metric_names: 31 | ret_metrics['accuracy'].append(np.mean((error_types['tp_i'] + error_types['tn_i']) / (error_types['tp_i'] + error_types['fp_i'] + error_types['fn_i'] + error_types['tn_i']))) 32 | if 'jaccard' in metric_names: 33 | ret_metrics['jaccard'].append(error_types['tp_all'] / (error_types['tp_all'] + error_types['fp_all'] + error_types['fn_all'] + eps)) 34 | if 'dice' in metric_names: 35 | ret_metrics['dice'].append(2*error_types['tp_all'] / (2*(error_types['tp_all'] + error_types['fp_all'] + error_types['fn_all']) + eps)) 36 | if 'f1' in metric_names: 37 | pre = error_types['tp_i'] / (error_types['tp_i'] + error_types['fp_i'] + eps) 38 | rec = error_types['tp_i'] / (error_types['tp_i'] + error_types['fn_i'] + eps) 39 | f1_perclass = 2*(pre * rec) / (pre + rec + eps) 40 | if 'f1_ingredients' not in ret_metrics.keys(): 41 | ret_metrics['f1_ingredients'] = [np.average(f1_perclass, weights=weights)] 42 | else: 43 | ret_metrics['f1_ingredients'].append(np.average(f1_perclass, weights=weights)) 44 | 45 | pre = error_types['tp_all'] / (error_types['tp_all'] + error_types['fp_all'] + eps) 46 | rec = error_types['tp_all'] / (error_types['tp_all'] + error_types['fn_all'] + eps) 47 | f1 = 2*(pre * rec) / (pre + rec + eps) 48 | ret_metrics['f1'].append(f1) -------------------------------------------------------------------------------- /ReadME.txt: -------------------------------------------------------------------------------- 1 | DISH ID 2020_final.pptx - презентация с описанием проекта Dish-ID (https://drive.google.com/file/d/1_Fg-ezBY7KnFsHnxjUJ8bHIi7i60mYGM/view?usp=sharing) 2 | 3 | Итого: 1) данные с картинками и ингредиентами к ним 4 | 2) данные с рецептами 5 | 3) бот, который крутится на сервере 6 | 4) pipeline от и до 7 | 5) обученная модель по выделению ингредиентов на фотографии 8 | 6) ноутбуки по обучению моделей 9 | 7) скрипты с матчингом 10 | 11 | data/ #папка с данными, которые удалось собрать - https://drive.google.com/drive/folders/1_3nfYMJH6fbME6c_bt68woR6Ia8hcRhQ?usp=sharing 12 | AllRecipes_images.zip #данные с сайта allrecipes.com 13 | #для обучения модели по выделению игредиентов по фотографии 14 | #данные содержат изображения и ингредиенты 15 | 16 | recipes_final_povar_ru.pkl #рецепты с сайта povar.ru 17 | #датасет содержит данные о названиях, ингредиентах и рецептах блюд 18 | 19 | eda_all_recipes.csv #рецепты с сайта eda.ru 20 | #датасет содержит данные о названиях, ингредиентах и рецептах блюд 21 | 22 | recipes_all.csv #объединенные данные с eda.ru и povar.ru 23 | #на этом датасете нужно тестировать gold 24 | 25 | eda_vectors.pkl #данные с eda.ru с заранее вычисленными эмбеддингами 26 | #word2vec, fasttext, elmo 27 | 28 | gold.xlsx #золотой стандарт для проверки матчинга 29 | #если в колонке priority - 10, рецепт нерелевантен 30 | 31 | ingredients.json #словарь перевода ингредиентов с английского на русский 32 | dishes.json #словарь перевода названий блюд с английского на русский 33 | 34 | models/ #обученные модели по выделению ингредиентов на фотографии 35 | model_encoder_classifier_best.pth #модель, которая используется в боте 36 | #лучшая из полученных моделей архитектуры encoder->classifier 37 | # Можно найти по ссылке https://drive.google.com/file/d/1CKTwhkTrEJzU69wI11PdmX047p4ULCND/view?usp=sharing 38 | 39 | bot_final_version/ #папка с ботом @dish_id_bot 40 | #в папке также продублированы веса model_encoder_classifier_best.pth (по ссылке https://drive.google.com/file/d/1CKTwhkTrEJzU69wI11PdmX047p4ULCND/view?usp=sharing), 41 | #чтобы было меньше трудностей с путями 42 | 43 | pipeline/ #папка с pipeline от картинки до предсказания рецептов 44 | chefnet_dense.ipynb #ноутбук по обучению модели на данных AllRecipes_images.zip 45 | word_models.ipynb #ноутбук с папйлайном по подбору рецептов 46 | #можно использовать веса обученной модели models/model_encoder_classifier_best.pth 47 | 48 | scripts/ #папка со скриптами 49 | matching_povar.py #алгоритм подбора рецептов по ингредиентам на данных recipes_final_povar_ru.pkl 50 | povar_scrapping.py #скрэппер сайта povar.ru 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /scripts/matching_povar.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pickle as pkl 4 | 5 | from sklearn.feature_extraction.text import TfidfVectorizer 6 | import pymorphy2 7 | 8 | orph = pymorphy2.MorphAnalyzer() 9 | 10 | FIT_COLUMN = "spl" 11 | PREDICT_COLUMN = "not_ingridients" 12 | N_BEST = 10 13 | 14 | 15 | # NOTE: Есть такая штука как docstrings (ко всем функциям добавить) 16 | # Не стоит писать странные "это нужно", объясни сразу что идет на вход (вместе с типом) функции, а что на выход 17 | def splinst(inst): 18 | """ 19 | это функция, которая принимает на вход текст инструкции и 20 | дает обратно текст с нормальными формами слов. 21 | Ну и чистит всякие предлоги, цифры и т.д. по возможности. 22 | 23 | :param array inst: описание что на вход 24 | :return: что возвращает в каком типе данных прописать 25 | """ 26 | I = " ".join(inst) 27 | I = I.split(" ") 28 | 29 | for i in range(len(I)): 30 | I[i] = I[i].lower() 31 | if len(I[i]) > 2: 32 | if ord(I[i][-1]) < 1072 or ord(I[i][-1]) > 1103: 33 | I[i] = I[i][:-1] 34 | if ord(I[i][0]) < 1072 or ord(I[i][0]) > 1103: 35 | I[i] = I[i][1:] 36 | 37 | p = orph.parse(I[i])[0] 38 | I[i] = p.normal_form 39 | if I[i] == None: 40 | I[i] = "" 41 | else: 42 | I[i] = "" 43 | return " ".join(I) 44 | 45 | 46 | def create_text_for_vectorizer(lst_base): 47 | lst = lst_base.copy() 48 | for i in range(len(lst_base)): 49 | lst.extend([lst_base[i]] * int(max(6 // (0.5 * i + 1), 1))) 50 | return " ".join(["".join(elem.split()) for elem in lst]) 51 | 52 | 53 | def prepare_dataset(df): 54 | df = df.copy() 55 | 56 | df.index = [i for i in range(len(df))] 57 | df["splinstr"] = df.instructions.apply(splinst) 58 | 59 | df["nor_ingridients"] = df.pure_ingridients.apply( 60 | lambda x: list(map(lambda y: orph.parse(y)[0].normal_form, x)) 61 | ) 62 | df["not_ingridients"] = df.nor_ingridients.apply( 63 | lambda x: create_text_for_vectorizer(x) 64 | ) 65 | 66 | df["splinctr2"] = df.splinstr.apply(lambda x: x.split(" ")) 67 | 68 | spl = [] # запихнуть все в листы как мудак -- могу умею практикую 69 | for i in range(len(df)): 70 | dd = df.splinstr[i].split(" ") 71 | # for s in df2.splinstr[i]: 72 | # dd = dd + ' ' + s 73 | # dd = dd.split(' ') 74 | nn = df.nor_ingridients[i] 75 | # for a in df2.nor_ingridients[i]: 76 | # nn = nn + a 77 | sss = [] 78 | # print (list(df2.nor_ingridients[i])) 79 | for j in range(len(dd)): 80 | # print (dd[j]) 81 | if dd[j] in list(nn): 82 | sss.append(dd[j]) 83 | spl.append(sss) 84 | # df2.splinctr2[i] = sss 85 | df["spl"] = spl # нормализованные данные (по упоминанию слов в тексте рецепта) 86 | 87 | 88 | return df 89 | 90 | 91 | # для каждого элемента тестовой выборки берем лучшие мэтчи из трэйна 92 | def get_best_matches(array, n): 93 | # print(array[0]) 94 | x = np.argsort(np.array(array)) 95 | # print(x) 96 | return x[-n:] 97 | 98 | 99 | def fit_predict(train, test): 100 | vectorizer = TfidfVectorizer(min_df=2) 101 | 102 | train_tfidf = vectorizer.fit_transform( 103 | train[FIT_COLUMN].apply( 104 | lambda x: " ".join(["".join(elem.split()) for elem in x]) 105 | ) 106 | ) 107 | test_tfidf = vectorizer.transform( 108 | test[PREDICT_COLUMN] 109 | ) # .apply(lambda x: " ".join(x))) 110 | 111 | # display(test_tfidf.todense()) 112 | # print(train_tfidf.shape, test_tfidf.shape) 113 | result = np.array(test_tfidf).dot(np.array(train_tfidf.T)) 114 | result = pd.DataFrame(np.array(result.todense())) 115 | best_matches = result.apply(lambda x: get_best_matches(x, N_BEST), axis=1) 116 | return best_matches 117 | 118 | 119 | if __name__ == "__main__": 120 | with open("recipes_final_povar_ru.pkl", "rb") as f: 121 | df = pkl.load(f) 122 | df_norm = prepare_dataset(df) 123 | 124 | train_base = df_norm.sample(frac=1).reset_index() 125 | 126 | train = train_base[100:].reset_index() 127 | test = train_base[:100] 128 | 129 | best_matches = fit_predict(train, test) 130 | best_matches = pd.DataFrame( 131 | best_matches.apply(lambda x: [train["name"][i] for i in x]) 132 | ) 133 | best_matches.index = test.name 134 | best_matches.columns = ["names"] 135 | 136 | final_best_matches = pd.DataFrame( 137 | best_matches.names.tolist(), index=best_matches.index 138 | ) 139 | final_best_matches.to_csv("best_matches.csv") 140 | 141 | 142 | -------------------------------------------------------------------------------- /pipeline/preprocess.py: -------------------------------------------------------------------------------- 1 | def clean_token(token, misc): 2 | """ 3 | :param token: 4 | :param misc: 5 | :return: 6 | """ 7 | out_token = token.strip().replace(' ', '') 8 | if token == 'Файл' and 'SpaceAfter=No' in misc: 9 | return None 10 | return out_token 11 | 12 | 13 | def clean_lemma(lemma, pos, lowercase=True): 14 | """ 15 | :param lemma: 16 | :param pos: 17 | :return: 18 | """ 19 | out_lemma = lemma.strip().replace(' ', '').replace('_', '') 20 | if lowercase: 21 | out_lemma = out_lemma.lower() 22 | if '|' in out_lemma or out_lemma.endswith('.jpg') or out_lemma.endswith('.png'): 23 | return None 24 | if pos != 'PUNCT': 25 | if out_lemma.startswith('«') or out_lemma.startswith('»'): 26 | out_lemma = ''.join(out_lemma[1:]) 27 | if out_lemma.endswith('«') or out_lemma.endswith('»'): 28 | out_lemma = ''.join(out_lemma[:-1]) 29 | if out_lemma.endswith('!') or out_lemma.endswith('?') or out_lemma.endswith(',') \ 30 | or out_lemma.endswith('.'): 31 | out_lemma = ''.join(out_lemma[:-1]) 32 | return out_lemma 33 | 34 | def num_replace(word): 35 | newtoken = 'x' * len(word) 36 | nw = newtoken + '_NUM' 37 | return nw 38 | 39 | def process(pipeline, text='Строка', keep_pos=True, keep_punct=False): 40 | entities = {'PROPN'} 41 | named = False 42 | memory = [] 43 | mem_case = None 44 | mem_number = None 45 | tagged_propn = [] 46 | 47 | # обрабатываем текст, получаем результат в формате conllu: 48 | processed = pipeline.process(text) 49 | 50 | # пропускаем строки со служебной информацией: 51 | content = [l for l in processed.split('\n') if not l.startswith('#')] 52 | 53 | # извлекаем из обработанного текста леммы, тэги и морфологические характеристики 54 | tagged = [w.split('\t') for w in content if w] 55 | 56 | for t in tagged: 57 | if len(t) != 10: 58 | continue 59 | (word_id, token, lemma, pos, xpos, feats, head, deprel, deps, misc) = t 60 | token = clean_token(token, misc) 61 | lemma = clean_lemma(lemma, pos) 62 | if not lemma or not token: 63 | continue 64 | if pos in entities: 65 | if '|' not in feats: 66 | tagged_propn.append('%s_%s' % (lemma, pos)) 67 | continue 68 | morph = {el.split('=')[0]: el.split('=')[1] for el in feats.split('|')} 69 | if 'Case' not in morph or 'Number' not in morph: 70 | tagged_propn.append('%s_%s' % (lemma, pos)) 71 | continue 72 | if not named: 73 | named = True 74 | mem_case = morph['Case'] 75 | mem_number = morph['Number'] 76 | if morph['Case'] == mem_case and morph['Number'] == mem_number: 77 | memory.append(lemma) 78 | if 'SpacesAfter=\\n' in misc or 'SpacesAfter=\s\\n' in misc: 79 | named = False 80 | past_lemma = '::'.join(memory) 81 | memory = [] 82 | tagged_propn.append(past_lemma + '_PROPN ') 83 | else: 84 | named = False 85 | past_lemma = '::'.join(memory) 86 | memory = [] 87 | tagged_propn.append(past_lemma + '_PROPN ') 88 | tagged_propn.append('%s_%s' % (lemma, pos)) 89 | else: 90 | if not named: 91 | if pos == 'NUM' and token.isdigit(): # Заменяем числа на xxxxx той же длины 92 | lemma = num_replace(token) 93 | tagged_propn.append('%s_%s' % (lemma, pos)) 94 | else: 95 | named = False 96 | past_lemma = '::'.join(memory) 97 | memory = [] 98 | tagged_propn.append(past_lemma + '_PROPN ') 99 | tagged_propn.append('%s_%s' % (lemma, pos)) 100 | 101 | if not keep_punct: 102 | tagged_propn = [word for word in tagged_propn if word.split('_')[1] != 'PUNCT'] 103 | if not keep_pos: 104 | tagged_propn = [word.split('_')[0] for word in tagged_propn] 105 | return tagged_propn 106 | 107 | from ufal.udpipe import Model, Pipeline 108 | import os 109 | import re 110 | 111 | def get_pipeline(modelfile): 112 | print('\nLoading the model...', file=sys.stderr) 113 | if not os.path.isfile(modelfile): 114 | udpipe_model_url = 'https://rusvectores.org/static/models/udpipe_syntagrus.model' 115 | udpipe_filename = udpipe_model_url.split('/')[-1] 116 | print('UDPipe model not found. Downloading...', file=sys.stderr) 117 | wget.download(udpipe_model_url) 118 | ufal_model = Model.load(modelfile) 119 | process_pipeline = Pipeline(ufal_model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu') 120 | return process_pipeline 121 | 122 | def tag_ud(text='Текст нужно передать функции в виде строки!', process_pipeline=process_pipeline, keep_pos=True): 123 | output = [] 124 | for line in tqdm(text): 125 | line_proc = process(process_pipeline, text=line, keep_pos=keep_pos) 126 | output.append(' '.join(line_proc)) 127 | return output -------------------------------------------------------------------------------- /scripts/povar_scrapping.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pickle as pkl 3 | from selenium.webdriver import Chrome 4 | from selenium import webdriver 5 | from selenium.webdriver.common.by import By 6 | from selenium.webdriver.support.ui import WebDriverWait 7 | from selenium.webdriver.support import expected_conditions 8 | # import wget 9 | 10 | ##################### 11 | ##### Поставить путь до драйвера вашего браузера 12 | driver = Chrome(executable_path="/path/to/chromedriver.exe") 13 | option = webdriver.ChromeOptions() 14 | option.add_argument(" — incognito") 15 | 16 | BASE = 'https://povar.ru/list/' 17 | 18 | ##################### 19 | ##### Поставить срез категорий (между 0 и 271), которые будут скрепиться 20 | ### У меня 1 категория = 3-4 минуты скрепинга (35-40 рецептов) 21 | ##### Поставить путь до директории сохранения данных 22 | MIN_CATEGS = 150 23 | MAX_CATEGS = 271 24 | DIR_NAME = '/path/to/save/data' 25 | # 271 - это количество категорий, которые нужно заскреппить 26 | # NUM_CATEGS = [i.text for i in categories].index('Соленое') + 1 27 | 28 | 29 | def main(): 30 | inds = [] 31 | titles = [] 32 | ingridient_lists = [] 33 | img_urls = [] 34 | step_lists = [] 35 | links = [] 36 | 37 | driver.get(BASE) 38 | # categories = driver.find_elements_by_css_selector(".ingredientItem div a") 39 | # print([i.text for i in categories]) 40 | for categ_num in range(MIN_CATEGS, MAX_CATEGS): 41 | # Чекаем, есть ли вообще ссылки на категории 42 | try: 43 | WebDriverWait(driver, 8).until( 44 | expected_conditions.presence_of_element_located( 45 | (By.CSS_SELECTOR, ".ingredientItemH2"))) 46 | except: 47 | print('no-category') 48 | continue 49 | 50 | # Переходим по найденной ссылке на категорию 51 | try: 52 | driver.find_elements_by_css_selector('.ingredientItem div a')[categ_num].click() 53 | except: 54 | continue 55 | 56 | # Пытаемся не упасть в ситуации, когда рецептов меньше 40 (не целая страница) 57 | try: 58 | ttl_recipes = WebDriverWait(driver, 4).until( 59 | expected_conditions.presence_of_element_located( 60 | (By.CSS_SELECTOR, ".total"))) 61 | num_recipes = int(ttl_recipes.text.split()[-1]) 62 | except: 63 | driver.get(BASE) 64 | print('less than 40 recipes') 65 | continue 66 | print(f"Рецептов в категории: {num_recipes}") 67 | 68 | categ_url = driver.current_url 69 | for page_num in range(2, min(num_recipes // 40 + 1, 11)): 70 | try: 71 | driver.get(categ_url + str(page_num)) 72 | except: 73 | print('page is not clicked') 74 | driver.get(categ_url) 75 | continue 76 | # На одной странице максимум 40 рецептов 77 | for recipe_num in range(40): 78 | 79 | # Ждем открытия страницы и переходим на страничку с рецептом 80 | try: 81 | WebDriverWait(driver, 4).until( 82 | expected_conditions.presence_of_element_located( 83 | (By.CSS_SELECTOR, ".listRecipieTitle"))) 84 | driver.find_elements_by_css_selector(".listRecipieTitle")[recipe_num].click() 85 | except: 86 | print('skipped_1') 87 | # Если вдруг оказались на неправильной странице, надо попробовать обратно зайти, и потом уже забить 88 | driver.get(categ_url + str(page_num)) 89 | continue 90 | 91 | # Качаем название и ингридиенты 92 | try: 93 | title = WebDriverWait(driver, 4).until( 94 | expected_conditions.presence_of_element_located( 95 | (By.CSS_SELECTOR, ".detailed"))).text.strip() 96 | except: 97 | title = None 98 | try: 99 | ingridients = [elem.text.strip() for elem 100 | in driver.find_elements_by_css_selector('.detailed_ingredients li')] 101 | except: 102 | ingridients = None 103 | try: 104 | steps = [elem.text.strip() for elem in 105 | driver.find_elements_by_css_selector(".detailed_step_description_big")] 106 | except: 107 | steps = None 108 | cur_url = driver.current_url 109 | 110 | 111 | # Качаем фотку 112 | try: 113 | img_src = WebDriverWait(driver, 5).until( 114 | expected_conditions.presence_of_element_located( 115 | (By.CSS_SELECTOR, ".bigImgBox img"))).get_attribute('src') 116 | except: 117 | img_src = None 118 | 119 | # Тут еще можно саму фотку скачать, но это вроде не особо нужно 120 | # print(img_src) 121 | # try: 122 | # wget.download(img_src, DIR_NAME + f'{categ_num}_{recipe_num}.png') 123 | # except: 124 | # driver.refresh() 125 | # print('refreshing') 126 | # img_src = WebDriverWait(driver, 2).until( 127 | # expected_conditions.presence_of_element_located( 128 | # (By.CSS_SELECTOR, ".bigImgBox img"))).get_attribute('src') 129 | # print('img_src') 130 | # try: 131 | # wget.download(img_src, DIR_NAME + f'{categ_num}_{recipe_num}.png') 132 | # except: 133 | # pass 134 | 135 | # Сохраняем в базу данные 136 | inds.append(f'{categ_num}_{recipe_num + 40 * page_num}') 137 | titles.append(title) 138 | img_urls.append(img_src) 139 | ingridient_lists.append(tuple(ingridients)) 140 | step_lists.append(tuple(steps)) 141 | links.append(cur_url) 142 | 143 | driver.get(categ_url + str(page_num)) 144 | 145 | # Делаем бэкап, вдруг что пойдет не так 146 | df = pd.DataFrame([inds, titles, ingridient_lists, img_urls, step_lists, links]).transpose().set_index(0) 147 | if categ_num == 99: 148 | with open(DIR_NAME + 'backup_100.pkl', 'wb') as f: 149 | pkl.dump(df, f, protocol=4) 150 | else: 151 | with open(DIR_NAME + 'backup.pkl', 'wb') as f: 152 | pkl.dump(df, f, protocol=4) 153 | driver.get(BASE) 154 | 155 | # Закрываем драйвер и фиксируем резы в .pkl 156 | driver.close() 157 | df = pd.DataFrame([inds, titles, ingridient_lists, img_urls, step_lists, links]).transpose().set_index(0) 158 | with open(DIR_NAME + 'backup.pkl', 'wb') as f: 159 | pkl.dump(df, f, protocol=4) 160 | 161 | 162 | if __name__ == '__main__': 163 | main() 164 | # 01:19 - начало скрепинга 165 | with open(DIR_NAME + 'backup.pkl', 'rb') as f: 166 | df = pkl.load(f) 167 | print(df.shape) -------------------------------------------------------------------------------- /bot_final_vesion/dish_id_bot.py: -------------------------------------------------------------------------------- 1 | # To-do 2 | # # Вывести в отдельный работающий пайплайн 3 | # # Прикрутить распознование изображения 4 | # # Прикрутить ранжирование 5 | # # Прикрутить логирование 6 | 7 | 8 | import aiohttp 9 | import io 10 | import logging 11 | import os 12 | import time 13 | import numpy as np 14 | import pandas as pd 15 | import pickle as pkl 16 | import typing as tp 17 | 18 | from PIL import Image 19 | from aiogram import Bot, types 20 | from aiogram.utils import executor 21 | from aiogram.dispatcher import Dispatcher 22 | from aiogram.utils.helper import Helper, HelperMode, ListItem 23 | from aiogram.contrib.fsm_storage.memory import MemoryStorage 24 | from aiogram.types import ReplyKeyboardRemove, \ 25 | ReplyKeyboardMarkup, KeyboardButton, \ 26 | InlineKeyboardMarkup, InlineKeyboardButton 27 | from aiofiles import os as aio_os 28 | from torchvision import transforms 29 | from model import init_model, predict_image 30 | 31 | 32 | N_BEST = 5 33 | best_recipes = dict() 34 | TYPE = 'image' 35 | 36 | 37 | # bot = Bot(token=os.environ.get('TOKEN', None), 38 | TOKEN = 'TOKEN' 39 | bot = Bot(token=TOKEN) 40 | dp = Dispatcher(bot, storage=MemoryStorage()) 41 | 42 | 43 | init_model() 44 | logging.info("Model was init") 45 | logging.basicConfig(filename='log.txt', 46 | filemode='a', 47 | format='%(asctime)s, %(msecs) d %(name)s %(levelname) s %(message) s', 48 | datefmt='%H:%M:%S', 49 | level=logging.INFO) 50 | 51 | inline_keyboard_markup = types.InlineKeyboardMarkup() 52 | inline_keyboard_markup.add(types.InlineKeyboardButton('Применить стиль Сезанна', callback_data='sezanne')) 53 | 54 | 55 | class TestStates(Helper): 56 | mode = HelperMode.snake_case 57 | 58 | TEST_STATE_0 = ListItem() 59 | TEST_STATE_1 = ListItem() 60 | TEST_STATE_2 = ListItem() 61 | 62 | 63 | button0 = KeyboardButton('/help') 64 | buttons = [KeyboardButton('1')] 65 | buttons.append(KeyboardButton('2')) 66 | buttons.append(KeyboardButton('3')) 67 | buttons.append(KeyboardButton('4')) 68 | buttons.append(KeyboardButton('5')) 69 | markup0 = ReplyKeyboardMarkup(resize_keyboard=True, 70 | one_time_keyboard=True).add( 71 | button0) 72 | markup1 = ReplyKeyboardMarkup(resize_keyboard=True, 73 | one_time_keyboard=True).row( 74 | *buttons[:2]) 75 | markup5 = ReplyKeyboardMarkup(resize_keyboard=True, 76 | one_time_keyboard=True).row( 77 | *buttons) 78 | 79 | 80 | # Обработка 3 команд - старт, хелп и тим 81 | @dp.message_handler(commands=['start']) 82 | async def send_welcome(message: types.Message) -> None: 83 | state = dp.current_state(user=message.from_user.id) 84 | await state.set_state(TestStates.all()[0]) 85 | await message.answer("Привет, {}!\n".format(message.from_user.first_name) + 86 | "Я бот, который поможет тебе найти рецепт для блюда из фотографии") 87 | 88 | 89 | @dp.message_handler(commands=['help']) 90 | async def send_help(message: types.Message) -> None: 91 | logging.info("User asked for help") 92 | state = dp.current_state(user=message.from_user.id) 93 | await state.set_state(TestStates.all()[0]) 94 | await message.answer("Нужна помощь? Решение очень простое!\n" + 95 | "Просто отправь фотку блюда и алгоритм подберет оптимальный рецепт") 96 | 97 | 98 | @dp.message_handler(state=TestStates.all(), commands=['start']) 99 | async def send_welcome_state(message: types.Message) -> None: 100 | await send_welcome(message) 101 | 102 | 103 | @dp.message_handler(state=TestStates.all(), commands=['help']) 104 | async def send_help_state(message: types.Message) -> None: 105 | await send_help(message) 106 | 107 | 108 | @dp.message_handler(state=TestStates.all(), content_types=['photo']) 109 | async def handle_photo(message): 110 | if not os.path.isdir(f'/home/dishid_bot/photo/{message.from_user.id}'): 111 | await aio_os.mkdir(f'/home/dishid_bot/photo/{message.from_user.id}') 112 | await message.photo[-1].download( 113 | f'/home/dishid_bot/photo/{message.from_user.id}/{TYPE}.jpg') 114 | if TYPE == 'image': 115 | await bot.send_message(message.chat.id, "Отлично, картинка загружена! Сейчас скажу, какие ингредиенты я тут вижу") 116 | await apply_model(message) 117 | 118 | 119 | @dp.message_handler(state=TestStates.TEST_STATE_1[0], commands=['run']) 120 | async def apply_model(message: types.Message): 121 | logging.info("Get image from user {}".format(message.chat.id)) 122 | transform = transforms.Compose([ 123 | transforms.Resize((250, 250)), 124 | transforms.CenterCrop((224, 224)), 125 | transforms.ToTensor(), 126 | transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)) 127 | ]) 128 | path_image = f'/home/dishid_bot/photo/{message.chat.id}/{TYPE}.jpg' 129 | result = predict_image(path_image, transform) 130 | res_list = result.replace('\t', '').split('\n') 131 | 132 | if len(res_list) == 0: 133 | await message.answer("Я на фото ничего не нашел...\nПопробуй загрузить фото лучшего качества") 134 | 135 | logging.info("Found ingredients: {}".format(", ".join(res_list))) 136 | await message.answer("Я выделил ингредиенты на фото.\nВот что я тут вижу: {}\n\nСейчас по этому списку подберу рецептики".format(", ".join(res_list))) 137 | 138 | await match_recipes(" ".join(res_list), message) 139 | 140 | 141 | @dp.message_handler(state=TestStates.TEST_STATE_1[0]) 142 | async def team_1(message: types.Message, redirect: bool = False) -> None: 143 | await message.answer("Отправь фото блюда, чтобы получить релевантный рецепт\n" + 144 | "Если хочешь посмотреть другие рецепты по предыдущей фотографии, просто отправь её ещё раз") 145 | 146 | 147 | async def match_recipes(ingridients, message): 148 | t0 = time.time() 149 | logging.info("Matching with ingridients list: {}".format(ingridients)) 150 | with open('/home/dishid_bot/vectorizer.pkl', 'rb') as f: 151 | vectorizer = pkl.load(f) 152 | with open('/home/dishid_bot/botrain_tfidf.pkl', 'rb') as f: 153 | train_tfidf = pkl.load(f) 154 | with open('/home/dishid_bot/eda_povar.pkl', 'rb') as f: 155 | train = pkl.load(f) 156 | 157 | test_tfidf = vectorizer.transform([ingridients]) 158 | result = np.array(test_tfidf).dot(np.array(train_tfidf.T)) 159 | 160 | loc_best_matches = np.argsort(np.array(result.todense()))[0][-N_BEST:] 161 | loc_best_matches = pd.DataFrame(loc_best_matches) 162 | loc_best_matches.columns = ['idx'] 163 | 164 | 165 | short_list = loc_best_matches['idx'].apply( 166 | lambda x: train[['name', 'ingridients', 'instructions', 'img_url', 'recipe_link']].iloc[x]) 167 | 168 | logging.info('Recipes matched: {}'.format(list(short_list.name))) 169 | 170 | best_recipes[message.from_user.id] = short_list 171 | 172 | await message.answer("Выберите номер наиболее понравившегося рецепта") 173 | await message.answer( 174 | "\n".join([f"{i}. {j}\n" for i, j in zip(range(1, 6), best_recipes[message.from_user.id].name)]), 175 | reply_markup=markup5) 176 | logging.info("Time spent from receiving the photo till proposing 5 recipes: {}".format(time.time() - t0)) 177 | state = dp.current_state(user=message.from_user.id) 178 | await state.set_state(TestStates.all()[2]) 179 | 180 | 181 | # team 2 182 | @dp.message_handler(content_types=['text'], state=TestStates.TEST_STATE_2[0]) 183 | async def team_2_txt(message: types.Message) -> None: 184 | 185 | dct_keys = {f'{i}': i for i in range(1, 6)} 186 | dct_keys.update({'1️': 1, '2️': 2, '3️': 3, '4️⃣': 4, '5️⃣': 5}) 187 | 188 | flag = dct_keys.get(message.text.strip()) ### Добавил .strip() 189 | if (best_recipes.get(message.from_user.id) is None) or (flag is None): 190 | await message.answer( 191 | "Немного не понял. Выбери число еще раз (кнопки ниже)") # Поменял текст, закомментил 2 строки ниже 192 | return 193 | recipe = best_recipes[message.from_user.id].iloc[flag - 1] 194 | logging.info("User chose {}".format(recipe['name'])) 195 | await message.answer("Название: {}\n\nСсылка: {}\n\nИнгредиенты:\n{}\n\nРецепт: {}".format( 196 | recipe['name'], recipe['recipe_link'], "\n".join(recipe['ingridients'][2:-2].split("', '")), 197 | "\n\n".join(recipe['instructions'][2:-2].split("', '"))), 198 | reply_markup=ReplyKeyboardRemove() 199 | ) 200 | state = dp.current_state(user=message.from_user.id) 201 | await state.set_state(TestStates.all()[1]) 202 | 203 | @dp.message_handler() 204 | async def state0(message: types.Message) -> None: 205 | await first_message(message) 206 | 207 | 208 | @dp.message_handler(state=TestStates.TEST_STATE_0[0]) 209 | async def first_message(message: types.Message) -> None: 210 | state = dp.current_state(user=message.from_user.id) 211 | await state.set_state(TestStates.all()[1]) 212 | 213 | await message.answer("Ты работаешь с проектом Dish-ID.\n" + 214 | "Просто отправь мне фото блюда, и я подберу лучший рецепт", 215 | reply_markup=ReplyKeyboardRemove()) 216 | 217 | 218 | if __name__ == '__main__': 219 | executor.start_polling(dp) 220 | -------------------------------------------------------------------------------- /bot_final_vesion/en2ru_ing.json: -------------------------------------------------------------------------------- 1 | {"chicken": "\u043a\u0443\u0440\u0438\u0446\u0430", "beef": "\u0433\u043e\u0432\u044f\u0434\u0438\u043d\u0430", "turkey": "\u0438\u043d\u0434\u0435\u0439\u043a\u0430", "soft cheese": "\u043c\u044f\u0433\u043a\u0438\u0439 \u0441\u044b\u0440", "hard cheese": "\u0442\u0432\u0435\u0440\u0434\u044b\u0439 \u0441\u044b\u0440", "nut": "\u043e\u0440\u0435\u0445\u0438", "strawberry": "\u043a\u043b\u0443\u0431\u043d\u0438\u043a\u0430", "cherry tomato": "\u0442\u043e\u043c\u0430\u0442\u044b \u0447\u0435\u0440\u0440\u0438", "sausage": "\u043a\u043e\u043b\u0431\u0430\u0441\u0430", "asparagus": "\u0441\u043f\u0430\u0440\u0436\u0430", "cranberry": "\u043a\u043b\u044e\u043a\u0432\u0430", "gelatin": "\u0436\u0435\u043b\u0430\u0442\u0438\u043d", "tofu": "\u0442\u043e\u0444\u0443", "olive": "\u043e\u043b\u0438\u0432\u043a\u0438", "thyme": "\u0442\u0438\u043c\u044c\u044f\u043d", "tuna": "\u0442\u0443\u043d\u0435\u0446", "paprika": "\u043f\u0430\u043f\u0440\u0438\u043a\u0430", "cardamom": "\u043a\u0430\u0440\u0434\u0430\u043c\u043e\u043d", "red pepper": "\u043a\u0440\u0430\u0441\u043d\u044b\u0439 \u043f\u0435\u0440\u0435\u0446", "shrimp": "\u043a\u0440\u0435\u0432\u0435\u0442\u043a\u0430", "tapioca": "\u0442\u0430\u043f\u0438\u043e\u043a\u0430", "pumpkin": "\u0442\u044b\u043a\u0432\u0430", "date": "\u0444\u0438\u043d\u0438\u043a", "plum": "\u0441\u043b\u0438\u0432\u0430", "green bean": "\u0441\u0442\u0440\u0443\u0447\u043a\u043e\u0432\u0430\u044f \u0444\u0430\u0441\u043e\u043b\u044c", "bean": "\u0444\u0430\u0441\u043e\u043b\u044c", "fish": "\u0440\u044b\u0431\u0430", "lasagna": "\u043b\u0430\u0437\u0430\u043d\u044c\u044f", "lobster": "\u043b\u043e\u0431\u0441\u0442\u0435\u0440", "flour": "\u043c\u0443\u043a\u0430", "octopus": "\u043e\u0441\u044c\u043c\u0438\u043d\u043e\u0433", "cookie": "\u043f\u0435\u0447\u0435\u043d\u044c\u0435", "milk": "\u043c\u043e\u043b\u043e\u043a\u043e", "lemon": "\u043b\u0438\u043c\u043e\u043d", "chocolate": "\u0448\u043e\u043a\u043e\u043b\u0430\u0434", "cracker": "\u043a\u0440\u0435\u043a\u0435\u0440", "clam": "\u043c\u043e\u043b\u043b\u044e\u0441\u043a", "cherry": "\u0432\u0438\u0448\u043d\u044f", "soy milk": "\u0441\u043e\u0435\u0432\u043e\u0435 \u043c\u043e\u043b\u043e\u043a\u043e", "pineapple": "\u0430\u043d\u0430\u043d\u0430\u0441", "cabbage": "\u043a\u0430\u043f\u0443\u0441\u0442\u0430", "pasta": "\u0441\u043f\u0430\u0433\u0435\u0442\u0442\u0438", "crab": "\u043a\u0440\u0430\u0431", "bacon": "\u0431\u0435\u043a\u043e\u043d", "wasabi": "\u0432\u0430\u0441\u0430\u0431\u0438", "onion": "\u043b\u0443\u043a", "patty": "\u043f\u0438\u0440\u043e\u0436\u043e\u043a", "baking powder": "\u0440\u0430\u0437\u0440\u044b\u0445\u043b\u0438\u0442\u0435\u043b\u044c", "almond": "\u043c\u0438\u043d\u0434\u0430\u043b\u044c", "mango": "\u043c\u0430\u043d\u0433\u043e", "mussel": "\u043c\u0438\u0434\u0438\u044f", "cheese": "\u0441\u044b\u0440", "grape": "\u0432\u0438\u043d\u043e\u0433\u0440\u0430\u0434", "veal": "\u0442\u0435\u043b\u044f\u0442\u0438\u043d\u0430", "honey": "\u043c\u0435\u0434", "vanilla": "\u0432\u0430\u043d\u0438\u043b\u044c", "potato": "\u043a\u0430\u0440\u0442\u043e\u0444\u0435\u043b\u044c", "capers": "\u043a\u0430\u043f\u0435\u0440\u0441", "mollusk": "\u043c\u043e\u043b\u043b\u044e\u0441\u043a", "raspberry": "\u043c\u0430\u043b\u0438\u043d\u0430", "tomato": "\u043f\u043e\u043c\u0438\u0434\u043e\u0440", "cinnamon": "\u043a\u043e\u0440\u0438\u0446\u0430", "tortilla": "\u0442\u043e\u0440\u0442\u0438\u043b\u044c\u044f", "lamb": "\u044f\u0433\u043d\u0435\u043d\u043e\u043a", "jam": "\u0432\u0430\u0440\u0435\u043d\u044c\u0435", "mustard": "\u0433\u043e\u0440\u0447\u0438\u0446\u0430", "sherbet": "\u0448\u0435\u0440\u0431\u0435\u0442", "oyster": "\u0443\u0441\u0442\u0440\u0438\u0446\u0430", "bread": "\u0445\u043b\u0435\u0431", "banana": "\u0431\u0430\u043d\u0430\u043d", "spaghetti": "\u0441\u043f\u0430\u0433\u0435\u0442\u0442\u0438", "noodle": "\u043b\u0430\u043f\u0448\u0430", "egg": "\u044f\u0439\u0446\u043e", "duck": "\u0443\u0442\u043a\u0430", "jalapeno": "\u0445\u0430\u043b\u0430\u043f\u0435\u043d\u044c\u043e", "mayonnaise": "\u043c\u0430\u0439\u043e\u043d\u0435\u0437", "barley": "\u044f\u0447\u043c\u0435\u043d\u044c", "okra": "\u043e\u043a\u0440\u0430", "chives": "\u0437\u0435\u043b\u0435\u043d\u044b\u0439 \u043b\u0443\u043a", "cucumber": "\u043e\u0433\u0443\u0440\u0435\u0446", "turnip": "\u0440\u0435\u043f\u0430", "feijoa": "\u0444\u0435\u0439\u0445\u043e\u0430", "scrambled egg": "\u043e\u043c\u043b\u0435\u0442", "melon": "\u0434\u044b\u043d\u044f", "persimmon": "\u0445\u0443\u0440\u043c\u0430", "wine": "\u0432\u0438\u043d\u043e", "yogurt": "\u0439\u043e\u0433\u0443\u0440\u0442", "mushroom": "\u0433\u0440\u0438\u0431\u044b", "mutton": "\u0431\u0430\u0440\u0430\u043d\u0438\u043d\u0430", "bulgur": "\u0431\u0443\u043b\u0433\u0443\u0440", "salmon": "\u043b\u043e\u0441\u043e\u0441\u044c", "butter": "\u0441\u043b\u0438\u0432\u043e\u0447\u043d\u043e\u0435 \u043c\u0430\u0441\u043b\u043e", "blueberry": "\u0447\u0435\u0440\u043d\u0438\u043a\u0430", "celery": "\u0441\u0435\u043b\u044c\u0434\u0435\u0440\u0435\u0439", "molasses": "\u043f\u0430\u0442\u043e\u043a\u0430", "dough": "\u0442\u0435\u0441\u0442\u043e", "sugar": "\u0441\u0430\u0445\u0430\u0440", "apple": "\u044f\u0431\u043b\u043e\u043a\u043e", "goose": "\u0433\u0443\u0441\u044c", "chili pepper": "\u043f\u0435\u0440\u0435\u0446 \u0447\u0438\u043b\u0438", "chips": "\u0447\u0438\u043f\u0441\u044b", "collard": "\u0437\u0435\u043b\u0435\u043d\u044c", "semolina": "\u043c\u0430\u043d\u043d\u0430\u044f \u043a\u0440\u0443\u043f\u0430", "miso": "\u043c\u0438\u0441\u043e", "macaroni": "\u043c\u0430\u043a\u0430\u0440\u043e\u043d\u044b", "coffee": "\u043a\u043e\u0444\u0435", "water": "\u0432\u043e\u0434\u0430", "pickle": "\u0441\u043e\u043b\u0435\u043d\u044b\u0439 \u043e\u0433\u0443\u0440\u0435\u0446", "leeks": "\u043b\u0443\u043a-\u043f\u043e\u0440\u0435\u0439", "cocoa": "\u043a\u0430\u043a\u0430\u043e", "peach": "\u043f\u0435\u0440\u0441\u0438\u043a", "cereals": "\u043e\u0432\u0441\u044f\u043d\u044b\u0435 \u0445\u043b\u043e\u043f\u044c\u044f", "margarine": "\u043c\u0430\u0440\u0433\u0430\u0440\u0438\u043d", "vegetables": "\u043e\u0432\u043e\u0449\u0438", "sauce": "\u0441\u043e\u0443\u0441", "bran": "\u043e\u0442\u0440\u0443\u0431\u0438", "squash": "\u0442\u044b\u043a\u0432\u0430", "berry": "\u044f\u0433\u043e\u0434\u044b", "rice": "\u0440\u0438\u0441", "kale": "\u043b\u0438\u0441\u0442\u043e\u0432\u0430\u044f \u043a\u0430\u043f\u0443\u0441\u0442\u0430", "papaya": "\u043f\u0430\u043f\u0430\u0439\u044f", "pea": "\u0433\u043e\u0440\u043e\u0445", "ham": "\u0432\u0435\u0442\u0447\u0438\u043d\u0430", "syrup": "\u0441\u0438\u0440\u043e\u043f", "juice": "\u0441\u043e\u043a", "orange": "\u0430\u043f\u0435\u043b\u044c\u0441\u0438\u043d", "prune": "\u0447\u0435\u0440\u043d\u043e\u0441\u043b\u0438\u0432", "broccoli": "\u0431\u0440\u043e\u043a\u043a\u043e\u043b\u0438", "sesame": "\u043a\u0443\u043d\u0436\u0443\u0442", "whipped cream": "\u0432\u0437\u0431\u0438\u0442\u044b\u0435 \u0441\u043b\u0438\u0432\u043a\u0438", "sour cream": "\u0441\u043c\u0435\u0442\u0430\u043d\u0430", "ice cream": "\u043c\u043e\u0440\u043e\u0436\u0435\u043d\u043e\u0435", "beet": "\u0441\u0432\u0435\u043a\u043b\u0430", "waffle": "\u0432\u0430\u0444\u043b\u0438", "crouton": "\u0433\u0440\u0435\u043d\u043a\u0438", "oil": "\u043c\u0430\u0441\u043b\u043e", "coconut oil": "\u043a\u043e\u043a\u043e\u0441\u043e\u0432\u043e\u0435 \u043c\u0430\u0441\u043b\u043e", "avocado": "\u0430\u0432\u043e\u043a\u0430\u0434\u043e", "taco": "\u0442\u0430\u043a\u043e", "currant": "\u0441\u043c\u043e\u0440\u043e\u0434\u0438\u043d\u0430", "seaweed": "\u0432\u043e\u0434\u043e\u0440\u043e\u0441\u043b\u0438", "carrot": "\u043c\u043e\u0440\u043a\u043e\u0432\u044c", "radish": "\u0440\u0435\u0434\u0438\u0441", "apricot": "\u0430\u0431\u0440\u0438\u043a\u043e\u0441", "raisin": "\u0438\u0437\u044e\u043c", "pate": "\u043f\u0430\u0448\u0442\u0435\u0442", "salami": "\u0441\u0430\u043b\u044f\u043c\u0438", "brussels sprout": "\u0431\u0440\u044e\u0441\u0441\u0435\u043b\u044c\u0441\u043a\u0430\u044f \u043a\u0430\u043f\u0443\u0441\u0442\u0430", "trout": "\u0444\u043e\u0440\u0435\u043b\u044c", "lime": "\u043b\u0430\u0439\u043c", "hummus": "\u0445\u0443\u043c\u0443\u0441", "lentil": "\u0447\u0435\u0447\u0435\u0432\u0438\u0446\u0430", "scallop": "\u0433\u0440\u0435\u0431\u0435\u0448\u043e\u043a", "lettuce": "\u0441\u0430\u043b\u0430\u0442", "poultry": "\u0434\u043e\u043c\u0430\u0448\u043d\u044f\u044f \u043f\u0442\u0438\u0446\u0430", "pomegranate": "\u0433\u0440\u0430\u043d\u0430\u0442", "soy sauce": "\u0441\u043e\u0435\u0432\u044b\u0439 \u0441\u043e\u0443\u0441", "walnut": "\u0433\u0440\u0435\u0446\u043a\u0438\u0439 \u043e\u0440\u0435\u0445", "cashew": "\u043a\u0435\u0448\u044c\u044e", "egg yolk": "\u044f\u0438\u0447\u043d\u044b\u0439 \u0436\u0435\u043b\u0442\u043e\u043a", "corn": "\u043a\u0443\u043a\u0443\u0440\u0443\u0437\u0430", "cornmeal": "\u043a\u0443\u043a\u0443\u0440\u0443\u0437\u043d\u0430\u044f \u043c\u0443\u043a\u0430", "almond milk": "\u043c\u0438\u043d\u0434\u0430\u043b\u044c\u043d\u043e\u0435 \u043c\u043e\u043b\u043e\u043a\u043e", "peanut butter": "\u0430\u0440\u0430\u0445\u0438\u0441\u043e\u0432\u043e\u0435 \u043c\u0430\u0441\u043b\u043e", "chia seeds": "\u0441\u0435\u043c\u0435\u043d\u0430 \u0447\u0438\u0430", "bell pepper": "\u0431\u043e\u043b\u0433\u0430\u0440\u0441\u043a\u0438\u0439 \u043f\u0435\u0440\u0435\u0446", "cake mix": "\u0441\u043c\u0435\u0441\u044c \u0434\u043b\u044f \u043a\u0435\u043a\u0441\u0430", "pudding mix": "\u0441\u043c\u0435\u0441\u044c \u0434\u043b\u044f \u043f\u0443\u0434\u0434\u0438\u043d\u0433\u0430", "coconut": "\u043a\u043e\u043a\u043e\u0441", "crab meat": "\u043a\u0440\u0430\u0431\u043e\u0432\u043e\u0435 \u043c\u044f\u0441\u043e", "cream cheese": "\u0441\u043b\u0438\u0432\u043e\u0447\u043d\u044b\u0439 \u0441\u044b\u0440", "muffin": "\u043c\u0430\u0444\u0444\u0438\u043d", "jell": "\u0436\u0435\u043b\u0435", "ice": "\u043b\u0435\u0434", "cottage cheese": "\u0442\u0432\u043e\u0440\u043e\u0433", "pecan": "\u043f\u0435\u043a\u0430\u043d", "baking soda": "\u043f\u0438\u0449\u0435\u0432\u0430\u044f \u0441\u043e\u0434\u0430", "buttermilk": "\u043a\u0435\u0444\u0438\u0440", "zucchini": "\u0446\u0443\u043a\u043a\u0438\u043d\u0438", "eggplant": "\u0431\u0430\u043a\u043b\u0430\u0436\u0430\u043d", "candy": "\u043a\u043e\u043d\u0444\u0435\u0442\u044b", "bun": "\u0431\u0443\u043b\u043e\u0447\u043a\u0430", "alcohol": "\u0430\u043b\u043a\u043e\u0433\u043e\u043b\u044c", "beer": "\u043f\u0438\u0432\u043e", "lemonade": "\u043b\u0438\u043c\u043e\u043d\u0430\u0434", "ketchup": "\u043a\u0435\u0442\u0447\u0443\u043f", "ginger": "\u0438\u043c\u0431\u0438\u0440\u044c"} --------------------------------------------------------------------------------