├── README.md ├── indexData.ipynb ├── indexMapping.py ├── myntra_products_catalog.csv └── searchApp.py /README.md: -------------------------------------------------------------------------------- 1 | # semantic-search-elastic-search-and-BERT-vector-embedding -------------------------------------------------------------------------------- /indexData.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from elasticsearch import Elasticsearch" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/plain": [ 20 | "True" 21 | ] 22 | }, 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "output_type": "execute_result" 26 | } 27 | ], 28 | "source": [ 29 | "es = Elasticsearch(\n", 30 | " \"https://localhost:9200\",\n", 31 | " basic_auth=(\"elastic\",\"LQym+efHnUy9DbT-jtD2\"),\n", 32 | " ca_certs=\"/Users/abidsaudagar/Personal/yt1_semantic_search/elasticsearch-8.9.1/config/certs/http_ca.crt\"\n", 33 | ")\n", 34 | "es.ping()" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "## Prepare the data" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 3, 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "data": { 51 | "text/html": [ 52 | "
\n", 53 | "\n", 66 | "\n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | "
ProductIDProductNameProductBrandGenderPrice (INR)NumImagesDescriptionPrimaryColor
010017413DKNY Unisex Black & Grey Printed Medium Trolle...DKNYUnisex117457Black and grey printed medium trolley bag, sec...Black
110016283EthnoVogue Women Beige & Grey Made to Measure ...EthnoVogueWomen58107Beige & Grey made to measure kurta with churid...Beige
210009781SPYKAR Women Pink Alexa Super Skinny Fit High-...SPYKARWomen8997Pink coloured wash 5-pocket high-rise cropped ...Pink
310015921Raymond Men Blue Self-Design Single-Breasted B...RaymondMen55995Blue self-design bandhgala suitBlue self-desig...Blue
410017833Parx Men Brown & Off-White Slim Fit Printed Ca...ParxMen7595Brown and off-white printed casual shirt, has ...White
\n", 138 | "
" 139 | ], 140 | "text/plain": [ 141 | " ProductID ProductName ProductBrand \\\n", 142 | "0 10017413 DKNY Unisex Black & Grey Printed Medium Trolle... DKNY \n", 143 | "1 10016283 EthnoVogue Women Beige & Grey Made to Measure ... EthnoVogue \n", 144 | "2 10009781 SPYKAR Women Pink Alexa Super Skinny Fit High-... SPYKAR \n", 145 | "3 10015921 Raymond Men Blue Self-Design Single-Breasted B... Raymond \n", 146 | "4 10017833 Parx Men Brown & Off-White Slim Fit Printed Ca... Parx \n", 147 | "\n", 148 | " Gender Price (INR) NumImages \\\n", 149 | "0 Unisex 11745 7 \n", 150 | "1 Women 5810 7 \n", 151 | "2 Women 899 7 \n", 152 | "3 Men 5599 5 \n", 153 | "4 Men 759 5 \n", 154 | "\n", 155 | " Description PrimaryColor \n", 156 | "0 Black and grey printed medium trolley bag, sec... Black \n", 157 | "1 Beige & Grey made to measure kurta with churid... Beige \n", 158 | "2 Pink coloured wash 5-pocket high-rise cropped ... Pink \n", 159 | "3 Blue self-design bandhgala suitBlue self-desig... Blue \n", 160 | "4 Brown and off-white printed casual shirt, has ... 
White " 161 | ] 162 | }, 163 | "execution_count": 3, 164 | "metadata": {}, 165 | "output_type": "execute_result" 166 | } 167 | ], 168 | "source": [ 169 | "import pandas as pd\n", 170 | "\n", 171 | "df = pd.read_csv(\"myntra_products_catalog.csv\").loc[:499]\n", 172 | "df.head()" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 6, 178 | "metadata": {}, 179 | "outputs": [ 180 | { 181 | "data": { 182 | "text/plain": [ 183 | "ProductID ProductName ProductBrand Gender Price (INR) NumImages Description PrimaryColor\n", 184 | "False False False False False False False False 500\n", 185 | "Name: count, dtype: int64" 186 | ] 187 | }, 188 | "execution_count": 6, 189 | "metadata": {}, 190 | "output_type": "execute_result" 191 | } 192 | ], 193 | "source": [ 194 | "df.isna().value_counts()" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 5, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "df.fillna(\"None\", inplace=True)" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "## Convert the relevant field to Vector using BERT model" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 7, 216 | "metadata": {}, 217 | "outputs": [ 218 | { 219 | "name": "stderr", 220 | "output_type": "stream", 221 | "text": [ 222 | "/Users/abidsaudagar/Personal/yt1_semantic_search/semantic_search/venv/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", 223 | " from .autonotebook import tqdm as notebook_tqdm\n" 224 | ] 225 | } 226 | ], 227 | "source": [ 228 | "from sentence_transformers import SentenceTransformer\n", 229 | "model = SentenceTransformer('all-mpnet-base-v2')" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 8, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "df[\"DescriptionVector\"] = df[\"Description\"].apply(lambda x: model.encode(x))" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 9, 244 | "metadata": {}, 245 | "outputs": [ 246 | { 247 | "data": { 248 | "text/html": [ 249 | "
\n", 250 | "\n", 263 | "\n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | "
ProductIDProductNameProductBrandGenderPrice (INR)NumImagesDescriptionPrimaryColorDescriptionVector
010017413DKNY Unisex Black & Grey Printed Medium Trolle...DKNYUnisex117457Black and grey printed medium trolley bag, sec...Black[0.027645726, -0.0026341523, -0.0035884595, 0....
110016283EthnoVogue Women Beige & Grey Made to Measure ...EthnoVogueWomen58107Beige & Grey made to measure kurta with churid...Beige[-0.024660703, -0.028755462, -0.02033245, 0.03...
210009781SPYKAR Women Pink Alexa Super Skinny Fit High-...SPYKARWomen8997Pink coloured wash 5-pocket high-rise cropped ...Pink[-0.0469433, 0.08182786, 0.0483352, -0.0001582...
310015921Raymond Men Blue Self-Design Single-Breasted B...RaymondMen55995Blue self-design bandhgala suitBlue self-desig...Blue[-0.015098702, -0.010285483, 0.0094872955, -0....
410017833Parx Men Brown & Off-White Slim Fit Printed Ca...ParxMen7595Brown and off-white printed casual shirt, has ...White[-0.017746529, 0.0062094983, 0.021813963, 0.02...
\n", 341 | "
" 342 | ], 343 | "text/plain": [ 344 | " ProductID ProductName ProductBrand \\\n", 345 | "0 10017413 DKNY Unisex Black & Grey Printed Medium Trolle... DKNY \n", 346 | "1 10016283 EthnoVogue Women Beige & Grey Made to Measure ... EthnoVogue \n", 347 | "2 10009781 SPYKAR Women Pink Alexa Super Skinny Fit High-... SPYKAR \n", 348 | "3 10015921 Raymond Men Blue Self-Design Single-Breasted B... Raymond \n", 349 | "4 10017833 Parx Men Brown & Off-White Slim Fit Printed Ca... Parx \n", 350 | "\n", 351 | " Gender Price (INR) NumImages \\\n", 352 | "0 Unisex 11745 7 \n", 353 | "1 Women 5810 7 \n", 354 | "2 Women 899 7 \n", 355 | "3 Men 5599 5 \n", 356 | "4 Men 759 5 \n", 357 | "\n", 358 | " Description PrimaryColor \\\n", 359 | "0 Black and grey printed medium trolley bag, sec... Black \n", 360 | "1 Beige & Grey made to measure kurta with churid... Beige \n", 361 | "2 Pink coloured wash 5-pocket high-rise cropped ... Pink \n", 362 | "3 Blue self-design bandhgala suitBlue self-desig... Blue \n", 363 | "4 Brown and off-white printed casual shirt, has ... White \n", 364 | "\n", 365 | " DescriptionVector \n", 366 | "0 [0.027645726, -0.0026341523, -0.0035884595, 0.... \n", 367 | "1 [-0.024660703, -0.028755462, -0.02033245, 0.03... \n", 368 | "2 [-0.0469433, 0.08182786, 0.0483352, -0.0001582... \n", 369 | "3 [-0.015098702, -0.010285483, 0.0094872955, -0.... \n", 370 | "4 [-0.017746529, 0.0062094983, 0.021813963, 0.02... 
" 371 | ] 372 | }, 373 | "execution_count": 9, 374 | "metadata": {}, 375 | "output_type": "execute_result" 376 | } 377 | ], 378 | "source": [ 379 | "df.head()" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": 10, 385 | "metadata": {}, 386 | "outputs": [ 387 | { 388 | "data": { 389 | "text/plain": [ 390 | "True" 391 | ] 392 | }, 393 | "execution_count": 10, 394 | "metadata": {}, 395 | "output_type": "execute_result" 396 | } 397 | ], 398 | "source": [ 399 | "es.ping()" 400 | ] 401 | }, 402 | { 403 | "cell_type": "markdown", 404 | "metadata": {}, 405 | "source": [ 406 | "## Create new index in ElasticSearch!" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": 12, 412 | "metadata": {}, 413 | "outputs": [ 414 | { 415 | "data": { 416 | "text/plain": [ 417 | "ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'all_products'})" 418 | ] 419 | }, 420 | "execution_count": 12, 421 | "metadata": {}, 422 | "output_type": "execute_result" 423 | } 424 | ], 425 | "source": [ 426 | "from indexMapping import indexMapping\n", 427 | "\n", 428 | "es.indices.create(index=\"all_products\", mappings=indexMapping)" 429 | ] 430 | }, 431 | { 432 | "cell_type": "markdown", 433 | "metadata": {}, 434 | "source": [ 435 | "## Ingest the data into index" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": 14, 441 | "metadata": {}, 442 | "outputs": [], 443 | "source": [ 444 | "record_list = df.to_dict(\"records\")" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": 16, 450 | "metadata": {}, 451 | "outputs": [], 452 | "source": [ 453 | "for record in record_list:\n", 454 | " try:\n", 455 | " es.index(index=\"all_products\", document=record, id=record[\"ProductID\"])\n", 456 | " except Exception as e:\n", 457 | " print(e)" 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": 17, 463 | "metadata": {}, 464 | "outputs": [ 465 | { 466 | "data": { 467 
| "text/plain": [ 468 | "ObjectApiResponse({'count': 500, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}})" 469 | ] 470 | }, 471 | "execution_count": 17, 472 | "metadata": {}, 473 | "output_type": "execute_result" 474 | } 475 | ], 476 | "source": [ 477 | "es.count(index=\"all_products\")" 478 | ] 479 | }, 480 | { 481 | "cell_type": "markdown", 482 | "metadata": {}, 483 | "source": [ 484 | "## Search the data" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": 19, 490 | "metadata": {}, 491 | "outputs": [ 492 | { 493 | "name": "stderr", 494 | "output_type": "stream", 495 | "text": [ 496 | "/var/folders/3k/901cf0jd1lngqxfl_y527qp00000gn/T/ipykernel_22712/2784084207.py:11: ElasticsearchWarning: The kNN search API has been replaced by the `knn` option in the search API.\n", 497 | " res = es.knn_search(index=\"all_products\", knn=query , source=[\"ProductName\",\"Description\"])\n" 498 | ] 499 | }, 500 | { 501 | "data": { 502 | "text/plain": [ 503 | "[{'_index': 'all_products',\n", 504 | " '_id': '10018013',\n", 505 | " '_score': 0.61429405,\n", 506 | " '_source': {'ProductName': 'Puma Men Blue Sneakers',\n", 507 | " 'Description': 'A pair of round-toe blue sneakers, has regular styling, lace-up detailTextile upperCushioned footbedTextured and patterned outsoleWarranty: 3 monthsWarranty provided by brand/manufacturer'}},\n", 508 | " {'_index': 'all_products',\n", 509 | " '_id': '10018075',\n", 510 | " '_score': 0.61429405,\n", 511 | " '_source': {'ProductName': 'Puma Men Blue Sneakers',\n", 512 | " 'Description': 'A pair of round-toe blue sneakers, has regular styling, lace-up detailTextile upperCushioned footbedTextured and patterned outsoleWarranty: 3 monthsWarranty provided by brand/manufacturer'}}]" 513 | ] 514 | }, 515 | "execution_count": 19, 516 | "metadata": {}, 517 | "output_type": "execute_result" 518 | } 519 | ], 520 | "source": [ 521 | "input_keyword = \"Blue Shoes\"\n", 522 | "vector_of_input_keyword = 
model.encode(input_keyword)\n", 523 | "\n", 524 | "query = {\n", 525 | " \"field\" : \"DescriptionVector\",\n", 526 | " \"query_vector\" : vector_of_input_keyword,\n", 527 | " \"k\" : 2,\n", 528 | " \"num_candidates\" : 500, \n", 529 | "}\n", 530 | "\n", 531 | "res = es.knn_search(index=\"all_products\", knn=query , source=[\"ProductName\",\"Description\"])\n", 532 | "res[\"hits\"][\"hits\"]" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": null, 538 | "metadata": {}, 539 | "outputs": [], 540 | "source": [] 541 | } 542 | ], 543 | "metadata": { 544 | "kernelspec": { 545 | "display_name": "venv", 546 | "language": "python", 547 | "name": "python3" 548 | }, 549 | "language_info": { 550 | "codemirror_mode": { 551 | "name": "ipython", 552 | "version": 3 553 | }, 554 | "file_extension": ".py", 555 | "mimetype": "text/x-python", 556 | "name": "python", 557 | "nbconvert_exporter": "python", 558 | "pygments_lexer": "ipython3", 559 | "version": "3.9.6" 560 | }, 561 | "orig_nbformat": 4 562 | }, 563 | "nbformat": 4, 564 | "nbformat_minor": 2 565 | } 566 | -------------------------------------------------------------------------------- /indexMapping.py: -------------------------------------------------------------------------------- 1 | indexMapping = { 2 | "properties":{ 3 | "ProductID":{ 4 | "type":"long" 5 | }, 6 | "ProductName":{ 7 | "type":"text" 8 | }, 9 | "ProductBrand":{ 10 | "type":"text" 11 | }, 12 | "Gender":{ 13 | "type":"text" 14 | }, 15 | "Price (INR)":{ 16 | "type":"long" 17 | }, 18 | "NumImages":{ 19 | "type":"long" 20 | }, 21 | "Description":{ 22 | "type":"text" 23 | }, 24 | "PrimaryColor":{ 25 | "type":"text" 26 | }, 27 | "DescriptionVector":{ 28 | "type":"dense_vector", 29 | "dims": 768, 30 | "index":True, 31 | "similarity": "l2_norm" 32 | } 33 | 34 | } 35 | } -------------------------------------------------------------------------------- /searchApp.py: 
"""Streamlit app for semantic search over the ``all_products`` index.

The user's query text is embedded with a sentence-transformers model and
matched against the ``DescriptionVector`` field using Elasticsearch kNN search.
"""
import os

import streamlit as st
from elasticsearch import ApiError, Elasticsearch, TransportError
from sentence_transformers import SentenceTransformer

indexName = "all_products"

# Connection settings may be overridden through environment variables so the
# app is not tied to a single machine. The fallbacks are the values this
# script originally hard-coded, which keeps existing setups working.
# NOTE(review): the fallback password is a credential committed to source
# control -- rotate it and remove the default once ES_PASSWORD is set.
ES_URL = os.environ.get("ES_URL", "https://localhost:9200")
ES_USERNAME = os.environ.get("ES_USERNAME", "elastic")
ES_PASSWORD = os.environ.get("ES_PASSWORD", "LQym+efHnUy9DbT-jtD2")
ES_CA_CERTS = os.environ.get(
    "ES_CA_CERTS",
    "/Users/abidsaudagar/Personal/yt1_semantic_search/"
    "elasticsearch-8.9.1/config/certs/http_ca.crt",
)

# No try/except here on purpose: swallowing a failure at this point would
# leave ``es`` undefined and only move the crash to the ``es.ping()`` call
# below as a confusing NameError.
es = Elasticsearch(
    ES_URL,
    basic_auth=(ES_USERNAME, ES_PASSWORD),
    ca_certs=ES_CA_CERTS,
)

if es.ping():
    print("Succesfully connected to ElasticSearch!!")
else:
    print("Oops!! Can not connect to Elasticsearch!")


@st.cache_resource
def _load_model():
    """Load the sentence-embedding model once and reuse it across reruns."""
    return SentenceTransformer("all-mpnet-base-v2")


def search(input_keyword, *, k=10, num_candidates=500):
    """Return the hits whose description vectors are nearest to the query.

    Args:
        input_keyword: Free-text query to embed and search for.
        k: Number of nearest neighbours to return.
        num_candidates: Number of candidates considered by the kNN search;
            raised to ``k`` if a smaller value is passed.

    Returns:
        The ``hits.hits`` list of the Elasticsearch response. Each hit's
        ``_source`` is restricted to ``ProductName`` and ``Description``.
    """
    vector_of_input_keyword = _load_model().encode(input_keyword)

    query = {
        "field": "DescriptionVector",
        # Convert to a plain list so the request body is JSON-native and does
        # not depend on the client knowing how to serialize array objects.
        "query_vector": vector_of_input_keyword.tolist(),
        "k": k,
        "num_candidates": max(num_candidates, k),
    }
    # The standalone kNN search endpoint is deprecated; the same query goes
    # through the regular search API via its ``knn`` option.
    res = es.search(
        index=indexName,
        knn=query,
        size=k,
        source=["ProductName", "Description"],
    )
    return res["hits"]["hits"]


def main():
    """Render the search page: a query box, a button, and the result list."""
    st.title("Search Myntra Fashion Products")

    # Input: User enters search query
    search_query = st.text_input("Enter your search query")

    # Button: User triggers the search
    if not st.button("Search"):
        return

    if not search_query or not search_query.strip():
        st.warning("Please enter a search query.")
        return

    try:
        results = search(search_query)
    except (ApiError, TransportError) as exc:
        # Show the failure in the page instead of only on the server console.
        st.error(f"Search failed: {exc}")
        return

    # Display search results
    st.subheader("Search Results")
    if not results:
        st.info("No matching products found.")
        return

    for result in results:
        source = result.get("_source")
        if source is None:
            continue
        with st.container():
            if "ProductName" in source:
                st.header(f"{source['ProductName']}")
            if "Description" in source:
                st.write(f"Description: {source['Description']}")
            st.divider()


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------