├── README.md
├── indexData.ipynb
├── indexMapping.py
├── myntra_products_catalog.csv
└── searchApp.py
/README.md:
--------------------------------------------------------------------------------
1 | # semantic-search-elastic-search-and-BERT-vector-embedding
--------------------------------------------------------------------------------
/indexData.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from elasticsearch import Elasticsearch"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 2,
15 | "metadata": {},
16 | "outputs": [
17 | {
18 | "data": {
19 | "text/plain": [
20 | "True"
21 | ]
22 | },
23 | "execution_count": 2,
24 | "metadata": {},
25 | "output_type": "execute_result"
26 | }
27 | ],
28 | "source": [
29 | "es = Elasticsearch(\n",
30 | " \"https://localhost:9200\",\n",
31 | " basic_auth=(\"elastic\",\"LQym+efHnUy9DbT-jtD2\"),\n",
32 | " ca_certs=\"/Users/abidsaudagar/Personal/yt1_semantic_search/elasticsearch-8.9.1/config/certs/http_ca.crt\"\n",
33 | ")\n",
34 | "es.ping()"
35 | ]
36 | },
37 | {
38 | "cell_type": "markdown",
39 | "metadata": {},
40 | "source": [
41 | "## Prepare the data"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": 3,
47 | "metadata": {},
48 | "outputs": [
49 | {
50 | "data": {
51 | "text/html": [
52 | "
\n",
53 | "\n",
66 | "
\n",
67 | " \n",
68 | " \n",
69 | " | \n",
70 | " ProductID | \n",
71 | " ProductName | \n",
72 | " ProductBrand | \n",
73 | " Gender | \n",
74 | " Price (INR) | \n",
75 | " NumImages | \n",
76 | " Description | \n",
77 | " PrimaryColor | \n",
78 | "
\n",
79 | " \n",
80 | " \n",
81 | " \n",
82 | " 0 | \n",
83 | " 10017413 | \n",
84 | " DKNY Unisex Black & Grey Printed Medium Trolle... | \n",
85 | " DKNY | \n",
86 | " Unisex | \n",
87 | " 11745 | \n",
88 | " 7 | \n",
89 | " Black and grey printed medium trolley bag, sec... | \n",
90 | " Black | \n",
91 | "
\n",
92 | " \n",
93 | " 1 | \n",
94 | " 10016283 | \n",
95 | " EthnoVogue Women Beige & Grey Made to Measure ... | \n",
96 | " EthnoVogue | \n",
97 | " Women | \n",
98 | " 5810 | \n",
99 | " 7 | \n",
100 | " Beige & Grey made to measure kurta with churid... | \n",
101 | " Beige | \n",
102 | "
\n",
103 | " \n",
104 | " 2 | \n",
105 | " 10009781 | \n",
106 | " SPYKAR Women Pink Alexa Super Skinny Fit High-... | \n",
107 | " SPYKAR | \n",
108 | " Women | \n",
109 | " 899 | \n",
110 | " 7 | \n",
111 | " Pink coloured wash 5-pocket high-rise cropped ... | \n",
112 | " Pink | \n",
113 | "
\n",
114 | " \n",
115 | " 3 | \n",
116 | " 10015921 | \n",
117 | " Raymond Men Blue Self-Design Single-Breasted B... | \n",
118 | " Raymond | \n",
119 | " Men | \n",
120 | " 5599 | \n",
121 | " 5 | \n",
122 | " Blue self-design bandhgala suitBlue self-desig... | \n",
123 | " Blue | \n",
124 | "
\n",
125 | " \n",
126 | " 4 | \n",
127 | " 10017833 | \n",
128 | " Parx Men Brown & Off-White Slim Fit Printed Ca... | \n",
129 | " Parx | \n",
130 | " Men | \n",
131 | " 759 | \n",
132 | " 5 | \n",
133 | " Brown and off-white printed casual shirt, has ... | \n",
134 | " White | \n",
135 | "
\n",
136 | " \n",
137 | "
\n",
138 | "
"
139 | ],
140 | "text/plain": [
141 | " ProductID ProductName ProductBrand \\\n",
142 | "0 10017413 DKNY Unisex Black & Grey Printed Medium Trolle... DKNY \n",
143 | "1 10016283 EthnoVogue Women Beige & Grey Made to Measure ... EthnoVogue \n",
144 | "2 10009781 SPYKAR Women Pink Alexa Super Skinny Fit High-... SPYKAR \n",
145 | "3 10015921 Raymond Men Blue Self-Design Single-Breasted B... Raymond \n",
146 | "4 10017833 Parx Men Brown & Off-White Slim Fit Printed Ca... Parx \n",
147 | "\n",
148 | " Gender Price (INR) NumImages \\\n",
149 | "0 Unisex 11745 7 \n",
150 | "1 Women 5810 7 \n",
151 | "2 Women 899 7 \n",
152 | "3 Men 5599 5 \n",
153 | "4 Men 759 5 \n",
154 | "\n",
155 | " Description PrimaryColor \n",
156 | "0 Black and grey printed medium trolley bag, sec... Black \n",
157 | "1 Beige & Grey made to measure kurta with churid... Beige \n",
158 | "2 Pink coloured wash 5-pocket high-rise cropped ... Pink \n",
159 | "3 Blue self-design bandhgala suitBlue self-desig... Blue \n",
160 | "4 Brown and off-white printed casual shirt, has ... White "
161 | ]
162 | },
163 | "execution_count": 3,
164 | "metadata": {},
165 | "output_type": "execute_result"
166 | }
167 | ],
168 | "source": [
169 | "import pandas as pd\n",
170 | "\n",
171 | "df = pd.read_csv(\"myntra_products_catalog.csv\").loc[:499]\n",
172 | "df.head()"
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": 6,
178 | "metadata": {},
179 | "outputs": [
180 | {
181 | "data": {
182 | "text/plain": [
183 | "ProductID ProductName ProductBrand Gender Price (INR) NumImages Description PrimaryColor\n",
184 | "False False False False False False False False 500\n",
185 | "Name: count, dtype: int64"
186 | ]
187 | },
188 | "execution_count": 6,
189 | "metadata": {},
190 | "output_type": "execute_result"
191 | }
192 | ],
193 | "source": [
194 | "df.isna().value_counts()"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": 5,
200 | "metadata": {},
201 | "outputs": [],
202 | "source": [
203 | "df.fillna(\"None\", inplace=True)"
204 | ]
205 | },
206 | {
207 | "cell_type": "markdown",
208 | "metadata": {},
209 | "source": [
210 | "## Convert the relevant field to Vector using BERT model"
211 | ]
212 | },
213 | {
214 | "cell_type": "code",
215 | "execution_count": 7,
216 | "metadata": {},
217 | "outputs": [
218 | {
219 | "name": "stderr",
220 | "output_type": "stream",
221 | "text": [
222 | "/Users/abidsaudagar/Personal/yt1_semantic_search/semantic_search/venv/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
223 | " from .autonotebook import tqdm as notebook_tqdm\n"
224 | ]
225 | }
226 | ],
227 | "source": [
228 | "from sentence_transformers import SentenceTransformer\n",
229 | "model = SentenceTransformer('all-mpnet-base-v2')"
230 | ]
231 | },
232 | {
233 | "cell_type": "code",
234 | "execution_count": 8,
235 | "metadata": {},
236 | "outputs": [],
237 | "source": [
238 | "df[\"DescriptionVector\"] = df[\"Description\"].apply(lambda x: model.encode(x))"
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": 9,
244 | "metadata": {},
245 | "outputs": [
246 | {
247 | "data": {
248 | "text/html": [
249 | "\n",
250 | "\n",
263 | "
\n",
264 | " \n",
265 | " \n",
266 | " | \n",
267 | " ProductID | \n",
268 | " ProductName | \n",
269 | " ProductBrand | \n",
270 | " Gender | \n",
271 | " Price (INR) | \n",
272 | " NumImages | \n",
273 | " Description | \n",
274 | " PrimaryColor | \n",
275 | " DescriptionVector | \n",
276 | "
\n",
277 | " \n",
278 | " \n",
279 | " \n",
280 | " 0 | \n",
281 | " 10017413 | \n",
282 | " DKNY Unisex Black & Grey Printed Medium Trolle... | \n",
283 | " DKNY | \n",
284 | " Unisex | \n",
285 | " 11745 | \n",
286 | " 7 | \n",
287 | " Black and grey printed medium trolley bag, sec... | \n",
288 | " Black | \n",
289 | " [0.027645726, -0.0026341523, -0.0035884595, 0.... | \n",
290 | "
\n",
291 | " \n",
292 | " 1 | \n",
293 | " 10016283 | \n",
294 | " EthnoVogue Women Beige & Grey Made to Measure ... | \n",
295 | " EthnoVogue | \n",
296 | " Women | \n",
297 | " 5810 | \n",
298 | " 7 | \n",
299 | " Beige & Grey made to measure kurta with churid... | \n",
300 | " Beige | \n",
301 | " [-0.024660703, -0.028755462, -0.02033245, 0.03... | \n",
302 | "
\n",
303 | " \n",
304 | " 2 | \n",
305 | " 10009781 | \n",
306 | " SPYKAR Women Pink Alexa Super Skinny Fit High-... | \n",
307 | " SPYKAR | \n",
308 | " Women | \n",
309 | " 899 | \n",
310 | " 7 | \n",
311 | " Pink coloured wash 5-pocket high-rise cropped ... | \n",
312 | " Pink | \n",
313 | " [-0.0469433, 0.08182786, 0.0483352, -0.0001582... | \n",
314 | "
\n",
315 | " \n",
316 | " 3 | \n",
317 | " 10015921 | \n",
318 | " Raymond Men Blue Self-Design Single-Breasted B... | \n",
319 | " Raymond | \n",
320 | " Men | \n",
321 | " 5599 | \n",
322 | " 5 | \n",
323 | " Blue self-design bandhgala suitBlue self-desig... | \n",
324 | " Blue | \n",
325 | " [-0.015098702, -0.010285483, 0.0094872955, -0.... | \n",
326 | "
\n",
327 | " \n",
328 | " 4 | \n",
329 | " 10017833 | \n",
330 | " Parx Men Brown & Off-White Slim Fit Printed Ca... | \n",
331 | " Parx | \n",
332 | " Men | \n",
333 | " 759 | \n",
334 | " 5 | \n",
335 | " Brown and off-white printed casual shirt, has ... | \n",
336 | " White | \n",
337 | " [-0.017746529, 0.0062094983, 0.021813963, 0.02... | \n",
338 | "
\n",
339 | " \n",
340 | "
\n",
341 | "
"
342 | ],
343 | "text/plain": [
344 | " ProductID ProductName ProductBrand \\\n",
345 | "0 10017413 DKNY Unisex Black & Grey Printed Medium Trolle... DKNY \n",
346 | "1 10016283 EthnoVogue Women Beige & Grey Made to Measure ... EthnoVogue \n",
347 | "2 10009781 SPYKAR Women Pink Alexa Super Skinny Fit High-... SPYKAR \n",
348 | "3 10015921 Raymond Men Blue Self-Design Single-Breasted B... Raymond \n",
349 | "4 10017833 Parx Men Brown & Off-White Slim Fit Printed Ca... Parx \n",
350 | "\n",
351 | " Gender Price (INR) NumImages \\\n",
352 | "0 Unisex 11745 7 \n",
353 | "1 Women 5810 7 \n",
354 | "2 Women 899 7 \n",
355 | "3 Men 5599 5 \n",
356 | "4 Men 759 5 \n",
357 | "\n",
358 | " Description PrimaryColor \\\n",
359 | "0 Black and grey printed medium trolley bag, sec... Black \n",
360 | "1 Beige & Grey made to measure kurta with churid... Beige \n",
361 | "2 Pink coloured wash 5-pocket high-rise cropped ... Pink \n",
362 | "3 Blue self-design bandhgala suitBlue self-desig... Blue \n",
363 | "4 Brown and off-white printed casual shirt, has ... White \n",
364 | "\n",
365 | " DescriptionVector \n",
366 | "0 [0.027645726, -0.0026341523, -0.0035884595, 0.... \n",
367 | "1 [-0.024660703, -0.028755462, -0.02033245, 0.03... \n",
368 | "2 [-0.0469433, 0.08182786, 0.0483352, -0.0001582... \n",
369 | "3 [-0.015098702, -0.010285483, 0.0094872955, -0.... \n",
370 | "4 [-0.017746529, 0.0062094983, 0.021813963, 0.02... "
371 | ]
372 | },
373 | "execution_count": 9,
374 | "metadata": {},
375 | "output_type": "execute_result"
376 | }
377 | ],
378 | "source": [
379 | "df.head()"
380 | ]
381 | },
382 | {
383 | "cell_type": "code",
384 | "execution_count": 10,
385 | "metadata": {},
386 | "outputs": [
387 | {
388 | "data": {
389 | "text/plain": [
390 | "True"
391 | ]
392 | },
393 | "execution_count": 10,
394 | "metadata": {},
395 | "output_type": "execute_result"
396 | }
397 | ],
398 | "source": [
399 | "es.ping()"
400 | ]
401 | },
402 | {
403 | "cell_type": "markdown",
404 | "metadata": {},
405 | "source": [
406 | "## Create new index in ElasticSearch!"
407 | ]
408 | },
409 | {
410 | "cell_type": "code",
411 | "execution_count": 12,
412 | "metadata": {},
413 | "outputs": [
414 | {
415 | "data": {
416 | "text/plain": [
417 | "ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'all_products'})"
418 | ]
419 | },
420 | "execution_count": 12,
421 | "metadata": {},
422 | "output_type": "execute_result"
423 | }
424 | ],
425 | "source": [
426 | "from indexMapping import indexMapping\n",
427 | "\n",
428 | "es.indices.create(index=\"all_products\", mappings=indexMapping)"
429 | ]
430 | },
431 | {
432 | "cell_type": "markdown",
433 | "metadata": {},
434 | "source": [
435 | "## Ingest the data into index"
436 | ]
437 | },
438 | {
439 | "cell_type": "code",
440 | "execution_count": 14,
441 | "metadata": {},
442 | "outputs": [],
443 | "source": [
444 | "record_list = df.to_dict(\"records\")"
445 | ]
446 | },
447 | {
448 | "cell_type": "code",
449 | "execution_count": 16,
450 | "metadata": {},
451 | "outputs": [],
452 | "source": [
453 | "for record in record_list:\n",
454 | " try:\n",
455 | " es.index(index=\"all_products\", document=record, id=record[\"ProductID\"])\n",
456 | " except Exception as e:\n",
457 | " print(e)"
458 | ]
459 | },
460 | {
461 | "cell_type": "code",
462 | "execution_count": 17,
463 | "metadata": {},
464 | "outputs": [
465 | {
466 | "data": {
467 | "text/plain": [
468 | "ObjectApiResponse({'count': 500, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}})"
469 | ]
470 | },
471 | "execution_count": 17,
472 | "metadata": {},
473 | "output_type": "execute_result"
474 | }
475 | ],
476 | "source": [
477 | "es.count(index=\"all_products\")"
478 | ]
479 | },
480 | {
481 | "cell_type": "markdown",
482 | "metadata": {},
483 | "source": [
484 | "## Search the data"
485 | ]
486 | },
487 | {
488 | "cell_type": "code",
489 | "execution_count": 19,
490 | "metadata": {},
491 | "outputs": [
492 | {
493 | "name": "stderr",
494 | "output_type": "stream",
495 | "text": [
496 | "/var/folders/3k/901cf0jd1lngqxfl_y527qp00000gn/T/ipykernel_22712/2784084207.py:11: ElasticsearchWarning: The kNN search API has been replaced by the `knn` option in the search API.\n",
497 | " res = es.knn_search(index=\"all_products\", knn=query , source=[\"ProductName\",\"Description\"])\n"
498 | ]
499 | },
500 | {
501 | "data": {
502 | "text/plain": [
503 | "[{'_index': 'all_products',\n",
504 | " '_id': '10018013',\n",
505 | " '_score': 0.61429405,\n",
506 | " '_source': {'ProductName': 'Puma Men Blue Sneakers',\n",
507 | " 'Description': 'A pair of round-toe blue sneakers, has regular styling, lace-up detailTextile upperCushioned footbedTextured and patterned outsoleWarranty: 3 monthsWarranty provided by brand/manufacturer'}},\n",
508 | " {'_index': 'all_products',\n",
509 | " '_id': '10018075',\n",
510 | " '_score': 0.61429405,\n",
511 | " '_source': {'ProductName': 'Puma Men Blue Sneakers',\n",
512 | " 'Description': 'A pair of round-toe blue sneakers, has regular styling, lace-up detailTextile upperCushioned footbedTextured and patterned outsoleWarranty: 3 monthsWarranty provided by brand/manufacturer'}}]"
513 | ]
514 | },
515 | "execution_count": 19,
516 | "metadata": {},
517 | "output_type": "execute_result"
518 | }
519 | ],
520 | "source": [
521 | "input_keyword = \"Blue Shoes\"\n",
522 | "vector_of_input_keyword = model.encode(input_keyword)\n",
523 | "\n",
524 | "query = {\n",
525 | " \"field\" : \"DescriptionVector\",\n",
526 | " \"query_vector\" : vector_of_input_keyword,\n",
527 | " \"k\" : 2,\n",
528 | " \"num_candidates\" : 500, \n",
529 | "}\n",
530 | "\n",
531 | "res = es.knn_search(index=\"all_products\", knn=query , source=[\"ProductName\",\"Description\"])\n",
532 | "res[\"hits\"][\"hits\"]"
533 | ]
534 | },
535 | {
536 | "cell_type": "code",
537 | "execution_count": null,
538 | "metadata": {},
539 | "outputs": [],
540 | "source": []
541 | }
542 | ],
543 | "metadata": {
544 | "kernelspec": {
545 | "display_name": "venv",
546 | "language": "python",
547 | "name": "python3"
548 | },
549 | "language_info": {
550 | "codemirror_mode": {
551 | "name": "ipython",
552 | "version": 3
553 | },
554 | "file_extension": ".py",
555 | "mimetype": "text/x-python",
556 | "name": "python",
557 | "nbconvert_exporter": "python",
558 | "pygments_lexer": "ipython3",
559 | "version": "3.9.6"
560 | },
561 | "orig_nbformat": 4
562 | },
563 | "nbformat": 4,
564 | "nbformat_minor": 2
565 | }
566 |
--------------------------------------------------------------------------------
/indexMapping.py:
--------------------------------------------------------------------------------
# Elasticsearch mapping for the product index.
#
# Scalar catalogue columns are listed as (field name, Elasticsearch type)
# pairs, in the same order as the original mapping.
_SCALAR_FIELD_TYPES = (
    ("ProductID", "long"),
    ("ProductName", "text"),
    ("ProductBrand", "text"),
    ("Gender", "text"),
    ("Price (INR)", "long"),
    ("NumImages", "long"),
    ("Description", "text"),
    ("PrimaryColor", "text"),
)

_properties = {
    field_name: {"type": es_type}
    for field_name, es_type in _SCALAR_FIELD_TYPES
}

# Embedding of the Description column: an indexed 768-dimensional dense
# vector compared with the l2_norm similarity.
# NOTE(review): 768 presumably matches the output size of the embedding model
# used when the index is populated -- confirm if the model is ever changed.
_properties["DescriptionVector"] = dict(
    type="dense_vector",
    dims=768,
    index=True,
    similarity="l2_norm",
)

indexMapping = dict(properties=_properties)
--------------------------------------------------------------------------------
/searchApp.py:
--------------------------------------------------------------------------------
import os

import streamlit as st
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer
4 |
# Name of the Elasticsearch index that holds the product documents.
indexName = "all_products"

# Connection settings. Every value can be overridden with an environment
# variable so that credentials and machine-specific paths do not have to be
# edited in source; the fallbacks reproduce the original local setup.
# NOTE(review): the fallback password is a secret committed to the repository
# -- rotate it and remove the default once ES_PASSWORD is set where this runs.
_ES_URL = os.environ.get("ES_URL", "https://localhost:9200")
_ES_USERNAME = os.environ.get("ES_USERNAME", "elastic")
_ES_PASSWORD = os.environ.get("ES_PASSWORD", "LQym+efHnUy9DbT-jtD2")
_ES_CA_CERTS = os.environ.get(
    "ES_CA_CERTS",
    "/Users/abidsaudagar/Personal/yt1_semantic_search/elasticsearch-8.9.1/config/certs/http_ca.crt",
)

try:
    es = Elasticsearch(
        _ES_URL,
        basic_auth=(_ES_USERNAME, _ES_PASSWORD),
        ca_certs=_ES_CA_CERTS,
    )
except ConnectionError as e:
    print("Connection Error:", e)
    # `es` was never bound, so continuing would only fail on the next
    # statement with an unrelated NameError; surface the real error instead.
    raise

# Report on stdout whether the cluster answers a ping.
if es.ping():
    print("Succesfully connected to ElasticSearch!!")
else:
    print("Oops!! Can not connect to Elasticsearch!")
20 |
21 |
22 |
23 |
def _load_model():
    """Build the sentence-embedding model used to encode search queries."""
    return SentenceTransformer('all-mpnet-base-v2')


def search(input_keyword):
    """Return the products whose description embedding is nearest to a query.

    The query text is encoded with the 'all-mpnet-base-v2' model and sent as
    a kNN query (k=10, 500 candidates) against the ``DescriptionVector``
    field of the index named by ``indexName``.

    Args:
        input_keyword: Free-text search query.

    Returns:
        The list found under ``["hits"]["hits"]`` in the Elasticsearch
        response. Each hit's ``_source`` is restricted to ``ProductName``
        and ``Description``.
    """
    # st.cache_resource hands back one shared model instance instead of
    # re-creating the SentenceTransformer on every search. It is applied
    # here rather than as a decorator so that merely defining this function
    # does not touch Streamlit.
    model = st.cache_resource(_load_model)()
    vector_of_input_keyword = model.encode(input_keyword)

    query = {
        "field": "DescriptionVector",
        "query_vector": vector_of_input_keyword,
        "k": 10,
        "num_candidates": 500,
    }
    # The standalone kNN search API is deprecated in favour of the `knn`
    # option of the regular search API, so use es.search() with the same
    # query. The response keeps the same hits structure.
    res = es.search(
        index=indexName,
        knn=query,
        source=["ProductName", "Description"],
    )
    return res["hits"]["hits"]
41 |
def main():
    """Render the Streamlit search page.

    Shows a title, a text input and a "Search" button. When the button is
    pressed with a non-blank query, the query is passed to ``search`` and
    each returned hit that has a ``_source`` is rendered as a header
    (``ProductName``), a description line (``Description``) and a divider.
    A blank query or an empty result list produces a short notice instead
    of an empty page.
    """
    st.title("Search Myntra Fashion Products")

    # Input: User enters search query
    search_query = st.text_input("Enter your search query")

    # Button: User triggers the search
    if not st.button("Search"):
        return

    # Nothing useful can be searched for with an empty / whitespace-only query.
    if not search_query.strip():
        st.warning("Please enter a search query.")
        return

    # Perform the search and get results
    results = search(search_query)

    # Display search results
    st.subheader("Search Results")
    if not results:
        st.info("No matching products found.")
        return

    for result in results:
        if '_source' not in result:
            continue
        source = result['_source']

        with st.container():
            # Fields are checked explicitly instead of swallowing every
            # exception: a missing field is skipped, anything else surfaces.
            if 'ProductName' in source:
                st.header(f"{source['ProductName']}")

            if 'Description' in source:
                st.write(f"Description: {source['Description']}")
            st.divider()


if __name__ == "__main__":
    main()
73 |
--------------------------------------------------------------------------------