├── DeepTextSearch
    ├── DeepTextSearch.py
    ├── __init__.py
    └── requirements.txt
├── Demo
    ├── Deep Text Search Demo.ipynb
    └── DeepTextSearchDemo.py
├── LICENSE.txt
├── README.md
├── logo
    ├── DeepTextSearch Logo-1.png
    └── DeepTextSearch Logo-2.png
├── setup.cfg
└── setup.py


/DeepTextSearch/DeepTextSearch.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | from sentence_transformers import SentenceTransformer, util
 3 | import numpy as np
 4 | import pickle
 5 | import os
 6 | 
 7 | 
 8 | corpus_list_data = os.path.join('embedding-data/','corpus_list_data.pickle')
 9 | corpus_embeddings_data = os.path.join('embedding-data/','corpus_embeddings_data.pickle')
10 | 
class LoadData:
    """Load a text corpus from a file into a plain Python list.

    The most recently loaded corpus is also kept on ``self.corpus_list`` and
    the path it came from on ``self.file_path``.
    """

    def __init__(self):
        self.corpus_list = None

    def from_csv(self, file_path: str, column_name=None):
        """Read one text column of a CSV file as the corpus.

        Args:
            file_path: Path of the CSV file, handed to ``pandas.read_csv``.
            column_name: Name of the column holding the texts. When left as
                ``None`` the name is asked for on standard input, which keeps
                the original interactive behaviour.

        Returns:
            The non-null values of the selected column as a list.

        Raises:
            KeyError: If the CSV file has no column called ``column_name``.
        """
        self.file_path = file_path
        csv_data = pd.read_csv(file_path)
        if column_name is None:
            # Only prompt when the caller did not pick a column, so the
            # loader can also be used from non-interactive code.
            column_name = str(input('Input the text Column Name Please ? : '))
        self.corpus_list = csv_data[column_name].dropna().to_list()
        return self.corpus_list

    def from_text(self, file_path: str, encoding: str = 'utf-8'):
        """Read a plain-text file as the corpus, one entry per line.

        Surrounding whitespace is stripped from every line and lines that are
        empty afterwards are skipped.

        Args:
            file_path: Path of the text file.
            encoding: Text encoding used to decode the file.

        Returns:
            The non-blank lines of the file as a list of strings.
        """
        self.file_path = file_path
        with open(file_path, encoding=encoding) as text_file:
            stripped_lines = (line.strip() for line in text_file)
            self.corpus_list = [line for line in stripped_lines if line]
        return self.corpus_list
20 | 
class TextEmbedder:
    """Embed a text corpus with a SentenceTransformer model and persist it.

    The embeddings and the corpus texts are pickled to the module-level
    ``corpus_embeddings_data`` and ``corpus_list_data`` paths, which live in
    the ``embedding-data`` directory of the current working directory.
    """

    def __init__(self):
        self.corpus_embeddings_data = corpus_embeddings_data
        self.corpus_list_data = corpus_list_data
        self.corpus_list = None
        self.embedder = SentenceTransformer('paraphrase-xlm-r-multilingual-v1')
        self.corpus_embeddings = None
        # exist_ok replaces the former "list the cwd, then create" check,
        # which could race with another process creating the directory.
        os.makedirs("embedding-data", exist_ok=True)

    def _has_saved_embedding(self):
        """Return True when both pickle files of a previous run exist."""
        return (os.path.isfile(self.corpus_embeddings_data)
                and os.path.isfile(self.corpus_list_data))

    def _save_embedding(self):
        """Pickle the current embeddings and corpus list to their files."""
        with open(self.corpus_embeddings_data, "wb") as embeddings_file:
            pickle.dump(self.corpus_embeddings, embeddings_file)
        with open(self.corpus_list_data, "wb") as corpus_file:
            pickle.dump(self.corpus_list, corpus_file)

    def embed(self, corpus_list: list):
        """Embed ``corpus_list`` and save embeddings and texts to disk.

        When saved embedding data is already present the user is asked on
        standard input whether to embed and save again; any answer other
        than ``yes`` (case-insensitive) leaves the saved data untouched.

        Args:
            corpus_list: The texts to embed.

        Returns:
            None. The embeddings are available on ``self.corpus_embeddings``
            when embedding was performed.
        """
        self.corpus_list = corpus_list
        if self._has_saved_embedding():
            print("Embedding data allready present, Do you want Embed & Save Again? Enter yes or no")
            flag = str(input())
            if flag.lower() != 'yes':
                print("Embedding data allready Present, Please Apply Search!")
                print(os.listdir("embedding-data/"))
                return
            saved_message = "Embedding data Saved Successfully Again!"
        else:
            saved_message = "Embedding data Saved Successfully!"
        self.corpus_embeddings = self.embedder.encode(
            self.corpus_list, convert_to_tensor=True, show_progress_bar=True)
        self._save_embedding()
        print(saved_message)
        print(os.listdir("embedding-data/"))

    def load_embedding(self):
        """Load the pickled corpus embeddings from disk.

        Returns:
            The unpickled embeddings, or ``None`` (after printing a hint)
            when no embeddings file has been saved yet.
        """
        if not os.path.isfile(self.corpus_embeddings_data):
            print("Embedding data Not present, Please Run Embedding First")
            return None
        # NOTE: pickle can execute arbitrary code while loading; only load
        # embedding files that were written by this class.
        with open(self.corpus_embeddings_data, "rb") as embeddings_file:
            corpus_embeddings = pickle.load(embeddings_file)
        # Report success only once the data has actually been read.
        print("Embedding data Loaded Successfully!")
        print(os.listdir("embedding-data/"))
        return corpus_embeddings
59 | 
class TextSearch:
    """Search the saved corpus for the texts most similar to a query.

    Construction loads the pickled corpus embeddings and corpus texts from
    the module-level ``corpus_embeddings_data`` / ``corpus_list_data`` paths,
    so ``TextEmbedder().embed(...)`` must have been run beforehand.
    """

    def __init__(self):
        # NOTE: pickle can execute arbitrary code while loading; only load
        # files that were written by TextEmbedder.
        with open(corpus_embeddings_data, "rb") as embeddings_file:
            self.corpus_embeddings = pickle.load(embeddings_file)
        with open(corpus_list_data, "rb") as corpus_file:
            self.data = pickle.load(corpus_file)
        # Sentence encoder, created on the first query and then reused.
        self._embedder = None

    def find_similar(self, query_text: str, top_n=10):
        """Rank the corpus by cosine similarity to ``query_text``.

        Args:
            query_text: The text to search for.
            top_n: Maximum number of results to return.

        Returns:
            A list of dicts with the keys ``index`` (position in the corpus),
            ``text`` (the corpus entry) and ``score`` (its cosine similarity
            to the query), ordered from most to least similar.
        """
        self.top_n = top_n
        self.query_text = query_text
        if getattr(self, '_embedder', None) is None:
            # Building a TextEmbedder loads the transformer model; do it once
            # per TextSearch instance instead of once per query.
            self._embedder = TextEmbedder().embedder
        self.query_embedding = self._embedder.encode(self.query_text, convert_to_tensor=True)
        self.cos_scores = util.pytorch_cos_sim(
            self.query_embedding, self.corpus_embeddings)[0].cpu().data.numpy()
        # Negate the scores so that argsort yields a descending order.
        self.sort_list = np.argsort(-self.cos_scores)
        # NOTE(review): the slice starts at 1, so the single best-scoring
        # entry is never returned. This looks intended for queries that are
        # themselves part of the corpus - confirm before changing.
        self.all_data = [
            {
                'index': int(idx),
                'text': self.data[idx],
                'score': self.cos_scores[idx],
            }
            for idx in self.sort_list[1:self.top_n + 1]
        ]
        return self.all_data


--------------------------------------------------------------------------------
/DeepTextSearch/__init__.py:
--------------------------------------------------------------------------------
1 | from DeepTextSearch.DeepTextSearch import LoadData,TextEmbedder,TextSearch


--------------------------------------------------------------------------------
/DeepTextSearch/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas==1.2.4
2 | sentence_transformers==1.2.0
3 | numpy==1.18.5
4 | 


--------------------------------------------------------------------------------
/Demo/Deep Text Search Demo.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "id": "49594b04",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "# Importing the proper classes\n",
 11 |     "from DeepTextSearch import LoadData,TextEmbedder,TextSearch"
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "code",
 16 |    "execution_count": 2,
 17 |    "id": "a5424e23",
 18 |    "metadata": {},
 19 |    "outputs": [
 20 |     {
 21 |      "name": "stdout",
 22 |      "output_type": "stream",
 23 |      "text": [
 24 |       "Input the text Column Name Please ? : Question\n"
 25 |      ]
 26 |     }
 27 |    ],
 28 |    "source": [
 29 |     "# Load data from CSV file\n",
 30 |     "data = LoadData().from_csv(\"../your_file_name.csv\")"
 31 |    ]
 32 |   },
 33 |   {
 34 |    "cell_type": "code",
 35 |    "execution_count": 3,
 36 |    "id": "5ce9f30d",
 37 |    "metadata": {},
 38 |    "outputs": [
 39 |     {
 40 |      "data": {
 41 |       "application/vnd.jupyter.widget-view+json": {
 42 |        "model_id": "26865bd100c948a6945f2e47ad3a9183",
 43 |        "version_major": 2,
 44 |        "version_minor": 0
 45 |       },
 46 |       "text/plain": [
 47 |        "Batches:   0%|          | 0/19 [00:00<?, ?it/s]"
 48 |       ]
 49 |      },
 50 |      "metadata": {},
 51 |      "output_type": "display_data"
 52 |     },
 53 |     {
 54 |      "name": "stdout",
 55 |      "output_type": "stream",
 56 |      "text": [
 57 |       "Embedding data Saved Successfully!\n",
 58 |       "['corpus_embeddings_data.pickle', 'corpus_list_data.pickle']\n"
 59 |      ]
 60 |     }
 61 |    ],
 62 |    "source": [
 63 |     "# For Searching we need to Embed Data first, After Embedding all the data stored on the local path\n",
 64 |     "TextEmbedder().embed(corpus_list=data)"
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "code",
 69 |    "execution_count": 4,
 70 |    "id": "5f349322",
 71 |    "metadata": {},
 72 |    "outputs": [
 73 |     {
 74 |      "data": {
 75 |       "text/plain": [
 76 |        "[{'index': 575, 'text': 'What is Node.js?', 'score': 0.88481015},\n",
 77 |        " {'index': 578, 'text': 'When should we use Node.js?', 'score': 0.8388137},\n",
 78 |        " {'index': 581, 'text': 'Explain how does Node.js work?', 'score': 0.8064759},\n",
 79 |        " {'index': 591, 'text': 'What are Globals in Node.js?', 'score': 0.7844132},\n",
 80 |        " {'index': 602,\n",
 81 |        "  'text': 'What is chaining process in Node.js?',\n",
 82 |        "  'score': 0.7806176},\n",
 83 |        " {'index': 596, 'text': 'What is NPM in Node.js?', 'score': 0.76716936},\n",
 84 |        " {'index': 586, 'text': 'What is Callback in Node.js?', 'score': 0.7659653},\n",
 85 |        " {'index': 579, 'text': 'When to not use Node.js?', 'score': 0.7643588},\n",
 86 |        " {'index': 593,\n",
 87 |        "  'text': 'What is EventEmitter in Node.js?',\n",
 88 |        "  'score': 0.7514152},\n",
 89 |        " {'index': 580,\n",
 90 |        "  'text': 'What IDEs can you use for Node.js development?',\n",
 91 |        "  'score': 0.74787086}]"
 92 |       ]
 93 |      },
 94 |      "execution_count": 4,
 95 |      "metadata": {},
 96 |      "output_type": "execute_result"
 97 |     }
 98 |    ],
 99 |    "source": [
100 |     "# for searching, you need to give the query_text  and the number of the similar text you want\n",
101 |     "TextSearch().find_similar(query_text=\"What are the key features of Node.js?\",top_n=10)"
102 |    ]
103 |   },
104 |   {
105 |    "cell_type": "code",
106 |    "execution_count": null,
107 |    "id": "e8b4c035",
108 |    "metadata": {},
109 |    "outputs": [],
110 |    "source": []
111 |   }
112 |  ],
113 |  "metadata": {
114 |   "kernelspec": {
115 |    "display_name": "Python 3",
116 |    "language": "python",
117 |    "name": "python3"
118 |   },
119 |   "language_info": {
120 |    "codemirror_mode": {
121 |     "name": "ipython",
122 |     "version": 3
123 |    },
124 |    "file_extension": ".py",
125 |    "mimetype": "text/x-python",
126 |    "name": "python",
127 |    "nbconvert_exporter": "python",
128 |    "pygments_lexer": "ipython3",
129 |    "version": "3.8.9"
130 |   }
131 |  },
132 |  "nbformat": 4,
133 |  "nbformat_minor": 5
134 | }
135 | 


--------------------------------------------------------------------------------
/Demo/DeepTextSearchDemo.py:
--------------------------------------------------------------------------------
 1 | # Importing the proper classes
 2 | from DeepTextSearch import LoadData,TextEmbedder,TextSearch
 3 | 
 4 | # Load data from CSV file
 5 | data = LoadData().from_csv("../your_file_name.csv")
 6 | 
 7 | # For Serching we need to Embed Data first, After Embedding all the data stored on the local path
 8 | TextEmbedder().embed(corpus_list=data)
 9 | 
10 | # for searching, you need to give the query_text  and the number of the similar text you want
11 | TextSearch().find_similar(query_text="What are the key features of Node.js?",top_n=10)
12 | 
13 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | Copyright (c) 2021 Nilesh Verma
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | of this software and associated documentation files (the "Software"), to deal
 5 | in the Software without restriction, including without limitation the rights
 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 7 | copies of the Software, and to permit persons to whom the Software is
 8 | furnished to do so, subject to the following conditions:
 9 | The above copyright notice and this permission notice shall be included in all
10 | copies or substantial portions of the Software.
11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
13 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
14 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
15 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
16 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
17 | SOFTWARE.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Deep Text Search - AI Based Text Search & Recommendation System
  2 | <p align="center"><img src="https://github.com/TechyNilesh/DeepTextSearch/blob/main/logo/DeepTextSearch%20Logo-2.png?raw=true" alt="Brain+Machine" height="218" width="350"></p>
  3 | 
  4 | **Deep Text Search** is an AI-powered multilingual **text search and recommendation engine** with state-of-the-art transformer-based **multilingual text embedding (50+ languages)**.
  5 | 
  6 | ![Generic badge](https://img.shields.io/badge/DeepTextSearch-v1-orange.svg) ![Generic badge](https://img.shields.io/badge/Artificial_Intelligence-Advance-green.svg) ![Generic badge](https://img.shields.io/badge/Python-v3-blue.svg) ![Generic badge](https://img.shields.io/badge/pip-v3-red.svg)  ![Generic badge](https://img.shields.io/badge/SentenceTransformer-v1-orange.svg) [![Downloads](https://static.pepy.tech/personalized-badge/deeptextsearch?period=total&units=none&left_color=grey&right_color=green&left_text=Downloads)](https://pepy.tech/project/deeptextsearch)
  7 | 
  8 | <h2><img src="https://cdn2.iconfinder.com/data/icons/artificial-intelligence-6/64/ArtificialIntelligence9-512.png" alt="Brain+Machine" height="38" width="38"> Creators </h2>
  9 | 
 10 | ### [Nilesh Verma](https://nileshverma.com "Nilesh Verma")
 11 | 
 12 | ## Features
 13 | - Faster Search.
 14 | - Highly accurate text recommendation and search results.
 15 | - Best for Implementing on python based web application or APIs.
 16 | - Best implementation for College students and freshers for project creation.
 17 | - Applications are Text-based News, Social media post, E-commerce Product recommendation and other text-based platforms that want to implement text recommendation and search.
 18 | 
 19 | ## Installation
 20 | 
 21 | This library is compatible with both *Windows* and *Linux* systems. You can install it with the **pip** command:
 22 | 
 23 | ```shell
 24 | pip install DeepTextSearch
 25 | ```
 26 | 
 27 | ## How To Use?
 28 | 
 29 | We have provided the **Demo** folder in the *GitHub repository*, where you can find the example as both a **.py** and an **.ipynb** file. The following is the ideal flow of the code:
 30 | 
 31 | ### 1. Importing the Important Classes
 32 | There are three important classes you need to import: **LoadData** for loading the data, **TextEmbedder** for embedding the text data, and **TextSearch** for searching the text.
 33 | 
 34 | ```python
 35 | # Importing the proper classes
 36 | from DeepTextSearch import LoadData,TextEmbedder,TextSearch
 37 | ```
 38 | 
 39 | ### 2. Loading the Texts Data
 40 | 
 41 | To load the text data, use the **LoadData** object, which imports the text data from a CSV/text file as a Python list.
 42 | 
 43 | ```python
 44 | # Load data from CSV file
 45 | data = LoadData().from_csv("../your_file_name.csv")
 46 | # Load data from Text file
 47 | data = LoadData().from_text("../your_file_name.txt")
 48 | ```
 49 | ### 3. Embedding and Saving The File in Local Folder
 50 | 
 51 | For embedding we use a state-of-the-art multilingual Sentence Transformer model. We also store the embedding data for further use in the local **[embedding-data/]** folder.
 52 | 
 53 | You can also use the **load_embedding()** method of the **TextEmbedder()** class to load the saved embedding data.
 54 | 
 55 | ```python
 56 | # Before searching, we must embed the data; embed() also saves it on the local path.
 57 | TextEmbedder().embed(corpus_list=data)
 58 | 
 59 | # Loading Embedding data
 60 | corpus_embedding = TextEmbedder().load_embedding()
 61 | ```
 62 | ### 4. Searching
 63 | 
 64 | We use cosine similarity for searching and recommending; the corpus is then sorted by similarity score:
 65 | 
 66 | ```python
 67 | # You must include the query text and the quantity of comparable texts you want to search for.
 68 | TextSearch().find_similar(query_text="What are the key features of Node.js?",top_n=10)
 69 | ```
 70 | 
 71 | ## Complete Code
 72 | 
 73 | ```python
 74 | # Importing the proper classes
 75 | from DeepTextSearch import LoadData,TextEmbedder,TextSearch
 76 | # Load data from CSV file
 77 | data = LoadData().from_csv("../your_file_name.csv")
 78 | # Before searching, we must embed the data; embed() also saves it on the local path
 79 | TextEmbedder().embed(corpus_list=data)
 80 | # You must include the query text and the quantity of comparable texts you want to search for
 81 | TextSearch().find_similar(query_text="What are the key features of Node.js?",top_n=10)
 82 | ```
 83 | 
 84 | ## License
 85 | 
 86 | ```rst
 87 | MIT License
 88 | 
 89 | Copyright (c) 2021 Nilesh Verma
 90 | 
 91 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 92 | 
 93 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 94 | 
 95 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 96 | ```
 97 | ### Please do STAR the repository if it helped you in any way.
 98 | 
 99 | **More cool features will be added in future. Feel free to give suggestions, report bugs and contribute.**
100 | 


--------------------------------------------------------------------------------
/logo/DeepTextSearch Logo-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TechyNilesh/DeepTextSearch/69aa1392312ba821f4188532ce8713af412b244d/logo/DeepTextSearch Logo-1.png


--------------------------------------------------------------------------------
/logo/DeepTextSearch Logo-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TechyNilesh/DeepTextSearch/69aa1392312ba821f4188532ce8713af412b244d/logo/DeepTextSearch Logo-2.png


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | # Inside of setup.cfg
2 | [metadata]
3 | description-file = README.md


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | import pathlib
 3 | 
 4 | 
 5 | # The directory containing this file
 6 | HERE = pathlib.Path(__file__).parent
 7 | 
 8 | # The text of the README file
 9 | README = (HERE / "README.md").read_text()
10 | 
11 | setup(
12 |   long_description_content_type="text/markdown",
13 |   name = 'DeepTextSearch',         
14 |   packages = ['DeepTextSearch'],
15 |   version = '0.3',
16 |   license='MIT',        
17 |   description = 'Deep Text Search is an AI-powered multilingual text search and recommendation engine with state-of-the-art transformer-based multilingual text embedding (50+ languages).',
18 |   long_description=README,
19 |   author = 'Nilesh Verma',                   
20 |   author_email = 'me@nileshverma.com',     
21 |   url = 'https://github.com/TechyNilesh/DeepTextSearch',
22 |   download_url = 'https://github.com/TechyNilesh/DeepTextSearch/archive/refs/tags/v_03.tar.gz',    
23 |   keywords = ['Deep Text Search Engine', 'AI Text search', 'Text Search Python','Text Recommendation Engine'],   
24 |   install_requires=[        
25 |           'sentence_transformers',
26 |           'pandas',
27 |           'numpy',
28 |       ],
29 |   classifiers=[
30 |     'Development Status :: 4 - Beta',
31 |     'Intended Audience :: Developers', 
32 |     'Topic :: Software Development :: Build Tools',
33 |     'License :: OSI Approved :: MIT License',
34 |     'Programming Language :: Python :: 3',
35 |     'Programming Language :: Python :: 3.4',
36 |     'Programming Language :: Python :: 3.5',
37 |     'Programming Language :: Python :: 3.6',
38 |     'Programming Language :: Python :: 3.7',
39 |     'Programming Language :: Python :: 3.8',
40 |     'Programming Language :: Python :: 3.9',
41 |   ],
42 | )
43 | 


--------------------------------------------------------------------------------