# Repository layout (flattened in the original dump):
# ├── DeepTextSearch ├── DeepTextSearch.py ├── __init__.py └── requirements.txt ├── Demo ├── Deep Text Search Demo.ipynb └── DeepTextSearchDemo.py ├── LICENSE.txt ├── README.md ├── logo ├── DeepTextSearch Logo-1.png └── DeepTextSearch Logo-2.png ├── setup.cfg └── setup.py
#
# /DeepTextSearch/DeepTextSearch.py:
# --------------------------------------------------------------------------------
"""Embed a text corpus with a multilingual sentence transformer and search it.

The module exposes three classes:

* ``LoadData``     - read a text column from a CSV file into a list.
* ``TextEmbedder`` - encode the corpus and pickle it under ``embedding-data/``.
* ``TextSearch``   - load the pickled data and rank corpus entries by cosine
  similarity against a query.

All paths are relative to the current working directory.
"""
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import numpy as np
import pickle
import os


corpus_list_data = os.path.join('embedding-data/', 'corpus_list_data.pickle')
corpus_embeddings_data = os.path.join('embedding-data/', 'corpus_embeddings_data.pickle')


def _dump_pickle(obj, path):
    """Pickle *obj* to *path*, closing the file handle afterwards."""
    with open(path, "wb") as handle:
        pickle.dump(obj, handle)


def _load_pickle(path):
    """Unpickle and return the object stored at *path*.

    NOTE: unpickling can execute arbitrary code. Only load embedding files
    that were produced by this library on a machine you trust.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


class LoadData:
    """Load a text corpus from a file into a plain Python list."""

    def __init__(self):
        # Filled by from_csv(); None until a file has been loaded.
        self.corpus_list = None

    def from_csv(self, file_path: str, column_name=None):
        """Return the non-null values of one CSV column as a list.

        Args:
            file_path: Path of the CSV file, passed to ``pandas.read_csv``.
            column_name: Name of the column holding the text. When omitted
                (the default, and the original behaviour) the name is asked
                for interactively on standard input.

        Returns:
            The column values with missing entries dropped. The same list is
            also stored on ``self.corpus_list``.

        Raises:
            KeyError: If the CSV has no column called *column_name*.
        """
        self.file_path = file_path
        csv_data = pd.read_csv(file_path)
        if column_name is None:
            column_name = str(input('Input the text Column Name Please ? : '))
        self.corpus_list = csv_data[column_name].dropna().to_list()
        return self.corpus_list


class TextEmbedder:
    """Encode a corpus with a SentenceTransformer and persist it to disk."""

    def __init__(self):
        """Load the transformer model and make sure ``embedding-data/`` exists."""
        self.corpus_embeddings_data = corpus_embeddings_data
        self.corpus_list_data = corpus_list_data
        self.corpus_list = None
        self.embedder = SentenceTransformer('paraphrase-xlm-r-multilingual-v1')
        self.corpus_embeddings = None
        # exist_ok avoids the separate "is it there yet?" directory listing.
        os.makedirs("embedding-data", exist_ok=True)

    def _has_saved_data(self):
        """Return True when both pickle files are present on disk."""
        return (os.path.isfile(self.corpus_embeddings_data)
                and os.path.isfile(self.corpus_list_data))

    def _encode_and_save(self):
        """Encode ``self.corpus_list`` and pickle the embeddings and the texts."""
        self.corpus_embeddings = self.embedder.encode(
            self.corpus_list, convert_to_tensor=True, show_progress_bar=True)
        _dump_pickle(self.corpus_embeddings, self.corpus_embeddings_data)
        _dump_pickle(self.corpus_list, self.corpus_list_data)

    def embed(self, corpus_list: list):
        """Embed *corpus_list* and save the result under ``embedding-data/``.

        If saved embedding files already exist, the user is asked on standard
        input whether to overwrite them; any answer other than ``yes``
        (case-insensitive) keeps the existing files untouched.

        Args:
            corpus_list: The texts to embed.
        """
        self.corpus_list = corpus_list
        if not self._has_saved_data():
            self._encode_and_save()
            print("Embedding data Saved Successfully!")
        else:
            print("Embedding data allready present, Do you want Embed & Save Again? Enter yes or no")
            flag = str(input())
            if flag.strip().lower() == 'yes':
                self._encode_and_save()
                print("Embedding data Saved Successfully Again!")
            else:
                print("Embedding data allready Present, Please Apply Search!")
        print(os.listdir("embedding-data/"))

    def load_embedding(self):
        """Return the saved corpus embeddings, or None if none were saved.

        A message is printed in both cases; the success message is only
        printed after the file has actually been read.
        """
        if not os.path.isfile(self.corpus_embeddings_data):
            print("Embedding data Not present, Please Run Embedding First")
            return None
        embeddings = _load_pickle(self.corpus_embeddings_data)
        print("Embedding data Loaded Successfully!")
        print(os.listdir("embedding-data/"))
        return embeddings


class TextSearch:
    """Rank the saved corpus by cosine similarity to a query text."""

    def __init__(self):
        """Load the pickled embeddings and texts written by ``TextEmbedder.embed``.

        Raises:
            FileNotFoundError: If the embedding files have not been created yet.
        """
        self.corpus_embeddings = _load_pickle(corpus_embeddings_data)
        self.data = _load_pickle(corpus_list_data)
        # The transformer model is loaded lazily on the first query and reused.
        self._embedder = None

    def _get_embedder(self):
        """Return the query encoder, loading the model only on first use."""
        if getattr(self, '_embedder', None) is None:
            self._embedder = TextEmbedder().embedder
        return self._embedder

    def find_similar(self, query_text: str, top_n=10, skip_first=True):
        """Return the corpus entries most similar to *query_text*.

        Args:
            query_text: The text to search for.
            top_n: Maximum number of results to return.
            skip_first: When True (the default, and the original behaviour)
                the single highest-scoring entry is left out - presumably
                because in the recommendation use case the query is itself a
                corpus entry and would match itself. Pass False to include
                the best match.

        Returns:
            A list of dicts with the keys ``index`` (position in the corpus),
            ``text`` and ``score`` (cosine similarity), best match first.
        """
        self.top_n = top_n
        self.query_text = query_text
        self.query_embedding = self._get_embedder().encode(
            self.query_text, convert_to_tensor=True)
        self.cos_scores = util.pytorch_cos_sim(
            self.query_embedding, self.corpus_embeddings)[0].cpu().data.numpy()
        # Negate so that argsort yields indices in descending score order.
        self.sort_list = np.argsort(-self.cos_scores)
        start = 1 if skip_first else 0
        self.all_data = [
            {
                'index': int(idx),
                'text': self.data[idx],
                'score': self.cos_scores[idx],
            }
            for idx in self.sort_list[start:start + self.top_n]
        ]
        return self.all_data

# --------------------------------------------------------------------------------
# /DeepTextSearch/__init__.py:
-------------------------------------------------------------------------------- 1 | from DeepTextSearch.DeepTextSearch import LoadData,TextEmbedder,TextSearch -------------------------------------------------------------------------------- /DeepTextSearch/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas==1.2.4 2 | sentence_transformers==1.2.0 3 | numpy==1.18.5 4 | -------------------------------------------------------------------------------- /Demo/Deep Text Search Demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "49594b04", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Importing the proper classes\n", 11 | "from DeepTextSearch import LoadData,TextEmbedder,TextSearch" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "id": "a5424e23", 18 | "metadata": {}, 19 | "outputs": [ 20 | { 21 | "name": "stdout", 22 | "output_type": "stream", 23 | "text": [ 24 | "Input the text Column Name Please ? : Question\n" 25 | ] 26 | } 27 | ], 28 | "source": [ 29 | "# Load data from CSV file\n", 30 | "data = LoadData().from_csv(\"../your_file_name.csv\")" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 3, 36 | "id": "5ce9f30d", 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "data": { 41 | "application/vnd.jupyter.widget-view+json": { 42 | "model_id": "26865bd100c948a6945f2e47ad3a9183", 43 | "version_major": 2, 44 | "version_minor": 0 45 | }, 46 | "text/plain": [ 47 | "Batches: 0%| | 0/19 [00:00Brain+Machine

3 | 4 | **Deep Text Search** is an AI-powered multilingual **text search and recommendation engine** with state-of-the-art transformer-based **multilingual text embedding (50+ languages)**. 5 | 6 | ![Generic badge](https://img.shields.io/badge/DeepTextSerach-v1-orange.svg) ![Generic badge](https://img.shields.io/badge/Artificial_Intelligence-Advance-green.svg) ![Generic badge](https://img.shields.io/badge/Python-v3-blue.svg) ![Generic badge](https://img.shields.io/badge/pip-v3-red.svg) ![Generic badge](https://img.shields.io/badge/SentenceTransformer-v1-orange.svg) [![Downloads](https://static.pepy.tech/personalized-badge/deeptextsearch?period=total&units=none&left_color=grey&right_color=green&left_text=Downloads)](https://pepy.tech/project/deeptextsearch) 7 | 8 |

Brain+Machine Creators

9 | 10 | ### [Nilesh Verma](https://nileshverma.com "Nilesh Verma") 11 | 12 | ## Features 13 | - Faster search. 14 | - Highly accurate text recommendation and search results. 15 | - Best for implementing in Python-based web applications or APIs. 16 | - Best implementation for college students and freshers for project creation. 17 | - Applications include text-based news, social media posts, e-commerce product recommendation and other text-based platforms that want to implement text recommendation and search. 18 | 19 | ## Installation 20 | 21 | This library is compatible with both *Windows* and *Linux* systems; you can use the **pip** command to install it: 22 | 23 | ```shell 24 | pip install DeepTextSearch 25 | ``` 26 | 27 | ## How To Use? 28 | 29 | We have provided the **Demo** folder in the *GitHub repository*; you can find examples in both **.py** and **.ipynb** files. The following is the ideal flow of the code: 30 | 31 | ### 1. Importing the Important Classes 32 | There are three important classes you need to load: **LoadData** - for data loading, **TextEmbedder** - for embedding the text data, **TextSearch** - for searching the text. 33 | 34 | ```python 35 | # Importing the proper classes 36 | from DeepTextSearch import LoadData,TextEmbedder,TextSearch 37 | ``` 38 | 39 | ### 2. Loading the Texts Data 40 | 41 | To load the text data we use the **LoadData** object, which imports the text data as a Python list from a CSV/Text file. 42 | 43 | ```python 44 | # Load data from CSV file 45 | data = LoadData().from_csv("../your_file_name.csv") 46 | # Load data from Text file 47 | data = LoadData().from_text("../your_file_name.txt") 48 | ``` 49 | ### 3. Embedding and Saving The File in Local Folder 50 | 51 | For embedding we use a state-of-the-art multilingual Sentence Transformer embedding. We also store the embedding data for further use in the local **[embedding-data/]** folder. 
52 | 53 | You can also use the **load_embedding()** method of the **TextEmbedder()** class to load saved embedding data. 54 | 55 | ```python 56 | # To use Searching, we must first embed data. After that, we must save all of the data on the local path. 57 | TextEmbedder().embed(corpus_list=data) 58 | 59 | # Loading Embedding data 60 | corpus_embedding = TextEmbedder().load_embedding() 61 | ``` 62 | ### 4. Searching 63 | 64 | We use cosine similarity for searching and recommending; the corpus is then sorted according to the similarity score: 65 | 66 | ```python 67 | # You must include the query text and the quantity of comparable texts you want to search for. 68 | TextSearch().find_similar(query_text="What are the key features of Node.js?",top_n=10) 69 | ``` 70 | 71 | ## Complete Code 72 | 73 | ```python 74 | # Importing the proper classes 75 | from DeepTextSearch import LoadData,TextEmbedder,TextSearch 76 | # Load data from CSV file 77 | data = LoadData().from_csv("../your_file_name.csv") 78 | # To use Searching, we must first embed data. 
After that, we must save all of the data on the local path 79 | TextEmbedder().embed(corpus_list=data) 80 | # You must include the query text and the quantity of comparable texts you want to search for 81 | TextSearch().find_similar(query_text="What are the key features of Node.js?",top_n=10) 82 | ``` 83 | 84 | ## License 85 | 86 | ```rst 87 | MIT License 88 | 89 | Copyright (c) 2021 Nilesh Verma 90 | 91 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 92 | 93 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 94 | 95 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 96 | ``` 97 | ### Please do STAR the repository, if it helped you in anyway. 98 | 99 | **More cool features will be added in future. 
Feel free to give suggestions, report bugs and contribute.** 100 | -------------------------------------------------------------------------------- /logo/DeepTextSearch Logo-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TechyNilesh/DeepTextSearch/69aa1392312ba821f4188532ce8713af412b244d/logo/DeepTextSearch Logo-1.png -------------------------------------------------------------------------------- /logo/DeepTextSearch Logo-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TechyNilesh/DeepTextSearch/69aa1392312ba821f4188532ce8713af412b244d/logo/DeepTextSearch Logo-2.png -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | # Inside of setup.cfg 2 | [metadata] 3 | description-file = README.md -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | import pathlib 3 | 4 | 5 | # The directory containing this file 6 | HERE = pathlib.Path(__file__).parent 7 | 8 | # The text of the README file 9 | README = (HERE / "README.md").read_text() 10 | 11 | setup( 12 | long_description_content_type="text/markdown", 13 | name = 'DeepTextSearch', 14 | packages = ['DeepTextSearch'], 15 | version = '0.3', 16 | license='MIT', 17 | description = 'Deep Text Search is an AI-powered multilingual text search and recommendation engine with state-of-the-art transformer-based multilingual text embedding (50+ languages).', 18 | long_description=README, 19 | author = 'Nilesh Verma', 20 | author_email = 'me@nileshverma.com', 21 | url = 'https://github.com/TechyNilesh/DeepTextSearch', 22 | download_url = 'https://github.com/TechyNilesh/DeepTextSearch/archive/refs/tags/v_03.tar.gz', 
23 | keywords = ['Deep Text Search Engine', 'AI Text search', 'Text Search Python','Text Recommendation Engine'], 24 | install_requires=[ 25 | 'sentence_transformers', 26 | 'pandas', 27 | 'numpy', 28 | ], 29 | classifiers=[ 30 | 'Development Status :: 4 - Beta', 31 | 'Intended Audience :: Developers', 32 | 'Topic :: Software Development :: Build Tools', 33 | 'License :: OSI Approved :: MIT License', 34 | 'Programming Language :: Python :: 3', 35 | 'Programming Language :: Python :: 3.4', 36 | 'Programming Language :: Python :: 3.5', 37 | 'Programming Language :: Python :: 3.6', 38 | 'Programming Language :: Python :: 3.7', 39 | 'Programming Language :: Python :: 3.8', 40 | 'Programming Language :: Python :: 3.9', 41 | ], 42 | ) 43 | --------------------------------------------------------------------------------