# Repository layout:
# ├── DeepTextSearch
# │   ├── DeepTextSearch.py
# │   ├── __init__.py
# │   └── requirements.txt
# ├── Demo
# │   ├── Deep Text Search Demo.ipynb
# │   └── DeepTextSearchDemo.py
# ├── LICENSE.txt
# ├── README.md
# ├── logo
# │   ├── DeepTextSearch Logo-1.png
# │   └── DeepTextSearch Logo-2.png
# ├── setup.cfg
# └── setup.py
#
# /DeepTextSearch/DeepTextSearch.py:
# --------------------------------------------------------------------------------
"""Embedding-based text search over a list of texts.

Workflow: ``LoadData`` reads the texts from a CSV column, ``TextEmbedder``
encodes them with a sentence-transformers model and pickles the result under
``embedding-data/`` (relative to the current working directory), and
``TextSearch`` loads those pickles and ranks the texts against a query by
cosine similarity.
"""
import os
import pickle
from typing import Optional

import numpy as np
import pandas as pd

# NOTE: ``sentence_transformers`` is imported inside the methods that use it,
# so the CSV/pickle helpers in this module stay usable without loading it.

corpus_list_data = os.path.join('embedding-data/','corpus_list_data.pickle')
corpus_embeddings_data = os.path.join('embedding-data/','corpus_embeddings_data.pickle')


def _dump_pickle(obj, path):
    """Pickle *obj* to *path*, creating the parent directory when needed."""
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(path, "wb") as handle:
        pickle.dump(obj, handle)


def _load_pickle(path):
    """Return the object pickled at *path*.

    SECURITY: unpickling can execute arbitrary code. Only load files that
    were written by this module; never point it at untrusted data.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


class LoadData:
    """Load the list of texts to be searched."""

    def __init__(self):
        self.corpus_list = None
        self.file_path = None

    def from_csv(self, file_path: str, column_name: Optional[str] = None) -> list:
        """Return the non-null values of one CSV column as a list.

        Args:
            file_path: Path of the CSV file, passed to ``pandas.read_csv``.
            column_name: Name of the column holding the texts. When omitted,
                the name is asked for on stdin (the original behaviour).

        Returns:
            The column values with missing entries dropped.

        Raises:
            KeyError: If the CSV has no column called ``column_name``.
        """
        self.file_path = file_path
        csv_data = pd.read_csv(file_path)
        if column_name is None:
            column_name = str(input('Input the text Column Name Please ? : '))
        if column_name not in csv_data.columns:
            raise KeyError(
                f"Column {column_name!r} not found in {file_path!r}; "
                f"available columns: {list(csv_data.columns)}"
            )
        self.corpus_list = csv_data[column_name].dropna().to_list()
        return self.corpus_list


class TextEmbedder:
    """Encode a list of texts and persist the embeddings as pickle files."""

    def __init__(self):
        # Imported here rather than at module level; see the note at the top.
        from sentence_transformers import SentenceTransformer

        self.corpus_embeddings_data = corpus_embeddings_data
        self.corpus_list_data = corpus_list_data
        self.corpus_list = None
        self.embedder = SentenceTransformer('paraphrase-xlm-r-multilingual-v1')
        self.corpus_embeddings = None
        os.makedirs("embedding-data", exist_ok=True)

    def _has_saved_data(self) -> bool:
        """Tell whether both pickle files (embeddings and texts) exist."""
        return (os.path.isfile(self.corpus_embeddings_data)
                and os.path.isfile(self.corpus_list_data))

    def _encode_and_save(self):
        """Encode ``self.corpus_list`` and write both pickle files."""
        self.corpus_embeddings = self.embedder.encode(
            self.corpus_list, convert_to_tensor=True, show_progress_bar=True)
        _dump_pickle(self.corpus_embeddings, self.corpus_embeddings_data)
        _dump_pickle(self.corpus_list, self.corpus_list_data)

    def embed(self, corpus_list: list):
        """Embed *corpus_list* and save it, asking before overwriting.

        When both pickle files already exist the user is asked on stdin
        whether to embed again; any answer other than ``yes`` keeps the
        stored data untouched. Otherwise the texts are embedded right away.

        Args:
            corpus_list: The texts to encode.
        """
        self.corpus_list = corpus_list
        if not self._has_saved_data():
            self._encode_and_save()
            print("Embedding data Saved Successfully!")
            print(os.listdir("embedding-data/"))
            return

        print("Embedding data allready present, Do you want Embed & Save Again? Enter yes or no")
        flag = str(input())
        if flag.strip().lower() == 'yes':
            self._encode_and_save()
            print("Embedding data Saved Successfully Again!")
        else:
            print("Embedding data allready Present, Please Apply Search!")
        print(os.listdir("embedding-data/"))

    def load_embedding(self):
        """Return the stored corpus embeddings.

        Returns:
            The unpickled embeddings, or ``None`` (after printing a hint)
            when the embeddings file does not exist yet.
        """
        if not os.path.isfile(self.corpus_embeddings_data):
            print("Embedding data Not present, Please Run Embedding First")
            return None
        embeddings = _load_pickle(self.corpus_embeddings_data)
        # Report success only once the file has actually been read.
        print("Embedding data Loaded Successfully!")
        print(os.listdir("embedding-data/"))
        return embeddings


class TextSearch:
    """Rank the stored texts by cosine similarity to a query text."""

    def __init__(self):
        # Raises FileNotFoundError when TextEmbedder.embed() has not run yet.
        self.corpus_embeddings = _load_pickle(corpus_embeddings_data)
        self.data = _load_pickle(corpus_list_data)
        self._embedder = None

    def _get_embedder(self):
        """Return the sentence encoder, creating it on first use only."""
        embedder = getattr(self, "_embedder", None)
        if embedder is None:
            # Constructing TextEmbedder loads the model, so do it once per
            # TextSearch instance instead of once per query.
            embedder = TextEmbedder().embedder
            self._embedder = embedder
        return embedder

    @staticmethod
    def _rank_hits(cos_scores, order, corpus, query_text, top_n):
        """Build the result records for the ``top_n`` best-scoring texts.

        ``order`` holds corpus indices sorted by descending score. The best
        hit is skipped only when it is the query text itself, so a query
        that is not part of the corpus keeps its best match.
        """
        if len(order) and corpus[order[0]] == query_text:
            order = order[1:]
        return [
            {'index': int(idx), 'text': corpus[idx], 'score': cos_scores[idx]}
            for idx in order[:top_n]
        ]

    def find_similar(self, query_text: str, top_n: int = 10) -> list:
        """Return the ``top_n`` stored texts most similar to *query_text*.

        Args:
            query_text: The text to search for.
            top_n: Maximum number of results.

        Returns:
            A list of dicts with keys ``index`` (position in the corpus),
            ``text`` and ``score`` (cosine similarity), best match first.
            A leading exact match of ``query_text`` is left out.
        """
        from sentence_transformers import util

        self.top_n = top_n
        self.query_text = query_text
        self.query_embedding = self._get_embedder().encode(
            self.query_text, convert_to_tensor=True)
        similarity = util.pytorch_cos_sim(self.query_embedding, self.corpus_embeddings)
        self.cos_scores = similarity[0].cpu().data.numpy()
        # Negate so that argsort yields the highest score first.
        self.sort_list = np.argsort(-self.cos_scores)
        self.all_data = self._rank_hits(
            self.cos_scores, self.sort_list, self.data, self.query_text, self.top_n)
        return self.all_data

# --------------------------------------------------------------------------------
# /DeepTextSearch/__init__.py:
-------------------------------------------------------------------------------- 1 | from DeepTextSearch.DeepTextSearch import LoadData,TextEmbedder,TextSearch -------------------------------------------------------------------------------- /DeepTextSearch/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas==1.2.4 2 | sentence_transformers==1.2.0 3 | numpy==1.18.5 4 | -------------------------------------------------------------------------------- /Demo/Deep Text Search Demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "49594b04", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Importing the proper classes\n", 11 | "from DeepTextSearch import LoadData,TextEmbedder,TextSearch" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "id": "a5424e23", 18 | "metadata": {}, 19 | "outputs": [ 20 | { 21 | "name": "stdout", 22 | "output_type": "stream", 23 | "text": [ 24 | "Input the text Column Name Please ? 
: Question\n" 25 | ] 26 | } 27 | ], 28 | "source": [ 29 | "# Load data from CSV file\n", 30 | "data = LoadData().from_csv(\"../your_file_name.csv\")" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 3, 36 | "id": "5ce9f30d", 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "data": { 41 | "application/vnd.jupyter.widget-view+json": { 42 | "model_id": "26865bd100c948a6945f2e47ad3a9183", 43 | "version_major": 2, 44 | "version_minor": 0 45 | }, 46 | "text/plain": [ 47 | "Batches: 0%| | 0/19 [00:00, ?it/s]" 48 | ] 49 | }, 50 | "metadata": {}, 51 | "output_type": "display_data" 52 | }, 53 | { 54 | "name": "stdout", 55 | "output_type": "stream", 56 | "text": [ 57 | "Embedding data Saved Successfully!\n", 58 | "['corpus_embeddings_data.pickle', 'corpus_list_data.pickle']\n" 59 | ] 60 | } 61 | ], 62 | "source": [ 63 | "# For searching we need to embed the data first; after embedding, all the data is stored on the local path\n", 64 | "TextEmbedder().embed(corpus_list=data)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 4, 70 | "id": "5f349322", 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "data": { 75 | "text/plain": [ 76 | "[{'index': 575, 'text': 'What is Node.js?', 'score': 0.88481015},\n", 77 | " {'index': 578, 'text': 'When should we use Node.js?', 'score': 0.8388137},\n", 78 | " {'index': 581, 'text': 'Explain how does Node.js work?', 'score': 0.8064759},\n", 79 | " {'index': 591, 'text': 'What are Globals in Node.js?', 'score': 0.7844132},\n", 80 | " {'index': 602,\n", 81 | " 'text': 'What is chaining process in Node.js?',\n", 82 | " 'score': 0.7806176},\n", 83 | " {'index': 596, 'text': 'What is NPM in Node.js?', 'score': 0.76716936},\n", 84 | " {'index': 586, 'text': 'What is Callback in Node.js?', 'score': 0.7659653},\n", 85 | " {'index': 579, 'text': 'When to not use Node.js?', 'score': 0.7643588},\n", 86 | " {'index': 593,\n", 87 | " 'text': 'What is EventEmitter in Node.js?',\n", 88 | " 'score': 0.7514152},\n", 89 
| " {'index': 580,\n", 90 | " 'text': 'What IDEs can you use for Node.js development?',\n", 91 | " 'score': 0.74787086}]" 92 | ] 93 | }, 94 | "execution_count": 4, 95 | "metadata": {}, 96 | "output_type": "execute_result" 97 | } 98 | ], 99 | "source": [ 100 | "# for searching, you need to give the query_text and the number of the similar text you want\n", 101 | "TextSearch().find_similar(query_text=\"What are the key features of Node.js?\",top_n=10)" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "id": "e8b4c035", 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [] 111 | } 112 | ], 113 | "metadata": { 114 | "kernelspec": { 115 | "display_name": "Python 3", 116 | "language": "python", 117 | "name": "python3" 118 | }, 119 | "language_info": { 120 | "codemirror_mode": { 121 | "name": "ipython", 122 | "version": 3 123 | }, 124 | "file_extension": ".py", 125 | "mimetype": "text/x-python", 126 | "name": "python", 127 | "nbconvert_exporter": "python", 128 | "pygments_lexer": "ipython3", 129 | "version": "3.8.9" 130 | } 131 | }, 132 | "nbformat": 4, 133 | "nbformat_minor": 5 134 | } 135 | -------------------------------------------------------------------------------- /Demo/DeepTextSearchDemo.py: -------------------------------------------------------------------------------- 1 | # Importing the proper classes 2 | from DeepTextSearch import LoadData,TextEmbedder,TextSearch 3 | 4 | # Load data from CSV file 5 | data = LoadData().from_csv("../your_file_name.csv") 6 | 7 | # For searching we need to embed the data first; after embedding, all the data is stored on the local path 8 | TextEmbedder().embed(corpus_list=data) 9 | 10 | # for searching, you need to give the query_text and the number of the similar text you want 11 | TextSearch().find_similar(query_text="What are the key features of Node.js?",top_n=10) 12 | 13 | -------------------------------------------------------------------------------- /LICENSE.txt: 
-------------------------------------------------------------------------------- 1 | MIT License 2 | Copyright (c) 2021 Nilesh Verma 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | The above copyright notice and this permission notice shall be included in all 10 | copies or substantial portions of the Software. 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 13 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 14 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 15 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 16 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 17 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deep Text Search - AI Based Text Search & Recommendation System 2 |