├── .gitignore ├── LICENSE ├── README.md ├── assets ├── architecture.png └── blinking_elastic2.gif ├── notebooks ├── Searching_with_ElasticTransformers.ipynb └── Setting_up_ElasticTransformers.ipynb ├── requirements.txt └── src ├── database.py └── logger.py /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints/ 2 | .DS_store 3 | .vscode/ 4 | __pycache__/ 5 | index_spec 6 | 7 | logs/ 8 | data/ 9 | 10 | notebooks/Experiments* -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ElasticTransformers 2 | Semantic Elasticsearch with Sentence Transformers. We will use the power of Elastic and the magic of BERT to index a million articles and perform lexical and semantic search on them. 3 | 4 | The purpose is to provide an ease-of-use way of setting up your own Elasticsearch with near state of the art capabilities of contextual embeddings / semantic search using NLP transformers. 5 | 6 | ## Overview 7 | 8 |

9 | 10 |

11 | 12 | The above setup works as follows 13 | - Set up an Elasticsearch server with Dockers 14 | - Collect the dataset 15 | - Use sentence-transformers to index them onto Elastic (takes about 3 hrs on 4 CPU cores) 16 | - Look at some comparison examples between lexical and semantic search 17 | 18 | ## Setup 19 | ### Set up your environment 20 | My environment is called `et` and I use conda for this. Navigate inside the project directory 21 | ```python 22 | conda create --name et python=3.7 23 | conda install -n et nb_conda_kernels 24 | conda activate et 25 | pip install -r requirements.txt 26 | ``` 27 | 28 | ### Get the data 29 | For this tutorial I am using [A Million News Headlines](https://www.kaggle.com/therohk/million-headlines "Kaggle A Million News Headlines") by Rohk and place it in the data folder inside the project dir. 30 | 31 | elastic_transformers/ 32 | ├── data/ 33 | 34 | You will find that the steps are otherwise pretty abstracted so you can also do this with your dataset of choice 35 | 36 | ### Elasticsearch with Docker 37 | Follow the instructions on setting up Elastic with Docker from Elastic's page [here](https://www.elastic.co/guide/en/elasticsearch/reference/current/docker.html) 38 | For this tutorial, you only need to run the two steps: 39 | - [Pulling the image](https://www.elastic.co/guide/en/elasticsearch/reference/current/docker.html#_pulling_the_image) 40 | - [Starting a single node cluster with Docker](https://www.elastic.co/guide/en/elasticsearch/reference/current/docker.html#docker-cli-run-dev-mode) 41 | 42 | ## Features 43 | 44 | The repo introduces the ElasiticTransformers class. Utilities which help create, index and query Elasticsearch indices which include embeddings 45 | 46 | Initiate the connection links as well as (optionally) the name of the index to work with 47 | ```python 48 | et=ElasticTransformers(url='http://localhost:9300',index_name='et-tiny') 49 | ``` 50 | *create_index_spec* define mapping for the index. Lists of relevant fields can 51 | be provided for keyword search or semantic (dense vector) search. 52 | It also has parameters for the size of the dense vector as those can vary 53 | *create_index* - uses the spec created earlier to create an index ready for search 54 | 55 | ```py 56 | et.create_index_spec( 57 | text_fields=['publish_date','headline_text'], 58 | dense_fields=['headline_text_embedding'], 59 | dense_fields_dim=768 60 | ) 61 | et.create_index() 62 | ``` 63 | 64 | *write_large_csv* - breaks up a large csv file into chunks and iteratively uses a predefined 65 | embedding utility to create the embeddings list for each chunk and subsequently feed results to the index 66 | ```py 67 | et.write_large_csv('data/tiny_sample.csv', 68 | chunksize=1000, 69 | embedder=embed_wrapper, 70 | field_to_embed='headline_text') 71 | ``` 72 | *search* - allows to select either keyword (‘match’ in Elastic) or semantic (dense in Elastic) 73 | search. Notably it requires the same embedding function used in write_large_csv 74 | ```py 75 | et.search(query='search these terms', 76 | field='headline_text', 77 | type='match', 78 | embedder=embed_wrapper, 79 | size = 1000) 80 | ``` 81 | 82 | ## Usage 83 | After successful setup, use the folling notebooks to make this all work 84 | - [Setting up the index](../master/notebooks/Setting_up_ElasticTransformers.ipynb) 85 | - [Searching](../master/notebooks/Searching_with_ElasticTransformers.ipynb) 86 | 87 | ## References 88 | This repo combines together the following amazing works by brilliant people. Please check out their work if you haven't done so yet... 89 | 90 | ### The ML part 91 | - [sentence-transformers](https://github.com/UKPLab/sentence-transformers) 92 | - [transformers](https://github.com/huggingface/transformers) 93 | - [BERT](https://github.com/google-research/bert) 94 | ### The engineering part 95 | - [Elasticsearch](https://www.elastic.co/home) 96 | - [Docker](https://hub.docker.com) 97 | -------------------------------------------------------------------------------- /assets/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/md-experiments/elastic_transformers/9f5920ab14d814739138544f4711567b8b762e5a/assets/architecture.png -------------------------------------------------------------------------------- /assets/blinking_elastic2.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/md-experiments/elastic_transformers/9f5920ab14d814739138544f4711567b8b762e5a/assets/blinking_elastic2.gif -------------------------------------------------------------------------------- /notebooks/Searching_with_ElasticTransformers.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext autoreload\n", 10 | "import os\n", 11 | "os.chdir(os.path.abspath(os.curdir).replace('notebooks',''))" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import datetime\n", 21 | "from tqdm import trange\n", 22 | "import pandas as pd\n", 23 | "import matplotlib.pyplot as plt\n", 24 | "pd.set_option('display.max_colwidth', 120)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 3, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "%autoreload 2\n", 34 | "\n", 35 | "from src.database import ElasticTransformers" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 4, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "from sentence_transformers import SentenceTransformer\n", 45 | "\n", 46 | "bert_embedder = SentenceTransformer('bert-base-nli-mean-tokens')" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 5, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "def embed_wrapper(ls):\n", 56 | " \"\"\"\n", 57 | " Helper function which simplifies the embedding call and helps lading data into elastic easier\n", 58 | " \"\"\"\n", 59 | " results=bert_embedder.encode(ls, convert_to_tensor=True)\n", 60 | " results = [r.tolist() for r in results]\n", 61 | " return results\n" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 6, 67 | "metadata": {}, 68 | "outputs": [ 69 | { 70 | "data": { 71 | "text/plain": [ 72 | "True" 73 | ] 74 | }, 75 | "execution_count": 6, 76 | "metadata": {}, 77 | "output_type": "execute_result" 78 | } 79 | ], 80 | "source": [ 81 | "et=ElasticTransformers(index_name='et-large')\n", 82 | "et.ping()" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "# Search Experiments\n", 90 | "\n", 91 | "To analyse results, I compared top results side by side on a few searches. \n", 92 | "\n", 93 | "Approach is to take the top 10 hits, after removing some of the noisy results (duplicates or “headlines” of just one word).\n" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 7, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "def select_search_results(df,top_n=10):\n", 103 | " # four tokens or more (filtering out some meaningless headlines)\n", 104 | " df=df[df.headline_text.apply(lambda x: len(x.split())>4)].copy()\n", 105 | " # remove exact duplicates\n", 106 | " df=df.groupby('headline_text', as_index=False).first()\n", 107 | " df=df.sort_values('_score',ascending=False)\n", 108 | " df=df.reset_index(drop=True)\n", 109 | " return df.head(top_n)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 31, 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "name": "stdout", 119 | "output_type": "stream", 120 | "text": [ 121 | "KEYWORD SEARCH RESULTS\n" 122 | ] 123 | }, 124 | { 125 | "data": { 126 | "text/html": [ 127 | "
\n", 128 | "\n", 141 | "\n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | "
headline_text_score
0public warned of mozzie virus threat13.311735
1cattle producers warned of virus threat13.311735
2expert plays down hendra virus threat13.295192
3residents reminded of mozzie virus threat13.213888
4mozzie virus threat sparks health alert13.213888
5report reveals lower mozzie virus threat13.213888
6hendra like virus identified as potential threat12.498677
7hendra virus poses constant threat chief vet12.498677
8public warned of mossie borne virus threat12.498677
9sunraysia fears watermelon virus threat from nt12.483357
\n", 202 | "
" 203 | ], 204 | "text/plain": [ 205 | " headline_text _score\n", 206 | "0 public warned of mozzie virus threat 13.311735\n", 207 | "1 cattle producers warned of virus threat 13.311735\n", 208 | "2 expert plays down hendra virus threat 13.295192\n", 209 | "3 residents reminded of mozzie virus threat 13.213888\n", 210 | "4 mozzie virus threat sparks health alert 13.213888\n", 211 | "5 report reveals lower mozzie virus threat 13.213888\n", 212 | "6 hendra like virus identified as potential threat 12.498677\n", 213 | "7 hendra virus poses constant threat chief vet 12.498677\n", 214 | "8 public warned of mossie borne virus threat 12.498677\n", 215 | "9 sunraysia fears watermelon virus threat from nt 12.483357" 216 | ] 217 | }, 218 | "metadata": {}, 219 | "output_type": "display_data" 220 | }, 221 | { 222 | "name": "stdout", 223 | "output_type": "stream", 224 | "text": [ 225 | "CONTEXTUAL SEARCH RESULTS\n" 226 | ] 227 | }, 228 | { 229 | "data": { 230 | "text/html": [ 231 | "
\n", 232 | "\n", 245 | "\n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | "
headline_text_score
0hendra like virus identified as potential threat1.859408
1hendra report author warns of virus risk1.853364
2fresh concerns over hendra virus outbreak1.836927
3virus puts giteau in doubt1.823136
4hendra virus case under investigation1.817768
5who highlight dangers of vector borne diseases1.804388
6potentially deadly virus sparks mozzie warning1.799419
7who warns threat from vector borne diseases1.793783
8fears as png diseases spread1.791913
9deadly hendra virus strikes again1.788590
\n", 306 | "
" 307 | ], 308 | "text/plain": [ 309 | " headline_text _score\n", 310 | "0 hendra like virus identified as potential threat 1.859408\n", 311 | "1 hendra report author warns of virus risk 1.853364\n", 312 | "2 fresh concerns over hendra virus outbreak 1.836927\n", 313 | "3 virus puts giteau in doubt 1.823136\n", 314 | "4 hendra virus case under investigation 1.817768\n", 315 | "5 who highlight dangers of vector borne diseases 1.804388\n", 316 | "6 potentially deadly virus sparks mozzie warning 1.799419\n", 317 | "7 who warns threat from vector borne diseases 1.793783\n", 318 | "8 fears as png diseases spread 1.791913\n", 319 | "9 deadly hendra virus strikes again 1.788590" 320 | ] 321 | }, 322 | "metadata": {}, 323 | "output_type": "display_data" 324 | } 325 | ], 326 | "source": [ 327 | "query='virus threat'\n", 328 | "print('KEYWORD SEARCH RESULTS')\n", 329 | "df0=et.search(query,'headline_text',type='match',embedder=embed_wrapper, size = 1000)\n", 330 | "display(select_search_results(df0))\n", 331 | "print('CONTEXTUAL SEARCH RESULTS')\n", 332 | "df1=et.search(query,'headline_text',type='dense',embedder=embed_wrapper, size = 1000)\n", 333 | "display(select_search_results(df1))\n", 334 | "\n" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 32, 340 | "metadata": {}, 341 | "outputs": [ 342 | { 343 | "data": { 344 | "text/html": [ 345 | "
\n", 346 | "\n", 359 | "\n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | "
headline_text_score
5who highlight dangers of vector borne diseases1.804388
8fears as png diseases spread1.791913
21the odds of an outbreak1.773913
25flood waters carry risk of disease infection1.769219
27port uncertain of impact of viral meningitis outbreak1.767367
30human error blamed for infection scare1.764524
34oakey defence base contaminants linked to serious disease1.758969
35dangerous parasite rife in nt1.757938
40academic fears spread of mozzie borne disease1.755103
44sti symptoms dangers and treatments1.751830
\n", 420 | "
" 421 | ], 422 | "text/plain": [ 423 | " headline_text _score\n", 424 | "5 who highlight dangers of vector borne diseases 1.804388\n", 425 | "8 fears as png diseases spread 1.791913\n", 426 | "21 the odds of an outbreak 1.773913\n", 427 | "25 flood waters carry risk of disease infection 1.769219\n", 428 | "27 port uncertain of impact of viral meningitis outbreak 1.767367\n", 429 | "30 human error blamed for infection scare 1.764524\n", 430 | "34 oakey defence base contaminants linked to serious disease 1.758969\n", 431 | "35 dangerous parasite rife in nt 1.757938\n", 432 | "40 academic fears spread of mozzie borne disease 1.755103\n", 433 | "44 sti symptoms dangers and treatments 1.751830" 434 | ] 435 | }, 436 | "execution_count": 32, 437 | "metadata": {}, 438 | "output_type": "execute_result" 439 | } 440 | ], 441 | "source": [ 442 | "df11=select_search_results(df1,1000)\n", 443 | "\n", 444 | "df11[df11.headline_text.apply(lambda x: all([q not in x for q in query.split()]))].head(10)\n" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": 35, 450 | "metadata": {}, 451 | "outputs": [ 452 | { 453 | "name": "stdout", 454 | "output_type": "stream", 455 | "text": [ 456 | "KEYWORD SEARCH RESULTS\n" 457 | ] 458 | }, 459 | { 460 | "data": { 461 | "text/html": [ 462 | "
\n", 463 | "\n", 476 | "\n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | "
headline_text_score
0perth storm a natural disaster16.165108
1more natural disaster planning needed16.165108
2gunnedah declared natural disaster area16.165108
3bushfire prompts natural disaster declaration16.165108
4maclean fire not natural disaster16.032738
5state helps natural disaster victims15.960692
6esperance declared natural disaster area15.960692
7flooding sparks natural disaster declarations15.960692
8government declares natural disaster areas15.960692
9nsw natural disaster zone widened15.960692
\n", 537 | "
" 538 | ], 539 | "text/plain": [ 540 | " headline_text _score\n", 541 | "0 perth storm a natural disaster 16.165108\n", 542 | "1 more natural disaster planning needed 16.165108\n", 543 | "2 gunnedah declared natural disaster area 16.165108\n", 544 | "3 bushfire prompts natural disaster declaration 16.165108\n", 545 | "4 maclean fire not natural disaster 16.032738\n", 546 | "5 state helps natural disaster victims 15.960692\n", 547 | "6 esperance declared natural disaster area 15.960692\n", 548 | "7 flooding sparks natural disaster declarations 15.960692\n", 549 | "8 government declares natural disaster areas 15.960692\n", 550 | "9 nsw natural disaster zone widened 15.960692" 551 | ] 552 | }, 553 | "metadata": {}, 554 | "output_type": "display_data" 555 | }, 556 | { 557 | "name": "stdout", 558 | "output_type": "stream", 559 | "text": [ 560 | "CONTEXTUAL SEARCH RESULTS\n" 561 | ] 562 | }, 563 | { 564 | "data": { 565 | "text/html": [ 566 | "
\n", 567 | "\n", 580 | "\n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | "
headline_text_score
0natural disaster declared in broken hill1.879506
1natural disaster declared in storm area1.877636
2lismore declared a natural disaster area1.867503
3broken hill declared a natural disaster area1.854052
4natural disasters take toll on austar1.848880
5perth storm a natural disaster1.836232
6call for nambucca valley natural disaster1.834766
7wagga albury declared natural disaster areas1.831764
8ballina area declared natural disaster zone1.823700
9disasters take toll on shire1.822510
\n", 641 | "
" 642 | ], 643 | "text/plain": [ 644 | " headline_text _score\n", 645 | "0 natural disaster declared in broken hill 1.879506\n", 646 | "1 natural disaster declared in storm area 1.877636\n", 647 | "2 lismore declared a natural disaster area 1.867503\n", 648 | "3 broken hill declared a natural disaster area 1.854052\n", 649 | "4 natural disasters take toll on austar 1.848880\n", 650 | "5 perth storm a natural disaster 1.836232\n", 651 | "6 call for nambucca valley natural disaster 1.834766\n", 652 | "7 wagga albury declared natural disaster areas 1.831764\n", 653 | "8 ballina area declared natural disaster zone 1.823700\n", 654 | "9 disasters take toll on shire 1.822510" 655 | ] 656 | }, 657 | "metadata": {}, 658 | "output_type": "display_data" 659 | } 660 | ], 661 | "source": [ 662 | "query='natural disaster'\n", 663 | "print('KEYWORD SEARCH RESULTS')\n", 664 | "df0=et.search(query,'headline_text',type='match',embedder=embed_wrapper, size = 1000)\n", 665 | "display(select_search_results(df0,10))\n", 666 | "print('CONTEXTUAL SEARCH RESULTS')\n", 667 | "df1=et.search(query,'headline_text',type='dense',embedder=embed_wrapper, size = 1000)\n", 668 | "display(select_search_results(df1,10))\n", 669 | "\n" 670 | ] 671 | }, 672 | { 673 | "cell_type": "code", 674 | "execution_count": 36, 675 | "metadata": {}, 676 | "outputs": [ 677 | { 678 | "data": { 679 | "text/html": [ 680 | "
\n", 681 | "\n", 694 | "\n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | "
headline_text_score
28power supply at risk if flood situation worsens1.787914
36widespread damage from freak storm1.779122
40catastrophic fire conditions for wa1.776245
43leigh creek ucg project lifeline or toxic environmental hazard1.772300
46qlds wild weather caused by freak event1.770647
48wild weather causes qld flooding1.769357
50cyclone damaged water supply fixed1.767906
53humungous effort on catastrophic day1.766371
56nsw floods receding water reveals destruction1.766130
58cyclone olwyn carnarvon water supply problems1.765932
\n", 755 | "
" 756 | ], 757 | "text/plain": [ 758 | " headline_text _score\n", 759 | "28 power supply at risk if flood situation worsens 1.787914\n", 760 | "36 widespread damage from freak storm 1.779122\n", 761 | "40 catastrophic fire conditions for wa 1.776245\n", 762 | "43 leigh creek ucg project lifeline or toxic environmental hazard 1.772300\n", 763 | "46 qlds wild weather caused by freak event 1.770647\n", 764 | "48 wild weather causes qld flooding 1.769357\n", 765 | "50 cyclone damaged water supply fixed 1.767906\n", 766 | "53 humungous effort on catastrophic day 1.766371\n", 767 | "56 nsw floods receding water reveals destruction 1.766130\n", 768 | "58 cyclone olwyn carnarvon water supply problems 1.765932" 769 | ] 770 | }, 771 | "execution_count": 36, 772 | "metadata": {}, 773 | "output_type": "execute_result" 774 | } 775 | ], 776 | "source": [ 777 | "df11=select_search_results(df1,1000)\n", 778 | "\n", 779 | "df11[df11.headline_text.apply(lambda x: all([q not in x for q in query.split()]))].head(10)\n" 780 | ] 781 | }, 782 | { 783 | "cell_type": "code", 784 | "execution_count": 24, 785 | "metadata": {}, 786 | "outputs": [ 787 | { 788 | "name": "stdout", 789 | "output_type": "stream", 790 | "text": [ 791 | "KEYWORD SEARCH RESULTS\n" 792 | ] 793 | }, 794 | { 795 | "data": { 796 | "text/html": [ 797 | "
\n", 798 | "\n", 811 | "\n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | "
headline_text_score
0regulatory madness in the banking world18.955673
1china pushes through banking sector reform14.721983
2swan to announce banking reform package14.582440
3open banking more choice or data risk13.001936
4swan wraps up meeting on banking rules reform12.904054
5govt internet regulatory plan criticised12.000524
6regulatory duplication strangling aquaculture development12.000524
7us flags financial regulatory reforms11.530772
8mcconnell a regulatory train wreck11.530772
9billabong rescue package clears regulatory hurdle11.220221
\n", 872 | "
" 873 | ], 874 | "text/plain": [ 875 | " headline_text _score\n", 876 | "0 regulatory madness in the banking world 18.955673\n", 877 | "1 china pushes through banking sector reform 14.721983\n", 878 | "2 swan to announce banking reform package 14.582440\n", 879 | "3 open banking more choice or data risk 13.001936\n", 880 | "4 swan wraps up meeting on banking rules reform 12.904054\n", 881 | "5 govt internet regulatory plan criticised 12.000524\n", 882 | "6 regulatory duplication strangling aquaculture development 12.000524\n", 883 | "7 us flags financial regulatory reforms 11.530772\n", 884 | "8 mcconnell a regulatory train wreck 11.530772\n", 885 | "9 billabong rescue package clears regulatory hurdle 11.220221" 886 | ] 887 | }, 888 | "metadata": {}, 889 | "output_type": "display_data" 890 | }, 891 | { 892 | "name": "stdout", 893 | "output_type": "stream", 894 | "text": [ 895 | "CONTEXTUAL SEARCH RESULTS\n" 896 | ] 897 | }, 898 | { 899 | "data": { 900 | "text/html": [ 901 | "
\n", 902 | "\n", 915 | "\n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | "
headline_text_score
0us flags financial regulatory reforms1.863391
1the banking royal commissions recommendations1.850401
2what can we expect from the banking inquiry1.841991
3banking royal commission superannuation hearings1.836119
4banking royal commission anz financial advice clients interest1.833776
5rba considers cap on credit card surcharges1.832314
6rba on banks interest rate moves1.831732
7will changes to financial advice laws see the1.831395
8commonwealth bank responds to financial planning inquiry1.831379
9reserve bank financial stability review1.830978
\n", 976 | "
" 977 | ], 978 | "text/plain": [ 979 | " headline_text _score\n", 980 | "0 us flags financial regulatory reforms 1.863391\n", 981 | "1 the banking royal commissions recommendations 1.850401\n", 982 | "2 what can we expect from the banking inquiry 1.841991\n", 983 | "3 banking royal commission superannuation hearings 1.836119\n", 984 | "4 banking royal commission anz financial advice clients interest 1.833776\n", 985 | "5 rba considers cap on credit card surcharges 1.832314\n", 986 | "6 rba on banks interest rate moves 1.831732\n", 987 | "7 will changes to financial advice laws see the 1.831395\n", 988 | "8 commonwealth bank responds to financial planning inquiry 1.831379\n", 989 | "9 reserve bank financial stability review 1.830978" 990 | ] 991 | }, 992 | "metadata": {}, 993 | "output_type": "display_data" 994 | } 995 | ], 996 | "source": [ 997 | "#query='virus threat'\n", 998 | "query='regulatory risk banking reform'\n", 999 | "print('KEYWORD SEARCH RESULTS')\n", 1000 | "df0=et.search(query,'headline_text',type='match',embedder=embed_wrapper, size = 1000)\n", 1001 | "display(select_search_results(df0,10))\n", 1002 | "print('CONTEXTUAL SEARCH RESULTS')\n", 1003 | "df1=et.search(query,'headline_text',type='dense',embedder=embed_wrapper, size = 1000)\n", 1004 | "display(select_search_results(df1,10))\n", 1005 | "\n" 1006 | ] 1007 | }, 1008 | { 1009 | "cell_type": "markdown", 1010 | "metadata": {}, 1011 | "source": [ 1012 | "# Speed comparison\n", 1013 | "\n", 1014 | "Below we perform some non-functional testing on the impact of size of index together with search parameters on time of the query. \n", 1015 | "We have tested with 3 index sizes: 1k (Tiny), 100k (Medium) & 1.1mn (Large). We have not paid particular attention to and sampling effects, meaning that for instance, the 1k index is simply the first 1000 headlines in the data, this might mean ti is not well randomized, which we have not studied" 1016 | ] 1017 | }, 1018 | { 1019 | "cell_type": "code", 1020 | "execution_count": 38, 1021 | "metadata": {}, 1022 | "outputs": [ 1023 | { 1024 | "name": "stderr", 1025 | "output_type": "stream", 1026 | "text": [ 1027 | "100%|██████████| 10/10 [26:27<00:00, 158.76s/it]\n" 1028 | ] 1029 | } 1030 | ], 1031 | "source": [ 1032 | "\n", 1033 | "queries=['Amazon','news','security thread','tech news','new vaccine developed new cure','results game today all winners']\n", 1034 | "result_sizes=[1,10,100]\n", 1035 | "repeat=10\n", 1036 | "\n", 1037 | "col_names=['search index','search type','search size' , 'query', '# tokens query','repeat','time taken']\n", 1038 | "search_to_compare={'match':'Keyword Search','dense':'Contextual Search',}\n", 1039 | "indices_to_compare={'et-tiny':'Tiny','et-medium':'Medium','et-large':'Large'}\n", 1040 | "\n", 1041 | "res=[]\n", 1042 | "for i in trange(repeat):\n", 1043 | " for index in indices_to_compare:\n", 1044 | " for search_type in search_to_compare:\n", 1045 | " for query in queries:\n", 1046 | " for size in result_sizes:\n", 1047 | " t0=datetime.datetime.now()\n", 1048 | " _ = et.search(query=query,\n", 1049 | " field='headline_text',\n", 1050 | " index_name=index,\n", 1051 | " type=search_type,\n", 1052 | " embedder=embed_wrapper, \n", 1053 | " size=size)\n", 1054 | " t1=datetime.datetime.now()\n", 1055 | " time_taken=(t1-t0).total_seconds()\n", 1056 | " res.append([indices_to_compare[index], search_to_compare[search_type], size, query, len(query.split()), i,time_taken])\n", 1057 | " \n", 1058 | "result_df=pd.DataFrame(res, columns=col_names)\n", 1059 | "result_df.to_csv('data/results_search.csv')\n" 1060 | ] 1061 | }, 1062 | { 1063 | "cell_type": "markdown", 1064 | "metadata": {}, 1065 | "source": [ 1066 | "Compare speed across different index sizes and search types for\n", 1067 | "- query token length\n", 1068 | "- result size\n", 1069 | "- index size\n", 1070 | "\n", 1071 | "Results are below" 1072 | ] 1073 | }, 1074 | { 1075 | "cell_type": "code", 1076 | "execution_count": 39, 1077 | "metadata": {}, 1078 | "outputs": [ 1079 | { 1080 | "data": { 1081 | "image/png": "\n", 1082 | "text/plain": [ 1083 | "
" 1084 | ] 1085 | }, 1086 | "metadata": { 1087 | "needs_background": "light" 1088 | }, 1089 | "output_type": "display_data" 1090 | }, 1091 | { 1092 | "data": { 1093 | "image/png": "\n", 1094 | "text/plain": [ 1095 | "
" 1096 | ] 1097 | }, 1098 | "metadata": { 1099 | "needs_background": "light" 1100 | }, 1101 | "output_type": "display_data" 1102 | } 1103 | ], 1104 | "source": [ 1105 | "compare='search size' #'# tokens query'\n", 1106 | "compare='# tokens query'\n", 1107 | "comparisons=['search size', '# tokens query']\n", 1108 | "\n", 1109 | "for compare in comparisons:\n", 1110 | " fig, axes = plt.subplots(nrows=1, ncols=len(search_to_compare), figsize=(12,4))\n", 1111 | " fig.suptitle(f'Comparing {compare}',size=18)\n", 1112 | " for (c,search_type) in enumerate(search_to_compare.values()):\n", 1113 | " pvt=pd.pivot_table(result_df[(result_df['search type']==search_type)&(result_df['repeat']>2)] \\\n", 1114 | " [['search index',compare,'time taken']],\\\n", 1115 | " values='time taken',\n", 1116 | " index=compare,\n", 1117 | " columns='search index',\n", 1118 | " aggfunc='mean',\n", 1119 | " )\n", 1120 | " pvt.plot.bar(title=search_type,ax=axes[c]) \n" 1121 | ] 1122 | }, 1123 | { 1124 | "cell_type": "markdown", 1125 | "metadata": {}, 1126 | "source": [ 1127 | "The below box plot analyzes the deviation in search times after multiple repeated calls. Although some deviations are observed, they do not seem to be significant. Only results from the Large index are shown" 1128 | ] 1129 | }, 1130 | { 1131 | "cell_type": "code", 1132 | "execution_count": 17, 1133 | "metadata": {}, 1134 | "outputs": [ 1135 | { 1136 | "data": { 1137 | "text/plain": [ 1138 | "" 1139 | ] 1140 | }, 1141 | "execution_count": 17, 1142 | "metadata": {}, 1143 | "output_type": "execute_result" 1144 | }, 1145 | { 1146 | "data": { 1147 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAWoAAAD4CAYAAADFAawfAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/d3fzzAAAACXBIWXMAAAsTAAALEwEAmpwYAAAPq0lEQVR4nO3dfbBcdX3H8fcHEuVRqOZqBYSgYpWKD+VaQSxFcKgKre2IDygqahutLWinRaN1QO3U4mgL7dBaY6oyFrGC1lq0CiOgqEhIIJBA1Kqgomgvo1LABwJ8+8c5V9a4N3cT7t78mvt+zezcs+fxe/ec/ezv/PbsbqoKSVK7dtjWBUiSNs+glqTGGdSS1DiDWpIaZ1BLUuMWjWOlS5YsqaVLl45j1ZK0XVqzZs0tVTUxbNpYgnrp0qWsXr16HKuWpO1Skm/ONM2uD0lqnEEtSY0zqCWpcQa1JDXOoJakxhnUktQ4g1qSGmdQS1LjxvKBF225JFu1nN8nLm3/bFE3oqqG3vZ7/QUzTjOkpYXBFvU8e/xbLuTWn2zcomWWLv/EFs2/x86Luea0o7doGUntMqjn2a0/2ciNpx8z1m1sabBLaptBPc92f8xyDjp7+Zi3ATDeFwNJ88egnme3bTh97NvYY+fFY9+GpPljUM+zLe32WLr8E2PvKpHUNoO6EZu7PC9vn3k5r/yQtn8GdSMMXEkz8TpqSWqcQS1JjTOoJalxBrUkNc6glqTGGdSS1DiDWpIaZ1BLUuMMaklq3EhBneTPklyXZH2Sc5PsNO7CJEmdWYM6yd7AycBkVT0W2BF4wbgLkyR1Ru36WATsnGQRsAvw3fGVJEkaNGtQV9V3gHcC3wJuBm6tqgs3nS/JsiSrk6yempqa+0olaYEapevjV4BnA/sDewG7Jjlh0/mqakVVTVbV5MTExNxXKkkL1ChdH08HbqiqqaraCHwUeMp4y5IkTRslqL8FHJJkl3Tfbn8UsGG8ZUmSpo3SR30FcD5wFbCuX2bFmOuSJPVG+oWXqjoNOG3MtUiShvCTiZLUOINakhpnUEtS4wxqSWqcQS1JjTOoJalxBrUkNc6glqTGGdSS1DiDWpIaZ1BLUuMMaklqnEEtSY0zqCWpcQa1JDXOoJakxhnUktQ4g1qSGmdQS1LjDGpJapxBLUmNM6glqXEGtSQ1zqCWpMYZ1JLUOINakhpnUEtS4wxqSWqcQS1JjTOoJalxBrUkNc6glqTGGdSS1DiDWpIaZ1BLUuMMaklq3EhBnWTPJOcn+XKSDUkOHXdhkqTOohHn+3vgU1V1XJL7AbuMsSZJ0oBZgzrJHsDhwIkAVXUncOd4y5IkTRul62N/YAp4X5Krk6xMsuumMyVZlmR1ktVTU1NzXqgkLVSjBPUi4DeAd1XVE4E7gOWbzlRVK6pqsqomJyYm5rhMSVq4Rgnqm4CbquqK/v75dMEtSZoHswZ1VX0P+HaSX+tHHQVcP9aqJEk/N+pVHycB5/RXfHwDeNn4SpIkDRopqKtqLTA53lIkScP4yURJapxBLUmNM6glqXEGtSQ1zqCWpMYZ1JLUOINakhpnUEtS4wxqSWqcQS1JjTOoJalxBrUkNc6glqTGGdSS1DiDWpIaZ1BLUuMMaklqnEEtSY0zqCWpcQa1JDXOoJakxhnUktQ4g1qSGmdQS1LjDGpJapxBLUmNM6glqXEGtSQ1zqCWpMYZ1JLUOINakhpnUEtS4wxqSWqcQS1JjTOoJalxIwd1kh2TXJ3kgnEWJEn6RVvSon4NsGFchUiShhspqJPsAxwDrBxvOZKkTY3aoj4TeB1wz0wzJFmWZHWS1VNTU3NRmySJEYI6ybHA/1TVms3NV1UrqmqyqiYnJibmrEBJWuhGaVEfBvxekhuBDwFHJvnXsVYlSfq5WYO6qt5QVftU1VLgBcDFVXXC2CuTJAFeRy1JzVu0JTNX1aXApWOpRJI0lC1qSWqcQS1JjTOoJalxBrUkNc6glqTGGdSS1DiDWpIaZ1BLUuMMaklqnEEtSY0zqCWpcQa1JDXOoJakxhnUktQ4g1qSGmdQS1LjDGpJapxBLUmNM6glqXEGtSQ1zqCWpMYZ1JLUOINakhpnUEtS4wxqSWqcQS1JjTOoJalxBrUkNc6glqTGGdSS1DiDWpIaZ1BLUuMMaklqnEEtSY0zqCWpcbMGdZKHJbkkyfVJrkvymvkoTJLUWTTCPHcBf15VVyXZHViT5KKqun7MtUmSGKFFXVU3V9VV/fBtwAZg73EXJknqbFEfdZKlwBOBK4ZMW5ZkdZLVU1NTc1SeJGnkoE6yG/AR4LVV9b+bTq+qFVU1WVWTExMTc1mjJC1oIwV1ksV0IX1OVX10vCVJkgaNctVHgH8BNlTV342/JEnSoFFa1IcBLwaOTLK2vz1rzHVJknqzXp5XVZ8HMg+1SJKG8JOJktQ4g1qSGmdQS1LjDGpJapxBLUmNM6glqXEGtSQ1zqCWpMYZ1JLUOINakhpnUEtS4wxqSWqcQS1JjTOoJalxBrUkNc6glqTGGdSS1DiDWpIaZ1BLUuMMaklqnEEtSY0zqCWpcQa1JDXOoJakxhnUktQ4g1qSGmdQS1LjDGpJapxBLUmNM6glqXEGtSQ1zqCWpMYZ1JLUOINakhpnUEtS40YK6iTPSPKVJF9LsnzcRUmS7jVrUCfZEfhH4JnAgcDxSQ4cd2GSpM4oLerfBL5WVd+oqjuBDwHPHm9ZkqRpi0aYZ2/g2wP3bwKePJ5yJG0rB5190LxsZ91L183LdrYnowT1SJIsA5YB7LvvvnO1WknzxABt1yhdH98BHjZwf59+3C+oqhVVNVlVkxMTE3NVnyQteKME9ZXAAUn2T3I/4AXAx8dbliRp2qxdH1V1V5I/BT4N7Ai8t6quG3tlkiRgxD7qqvok8Mkx1yJJGsJPJkpS4wxqSWqcQS1JjTOoJalxqaq5X2kyBXxzzle8MC0BbtnWRUgz8PicO/tV1dAPoYwlqDV3kqyuqsltXYc0jMfn/LDrQ5IaZ1BLUuMM6vat2NYFSJvh8TkP7KOWpMbZopakxhnUktS4BRXUSX41yYeSfD3JmiSfTPKorVzXa5PsspXL7pnk1Vuz7MA63p/kuCHjD0lyRZK1STYkefN92c4IddyYZMk4t7HQJbl9YPhZSb6aZL9tWdO0JCcmOWvI+IckuSDJNUmuTzLWL3Wb6fmwvVgwQZ0kwL8Dl1bVI6rqYOANwEO2cpWvBbYqqIE9gfsU1JtxNrCsqp4APBb48H1dYZI5+yUgbb0kRwH/ADyzqrbJB8r6H7sexVuBi6rq8VV1ILB8nre/XVkwQQ08DdhYVf88PaKqrqmqy9J5R5L1SdYleT5AkiOSXJrk/CRfTnJOP+/JwF7AJUku6ec9OsnlSa5Kcl6S3ZLsl+S/kyxJskOSy5IcDZwOPKJv9b6j384F03UlOSvJif3wqUmu7Gtb0b/gbM6DgZv7/+/uqrq+X8+uSd6bZFWSq5M8ux+/tK/rqv72lIH//bIkHweuT7Jjknf2dVyb5KSBbZ7UL7suyaO3fhdpJkkOB94DHFtVX+/HndDvz7VJ3t3vo5cnOXNguT9KckaSU/rjlv7+xf3wkUnO6YeP7/fh+iRvH1jH7Un+Nsk1wKFJXta36lcBh81Q8kPpfl8VgKq6dmB9p/TH9LVJ3jIw/mPpznSvS/fTfjNt/yX9stck+cDANg9P8sUk39juWtdVtSBuwMnAGTNMew5wEd0PIzwE+BbdgXYEcCvdz4/tAFwOPLVf5kZgST+8BPgcsGt///XAqf3wHwLnAacA7+7HLQXWD2z/COCCgftnASf2ww8cGP8B4Hf74fcDxw35X04Ffkh39vBKYKd+/NuAE/rhPYGvArvSnRVMz3MAsHqgpjuA/fv7fwycDywarKt/HE7qh18NrNzW+3p7uwEbgR8AjxsY9xjgP4HF/f1/Al4C7AZ8fWD8F4GDgEOA8/pxlwGrgMXAaf1xsld/3E/QfU/9xcDv9/MX8Lx++KED890P+AJw1pCafwf4EXAJ8JfAXv34o+ku6Uv/nLoAOHyTY2pnYD3woCHb//X+2F2yyTLvp3ue7QAcCHxtW++3ubwtpBb15jwVOLe6Fuj3gc8CT+qnraqqm6rqHmAtXchu6hC6g+MLSdYCLwX2A6iqlcADgFcBf7EVtT0tXZ/zOuBIugN1RlX1VmASuBB4IfCpftLRwPK+vkuBnYB96Z6s7+nXf17/f0xbVVU39MNPp3uhuavfzg8G5vto/3cNwx8f3Tcb6QL3FQPjjgIOBq7s9+lRwMOr6na6kD22P7tZXFXr6PbNwUkeAPyMrtExCfwWXXA/ia5bcKrfx+cAh/fbuhv4SD/85IH57gT+bVjBVfVp4OF0ZwGPBq5OMkF3HB4NXA1c1U87oF/s5L7V/CW632mdHj+4/SPpXnBu6bczeBx+rKruqe4scmu7NJu0kPoerwO25nToZwPDdzP8MQtdf9zxvzShe8Nxn/7ubsBtQ5a/i1/shtqpX3YnupbSZFV9O90bgzvNVnB1p8bvSvIeYCrJg/oan1NVX9mkvjcD3wce39fw04HJd8y2rd70YzTT46P75h7gecBnkryxqt5Gtz/Prqo3DJl/JfBG4MvA+wCqamOSG4AT6UL/WrruwEcCG7g3FIf5aVXdvaVF9yH6QeCDfdfe4X3df1NV7x6cN8kRdI2BQ6vqx0ku5d5jfdTtDz5XZ+si/H9lIbWoLwbuv0nf1+OSTLcont/38U3QHVCrZlnfbcDu/fCXgMOSPLJf766592qSt9O1Tk6la11suix03zR4YJL7J9mTrnUE9x6otyTZjRFeaJIcM9CPfQBdeP6I7jcvT5qeluSJ/Tx7ADf3Zwwvpuv+GeYi4JXp31hM8sDZatHcqaofA8cAL0ryCuAzwHFJHgzd/kh/JUhVXUHXIn0hcO7Aai6jO6v7XD/8KuDq6voOVgG/ne79lB2B4+nOLDd1RT/fg5IsBp47rN6+73uXfnh34BF0XSafBl7eH88k2bv/H/YAftiH9KPpzlKHuRh4bt/4WDDH4YJp/VRVJfkD4Mwkr6drOd5Id/XG54FDgWvo+sNeV1Xfm+WNsRXAp5J8t6qelu7Nv3OT3L+f/qYkD6U7pTysqu5O8pwkL6uq9yX5QpL1wH9V1SlJPkzXL3cD3WkhVfWjvlW8Hvge3S/Cz+bFwBlJfkzXUn9Rv+2/As4Erk2yQ7+dY+la7B9J8hK6bpKZWtErgUf1y2+ke9H5pcuyND5V9YMkz6AL2tcAbwIu7PfnRuBPuPfrhT8MPKGqfjiwisvo+osvr6o7kvy0H0dV3ZxkOV2fcoBPVNV/DKnh5v4s7HK6BsDaGco9GDgryfTZ4sqquhIgyWOAy/s2w+3ACXTH3quSbAC+Qtf4GfYYXJfkr4HPJrmb7rly4kyP2fbCj5BL26G+q+GMqvrMtq5F991C6vqQtnvpPkz1VeAnhvT2wxa1JDXOFrUkNc6glqTGGdSS1DiDWpIaZ1BLUuP+D9vImSv8J2/qAAAAAElFTkSuQmCC\n", 1148 | "text/plain": [ 1149 | "
" 1150 | ] 1151 | }, 1152 | "metadata": { 1153 | "needs_background": "light" 1154 | }, 1155 | "output_type": "display_data" 1156 | } 1157 | ], 1158 | "source": [ 1159 | "box_df=result_df[(result_df['search index']=='Large')].copy()\n", 1160 | "pd.pivot_table(box_df[['search type','time taken','repeat']],\n", 1161 | " values='time taken',\n", 1162 | " index='repeat',\n", 1163 | " columns='search type',\n", 1164 | " aggfunc='mean',).plot.box()" 1165 | ] 1166 | } 1167 | ], 1168 | "metadata": { 1169 | "kernelspec": { 1170 | "display_name": "Python [conda env:et2] *", 1171 | "language": "python", 1172 | "name": "conda-env-et2-py" 1173 | }, 1174 | "language_info": { 1175 | "codemirror_mode": { 1176 | "name": "ipython", 1177 | "version": 3 1178 | }, 1179 | "file_extension": ".py", 1180 | "mimetype": "text/x-python", 1181 | "name": "python", 1182 | "nbconvert_exporter": "python", 1183 | "pygments_lexer": "ipython3", 1184 | "version": "3.7.7" 1185 | } 1186 | }, 1187 | "nbformat": 4, 1188 | "nbformat_minor": 4 1189 | } 1190 | -------------------------------------------------------------------------------- /notebooks/Setting_up_ElasticTransformers.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introduction\n", 8 | "\n", 9 | "This notebook will accomplish the following\n", 10 | "\n", 11 | "- Set up an ElasticTransformers class\n", 12 | "- Instantiate an index and index the Million headlines dataset in it\n", 13 | "- Preview some search results from comparing lexical vs semantic search\n" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "## Loading requirements" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 36, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "name": "stdout", 30 | "output_type": "stream", 31 | "text": [ 32 | "The autoreload extension is already loaded. To reload it, use:\n", 33 | " %reload_ext autoreload\n" 34 | ] 35 | } 36 | ], 37 | "source": [ 38 | "%load_ext autoreload\n", 39 | "import os\n", 40 | "os.chdir(os.path.abspath(os.curdir).replace('notebooks',''))" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 37, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "%autoreload 2\n", 50 | "from src.database import ElasticTransformers\n" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "## Sentence Transformers\n", 58 | "\n", 59 | "This creates the sentence transformer object as well as small helper function which simplifies the embedding call and helps lading data into elastic easier" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 38, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "from sentence_transformers import SentenceTransformer\n", 69 | "bert_embedder = SentenceTransformer('bert-base-nli-mean-tokens')\n" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 39, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "def embed_wrapper(ls):\n", 79 | " \"\"\"\n", 80 | " Helper function which simplifies the embedding call and helps lading data into elastic easier\n", 81 | " \"\"\"\n", 82 | " results=bert_embedder.encode(ls, convert_to_tensor=True)\n", 83 | " results = [r.tolist() for r in results]\n", 84 | " return results" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "## Quick Preview of the raw data\n", 92 | "\n", 93 | "The data contains 1.15mn news headlines (all in lower case) and their published date" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 40, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "import pandas as pd\n", 103 | "df=pd.read_csv('data/abcnews-date-text.csv')" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 41, 109 | "metadata": {}, 110 | "outputs": [ 111 | { 112 | "data": { 113 | "text/html": [ 114 | "
\n", 115 | "\n", 128 | "\n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | "
publish_dateheadline_text
020030219aba decides against community broadcasting lic...
120030219act fire witnesses must be aware of defamation
220030219a g calls for infrastructure protection summit
320030219air nz staff in aust strike for pay rise
420030219air nz strike to affect australian travellers
\n", 164 | "
" 165 | ], 166 | "text/plain": [ 167 | " publish_date headline_text\n", 168 | "0 20030219 aba decides against community broadcasting lic...\n", 169 | "1 20030219 act fire witnesses must be aware of defamation\n", 170 | "2 20030219 a g calls for infrastructure protection summit\n", 171 | "3 20030219 air nz staff in aust strike for pay rise\n", 172 | "4 20030219 air nz strike to affect australian travellers" 173 | ] 174 | }, 175 | "execution_count": 41, 176 | "metadata": {}, 177 | "output_type": "execute_result" 178 | } 179 | ], 180 | "source": [ 181 | "df.head()" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "# A tiny example\n", 189 | "\n", 190 | "Let's first do this with a tiny example of 1000 headlines (the full dataset is 1.1mn headlines)" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 42, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "df.head(1000).to_csv('data/tiny_sample.csv')\n" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "# Setting up ElasticTransformers\n", 207 | "\n", 208 | "The below lines initialize the class, meaning setting the url and index name" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 32, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "et=ElasticTransformers(url='http://localhost:9300',index_name='et-tiny')\n", 218 | "_ = et.ping()\n", 219 | "\n" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "Next, we define the index specification (Elasticsearch index mapping)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 33, 232 | "metadata": {}, 233 | "outputs": [ 234 | { 235 | "data": { 236 | "text/plain": [ 237 | "{'settings': {'number_of_shards': 3, 'number_of_replicas': 1},\n", 238 | " 'mappings': {'dynamic': 'true',\n", 239 | " '_source': {'enabled': 'true'},\n", 240 | " 'properties': {'publish_date': {'type': 'text'},\n", 241 | " 'headline_text': {'type': 'text'},\n", 242 | " 'headline_text_embedding': {'type': 'dense_vector', 'dims': 768}}}}" 243 | ] 244 | }, 245 | "execution_count": 33, 246 | "metadata": {}, 247 | "output_type": "execute_result" 248 | } 249 | ], 250 | "source": [ 251 | "et.create_index_spec(\n", 252 | " text_fields=['publish_date','headline_text'],\n", 253 | " dense_fields=['headline_text_embedding'],\n", 254 | " dense_fields_dim=768\n", 255 | ")" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 34, 261 | "metadata": {}, 262 | "outputs": [ 263 | { 264 | "name": "stdout", 265 | "output_type": "stream", 266 | "text": [ 267 | "Creating 'et-tiny' index.\n" 268 | ] 269 | } 270 | ], 271 | "source": [ 272 | "et.create_index()\n" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 35, 278 | "metadata": {}, 279 | "outputs": [ 280 | { 281 | "name": "stderr", 282 | "output_type": "stream", 283 | "text": [ 284 | "1it [00:08, 8.52s/it]\n" 285 | ] 286 | } 287 | ], 288 | "source": [ 289 | "et.write_large_csv('data/tiny_sample.csv',\n", 290 | " chunksize=1000,\n", 291 | " embedder=embed_wrapper,\n", 292 | " field_to_embed='headline_text')" 293 | ] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": {}, 298 | "source": [ 299 | "One sample looks like this" 300 | ] 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "metadata": {}, 305 | "source": [ 306 | "## Indexing the entire dataset\n", 307 | "\n", 308 | "Lets do this now with 1.1mn records " 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 44, 314 | "metadata": {}, 315 | "outputs": [ 316 | { 317 | "name": "stdout", 318 | "output_type": "stream", 319 | "text": [ 320 | "Creating 'et-large' index.\n" 321 | ] 322 | } 323 | ], 324 | "source": [ 325 | "# Initialize\n", 326 | "et=ElasticTransformers(url='http://localhost:9200',index_name='et-large')\n", 327 | "_ = et.ping()\n", 328 | "# Create index mapping\n", 329 | "et.create_index_spec(\n", 330 | " text_fields=['publish_date','headline_text'],\n", 331 | " dense_fields=['headline_text_embedding'],\n", 332 | " dense_fields_dim=768\n", 333 | ")\n", 334 | "# Create index\n", 335 | "et.create_index()" 336 | ] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": {}, 341 | "source": [ 342 | "### Indexing with sentence-transformers... \n", 343 | "\n", 344 | "This takes 3hrs on CPU, consumes 4CPUs & 2GB RAM for the embedding process and about 2GB RAM for Elastic" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": 45, 350 | "metadata": {}, 351 | "outputs": [ 352 | { 353 | "name": "stderr", 354 | "output_type": "stream", 355 | "text": [ 356 | "1187it [3:18:46, 10.05s/it]\n" 357 | ] 358 | } 359 | ], 360 | "source": [ 361 | "et.write_large_csv('data/abcnews-date-text.csv',\n", 362 | " chunksize=1000,\n", 363 | " embedder=embed_wrapper,\n", 364 | " field_to_embed='headline_text')\n" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": null, 370 | "metadata": {}, 371 | "outputs": [], 372 | "source": [] 373 | } 374 | ], 375 | "metadata": { 376 | "kernelspec": { 377 | "display_name": "Python [conda env:et2] *", 378 | "language": "python", 379 | "name": "conda-env-et2-py" 380 | }, 381 | "language_info": { 382 | "codemirror_mode": { 383 | "name": "ipython", 384 | "version": 3 385 | }, 386 | "file_extension": ".py", 387 | "mimetype": "text/x-python", 388 | "name": "python", 389 | "nbconvert_exporter": "python", 390 | "pygments_lexer": "ipython3", 391 | "version": "3.7.7" 392 | } 393 | }, 394 | "nbformat": 4, 395 | "nbformat_minor": 4 396 | } 397 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas==1.0.5 2 | sentence-transformers==0.3.4 3 | elasticsearch==7.6.0 4 | matplotlib==3.3.1 5 | transformers==3.0.2 6 | -------------------------------------------------------------------------------- /src/database.py: -------------------------------------------------------------------------------- 1 | from elasticsearch import Elasticsearch, helpers 2 | import datetime 3 | import json 4 | import pandas as pd 5 | import tqdm 6 | import os 7 | from src.logger import logger 8 | 9 | class ElasticTransformers(object): 10 | def __init__(self,url='http://localhost:9200', index_name=None): 11 | """ 12 | Initializes class 13 | 14 | Args: 15 | url (string) full url for elastic 16 | index_name (string, optional) name of index can be used as the default index across all methods for this class instance should this apply 17 | """ 18 | self.url=url 19 | self.es=Elasticsearch(self.url) 20 | self.index_name=index_name 21 | self.index_file=None 22 | 23 | def ping(self): 24 | """ 25 | Checks if Elastic is healthy 26 | 27 | Returns: 28 | True if healthy, False otherwise 29 | """ 30 | ping=self.es.ping() 31 | if ping: 32 | logger.debug(f'Ping successful') 33 | return ping 34 | 35 | def create_index_spec(self, index_name=None,folder='index_spec',text_fields=[], keyword_fields=[], dense_fields=[], dense_fields_dim=512, shards=3, replicas=1): 36 | """ 37 | Creates mapping file for an index and stores the file 38 | 39 | Args: 40 | index_name (string, optional) name of index, defaults to index name defined when initiating the class 41 | folder (string) location to store index spec 42 | text_fields (list) 43 | keyword_fields (list) 44 | dense_fields (list) list of dense field names 45 | dense_fields_dim (int) 46 | shards (int) number of shards for index 47 | replicas (int) number of replicas for index 48 | """ 49 | 50 | if not os.path.exists(folder): 51 | os.makedirs(folder) 52 | 53 | if not index_name: 54 | if self.index_name: 55 | index_name=self.index_name 56 | else: 57 | raise ValueError('index_name not provided') 58 | index_spec={} 59 | 60 | index_spec['settings']={ 61 | "number_of_shards": shards, 62 | "number_of_replicas": replicas 63 | } 64 | 65 | index_spec['mappings']={ 66 | "dynamic": "true", 67 | "_source": { 68 | "enabled": "true" 69 | }, 70 | "properties": {}, 71 | } 72 | 73 | for t in text_fields: 74 | index_spec['mappings']['properties'][t]={ 75 | "type": "text" 76 | } 77 | 78 | for k in keyword_fields: 79 | index_spec['mappings']['properties'][t]={ 80 | "type": "keyword" 81 | } 82 | 83 | for d in dense_fields: 84 | index_spec['mappings']['properties'][d]={ 85 | "type": "dense_vector", 86 | "dims": dense_fields_dim 87 | } 88 | 89 | index_file_name=f'{folder}/spec_{index_name}.json' 90 | with open(index_file_name, 'w') as index_file: 91 | json.dump(index_spec,index_file) 92 | self.index_file=index_file_name 93 | logger.debug(f'Index spec {self.index_file} created') 94 | return index_spec 95 | 96 | def create_index(self, index_name=None, index_file=None): 97 | """ 98 | Create index (index_name) based on file (index_file) containing index mapping 99 | NOTE: existing index of this name will be deleted 100 | 101 | Args: 102 | index_name (string, optional): name of index, defaults to index name defined when initiating the class 103 | index_file (string, optional): index spec file location, if none provided, will use mapping from create_index_spec else will create blank mapping 104 | 105 | """ 106 | if not index_name: 107 | if self.index_name: 108 | index_name=self.index_name 109 | else: 110 | raise ValueError('index_name not provided') 111 | print(f"Creating '{index_name}' index.") 112 | self.es.indices.delete(index=index_name, ignore=[404]) 113 | 114 | if index_file or self.index_file: 115 | if self.index_file: 116 | index_file=self.index_file 117 | with open(index_file) as index_file: 118 | index_spec = index_file.read().strip() 119 | 120 | else: 121 | index_spec={ 122 | "number_of_shards": 3, 123 | "number_of_replicas": 1 124 | } 125 | 126 | self.es.indices.create(index=index_name, body=index_spec) 127 | 128 | def write(self,docs,index_name=None,index_field=None): 129 | """ 130 | Writes entries to index 131 | 132 | Args: 133 | docs (list) list of dictionaries with keys matching index field names from index specification 134 | index_name (string, optional) name of index, defaults to index name defined when initiating the class 135 | index_field (string, optional) name of index field if present in docs. Defaults to elasicsearch indexing otherwise 136 | 137 | """ 138 | if not index_name: 139 | if self.index_name: 140 | index_name=self.index_name 141 | else: 142 | raise ValueError('index_name not provided') 143 | requests = [] 144 | for i, doc in enumerate(docs): 145 | request = doc 146 | request["_op_type"] = "index" 147 | if index_field: 148 | request["_id"] = doc[index_field] 149 | request["_index"] = index_name 150 | requests.append(request) 151 | helpers.bulk(self.es, requests) 152 | 153 | def write_large_csv(self, file_path, index_name=None, chunksize=10000, embedder=None, field_to_embed=None, index_field=None): 154 | """ 155 | Iteratively reads through a csv file and writes it to elastic in batches 156 | 157 | Args: 158 | file_path (string) path to file 159 | index_name (string, optional) name of index, defaults to index name defined when initiating the class 160 | chunksize (int) size of the chunk to be read from file and sent to embedder 161 | embedder (function) embedder function with expected call embedded(list of strings to embed) 162 | field_to_embed (string) name of field to embed 163 | index_field (string, optional) name of index field if present in docs. Defaults to elasicsearch indexing otherwise 164 | """ 165 | if not index_name: 166 | if self.index_name: 167 | index_name=self.index_name 168 | else: 169 | raise ValueError('index_name not provided') 170 | # read the large csv file with specified chunksize 171 | df_chunk = pd.read_csv(file_path, chunksize=chunksize, index_col=0) 172 | 173 | chunk_list = [] # append each chunk df here 174 | 175 | # Each chunk is in df format 176 | for chunk in tqdm.tqdm(df_chunk): 177 | if embedder: 178 | chunk[f'{field_to_embed}_embedding']=embedder(chunk[field_to_embed].values) 179 | chunk_ls=json.loads(chunk.to_json(orient='records')) 180 | self.write(chunk_ls,index_name,index_field=index_field) 181 | logger.debug(f'Successfully wrote {len(chunk_ls)} docs to {index_name}') 182 | 183 | def sample(self, index_name=None, size=3): 184 | """ 185 | Provides a sample of documents from the index 186 | 187 | Args: 188 | index_name (string, optional) name of index, defaults to index name defined when initiating the class 189 | size (int, optional) number of results to retrieve, defaults to 3, max 10k, can be relaxed with elastic config 190 | """ 191 | if not index_name: 192 | if self.index_name: 193 | index_name=self.index_name 194 | else: 195 | raise ValueError('index_name not provided') 196 | res=self.es.search(index=index_name, size=size) 197 | logger.debug(f"Successfully sampled {len(res['hits']['hits'])} docs from {index_name}") 198 | return res 199 | 200 | def search(self, query, field, type='match', index_name=None, embedder=None, size=10): 201 | """ 202 | Search elastic 203 | 204 | Args: 205 | query (string) search query 206 | field (string) field to search 207 | type (string) type of search, takes: match, term, fuzzy, wildcard (requires "*" in query), dense (semantic search, requires embedder, index needs to be indexed with embeddings, assumes embedding field is named {field}_embedding) 208 | index_name (string, optional) name of index, defaults to index name defined when initiating the class 209 | embedder (function) embedder function with expected call embedded(list of strings to embed) 210 | size (int, optional) number of results to retrieve, defaults to 3, max 10k, can be relaxed with elastic config 211 | 212 | Returns: 213 | DataFrame with results and search score 214 | """ 215 | res=[] 216 | 217 | if not index_name: 218 | if self.index_name: 219 | index_name=self.index_name 220 | else: 221 | raise ValueError('index_name not provided') 222 | if type=='dense': 223 | if not embedder: 224 | raise ValueError('Dense search requires embedder') 225 | query_vector = embedder([query])[0] 226 | 227 | script_query = { 228 | "script_score": { 229 | "query": {"match_all": {}}, 230 | "script": { 231 | "source": f"cosineSimilarity(params.query_vector, doc['{field}_embedding']) + 1.0", 232 | "params": {"query_vector": query_vector} 233 | } 234 | } 235 | } 236 | 237 | res = self.es.search( 238 | index=index_name, 239 | body={ 240 | "size": size, 241 | "query": script_query, 242 | "_source": {"excludes": [f'{field}_embedding']} 243 | } 244 | ) 245 | else: 246 | res=self.es.search(index=index_name, body={'query':{type:{field:query}}, "_source": {"excludes": [f'{field}_embedding']}},size=size) 247 | self.search_raw_result=res 248 | hits=res['hits']['hits'] 249 | if len(hits)>0: 250 | keys=list(hits[0]['_source'].keys()) 251 | 252 | out=[[h['_score']]+[h['_source'][k] for k in keys] for h in hits] 253 | 254 | df=pd.DataFrame(out,columns=['_score']+keys) 255 | else: 256 | df=pd.DataFrame([]) 257 | self.search_df_result=df 258 | logger.debug(f'Search {type.upper()} {query} in {index_name}.{field} returned {len(df)} results of {size} requested') 259 | return df -------------------------------------------------------------------------------- /src/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import datetime 3 | import os 4 | 5 | logs_folder='logs' 6 | if not os.path.exists(logs_folder): 7 | os.makedirs(logs_folder) 8 | 9 | # Create a custom logger 10 | logger = logging.getLogger(__name__) 11 | 12 | # Setting global logging level 13 | logger.setLevel(logging.WARNING) 14 | 15 | date=str(datetime.date.today()).replace('-','') 16 | # Initialize handlers 17 | file_hndl = logging.FileHandler(f'{logs_folder}/q_logs_{date}.log') 18 | cli_hndl = logging.StreamHandler() 19 | # Set logging level 20 | file_hndl.setLevel(level=logging.DEBUG) 21 | cli_hndl.setLevel(level=logging.DEBUG) 22 | # Add formatters to handlers 23 | logger_text_format = logging.Formatter('%(asctime)s --- %(name)s --- %(levelname)s --- %(funcName)s:%(lineno)d --- %(message)s') 24 | file_hndl.setFormatter(logger_text_format) 25 | cli_hndl.setFormatter(logger_text_format) 26 | 27 | # Add handlers to the logger 28 | logger.addHandler(file_hndl) 29 | logger.addHandler(cli_hndl) 30 | 31 | --------------------------------------------------------------------------------