├── .gitignore
├── LICENSE
├── README.md
├── assets
├── architecture.png
└── blinking_elastic2.gif
├── notebooks
├── Searching_with_ElasticTransformers.ipynb
└── Setting_up_ElasticTransformers.ipynb
├── requirements.txt
└── src
├── database.py
└── logger.py
/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints/
2 | .DS_store
3 | .vscode/
4 | __pycache__/
5 | index_spec
6 |
7 | logs/
8 | data/
9 |
10 | notebooks/Experiments*
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ElasticTransformers
2 | Semantic Elasticsearch with Sentence Transformers. We will use the power of Elastic and the magic of BERT to index a million articles and perform lexical and semantic search on them.
3 |
4 | The purpose is to provide an easy-to-use way of setting up your own Elasticsearch with near state-of-the-art capabilities of contextual embeddings / semantic search using NLP transformers.
5 |
6 | ## Overview
7 |
8 |
9 |
10 |
11 |
12 | The above setup works as follows
13 | - Set up an Elasticsearch server with Docker
14 | - Collect the dataset
15 | - Use sentence-transformers to index them onto Elastic (takes about 3 hrs on 4 CPU cores)
16 | - Look at some comparison examples between lexical and semantic search
17 |
18 | ## Setup
19 | ### Set up your environment
20 | My environment is called `et` and I use conda for this. Navigate inside the project directory
21 | ```bash
22 | conda create --name et python=3.7
23 | conda install -n et nb_conda_kernels
24 | conda activate et
25 | pip install -r requirements.txt
26 | ```
27 |
28 | ### Get the data
29 | For this tutorial I am using [A Million News Headlines](https://www.kaggle.com/therohk/million-headlines "Kaggle A Million News Headlines") by Rohk. Download it and place it in the data folder inside the project dir.
30 |
31 | elastic_transformers/
32 | ├── data/
33 |
34 | You will find that the steps are otherwise pretty abstracted so you can also do this with your dataset of choice
35 |
36 | ### Elasticsearch with Docker
37 | Follow the instructions on setting up Elastic with Docker from Elastic's page [here](https://www.elastic.co/guide/en/elasticsearch/reference/current/docker.html)
38 | For this tutorial, you only need to run the two steps:
39 | - [Pulling the image](https://www.elastic.co/guide/en/elasticsearch/reference/current/docker.html#_pulling_the_image)
40 | - [Starting a single node cluster with Docker](https://www.elastic.co/guide/en/elasticsearch/reference/current/docker.html#docker-cli-run-dev-mode)
41 |
42 | ## Features
43 |
44 | The repo introduces the ElasticTransformers class: utilities which help create, index and query Elasticsearch indices which include embeddings.
45 |
46 | Initiate the connection links as well as (optionally) the name of the index to work with
47 | ```python
48 | et=ElasticTransformers(url='http://localhost:9300',index_name='et-tiny')
49 | ```
50 | *create_index_spec* - defines the mapping for the index. Lists of relevant fields can
51 | be provided for keyword search or semantic (dense vector) search.
52 | It also has parameters for the size of the dense vector as those can vary
53 | *create_index* - uses the spec created earlier to create an index ready for search
54 |
55 | ```py
56 | et.create_index_spec(
57 | text_fields=['publish_date','headline_text'],
58 | dense_fields=['headline_text_embedding'],
59 | dense_fields_dim=768
60 | )
61 | et.create_index()
62 | ```
63 |
64 | *write_large_csv* - breaks up a large csv file into chunks and iteratively uses a predefined
65 | embedding utility to create the embeddings list for each chunk and subsequently feed results to the index
66 | ```py
67 | et.write_large_csv('data/tiny_sample.csv',
68 | chunksize=1000,
69 | embedder=embed_wrapper,
70 | field_to_embed='headline_text')
71 | ```
72 | *search* - allows you to select either keyword (‘match’ in Elastic) or semantic (dense in Elastic)
73 | search. Notably it requires the same embedding function used in write_large_csv
74 | ```py
75 | et.search(query='search these terms',
76 | field='headline_text',
77 | type='match',
78 | embedder=embed_wrapper,
79 | size = 1000)
80 | ```
81 |
82 | ## Usage
83 | After successful setup, use the following notebooks to make this all work
84 | - [Setting up the index](../master/notebooks/Setting_up_ElasticTransformers.ipynb)
85 | - [Searching](../master/notebooks/Searching_with_ElasticTransformers.ipynb)
86 |
87 | ## References
88 | This repo combines together the following amazing works by brilliant people. Please check out their work if you haven't done so yet...
89 |
90 | ### The ML part
91 | - [sentence-transformers](https://github.com/UKPLab/sentence-transformers)
92 | - [transformers](https://github.com/huggingface/transformers)
93 | - [BERT](https://github.com/google-research/bert)
94 | ### The engineering part
95 | - [Elasticsearch](https://www.elastic.co/home)
96 | - [Docker](https://hub.docker.com)
97 |
--------------------------------------------------------------------------------
/assets/architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/md-experiments/elastic_transformers/9f5920ab14d814739138544f4711567b8b762e5a/assets/architecture.png
--------------------------------------------------------------------------------
/assets/blinking_elastic2.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/md-experiments/elastic_transformers/9f5920ab14d814739138544f4711567b8b762e5a/assets/blinking_elastic2.gif
--------------------------------------------------------------------------------
/notebooks/Searching_with_ElasticTransformers.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "%load_ext autoreload\n",
10 | "import os\n",
11 | "os.chdir(os.path.abspath(os.curdir).replace('notebooks',''))"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 2,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "import datetime\n",
21 | "from tqdm import trange\n",
22 | "import pandas as pd\n",
23 | "import matplotlib.pyplot as plt\n",
24 | "pd.set_option('display.max_colwidth', 120)"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 3,
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "%autoreload 2\n",
34 | "\n",
35 | "from src.database import ElasticTransformers"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 4,
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "from sentence_transformers import SentenceTransformer\n",
45 | "\n",
46 | "bert_embedder = SentenceTransformer('bert-base-nli-mean-tokens')"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 5,
52 | "metadata": {},
53 | "outputs": [],
54 | "source": [
55 | "def embed_wrapper(ls):\n",
56 | " \"\"\"\n",
57 | " Helper function which simplifies the embedding call and makes loading data into elastic easier\n",
58 | " \"\"\"\n",
59 | " results=bert_embedder.encode(ls, convert_to_tensor=True)\n",
60 | " results = [r.tolist() for r in results]\n",
61 | " return results\n"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 6,
67 | "metadata": {},
68 | "outputs": [
69 | {
70 | "data": {
71 | "text/plain": [
72 | "True"
73 | ]
74 | },
75 | "execution_count": 6,
76 | "metadata": {},
77 | "output_type": "execute_result"
78 | }
79 | ],
80 | "source": [
81 | "et=ElasticTransformers(index_name='et-large')\n",
82 | "et.ping()"
83 | ]
84 | },
85 | {
86 | "cell_type": "markdown",
87 | "metadata": {},
88 | "source": [
89 | "# Search Experiments\n",
90 | "\n",
91 | "To analyse results, I compared top results side by side on a few searches. \n",
92 | "\n",
93 | "Approach is to take the top 10 hits, after removing some of the noisy results (duplicates or “headlines” of just one word).\n"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": 7,
99 | "metadata": {},
100 | "outputs": [],
101 | "source": [
102 | "def select_search_results(df,top_n=10):\n",
103 | " # more than four tokens (filtering out some meaningless headlines)\n",
104 | " df=df[df.headline_text.apply(lambda x: len(x.split())>4)].copy()\n",
105 | " # remove exact duplicates\n",
106 | " df=df.groupby('headline_text', as_index=False).first()\n",
107 | " df=df.sort_values('_score',ascending=False)\n",
108 | " df=df.reset_index(drop=True)\n",
109 | " return df.head(top_n)"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 31,
115 | "metadata": {},
116 | "outputs": [
117 | {
118 | "name": "stdout",
119 | "output_type": "stream",
120 | "text": [
121 | "KEYWORD SEARCH RESULTS\n"
122 | ]
123 | },
124 | {
125 | "data": {
126 | "text/html": [
127 | "\n",
128 | "\n",
141 | "
\n",
142 | " \n",
143 | " \n",
144 | " \n",
145 | " headline_text \n",
146 | " _score \n",
147 | " \n",
148 | " \n",
149 | " \n",
150 | " \n",
151 | " 0 \n",
152 | " public warned of mozzie virus threat \n",
153 | " 13.311735 \n",
154 | " \n",
155 | " \n",
156 | " 1 \n",
157 | " cattle producers warned of virus threat \n",
158 | " 13.311735 \n",
159 | " \n",
160 | " \n",
161 | " 2 \n",
162 | " expert plays down hendra virus threat \n",
163 | " 13.295192 \n",
164 | " \n",
165 | " \n",
166 | " 3 \n",
167 | " residents reminded of mozzie virus threat \n",
168 | " 13.213888 \n",
169 | " \n",
170 | " \n",
171 | " 4 \n",
172 | " mozzie virus threat sparks health alert \n",
173 | " 13.213888 \n",
174 | " \n",
175 | " \n",
176 | " 5 \n",
177 | " report reveals lower mozzie virus threat \n",
178 | " 13.213888 \n",
179 | " \n",
180 | " \n",
181 | " 6 \n",
182 | " hendra like virus identified as potential threat \n",
183 | " 12.498677 \n",
184 | " \n",
185 | " \n",
186 | " 7 \n",
187 | " hendra virus poses constant threat chief vet \n",
188 | " 12.498677 \n",
189 | " \n",
190 | " \n",
191 | " 8 \n",
192 | " public warned of mossie borne virus threat \n",
193 | " 12.498677 \n",
194 | " \n",
195 | " \n",
196 | " 9 \n",
197 | " sunraysia fears watermelon virus threat from nt \n",
198 | " 12.483357 \n",
199 | " \n",
200 | " \n",
201 | "
\n",
202 | "
"
203 | ],
204 | "text/plain": [
205 | " headline_text _score\n",
206 | "0 public warned of mozzie virus threat 13.311735\n",
207 | "1 cattle producers warned of virus threat 13.311735\n",
208 | "2 expert plays down hendra virus threat 13.295192\n",
209 | "3 residents reminded of mozzie virus threat 13.213888\n",
210 | "4 mozzie virus threat sparks health alert 13.213888\n",
211 | "5 report reveals lower mozzie virus threat 13.213888\n",
212 | "6 hendra like virus identified as potential threat 12.498677\n",
213 | "7 hendra virus poses constant threat chief vet 12.498677\n",
214 | "8 public warned of mossie borne virus threat 12.498677\n",
215 | "9 sunraysia fears watermelon virus threat from nt 12.483357"
216 | ]
217 | },
218 | "metadata": {},
219 | "output_type": "display_data"
220 | },
221 | {
222 | "name": "stdout",
223 | "output_type": "stream",
224 | "text": [
225 | "CONTEXTUAL SEARCH RESULTS\n"
226 | ]
227 | },
228 | {
229 | "data": {
230 | "text/html": [
231 | "\n",
232 | "\n",
245 | "
\n",
246 | " \n",
247 | " \n",
248 | " \n",
249 | " headline_text \n",
250 | " _score \n",
251 | " \n",
252 | " \n",
253 | " \n",
254 | " \n",
255 | " 0 \n",
256 | " hendra like virus identified as potential threat \n",
257 | " 1.859408 \n",
258 | " \n",
259 | " \n",
260 | " 1 \n",
261 | " hendra report author warns of virus risk \n",
262 | " 1.853364 \n",
263 | " \n",
264 | " \n",
265 | " 2 \n",
266 | " fresh concerns over hendra virus outbreak \n",
267 | " 1.836927 \n",
268 | " \n",
269 | " \n",
270 | " 3 \n",
271 | " virus puts giteau in doubt \n",
272 | " 1.823136 \n",
273 | " \n",
274 | " \n",
275 | " 4 \n",
276 | " hendra virus case under investigation \n",
277 | " 1.817768 \n",
278 | " \n",
279 | " \n",
280 | " 5 \n",
281 | " who highlight dangers of vector borne diseases \n",
282 | " 1.804388 \n",
283 | " \n",
284 | " \n",
285 | " 6 \n",
286 | " potentially deadly virus sparks mozzie warning \n",
287 | " 1.799419 \n",
288 | " \n",
289 | " \n",
290 | " 7 \n",
291 | " who warns threat from vector borne diseases \n",
292 | " 1.793783 \n",
293 | " \n",
294 | " \n",
295 | " 8 \n",
296 | " fears as png diseases spread \n",
297 | " 1.791913 \n",
298 | " \n",
299 | " \n",
300 | " 9 \n",
301 | " deadly hendra virus strikes again \n",
302 | " 1.788590 \n",
303 | " \n",
304 | " \n",
305 | "
\n",
306 | "
"
307 | ],
308 | "text/plain": [
309 | " headline_text _score\n",
310 | "0 hendra like virus identified as potential threat 1.859408\n",
311 | "1 hendra report author warns of virus risk 1.853364\n",
312 | "2 fresh concerns over hendra virus outbreak 1.836927\n",
313 | "3 virus puts giteau in doubt 1.823136\n",
314 | "4 hendra virus case under investigation 1.817768\n",
315 | "5 who highlight dangers of vector borne diseases 1.804388\n",
316 | "6 potentially deadly virus sparks mozzie warning 1.799419\n",
317 | "7 who warns threat from vector borne diseases 1.793783\n",
318 | "8 fears as png diseases spread 1.791913\n",
319 | "9 deadly hendra virus strikes again 1.788590"
320 | ]
321 | },
322 | "metadata": {},
323 | "output_type": "display_data"
324 | }
325 | ],
326 | "source": [
327 | "query='virus threat'\n",
328 | "print('KEYWORD SEARCH RESULTS')\n",
329 | "df0=et.search(query,'headline_text',type='match',embedder=embed_wrapper, size = 1000)\n",
330 | "display(select_search_results(df0))\n",
331 | "print('CONTEXTUAL SEARCH RESULTS')\n",
332 | "df1=et.search(query,'headline_text',type='dense',embedder=embed_wrapper, size = 1000)\n",
333 | "display(select_search_results(df1))\n",
334 | "\n"
335 | ]
336 | },
337 | {
338 | "cell_type": "code",
339 | "execution_count": 32,
340 | "metadata": {},
341 | "outputs": [
342 | {
343 | "data": {
344 | "text/html": [
345 | "\n",
346 | "\n",
359 | "
\n",
360 | " \n",
361 | " \n",
362 | " \n",
363 | " headline_text \n",
364 | " _score \n",
365 | " \n",
366 | " \n",
367 | " \n",
368 | " \n",
369 | " 5 \n",
370 | " who highlight dangers of vector borne diseases \n",
371 | " 1.804388 \n",
372 | " \n",
373 | " \n",
374 | " 8 \n",
375 | " fears as png diseases spread \n",
376 | " 1.791913 \n",
377 | " \n",
378 | " \n",
379 | " 21 \n",
380 | " the odds of an outbreak \n",
381 | " 1.773913 \n",
382 | " \n",
383 | " \n",
384 | " 25 \n",
385 | " flood waters carry risk of disease infection \n",
386 | " 1.769219 \n",
387 | " \n",
388 | " \n",
389 | " 27 \n",
390 | " port uncertain of impact of viral meningitis outbreak \n",
391 | " 1.767367 \n",
392 | " \n",
393 | " \n",
394 | " 30 \n",
395 | " human error blamed for infection scare \n",
396 | " 1.764524 \n",
397 | " \n",
398 | " \n",
399 | " 34 \n",
400 | " oakey defence base contaminants linked to serious disease \n",
401 | " 1.758969 \n",
402 | " \n",
403 | " \n",
404 | " 35 \n",
405 | " dangerous parasite rife in nt \n",
406 | " 1.757938 \n",
407 | " \n",
408 | " \n",
409 | " 40 \n",
410 | " academic fears spread of mozzie borne disease \n",
411 | " 1.755103 \n",
412 | " \n",
413 | " \n",
414 | " 44 \n",
415 | " sti symptoms dangers and treatments \n",
416 | " 1.751830 \n",
417 | " \n",
418 | " \n",
419 | "
\n",
420 | "
"
421 | ],
422 | "text/plain": [
423 | " headline_text _score\n",
424 | "5 who highlight dangers of vector borne diseases 1.804388\n",
425 | "8 fears as png diseases spread 1.791913\n",
426 | "21 the odds of an outbreak 1.773913\n",
427 | "25 flood waters carry risk of disease infection 1.769219\n",
428 | "27 port uncertain of impact of viral meningitis outbreak 1.767367\n",
429 | "30 human error blamed for infection scare 1.764524\n",
430 | "34 oakey defence base contaminants linked to serious disease 1.758969\n",
431 | "35 dangerous parasite rife in nt 1.757938\n",
432 | "40 academic fears spread of mozzie borne disease 1.755103\n",
433 | "44 sti symptoms dangers and treatments 1.751830"
434 | ]
435 | },
436 | "execution_count": 32,
437 | "metadata": {},
438 | "output_type": "execute_result"
439 | }
440 | ],
441 | "source": [
442 | "df11=select_search_results(df1,1000)\n",
443 | "\n",
444 | "df11[df11.headline_text.apply(lambda x: all([q not in x for q in query.split()]))].head(10)\n"
445 | ]
446 | },
447 | {
448 | "cell_type": "code",
449 | "execution_count": 35,
450 | "metadata": {},
451 | "outputs": [
452 | {
453 | "name": "stdout",
454 | "output_type": "stream",
455 | "text": [
456 | "KEYWORD SEARCH RESULTS\n"
457 | ]
458 | },
459 | {
460 | "data": {
461 | "text/html": [
462 | "\n",
463 | "\n",
476 | "
\n",
477 | " \n",
478 | " \n",
479 | " \n",
480 | " headline_text \n",
481 | " _score \n",
482 | " \n",
483 | " \n",
484 | " \n",
485 | " \n",
486 | " 0 \n",
487 | " perth storm a natural disaster \n",
488 | " 16.165108 \n",
489 | " \n",
490 | " \n",
491 | " 1 \n",
492 | " more natural disaster planning needed \n",
493 | " 16.165108 \n",
494 | " \n",
495 | " \n",
496 | " 2 \n",
497 | " gunnedah declared natural disaster area \n",
498 | " 16.165108 \n",
499 | " \n",
500 | " \n",
501 | " 3 \n",
502 | " bushfire prompts natural disaster declaration \n",
503 | " 16.165108 \n",
504 | " \n",
505 | " \n",
506 | " 4 \n",
507 | " maclean fire not natural disaster \n",
508 | " 16.032738 \n",
509 | " \n",
510 | " \n",
511 | " 5 \n",
512 | " state helps natural disaster victims \n",
513 | " 15.960692 \n",
514 | " \n",
515 | " \n",
516 | " 6 \n",
517 | " esperance declared natural disaster area \n",
518 | " 15.960692 \n",
519 | " \n",
520 | " \n",
521 | " 7 \n",
522 | " flooding sparks natural disaster declarations \n",
523 | " 15.960692 \n",
524 | " \n",
525 | " \n",
526 | " 8 \n",
527 | " government declares natural disaster areas \n",
528 | " 15.960692 \n",
529 | " \n",
530 | " \n",
531 | " 9 \n",
532 | " nsw natural disaster zone widened \n",
533 | " 15.960692 \n",
534 | " \n",
535 | " \n",
536 | "
\n",
537 | "
"
538 | ],
539 | "text/plain": [
540 | " headline_text _score\n",
541 | "0 perth storm a natural disaster 16.165108\n",
542 | "1 more natural disaster planning needed 16.165108\n",
543 | "2 gunnedah declared natural disaster area 16.165108\n",
544 | "3 bushfire prompts natural disaster declaration 16.165108\n",
545 | "4 maclean fire not natural disaster 16.032738\n",
546 | "5 state helps natural disaster victims 15.960692\n",
547 | "6 esperance declared natural disaster area 15.960692\n",
548 | "7 flooding sparks natural disaster declarations 15.960692\n",
549 | "8 government declares natural disaster areas 15.960692\n",
550 | "9 nsw natural disaster zone widened 15.960692"
551 | ]
552 | },
553 | "metadata": {},
554 | "output_type": "display_data"
555 | },
556 | {
557 | "name": "stdout",
558 | "output_type": "stream",
559 | "text": [
560 | "CONTEXTUAL SEARCH RESULTS\n"
561 | ]
562 | },
563 | {
564 | "data": {
565 | "text/html": [
566 | "\n",
567 | "\n",
580 | "
\n",
581 | " \n",
582 | " \n",
583 | " \n",
584 | " headline_text \n",
585 | " _score \n",
586 | " \n",
587 | " \n",
588 | " \n",
589 | " \n",
590 | " 0 \n",
591 | " natural disaster declared in broken hill \n",
592 | " 1.879506 \n",
593 | " \n",
594 | " \n",
595 | " 1 \n",
596 | " natural disaster declared in storm area \n",
597 | " 1.877636 \n",
598 | " \n",
599 | " \n",
600 | " 2 \n",
601 | " lismore declared a natural disaster area \n",
602 | " 1.867503 \n",
603 | " \n",
604 | " \n",
605 | " 3 \n",
606 | " broken hill declared a natural disaster area \n",
607 | " 1.854052 \n",
608 | " \n",
609 | " \n",
610 | " 4 \n",
611 | " natural disasters take toll on austar \n",
612 | " 1.848880 \n",
613 | " \n",
614 | " \n",
615 | " 5 \n",
616 | " perth storm a natural disaster \n",
617 | " 1.836232 \n",
618 | " \n",
619 | " \n",
620 | " 6 \n",
621 | " call for nambucca valley natural disaster \n",
622 | " 1.834766 \n",
623 | " \n",
624 | " \n",
625 | " 7 \n",
626 | " wagga albury declared natural disaster areas \n",
627 | " 1.831764 \n",
628 | " \n",
629 | " \n",
630 | " 8 \n",
631 | " ballina area declared natural disaster zone \n",
632 | " 1.823700 \n",
633 | " \n",
634 | " \n",
635 | " 9 \n",
636 | " disasters take toll on shire \n",
637 | " 1.822510 \n",
638 | " \n",
639 | " \n",
640 | "
\n",
641 | "
"
642 | ],
643 | "text/plain": [
644 | " headline_text _score\n",
645 | "0 natural disaster declared in broken hill 1.879506\n",
646 | "1 natural disaster declared in storm area 1.877636\n",
647 | "2 lismore declared a natural disaster area 1.867503\n",
648 | "3 broken hill declared a natural disaster area 1.854052\n",
649 | "4 natural disasters take toll on austar 1.848880\n",
650 | "5 perth storm a natural disaster 1.836232\n",
651 | "6 call for nambucca valley natural disaster 1.834766\n",
652 | "7 wagga albury declared natural disaster areas 1.831764\n",
653 | "8 ballina area declared natural disaster zone 1.823700\n",
654 | "9 disasters take toll on shire 1.822510"
655 | ]
656 | },
657 | "metadata": {},
658 | "output_type": "display_data"
659 | }
660 | ],
661 | "source": [
662 | "query='natural disaster'\n",
663 | "print('KEYWORD SEARCH RESULTS')\n",
664 | "df0=et.search(query,'headline_text',type='match',embedder=embed_wrapper, size = 1000)\n",
665 | "display(select_search_results(df0,10))\n",
666 | "print('CONTEXTUAL SEARCH RESULTS')\n",
667 | "df1=et.search(query,'headline_text',type='dense',embedder=embed_wrapper, size = 1000)\n",
668 | "display(select_search_results(df1,10))\n",
669 | "\n"
670 | ]
671 | },
672 | {
673 | "cell_type": "code",
674 | "execution_count": 36,
675 | "metadata": {},
676 | "outputs": [
677 | {
678 | "data": {
679 | "text/html": [
680 | "\n",
681 | "\n",
694 | "
\n",
695 | " \n",
696 | " \n",
697 | " \n",
698 | " headline_text \n",
699 | " _score \n",
700 | " \n",
701 | " \n",
702 | " \n",
703 | " \n",
704 | " 28 \n",
705 | " power supply at risk if flood situation worsens \n",
706 | " 1.787914 \n",
707 | " \n",
708 | " \n",
709 | " 36 \n",
710 | " widespread damage from freak storm \n",
711 | " 1.779122 \n",
712 | " \n",
713 | " \n",
714 | " 40 \n",
715 | " catastrophic fire conditions for wa \n",
716 | " 1.776245 \n",
717 | " \n",
718 | " \n",
719 | " 43 \n",
720 | " leigh creek ucg project lifeline or toxic environmental hazard \n",
721 | " 1.772300 \n",
722 | " \n",
723 | " \n",
724 | " 46 \n",
725 | " qlds wild weather caused by freak event \n",
726 | " 1.770647 \n",
727 | " \n",
728 | " \n",
729 | " 48 \n",
730 | " wild weather causes qld flooding \n",
731 | " 1.769357 \n",
732 | " \n",
733 | " \n",
734 | " 50 \n",
735 | " cyclone damaged water supply fixed \n",
736 | " 1.767906 \n",
737 | " \n",
738 | " \n",
739 | " 53 \n",
740 | " humungous effort on catastrophic day \n",
741 | " 1.766371 \n",
742 | " \n",
743 | " \n",
744 | " 56 \n",
745 | " nsw floods receding water reveals destruction \n",
746 | " 1.766130 \n",
747 | " \n",
748 | " \n",
749 | " 58 \n",
750 | " cyclone olwyn carnarvon water supply problems \n",
751 | " 1.765932 \n",
752 | " \n",
753 | " \n",
754 | "
\n",
755 | "
"
756 | ],
757 | "text/plain": [
758 | " headline_text _score\n",
759 | "28 power supply at risk if flood situation worsens 1.787914\n",
760 | "36 widespread damage from freak storm 1.779122\n",
761 | "40 catastrophic fire conditions for wa 1.776245\n",
762 | "43 leigh creek ucg project lifeline or toxic environmental hazard 1.772300\n",
763 | "46 qlds wild weather caused by freak event 1.770647\n",
764 | "48 wild weather causes qld flooding 1.769357\n",
765 | "50 cyclone damaged water supply fixed 1.767906\n",
766 | "53 humungous effort on catastrophic day 1.766371\n",
767 | "56 nsw floods receding water reveals destruction 1.766130\n",
768 | "58 cyclone olwyn carnarvon water supply problems 1.765932"
769 | ]
770 | },
771 | "execution_count": 36,
772 | "metadata": {},
773 | "output_type": "execute_result"
774 | }
775 | ],
776 | "source": [
777 | "df11=select_search_results(df1,1000)\n",
778 | "\n",
779 | "df11[df11.headline_text.apply(lambda x: all([q not in x for q in query.split()]))].head(10)\n"
780 | ]
781 | },
782 | {
783 | "cell_type": "code",
784 | "execution_count": 24,
785 | "metadata": {},
786 | "outputs": [
787 | {
788 | "name": "stdout",
789 | "output_type": "stream",
790 | "text": [
791 | "KEYWORD SEARCH RESULTS\n"
792 | ]
793 | },
794 | {
795 | "data": {
796 | "text/html": [
797 | "\n",
798 | "\n",
811 | "
\n",
812 | " \n",
813 | " \n",
814 | " \n",
815 | " headline_text \n",
816 | " _score \n",
817 | " \n",
818 | " \n",
819 | " \n",
820 | " \n",
821 | " 0 \n",
822 | " regulatory madness in the banking world \n",
823 | " 18.955673 \n",
824 | " \n",
825 | " \n",
826 | " 1 \n",
827 | " china pushes through banking sector reform \n",
828 | " 14.721983 \n",
829 | " \n",
830 | " \n",
831 | " 2 \n",
832 | " swan to announce banking reform package \n",
833 | " 14.582440 \n",
834 | " \n",
835 | " \n",
836 | " 3 \n",
837 | " open banking more choice or data risk \n",
838 | " 13.001936 \n",
839 | " \n",
840 | " \n",
841 | " 4 \n",
842 | " swan wraps up meeting on banking rules reform \n",
843 | " 12.904054 \n",
844 | " \n",
845 | " \n",
846 | " 5 \n",
847 | " govt internet regulatory plan criticised \n",
848 | " 12.000524 \n",
849 | " \n",
850 | " \n",
851 | " 6 \n",
852 | " regulatory duplication strangling aquaculture development \n",
853 | " 12.000524 \n",
854 | " \n",
855 | " \n",
856 | " 7 \n",
857 | " us flags financial regulatory reforms \n",
858 | " 11.530772 \n",
859 | " \n",
860 | " \n",
861 | " 8 \n",
862 | " mcconnell a regulatory train wreck \n",
863 | " 11.530772 \n",
864 | " \n",
865 | " \n",
866 | " 9 \n",
867 | " billabong rescue package clears regulatory hurdle \n",
868 | " 11.220221 \n",
869 | " \n",
870 | " \n",
871 | "
\n",
872 | "
"
873 | ],
874 | "text/plain": [
875 | " headline_text _score\n",
876 | "0 regulatory madness in the banking world 18.955673\n",
877 | "1 china pushes through banking sector reform 14.721983\n",
878 | "2 swan to announce banking reform package 14.582440\n",
879 | "3 open banking more choice or data risk 13.001936\n",
880 | "4 swan wraps up meeting on banking rules reform 12.904054\n",
881 | "5 govt internet regulatory plan criticised 12.000524\n",
882 | "6 regulatory duplication strangling aquaculture development 12.000524\n",
883 | "7 us flags financial regulatory reforms 11.530772\n",
884 | "8 mcconnell a regulatory train wreck 11.530772\n",
885 | "9 billabong rescue package clears regulatory hurdle 11.220221"
886 | ]
887 | },
888 | "metadata": {},
889 | "output_type": "display_data"
890 | },
891 | {
892 | "name": "stdout",
893 | "output_type": "stream",
894 | "text": [
895 | "CONTEXTUAL SEARCH RESULTS\n"
896 | ]
897 | },
898 | {
899 | "data": {
900 | "text/html": [
901 | "\n",
902 | "\n",
915 | "
\n",
916 | " \n",
917 | " \n",
918 | " \n",
919 | " headline_text \n",
920 | " _score \n",
921 | " \n",
922 | " \n",
923 | " \n",
924 | " \n",
925 | " 0 \n",
926 | " us flags financial regulatory reforms \n",
927 | " 1.863391 \n",
928 | " \n",
929 | " \n",
930 | " 1 \n",
931 | " the banking royal commissions recommendations \n",
932 | " 1.850401 \n",
933 | " \n",
934 | " \n",
935 | " 2 \n",
936 | " what can we expect from the banking inquiry \n",
937 | " 1.841991 \n",
938 | " \n",
939 | " \n",
940 | " 3 \n",
941 | " banking royal commission superannuation hearings \n",
942 | " 1.836119 \n",
943 | " \n",
944 | " \n",
945 | " 4 \n",
946 | " banking royal commission anz financial advice clients interest \n",
947 | " 1.833776 \n",
948 | " \n",
949 | " \n",
950 | " 5 \n",
951 | " rba considers cap on credit card surcharges \n",
952 | " 1.832314 \n",
953 | " \n",
954 | " \n",
955 | " 6 \n",
956 | " rba on banks interest rate moves \n",
957 | " 1.831732 \n",
958 | " \n",
959 | " \n",
960 | " 7 \n",
961 | " will changes to financial advice laws see the \n",
962 | " 1.831395 \n",
963 | " \n",
964 | " \n",
965 | " 8 \n",
966 | " commonwealth bank responds to financial planning inquiry \n",
967 | " 1.831379 \n",
968 | " \n",
969 | " \n",
970 | " 9 \n",
971 | " reserve bank financial stability review \n",
972 | " 1.830978 \n",
973 | " \n",
974 | " \n",
975 | "
\n",
976 | "
"
977 | ],
978 | "text/plain": [
979 | " headline_text _score\n",
980 | "0 us flags financial regulatory reforms 1.863391\n",
981 | "1 the banking royal commissions recommendations 1.850401\n",
982 | "2 what can we expect from the banking inquiry 1.841991\n",
983 | "3 banking royal commission superannuation hearings 1.836119\n",
984 | "4 banking royal commission anz financial advice clients interest 1.833776\n",
985 | "5 rba considers cap on credit card surcharges 1.832314\n",
986 | "6 rba on banks interest rate moves 1.831732\n",
987 | "7 will changes to financial advice laws see the 1.831395\n",
988 | "8 commonwealth bank responds to financial planning inquiry 1.831379\n",
989 | "9 reserve bank financial stability review 1.830978"
990 | ]
991 | },
992 | "metadata": {},
993 | "output_type": "display_data"
994 | }
995 | ],
996 | "source": [
997 | "#query='virus threat'\n",
998 | "query='regulatory risk banking reform'\n",
999 | "print('KEYWORD SEARCH RESULTS')\n",
1000 | "df0=et.search(query,'headline_text',type='match',embedder=embed_wrapper, size = 1000)\n",
1001 | "display(select_search_results(df0,10))\n",
1002 | "print('CONTEXTUAL SEARCH RESULTS')\n",
1003 | "df1=et.search(query,'headline_text',type='dense',embedder=embed_wrapper, size = 1000)\n",
1004 | "display(select_search_results(df1,10))\n",
1005 | "\n"
1006 | ]
1007 | },
1008 | {
1009 | "cell_type": "markdown",
1010 | "metadata": {},
1011 | "source": [
1012 | "# Speed comparison\n",
1013 | "\n",
1014 | "Below we perform some non-functional testing on the impact of size of index together with search parameters on time of the query. \n",
1015 | "We have tested with 3 index sizes: 1k (Tiny), 100k (Medium) & 1.1mn (Large). We have not paid particular attention to any sampling effects: for instance, the 1k index is simply the first 1000 headlines in the data, which might mean it is not well randomized; we have not studied this."
1016 | ]
1017 | },
1018 | {
1019 | "cell_type": "code",
1020 | "execution_count": 38,
1021 | "metadata": {},
1022 | "outputs": [
1023 | {
1024 | "name": "stderr",
1025 | "output_type": "stream",
1026 | "text": [
1027 | "100%|██████████| 10/10 [26:27<00:00, 158.76s/it]\n"
1028 | ]
1029 | }
1030 | ],
1031 | "source": [
1032 | "\n",
1033 | "queries=['Amazon','news','security thread','tech news','new vaccine developed new cure','results game today all winners']\n",
1034 | "result_sizes=[1,10,100]\n",
1035 | "repeat=10\n",
1036 | "\n",
1037 | "col_names=['search index','search type','search size' , 'query', '# tokens query','repeat','time taken']\n",
1038 | "search_to_compare={'match':'Keyword Search','dense':'Contextual Search',}\n",
1039 | "indices_to_compare={'et-tiny':'Tiny','et-medium':'Medium','et-large':'Large'}\n",
1040 | "\n",
1041 | "res=[]\n",
1042 | "for i in trange(repeat):\n",
1043 | " for index in indices_to_compare:\n",
1044 | " for search_type in search_to_compare:\n",
1045 | " for query in queries:\n",
1046 | " for size in result_sizes:\n",
1047 | " t0=datetime.datetime.now()\n",
1048 | " _ = et.search(query=query,\n",
1049 | " field='headline_text',\n",
1050 | " index_name=index,\n",
1051 | " type=search_type,\n",
1052 | " embedder=embed_wrapper, \n",
1053 | " size=size)\n",
1054 | " t1=datetime.datetime.now()\n",
1055 | " time_taken=(t1-t0).total_seconds()\n",
1056 | " res.append([indices_to_compare[index], search_to_compare[search_type], size, query, len(query.split()), i,time_taken])\n",
1057 | " \n",
1058 | "result_df=pd.DataFrame(res, columns=col_names)\n",
1059 | "result_df.to_csv('data/results_search.csv')\n"
1060 | ]
1061 | },
1062 | {
1063 | "cell_type": "markdown",
1064 | "metadata": {},
1065 | "source": [
1066 | "Compare speed across different index sizes and search types for\n",
1067 | "- query token length\n",
1068 | "- result size\n",
1069 | "- index size\n",
1070 | "\n",
1071 | "Results are below"
1072 | ]
1073 | },
1074 | {
1075 | "cell_type": "code",
1076 | "execution_count": 39,
1077 | "metadata": {},
1078 | "outputs": [
1079 | {
1080 | "data": {
1081 | "image/png": "\n",
1082 | "text/plain": [
1083 | ""
1084 | ]
1085 | },
1086 | "metadata": {
1087 | "needs_background": "light"
1088 | },
1089 | "output_type": "display_data"
1090 | },
1091 | {
1092 | "data": {
1093 | "image/png": "\n",
1094 | "text/plain": [
1095 | ""
1096 | ]
1097 | },
1098 | "metadata": {
1099 | "needs_background": "light"
1100 | },
1101 | "output_type": "display_data"
1102 | }
1103 | ],
1104 | "source": [
1105 | "compare='search size' #'# tokens query'\n",
1106 | "compare='# tokens query'\n",
1107 | "comparisons=['search size', '# tokens query']\n",
1108 | "\n",
1109 | "for compare in comparisons:\n",
1110 | " fig, axes = plt.subplots(nrows=1, ncols=len(search_to_compare), figsize=(12,4))\n",
1111 | " fig.suptitle(f'Comparing {compare}',size=18)\n",
1112 | " for (c,search_type) in enumerate(search_to_compare.values()):\n",
1113 | " pvt=pd.pivot_table(result_df[(result_df['search type']==search_type)&(result_df['repeat']>2)] \\\n",
1114 | " [['search index',compare,'time taken']],\\\n",
1115 | " values='time taken',\n",
1116 | " index=compare,\n",
1117 | " columns='search index',\n",
1118 | " aggfunc='mean',\n",
1119 | " )\n",
1120 | " pvt.plot.bar(title=search_type,ax=axes[c]) \n"
1121 | ]
1122 | },
1123 | {
1124 | "cell_type": "markdown",
1125 | "metadata": {},
1126 | "source": [
1127 | "The below box plot analyzes the deviation in search times after multiple repeated calls. Although some deviations are observed, they do not seem to be significant. Only results from the Large index are shown"
1128 | ]
1129 | },
1130 | {
1131 | "cell_type": "code",
1132 | "execution_count": 17,
1133 | "metadata": {},
1134 | "outputs": [
1135 | {
1136 | "data": {
1137 | "text/plain": [
1138 | ""
1139 | ]
1140 | },
1141 | "execution_count": 17,
1142 | "metadata": {},
1143 | "output_type": "execute_result"
1144 | },
1145 | {
1146 | "data": {
1147 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAWoAAAD4CAYAAADFAawfAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/d3fzzAAAACXBIWXMAAAsTAAALEwEAmpwYAAAPq0lEQVR4nO3dfbBcdX3H8fcHEuVRqOZqBYSgYpWKD+VaQSxFcKgKre2IDygqahutLWinRaN1QO3U4mgL7dBaY6oyFrGC1lq0CiOgqEhIIJBA1Kqgomgvo1LABwJ8+8c5V9a4N3cT7t78mvt+zezcs+fxe/ec/ezv/PbsbqoKSVK7dtjWBUiSNs+glqTGGdSS1DiDWpIaZ1BLUuMWjWOlS5YsqaVLl45j1ZK0XVqzZs0tVTUxbNpYgnrp0qWsXr16HKuWpO1Skm/ONM2uD0lqnEEtSY0zqCWpcQa1JDXOoJakxhnUktQ4g1qSGmdQS1LjxvKBF225JFu1nN8nLm3/bFE3oqqG3vZ7/QUzTjOkpYXBFvU8e/xbLuTWn2zcomWWLv/EFs2/x86Luea0o7doGUntMqjn2a0/2ciNpx8z1m1sabBLaptBPc92f8xyDjp7+Zi3ATDeFwNJ88egnme3bTh97NvYY+fFY9+GpPljUM+zLe32WLr8E2PvKpHUNoO6EZu7PC9vn3k5r/yQtn8GdSMMXEkz8TpqSWqcQS1JjTOoJalxBrUkNc6glqTGGdSS1DiDWpIaZ1BLUuMMaklq3EhBneTPklyXZH2Sc5PsNO7CJEmdWYM6yd7AycBkVT0W2BF4wbgLkyR1Ru36WATsnGQRsAvw3fGVJEkaNGtQV9V3gHcC3wJuBm6tqgs3nS/JsiSrk6yempqa+0olaYEapevjV4BnA/sDewG7Jjlh0/mqakVVTVbV5MTExNxXKkkL1ChdH08HbqiqqaraCHwUeMp4y5IkTRslqL8FHJJkl3Tfbn8UsGG8ZUmSpo3SR30FcD5wFbCuX2bFmOuSJPVG+oWXqjoNOG3MtUiShvCTiZLUOINakhpnUEtS4wxqSWqcQS1JjTOoJalxBrUkNc6glqTGGdSS1DiDWpIaZ1BLUuMMaklqnEEtSY0zqCWpcQa1JDXOoJakxhnUktQ4g1qSGmdQS1LjDGpJapxBLUmNM6glqXEGtSQ1zqCWpMYZ1JLUOINakhpnUEtS4wxqSWqcQS1JjTOoJalxBrUkNc6glqTGGdSS1DiDWpIaZ1BLUuMMaklq3EhBnWTPJOcn+XKSDUkOHXdhkqTOohHn+3vgU1V1XJL7AbuMsSZJ0oBZgzrJHsDhwIkAVXUncOd4y5IkTRul62N/YAp4X5Krk6xMsuumMyVZlmR1ktVTU1NzXqgkLVSjBPUi4DeAd1XVE4E7gOWbzlRVK6pqsqomJyYm5rhMSVq4Rgnqm4CbquqK/v75dMEtSZoHswZ1VX0P+HaSX+tHHQVcP9aqJEk/N+pVHycB5/RXfHwDeNn4SpIkDRopqKtqLTA53lIkScP4yURJapxBLUmNM6glqXEGtSQ1zqCWpMYZ1JLUOINakhpnUEtS4wxqSWqcQS1JjTOoJalxBrUkNc6glqTGGdSS1DiDWpIaZ1BLUuMMaklqnEEtSY0zqCWpcQa1JDXOoJakxhnUktQ4g1qSGmdQS1LjDGpJapxBLUmNM6glqXEGtSQ1zqCWpMYZ1JLUOINakhpnUEtS4wxqSWqcQS1JjTOoJalxIwd1kh2TXJ3kgnEWJEn6RVvSon4NsGFchUiShhspqJPsAxwDrBxvOZKkTY3aoj4TeB1wz0wzJFmWZHWS1VNTU3NRmySJEYI6ybHA/1TVms3NV1UrqmqyqiYnJibmrEBJWuhGaVEfBvxekhuBDwFHJvnXsVYlSfq5WYO6qt5QVftU1VLgBcDFVXXC2CuTJAFeRy1JzVu0JTNX1aXApWOpRJI0lC1qSWqcQS1JjTOoJal
xBrUkNc6glqTGGdSS1DiDWpIaZ1BLUuMMaklqnEEtSY0zqCWpcQa1JDXOoJakxhnUktQ4g1qSGmdQS1LjDGpJapxBLUmNM6glqXEGtSQ1zqCWpMYZ1JLUOINakhpnUEtS4wxqSWqcQS1JjTOoJalxBrUkNc6glqTGGdSS1DiDWpIaZ1BLUuMMaklqnEEtSY0zqCWpcbMGdZKHJbkkyfVJrkvymvkoTJLUWTTCPHcBf15VVyXZHViT5KKqun7MtUmSGKFFXVU3V9VV/fBtwAZg73EXJknqbFEfdZKlwBOBK4ZMW5ZkdZLVU1NTc1SeJGnkoE6yG/AR4LVV9b+bTq+qFVU1WVWTExMTc1mjJC1oIwV1ksV0IX1OVX10vCVJkgaNctVHgH8BNlTV342/JEnSoFFa1IcBLwaOTLK2vz1rzHVJknqzXp5XVZ8HMg+1SJKG8JOJktQ4g1qSGmdQS1LjDGpJapxBLUmNM6glqXEGtSQ1zqCWpMYZ1JLUOINakhpnUEtS4wxqSWqcQS1JjTOoJalxBrUkNc6glqTGGdSS1DiDWpIaZ1BLUuMMaklqnEEtSY0zqCWpcQa1JDXOoJakxhnUktQ4g1qSGmdQS1LjDGpJapxBLUmNM6glqXEGtSQ1zqCWpMYZ1JLUOINakhpnUEtS40YK6iTPSPKVJF9LsnzcRUmS7jVrUCfZEfhH4JnAgcDxSQ4cd2GSpM4oLerfBL5WVd+oqjuBDwHPHm9ZkqRpi0aYZ2/g2wP3bwKePJ5yJG0rB5190LxsZ91L183LdrYnowT1SJIsA5YB7LvvvnO1WknzxABt1yhdH98BHjZwf59+3C+oqhVVNVlVkxMTE3NVnyQteKME9ZXAAUn2T3I/4AXAx8dbliRp2qxdH1V1V5I/BT4N7Ai8t6quG3tlkiRgxD7qqvok8Mkx1yJJGsJPJkpS4wxqSWqcQS1JjTOoJalxqaq5X2kyBXxzzle8MC0BbtnWRUgz8PicO/tV1dAPoYwlqDV3kqyuqsltXYc0jMfn/LDrQ5IaZ1BLUuMM6vat2NYFSJvh8TkP7KOWpMbZopakxhnUktS4BRXUSX41yYeSfD3JmiSfTPKorVzXa5PsspXL7pnk1Vuz7MA63p/kuCHjD0lyRZK1STYkefN92c4IddyYZMk4t7HQJbl9YPhZSb6aZL9tWdO0JCcmOWvI+IckuSDJNUmuTzLWL3Wb6fmwvVgwQZ0kwL8Dl1bVI6rqYOANwEO2cpWvBbYqqIE9gfsU1JtxNrCsqp4APBb48H1dYZI5+yUgbb0kRwH/ADyzqrbJB8r6H7sexVuBi6rq8VV1ILB8nre/XVkwQQ08DdhYVf88PaKqrqmqy9J5R5L1SdYleT5AkiOSXJrk/CRfTnJOP+/JwF7AJUku6ec9OsnlSa5Kcl6S3ZLsl+S/kyxJskOSy5IcDZwOPKJv9b6j384F03UlOSvJif3wqUmu7Gtb0b/gbM6DgZv7/+/uqrq+X8+uSd6bZFWSq5M8ux+/tK/rqv72lIH//bIkHweuT7Jjknf2dVyb5KSBbZ7UL7suyaO3fhdpJkkOB94DHFtVX+/HndDvz7VJ3t3vo5cnOXNguT9KckaSU/rjlv7+xf3wkUnO6YeP7/fh+iRvH1jH7Un+Nsk1wKFJXta36lcBh81Q8kPpfl8VgKq6dmB9p/TH9LVJ3jIw/mPpznSvS/fTfjNt/yX9stck+cDANg9P8sUk39juWtdVtSBuwMnAGTNMew5wEd0PIzwE+BbdgXYEcCvdz4/tAFwOPLVf5kZgST+8BPgcsGt///XAqf3wHwLnAacA7+7HLQXWD2z/COCCgftnASf2ww8cGP8B4Hf74fcDxw35X04Ffkh39vBKYKd+/NuAE/rhPYGvArvSnRVMz3MAsHqgpjuA/fv7fwycDywarKt/HE7qh18NrNzW+3p7uwEbgR8AjxsY9xjgP4HF/f1/Al4C7AZ8fWD8F4GDgEOA8/pxlwGrgMX
Aaf1xsld/3E/QfU/9xcDv9/MX8Lx++KED890P+AJw1pCafwf4EXAJ8JfAXv34o+ku6Uv/nLoAOHyTY2pnYD3woCHb//X+2F2yyTLvp3ue7QAcCHxtW++3ubwtpBb15jwVOLe6Fuj3gc8CT+qnraqqm6rqHmAtXchu6hC6g+MLSdYCLwX2A6iqlcADgFcBf7EVtT0tXZ/zOuBIugN1RlX1VmASuBB4IfCpftLRwPK+vkuBnYB96Z6s7+nXf17/f0xbVVU39MNPp3uhuavfzg8G5vto/3cNwx8f3Tcb6QL3FQPjjgIOBq7s9+lRwMOr6na6kD22P7tZXFXr6PbNwUkeAPyMrtExCfwWXXA/ia5bcKrfx+cAh/fbuhv4SD/85IH57gT+bVjBVfVp4OF0ZwGPBq5OMkF3HB4NXA1c1U87oF/s5L7V/CW632mdHj+4/SPpXnBu6bczeBx+rKruqe4scmu7NJu0kPoerwO25nToZwPDdzP8MQtdf9zxvzShe8Nxn/7ubsBtQ5a/i1/shtqpX3YnupbSZFV9O90bgzvNVnB1p8bvSvIeYCrJg/oan1NVX9mkvjcD3wce39fw04HJd8y2rd70YzTT46P75h7gecBnkryxqt5Gtz/Prqo3DJl/JfBG4MvA+wCqamOSG4AT6UL/WrruwEcCG7g3FIf5aVXdvaVF9yH6QeCDfdfe4X3df1NV7x6cN8kRdI2BQ6vqx0ku5d5jfdTtDz5XZ+si/H9lIbWoLwbuv0nf1+OSTLcont/38U3QHVCrZlnfbcDu/fCXgMOSPLJf766592qSt9O1Tk6la11suix03zR4YJL7J9mTrnUE9x6otyTZjRFeaJIcM9CPfQBdeP6I7jcvT5qeluSJ/Tx7ADf3Zwwvpuv+GeYi4JXp31hM8sDZatHcqaofA8cAL0ryCuAzwHFJHgzd/kh/JUhVXUHXIn0hcO7Aai6jO6v7XD/8KuDq6voOVgG/ne79lB2B4+nOLDd1RT/fg5IsBp47rN6+73uXfnh34BF0XSafBl7eH88k2bv/H/YAftiH9KPpzlKHuRh4bt/4WDDH4YJp/VRVJfkD4Mwkr6drOd5Id/XG54FDgWvo+sNeV1Xfm+WNsRXAp5J8t6qelu7Nv3OT3L+f/qYkD6U7pTysqu5O8pwkL6uq9yX5QpL1wH9V1SlJPkzXL3cD3WkhVfWjvlW8Hvge3S/Cz+bFwBlJfkzXUn9Rv+2/As4Erk2yQ7+dY+la7B9J8hK6bpKZWtErgUf1y2+ke9H5pcuyND5V9YMkz6AL2tcAbwIu7PfnRuBPuPfrhT8MPKGqfjiwisvo+osvr6o7kvy0H0dV3ZxkOV2fcoBPVNV/DKnh5v4s7HK6BsDaGco9GDgryfTZ4sqquhIgyWOAy/s2w+3ACXTH3quSbAC+Qtf4GfYYXJfkr4HPJrmb7rly4kyP2fbCj5BL26G+q+GMqvrMtq5F991C6vqQtnvpPkz1VeAnhvT2wxa1JDXOFrUkNc6glqTGGdSS1DiDWpIaZ1BLUuP+D9vImSv8J2/qAAAAAElFTkSuQmCC\n",
1148 | "text/plain": [
1149 | ""
1150 | ]
1151 | },
1152 | "metadata": {
1153 | "needs_background": "light"
1154 | },
1155 | "output_type": "display_data"
1156 | }
1157 | ],
1158 | "source": [
1159 | "box_df=result_df[(result_df['search index']=='Large')].copy()\n",
1160 | "pd.pivot_table(box_df[['search type','time taken','repeat']],\n",
1161 | " values='time taken',\n",
1162 | " index='repeat',\n",
1163 | " columns='search type',\n",
1164 | " aggfunc='mean',).plot.box()"
1165 | ]
1166 | }
1167 | ],
1168 | "metadata": {
1169 | "kernelspec": {
1170 | "display_name": "Python [conda env:et2] *",
1171 | "language": "python",
1172 | "name": "conda-env-et2-py"
1173 | },
1174 | "language_info": {
1175 | "codemirror_mode": {
1176 | "name": "ipython",
1177 | "version": 3
1178 | },
1179 | "file_extension": ".py",
1180 | "mimetype": "text/x-python",
1181 | "name": "python",
1182 | "nbconvert_exporter": "python",
1183 | "pygments_lexer": "ipython3",
1184 | "version": "3.7.7"
1185 | }
1186 | },
1187 | "nbformat": 4,
1188 | "nbformat_minor": 4
1189 | }
1190 |
--------------------------------------------------------------------------------
/notebooks/Setting_up_ElasticTransformers.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Introduction\n",
8 | "\n",
9 | "This notebook will accomplish the following\n",
10 | "\n",
11 | "- Set up an ElasticTransformers class\n",
12 | "- Instantiate an index and index the Million headlines dataset in it\n",
13 | "- Preview some search results from comparing lexical vs semantic search\n"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "## Loading requirements"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 36,
26 | "metadata": {},
27 | "outputs": [
28 | {
29 | "name": "stdout",
30 | "output_type": "stream",
31 | "text": [
32 | "The autoreload extension is already loaded. To reload it, use:\n",
33 | " %reload_ext autoreload\n"
34 | ]
35 | }
36 | ],
37 | "source": [
38 | "%load_ext autoreload\n",
39 | "import os\n",
40 | "os.chdir(os.path.abspath(os.curdir).replace('notebooks',''))"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 37,
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "%autoreload 2\n",
50 | "from src.database import ElasticTransformers\n"
51 | ]
52 | },
53 | {
54 | "cell_type": "markdown",
55 | "metadata": {},
56 | "source": [
57 | "## Sentence Transformers\n",
58 | "\n",
59 | "This creates the sentence transformer object as well as a small helper function which simplifies the embedding call and makes loading data into Elastic easier"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": 38,
65 | "metadata": {},
66 | "outputs": [],
67 | "source": [
68 | "from sentence_transformers import SentenceTransformer\n",
69 | "bert_embedder = SentenceTransformer('bert-base-nli-mean-tokens')\n"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": 39,
75 | "metadata": {},
76 | "outputs": [],
77 | "source": [
78 | "def embed_wrapper(ls):\n",
79 | " \"\"\"\n",
80 | " Helper function which simplifies the embedding call and helps lading data into elastic easier\n",
81 | " \"\"\"\n",
82 | " results=bert_embedder.encode(ls, convert_to_tensor=True)\n",
83 | " results = [r.tolist() for r in results]\n",
84 | " return results"
85 | ]
86 | },
87 | {
88 | "cell_type": "markdown",
89 | "metadata": {},
90 | "source": [
91 | "## Quick Preview of the raw data\n",
92 | "\n",
93 | "The data contains 1.15mn news headlines (all in lower case) and their published date"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": 40,
99 | "metadata": {},
100 | "outputs": [],
101 | "source": [
102 | "import pandas as pd\n",
103 | "df=pd.read_csv('data/abcnews-date-text.csv')"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": 41,
109 | "metadata": {},
110 | "outputs": [
111 | {
112 | "data": {
113 | "text/html": [
114 | "\n",
115 | "\n",
128 | "
\n",
129 | " \n",
130 | " \n",
131 | " \n",
132 | " publish_date \n",
133 | " headline_text \n",
134 | " \n",
135 | " \n",
136 | " \n",
137 | " \n",
138 | " 0 \n",
139 | " 20030219 \n",
140 | " aba decides against community broadcasting lic... \n",
141 | " \n",
142 | " \n",
143 | " 1 \n",
144 | " 20030219 \n",
145 | " act fire witnesses must be aware of defamation \n",
146 | " \n",
147 | " \n",
148 | " 2 \n",
149 | " 20030219 \n",
150 | " a g calls for infrastructure protection summit \n",
151 | " \n",
152 | " \n",
153 | " 3 \n",
154 | " 20030219 \n",
155 | " air nz staff in aust strike for pay rise \n",
156 | " \n",
157 | " \n",
158 | " 4 \n",
159 | " 20030219 \n",
160 | " air nz strike to affect australian travellers \n",
161 | " \n",
162 | " \n",
163 | "
\n",
164 | "
"
165 | ],
166 | "text/plain": [
167 | " publish_date headline_text\n",
168 | "0 20030219 aba decides against community broadcasting lic...\n",
169 | "1 20030219 act fire witnesses must be aware of defamation\n",
170 | "2 20030219 a g calls for infrastructure protection summit\n",
171 | "3 20030219 air nz staff in aust strike for pay rise\n",
172 | "4 20030219 air nz strike to affect australian travellers"
173 | ]
174 | },
175 | "execution_count": 41,
176 | "metadata": {},
177 | "output_type": "execute_result"
178 | }
179 | ],
180 | "source": [
181 | "df.head()"
182 | ]
183 | },
184 | {
185 | "cell_type": "markdown",
186 | "metadata": {},
187 | "source": [
188 | "# A tiny example\n",
189 | "\n",
190 | "Let's first do this with a tiny example of 1000 headlines (the full dataset is 1.1mn headlines)"
191 | ]
192 | },
193 | {
194 | "cell_type": "code",
195 | "execution_count": 42,
196 | "metadata": {},
197 | "outputs": [],
198 | "source": [
199 | "df.head(1000).to_csv('data/tiny_sample.csv')\n"
200 | ]
201 | },
202 | {
203 | "cell_type": "markdown",
204 | "metadata": {},
205 | "source": [
206 | "# Setting up ElasticTransformers\n",
207 | "\n",
208 | "The below lines initialize the class, meaning setting the url and index name"
209 | ]
210 | },
211 | {
212 | "cell_type": "code",
213 | "execution_count": 32,
214 | "metadata": {},
215 | "outputs": [],
216 | "source": [
217 | "et=ElasticTransformers(url='http://localhost:9300',index_name='et-tiny')\n",
218 | "_ = et.ping()\n",
219 | "\n"
220 | ]
221 | },
222 | {
223 | "cell_type": "markdown",
224 | "metadata": {},
225 | "source": [
226 | "Next, we define the index specification (Elasticsearch index mapping)"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": 33,
232 | "metadata": {},
233 | "outputs": [
234 | {
235 | "data": {
236 | "text/plain": [
237 | "{'settings': {'number_of_shards': 3, 'number_of_replicas': 1},\n",
238 | " 'mappings': {'dynamic': 'true',\n",
239 | " '_source': {'enabled': 'true'},\n",
240 | " 'properties': {'publish_date': {'type': 'text'},\n",
241 | " 'headline_text': {'type': 'text'},\n",
242 | " 'headline_text_embedding': {'type': 'dense_vector', 'dims': 768}}}}"
243 | ]
244 | },
245 | "execution_count": 33,
246 | "metadata": {},
247 | "output_type": "execute_result"
248 | }
249 | ],
250 | "source": [
251 | "et.create_index_spec(\n",
252 | " text_fields=['publish_date','headline_text'],\n",
253 | " dense_fields=['headline_text_embedding'],\n",
254 | " dense_fields_dim=768\n",
255 | ")"
256 | ]
257 | },
258 | {
259 | "cell_type": "code",
260 | "execution_count": 34,
261 | "metadata": {},
262 | "outputs": [
263 | {
264 | "name": "stdout",
265 | "output_type": "stream",
266 | "text": [
267 | "Creating 'et-tiny' index.\n"
268 | ]
269 | }
270 | ],
271 | "source": [
272 | "et.create_index()\n"
273 | ]
274 | },
275 | {
276 | "cell_type": "code",
277 | "execution_count": 35,
278 | "metadata": {},
279 | "outputs": [
280 | {
281 | "name": "stderr",
282 | "output_type": "stream",
283 | "text": [
284 | "1it [00:08, 8.52s/it]\n"
285 | ]
286 | }
287 | ],
288 | "source": [
289 | "et.write_large_csv('data/tiny_sample.csv',\n",
290 | " chunksize=1000,\n",
291 | " embedder=embed_wrapper,\n",
292 | " field_to_embed='headline_text')"
293 | ]
294 | },
295 | {
296 | "cell_type": "markdown",
297 | "metadata": {},
298 | "source": [
299 | "One sample looks like this"
300 | ]
301 | },
302 | {
303 | "cell_type": "markdown",
304 | "metadata": {},
305 | "source": [
306 | "## Indexing the entire dataset\n",
307 | "\n",
308 | "Let's do this now with 1.1mn records "
309 | ]
310 | },
311 | {
312 | "cell_type": "code",
313 | "execution_count": 44,
314 | "metadata": {},
315 | "outputs": [
316 | {
317 | "name": "stdout",
318 | "output_type": "stream",
319 | "text": [
320 | "Creating 'et-large' index.\n"
321 | ]
322 | }
323 | ],
324 | "source": [
325 | "# Initialize\n",
326 | "et=ElasticTransformers(url='http://localhost:9200',index_name='et-large')\n",
327 | "_ = et.ping()\n",
328 | "# Create index mapping\n",
329 | "et.create_index_spec(\n",
330 | " text_fields=['publish_date','headline_text'],\n",
331 | " dense_fields=['headline_text_embedding'],\n",
332 | " dense_fields_dim=768\n",
333 | ")\n",
334 | "# Create index\n",
335 | "et.create_index()"
336 | ]
337 | },
338 | {
339 | "cell_type": "markdown",
340 | "metadata": {},
341 | "source": [
342 | "### Indexing with sentence-transformers... \n",
343 | "\n",
344 | "This takes 3hrs on CPU, consumes 4CPUs & 2GB RAM for the embedding process and about 2GB RAM for Elastic"
345 | ]
346 | },
347 | {
348 | "cell_type": "code",
349 | "execution_count": 45,
350 | "metadata": {},
351 | "outputs": [
352 | {
353 | "name": "stderr",
354 | "output_type": "stream",
355 | "text": [
356 | "1187it [3:18:46, 10.05s/it]\n"
357 | ]
358 | }
359 | ],
360 | "source": [
361 | "et.write_large_csv('data/abcnews-date-text.csv',\n",
362 | " chunksize=1000,\n",
363 | " embedder=embed_wrapper,\n",
364 | " field_to_embed='headline_text')\n"
365 | ]
366 | },
367 | {
368 | "cell_type": "code",
369 | "execution_count": null,
370 | "metadata": {},
371 | "outputs": [],
372 | "source": []
373 | }
374 | ],
375 | "metadata": {
376 | "kernelspec": {
377 | "display_name": "Python [conda env:et2] *",
378 | "language": "python",
379 | "name": "conda-env-et2-py"
380 | },
381 | "language_info": {
382 | "codemirror_mode": {
383 | "name": "ipython",
384 | "version": 3
385 | },
386 | "file_extension": ".py",
387 | "mimetype": "text/x-python",
388 | "name": "python",
389 | "nbconvert_exporter": "python",
390 | "pygments_lexer": "ipython3",
391 | "version": "3.7.7"
392 | }
393 | },
394 | "nbformat": 4,
395 | "nbformat_minor": 4
396 | }
397 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas==1.0.5
2 | sentence-transformers==0.3.4
3 | elasticsearch==7.6.0
4 | matplotlib==3.3.1
5 | transformers==3.0.2
6 |
--------------------------------------------------------------------------------
/src/database.py:
--------------------------------------------------------------------------------
1 | from elasticsearch import Elasticsearch, helpers
2 | import datetime
3 | import json
4 | import pandas as pd
5 | import tqdm
6 | import os
7 | from src.logger import logger
8 |
9 | class ElasticTransformers(object):
def __init__(self, url='http://localhost:9200', index_name=None):
    """
    Builds the Elasticsearch client and stores per-instance defaults

    Args:
        url (string) full url for elastic, passed to the Elasticsearch client
        index_name (string, optional) default index name kept on the instance for methods called without an explicit index name
    """
    # No index spec file is known yet; create_index_spec records one here.
    self.index_file = None
    self.index_name = index_name
    self.url = url
    self.es = Elasticsearch(url)
22 |
def ping(self):
    """
    Pings Elastic through the client held on this instance

    Returns:
        the value returned by the client's ping call (truthy when the ping succeeded)
    """
    is_alive = self.es.ping()
    if not is_alive:
        # Nothing is logged on failure; the falsy result is handed back as-is.
        return is_alive
    logger.debug(f'Ping successful')
    return is_alive
34 |
35 | def create_index_spec(self, index_name=None,folder='index_spec',text_fields=[], keyword_fields=[], dense_fields=[], dense_fields_dim=512, shards=3, replicas=1):
36 | """
37 | Creates mapping file for an index and stores the file
38 |
39 | Args:
40 | index_name (string, optional) name of index, defaults to index name defined when initiating the class
41 | folder (string) location to store index spec
42 | text_fields (list)
43 | keyword_fields (list)
44 | dense_fields (list) list of dense field names
45 | dense_fields_dim (int)
46 | shards (int) number of shards for index
47 | replicas (int) number of replicas for index
48 | """
49 |
50 | if not os.path.exists(folder):
51 | os.makedirs(folder)
52 |
53 | if not index_name:
54 | if self.index_name:
55 | index_name=self.index_name
56 | else:
57 | raise ValueError('index_name not provided')
58 | index_spec={}
59 |
60 | index_spec['settings']={
61 | "number_of_shards": shards,
62 | "number_of_replicas": replicas
63 | }
64 |
65 | index_spec['mappings']={
66 | "dynamic": "true",
67 | "_source": {
68 | "enabled": "true"
69 | },
70 | "properties": {},
71 | }
72 |
73 | for t in text_fields:
74 | index_spec['mappings']['properties'][t]={
75 | "type": "text"
76 | }
77 |
78 | for k in keyword_fields:
79 | index_spec['mappings']['properties'][t]={
80 | "type": "keyword"
81 | }
82 |
83 | for d in dense_fields:
84 | index_spec['mappings']['properties'][d]={
85 | "type": "dense_vector",
86 | "dims": dense_fields_dim
87 | }
88 |
89 | index_file_name=f'{folder}/spec_{index_name}.json'
90 | with open(index_file_name, 'w') as index_file:
91 | json.dump(index_spec,index_file)
92 | self.index_file=index_file_name
93 | logger.debug(f'Index spec {self.index_file} created')
94 | return index_spec
95 |
96 | def create_index(self, index_name=None, index_file=None):
97 | """
98 | Create index (index_name) based on file (index_file) containing index mapping
99 | NOTE: existing index of this name will be deleted
100 |
101 | Args:
102 | index_name (string, optional): name of index, defaults to index name defined when initiating the class
103 | index_file (string, optional): index spec file location, if none provided, will use mapping from create_index_spec else will create blank mapping
104 |
105 | """
106 | if not index_name:
107 | if self.index_name:
108 | index_name=self.index_name
109 | else:
110 | raise ValueError('index_name not provided')
111 | print(f"Creating '{index_name}' index.")
112 | self.es.indices.delete(index=index_name, ignore=[404])
113 |
114 | if index_file or self.index_file:
115 | if self.index_file:
116 | index_file=self.index_file
117 | with open(index_file) as index_file:
118 | index_spec = index_file.read().strip()
119 |
120 | else:
121 | index_spec={
122 | "number_of_shards": 3,
123 | "number_of_replicas": 1
124 | }
125 |
126 | self.es.indices.create(index=index_name, body=index_spec)
127 |
128 | def write(self,docs,index_name=None,index_field=None):
129 | """
130 | Writes entries to index
131 |
132 | Args:
133 | docs (list) list of dictionaries with keys matching index field names from index specification
134 | index_name (string, optional) name of index, defaults to index name defined when initiating the class
135 | index_field (string, optional) name of index field if present in docs. Defaults to elasicsearch indexing otherwise
136 |
137 | """
138 | if not index_name:
139 | if self.index_name:
140 | index_name=self.index_name
141 | else:
142 | raise ValueError('index_name not provided')
143 | requests = []
144 | for i, doc in enumerate(docs):
145 | request = doc
146 | request["_op_type"] = "index"
147 | if index_field:
148 | request["_id"] = doc[index_field]
149 | request["_index"] = index_name
150 | requests.append(request)
151 | helpers.bulk(self.es, requests)
152 |
153 | def write_large_csv(self, file_path, index_name=None, chunksize=10000, embedder=None, field_to_embed=None, index_field=None):
154 | """
155 | Iteratively reads through a csv file and writes it to elastic in batches
156 |
157 | Args:
158 | file_path (string) path to file
159 | index_name (string, optional) name of index, defaults to index name defined when initiating the class
160 | chunksize (int) size of the chunk to be read from file and sent to embedder
161 | embedder (function) embedder function with expected call embedded(list of strings to embed)
162 | field_to_embed (string) name of field to embed
163 | index_field (string, optional) name of index field if present in docs. Defaults to elasicsearch indexing otherwise
164 | """
165 | if not index_name:
166 | if self.index_name:
167 | index_name=self.index_name
168 | else:
169 | raise ValueError('index_name not provided')
170 | # read the large csv file with specified chunksize
171 | df_chunk = pd.read_csv(file_path, chunksize=chunksize, index_col=0)
172 |
173 | chunk_list = [] # append each chunk df here
174 |
175 | # Each chunk is in df format
176 | for chunk in tqdm.tqdm(df_chunk):
177 | if embedder:
178 | chunk[f'{field_to_embed}_embedding']=embedder(chunk[field_to_embed].values)
179 | chunk_ls=json.loads(chunk.to_json(orient='records'))
180 | self.write(chunk_ls,index_name,index_field=index_field)
181 | logger.debug(f'Successfully wrote {len(chunk_ls)} docs to {index_name}')
182 |
183 | def sample(self, index_name=None, size=3):
184 | """
185 | Provides a sample of documents from the index
186 |
187 | Args:
188 | index_name (string, optional) name of index, defaults to index name defined when initiating the class
189 | size (int, optional) number of results to retrieve, defaults to 3, max 10k, can be relaxed with elastic config
190 | """
191 | if not index_name:
192 | if self.index_name:
193 | index_name=self.index_name
194 | else:
195 | raise ValueError('index_name not provided')
196 | res=self.es.search(index=index_name, size=size)
197 | logger.debug(f"Successfully sampled {len(res['hits']['hits'])} docs from {index_name}")
198 | return res
199 |
200 | def search(self, query, field, type='match', index_name=None, embedder=None, size=10):
201 | """
202 | Search elastic
203 |
204 | Args:
205 | query (string) search query
206 | field (string) field to search
207 | type (string) type of search, takes: match, term, fuzzy, wildcard (requires "*" in query), dense (semantic search, requires embedder, index needs to be indexed with embeddings, assumes embedding field is named {field}_embedding)
208 | index_name (string, optional) name of index, defaults to index name defined when initiating the class
209 | embedder (function) embedder function with expected call embedded(list of strings to embed)
210 | size (int, optional) number of results to retrieve, defaults to 3, max 10k, can be relaxed with elastic config
211 |
212 | Returns:
213 | DataFrame with results and search score
214 | """
215 | res=[]
216 |
217 | if not index_name:
218 | if self.index_name:
219 | index_name=self.index_name
220 | else:
221 | raise ValueError('index_name not provided')
222 | if type=='dense':
223 | if not embedder:
224 | raise ValueError('Dense search requires embedder')
225 | query_vector = embedder([query])[0]
226 |
227 | script_query = {
228 | "script_score": {
229 | "query": {"match_all": {}},
230 | "script": {
231 | "source": f"cosineSimilarity(params.query_vector, doc['{field}_embedding']) + 1.0",
232 | "params": {"query_vector": query_vector}
233 | }
234 | }
235 | }
236 |
237 | res = self.es.search(
238 | index=index_name,
239 | body={
240 | "size": size,
241 | "query": script_query,
242 | "_source": {"excludes": [f'{field}_embedding']}
243 | }
244 | )
245 | else:
246 | res=self.es.search(index=index_name, body={'query':{type:{field:query}}, "_source": {"excludes": [f'{field}_embedding']}},size=size)
247 | self.search_raw_result=res
248 | hits=res['hits']['hits']
249 | if len(hits)>0:
250 | keys=list(hits[0]['_source'].keys())
251 |
252 | out=[[h['_score']]+[h['_source'][k] for k in keys] for h in hits]
253 |
254 | df=pd.DataFrame(out,columns=['_score']+keys)
255 | else:
256 | df=pd.DataFrame([])
257 | self.search_df_result=df
258 | logger.debug(f'Search {type.upper()} {query} in {index_name}.{field} returned {len(df)} results of {size} requested')
259 | return df
--------------------------------------------------------------------------------
/src/logger.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import datetime
3 | import os
4 |
# Folder (relative to the working directory) that receives the log files
logs_folder = 'logs'
if not os.path.exists(logs_folder):
    os.makedirs(logs_folder)

# Module-level logger shared by the project
logger = logging.getLogger(__name__)

# Logger-wide threshold; records below it never reach the handlers
logger.setLevel(logging.WARNING)

# Date stamp (YYYYMMDD) used in the log file name
date = datetime.date.today().strftime('%Y%m%d')

# One handler for the dated log file, one for the console
file_hndl = logging.FileHandler(f'{logs_folder}/q_logs_{date}.log')
cli_hndl = logging.StreamHandler()

# Shared record layout for both handlers
logger_text_format = logging.Formatter('%(asctime)s --- %(name)s --- %(levelname)s --- %(funcName)s:%(lineno)d --- %(message)s')

# Configure each handler identically and attach it to the logger
for _handler in (file_hndl, cli_hndl):
    _handler.setLevel(level=logging.DEBUG)
    _handler.setFormatter(logger_text_format)
    logger.addHandler(_handler)
30 |
31 |
--------------------------------------------------------------------------------