├── .dockerignore
├── .gitignore
├── archive
│   ├── docker_milvus
│   │   ├── Dockerfile
│   │   ├── install.sh
│   │   ├── readme.md
│   │   └── supervisord.conf
│   ├── notebooks
│   │   ├── 01_bm25.ipynb
│   │   ├── 02_dense_retriever_milvus.ipynb
│   │   └── 04_ann-elastic.ipynb
│   └── notebooks_stackoverflow
│       ├── 00_data_fetch_bq.ipynb
│       ├── 00_data_fetch_spark.ipynb
│       ├── 01_b_setup.ipynb
│       ├── 01_data_cleanup.ipynb
│       ├── 01_data_subset.ipynb
│       ├── 01_workshop_data_preview.ipynb
│       ├── 02_retrieval_dense_milvus.ipynb
│       ├── 02_retrieval_sparse.ipynb
│       ├── 03_comparision.ipynb
│       ├── ann_benchmark_recall.ipynb
│       ├── metrics_utils.py
│       ├── other__retrieve_rerank_simple_wikipedia.ipynb
│       ├── test_setup.ipynb
│       └── workshop_setup.ipynb
├── assets
│   ├── all_assets.sw
│   ├── slides_odsc2022.pdf
│   ├── slides_pydatanyc2022.pdf
│   └── slides_pydataseattle2023.pdf
├── docker-compose.yaml
├── docs
│   ├── internal_notes.md
│   └── slide_notes.md
├── environment.yaml
├── notebooks
│   ├── 00_a_setup_dataset.ipynb
│   ├── 00_b_setup_stats.ipynb
│   ├── 00_c_sample_images.ipynb
│   ├── 01_bm25_elastic.ipynb
│   ├── 02_dense_retriever.ipynb
│   ├── 03_clip_embed.ipynb
│   ├── 04_ann.ipynb
│   └── workshop_setup.ipynb
├── readme.md
├── requirements.txt
└── workshop_infra
    ├── Dockerfile
    ├── cert
    │   └── .gitkeep
    ├── config.enc.yaml
    ├── config_public.yaml
    ├── scripts
    │   ├── build_setup_root.sh
    │   ├── build_setup_user.sh
    │   └── container_startup.sh
    └── setup.md
/.dockerignore:
--------------------------------------------------------------------------------
1 | data/
2 | workshop_infra/
3 | !workshop_infra/scripts/
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
131 |
132 | data/
133 |
134 |
135 | workshop_infra/cert/*
136 | workshop_infra/config.yaml
137 | workshop_infra/key_file.json
138 |
139 | *.db
140 | tmp/
141 | .DS_Store
142 |
143 | !/**/.gitkeep
144 | workshop_infra/keyfile.json
145 | *.zip
146 |
--------------------------------------------------------------------------------
/archive/docker_milvus/Dockerfile:
--------------------------------------------------------------------------------
1 | # Milvus standalone image: install.sh installs the milvus package and supervisor,
2 | # and supervisord (configured by supervisord.conf) is the container's command.
3 | FROM ubuntu:18.04
4 |
5 | # Version pin for the `milvus` apt package; install.sh reads it as $MILVUS_VERSION.
6 | # Uses the key=value form; the whitespace-separated `ENV key value` form is legacy.
7 | ENV MILVUS_VERSION="2.1.4-1"
8 |
9 | COPY install.sh /tmp/install.sh
10 |
11 | RUN bash /tmp/install.sh
12 |
13 | # Commented-out s6-overlay setup, kept for reference; the image runs supervisord instead.
14 | # ARG S6_OVERLAY_VERSION=3.1.2.1
15 |
16 | # ADD https://github.com/just-containers/s6-overlay/releases/download/v${S6_OVERLAY_VERSION}/s6-overlay-noarch.tar.xz /tmp
17 |
18 | # RUN tar -C / -Jxpf /tmp/s6-overlay-noarch.tar.xz
19 |
20 | # ADD https://github.com/just-containers/s6-overlay/releases/download/v${S6_OVERLAY_VERSION}/s6-overlay-x86_64.tar.xz /tmp
21 |
22 | # RUN tar -C / -Jxpf /tmp/s6-overlay-x86_64.tar.xz
23 |
24 | # Program definitions for milvus-minio, milvus-etcd and milvus.
25 | COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf
26 |
27 | # supervisord.conf sets nodaemon=true, so supervisord stays in the foreground.
28 | CMD ["/usr/bin/supervisord"]
29 |
--------------------------------------------------------------------------------
/archive/docker_milvus/install.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Build-time installer for the Milvus image; the Dockerfile runs it as `bash /tmp/install.sh`.
3 | # Requires MILVUS_VERSION in the environment (the Dockerfile sets it with ENV).
4 |
5 | # Stop at the first failing command (and on an unset MILVUS_VERSION) so a broken
6 | # install fails the image build instead of being masked by the final mkdir's exit status.
7 | set -euo pipefail
8 |
9 | # Package configuration must never wait for interactive input during a build.
10 | export DEBIAN_FRONTEND=noninteractive
11 |
12 | apt-get update -y
13 |
14 | # Provides add-apt-repository. apt-get is used throughout: `apt` is meant for interactive use.
15 | apt-get install -y software-properties-common
16 |
17 | # -y: add the Milvus PPA without asking for confirmation.
18 | add-apt-repository -y ppa:milvusdb/milvus
19 |
20 | # Refresh the index so packages from the newly added PPA are visible.
21 | apt-get update -y
22 |
23 | apt-get install -y "milvus=${MILVUS_VERSION}"
24 |
25 | # Commented-out s6-overlay experiment, kept for reference.
26 | #mkdir -p /etc/services.d/system/
27 |
28 | #cp /lib/systemd/system/milvus* /etc/services.d/system/
29 |
30 | #cp /lib/systemd/system/milvus* /etc/services.d/system/
31 |
32 | #COPY resources/docker/services.d /etc/services.d
33 |
34 | # The package index was refreshed above, so no further `apt-get update` is needed.
35 | apt-get install -y supervisor
36 | mkdir -p /var/log/supervisor
--------------------------------------------------------------------------------
/archive/docker_milvus/readme.md:
--------------------------------------------------------------------------------
1 | https://github.com/just-containers/s6-overlay
2 |
3 |
4 |
5 | cat /etc/services.d/system/milvus-etcd.service
6 | ExecStart=/usr/bin/milvus-etcd --data-dir /var/lib/milvus/etcd-data
7 |
8 |
9 | cat /etc/services.d/system/milvus-minio.service
10 | ExecStart=/usr/bin/milvus-minio server /var/lib/milvus/minio-data
11 |
12 |
13 |
14 | cat /etc/services.d/system/milvus.service
15 |
16 | Environment=MILVUSCONF=/etc/milvus/configs/
17 | ExecStart=/usr/bin/milvus run standalone
18 |
19 |
20 |
21 | https://gdevillele.github.io/engine/admin/using_supervisord/
--------------------------------------------------------------------------------
/archive/docker_milvus/supervisord.conf:
--------------------------------------------------------------------------------
1 | [supervisord]
2 | nodaemon=true
3 | ; milvus-minio in server mode, with its data directory at /var/lib/milvus/minio-data.
4 | [program:milvus-minio]
5 | command=/usr/bin/milvus-minio server /var/lib/milvus/minio-data
6 | ; milvus-etcd, with its data directory at /var/lib/milvus/etcd-data.
7 | [program:milvus-etcd]
8 | command=/usr/bin/milvus-etcd --data-dir /var/lib/milvus/etcd-data
9 | ; Milvus in standalone mode. MILVUSCONF is passed through the process environment;
10 | ; presumably it names the directory Milvus loads its configuration from -- confirm.
11 | [program:milvus]
12 | environment=MILVUSCONF=/etc/milvus/configs/
13 | command=/usr/bin/milvus run standalone
--------------------------------------------------------------------------------
/archive/notebooks_stackoverflow/00_data_fetch_bq.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "id": "2d9c002c-9ba7-48cb-83a5-3d2903056d43",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import modin.pandas as pd\n",
11 | "import re\n",
12 | "import lxml.html\n"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 3,
18 | "id": "05461bbb-02f3-4749-b6ca-dba3a02bf1e8",
19 | "metadata": {},
20 | "outputs": [
21 | {
22 | "name": "stderr",
23 | "output_type": "stream",
24 | "text": [
25 | "UserWarning: Ray execution environment not yet initialized. Initializing...\n",
26 | "To remove this warning, run the following python code before doing dataframe operations:\n",
27 | "\n",
28 | " import ray\n",
29 | " ray.init()\n",
30 | "\n",
31 | "UserWarning: `read_gbq` defaulting to pandas implementation.\n",
32 | "To request implementation, send an email to feature_requests@modin.org.\n"
33 | ]
34 | },
35 | {
36 | "data": {
37 | "text/html": [
38 | "
\n",
39 | "\n",
52 | "
\n",
53 | " \n",
54 | " \n",
55 | " | \n",
56 | " Id | \n",
57 | " Type | \n",
58 | "
\n",
59 | " \n",
60 | " \n",
61 | " \n",
62 | " 0 | \n",
63 | " 1 | \n",
64 | " Question | \n",
65 | "
\n",
66 | " \n",
67 | " 1 | \n",
68 | " 2 | \n",
69 | " Answer | \n",
70 | "
\n",
71 | " \n",
72 | " 2 | \n",
73 | " 3 | \n",
74 | " Wiki | \n",
75 | "
\n",
76 | " \n",
77 | " 3 | \n",
78 | " 4 | \n",
79 | " TagWikiExcerpt | \n",
80 | "
\n",
81 | " \n",
82 | " 4 | \n",
83 | " 5 | \n",
84 | " TagWiki | \n",
85 | "
\n",
86 | " \n",
87 | " 5 | \n",
88 | " 6 | \n",
89 | " ModeratorNomination | \n",
90 | "
\n",
91 | " \n",
92 | " 6 | \n",
93 | " 7 | \n",
94 | " WikiPlaceholder | \n",
95 | "
\n",
96 | " \n",
97 | " 7 | \n",
98 | " 8 | \n",
99 | " PrivilegeWiki | \n",
100 | "
\n",
101 | " \n",
102 | "
\n",
103 | "
"
104 | ],
105 | "text/plain": [
106 | " Id Type\n",
107 | "0 1 Question\n",
108 | "1 2 Answer\n",
109 | "2 3 Wiki\n",
110 | "3 4 TagWikiExcerpt\n",
111 | "4 5 TagWiki\n",
112 | "5 6 ModeratorNomination\n",
113 | "6 7 WikiPlaceholder\n",
114 | "7 8 PrivilegeWiki"
115 | ]
116 | },
117 | "execution_count": 3,
118 | "metadata": {},
119 | "output_type": "execute_result"
120 | }
121 | ],
122 | "source": [
123 | "pd.read_gbq(f\"\"\"\n",
124 | "select *\n",
125 | "FROM`sotorrent-org.2020_12_31.PostType`\n",
126 | "\n",
127 | "\"\"\", use_bqstorage_api=True)"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": 4,
133 | "id": "df33775c-cdb8-4fe2-8457-e8642c8265c5",
134 | "metadata": {},
135 | "outputs": [
136 | {
137 | "name": "stderr",
138 | "output_type": "stream",
139 | "text": [
140 | "UserWarning: `read_gbq` defaulting to pandas implementation.\n"
141 | ]
142 | }
143 | ],
144 | "source": [
145 | "df_raw = pd.read_gbq (f\"\"\"\n",
146 | "\n",
147 | "with qn as (\n",
148 | " select Id , AcceptedAnswerId, Title, Body as QuestionBody, Tags , ViewCount, AnswerCount, CommentCount , Score, CreationDate\n",
149 | " FROM `sotorrent-org.2020_12_31.Posts` \n",
150 | " where PostTypeId = 1\n",
151 | "),\n",
152 | "ans as (\n",
153 | " select Id , Body as AnswerBody\n",
154 | " FROM `sotorrent-org.2020_12_31.Posts` \n",
155 | " where PostTypeId = 2\n",
156 | ")\n",
157 | "\n",
158 | "\n",
159 | "SELECT qn.*, ans.AnswerBody\n",
160 | "From qn \n",
161 | "inner join ans \n",
162 | "on qn.AcceptedAnswerId = ans.Id\n",
163 | "\n",
164 | "\"\"\", use_bqstorage_api=True)\n"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": 5,
170 | "id": "51c4dd43-b3be-4253-8b1c-76eb854d4668",
171 | "metadata": {},
172 | "outputs": [
173 | {
174 | "data": {
175 | "text/html": [
176 | "\n",
177 | "\n",
190 | "
\n",
191 | " \n",
192 | " \n",
193 | " | \n",
194 | " Id | \n",
195 | " AcceptedAnswerId | \n",
196 | " Title | \n",
197 | " QuestionBody | \n",
198 | " Tags | \n",
199 | " ViewCount | \n",
200 | " AnswerCount | \n",
201 | " CommentCount | \n",
202 | " Score | \n",
203 | " CreationDate | \n",
204 | " AnswerBody | \n",
205 | "
\n",
206 | " \n",
207 | " \n",
208 | " \n",
209 | " 0 | \n",
210 | " 22486469 | \n",
211 | " 22488014 | \n",
212 | " Memory Mapping Large File Haskell | \n",
213 | " <p>I am experimenting with the Haskell mmap pa... | \n",
214 | " <haskell> | \n",
215 | " 566 | \n",
216 | " 1 | \n",
217 | " 1 | \n",
218 | " 10 | \n",
219 | " 2014-03-18 17:18:08 | \n",
220 | " <p>Looks like a typo. If I replace this:</p>\\n... | \n",
221 | "
\n",
222 | " \n",
223 | " 1 | \n",
224 | " 20902775 | \n",
225 | " 20902933 | \n",
226 | " How to check if auto-rotate screen setting is ... | \n",
227 | " <p>I think each android device has an abitily ... | \n",
228 | " <java><android> | \n",
229 | " 11201 | \n",
230 | " 3 | \n",
231 | " 3 | \n",
232 | " 12 | \n",
233 | " 2014-01-03 11:37:35 | \n",
234 | " <p>Hope this code snippet helps you out:-</p>\\... | \n",
235 | "
\n",
236 | " \n",
237 | " 2 | \n",
238 | " 39613023 | \n",
239 | " 39623807 | \n",
240 | " Understanding the FFT output | \n",
241 | " <p>I'm currently occupied in a practicum and m... | \n",
242 | " <java><fft> | \n",
243 | " 277 | \n",
244 | " 1 | \n",
245 | " 0 | \n",
246 | " -4 | \n",
247 | " 2016-09-21 09:46:43 | \n",
248 | " <p>Computing a 512-point fourier transform aft... | \n",
249 | "
\n",
250 | " \n",
251 | " 3 | \n",
252 | " 2770630 | \n",
253 | " 2771563 | \n",
254 | " PDO::fetchAll vs. PDO::fetch in a loop | \n",
255 | " <p>Just a quick question.</p>\\n\\n<p>Is there a... | \n",
256 | " <php><mysql><pdo><fetch> | \n",
257 | " 86006 | \n",
258 | " 7 | \n",
259 | " 1 | \n",
260 | " 72 | \n",
261 | " 2010-05-05 04:31:40 | \n",
262 | " <p>Little benchmark with 200k random records. ... | \n",
263 | "
\n",
264 | " \n",
265 | " 4 | \n",
266 | " 31725206 | \n",
267 | " 40180517 | \n",
268 | " Unable to Flash eMMC from SD Card BeagleBone B... | \n",
269 | " <p>I am working on BeagleBone Black and Debian... | \n",
270 | " <debian><beagleboneblack> | \n",
271 | " 31664 | \n",
272 | " 8 | \n",
273 | " 3 | \n",
274 | " 17 | \n",
275 | " 2015-07-30 13:30:39 | \n",
276 | " <p>Did you remember to remove the \"#\" at the b... | \n",
277 | "
\n",
278 | " \n",
279 | "
\n",
280 | "
"
281 | ],
282 | "text/plain": [
283 | " Id AcceptedAnswerId \\\n",
284 | "0 22486469 22488014 \n",
285 | "1 20902775 20902933 \n",
286 | "2 39613023 39623807 \n",
287 | "3 2770630 2771563 \n",
288 | "4 31725206 40180517 \n",
289 | "\n",
290 | " Title \\\n",
291 | "0 Memory Mapping Large File Haskell \n",
292 | "1 How to check if auto-rotate screen setting is ... \n",
293 | "2 Understanding the FFT output \n",
294 | "3 PDO::fetchAll vs. PDO::fetch in a loop \n",
295 | "4 Unable to Flash eMMC from SD Card BeagleBone B... \n",
296 | "\n",
297 | " QuestionBody \\\n",
298 | "0 I am experimenting with the Haskell mmap pa... \n",
299 | "1
I think each android device has an abitily ... \n",
300 | "2
I'm currently occupied in a practicum and m... \n",
301 | "3
Just a quick question.
\\n\\nIs there a... \n",
302 | "4
I am working on BeagleBone Black and Debian... \n",
303 | "\n",
304 | " Tags ViewCount AnswerCount CommentCount Score \\\n",
305 | "0 566 1 1 10 \n",
306 | "1 11201 3 3 12 \n",
307 | "2 277 1 0 -4 \n",
308 | "3 86006 7 1 72 \n",
309 | "4 31664 8 3 17 \n",
310 | "\n",
311 | " CreationDate AnswerBody \n",
312 | "0 2014-03-18 17:18:08 Looks like a typo. If I replace this:
\\n... \n",
313 | "1 2014-01-03 11:37:35 Hope this code snippet helps you out:-
\\... \n",
314 | "2 2016-09-21 09:46:43 Computing a 512-point fourier transform aft... \n",
315 | "3 2010-05-05 04:31:40
Little benchmark with 200k random records. ... \n",
316 | "4 2015-07-30 13:30:39
Did you remember to remove the \"#\" at the b... "
317 | ]
318 | },
319 | "execution_count": 5,
320 | "metadata": {},
321 | "output_type": "execute_result"
322 | }
323 | ],
324 | "source": [
325 | "df_raw.head()"
326 | ]
327 | },
328 | {
329 | "cell_type": "code",
330 | "execution_count": null,
331 | "id": "8546ce34-5cce-45e1-8b90-e178e98e7415",
332 | "metadata": {},
333 | "outputs": [],
334 | "source": [
335 | "df_raw.to_parquet(\"../data/df_raw\",index=False)"
336 | ]
337 | },
338 | {
339 | "cell_type": "code",
340 | "execution_count": null,
341 | "id": "4abb6cc3-040d-44b5-a5a5-68161b732b9e",
342 | "metadata": {},
343 | "outputs": [],
344 | "source": [
345 | "df_raw = pd.read_parquet(\"../data/df_raw\")"
346 | ]
347 | },
348 | {
349 | "cell_type": "code",
350 | "execution_count": null,
351 | "id": "c56f1bbd-6195-414a-adee-1075aede6aca",
352 | "metadata": {},
353 | "outputs": [],
354 | "source": [
355 | "len(df_raw)"
356 | ]
357 | },
358 | {
359 | "cell_type": "code",
360 | "execution_count": null,
361 | "id": "9cd18238-7318-44bb-8302-89232429028e",
362 | "metadata": {},
363 | "outputs": [],
364 | "source": [
365 | "def strip_html(s:str):  # HTML fragment -> its text content; '' when it cannot be parsed\n",
366 | "    try:\n",
367 | "        return str(lxml.html.fromstring(s).text_content())\n",
368 | "    except Exception:  # still best-effort, but KeyboardInterrupt/SystemExit now propagate\n",
369 | "        return ''\n",
370 | "\n",
371 | "def parse_tags(content:str):  # '<java><android>' -> ['java', 'android']\n",
372 | "    return re.findall(r'<(.+?)>',content)\n"
373 | ]
374 | },
375 | {
376 | "cell_type": "code",
377 | "execution_count": null,
378 | "id": "8eda050e-3ab9-4555-9f7a-fe9cb190d824",
379 | "metadata": {},
380 | "outputs": [],
381 | "source": [
382 | "strip_html(f\"\"\"\n",
383 | "\n",
384 | "
I was asked to create a singleton that will...\t\n",
385 | "\"\"\")"
386 | ]
387 | },
388 | {
389 | "cell_type": "code",
390 | "execution_count": null,
391 | "id": "a60b2cd6-2f02-447b-99ac-2de8598bb470",
392 | "metadata": {},
393 | "outputs": [],
394 | "source": [
395 | "df = df_raw"
396 | ]
397 | },
398 | {
399 | "cell_type": "code",
400 | "execution_count": null,
401 | "id": "f8fa3f43-d0c1-42ff-949c-cb7519e11454",
402 | "metadata": {},
403 | "outputs": [],
404 | "source": [
405 | "df['QuestionBody'] = df['QuestionBody'].apply(strip_html)  # the query aliases Body as QuestionBody"
406 | ]
407 | },
408 | {
409 | "cell_type": "code",
410 | "execution_count": null,
411 | "id": "d957c0c3-aba6-451b-816a-67224b8e6578",
412 | "metadata": {},
413 | "outputs": [],
414 | "source": [
415 | "df['Tags'] = df['Tags'].apply(parse_tags)"
416 | ]
417 | },
418 | {
419 | "cell_type": "code",
420 | "execution_count": null,
421 | "id": "cb4c000b-0cd1-4d21-9de3-56589a04c40d",
422 | "metadata": {},
423 | "outputs": [],
424 | "source": [
425 | "df.to_parquet(\"../data/df_processed/\")  # df holds the processed frame; df_final was never defined"
426 | ]
427 | }
428 | ],
429 | "metadata": {
430 | "environment": {
431 | "kernel": "python3",
432 | "name": "pytorch-gpu.1-11.m94",
433 | "type": "gcloud",
434 | "uri": "gcr.io/deeplearning-platform-release/pytorch-gpu.1-11:m94"
435 | },
436 | "kernelspec": {
437 | "display_name": "Python 3",
438 | "language": "python",
439 | "name": "python3"
440 | },
441 | "language_info": {
442 | "codemirror_mode": {
443 | "name": "ipython",
444 | "version": 3
445 | },
446 | "file_extension": ".py",
447 | "mimetype": "text/x-python",
448 | "name": "python",
449 | "nbconvert_exporter": "python",
450 | "pygments_lexer": "ipython3",
451 | "version": "3.7.12"
452 | }
453 | },
454 | "nbformat": 4,
455 | "nbformat_minor": 5
456 | }
457 |
--------------------------------------------------------------------------------
/archive/notebooks_stackoverflow/01_b_setup.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "id": "4885264f-1d3f-4ad5-a29a-e338cf64e59c",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": []
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": 1,
14 | "id": "2d9c002c-9ba7-48cb-83a5-3d2903056d43",
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import modin.pandas as pd\n",
19 | "import re\n",
20 | "import lxml.html\n",
21 | "import re"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 2,
27 | "id": "ce8fc711-181c-4481-a6c5-fb580bf7e5d0",
28 | "metadata": {},
29 | "outputs": [
30 | {
31 | "name": "stderr",
32 | "output_type": "stream",
33 | "text": [
34 | "UserWarning: Ray execution environment not yet initialized. Initializing...\n",
35 | "To remove this warning, run the following python code before doing dataframe operations:\n",
36 | "\n",
37 | " import ray\n",
38 | " ray.init()\n",
39 | "\n",
40 | "\u001b[2m\u001b[33m(raylet)\u001b[0m /opt/conda/envs/stackoverflow/lib/python3.8/site-packages/ray/dashboard/agent.py:152: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n",
41 | "\u001b[2m\u001b[33m(raylet)\u001b[0m if LooseVersion(aiohttp.__version__) < LooseVersion(\"4.0.0\"):\n",
42 | "\u001b[2m\u001b[33m(raylet)\u001b[0m /opt/conda/envs/stackoverflow/lib/python3.8/site-packages/ray/dashboard/agent.py:152: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n",
43 | "\u001b[2m\u001b[33m(raylet)\u001b[0m if LooseVersion(aiohttp.__version__) < LooseVersion(\"4.0.0\"):\n",
44 | "\u001b[2m\u001b[33m(raylet)\u001b[0m /opt/conda/envs/stackoverflow/lib/python3.8/site-packages/ray/dashboard/agent.py:152: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n",
45 | "\u001b[2m\u001b[33m(raylet)\u001b[0m if LooseVersion(aiohttp.__version__) < LooseVersion(\"4.0.0\"):\n",
46 | "\u001b[2m\u001b[33m(raylet)\u001b[0m /opt/conda/envs/stackoverflow/lib/python3.8/site-packages/ray/dashboard/agent.py:152: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n",
47 | "\u001b[2m\u001b[33m(raylet)\u001b[0m if LooseVersion(aiohttp.__version__) < LooseVersion(\"4.0.0\"):\n",
48 | "\u001b[2m\u001b[33m(raylet)\u001b[0m /opt/conda/envs/stackoverflow/lib/python3.8/site-packages/ray/dashboard/agent.py:152: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n",
49 | "\u001b[2m\u001b[33m(raylet)\u001b[0m if LooseVersion(aiohttp.__version__) < LooseVersion(\"4.0.0\"):\n",
50 | "2022-10-01 21:40:38,955\tWARNING worker.py:1257 -- (ip=10.52.136.102) The agent on node nup0013-dl failed to be restarted 5 times. There are 3 possible problems if you see this error.\n",
51 | " 1. The dashboard might not display correct information on this node.\n",
52 | " 2. Metrics on this node won't be reported.\n",
53 | " 3. runtime_env APIs won't work.\n",
54 | "Check out the `dashboard_agent.log` to see the detailed failure messages.\n"
55 | ]
56 | }
57 | ],
58 | "source": [
59 | "df_raw = pd.read_parquet(\"../data/df_raw/\")"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": 3,
65 | "id": "e0a22a80-c5c6-4bd7-8546-ffe39db0b7c6",
66 | "metadata": {},
67 | "outputs": [
68 | {
69 | "data": {
70 | "text/plain": [
71 | "Index(['Id', 'AcceptedAnswerId', 'Title', 'Body', 'Tags', 'ViewCount',\n",
72 | " 'AnswerCount', 'CommentCount', 'Score', 'CreationDate', 'AnswerBody'],\n",
73 | " dtype='object')"
74 | ]
75 | },
76 | "execution_count": 3,
77 | "metadata": {},
78 | "output_type": "execute_result"
79 | }
80 | ],
81 | "source": [
82 | "df_raw.columns"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": 4,
88 | "id": "57723608-d170-419b-b1f0-85dde823485f",
89 | "metadata": {},
90 | "outputs": [
91 | {
92 | "name": "stdout",
93 | "output_type": "stream",
94 | "text": [
95 | "00_data_fetch_bq.ipynb\t 01_b_setup_new.ipynb 02_indexing_faiss.ipynb old\n",
96 | "00_data_fetch_spark.ipynb 01_data_cleanup.ipynb 03_searching_es.ipynb\n",
97 | "01_b_setup.ipynb\t 02_indexing_es.ipynb Untitled.ipynb\n"
98 | ]
99 | }
100 | ],
101 | "source": [
102 | "!ls"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": 5,
108 | "id": "9d757c53-4328-4baf-a3d1-f26dffd00ca4",
109 | "metadata": {},
110 | "outputs": [
111 | {
112 | "data": {
113 | "text/html": [
114 | "
\n",
115 | "\n",
128 | "
\n",
129 | " \n",
130 | " \n",
131 | " | \n",
132 | " Id | \n",
133 | " AcceptedAnswerId | \n",
134 | " Title | \n",
135 | " Body | \n",
136 | " Tags | \n",
137 | " ViewCount | \n",
138 | " AnswerCount | \n",
139 | " CommentCount | \n",
140 | " Score | \n",
141 | " CreationDate | \n",
142 | " AnswerBody | \n",
143 | "
\n",
144 | " \n",
145 | " \n",
146 | " \n",
147 | " 0 | \n",
148 | " 17123652 | \n",
149 | " 17124724 | \n",
150 | " hierarchical encryption scheme | \n",
151 | " <p>I am in need of the \"hierarchical\" encrypti... | \n",
152 | " <cryptography><key><hierarchical> | \n",
153 | " 631 | \n",
154 | " 1 | \n",
155 | " 0 | \n",
156 | " 2 | \n",
157 | " 2013-06-15 12:29:50.987 | \n",
158 | " <p>A partial solution. You own the master key... | \n",
159 | "
\n",
160 | " \n",
161 | " 1 | \n",
162 | " 44577139 | \n",
163 | " 44577209 | \n",
164 | " Uncaught TypeError: this.source is not a function | \n",
165 | " <p>I want to prelaod all the customers and giv... | \n",
166 | " <ajax><jquery-ui> | \n",
167 | " 3396 | \n",
168 | " 1 | \n",
169 | " 0 | \n",
170 | " 1 | \n",
171 | " 2017-06-15 21:14:46.990 | \n",
172 | " <p>Initialize <strong>autocomplete</strong> af... | \n",
173 | "
\n",
174 | " \n",
175 | " 2 | \n",
176 | " 45162881 | \n",
177 | " 45162984 | \n",
178 | " Class App\\Http\\Controllers\\ does not exist | \n",
179 | " <p>This is my Route: </p>\\n\\n<pre><code> Route... | \n",
180 | " <php><laravel><controller><routes> | \n",
181 | " 32241 | \n",
182 | " 3 | \n",
183 | " 1 | \n",
184 | " 2 | \n",
185 | " 2017-07-18 09:35:26.630 | \n",
186 | " <p>At the first of controller you do not need ... | \n",
187 | "
\n",
188 | " \n",
189 | " 3 | \n",
190 | " 1887841 | \n",
191 | " 1890092 | \n",
192 | " Grails startup is slow | \n",
193 | " <p>Help! I'm porting a large ruby app to Grail... | \n",
194 | " <grails> | \n",
195 | " 9592 | \n",
196 | " 5 | \n",
197 | " 0 | \n",
198 | " 28 | \n",
199 | " 2009-12-11 12:43:03.790 | \n",
200 | " <p>Unfortunately, I am not sure too much can b... | \n",
201 | "
\n",
202 | " \n",
203 | " 4 | \n",
204 | " 8151129 | \n",
205 | " 8151158 | \n",
206 | " AlertDialog - trying to understand this syntax | \n",
207 | " <p>This is code from the book sample:</p>\\n\\n<... | \n",
208 | " <java><android> | \n",
209 | " 490 | \n",
210 | " 5 | \n",
211 | " 1 | \n",
212 | " 2 | \n",
213 | " 2011-11-16 11:46:42.137 | \n",
214 | " <pre><code>// Create a builder\\nAlertDialog.Bu... | \n",
215 | "
\n",
216 | " \n",
217 | "
\n",
218 | "
"
219 | ],
220 | "text/plain": [
221 | " Id AcceptedAnswerId \\\n",
222 | "0 17123652 17124724 \n",
223 | "1 44577139 44577209 \n",
224 | "2 45162881 45162984 \n",
225 | "3 1887841 1890092 \n",
226 | "4 8151129 8151158 \n",
227 | "\n",
228 | " Title \\\n",
229 | "0 hierarchical encryption scheme \n",
230 | "1 Uncaught TypeError: this.source is not a function \n",
231 | "2 Class App\\Http\\Controllers\\ does not exist \n",
232 | "3 Grails startup is slow \n",
233 | "4 AlertDialog - trying to understand this syntax \n",
234 | "\n",
235 | " Body \\\n",
236 | "0 I am in need of the \"hierarchical\" encrypti... \n",
237 | "1
I want to prelaod all the customers and giv... \n",
238 | "2
This is my Route:
\\n\\n Route... \n",
239 | "3 Help! I'm porting a large ruby app to Grail... \n",
240 | "4
This is code from the book sample:
\\n\\n<... \n",
241 | "\n",
242 | " Tags ViewCount AnswerCount CommentCount \\\n",
243 | "0 631 1 0 \n",
244 | "1 3396 1 0 \n",
245 | "2 32241 3 1 \n",
246 | "3 9592 5 0 \n",
247 | "4 490 5 1 \n",
248 | "\n",
249 | " Score CreationDate \\\n",
250 | "0 2 2013-06-15 12:29:50.987 \n",
251 | "1 1 2017-06-15 21:14:46.990 \n",
252 | "2 2 2017-07-18 09:35:26.630 \n",
253 | "3 28 2009-12-11 12:43:03.790 \n",
254 | "4 2 2011-11-16 11:46:42.137 \n",
255 | "\n",
256 | " AnswerBody \n",
257 | "0 A partial solution. You own the master key... \n",
258 | "1
Initialize autocomplete af... \n",
259 | "2
At the first of controller you do not need ... \n",
260 | "3
Unfortunately, I am not sure too much can b... \n",
261 | "4
// Create a builder\\nAlertDialog.Bu... "
262 | ]
263 | },
264 | "execution_count": 5,
265 | "metadata": {},
266 | "output_type": "execute_result"
267 | }
268 | ],
269 | "source": [
270 | "df_raw.head()"
271 | ]
272 | },
273 | {
274 | "cell_type": "code",
275 | "execution_count": null,
276 | "id": "bcf8431e-c037-429f-bc53-13a8214be375",
277 | "metadata": {},
278 | "outputs": [],
279 | "source": []
280 | },
281 | {
282 | "cell_type": "code",
283 | "execution_count": 6,
284 | "id": "72644750-923e-4c69-8ba1-581e2929540d",
285 | "metadata": {},
286 | "outputs": [],
287 | "source": [
288 | "regex = r\"\"\"\n",
289 | "\t<pre><code>.*?</code></pre>\n",
290 | "\t\"\"\"\n",
291 | "\n",
292 | "def clean_text(snippet:str):\n",
293 | " snippet = re.sub(pattern=regex, repl = '[CODE]', string = snippet, flags = re.IGNORECASE | re.DOTALL | re.MULTILINE | re.VERBOSE )\n",
294 | " \n",
295 | " snippet = str(lxml.html.fromstring(snippet).text_content())\n",
296 | " \n",
297 | " return snippet\n",
298 | "\n",
299 | "def parse_tags(content:str):\n",
300 | " return re.findall(r'<(.+?)>',content)"
301 | ]
302 | },
303 | {
304 | "cell_type": "code",
305 | "execution_count": null,
306 | "id": "99a5b7bf-694f-4798-bd6a-d41f4ec7dfeb",
307 | "metadata": {},
308 | "outputs": [],
309 | "source": [
310 | "\n",
311 | "\n"
312 | ]
313 | },
314 | {
315 | "cell_type": "code",
316 | "execution_count": 7,
317 | "id": "8eda050e-3ab9-4555-9f7a-fe9cb190d824",
318 | "metadata": {},
319 | "outputs": [
320 | {
321 | "data": {
322 | "text/plain": [
323 | "'I was asked to create a singleton that will...\\n\\n[CODE]\\n\\n test \\n\\n'"
324 | ]
325 | },
326 | "execution_count": 7,
327 | "metadata": {},
328 | "output_type": "execute_result"
329 | }
330 | ],
331 | "source": [
332 | "clean_text(f\"\"\"\n",
333 | "\n",
334 | "I was asked to create a singleton that will..
.\n",
335 | "\n",
336 | "KDF
\n",
337 | "\n",
338 | " test
\n",
339 | "\n",
340 | "\"\"\")"
341 | ]
342 | },
343 | {
344 | "cell_type": "code",
345 | "execution_count": null,
346 | "id": "a9a49fef-5499-40de-9bea-01301ce4e339",
347 | "metadata": {},
348 | "outputs": [],
349 | "source": []
350 | },
351 | {
352 | "cell_type": "code",
353 | "execution_count": 8,
354 | "id": "a60b2cd6-2f02-447b-99ac-2de8598bb470",
355 | "metadata": {},
356 | "outputs": [],
357 | "source": [
358 | "df = df_raw"
359 | ]
360 | },
361 | {
362 | "cell_type": "code",
363 | "execution_count": 9,
364 | "id": "f8fa3f43-d0c1-42ff-949c-cb7519e11454",
365 | "metadata": {},
366 | "outputs": [],
367 | "source": [
368 | "df['QuestionBody'] = df['Body'].apply(clean_text)"
369 | ]
370 | },
371 | {
372 | "cell_type": "code",
373 | "execution_count": 10,
374 | "id": "d45cc4c4-0142-4d7a-abad-7bd941a331a1",
375 | "metadata": {},
376 | "outputs": [],
377 | "source": [
378 | "df['AnswerBody'] = df['AnswerBody'].apply(clean_text)"
379 | ]
380 | },
381 | {
382 | "cell_type": "code",
383 | "execution_count": 11,
384 | "id": "d957c0c3-aba6-451b-816a-67224b8e6578",
385 | "metadata": {},
386 | "outputs": [],
387 | "source": [
388 | "df['Tags'] = df['Tags'].apply(parse_tags)"
389 | ]
390 | },
391 | {
392 | "cell_type": "code",
393 | "execution_count": 12,
394 | "id": "cb4c000b-0cd1-4d21-9de3-56589a04c40d",
395 | "metadata": {},
396 | "outputs": [],
397 | "source": [
398 | "df.to_parquet(\"../data/df_processed/\")"
399 | ]
400 | },
401 | {
402 | "cell_type": "code",
403 | "execution_count": null,
404 | "id": "f7dc6fdc-8d41-4f9b-bffa-0d8c5274f9f1",
405 | "metadata": {},
406 | "outputs": [],
407 | "source": []
408 | },
409 | {
410 | "cell_type": "code",
411 | "execution_count": 13,
412 | "id": "bcb300a3-53cb-4454-b472-c9cc422f6cc4",
413 | "metadata": {},
414 | "outputs": [
415 | {
416 | "name": "stdout",
417 | "output_type": "stream",
418 | "text": [
419 | "part-0000.snappy.parquet part-0006.snappy.parquet part-0012.snappy.parquet\n",
420 | "part-0001.snappy.parquet part-0007.snappy.parquet part-0013.snappy.parquet\n",
421 | "part-0002.snappy.parquet part-0008.snappy.parquet part-0014.snappy.parquet\n",
422 | "part-0003.snappy.parquet part-0009.snappy.parquet part-0015.snappy.parquet\n",
423 | "part-0004.snappy.parquet part-0010.snappy.parquet\n",
424 | "part-0005.snappy.parquet part-0011.snappy.parquet\n"
425 | ]
426 | }
427 | ],
428 | "source": [
429 | "!ls ../data/df_processed/"
430 | ]
431 | },
432 | {
433 | "cell_type": "code",
434 | "execution_count": null,
435 | "id": "c722b250-f4ac-4523-adac-280b34dc3209",
436 | "metadata": {},
437 | "outputs": [],
438 | "source": []
439 | },
440 | {
441 | "cell_type": "code",
442 | "execution_count": null,
443 | "id": "0569dbf8-8a79-42a2-b854-3f0628e8275d",
444 | "metadata": {},
445 | "outputs": [],
446 | "source": []
447 | }
448 | ],
449 | "metadata": {
450 | "environment": {
451 | "kernel": "stackoverflow",
452 | "name": "pytorch-gpu.1-11.m94",
453 | "type": "gcloud",
454 | "uri": "gcr.io/deeplearning-platform-release/pytorch-gpu.1-11:m94"
455 | },
456 | "kernelspec": {
457 | "display_name": "Python 3.8.5 ('py38')",
458 | "language": "python",
459 | "name": "python3"
460 | },
461 | "language_info": {
462 | "codemirror_mode": {
463 | "name": "ipython",
464 | "version": 3
465 | },
466 | "file_extension": ".py",
467 | "mimetype": "text/x-python",
468 | "name": "python",
469 | "nbconvert_exporter": "python",
470 | "pygments_lexer": "ipython3",
471 | "version": "3.8.5"
472 | },
473 | "vscode": {
474 | "interpreter": {
475 | "hash": "aefe80b7c360a2b6e560f9a0dcb6ff028291678d8b74cab0042c4a74d0e7253b"
476 | }
477 | }
478 | },
479 | "nbformat": 4,
480 | "nbformat_minor": 5
481 | }
482 |
--------------------------------------------------------------------------------
/archive/notebooks_stackoverflow/01_data_subset.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "776a1f38-5ec7-4478-b392-bb943274b958",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import pandas as pd"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 2,
16 | "id": "a462938f-432c-48cc-b7ae-a20f4df6c3ff",
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "df_posts = pd.read_parquet(\"gs://np-training-tmp/stackoverflow/final/posts.parquet\")"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 3,
26 | "id": "f155aa49-6b8d-4056-ad64-eea6fb96cb19",
27 | "metadata": {},
28 | "outputs": [
29 | {
30 | "data": {
31 | "text/html": [
32 | "\n",
33 | "\n",
46 | "
\n",
47 | " \n",
48 | " \n",
49 | " | \n",
50 | " Id | \n",
51 | " AcceptedAnswerId | \n",
52 | " Title | \n",
53 | " QuestionBody | \n",
54 | " Tags | \n",
55 | " ViewCount | \n",
56 | " AnswerCount | \n",
57 | " CommentCount | \n",
58 | " Score | \n",
59 | " CreationDate | \n",
60 | " AnswerId | \n",
61 | " AcceptedAnswerBody | \n",
62 | "
\n",
63 | " \n",
64 | " \n",
65 | " \n",
66 | " 0 | \n",
67 | " 33760194 | \n",
68 | " NaN | \n",
69 | " Python How to burning discs with the monitorin... | \n",
70 | " I'm writing the programm on Python with module... | \n",
71 | " [python, event-handling, progressmonitor] | \n",
72 | " 491 | \n",
73 | " 0 | \n",
74 | " 2 | \n",
75 | " 0 | \n",
76 | " 2015-11-17 15:02:09.103 | \n",
77 | " NaN | \n",
78 | " None | \n",
79 | "
\n",
80 | " \n",
81 | " 1 | \n",
82 | " 15020895 | \n",
83 | " NaN | \n",
84 | " Python int-byte efficient data structure | \n",
85 | " i am currently storing key-values of type int-... | \n",
86 | " [python, data-structures] | \n",
87 | " 155 | \n",
88 | " 0 | \n",
89 | " 3 | \n",
90 | " 1 | \n",
91 | " 2013-02-22 09:33:26.360 | \n",
92 | " NaN | \n",
93 | " None | \n",
94 | "
\n",
95 | " \n",
96 | " 2 | \n",
97 | " 47234657 | \n",
98 | " NaN | \n",
99 | " converting word into other word keeping the or... | \n",
100 | " def translate(string, translations):\\n\\n[CODE]... | \n",
101 | " [python, python-3.x] | \n",
102 | " 48 | \n",
103 | " 2 | \n",
104 | " 1 | \n",
105 | " -1 | \n",
106 | " 2017-11-11 05:23:34.343 | \n",
107 | " NaN | \n",
108 | " None | \n",
109 | "
\n",
110 | " \n",
111 | " 3 | \n",
112 | " 37310210 | \n",
113 | " NaN | \n",
114 | " Camera Calibration with OpenCV - How to adjust... | \n",
115 | " I am working on a camera calibration program u... | \n",
116 | " [python, python-2.7, opencv, camera, camera-ca... | \n",
117 | " 8164 | \n",
118 | " 2 | \n",
119 | " 3 | \n",
120 | " 3 | \n",
121 | " 2016-05-18 21:14:34.110 | \n",
122 | " NaN | \n",
123 | " None | \n",
124 | "
\n",
125 | " \n",
126 | " 4 | \n",
127 | " 70675292 | \n",
128 | " NaN | \n",
129 | " Python Same Period Last Year in Pandas with Gr... | \n",
130 | " I have following DataFrame:\\nimport pandas as ... | \n",
131 | " [python, pandas, group-by, offset, forecasting] | \n",
132 | " 70 | \n",
133 | " 1 | \n",
134 | " 0 | \n",
135 | " 0 | \n",
136 | " 2022-01-12 01:19:53.640 | \n",
137 | " NaN | \n",
138 | " None | \n",
139 | "
\n",
140 | " \n",
141 | "
\n",
142 | "
"
143 | ],
144 | "text/plain": [
145 | " Id AcceptedAnswerId \\\n",
146 | "0 33760194 NaN \n",
147 | "1 15020895 NaN \n",
148 | "2 47234657 NaN \n",
149 | "3 37310210 NaN \n",
150 | "4 70675292 NaN \n",
151 | "\n",
152 | " Title \\\n",
153 | "0 Python How to burning discs with the monitorin... \n",
154 | "1 Python int-byte efficient data structure \n",
155 | "2 converting word into other word keeping the or... \n",
156 | "3 Camera Calibration with OpenCV - How to adjust... \n",
157 | "4 Python Same Period Last Year in Pandas with Gr... \n",
158 | "\n",
159 | " QuestionBody \\\n",
160 | "0 I'm writing the programm on Python with module... \n",
161 | "1 i am currently storing key-values of type int-... \n",
162 | "2 def translate(string, translations):\\n\\n[CODE]... \n",
163 | "3 I am working on a camera calibration program u... \n",
164 | "4 I have following DataFrame:\\nimport pandas as ... \n",
165 | "\n",
166 | " Tags ViewCount AnswerCount \\\n",
167 | "0 [python, event-handling, progressmonitor] 491 0 \n",
168 | "1 [python, data-structures] 155 0 \n",
169 | "2 [python, python-3.x] 48 2 \n",
170 | "3 [python, python-2.7, opencv, camera, camera-ca... 8164 2 \n",
171 | "4 [python, pandas, group-by, offset, forecasting] 70 1 \n",
172 | "\n",
173 | " CommentCount Score CreationDate AnswerId AcceptedAnswerBody \n",
174 | "0 2 0 2015-11-17 15:02:09.103 NaN None \n",
175 | "1 3 1 2013-02-22 09:33:26.360 NaN None \n",
176 | "2 1 -1 2017-11-11 05:23:34.343 NaN None \n",
177 | "3 3 3 2016-05-18 21:14:34.110 NaN None \n",
178 | "4 0 0 2022-01-12 01:19:53.640 NaN None "
179 | ]
180 | },
181 | "execution_count": 3,
182 | "metadata": {},
183 | "output_type": "execute_result"
184 | }
185 | ],
186 | "source": [
187 | "df_posts.head()"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": 4,
193 | "id": "6069508b-5b2e-4572-ace5-ce01d47f9de2",
194 | "metadata": {},
195 | "outputs": [
196 | {
197 | "data": {
198 | "text/html": [
199 | "\n",
200 | "\n",
213 | "
\n",
214 | " \n",
215 | " \n",
216 | " | \n",
217 | " PostId | \n",
218 | " PostTitle | \n",
219 | " RelatedPostIds | \n",
220 | " RelatedPostTitles | \n",
221 | " num_candidates | \n",
222 | "
\n",
223 | " \n",
224 | " \n",
225 | " \n",
226 | " 0 | \n",
227 | " 57348742 | \n",
228 | " How do I simulate a Scrollbar in tkInter Canvas | \n",
229 | " [57348742, 68340045] | \n",
230 | " [How do I simulate a Scrollbar in tkInter Canv... | \n",
231 | " 2 | \n",
232 | "
\n",
233 | " \n",
234 | " 1 | \n",
235 | " 3494593 | \n",
236 | " Shading a kernel density plot between two points. | \n",
237 | " [3494593, 14863744, 14094644, 16504452, 488531... | \n",
238 | " [Shading a kernel density plot between two poi... | \n",
239 | " 16 | \n",
240 | "
\n",
241 | " \n",
242 | " 2 | \n",
243 | " 37949409 | \n",
244 | " Dictionary in a numpy array? | \n",
245 | " [37949409, 47689224, 61517741] | \n",
246 | " [Dictionary in a numpy array?, How to access t... | \n",
247 | " 3 | \n",
248 | "
\n",
249 | " \n",
250 | " 3 | \n",
251 | " 51519086 | \n",
252 | " How to remove tkinter - - - - line's when crea... | \n",
253 | " [51519086, 55088055] | \n",
254 | " [How to remove tkinter - - - - line's when cre... | \n",
255 | " 2 | \n",
256 | "
\n",
257 | " \n",
258 | " 4 | \n",
259 | " 63107594 | \n",
260 | " How to deal with multi-level column names down... | \n",
261 | " [63107594, 63107603, 62966295, 68674235, 63124... | \n",
262 | " [How to deal with multi-level column names dow... | \n",
263 | " 6 | \n",
264 | "
\n",
265 | " \n",
266 | " ... | \n",
267 | " ... | \n",
268 | " ... | \n",
269 | " ... | \n",
270 | " ... | \n",
271 | " ... | \n",
272 | "
\n",
273 | " \n",
274 | " 33243 | \n",
275 | " 48536681 | \n",
276 | " What is the exact meaning of stride's list in ... | \n",
277 | " [48536681, 47305022] | \n",
278 | " [What is the exact meaning of stride's list in... | \n",
279 | " 2 | \n",
280 | "
\n",
281 | " \n",
282 | " 33244 | \n",
283 | " 37814201 | \n",
284 | " pandas time shift from utc to local | \n",
285 | " [37814201, 52390647] | \n",
286 | " [pandas time shift from utc to local, Convert ... | \n",
287 | " 2 | \n",
288 | "
\n",
289 | " \n",
290 | " 33245 | \n",
291 | " 2316987 | \n",
292 | " Converting a string to a formatted date-time s... | \n",
293 | " [2316987, 48848730] | \n",
294 | " [Converting a string to a formatted date-time ... | \n",
295 | " 2 | \n",
296 | "
\n",
297 | " \n",
298 | " 33246 | \n",
299 | " 52027033 | \n",
300 | " Convert datetime to another format without cha... | \n",
301 | " [52027033, 52252961] | \n",
302 | " [Convert datetime to another format without ch... | \n",
303 | " 2 | \n",
304 | "
\n",
305 | " \n",
306 | " 33247 | \n",
307 | " 17622419 | \n",
308 | " Creating a namedtuple object using only a subs... | \n",
309 | " [17622419, 50899076] | \n",
310 | " [Creating a namedtuple object using only a sub... | \n",
311 | " 2 | \n",
312 | "
\n",
313 | " \n",
314 | "
\n",
315 | "
33248 rows × 5 columns
\n",
316 | "
"
317 | ],
318 | "text/plain": [
319 | " PostId PostTitle \\\n",
320 | "0 57348742 How do I simulate a Scrollbar in tkInter Canvas \n",
321 | "1 3494593 Shading a kernel density plot between two points. \n",
322 | "2 37949409 Dictionary in a numpy array? \n",
323 | "3 51519086 How to remove tkinter - - - - line's when crea... \n",
324 | "4 63107594 How to deal with multi-level column names down... \n",
325 | "... ... ... \n",
326 | "33243 48536681 What is the exact meaning of stride's list in ... \n",
327 | "33244 37814201 pandas time shift from utc to local \n",
328 | "33245 2316987 Converting a string to a formatted date-time s... \n",
329 | "33246 52027033 Convert datetime to another format without cha... \n",
330 | "33247 17622419 Creating a namedtuple object using only a subs... \n",
331 | "\n",
332 | " RelatedPostIds \\\n",
333 | "0 [57348742, 68340045] \n",
334 | "1 [3494593, 14863744, 14094644, 16504452, 488531... \n",
335 | "2 [37949409, 47689224, 61517741] \n",
336 | "3 [51519086, 55088055] \n",
337 | "4 [63107594, 63107603, 62966295, 68674235, 63124... \n",
338 | "... ... \n",
339 | "33243 [48536681, 47305022] \n",
340 | "33244 [37814201, 52390647] \n",
341 | "33245 [2316987, 48848730] \n",
342 | "33246 [52027033, 52252961] \n",
343 | "33247 [17622419, 50899076] \n",
344 | "\n",
345 | " RelatedPostTitles num_candidates \n",
346 | "0 [How do I simulate a Scrollbar in tkInter Canv... 2 \n",
347 | "1 [Shading a kernel density plot between two poi... 16 \n",
348 | "2 [Dictionary in a numpy array?, How to access t... 3 \n",
349 | "3 [How to remove tkinter - - - - line's when cre... 2 \n",
350 | "4 [How to deal with multi-level column names dow... 6 \n",
351 | "... ... ... \n",
352 | "33243 [What is the exact meaning of stride's list in... 2 \n",
353 | "33244 [pandas time shift from utc to local, Convert ... 2 \n",
354 | "33245 [Converting a string to a formatted date-time ... 2 \n",
355 | "33246 [Convert datetime to another format without ch... 2 \n",
356 | "33247 [Creating a namedtuple object using only a sub... 2 \n",
357 | "\n",
358 | "[33248 rows x 5 columns]"
359 | ]
360 | },
361 | "execution_count": 4,
362 | "metadata": {},
363 | "output_type": "execute_result"
364 | }
365 | ],
366 | "source": [
367 | "df_related = pd.read_parquet(\"gs://np-training-tmp/stackoverflow/final/related_posts.parquet\")\n",
368 | "df_related"
369 | ]
370 | },
371 | {
372 | "cell_type": "code",
373 | "execution_count": null,
374 | "id": "12502a21-39cf-4f73-b6b2-d106f446516f",
375 | "metadata": {},
376 | "outputs": [],
377 | "source": []
378 | },
379 | {
380 | "cell_type": "code",
381 | "execution_count": 15,
382 | "id": "0174520e-92ed-48de-bab5-214d04d0249e",
383 | "metadata": {},
384 | "outputs": [],
385 | "source": [
386 | "post_ids = set (df_posts.sample(frac=1, random_state=42).head(200_000)['Id'] )"
387 | ]
388 | },
389 | {
390 | "cell_type": "code",
391 | "execution_count": 16,
392 | "id": "586796a6-f69f-4faf-bd42-5f967986dfc1",
393 | "metadata": {},
394 | "outputs": [],
395 | "source": [
396 | "def match_exists(related_post_ids):\n",
397 | " res = set(related_post_ids ) & post_ids\n",
398 | " return len(res) > 0"
399 | ]
400 | },
401 | {
402 | "cell_type": "code",
403 | "execution_count": 17,
404 | "id": "3412f27f-c39e-4fcf-9dd1-9147fbc0eac7",
405 | "metadata": {},
406 | "outputs": [],
407 | "source": [
408 | "df_related_subset = df_related [ df_related['RelatedPostIds'].apply(match_exists) ]\n",
409 | "post_ids_additional = set(df_related_subset['RelatedPostIds'].explode() )\n",
410 | "\n",
411 | "post_id_final = post_ids | post_ids_additional"
412 | ]
413 | },
414 | {
415 | "cell_type": "code",
416 | "execution_count": null,
417 | "id": "f323527c-021a-474f-a9d0-b73aa3d55681",
418 | "metadata": {},
419 | "outputs": [],
420 | "source": [
421 | "len(post_id_final)"
422 | ]
423 | },
424 | {
425 | "cell_type": "code",
426 | "execution_count": 18,
427 | "id": "649e579a-106f-4835-b646-d76e6c2e8305",
428 | "metadata": {},
429 | "outputs": [],
430 | "source": [
431 | "df_posts_subset = df_posts [ df_posts['Id'].isin(post_id_final)]"
432 | ]
433 | },
434 | {
435 | "cell_type": "code",
436 | "execution_count": 19,
437 | "id": "8712592a-1549-4c4e-a508-8357f693d2eb",
438 | "metadata": {},
439 | "outputs": [
440 | {
441 | "data": {
442 | "text/plain": [
443 | "219841"
444 | ]
445 | },
446 | "execution_count": 19,
447 | "metadata": {},
448 | "output_type": "execute_result"
449 | }
450 | ],
451 | "source": [
452 | "len(df_posts_subset)"
453 | ]
454 | },
455 | {
456 | "cell_type": "code",
457 | "execution_count": null,
458 | "id": "916d2770-d343-47e3-9d06-f9c399e7e6a7",
459 | "metadata": {},
460 | "outputs": [],
461 | "source": []
462 | },
463 | {
464 | "cell_type": "code",
465 | "execution_count": 20,
466 | "id": "a503bf07-16fe-4b2a-84d6-cbd86851067e",
467 | "metadata": {},
468 | "outputs": [],
469 | "source": [
470 | "df_posts_subset.to_parquet(\"gs://np-training-tmp/stackoverflow/final_subset/posts.parquet\")"
471 | ]
472 | },
473 | {
474 | "cell_type": "code",
475 | "execution_count": 21,
476 | "id": "f4fbd837-2557-4b63-b263-9af66690815a",
477 | "metadata": {},
478 | "outputs": [],
479 | "source": [
480 | "df_related_subset.to_parquet(\"gs://np-training-tmp/stackoverflow/final_subset/related_posts.parquet\")"
481 | ]
482 | },
483 | {
484 | "cell_type": "code",
485 | "execution_count": null,
486 | "id": "9e5b4524-14ec-44b7-bfe4-38f95c39e15b",
487 | "metadata": {},
488 | "outputs": [],
489 | "source": []
490 | },
491 | {
492 | "cell_type": "code",
493 | "execution_count": null,
494 | "id": "1809b800-0eb7-46ec-9a35-4d52070c6840",
495 | "metadata": {},
496 | "outputs": [],
497 | "source": []
498 | },
499 | {
500 | "cell_type": "code",
501 | "execution_count": null,
502 | "id": "4fde525e-db57-4c32-a07a-d0cc2b32926a",
503 | "metadata": {},
504 | "outputs": [],
505 | "source": [
506 | "!gsutil -m cp -r gs://np-training-tmp/stackoverflow/final_subset/* ../data/final_subset/"
507 | ]
508 | },
509 | {
510 | "cell_type": "code",
511 | "execution_count": null,
512 | "id": "7fd4e6ab-9c57-4fe6-a606-eaa4932f4244",
513 | "metadata": {},
514 | "outputs": [],
515 | "source": [
516 | "!gsutil -m cp -r gs://np-training-tmp/stackoverflow/final/* ../data/final/\n",
517 | "\n"
518 | ]
519 | }
520 | ],
521 | "metadata": {
522 | "environment": {
523 | "kernel": "stackoverflow",
524 | "name": "pytorch-gpu.1-12.m99",
525 | "type": "gcloud",
526 | "uri": "gcr.io/deeplearning-platform-release/pytorch-gpu.1-12:m99"
527 | },
528 | "kernelspec": {
529 | "display_name": "stackoverflow",
530 | "language": "python",
531 | "name": "stackoverflow"
532 | },
533 | "language_info": {
534 | "codemirror_mode": {
535 | "name": "ipython",
536 | "version": 3
537 | },
538 | "file_extension": ".py",
539 | "mimetype": "text/x-python",
540 | "name": "python",
541 | "nbconvert_exporter": "python",
542 | "pygments_lexer": "ipython3",
543 | "version": "3.7.12"
544 | }
545 | },
546 | "nbformat": 4,
547 | "nbformat_minor": 5
548 | }
549 |
--------------------------------------------------------------------------------
/archive/notebooks_stackoverflow/01_workshop_data_preview.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "id": "724dc187-f812-4c97-81dd-ad527f9d8338",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import pandas as pd\n",
11 | "from IPython.display import JSON\n",
12 | "import metrics_utils"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": null,
18 | "id": "37b1ba40-6527-4ec3-8180-7db66fc9d808",
19 | "metadata": {},
20 | "outputs": [],
21 | "source": []
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 11,
26 | "id": "dfeca3c8-2684-44a1-8497-1bf4c4c89c9d",
27 | "metadata": {},
28 | "outputs": [
29 | {
30 | "name": "stdout",
31 | "output_type": "stream",
32 | "text": [
33 | " 1.35 GiB 2022-11-02T08:48:12Z gs://np-public-training-temp/stackoverflow/final/posts.parquet\n",
34 | " 5.26 MiB 2022-11-02T08:48:12Z gs://np-public-training-temp/stackoverflow/final/related_posts.parquet\n",
35 | "115.09 MiB 2022-11-02T08:48:12Z gs://np-public-training-temp/stackoverflow/final_subset/posts.parquet\n",
36 | " 1.08 GiB 2022-11-02T11:42:53Z gs://np-public-training-temp/stackoverflow/final_subset/posts_with_embedding.parquet\n",
37 | " 1.4 MiB 2022-11-02T08:48:12Z gs://np-public-training-temp/stackoverflow/final_subset/related_posts.parquet\n",
38 | "TOTAL: 5 objects, 2736956352 bytes (2.55 GiB)\n"
39 | ]
40 | }
41 | ],
42 | "source": [
43 | "!gsutil ls -lh gs://np-public-training-temp/stackoverflow/**"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 2,
49 | "id": "e218afd6-edcb-46cc-8263-94611d54ffeb",
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "path_posts = \"gs://np-public-training-temp/stackoverflow/final_subset/posts.parquet\"\n",
54 | "path_posts_related = \"gs://np-public-training-temp/stackoverflow/final_subset/related_posts.parquet\"\n"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": 4,
60 | "id": "431537d1-2701-4d8e-a3fc-22d877cd14bb",
61 | "metadata": {},
62 | "outputs": [
63 | {
64 | "data": {
65 | "text/html": [
66 | "\n",
67 | "\n",
80 | "
\n",
81 | " \n",
82 | " \n",
83 | " | \n",
84 | " Id | \n",
85 | " AcceptedAnswerId | \n",
86 | " Title | \n",
87 | " QuestionBody | \n",
88 | " Tags | \n",
89 | " ViewCount | \n",
90 | " AnswerCount | \n",
91 | " CommentCount | \n",
92 | " Score | \n",
93 | " CreationDate | \n",
94 | " AnswerId | \n",
95 | " AcceptedAnswerBody | \n",
96 | "
\n",
97 | " \n",
98 | " \n",
99 | " \n",
100 | " 1 | \n",
101 | " 15020895 | \n",
102 | " NaN | \n",
103 | " Python int-byte efficient data structure | \n",
104 | " i am currently storing key-values of type int-... | \n",
105 | " [python, data-structures] | \n",
106 | " 155 | \n",
107 | " 0 | \n",
108 | " 3 | \n",
109 | " 1 | \n",
110 | " 2013-02-22 09:33:26.360 | \n",
111 | " NaN | \n",
112 | " None | \n",
113 | "
\n",
114 | " \n",
115 | " 9 | \n",
116 | " 68487902 | \n",
117 | " NaN | \n",
118 | " Why does the Variance of Laplace very differen... | \n",
119 | " TL;DR: How can I use skimage.filters.laplace(i... | \n",
120 | " [python, opencv, image-processing, computer-vi... | \n",
121 | " 391 | \n",
122 | " 0 | \n",
123 | " 5 | \n",
124 | " 1 | \n",
125 | " 2021-07-22 15:50:34.220 | \n",
126 | " NaN | \n",
127 | " None | \n",
128 | "
\n",
129 | " \n",
130 | " 15 | \n",
131 | " 61391327 | \n",
132 | " NaN | \n",
133 | " Why input never ends | \n",
134 | " I have python 3.7 installed and I have this co... | \n",
135 | " [python, python-3.x, input] | \n",
136 | " 104 | \n",
137 | " 1 | \n",
138 | " 6 | \n",
139 | " 3 | \n",
140 | " 2020-04-23 15:43:03.497 | \n",
141 | " NaN | \n",
142 | " None | \n",
143 | "
\n",
144 | " \n",
145 | " 27 | \n",
146 | " 28852710 | \n",
147 | " NaN | \n",
148 | " Crashes with piecewise linear objective for gu... | \n",
149 | " We have a complex optimization problem which i... | \n",
150 | " [python, crash, gurobi, piecewise] | \n",
151 | " 403 | \n",
152 | " 1 | \n",
153 | " 1 | \n",
154 | " 3 | \n",
155 | " 2015-03-04 10:58:16.370 | \n",
156 | " NaN | \n",
157 | " None | \n",
158 | "
\n",
159 | " \n",
160 | " 29 | \n",
161 | " 24043029 | \n",
162 | " NaN | \n",
163 | " Python TypeError: plotdatehist() got an unexpe... | \n",
164 | " apologies beforehand if this is a stupid quest... | \n",
165 | " [python, typeerror] | \n",
166 | " 419 | \n",
167 | " 0 | \n",
168 | " 7 | \n",
169 | " 0 | \n",
170 | " 2014-06-04 16:42:32.257 | \n",
171 | " NaN | \n",
172 | " None | \n",
173 | "
\n",
174 | " \n",
175 | " ... | \n",
176 | " ... | \n",
177 | " ... | \n",
178 | " ... | \n",
179 | " ... | \n",
180 | " ... | \n",
181 | " ... | \n",
182 | " ... | \n",
183 | " ... | \n",
184 | " ... | \n",
185 | " ... | \n",
186 | " ... | \n",
187 | " ... | \n",
188 | "
\n",
189 | " \n",
190 | " 2661376 | \n",
191 | " 55431749 | \n",
192 | " 55431832.0 | \n",
193 | " Handling exception returned by a method | \n",
194 | " I am calling a method that throws Valuerror ex... | \n",
195 | " [python-3.x] | \n",
196 | " 26 | \n",
197 | " 1 | \n",
198 | " 2 | \n",
199 | " 1 | \n",
200 | " 2019-03-30 13:07:07.893 | \n",
201 | " 55431832.0 | \n",
202 | " You need to place call to sanitize method in t... | \n",
203 | "
\n",
204 | " \n",
205 | " 2661378 | \n",
206 | " 13794532 | \n",
207 | " 13794740.0 | \n",
208 | " Python regular expression for Beautiful Soup | \n",
209 | " I am using Beautiful Soup to pull out specific... | \n",
210 | " [python, regex, beautifulsoup] | \n",
211 | " 10723 | \n",
212 | " 1 | \n",
213 | " 3 | \n",
214 | " 5 | \n",
215 | " 2012-12-10 03:18:14.743 | \n",
216 | " 13794740.0 | \n",
217 | " I think I've got it:\\n\\n[CODE]\\n\\nNotice that,... | \n",
218 | "
\n",
219 | " \n",
220 | " 2661390 | \n",
221 | " 25083943 | \n",
222 | " 25084142.0 | \n",
223 | " Search has no attribute teaser | \n",
224 | " I am trying to access teaser. I tried many dif... | \n",
225 | " [python, regex, json, python-3.x] | \n",
226 | " 62 | \n",
227 | " 1 | \n",
228 | " 2 | \n",
229 | " -2 | \n",
230 | " 2014-08-01 15:45:26.733 | \n",
231 | " 25084142.0 | \n",
232 | " Not exactly sure what you are trying to do but... | \n",
233 | "
\n",
234 | " \n",
235 | " 2661401 | \n",
236 | " 8221324 | \n",
237 | " 8221764.0 | \n",
238 | " Is there a reason the SQLAlchemy ORM tutorial ... | \n",
239 | " The SQLAlchemy ORM tutorial uses this class:\\n... | \n",
240 | " [python, sqlalchemy] | \n",
241 | " 834 | \n",
242 | " 3 | \n",
243 | " 1 | \n",
244 | " 4 | \n",
245 | " 2011-11-22 02:42:24.157 | \n",
246 | " 8221764.0 | \n",
247 | " Bear in mind that eval is not used too much; c... | \n",
248 | "
\n",
249 | " \n",
250 | " 2661415 | \n",
251 | " 57679429 | \n",
252 | " 57679695.0 | \n",
253 | " How can I turn a list of column names into a p... | \n",
254 | " I have a list of pandas column names (consisti... | \n",
255 | " [python, string, list, patsy] | \n",
256 | " 106 | \n",
257 | " 1 | \n",
258 | " 0 | \n",
259 | " 0 | \n",
260 | " 2019-08-27 17:11:24.390 | \n",
261 | " 57679695.0 | \n",
262 | " [CODE]\\n\\n[CODE]\\n | \n",
263 | "
\n",
264 | " \n",
265 | "
\n",
266 | "
219841 rows × 12 columns
\n",
267 | "
"
268 | ],
269 | "text/plain": [
270 | " Id AcceptedAnswerId \\\n",
271 | "1 15020895 NaN \n",
272 | "9 68487902 NaN \n",
273 | "15 61391327 NaN \n",
274 | "27 28852710 NaN \n",
275 | "29 24043029 NaN \n",
276 | "... ... ... \n",
277 | "2661376 55431749 55431832.0 \n",
278 | "2661378 13794532 13794740.0 \n",
279 | "2661390 25083943 25084142.0 \n",
280 | "2661401 8221324 8221764.0 \n",
281 | "2661415 57679429 57679695.0 \n",
282 | "\n",
283 | " Title \\\n",
284 | "1 Python int-byte efficient data structure \n",
285 | "9 Why does the Variance of Laplace very differen... \n",
286 | "15 Why input never ends \n",
287 | "27 Crashes with piecewise linear objective for gu... \n",
288 | "29 Python TypeError: plotdatehist() got an unexpe... \n",
289 | "... ... \n",
290 | "2661376 Handling exception returned by a method \n",
291 | "2661378 Python regular expression for Beautiful Soup \n",
292 | "2661390 Search has no attribute teaser \n",
293 | "2661401 Is there a reason the SQLAlchemy ORM tutorial ... \n",
294 | "2661415 How can I turn a list of column names into a p... \n",
295 | "\n",
296 | " QuestionBody \\\n",
297 | "1 i am currently storing key-values of type int-... \n",
298 | "9 TL;DR: How can I use skimage.filters.laplace(i... \n",
299 | "15 I have python 3.7 installed and I have this co... \n",
300 | "27 We have a complex optimization problem which i... \n",
301 | "29 apologies beforehand if this is a stupid quest... \n",
302 | "... ... \n",
303 | "2661376 I am calling a method that throws Valuerror ex... \n",
304 | "2661378 I am using Beautiful Soup to pull out specific... \n",
305 | "2661390 I am trying to access teaser. I tried many dif... \n",
306 | "2661401 The SQLAlchemy ORM tutorial uses this class:\\n... \n",
307 | "2661415 I have a list of pandas column names (consisti... \n",
308 | "\n",
309 | " Tags ViewCount \\\n",
310 | "1 [python, data-structures] 155 \n",
311 | "9 [python, opencv, image-processing, computer-vi... 391 \n",
312 | "15 [python, python-3.x, input] 104 \n",
313 | "27 [python, crash, gurobi, piecewise] 403 \n",
314 | "29 [python, typeerror] 419 \n",
315 | "... ... ... \n",
316 | "2661376 [python-3.x] 26 \n",
317 | "2661378 [python, regex, beautifulsoup] 10723 \n",
318 | "2661390 [python, regex, json, python-3.x] 62 \n",
319 | "2661401 [python, sqlalchemy] 834 \n",
320 | "2661415 [python, string, list, patsy] 106 \n",
321 | "\n",
322 | " AnswerCount CommentCount Score CreationDate AnswerId \\\n",
323 | "1 0 3 1 2013-02-22 09:33:26.360 NaN \n",
324 | "9 0 5 1 2021-07-22 15:50:34.220 NaN \n",
325 | "15 1 6 3 2020-04-23 15:43:03.497 NaN \n",
326 | "27 1 1 3 2015-03-04 10:58:16.370 NaN \n",
327 | "29 0 7 0 2014-06-04 16:42:32.257 NaN \n",
328 | "... ... ... ... ... ... \n",
329 | "2661376 1 2 1 2019-03-30 13:07:07.893 55431832.0 \n",
330 | "2661378 1 3 5 2012-12-10 03:18:14.743 13794740.0 \n",
331 | "2661390 1 2 -2 2014-08-01 15:45:26.733 25084142.0 \n",
332 | "2661401 3 1 4 2011-11-22 02:42:24.157 8221764.0 \n",
333 | "2661415 1 0 0 2019-08-27 17:11:24.390 57679695.0 \n",
334 | "\n",
335 | " AcceptedAnswerBody \n",
336 | "1 None \n",
337 | "9 None \n",
338 | "15 None \n",
339 | "27 None \n",
340 | "29 None \n",
341 | "... ... \n",
342 | "2661376 You need to place call to sanitize method in t... \n",
343 | "2661378 I think I've got it:\\n\\n[CODE]\\n\\nNotice that,... \n",
344 | "2661390 Not exactly sure what you are trying to do but... \n",
345 | "2661401 Bear in mind that eval is not used too much; c... \n",
346 | "2661415 [CODE]\\n\\n[CODE]\\n \n",
347 | "\n",
348 | "[219841 rows x 12 columns]"
349 | ]
350 | },
351 | "execution_count": 4,
352 | "metadata": {},
353 | "output_type": "execute_result"
354 | }
355 | ],
356 | "source": [
357 | "df_posts = pd.read_parquet(path_posts)\n",
358 | "df_posts"
359 | ]
360 | },
361 | {
362 | "cell_type": "code",
363 | "execution_count": null,
364 | "id": "bd200fc0-da3e-4a72-8fd2-2004d540691a",
365 | "metadata": {},
366 | "outputs": [],
367 | "source": []
368 | },
369 | {
370 | "cell_type": "code",
371 | "execution_count": null,
372 | "id": "d557f519-6249-4a00-ba28-0948db54405a",
373 | "metadata": {},
374 | "outputs": [],
375 | "source": []
376 | },
377 | {
378 | "cell_type": "code",
379 | "execution_count": 5,
380 | "id": "72242ee8-cc09-4ddd-ab0d-89f7ea0d1b78",
381 | "metadata": {},
382 | "outputs": [
383 | {
384 | "data": {
385 | "text/html": [
386 | "\n",
387 | "\n",
400 | "
\n",
401 | " \n",
402 | " \n",
403 | " | \n",
404 | " PostId | \n",
405 | " PostTitle | \n",
406 | " RelatedPostIds | \n",
407 | " RelatedPostTitles | \n",
408 | " num_candidates | \n",
409 | "
\n",
410 | " \n",
411 | " \n",
412 | " \n",
413 | " 1 | \n",
414 | " 3494593 | \n",
415 | " Shading a kernel density plot between two points. | \n",
416 | " [3494593, 14863744, 14094644, 16504452, 488531... | \n",
417 | " [Shading a kernel density plot between two poi... | \n",
418 | " 16 | \n",
419 | "
\n",
420 | " \n",
421 | " 2 | \n",
422 | " 37949409 | \n",
423 | " Dictionary in a numpy array? | \n",
424 | " [37949409, 47689224, 61517741] | \n",
425 | " [Dictionary in a numpy array?, How to access t... | \n",
426 | " 3 | \n",
427 | "
\n",
428 | " \n",
429 | " 8 | \n",
430 | " 19876079 | \n",
431 | " Cannot find module cv2 when using OpenCV | \n",
432 | " [19876079, 62443365, 64580641, 45606137, 60294... | \n",
433 | " [Cannot find module cv2 when using OpenCV, How... | \n",
434 | " 7 | \n",
435 | "
\n",
436 | " \n",
437 | " 12 | \n",
438 | " 35082143 | \n",
439 | " Error: package or namespace load failed for ‘car’ | \n",
440 | " [35082143, 65941744, 68515009, 56409535] | \n",
441 | " [Error: package or namespace load failed for ‘... | \n",
442 | " 4 | \n",
443 | "
\n",
444 | " \n",
445 | " 14 | \n",
446 | " 2673651 | \n",
447 | " inheritance from str or int | \n",
448 | " [2673651, 48465797, 3120562, 15085917, 3238350... | \n",
449 | " [inheritance from str or int, Inherited class ... | \n",
450 | " 15 | \n",
451 | "
\n",
452 | " \n",
453 | " ... | \n",
454 | " ... | \n",
455 | " ... | \n",
456 | " ... | \n",
457 | " ... | \n",
458 | " ... | \n",
459 | "
\n",
460 | " \n",
461 | " 33231 | \n",
462 | " 28419763 | \n",
463 | " Expand Text widget to fill the entire parent F... | \n",
464 | " [28419763, 48171462] | \n",
465 | " [Expand Text widget to fill the entire parent ... | \n",
466 | " 2 | \n",
467 | "
\n",
468 | " \n",
469 | " 33234 | \n",
470 | " 40332743 | \n",
471 | " Source code for str.split? | \n",
472 | " [40332743, 51355719] | \n",
473 | " [Source code for str.split?, where can I find ... | \n",
474 | " 2 | \n",
475 | "
\n",
476 | " \n",
477 | " 33241 | \n",
478 | " 27443414 | \n",
479 | " Cannot perform a backup or restore operation w... | \n",
480 | " [27443414, 53216877] | \n",
481 | " [Cannot perform a backup or restore operation ... | \n",
482 | " 2 | \n",
483 | "
\n",
484 | " \n",
485 | " 33243 | \n",
486 | " 48536681 | \n",
487 | " What is the exact meaning of stride's list in ... | \n",
488 | " [48536681, 47305022] | \n",
489 | " [What is the exact meaning of stride's list in... | \n",
490 | " 2 | \n",
491 | "
\n",
492 | " \n",
493 | " 33244 | \n",
494 | " 37814201 | \n",
495 | " pandas time shift from utc to local | \n",
496 | " [37814201, 52390647] | \n",
497 | " [pandas time shift from utc to local, Convert ... | \n",
498 | " 2 | \n",
499 | "
\n",
500 | " \n",
501 | "
\n",
502 | "
6114 rows × 5 columns
\n",
503 | "
"
504 | ],
505 | "text/plain": [
506 | " PostId PostTitle \\\n",
507 | "1 3494593 Shading a kernel density plot between two points. \n",
508 | "2 37949409 Dictionary in a numpy array? \n",
509 | "8 19876079 Cannot find module cv2 when using OpenCV \n",
510 | "12 35082143 Error: package or namespace load failed for ‘car’ \n",
511 | "14 2673651 inheritance from str or int \n",
512 | "... ... ... \n",
513 | "33231 28419763 Expand Text widget to fill the entire parent F... \n",
514 | "33234 40332743 Source code for str.split? \n",
515 | "33241 27443414 Cannot perform a backup or restore operation w... \n",
516 | "33243 48536681 What is the exact meaning of stride's list in ... \n",
517 | "33244 37814201 pandas time shift from utc to local \n",
518 | "\n",
519 | " RelatedPostIds \\\n",
520 | "1 [3494593, 14863744, 14094644, 16504452, 488531... \n",
521 | "2 [37949409, 47689224, 61517741] \n",
522 | "8 [19876079, 62443365, 64580641, 45606137, 60294... \n",
523 | "12 [35082143, 65941744, 68515009, 56409535] \n",
524 | "14 [2673651, 48465797, 3120562, 15085917, 3238350... \n",
525 | "... ... \n",
526 | "33231 [28419763, 48171462] \n",
527 | "33234 [40332743, 51355719] \n",
528 | "33241 [27443414, 53216877] \n",
529 | "33243 [48536681, 47305022] \n",
530 | "33244 [37814201, 52390647] \n",
531 | "\n",
532 | " RelatedPostTitles num_candidates \n",
533 | "1 [Shading a kernel density plot between two poi... 16 \n",
534 | "2 [Dictionary in a numpy array?, How to access t... 3 \n",
535 | "8 [Cannot find module cv2 when using OpenCV, How... 7 \n",
536 | "12 [Error: package or namespace load failed for ‘... 4 \n",
537 | "14 [inheritance from str or int, Inherited class ... 15 \n",
538 | "... ... ... \n",
539 | "33231 [Expand Text widget to fill the entire parent ... 2 \n",
540 | "33234 [Source code for str.split?, where can I find ... 2 \n",
541 | "33241 [Cannot perform a backup or restore operation ... 2 \n",
542 | "33243 [What is the exact meaning of stride's list in... 2 \n",
543 | "33244 [pandas time shift from utc to local, Convert ... 2 \n",
544 | "\n",
545 | "[6114 rows x 5 columns]"
546 | ]
547 | },
548 | "execution_count": 5,
549 | "metadata": {},
550 | "output_type": "execute_result"
551 | }
552 | ],
553 | "source": [
554 | "df_posts = pd.read_parquet(path_posts_related)\n",
555 | "df_posts"
556 | ]
557 | },
558 | {
559 | "cell_type": "code",
560 | "execution_count": 8,
561 | "id": "ba0dc292-3101-457f-b80c-5ce061118c09",
562 | "metadata": {},
563 | "outputs": [
564 | {
565 | "data": {
566 | "application/json": {
567 | "PostId": 3494593,
568 | "PostTitle": "Shading a kernel density plot between two points.",
569 | "RelatedPostIds": [
570 | 3494593,
571 | 14863744,
572 | 14094644,
573 | 16504452,
574 | 48853178,
575 | 36948624,
576 | 47308146,
577 | 34029811,
578 | 31215748,
579 | 29499914,
580 | 41484896,
581 | 7787114,
582 | 27189453,
583 | 23680729,
584 | 36224394,
585 | 18742693
586 | ],
587 | "RelatedPostTitles": [
588 | "Shading a kernel density plot between two points.",
589 | "adding percentile lines to a density plot",
590 | "draw the following shaded area in R",
591 | "color a portion of the normal distribution",
592 | "How can I shade the area under a curve?",
593 | "Shade area under a curve",
594 | "Shading a region under a PDF",
595 | "Fill different colors for each quantile in geom_density() of ggplot",
596 | "How to shade part of a density curve in ggplot (with no y axis data)",
597 | "r density plot - fill area under curve",
598 | "Fill negative value area below geom_line",
599 | "polygon in density plot?",
600 | "Shade (fill or color) area under density curve by quantile",
601 | "Partially fill density plot for area of interest",
602 | "Shade density plot to the left of vline?",
603 | "Shade an area in a R plot"
604 | ],
605 | "num_candidates": 16
606 | },
607 | "text/plain": [
608 | ""
609 | ]
610 | },
611 | "execution_count": 8,
612 | "metadata": {
613 | "application/json": {
614 | "expanded": false,
615 | "root": "root"
616 | }
617 | },
618 | "output_type": "execute_result"
619 | }
620 | ],
621 | "source": [
622 | "JSON ( df_posts.iloc[0].to_dict() )"
623 | ]
624 | },
625 | {
626 | "cell_type": "code",
627 | "execution_count": null,
628 | "id": "9fb7ab6f-08fa-4099-939d-edcb7beca230",
629 | "metadata": {},
630 | "outputs": [],
631 | "source": []
632 | },
633 | {
634 | "cell_type": "code",
635 | "execution_count": null,
636 | "id": "473e7e29-7a27-4030-aad3-c60c89dc19bd",
637 | "metadata": {},
638 | "outputs": [],
639 | "source": []
640 | },
641 | {
642 | "cell_type": "code",
643 | "execution_count": null,
644 | "id": "ce7fc618-3b9c-450e-a89f-576d47fba15e",
645 | "metadata": {},
646 | "outputs": [],
647 | "source": []
648 | },
649 | {
650 | "cell_type": "code",
651 | "execution_count": null,
652 | "id": "30a57006-3696-4a2d-82ca-726ee7c5b6b3",
653 | "metadata": {},
654 | "outputs": [],
655 | "source": []
656 | },
657 | {
658 | "cell_type": "markdown",
659 | "id": "29ebbeec-f1de-4d07-b603-917e5aa3928b",
660 | "metadata": {},
661 | "source": [
662 | "## Metrics"
663 | ]
664 | },
665 | {
666 | "cell_type": "code",
667 | "execution_count": 4,
668 | "id": "3c824225-1fe7-488a-a291-f8ade3f82a82",
669 | "metadata": {},
670 | "outputs": [
671 | {
672 | "data": {
673 | "text/plain": [
674 | "\u001b[0;31mType:\u001b[0m module\n",
675 | "\u001b[0;31mString form:\u001b[0m \n",
676 | "\u001b[0;31mFile:\u001b[0m ~/projects/search-engine-workshop/notebooks/metrics_utils.py\n",
677 | "\u001b[0;31mSource:\u001b[0m \n",
678 | "\u001b[0;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m\u001b[0m\n",
679 | "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n",
680 | "\u001b[0;34m\u001b[0m\u001b[0;32mdef\u001b[0m \u001b[0mprecision_at_k\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n",
681 | "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n",
682 | "\u001b[0;34m\u001b[0m \u001b[0mr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\n",
683 | "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n",
684 | "\u001b[0;34m\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0mk\u001b[0m \u001b[0;34m\u001b[0m\n",
685 | "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n",
686 | "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n",
687 | "\u001b[0;34m\u001b[0m\u001b[0;32mdef\u001b[0m \u001b[0mmean_reciprocal_rank\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n",
688 | "\u001b[0;34m\u001b[0m \u001b[0mmrr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\n",
689 | "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n",
690 | "\u001b[0;34m\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n",
691 | "\u001b[0;34m\u001b[0m \u001b[0mfirst_index\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n",
692 | "\u001b[0;34m\u001b[0m \u001b[0mmrr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m1\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mfirst_index\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n",
693 | "\u001b[0;34m\u001b[0m \u001b[0;32mexcept\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n",
694 | "\u001b[0;34m\u001b[0m \u001b[0;32mpass\u001b[0m\u001b[0;34m\u001b[0m\n",
695 | "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n",
696 | "\u001b[0;34m\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mmrr\u001b[0m\u001b[0;34m\u001b[0m\n",
697 | "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n",
698 | "\u001b[0;34m\u001b[0m\u001b[0;32mdef\u001b[0m \u001b[0maverage_precision\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n",
699 | "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n",
700 | "\u001b[0;34m\u001b[0m \u001b[0mout\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\n",
701 | "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n",
702 | "\u001b[0;34m\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0midx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n",
703 | "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n",
704 | "\u001b[0;34m\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0midx\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n",
705 | "\u001b[0;34m\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mprecision_at_k\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0midx\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n",
706 | "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n",
707 | "\u001b[0;34m\u001b[0m \u001b[0map\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\n",
708 | "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n",
709 | "\u001b[0;34m\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n",
710 | "\u001b[0;34m\u001b[0m \u001b[0map\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n",
711 | "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n",
712 | "\u001b[0;34m\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0map\u001b[0m\u001b[0;34m\u001b[0m\n",
713 | "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n",
714 | "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n",
715 | "\u001b[0;34m\u001b[0m\u001b[0;32mdef\u001b[0m \u001b[0mall_metrics\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n",
716 | "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n",
717 | "\u001b[0;34m\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m\u001b[0m\n",
718 | "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n",
719 | "\u001b[0;34m\u001b[0m \u001b[0mres\u001b[0m\u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m\u001b[0m\n",
720 | "\u001b[0;34m\u001b[0m \u001b[0;34m\"p@1\"\u001b[0m \u001b[0;34m:\u001b[0m \u001b[0mprecision_at_k\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n",
721 | "\u001b[0;34m\u001b[0m \u001b[0;34m,\u001b[0m \u001b[0;34m\"p@5\"\u001b[0m \u001b[0;34m:\u001b[0m \u001b[0mprecision_at_k\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n",
722 | "\u001b[0;34m\u001b[0m \u001b[0;34m,\u001b[0m \u001b[0;34m\"p@10\"\u001b[0m \u001b[0;34m:\u001b[0m \u001b[0mprecision_at_k\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n",
723 | "\u001b[0;34m\u001b[0m \u001b[0;34m,\u001b[0m \u001b[0;34m\"mrr\"\u001b[0m \u001b[0;34m:\u001b[0m \u001b[0mmean_reciprocal_rank\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n",
724 | "\u001b[0;34m\u001b[0m \u001b[0;34m,\u001b[0m \u001b[0;34m\"map\"\u001b[0m \u001b[0;34m:\u001b[0m \u001b[0maverage_precision\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n",
725 | "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n",
726 | "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n",
727 | "\u001b[0;34m\u001b[0m \u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\n",
728 | "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n",
729 | "\u001b[0;34m\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mres\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n"
730 | ]
731 | },
732 | "metadata": {},
733 | "output_type": "display_data"
734 | }
735 | ],
736 | "source": [
737 | "??metrics_utils"
738 | ]
739 | },
740 | {
741 | "cell_type": "code",
742 | "execution_count": null,
743 | "id": "9b37a9b9-ab34-4152-88af-22728c8758a9",
744 | "metadata": {},
745 | "outputs": [],
746 | "source": []
747 | },
748 | {
749 | "cell_type": "markdown",
750 | "id": "90285ffa-4312-4ea8-84a6-595199688140",
751 | "metadata": {},
752 | "source": [
753 | "relevant result at the end"
754 | ]
755 | },
756 | {
757 | "cell_type": "code",
758 | "execution_count": 11,
759 | "id": "356a2b4a-6f3d-42df-bf65-7796bc29c7d9",
760 | "metadata": {},
761 | "outputs": [
762 | {
763 | "data": {
764 | "text/plain": [
765 | "{'p@1': 0.0, 'p@5': 0.2, 'p@10': 0.1, 'mrr': 0.2, 'map': 0.2}"
766 | ]
767 | },
768 | "execution_count": 11,
769 | "metadata": {},
770 | "output_type": "execute_result"
771 | }
772 | ],
773 | "source": [
774 | "metrics_utils.all_metrics([0,0,0,0,1])"
775 | ]
776 | },
777 | {
778 | "cell_type": "markdown",
779 | "id": "59f9f574-e506-45e0-9c4c-c65a2b3827eb",
780 | "metadata": {},
781 | "source": [
782 | "relevant result at the beginning"
783 | ]
784 | },
785 | {
786 | "cell_type": "code",
787 | "execution_count": 12,
788 | "id": "8252bfbc-7184-437b-91e6-b60d166a9742",
789 | "metadata": {},
790 | "outputs": [
791 | {
792 | "data": {
793 | "text/plain": [
794 | "{'p@1': 1.0, 'p@5': 0.2, 'p@10': 0.1, 'mrr': 1.0, 'map': 1.0}"
795 | ]
796 | },
797 | "execution_count": 12,
798 | "metadata": {},
799 | "output_type": "execute_result"
800 | }
801 | ],
802 | "source": [
803 | "metrics_utils.all_metrics([1,0,0,0,0])"
804 | ]
805 | },
806 | {
807 | "cell_type": "code",
808 | "execution_count": 13,
809 | "id": "196acac3-a263-4307-8ef9-075e7492870c",
810 | "metadata": {},
811 | "outputs": [
812 | {
813 | "data": {
814 | "text/plain": [
815 | "0.2"
816 | ]
817 | },
818 | "execution_count": 13,
819 | "metadata": {},
820 | "output_type": "execute_result"
821 | }
822 | ],
823 | "source": []
824 | },
825 | {
826 | "cell_type": "markdown",
827 | "id": "e3b1413a-81a2-4a7d-9a46-ac6c9938b17e",
828 | "metadata": {},
829 | "source": [
830 | "map captures that the relevant results are shown at the beginning"
831 | ]
832 | },
833 | {
834 | "cell_type": "code",
835 | "execution_count": 14,
836 | "id": "ad52c3ad-952a-4340-87bd-d20369cb420d",
837 | "metadata": {},
838 | "outputs": [
839 | {
840 | "data": {
841 | "text/plain": [
842 | "{'p@1': 0.0,\n",
843 | " 'p@5': 0.4,\n",
844 | " 'p@10': 0.2,\n",
845 | " 'mrr': 0.3333333333333333,\n",
846 | " 'map': 0.41666666666666663}"
847 | ]
848 | },
849 | "execution_count": 14,
850 | "metadata": {},
851 | "output_type": "execute_result"
852 | }
853 | ],
854 | "source": [
855 | "metrics_utils.all_metrics([0,0,1,1,0])"
856 | ]
857 | },
858 | {
859 | "cell_type": "code",
860 | "execution_count": 15,
861 | "id": "8f61fc8e-7292-43dc-8f29-501d7fee8876",
862 | "metadata": {},
863 | "outputs": [
864 | {
865 | "data": {
866 | "text/plain": [
867 | "{'p@1': 1.0, 'p@5': 0.4, 'p@10': 0.2, 'mrr': 1.0, 'map': 1.0}"
868 | ]
869 | },
870 | "execution_count": 15,
871 | "metadata": {},
872 | "output_type": "execute_result"
873 | }
874 | ],
875 | "source": [
876 | "metrics_utils.all_metrics([1,1,0,0,0])"
877 | ]
878 | },
879 | {
880 | "cell_type": "code",
881 | "execution_count": null,
882 | "id": "8505c6aa-d009-4dea-9263-38ca4d9f2c4b",
883 | "metadata": {},
884 | "outputs": [],
885 | "source": []
886 | }
887 | ],
888 | "metadata": {
889 | "environment": {
890 | "kernel": "python3",
891 | "name": "pytorch-gpu.1-12.m99",
892 | "type": "gcloud",
893 | "uri": "gcr.io/deeplearning-platform-release/pytorch-gpu.1-12:m99"
894 | },
895 | "kernelspec": {
896 | "display_name": "Python 3",
897 | "language": "python",
898 | "name": "python3"
899 | },
900 | "language_info": {
901 | "codemirror_mode": {
902 | "name": "ipython",
903 | "version": 3
904 | },
905 | "file_extension": ".py",
906 | "mimetype": "text/x-python",
907 | "name": "python",
908 | "nbconvert_exporter": "python",
909 | "pygments_lexer": "ipython3",
910 | "version": "3.7.12"
911 | }
912 | },
913 | "nbformat": 4,
914 | "nbformat_minor": 5
915 | }
916 |
--------------------------------------------------------------------------------
/archive/notebooks_stackoverflow/metrics_utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
def precision_at_k(r, k):
    """Compute precision at cutoff ``k`` for a list of relevance flags.

    Args:
        r: Sliceable sequence of relevance flags (0/1 or booleans) in
            ranked order.
        k: Cutoff rank. It is also used as the denominator, so a list
            shorter than ``k`` is scored as if the missing positions were
            not relevant.

    Returns:
        The sum of the first ``k`` flags divided by ``k``.
    """
    top_k = r[:k]
    hits = sum(top_k)
    return hits / k
8 |
9 |
def mean_reciprocal_rank(r):
    """Return the reciprocal rank of the first relevant entry in ``r``.

    Despite the name, this scores a single result list; averaging over
    queries is left to the caller.

    Args:
        r: Iterable of relevance flags (0/1 or booleans) in ranked order.
            Any iterable is accepted, not only lists. ``None`` is treated
            as an empty result list.

    Returns:
        ``1 / rank`` of the first truthy entry (ranks start at 1), or ``0``
        when no entry is relevant.
    """
    if r is None:
        return 0

    # Scan by truthiness, the same relevance test average_precision uses,
    # instead of list.index(True) wrapped in a bare ``except``: that form
    # silently returned 0 for any input lacking ``.index`` and hid real errors.
    for rank, relevant in enumerate(r, start=1):
        if relevant:
            return 1 / rank

    return 0
20 |
def average_precision(r):
    """Compute average precision for one ranked list of relevance flags.

    Precision is evaluated at the rank of every relevant entry and the
    values are averaged over the relevant entries found in ``r``.

    Args:
        r: Iterable of relevance flags (0/1 or booleans) in ranked order.

    Returns:
        The mean of the precision values taken at each relevant position,
        or ``0`` when ``r`` contains no relevant entry.
    """
    running_relevance = 0
    precisions = []

    # A running total replaces re-slicing and re-summing the prefix at every
    # hit, turning the quadratic scan into a single pass with the same values.
    for rank, relevance in enumerate(r, start=1):
        if relevance:
            running_relevance += relevance
            precisions.append(running_relevance / rank)

    if not precisions:
        return 0

    return sum(precisions) / len(precisions)
36 |
37 |
def all_metrics(result):
    """Bundle the ranking metrics for one result list into a dict.

    Args:
        result: Iterable of relevance flags in ranked order; it is copied
            into a list before scoring.

    Returns:
        Dict with keys ``"p@1"``, ``"p@5"``, ``"p@10"``, ``"mrr"`` and
        ``"map"``, in that order.
    """
    ranked = list(result)

    # Precision cutoffs reported, paired with their output keys.
    cutoffs = (("p@1", 1), ("p@5", 5), ("p@10", 10))

    metrics = {label: precision_at_k(ranked, k) for label, k in cutoffs}
    metrics["mrr"] = mean_reciprocal_rank(ranked)
    metrics["map"] = average_precision(ranked)
    return metrics
--------------------------------------------------------------------------------
/archive/notebooks_stackoverflow/test_setup.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "ad224cf0-176b-4460-afc0-03f0aacdfe71",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import datetime\n",
11 | "import pickle\n",
12 | "import uuid\n",
13 | "import datetime\n",
14 | "import numpy as np\n",
15 | "import time"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": null,
21 | "id": "9d7d74e6-1472-4f92-b582-fb74683a252e",
22 | "metadata": {},
23 | "outputs": [],
24 | "source": []
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "id": "aae437e9-d66c-44a8-ab44-4523f7abb5b1",
29 | "metadata": {},
30 | "source": [
31 | "# Elasticsearch"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "id": "b7831c3d-d97c-4027-bfbd-26b4f672b003",
38 | "metadata": {},
39 | "outputs": [],
40 | "source": [
41 | "import elasticsearch"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": null,
47 | "id": "69aff282-e9b0-466b-b828-87b69e3dcbc1",
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "elasticsearch.__version__"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "id": "07c82415-1cd2-43e8-a88c-626eac3dea04",
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "from elasticsearch import Elasticsearch"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "id": "5e2563f6-5723-4aa6-a120-f4764c4d8b07",
68 | "metadata": {},
69 | "outputs": [],
70 | "source": [
71 | "es = Elasticsearch(hosts=\"http://localhost:9200\" , verify_certs=False)"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": null,
77 | "id": "4f00be9b-8c7d-4993-9a75-2ed9f5a94b37",
78 | "metadata": {},
79 | "outputs": [],
80 | "source": [
81 | "index_name = \"test-index\""
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": null,
87 | "id": "056b1007-5cc8-4a40-ab59-ffda53e269d2",
88 | "metadata": {},
89 | "outputs": [],
90 | "source": [
91 | "doc = {\n",
92 | " 'author': 'kimchy',\n",
93 | " 'text': 'Elasticsearch: cool. bonsai cool.',\n",
94 | " 'timestamp': datetime.datetime.now(),\n",
95 | "}"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": null,
101 | "id": "e994e955-f3ea-47f4-95e6-994390e5403e",
102 | "metadata": {},
103 | "outputs": [],
104 | "source": [
105 | "resp = es.index(index=index_name, id=1, document=doc)\n"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": null,
111 | "id": "6ecf186b-458a-4bf6-9ed1-ff1030b72f50",
112 | "metadata": {},
113 | "outputs": [],
114 | "source": [
115 | "print(resp['result'])\n",
116 | "\n",
117 | "resp = es.get(index=index_name, id=1)\n",
118 | "print(resp['_source'])\n",
119 | "\n",
120 | "es.indices.refresh(index=index_name)\n",
121 | "\n",
122 | "resp = es.search(index=index_name, query={\"match_all\": {}})\n",
123 | "print(\"Got %d Hits:\" % resp['hits']['total']['value'])\n",
124 | "for hit in resp['hits']['hits']:\n",
125 | " print(\"%(timestamp)s %(author)s: %(text)s\" % hit[\"_source\"])"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": null,
131 | "id": "ec1b445b-6cc5-458d-bc9b-5b371c82d1a5",
132 | "metadata": {},
133 | "outputs": [],
134 | "source": []
135 | },
136 | {
137 | "cell_type": "markdown",
138 | "id": "cf0dd18f-8fde-4a72-b226-f58ab22c4520",
139 | "metadata": {},
140 | "source": [
141 | "# Milvus"
142 | ]
143 | },
144 | {
145 | "cell_type": "markdown",
146 | "id": "53965619-9c81-472e-9110-7c4e2b1cc3a7",
147 | "metadata": {},
148 | "source": [
149 | "https://github.com/milvus-io/pymilvus/blob/master/examples/hello_milvus.ipynb"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": null,
155 | "id": "8c384da7-3a72-4b9e-bacf-4e188af6a3b4",
156 | "metadata": {},
157 | "outputs": [],
158 | "source": [
159 | "from pymilvus import (\n",
160 | " connections,\n",
161 | " utility,\n",
162 | " FieldSchema, CollectionSchema, DataType,\n",
163 | " Collection,\n",
164 | ")\n"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": null,
170 | "id": "c4c65116-21fa-46da-ba1c-231bf2a7569a",
171 | "metadata": {},
172 | "outputs": [],
173 | "source": [
174 | "num_entities, dim = 3000, 8\n"
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": null,
180 | "id": "ee5a1a32-5735-4951-8724-9a773d036ea6",
181 | "metadata": {},
182 | "outputs": [],
183 | "source": [
184 | "collection_name=\"hello_milvus\""
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": null,
190 | "id": "e92ce9b8-cbb3-42d2-ad8b-791d21729d3c",
191 | "metadata": {},
192 | "outputs": [],
193 | "source": [
194 | "!ls"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": null,
200 | "id": "55e5a4ad-9150-4b86-9cb6-f968b7fe51fd",
201 | "metadata": {},
202 | "outputs": [],
203 | "source": [
204 | "connections.connect(\"default\", host=\"localhost\", port=\"19530\")\n"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": null,
210 | "id": "ae3415ca-a52f-4319-b20c-d08f49bd06e7",
211 | "metadata": {},
212 | "outputs": [],
213 | "source": [
214 | "if utility.has_collection(collection_name):\n",
215 | " utility.drop_collection(collection_name)"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": null,
221 | "id": "a4324ba0-4050-4b3c-9ba6-6e591177509d",
222 | "metadata": {},
223 | "outputs": [],
224 | "source": [
225 | "utility.list_collections()"
226 | ]
227 | },
228 | {
229 | "cell_type": "markdown",
230 | "id": "c070eb28-fa18-43ad-a2de-1067b2476274",
231 | "metadata": {},
232 | "source": [
233 | "create collection"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": null,
239 | "id": "5e08cee9-e957-44b5-856f-c4549faa7b86",
240 | "metadata": {},
241 | "outputs": [],
242 | "source": [
243 | "fields = [\n",
244 | " FieldSchema(name=\"pk\", dtype=DataType.VARCHAR, is_primary=True, auto_id=False, max_length=100),\n",
245 | " FieldSchema(name=\"random\", dtype=DataType.DOUBLE),\n",
246 | " FieldSchema(name=\"embeddings\", dtype=DataType.FLOAT_VECTOR, dim=dim)\n",
247 | "]\n",
248 | "\n",
249 | "schema = CollectionSchema(fields, \"hello_milvus is the simplest demo to introduce the APIs\")\n",
250 | "\n",
251 | "hello_milvus = Collection(collection_name, schema, consistency_level=\"Strong\")"
252 | ]
253 | },
254 | {
255 | "cell_type": "code",
256 | "execution_count": null,
257 | "id": "8eff6925-4089-46c3-ae03-00fc748c3135",
258 | "metadata": {},
259 | "outputs": [],
260 | "source": []
261 | },
262 | {
263 | "cell_type": "code",
264 | "execution_count": null,
265 | "id": "99ee18c0-fb04-4970-8ba5-e36eca35680a",
266 | "metadata": {},
267 | "outputs": [],
268 | "source": [
269 | "rng = np.random.default_rng(seed=19530)\n",
270 | "entities = [\n",
271 | " # provide the pk field because `auto_id` is set to False\n",
272 | " [str(i) for i in range(num_entities)],\n",
273 | " rng.random(num_entities).tolist(), # field random, only supports list\n",
274 | " rng.random((num_entities, dim)), # field embeddings, supports numpy.ndarray and list\n",
275 | "]\n",
276 | "\n",
277 | "insert_result = hello_milvus.insert(entities)\n",
278 | "\n",
279 | "print(f\"Number of entities in Milvus: {hello_milvus.num_entities}\") # check the num_entities"
280 | ]
281 | },
282 | {
283 | "cell_type": "code",
284 | "execution_count": null,
285 | "id": "ede3539d-e940-458d-a7d9-6c345178f357",
286 | "metadata": {},
287 | "outputs": [],
288 | "source": []
289 | },
290 | {
291 | "cell_type": "code",
292 | "execution_count": null,
293 | "id": "c272a614-8baa-4b3f-b77d-499aa30760d7",
294 | "metadata": {},
295 | "outputs": [],
296 | "source": [
297 | "index = {\n",
298 | " \"index_type\": \"IVF_FLAT\",\n",
299 | " \"metric_type\": \"L2\",\n",
300 | " \"params\": {\"nlist\": 128},\n",
301 | "}\n",
302 | "\n",
303 | "hello_milvus.create_index(\"embeddings\", index)"
304 | ]
305 | },
306 | {
307 | "cell_type": "code",
308 | "execution_count": null,
309 | "id": "b2865bb6-854a-41a3-97f1-98634f9b57e4",
310 | "metadata": {},
311 | "outputs": [],
312 | "source": [
313 | "hello_milvus.load()\n"
314 | ]
315 | },
316 | {
317 | "cell_type": "code",
318 | "execution_count": null,
319 | "id": "0e8e8b2f-a00e-49a0-bbc9-cb99fecb0e7a",
320 | "metadata": {},
321 | "outputs": [],
322 | "source": [
323 | "vectors_to_search = entities[-1][-2:]\n",
324 | "search_params = {\n",
325 | " \"metric_type\": \"L2\",\n",
326 | " \"params\": {\"nprobe\": 10},\n",
327 | "}\n",
328 | "\n",
329 | "start_time = time.time()\n",
330 | "result = hello_milvus.search(vectors_to_search, \"embeddings\", search_params, limit=3, output_fields=[\"random\"])\n",
331 | "end_time = time.time()\n",
332 | "\n",
333 | "for hits in result:\n",
334 | " for hit in hits:\n",
335 | " print(f\"hit: {hit}, random field: {hit.entity.get('random')}\")\n",
336 | "print((end_time - start_time))"
337 | ]
338 | },
339 | {
340 | "cell_type": "code",
341 | "execution_count": null,
342 | "id": "385d0057-cc9e-4ee3-834e-bec4675cbb96",
343 | "metadata": {},
344 | "outputs": [],
345 | "source": []
346 | },
347 | {
348 | "cell_type": "code",
349 | "execution_count": null,
350 | "id": "70601713-0913-4ccc-9efd-9058397d1266",
351 | "metadata": {},
352 | "outputs": [],
353 | "source": []
354 | },
355 | {
356 | "cell_type": "code",
357 | "execution_count": null,
358 | "id": "4ef6e0b5-1975-422f-9fdd-6e45f2e0917a",
359 | "metadata": {},
360 | "outputs": [],
361 | "source": []
362 | },
363 | {
364 | "cell_type": "markdown",
365 | "id": "493a4373-3de7-4cff-9b5f-c0b7d0288506",
366 | "metadata": {},
367 | "source": [
368 | "# Weaviate"
369 | ]
370 | },
371 | {
372 | "cell_type": "code",
373 | "execution_count": null,
374 | "id": "27b9d308-face-444b-bf3c-5d3bec0072bb",
375 | "metadata": {},
376 | "outputs": [],
377 | "source": [
378 | "#!pip install weaviate-client==3.8.0"
379 | ]
380 | },
381 | {
382 | "cell_type": "code",
383 | "execution_count": null,
384 | "id": "8f818cf1-ed3b-478f-bf5b-b513043ebcb1",
385 | "metadata": {},
386 | "outputs": [],
387 | "source": [
388 | "import weaviate\n"
389 | ]
390 | },
391 | {
392 | "cell_type": "code",
393 | "execution_count": null,
394 | "id": "595ed265-609c-41be-b9f9-eadfb9820a2f",
395 | "metadata": {},
396 | "outputs": [],
397 | "source": [
398 | "def generate_uuid(class_name: str, identifier: str,\n",
399 | " test: str = 'teststrong') -> str:\n",
400 | " \"\"\" Generate a uuid based on an identifier\n",
401 | " :param identifier: characters used to generate the uuid\n",
402 | " :type identifier: str, required\n",
403 | " :param class_name: classname of the object to create a uuid for\n",
404 | " :type class_name: str, required\n",
405 | " \"\"\"\n",
406 | " test = 'overwritten'\n",
407 | " return str(uuid.uuid5(uuid.NAMESPACE_DNS, class_name + identifier))\n",
408 | "\n",
409 | "def log(i: str) -> str:\n",
410 | " \"\"\" A simple logger\n",
411 | " :param i: the log message\n",
412 | " :type i: str\n",
413 | " \"\"\"\n",
414 | " now = datetime.datetime.utcnow()\n",
415 | " print(now, \"| \" + str(i))"
416 | ]
417 | },
418 | {
419 | "cell_type": "code",
420 | "execution_count": null,
421 | "id": "0cd92cf1-a8f6-4f71-ab1b-008d17e659b3",
422 | "metadata": {},
423 | "outputs": [],
424 | "source": [
425 | "client = weaviate.Client(\"http://localhost:8081\")\n",
426 | "print(\"Client created\")"
427 | ]
428 | },
429 | {
430 | "cell_type": "code",
431 | "execution_count": null,
432 | "id": "3645408d-9a51-41b9-aa02-afd549cacdc3",
433 | "metadata": {},
434 | "outputs": [],
435 | "source": [
436 | "from sentence_transformers import SentenceTransformer\n",
437 | "sbert_model = SentenceTransformer('bert-base-nli-mean-tokens') #, Initially load using this, then start using pickle to save time."
438 | ]
439 | },
440 | {
441 | "cell_type": "code",
442 | "execution_count": null,
443 | "id": "b1499af5-7259-4e05-afca-117a12dfb659",
444 | "metadata": {},
445 | "outputs": [],
446 | "source": [
447 | "# from sentence_transformers import SentenceTransformer\n",
448 | "# # sbert_model = SentenceTransformer('bert-base-nli-mean-tokens'), Initially load using this, then start using pickle to save time.\n",
449 | "# with open(\"sbert\",'rb') as f:\n",
450 | "# sbert_model = pickle.load(f)\n",
451 | "\n",
452 | "print(\"sbert loaded\")\n",
453 | "\n",
454 | "# I am adding the texts in this list,\n",
455 | "# We can also add sentences of a large text individually to get more precise results when we query.\n",
456 | "documents = [\n",
457 | " '''Taj mahal is an immense mausoleum of white marble, built in Agra between 1631 and 1648 by order of the Mughal emperor Shah Jahan in memory of his favourite wife, the Taj Mahal is the jewel of Muslim art in India and one of the universally admired masterpieces of the world's heritage.''',\n",
458 | " '''The Statue of Liberty is a 305-foot (93-metre) statue located on Liberty Island in Upper New York Bay, off the coast of New York City. The statue is a personification of liberty in the form of a woman. She holds a torch in her raised right hand and clutches a tablet in her left.''',\n",
459 | " '''The Statue of Liberty was sculpted between 1875 and 1884 under the direction of French sculptor Frédéric-Auguste Bartholdi, who began drafting designs in 1870. Bartholdi and his team hammered roughly 31 tons of copper sheets onto a steel frame. Before being mounted on its current pedestal, the statue stood over 151 feet (46 metres) tall and weighed 225 tons.''',\n",
460 | " '''Badminton is a racquet sport played using racquets to hit a shuttlecock across a net. Although it may be played with larger teams, the most common forms of the game are \"singles\" (with one player per side) and \"doubles\" (with two players per side). Badminton is often played as a casual outdoor activity in a yard or on a beach; formal games are played on a rectangular indoor court. Points are scored by striking the shuttlecock with the racquet and landing it within the opposing side's half of the court.''',\n",
461 | " '''James Bond is a fictional character created by novelist Ian Fleming in 1953.''',\n",
462 | " '''A British secret agent working for MI6 under the codename 007, he has been portrayed on film by actors Sean Connery, David Niven, George Lazenby, Roger Moore, Timothy Dalton, Pierce Brosnan and Daniel Craig in twenty-seven productions.'''\n",
463 | "]\n",
464 | "\n",
465 | "# A dictionary to store the document and its feature vector (the vector generated by SBERT)\n",
466 | "doc_and_vec = {}\n",
467 | "\n",
468 | "def giveVector(texts):\n",
469 | " # this function returns the vector using SBERT\n",
470 | " return sbert_model.encode(texts)\n",
471 | "\n",
472 | "vectors = giveVector(documents)\n",
473 | "\n",
474 | "for doc,vec in zip(documents,vectors):\n",
475 | " doc_and_vec[doc] = vec\n",
476 | "\n",
477 | "print(\"vectors formed\")\n",
478 | "\n",
479 | "client.schema.delete_all()\n",
480 | "class_obj = {\n",
481 | " \"class\": \"Post\",\n",
482 | " \"vectorizer\": \"none\", # we are providing the vectors ourselves through our SBERT model, so this field is none\n",
483 | " \"properties\": [{\n",
484 | " \"name\": \"content\",\n",
485 | " \"dataType\": [\"text\"],\n",
486 | " }]\n",
487 | "}\n",
488 | "\n",
489 | "client.schema.create_class(class_obj)\n",
490 | "print(\"Schema class created\")\n",
491 | "\n",
492 | "for doc,vec in doc_and_vec.items():\n",
493 | " data_obj = {\n",
494 | " \"content\": doc\n",
495 | " }\n",
496 | " client.data_object.create(\n",
497 | " data_obj,\n",
498 | " \"Post\",\n",
499 | " generate_uuid('Post',doc),\n",
500 | " vector = vec,\n",
501 | " )\n",
502 | "print(\"Finished importing data\")\n",
503 | "\n",
504 | "def process_query(vec):\n",
505 | " nearVector = {\"vector\": vec}\n",
506 | " res = client.query.get(\"Post\", [\"content\", \"_additional {certainty}\"]).with_near_vector(nearVector).do()\n",
507 | " print(res)\n",
508 | " print(\"------------------------------------------------------------------------------------------------\")\n",
509 | " print(\"-----------------------------------Most similar text -------------------------------------------\")\n",
510 | " print(res['data']['Get']['Post'][0]['content'])\n",
511 | " print(\"------------------------------------------------------------------------------------------------\")\n",
512 | " print(res['data']['Get']['Post'][1]['content'])\n",
513 | " print(\"------------------------------------------------------------------------------------------------\")\n",
514 | "\n",
515 | " \n"
516 | ]
517 | },
518 | {
519 | "cell_type": "code",
520 | "execution_count": null,
521 | "id": "065e4517-d430-48c7-89ae-9cd29c3a31f3",
522 | "metadata": {},
523 | "outputs": [],
524 | "source": [
525 | "query =\"american tourist destination\"\n",
526 | "query_vec = sbert_model.encode(query)\n",
527 | "process_query(query_vec)\n"
528 | ]
529 | },
530 | {
531 | "cell_type": "code",
532 | "execution_count": null,
533 | "id": "1a232068-a38a-490b-a329-d5b5e773174b",
534 | "metadata": {},
535 | "outputs": [],
536 | "source": []
537 | }
538 | ],
539 | "metadata": {
540 | "environment": {
541 | "kernel": "python3",
542 | "name": "pytorch-gpu.1-12.m99",
543 | "type": "gcloud",
544 | "uri": "gcr.io/deeplearning-platform-release/pytorch-gpu.1-12:m99"
545 | },
546 | "kernelspec": {
547 | "display_name": "Python 3",
548 | "language": "python",
549 | "name": "python3"
550 | },
551 | "language_info": {
552 | "codemirror_mode": {
553 | "name": "ipython",
554 | "version": 3
555 | },
556 | "file_extension": ".py",
557 | "mimetype": "text/x-python",
558 | "name": "python",
559 | "nbconvert_exporter": "python",
560 | "pygments_lexer": "ipython3",
561 | "version": "3.7.12"
562 | }
563 | },
564 | "nbformat": 4,
565 | "nbformat_minor": 5
566 | }
567 |
--------------------------------------------------------------------------------
/archive/notebooks_stackoverflow/workshop_setup.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "id": "92773bd8-4d3e-47d0-af4f-52216bb43465",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": []
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": 1,
14 | "id": "55d91d10-03a4-46ad-b011-a1cfd22ab1e7",
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "from sentence_transformers import SentenceTransformer, CrossEncoder, util\n",
19 | "import os"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": null,
25 | "id": "1cd6384a-b5e3-4d82-9a07-35b821524321",
26 | "metadata": {},
27 | "outputs": [],
28 | "source": []
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": null,
33 | "id": "9451b070-e18c-46c3-a012-6e2878cc26f4",
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "model = SentenceTransformer('flax-sentence-embeddings/stackoverflow_mpnet-base')\n"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": null,
43 | "id": "f414975a-3dc2-4eb6-9b3b-bd24be4d18a4",
44 | "metadata": {},
45 | "outputs": [],
46 | "source": []
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": null,
51 | "id": "24f40c4c-a183-4d84-b70e-4a1a86a229ee",
52 | "metadata": {},
53 | "outputs": [],
54 | "source": [
55 | "bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')\n"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": null,
61 | "id": "8983c00d-214d-4df3-b024-154b2105ace5",
62 | "metadata": {},
63 | "outputs": [],
64 | "source": [
65 | "cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')\n"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "id": "1770345e-6eff-4761-bf39-64020967cc51",
72 | "metadata": {},
73 | "outputs": [],
74 | "source": []
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": null,
79 | "id": "a8834f31-56cf-4760-b0e6-c86e3b8efd39",
80 | "metadata": {},
81 | "outputs": [],
82 | "source": [
83 | "wikipedia_filepath = 'simplewiki-2020-11-01.jsonl.gz'\n",
84 | "\n",
85 | "if not os.path.exists(wikipedia_filepath):\n",
86 | " util.http_get('http://sbert.net/datasets/simplewiki-2020-11-01.jsonl.gz', wikipedia_filepath)\n"
87 | ]
88 | }
89 | ],
90 | "metadata": {
91 | "environment": {
92 | "kernel": "python3",
93 | "name": "pytorch-gpu.1-12.m99",
94 | "type": "gcloud",
95 | "uri": "gcr.io/deeplearning-platform-release/pytorch-gpu.1-12:m99"
96 | },
97 | "kernelspec": {
98 | "display_name": "Python 3",
99 | "language": "python",
100 | "name": "python3"
101 | },
102 | "language_info": {
103 | "codemirror_mode": {
104 | "name": "ipython",
105 | "version": 3
106 | },
107 | "file_extension": ".py",
108 | "mimetype": "text/x-python",
109 | "name": "python",
110 | "nbconvert_exporter": "python",
111 | "pygments_lexer": "ipython3",
112 | "version": "3.7.12"
113 | }
114 | },
115 | "nbformat": 4,
116 | "nbformat_minor": 5
117 | }
118 |
--------------------------------------------------------------------------------
/assets/all_assets.sw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/npatta01/search-engine-workshop/d8d4d1e6234f29c3a158b6343b06701728be92ab/assets/all_assets.sw
--------------------------------------------------------------------------------
/assets/slides_odsc2022.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/npatta01/search-engine-workshop/d8d4d1e6234f29c3a158b6343b06701728be92ab/assets/slides_odsc2022.pdf
--------------------------------------------------------------------------------
/assets/slides_pydatanyc2022.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/npatta01/search-engine-workshop/d8d4d1e6234f29c3a158b6343b06701728be92ab/assets/slides_pydatanyc2022.pdf
--------------------------------------------------------------------------------
/assets/slides_pydataseattle2023.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/npatta01/search-engine-workshop/d8d4d1e6234f29c3a158b6343b06701728be92ab/assets/slides_pydataseattle2023.pdf
--------------------------------------------------------------------------------
/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | version: "3.0"
2 | services:
3 | elasticsearch:
4 | container_name: es-container
5 | image: docker.elastic.co/elasticsearch/elasticsearch:8.7.0
6 | environment:
7 | - xpack.security.enabled=false
8 | - "discovery.type=single-node"
9 | ports:
10 | - 9200:9200
11 | volumes:
12 | - esdata:/usr/share/elasticsearch/data
13 |
14 |
15 | # milvus:
16 | # container_name: milvus
17 | # image: milvusdb/milvus:1.1.1-cpu-d061621-330cc6
18 | # ports:
19 | # - 19530:19530
20 | # - 19121:19121
21 | # volumes:
22 | # - milvusdata:/var/lib/milvus
23 |
24 |
25 | # milvus:
26 | # container_name: milvus
27 | # build:
28 | # context: docker_milvus
29 | # ports:
30 | # - 19530:19530
31 | # - 19121:19121
32 | # volumes:
33 | # - milvusdata:/var/lib/milvus
34 |
35 |
36 | # weaviate:
37 | # image: semitechnologies/weaviate:1.14.0
38 | # ports:
39 | # - 8081:8080
40 | # environment:
41 | # QUERY_DEFAULTS_LIMIT: 25
42 | # AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'
43 | # PERSISTENCE_DATA_PATH: '/var/lib/weaviate'
44 | # DEFAULT_VECTORIZER_MODULE: 'none'
45 | # ENABLE_MODULES: ''
46 | # CLUSTER_HOSTNAME: 'node1'
47 | # volumes:
48 | # - weaviatedata:/var/lib/weaviate
49 | volumes:
50 | esdata:
51 | # weaviatedata:
52 | # milvusdata:
--------------------------------------------------------------------------------
/docs/internal_notes.md:
--------------------------------------------------------------------------------
1 | # Internal Notes
2 |
3 | ## Setup Dep
4 |
5 | Install other deps
6 |
7 | ```bash
8 | sudo apt update && sudo apt install -y p7zip-full
9 | ```
10 |
11 | Create conda environment
12 |
13 | ```bash
14 | conda create -n workshop python=3.7 mamba
15 | conda activate workshop
16 | mamba env update -n workshop -f environment.yaml
17 | # mamba install anaconda jupyter ipykernel nb_conda_kernels
18 |
19 | mamba install ipython ipykernel nb_conda_kernels
20 |
21 | ipython kernel install --user --name=workshop
22 |
23 |
24 | conda create --name workshop --clone base
25 |
26 | ```
27 |
28 | Start ES/ Faiss for local dev
29 |
30 | ```bash
31 | docker-compose up
32 | ```
33 |
34 | ```bash
35 | docker run --user root -e GRANT_SUDO=yes -it app bash
36 | ```
37 |
38 |
39 |
40 | ```
41 | Go to DIR: /projects/search-engine-workshop
42 | Type: docker-compose up
43 |
44 | In the notebooks test... checks the milvus and elastic connections
45 |
46 |
47 | ```
48 |
49 | ```
50 | gsutil -m cp -r gs://np-training-tmp/stackoverflow/final* gs://np-public-training-temp/stackoverflow/
51 | ```
52 |
53 |
54 |
55 |
56 | ```
57 | zip -r data_processed.zip data/processed/
58 |
59 | gh release delete v1.0
60 |
61 | gh release create v1.0 'data_processed.zip#Hugging Face Dataset of Unsplashed collection' \
62 | --title "v1.0" --notes "initial release"
63 |
64 |
65 | ```
66 |
67 |
68 |
69 | ```
70 | zip -r /tmp/data.zip data/
71 | gsutil cp /tmp/data.zip gs://np-public-training-tmp/search-workshop/data.zip
72 |
73 |
74 | ```
--------------------------------------------------------------------------------
/docs/slide_notes.md:
--------------------------------------------------------------------------------
1 | PUT /items
2 |
3 | ```json
4 | {
5 | "mappings": {
6 | "properties": {
7 | "title": { "type": "text" },
8 | "description": { "type": "text" },
9 |
10 | "brand": { "type": "keyword" },
11 | "product_type": { "type": "keyword" },
12 |
13 | "price": { "type": "double" }
14 | }
15 | }
16 | }
17 | ```
18 |
19 | Nike shoe under 100$
20 |
21 | GET /items/_search
22 |
23 | ```json
24 | {
25 | "query": {
26 |
27 | "multi_match": {
28 | "query": "Nike shoe under 100$",
29 | "fields": ["title^2", "description^1"]
30 | }
31 |
32 | ,"bool": {
33 | "filter": [
34 | { "term": { "brand": "nike" }}
35 | ]
36 | }
37 | ,"filtered": {
38 | "filter": {
39 | "range": {
40 | "price" : { "lte": 100 }
41 | }
42 | }
43 | }
44 | }
45 |
46 | ```
47 |
48 |
49 |
50 | ## PR curve
51 | ```
52 | Recall Perfect Classifier Baseline Classifier Good Classifier High Precision
53 | 0.1 0.95 0.5 0.9 0.91
54 | 0.2 0.95 0.5 0.85 0.91
55 | 0.3 0.95 0.5 0.85 0.91
56 | 0.4 0.95 0.5 0.8 0.9
57 | 0.5 0.95 0.5 0.8 0.4
58 | 0.6 0.95 0.5 0.8 0.4
59 | 0.7 0.95 0.5 0.8 0.4
60 | 0.8 0.95 0.5 0.8 0.2
61 | 0.9 0.95 0.5 0.7 0.2
62 | 1 0.9 0.5 0.2 0.1
63 | ```
64 |
65 |
66 |
67 |
68 | dcg
69 |
70 | ```
71 | Discounted\space Cumulative\space Gain
72 | = \sum_{i=1}^{p}\frac{ relevance (i)}{log_{2}(i+1)}
73 |
74 |
75 | \\
76 | DCG = {\color{Green}\frac{3}{log_{2}(2)} } + \frac{1}{log_{2}(3)} + {\color{Red}\frac{0}{log_{2}(4)} }+\frac{2}{log_{2}(5)} = 4.49
77 |
78 | \\
79 |
80 | (Ideal)\space DCG = {\color{Green}\frac{3}{log_{2}(2)} } + \frac{2}{log_{2}(3)} + \frac{1}{log_{2}(4)} + {\color{Red}\frac{0}{log_{2}(5)} } = 4.76
81 |
82 | \\
83 | Normalized\space Discounted\space Cumulative\space Gain
84 | = \frac{ DCG}{Ideal\space DCG} = \frac{4.49}{4.76} \approx 0.94
85 | ```
--------------------------------------------------------------------------------
/environment.yaml:
--------------------------------------------------------------------------------
1 | #name: workshop
2 | channels:
3 | - conda-forge
4 | dependencies:
5 | - python==3.7.*
6 | - pip
7 | - mamba
8 | - nb_conda_kernels
9 | - pyarrow==9.0.*
10 | - lxml==4.9.*
11 | - pip:
12 | - -r requirements.txt
13 |
14 |
15 |
16 |
--------------------------------------------------------------------------------
/notebooks/04_ann.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "d6298118-b5f8-4250-bd82-e2a3787914ca",
6 | "metadata": {},
7 | "source": [
8 | "# Benchmarking Approximate Nearest Neighbors"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "id": "c2af1fec-6519-40d0-8826-f201d0acba0b",
14 | "metadata": {},
15 | "source": [
16 | "# About"
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "id": "6a45ed78-348b-4193-8870-cbcbaff240e1",
22 | "metadata": {},
23 | "source": [
24 | "In order for embedding retrieval to work at scale, we need to use a vector database.\n",
25 | "We also need to use Approximate Nearest Neighbor search instead of brute force.\n",
26 | "\n",
27 | "\n",
28 | "In this notebook, we will use [FAISS](https://github.com/facebookresearch/faiss), a library from Facebook.\n",
29 | "\n",
30 | "We will compare a brute-force index with the speedup gained from an `IVF` index.\n",
31 | "\n",
32 | "For a more detailed comparison, take a look here to find other solutions and benchmark data.\n",
33 | "\n",
34 | "\n",
35 | "We will look at `performance` and `recall@1`"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "id": "e45e627d-9c8c-4f19-ab91-0e64ed8677d7",
41 | "metadata": {},
42 | "source": [
43 | "# Setup"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 1,
49 | "id": "62298ef5-fa20-4164-8ec7-e7f43bf85c20",
50 | "metadata": {
51 | "execution": {
52 | "iopub.execute_input": "2023-04-26T14:52:36.834089Z",
53 | "iopub.status.busy": "2023-04-26T14:52:36.833681Z",
54 | "iopub.status.idle": "2023-04-26T14:52:37.774946Z",
55 | "shell.execute_reply": "2023-04-26T14:52:37.774030Z",
56 | "shell.execute_reply.started": "2023-04-26T14:52:36.834036Z"
57 | }
58 | },
59 | "outputs": [],
60 | "source": [
61 | "from pathlib import Path\n",
62 | "import numpy as np\n",
63 | "import pandas as pd\n",
64 | "import faiss\n",
65 | "import datasets"
66 | ]
67 | },
68 | {
69 | "cell_type": "markdown",
70 | "id": "6b9a73d2-aa07-46ef-b94d-8780ca9ecb68",
71 | "metadata": {},
72 | "source": [
73 | "## Load the embeddings of the image corpus"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": 2,
79 | "id": "74c3f8aa-cb27-4ae8-a9a8-0060001d357c",
80 | "metadata": {
81 | "execution": {
82 | "iopub.execute_input": "2023-04-26T14:52:37.778006Z",
83 | "iopub.status.busy": "2023-04-26T14:52:37.776915Z",
84 | "iopub.status.idle": "2023-04-26T14:52:44.572677Z",
85 | "shell.execute_reply": "2023-04-26T14:52:44.571814Z",
86 | "shell.execute_reply.started": "2023-04-26T14:52:37.777973Z"
87 | }
88 | },
89 | "outputs": [],
90 | "source": [
91 | "dset = datasets.load_from_disk(\"../data/processed_embeddings\")\n",
92 | "## these embeddings will be used to create the search space.\n",
93 | "corpus = dset['embeddings']\n",
94 | "\n",
95 | "\n",
96 | "corpus = np.array(corpus).astype('float32')\n",
97 | "corpus = np.unique(corpus, axis=0)"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": 3,
103 | "id": "94eeb2f5-88ff-428f-8e9c-5014234427b8",
104 | "metadata": {
105 | "execution": {
106 | "iopub.execute_input": "2023-04-26T14:52:44.574012Z",
107 | "iopub.status.busy": "2023-04-26T14:52:44.573742Z",
108 | "iopub.status.idle": "2023-04-26T14:52:44.580175Z",
109 | "shell.execute_reply": "2023-04-26T14:52:44.579367Z",
110 | "shell.execute_reply.started": "2023-04-26T14:52:44.573987Z"
111 | }
112 | },
113 | "outputs": [
114 | {
115 | "data": {
116 | "text/plain": [
117 | "(24954, 512)"
118 | ]
119 | },
120 | "execution_count": 3,
121 | "metadata": {},
122 | "output_type": "execute_result"
123 | }
124 | ],
125 | "source": [
126 | "corpus.shape"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": 4,
132 | "id": "939108fa-daf9-472c-b026-19c6d9708a77",
133 | "metadata": {
134 | "execution": {
135 | "iopub.execute_input": "2023-04-26T14:52:44.581364Z",
136 | "iopub.status.busy": "2023-04-26T14:52:44.581116Z",
137 | "iopub.status.idle": "2023-04-26T14:52:44.590669Z",
138 | "shell.execute_reply": "2023-04-26T14:52:44.589884Z",
139 | "shell.execute_reply.started": "2023-04-26T14:52:44.581340Z"
140 | }
141 | },
142 | "outputs": [
143 | {
144 | "data": {
145 | "text/plain": [
146 | "array([[-0.08344752, 0.01604629, 0.03037108, ..., 0.03962855,\n",
147 | " -0.02023211, -0.01102281],\n",
148 | " [-0.07890625, 0.02533851, 0.00522987, ..., 0.02622218,\n",
149 | " -0.05418065, -0.00765004],\n",
150 | " [-0.0781679 , 0.03937826, -0.01087696, ..., 0.04282334,\n",
151 | " -0.02091636, -0.01027698],\n",
152 | " ...,\n",
153 | " [ 0.0878398 , 0.01232621, 0.00077178, ..., -0.00705758,\n",
154 | " 0.01574707, -0.01541145],\n",
155 | " [ 0.0882502 , 0.03615745, -0.00961868, ..., 0.01392467,\n",
156 | " 0.00077467, -0.02139922],\n",
157 | " [ 0.09195283, 0.04004925, -0.00255262, ..., 0.0036222 ,\n",
158 | " -0.0181689 , -0.04212729]], dtype=float32)"
159 | ]
160 | },
161 | "execution_count": 4,
162 | "metadata": {},
163 | "output_type": "execute_result"
164 | }
165 | ],
166 | "source": [
167 | "corpus"
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "execution_count": null,
173 | "id": "63581c47-4d2a-4106-883f-3f42c9070e99",
174 | "metadata": {},
175 | "outputs": [],
176 | "source": []
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": 5,
181 | "id": "69bf8642-772f-4996-b3c7-910a1f38b1b2",
182 | "metadata": {
183 | "execution": {
184 | "iopub.execute_input": "2023-04-26T14:52:44.592214Z",
185 | "iopub.status.busy": "2023-04-26T14:52:44.591565Z",
186 | "iopub.status.idle": "2023-04-26T14:52:44.600591Z",
187 | "shell.execute_reply": "2023-04-26T14:52:44.599816Z",
188 | "shell.execute_reply.started": "2023-04-26T14:52:44.592163Z"
189 | }
190 | },
191 | "outputs": [
192 | {
193 | "data": {
194 | "text/plain": [
195 | "512"
196 | ]
197 | },
198 | "execution_count": 5,
199 | "metadata": {},
200 | "output_type": "execute_result"
201 | }
202 | ],
203 | "source": [
204 | "dimension = corpus.shape[-1]\n",
205 | "dimension"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": null,
211 | "id": "b5fbf481-46ac-45d3-b277-e9e19185c214",
212 | "metadata": {},
213 | "outputs": [],
214 | "source": []
215 | },
216 | {
217 | "cell_type": "markdown",
218 | "id": "06a59231-abcf-45f9-a69a-69149e73c2f8",
219 | "metadata": {
220 | "tags": []
221 | },
222 | "source": [
223 | "# Flat Index / Brute Force\n"
224 | ]
225 | },
226 | {
227 | "cell_type": "markdown",
228 | "id": "f2b9e1b9-c75b-4700-a3c9-cb2b8ecab451",
229 | "metadata": {},
230 | "source": [
231 | "FAISS supports a brute-force index. \n",
232 | "This index is good if you want perfect recall. \n",
233 | "It requires all the data to fit in memory. "
234 | ]
235 | },
236 | {
237 | "cell_type": "markdown",
238 | "id": "926f4977-2ee0-48bd-8f50-f13e6ed82897",
239 | "metadata": {},
240 | "source": [
241 | "## Create the index"
242 | ]
243 | },
244 | {
245 | "cell_type": "code",
246 | "execution_count": 6,
247 | "id": "a447b05c-f30d-4d43-a6ed-fd29a99477a2",
248 | "metadata": {
249 | "execution": {
250 | "iopub.execute_input": "2023-04-26T14:52:44.601898Z",
251 | "iopub.status.busy": "2023-04-26T14:52:44.601570Z",
252 | "iopub.status.idle": "2023-04-26T14:52:44.609907Z",
253 | "shell.execute_reply": "2023-04-26T14:52:44.609154Z",
254 | "shell.execute_reply.started": "2023-04-26T14:52:44.601873Z"
255 | }
256 | },
257 | "outputs": [],
258 | "source": [
259 | "x_corpus = corpus\n",
260 | "x_corpus.shape\n",
261 | "dimension = x_corpus.shape[-1]"
262 | ]
263 | },
264 | {
265 | "cell_type": "markdown",
266 | "id": "cb28c78c-3156-4da9-b5c1-3b192cb4c70f",
267 | "metadata": {},
268 | "source": [
269 | "initialize the flat index for data dimension. \n",
270 | "In current example it is 512\n"
271 | ]
272 | },
273 | {
274 | "cell_type": "code",
275 | "execution_count": 7,
276 | "id": "5bf3d65c-c435-407a-a932-8cdd9655ff5a",
277 | "metadata": {
278 | "execution": {
279 | "iopub.execute_input": "2023-04-26T14:52:44.612718Z",
280 | "iopub.status.busy": "2023-04-26T14:52:44.612384Z",
281 | "iopub.status.idle": "2023-04-26T14:52:44.664297Z",
282 | "shell.execute_reply": "2023-04-26T14:52:44.663442Z",
283 | "shell.execute_reply.started": "2023-04-26T14:52:44.612692Z"
284 | }
285 | },
286 | "outputs": [],
287 | "source": [
288 | "index = faiss.IndexFlatL2(dimension)"
289 | ]
290 | },
291 | {
292 | "cell_type": "markdown",
293 | "id": "c121ab02-4fe3-4012-af73-038ae78f872e",
294 | "metadata": {},
295 | "source": [
296 | "since it is a brute force index, there is no \"training\" or parameters to learn"
297 | ]
298 | },
299 | {
300 | "cell_type": "code",
301 | "execution_count": 8,
302 | "id": "3c2d2c60-0afb-44cf-9cf6-5032811725a7",
303 | "metadata": {
304 | "execution": {
305 | "iopub.execute_input": "2023-04-26T14:52:44.666136Z",
306 | "iopub.status.busy": "2023-04-26T14:52:44.665598Z",
307 | "iopub.status.idle": "2023-04-26T14:52:44.670880Z",
308 | "shell.execute_reply": "2023-04-26T14:52:44.670173Z",
309 | "shell.execute_reply.started": "2023-04-26T14:52:44.666092Z"
310 | }
311 | },
312 | "outputs": [
313 | {
314 | "data": {
315 | "text/plain": [
316 | "True"
317 | ]
318 | },
319 | "execution_count": 8,
320 | "metadata": {},
321 | "output_type": "execute_result"
322 | }
323 | ],
324 | "source": [
325 | "index.is_trained\n"
326 | ]
327 | },
328 | {
329 | "cell_type": "markdown",
330 | "id": "a547440c-fd96-4ce5-9996-0210f00617a7",
331 | "metadata": {},
332 | "source": [
333 | "add data to the index. This is a CPU based index."
334 | ]
335 | },
336 | {
337 | "cell_type": "code",
338 | "execution_count": 9,
339 | "id": "7aa739ee-42e1-4ac5-b1d8-9876ec777129",
340 | "metadata": {
341 | "execution": {
342 | "iopub.execute_input": "2023-04-26T14:52:44.672070Z",
343 | "iopub.status.busy": "2023-04-26T14:52:44.671820Z",
344 | "iopub.status.idle": "2023-04-26T14:52:44.747616Z",
345 | "shell.execute_reply": "2023-04-26T14:52:44.746751Z",
346 | "shell.execute_reply.started": "2023-04-26T14:52:44.672047Z"
347 | },
348 | "tags": []
349 | },
350 | "outputs": [],
351 | "source": [
352 | "index.add(x_corpus) "
353 | ]
354 | },
355 | {
356 | "cell_type": "code",
357 | "execution_count": 10,
358 | "id": "ab9e1b45-1cf2-4e88-960c-6999ad312e22",
359 | "metadata": {
360 | "execution": {
361 | "iopub.execute_input": "2023-04-26T14:52:44.749140Z",
362 | "iopub.status.busy": "2023-04-26T14:52:44.748763Z",
363 | "iopub.status.idle": "2023-04-26T14:52:44.754419Z",
364 | "shell.execute_reply": "2023-04-26T14:52:44.753707Z",
365 | "shell.execute_reply.started": "2023-04-26T14:52:44.749112Z"
366 | }
367 | },
368 | "outputs": [
369 | {
370 | "data": {
371 | "text/plain": [
372 | "24954"
373 | ]
374 | },
375 | "execution_count": 10,
376 | "metadata": {},
377 | "output_type": "execute_result"
378 | }
379 | ],
380 | "source": [
381 | "len(x_corpus)"
382 | ]
383 | },
384 | {
385 | "cell_type": "code",
386 | "execution_count": null,
387 | "id": "bfabea5d-26bd-45d8-8b7a-97f179bc4013",
388 | "metadata": {},
389 | "outputs": [],
390 | "source": []
391 | },
392 | {
393 | "cell_type": "markdown",
394 | "id": "348f697d-db43-4093-aa4e-4858d4c058f0",
395 | "metadata": {},
396 | "source": [
397 | "number of vectors / results to retrieve"
398 | ]
399 | },
400 | {
401 | "cell_type": "code",
402 | "execution_count": 11,
403 | "id": "6200be29-b926-4847-9e42-daf90d99319d",
404 | "metadata": {
405 | "execution": {
406 | "iopub.execute_input": "2023-04-26T14:52:44.755735Z",
407 | "iopub.status.busy": "2023-04-26T14:52:44.755402Z",
408 | "iopub.status.idle": "2023-04-26T14:52:44.764114Z",
409 | "shell.execute_reply": "2023-04-26T14:52:44.763389Z",
410 | "shell.execute_reply.started": "2023-04-26T14:52:44.755710Z"
411 | }
412 | },
413 | "outputs": [],
414 | "source": [
415 | "k =1"
416 | ]
417 | },
418 | {
419 | "cell_type": "markdown",
420 | "id": "d57e8258-b7fa-49cb-a9f7-52a62a0dda17",
421 | "metadata": {},
422 | "source": [
423 | "#### Index Search\n",
424 | "The `search` method returns, for each query vector, the indices (I) of its nearest corpus vectors and their squared Euclidean (L2) distances (D) from that query vector."
425 | ]
426 | },
427 | {
428 | "cell_type": "markdown",
429 | "id": "87ea66c3-ebda-4600-b5d8-34ad796cc7c2",
430 | "metadata": {},
431 | "source": [
432 | "search for single vector and get top 1 result"
433 | ]
434 | },
435 | {
436 | "cell_type": "code",
437 | "execution_count": 12,
438 | "id": "306e3a89-331c-4253-b7cf-ce7a0d951b42",
439 | "metadata": {
440 | "execution": {
441 | "iopub.execute_input": "2023-04-26T14:52:44.765288Z",
442 | "iopub.status.busy": "2023-04-26T14:52:44.765050Z",
443 | "iopub.status.idle": "2023-04-26T14:52:48.341486Z",
444 | "shell.execute_reply": "2023-04-26T14:52:48.340493Z",
445 | "shell.execute_reply.started": "2023-04-26T14:52:44.765265Z"
446 | }
447 | },
448 | "outputs": [
449 | {
450 | "name": "stdout",
451 | "output_type": "stream",
452 | "text": [
453 | "4.38 ms ± 39.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
454 | ]
455 | }
456 | ],
457 | "source": [
458 | "%%timeit\n",
459 | "D, I = index.search(x_corpus[:1], k=1) "
460 | ]
461 | },
462 | {
463 | "cell_type": "markdown",
464 | "id": "a2382cf6-2718-4240-b805-c96195af51f0",
465 | "metadata": {},
466 | "source": [
467 | "search for all vectors in corpus and get top 1 result"
468 | ]
469 | },
470 | {
471 | "cell_type": "code",
472 | "execution_count": 13,
473 | "id": "badcb20e-f872-4763-baf5-1876d5dd617d",
474 | "metadata": {
475 | "execution": {
476 | "iopub.execute_input": "2023-04-26T14:52:48.342963Z",
477 | "iopub.status.busy": "2023-04-26T14:52:48.342590Z",
478 | "iopub.status.idle": "2023-04-26T14:52:58.653601Z",
479 | "shell.execute_reply": "2023-04-26T14:52:58.652643Z",
480 | "shell.execute_reply.started": "2023-04-26T14:52:48.342935Z"
481 | },
482 | "tags": []
483 | },
484 | "outputs": [
485 | {
486 | "name": "stdout",
487 | "output_type": "stream",
488 | "text": [
489 | "CPU times: user 30.3 s, sys: 8.92 ms, total: 30.3 s\n",
490 | "Wall time: 10.3 s\n"
491 | ]
492 | }
493 | ],
494 | "source": [
495 | "%%time\n",
496 | "D, I = index.search(x_corpus, k=1) "
497 | ]
498 | },
499 | {
500 | "cell_type": "code",
501 | "execution_count": null,
502 | "id": "acf3d26f-119c-4f5c-8c24-fd28e3fccdfc",
503 | "metadata": {},
504 | "outputs": [],
505 | "source": []
506 | },
507 | {
508 | "cell_type": "markdown",
509 | "id": "1eac1d91-007d-4143-b0a3-ae19c763cc6c",
510 | "metadata": {},
511 | "source": [
512 | "distance of vector in corpus to query vector"
513 | ]
514 | },
515 | {
516 | "cell_type": "code",
517 | "execution_count": 14,
518 | "id": "0b555673-cb87-4b11-aa81-ed2ff69d513e",
519 | "metadata": {
520 | "execution": {
521 | "iopub.execute_input": "2023-04-26T14:52:58.660044Z",
522 | "iopub.status.busy": "2023-04-26T14:52:58.657749Z",
523 | "iopub.status.idle": "2023-04-26T14:52:58.667064Z",
524 | "shell.execute_reply": "2023-04-26T14:52:58.666198Z",
525 | "shell.execute_reply.started": "2023-04-26T14:52:58.660006Z"
526 | }
527 | },
528 | "outputs": [
529 | {
530 | "data": {
531 | "text/plain": [
532 | "array([[0.0000000e+00],\n",
533 | " [0.0000000e+00],\n",
534 | " [3.5762787e-07],\n",
535 | " ...,\n",
536 | " [0.0000000e+00],\n",
537 | " [1.3113022e-06],\n",
538 | " [7.1525574e-07]], dtype=float32)"
539 | ]
540 | },
541 | "execution_count": 14,
542 | "metadata": {},
543 | "output_type": "execute_result"
544 | }
545 | ],
546 | "source": [
547 | "D"
548 | ]
549 | },
550 | {
551 | "cell_type": "markdown",
552 | "id": "c4d271c3-850a-4375-87a2-100ffab7a416",
553 | "metadata": {},
554 | "source": [
555 | "id of the top 1 (nearest) corpus vector for each query \n",
556 | "\n"
557 | ]
558 | },
559 | {
560 | "cell_type": "code",
561 | "execution_count": 15,
562 | "id": "65310b96-aab0-4a34-a9eb-48d0dbefc0ec",
563 | "metadata": {
564 | "execution": {
565 | "iopub.execute_input": "2023-04-26T14:52:58.668403Z",
566 | "iopub.status.busy": "2023-04-26T14:52:58.668046Z",
567 | "iopub.status.idle": "2023-04-26T14:52:58.690783Z",
568 | "shell.execute_reply": "2023-04-26T14:52:58.689783Z",
569 | "shell.execute_reply.started": "2023-04-26T14:52:58.668375Z"
570 | }
571 | },
572 | "outputs": [
573 | {
574 | "data": {
575 | "text/plain": [
576 | "array([[ 0],\n",
577 | " [ 1],\n",
578 | " [ 2],\n",
579 | " ...,\n",
580 | " [24951],\n",
581 | " [24952],\n",
582 | " [24953]])"
583 | ]
584 | },
585 | "execution_count": 15,
586 | "metadata": {},
587 | "output_type": "execute_result"
588 | }
589 | ],
590 | "source": [
591 | "I"
592 | ]
593 | },
594 | {
595 | "cell_type": "markdown",
596 | "id": "90283075-8783-4607-bf28-e5ce9f55c08c",
597 | "metadata": {},
598 | "source": [
599 | "because every vector in the corpus is used as a query and the ids are sequential, the ideal top 1 result for query i is id i, so the returned ids should be sequential too"
600 | ]
601 | },
602 | {
603 | "cell_type": "code",
604 | "execution_count": 16,
605 | "id": "b3fb40a2-dd43-4676-a766-3d198943f957",
606 | "metadata": {
607 | "execution": {
608 | "iopub.execute_input": "2023-04-26T14:52:58.692358Z",
609 | "iopub.status.busy": "2023-04-26T14:52:58.691962Z",
610 | "iopub.status.idle": "2023-04-26T14:52:58.703731Z",
611 | "shell.execute_reply": "2023-04-26T14:52:58.702726Z",
612 | "shell.execute_reply.started": "2023-04-26T14:52:58.692330Z"
613 | }
614 | },
615 | "outputs": [
616 | {
617 | "data": {
618 | "text/plain": [
619 | "array([ True, True, True, ..., True, True, True])"
620 | ]
621 | },
622 | "execution_count": 16,
623 | "metadata": {},
624 | "output_type": "execute_result"
625 | }
626 | ],
627 | "source": [
628 | "res = I[:,0] == np.array( list(range(len(x_corpus))))\n",
629 | "res"
630 | ]
631 | },
632 | {
633 | "cell_type": "code",
634 | "execution_count": 17,
635 | "id": "27641884-4563-4c1a-9d37-1b8e61ee5322",
636 | "metadata": {
637 | "execution": {
638 | "iopub.execute_input": "2023-04-26T14:52:58.705103Z",
639 | "iopub.status.busy": "2023-04-26T14:52:58.704836Z",
640 | "iopub.status.idle": "2023-04-26T14:52:58.713367Z",
641 | "shell.execute_reply": "2023-04-26T14:52:58.712403Z",
642 | "shell.execute_reply.started": "2023-04-26T14:52:58.705078Z"
643 | }
644 | },
645 | "outputs": [
646 | {
647 | "data": {
648 | "text/plain": [
649 | "(array([], dtype=int64),)"
650 | ]
651 | },
652 | "execution_count": 17,
653 | "metadata": {},
654 | "output_type": "execute_result"
655 | }
656 | ],
657 | "source": [
658 | "np.where(res == False)"
659 | ]
660 | },
661 | {
662 | "cell_type": "code",
663 | "execution_count": 18,
664 | "id": "ec2181ae-7a70-4a3a-bc34-bd9899168fe5",
665 | "metadata": {
666 | "execution": {
667 | "iopub.execute_input": "2023-04-26T14:52:58.714883Z",
668 | "iopub.status.busy": "2023-04-26T14:52:58.714532Z",
669 | "iopub.status.idle": "2023-04-26T14:52:58.726900Z",
670 | "shell.execute_reply": "2023-04-26T14:52:58.725900Z",
671 | "shell.execute_reply.started": "2023-04-26T14:52:58.714856Z"
672 | }
673 | },
674 | "outputs": [
675 | {
676 | "data": {
677 | "text/plain": [
678 | "{'recall@1': 24954, 'num_vectors': 24954, 'mismatch': 0}"
679 | ]
680 | },
681 | "execution_count": 18,
682 | "metadata": {},
683 | "output_type": "execute_result"
684 | }
685 | ],
686 | "source": [
687 | "{\n",
688 | " \"recall@1\": res.sum()\n",
689 | " , \"num_vectors\": len(res)\n",
690 | " , \"mismatch\": len(res) - res.sum()\n",
691 | "}\n"
692 | ]
693 | },
694 | {
695 | "cell_type": "markdown",
696 | "id": "f52a636e-041b-43e6-9b94-5c31447f31cc",
697 | "metadata": {
698 | "execution": {
699 | "iopub.execute_input": "2023-04-26T00:02:55.003678Z",
700 | "iopub.status.busy": "2023-04-26T00:02:55.002818Z",
701 | "iopub.status.idle": "2023-04-26T00:02:55.010824Z",
702 | "shell.execute_reply": "2023-04-26T00:02:55.010099Z",
703 | "shell.execute_reply.started": "2023-04-26T00:02:55.003640Z"
704 | }
705 | },
706 | "source": [
707 | "For this corpus, we are able to find every query vector at position 1 (i.e. each vector is returned as its own top 1 result)"
708 | ]
709 | },
710 | {
711 | "cell_type": "code",
712 | "execution_count": null,
713 | "id": "e88e34b6-f9e0-4835-90cc-ba0e6b2c0414",
714 | "metadata": {},
715 | "outputs": [],
716 | "source": []
717 | },
718 | {
719 | "cell_type": "code",
720 | "execution_count": null,
721 | "id": "02bac14d-9696-4a49-be12-9541beeb45a2",
722 | "metadata": {},
723 | "outputs": [],
724 | "source": []
725 | },
726 | {
727 | "cell_type": "markdown",
728 | "id": "7e79d4fc-8191-4151-b330-01b41a2b05d4",
729 | "metadata": {},
730 | "source": [
731 | "# FAISS IVF"
732 | ]
733 | },
734 | {
735 | "cell_type": "markdown",
736 | "id": "2c6f3e4c-2cad-461c-a8b7-5ffba0a5b354",
737 | "metadata": {},
738 | "source": [
739 | "\n",
740 | "\n",
741 | " Image from Pinecone Faiss Tutorial\n",
742 | "https://www.pinecone.io/learn/faiss-tutorial/\n",
743 | "\n",
744 | "\n",
745 | "**Parameters**:\n",
746 | "- nlist : number of clusters\n",
747 | "- nprobe: number of clusters to search"
748 | ]
749 | },
750 | {
751 | "cell_type": "code",
752 | "execution_count": 19,
753 | "id": "342966dc-d361-4fec-8ebe-c3c67864736e",
754 | "metadata": {
755 | "execution": {
756 | "iopub.execute_input": "2023-04-26T14:52:58.728467Z",
757 | "iopub.status.busy": "2023-04-26T14:52:58.728210Z",
758 | "iopub.status.idle": "2023-04-26T14:52:58.785614Z",
759 | "shell.execute_reply": "2023-04-26T14:52:58.784521Z",
760 | "shell.execute_reply.started": "2023-04-26T14:52:58.728443Z"
761 | },
762 | "tags": []
763 | },
764 | "outputs": [],
765 | "source": [
766 | "nlist = 20 # number of clusters\n",
767 | "quantizer = faiss.IndexFlatL2(dimension) # coarse quantizer: flat L2 index used to assign vectors to clusters\n",
768 | "index = faiss.IndexIVFFlat(quantizer, dimension, nlist, faiss.METRIC_L2)"
769 | ]
770 | },
771 | {
772 | "cell_type": "code",
773 | "execution_count": 20,
774 | "id": "fec38a91-8c3c-40fe-935d-a160474a7e4f",
775 | "metadata": {
776 | "execution": {
777 | "iopub.execute_input": "2023-04-26T14:52:58.787144Z",
778 | "iopub.status.busy": "2023-04-26T14:52:58.786846Z",
779 | "iopub.status.idle": "2023-04-26T14:52:58.883952Z",
780 | "shell.execute_reply": "2023-04-26T14:52:58.882788Z",
781 | "shell.execute_reply.started": "2023-04-26T14:52:58.787117Z"
782 | },
783 | "tags": []
784 | },
785 | "outputs": [],
786 | "source": [
787 | "assert not index.is_trained\n",
788 | "index.train(x_corpus)\n",
789 | "assert index.is_trained"
790 | ]
791 | },
792 | {
793 | "cell_type": "code",
794 | "execution_count": 21,
795 | "id": "6b577199-2e7b-439f-93a4-2653c7545eef",
796 | "metadata": {
797 | "execution": {
798 | "iopub.execute_input": "2023-04-26T14:52:58.885895Z",
799 | "iopub.status.busy": "2023-04-26T14:52:58.885561Z",
800 | "iopub.status.idle": "2023-04-26T14:52:58.946474Z",
801 | "shell.execute_reply": "2023-04-26T14:52:58.945447Z",
802 | "shell.execute_reply.started": "2023-04-26T14:52:58.885865Z"
803 | }
804 | },
805 | "outputs": [],
806 | "source": [
807 | "index.add(x_corpus) "
808 | ]
809 | },
810 | {
811 | "cell_type": "markdown",
812 | "id": "55f7c7b3-7298-4e53-b1e0-2c3a0f568579",
813 | "metadata": {},
814 | "source": [
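815 | "an IVF index must be trained before vectors can be added: training learns the `nlist` cluster centroids from a set of vectors (a sample is enough; here we train on the full corpus)"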
816 | ]
817 | },
818 | {
819 | "cell_type": "code",
820 | "execution_count": null,
821 | "id": "7a50b8c1-ed8b-45ac-9f19-4325f42b265a",
822 | "metadata": {},
823 | "outputs": [],
824 | "source": []
825 | },
826 | {
827 | "cell_type": "markdown",
828 | "id": "f8d0824c-0712-46fb-a5d4-16b32ee695c0",
829 | "metadata": {},
830 | "source": [
831 | "search for single vector"
832 | ]
833 | },
834 | {
835 | "cell_type": "code",
836 | "execution_count": 22,
837 | "id": "451988b2-0e03-4e12-92e2-8df408094526",
838 | "metadata": {
839 | "execution": {
840 | "iopub.execute_input": "2023-04-26T14:52:58.956643Z",
841 | "iopub.status.busy": "2023-04-26T14:52:58.953741Z",
842 | "iopub.status.idle": "2023-04-26T14:53:08.215103Z",
843 | "shell.execute_reply": "2023-04-26T14:53:08.214024Z",
844 | "shell.execute_reply.started": "2023-04-26T14:52:58.956606Z"
845 | },
846 | "tags": []
847 | },
848 | "outputs": [
849 | {
850 | "name": "stdout",
851 | "output_type": "stream",
852 | "text": [
853 | "114 µs ± 729 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
854 | ]
855 | }
856 | ],
857 | "source": [
858 | "%%timeit\n",
859 | "\n",
860 | "index.nprobe = 1 # default nprobe is 1\n",
861 | "\n",
862 | "D, I = index.search(x_corpus[:1], k) # actual search"
863 | ]
864 | },
865 | {
866 | "cell_type": "markdown",
867 | "id": "28e0a32e-5380-48da-9022-0be90db13e75",
868 | "metadata": {
869 | "execution": {
870 | "iopub.execute_input": "2023-04-26T00:07:36.782792Z",
871 | "iopub.status.busy": "2023-04-26T00:07:36.782538Z",
872 | "iopub.status.idle": "2023-04-26T00:07:36.788109Z",
873 | "shell.execute_reply": "2023-04-26T00:07:36.786807Z",
874 | "shell.execute_reply.started": "2023-04-26T00:07:36.782768Z"
875 | }
876 | },
877 | "source": [
878 | "in the above, with `nprobe = 1` we search only 1 of the 20 clusters, i.e. roughly 1/20 of the search space"
879 | ]
880 | },
881 | {
882 | "cell_type": "code",
883 | "execution_count": null,
884 | "id": "f7547048-000e-4f32-b1c9-fccfd83b3918",
885 | "metadata": {},
886 | "outputs": [],
887 | "source": []
888 | },
889 | {
890 | "cell_type": "code",
891 | "execution_count": 23,
892 | "id": "27ef12d6-d585-4be3-9f5f-8b35ee192c38",
893 | "metadata": {
894 | "execution": {
895 | "iopub.execute_input": "2023-04-26T14:53:08.216344Z",
896 | "iopub.status.busy": "2023-04-26T14:53:08.216081Z",
897 | "iopub.status.idle": "2023-04-26T14:53:20.757164Z",
898 | "shell.execute_reply": "2023-04-26T14:53:20.755944Z",
899 | "shell.execute_reply.started": "2023-04-26T14:53:08.216319Z"
900 | },
901 | "tags": []
902 | },
903 | "outputs": [
904 | {
905 | "name": "stdout",
906 | "output_type": "stream",
907 | "text": [
908 | "1.55 ms ± 13.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n"
909 | ]
910 | }
911 | ],
912 | "source": [
913 | "%%timeit\n",
914 | "\n",
915 | "\n",
916 | "index.nprobe = 10 # probe 10 of the 20 clusters (default nprobe is 1)\n",
917 | "\n",
918 | "D, I = index.search(x_corpus[:1], k) # actual search"
919 | ]
920 | },
921 | {
922 | "cell_type": "markdown",
923 | "id": "5e98f114-4d94-4b59-84f2-73232fc834da",
924 | "metadata": {
925 | "execution": {
926 | "iopub.status.busy": "2023-04-26T00:07:36.790059Z",
927 | "iopub.status.idle": "2023-04-26T00:07:36.790366Z",
928 | "shell.execute_reply": "2023-04-26T00:07:36.790209Z",
929 | "shell.execute_reply.started": "2023-04-26T00:07:36.790195Z"
930 | }
931 | },
932 | "source": [
933 | "in the above, with `nprobe = 10` we search 10 of the 20 clusters, i.e. roughly half of the search space"
934 | ]
935 | },
936 | {
937 | "cell_type": "code",
938 | "execution_count": null,
939 | "id": "c1045fcc-358f-4216-a00e-e0db6e1811a4",
940 | "metadata": {},
941 | "outputs": [],
942 | "source": []
943 | },
944 | {
945 | "cell_type": "code",
946 | "execution_count": null,
947 | "id": "50e83d62-b203-4048-95df-74e61fc2aa0c",
948 | "metadata": {},
949 | "outputs": [],
950 | "source": []
951 | },
952 | {
953 | "cell_type": "code",
954 | "execution_count": 24,
955 | "id": "2a140a52-213a-4da1-8077-8be794a36f30",
956 | "metadata": {
957 | "execution": {
958 | "iopub.execute_input": "2023-04-26T14:53:20.758823Z",
959 | "iopub.status.busy": "2023-04-26T14:53:20.758438Z",
960 | "iopub.status.idle": "2023-04-26T14:53:24.717603Z",
961 | "shell.execute_reply": "2023-04-26T14:53:24.716457Z",
962 | "shell.execute_reply.started": "2023-04-26T14:53:20.758795Z"
963 | }
964 | },
965 | "outputs": [
966 | {
967 | "name": "stdout",
968 | "output_type": "stream",
969 | "text": [
970 | "4.88 ms ± 57.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
971 | ]
972 | }
973 | ],
974 | "source": [
975 | "%%timeit\n",
976 | "\n",
977 | "\n",
978 | "index.nprobe = 20 # probe all 20 clusters (default nprobe is 1)\n",
979 | "\n",
980 | "D, I = index.search(x_corpus[:1], k) # actual search"
981 | ]
982 | },
983 | {
984 | "cell_type": "markdown",
985 | "id": "971f14de-ec0c-4ac7-b63a-135ec7444834",
986 | "metadata": {
987 | "execution": {
988 | "iopub.status.busy": "2023-04-26T00:07:36.792692Z",
989 | "iopub.status.idle": "2023-04-26T00:07:36.792990Z",
990 | "shell.execute_reply": "2023-04-26T00:07:36.792856Z",
991 | "shell.execute_reply.started": "2023-04-26T00:07:36.792842Z"
992 | }
993 | },
994 | "source": [
995 | "in the above, we are querying the entire search space. This is the same as using Brute Force."
996 | ]
997 | },
998 | {
999 | "cell_type": "code",
1000 | "execution_count": null,
1001 | "id": "ec6f1eb2-a9cd-47d9-86bc-5c58ed5dbb1d",
1002 | "metadata": {},
1003 | "outputs": [],
1004 | "source": []
1005 | },
1006 | {
1007 | "cell_type": "code",
1008 | "execution_count": null,
1009 | "id": "fb11b2cd-eef5-4b15-a719-e9c5cbe0fb4f",
1010 | "metadata": {},
1011 | "outputs": [],
1012 | "source": []
1013 | },
1014 | {
1015 | "cell_type": "markdown",
1016 | "id": "d8524b69-4efd-4a6d-9f10-499f299ff762",
1017 | "metadata": {},
1018 | "source": [
1019 | "search with every vector in the corpus as a query"
1020 | ]
1021 | },
1022 | {
1023 | "cell_type": "code",
1024 | "execution_count": 25,
1025 | "id": "60795b88-8957-4c49-a729-22f935c4dc3f",
1026 | "metadata": {
1027 | "execution": {
1028 | "iopub.execute_input": "2023-04-26T14:53:24.719250Z",
1029 | "iopub.status.busy": "2023-04-26T14:53:24.718697Z",
1030 | "iopub.status.idle": "2023-04-26T14:53:25.774560Z",
1031 | "shell.execute_reply": "2023-04-26T14:53:25.773678Z",
1032 | "shell.execute_reply.started": "2023-04-26T14:53:24.719219Z"
1033 | }
1034 | },
1035 | "outputs": [
1036 | {
1037 | "name": "stdout",
1038 | "output_type": "stream",
1039 | "text": [
1040 | "CPU times: user 8.18 s, sys: 23.5 ms, total: 8.21 s\n",
1041 | "Wall time: 1.05 s\n"
1042 | ]
1043 | }
1044 | ],
1045 | "source": [
1046 | "%%time\n",
1047 | "\n",
1048 | "\n",
1049 | "index.nprobe = 1 \n",
1050 | "\n",
1051 | "D, I = index.search(x_corpus, k) # actual search"
1052 | ]
1053 | },
1054 | {
1055 | "cell_type": "code",
1056 | "execution_count": 26,
1057 | "id": "ddbabfeb-2b27-412c-b77b-1d795b00aa29",
1058 | "metadata": {
1059 | "execution": {
1060 | "iopub.execute_input": "2023-04-26T14:53:25.781104Z",
1061 | "iopub.status.busy": "2023-04-26T14:53:25.778777Z",
1062 | "iopub.status.idle": "2023-04-26T14:53:25.791108Z",
1063 | "shell.execute_reply": "2023-04-26T14:53:25.790296Z",
1064 | "shell.execute_reply.started": "2023-04-26T14:53:25.781066Z"
1065 | }
1066 | },
1067 | "outputs": [
1068 | {
1069 | "data": {
1070 | "text/plain": [
1071 | "{'recall@1': 24954, 'num_vectors': 24954, 'mismatch': 0}"
1072 | ]
1073 | },
1074 | "execution_count": 26,
1075 | "metadata": {},
1076 | "output_type": "execute_result"
1077 | }
1078 | ],
1079 | "source": [
1080 | "z = I[:,0] == np.array( list(range(len(x_corpus))))\n",
1081 | "{\n",
1082 | " \"recall@1\": z.sum()\n",
1083 | " , \"num_vectors\": len(z)\n",
1084 | " , \"mismatch\": len(z) - z.sum()\n",
1085 | "}\n"
1086 | ]
1087 | },
1088 | {
1089 | "cell_type": "code",
1090 | "execution_count": null,
1091 | "id": "6c9da4ac-9d1f-480a-ac9f-ea97e1d16f9d",
1092 | "metadata": {},
1093 | "outputs": [],
1094 | "source": []
1095 | },
1096 | {
1097 | "cell_type": "markdown",
1098 | "id": "2b5515dc-09b7-459c-b93c-80424faac839",
1099 | "metadata": {},
1100 | "source": [
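1101 | "increase the number of cells that are probed. Note: `%%timeit` does not keep variables assigned inside the cell, so `D` and `I` from the timed search below are discarded; re-run the search without `%%timeit` (e.g. with `%%time`) before computing recall, otherwise the recall cell still reflects the earlier `nprobe = 1` results"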
1102 | ]
1103 | },
1104 | {
1105 | "cell_type": "code",
1106 | "execution_count": null,
1107 | "id": "687d36fe-de5d-4ee4-9941-7ca604b446fb",
1108 | "metadata": {},
1109 | "outputs": [],
1110 | "source": []
1111 | },
1112 | {
1113 | "cell_type": "code",
1114 | "execution_count": 27,
1115 | "id": "31c0b86b-2d5e-480d-be94-c136e7fe07d1",
1116 | "metadata": {
1117 | "execution": {
1118 | "iopub.execute_input": "2023-04-26T14:53:25.792414Z",
1119 | "iopub.status.busy": "2023-04-26T14:53:25.792073Z",
1120 | "iopub.status.idle": "2023-04-26T14:54:03.768762Z",
1121 | "shell.execute_reply": "2023-04-26T14:54:03.767827Z",
1122 | "shell.execute_reply.started": "2023-04-26T14:53:25.792388Z"
1123 | }
1124 | },
1125 | "outputs": [
1126 | {
1127 | "name": "stdout",
1128 | "output_type": "stream",
1129 | "text": [
1130 | "4.73 s ± 230 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
1131 | ]
1132 | }
1133 | ],
1134 | "source": [
1135 | "%%timeit\n",
1136 | "\n",
1137 | "index.nprobe = 5 # probe 5 of the 20 clusters (default nprobe is 1)\n",
1138 | "\n",
1139 | "D, I = index.search(x_corpus, k) "
1140 | ]
1141 | },
1142 | {
1143 | "cell_type": "code",
1144 | "execution_count": 28,
1145 | "id": "fbffb654-a3a6-4d42-956f-602dab97124e",
1146 | "metadata": {
1147 | "execution": {
1148 | "iopub.execute_input": "2023-04-26T14:54:03.777354Z",
1149 | "iopub.status.busy": "2023-04-26T14:54:03.769965Z",
1150 | "iopub.status.idle": "2023-04-26T14:54:03.787506Z",
1151 | "shell.execute_reply": "2023-04-26T14:54:03.786386Z",
1152 | "shell.execute_reply.started": "2023-04-26T14:54:03.777305Z"
1153 | }
1154 | },
1155 | "outputs": [
1156 | {
1157 | "data": {
1158 | "text/plain": [
1159 | "{'recall@1': 24954, 'num_vectors': 24954, 'mismatch': 0}"
1160 | ]
1161 | },
1162 | "execution_count": 28,
1163 | "metadata": {},
1164 | "output_type": "execute_result"
1165 | }
1166 | ],
1167 | "source": [
1168 | "z = I[:,0] == np.array( list(range(len(x_corpus))))\n",
1169 | "{\n",
1170 | " \"recall@1\": z.sum()\n",
1171 | " , \"num_vectors\": len(z)\n",
1172 | " , \"mismatch\": len(z) - z.sum()\n",
1173 | "}\n"
1174 | ]
1175 | },
1176 | {
1177 | "cell_type": "code",
1178 | "execution_count": null,
1179 | "id": "fd7e1354-2002-45dc-8732-9506ef6200cf",
1180 | "metadata": {},
1181 | "outputs": [],
1182 | "source": []
1183 | }
1184 | ],
1185 | "metadata": {
1186 | "environment": {
1187 | "kernel": "python3",
1188 | "name": "pytorch-gpu.1-13.m107",
1189 | "type": "gcloud",
1190 | "uri": "gcr.io/deeplearning-platform-release/pytorch-gpu.1-13:m107"
1191 | },
1192 | "kernelspec": {
1193 | "display_name": "Python 3",
1194 | "language": "python",
1195 | "name": "python3"
1196 | },
1197 | "language_info": {
1198 | "codemirror_mode": {
1199 | "name": "ipython",
1200 | "version": 3
1201 | },
1202 | "file_extension": ".py",
1203 | "mimetype": "text/x-python",
1204 | "name": "python",
1205 | "nbconvert_exporter": "python",
1206 | "pygments_lexer": "ipython3",
1207 | "version": "3.7.12"
1208 | }
1209 | },
1210 | "nbformat": 4,
1211 | "nbformat_minor": 5
1212 | }
1213 |
--------------------------------------------------------------------------------
/notebooks/workshop_setup.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 12,
6 | "id": "e9b14691-3881-4882-bae0-c46b23401f11",
7 | "metadata": {
8 | "execution": {
9 | "iopub.execute_input": "2023-04-26T14:49:50.397320Z",
10 | "iopub.status.busy": "2023-04-26T14:49:50.396553Z",
11 | "iopub.status.idle": "2023-04-26T14:49:50.401265Z",
12 | "shell.execute_reply": "2023-04-26T14:49:50.400495Z",
13 | "shell.execute_reply.started": "2023-04-26T14:49:50.397287Z"
14 | },
15 | "tags": []
16 | },
17 | "outputs": [],
18 | "source": [
19 | "import nltk\n",
20 | "from sentence_transformers import SentenceTransformer\n",
21 | "from transformers import AutoTokenizer\n"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": null,
27 | "id": "68926c5f-4643-4470-9e76-e1284acee82a",
28 | "metadata": {},
29 | "outputs": [],
30 | "source": []
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": null,
35 | "id": "0d2cc671-31cd-4b00-8010-01932aa66d88",
36 | "metadata": {},
37 | "outputs": [],
38 | "source": []
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 5,
43 | "id": "229a8069-c18a-4296-819e-fcebd7398fe8",
44 | "metadata": {
45 | "execution": {
46 | "iopub.execute_input": "2023-04-26T13:32:29.510730Z",
47 | "iopub.status.busy": "2023-04-26T13:32:29.509639Z",
48 | "iopub.status.idle": "2023-04-26T13:32:30.515597Z",
49 | "shell.execute_reply": "2023-04-26T13:32:30.514710Z",
50 | "shell.execute_reply.started": "2023-04-26T13:32:29.510700Z"
51 | }
52 | },
53 | "outputs": [
54 | {
55 | "name": "stderr",
56 | "output_type": "stream",
57 | "text": [
58 | "[nltk_data] Downloading package stopwords to\n",
59 | "[nltk_data] /home/jupyter/nltk_data...\n",
60 | "[nltk_data] Unzipping corpora/stopwords.zip.\n",
61 | "[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...\n",
62 | "[nltk_data] Unzipping tokenizers/punkt.zip.\n",
63 | "[nltk_data] Downloading package wordnet to /home/jupyter/nltk_data...\n",
64 | "[nltk_data] Downloading package omw-1.4 to /home/jupyter/nltk_data...\n"
65 | ]
66 | },
67 | {
68 | "data": {
69 | "text/plain": [
70 | "True"
71 | ]
72 | },
73 | "execution_count": 5,
74 | "metadata": {},
75 | "output_type": "execute_result"
76 | }
77 | ],
78 | "source": [
79 | "nltk.download('stopwords')\n",
80 | "nltk.download('punkt')\n",
81 | "nltk.download('wordnet')\n",
82 | "nltk.download('omw-1.4')"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "id": "da25e712-759d-4fa8-89bc-ab12e4094acf",
89 | "metadata": {},
90 | "outputs": [],
91 | "source": []
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": null,
96 | "id": "32a7451b-7fe5-48f2-af6e-1a9d90b63c14",
97 | "metadata": {},
98 | "outputs": [],
99 | "source": []
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": 14,
104 | "id": "6dd5ed16-c707-4ad5-97ee-6748e74af1a9",
105 | "metadata": {
106 | "execution": {
107 | "iopub.execute_input": "2023-04-26T14:50:36.001708Z",
108 | "iopub.status.busy": "2023-04-26T14:50:36.000639Z",
109 | "iopub.status.idle": "2023-04-26T14:50:36.005695Z",
110 | "shell.execute_reply": "2023-04-26T14:50:36.004847Z",
111 | "shell.execute_reply.started": "2023-04-26T14:50:36.001654Z"
112 | },
113 | "tags": []
114 | },
115 | "outputs": [],
116 | "source": [
117 | "models = ['sentence-transformers/all-MiniLM-L6-v2','sentence-transformers/clip-ViT-B-32' , 'sentence-transformers/clip-ViT-B-32-multilingual-v1']\n",
118 | "\n",
119 | "\n"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": 15,
125 | "id": "096eca1c-1ea2-4fa8-bc26-7c8ce41f46f1",
126 | "metadata": {
127 | "execution": {
128 | "iopub.execute_input": "2023-04-26T14:50:36.285375Z",
129 | "iopub.status.busy": "2023-04-26T14:50:36.284663Z",
130 | "iopub.status.idle": "2023-04-26T14:50:36.289042Z",
131 | "shell.execute_reply": "2023-04-26T14:50:36.288270Z",
132 | "shell.execute_reply.started": "2023-04-26T14:50:36.285345Z"
133 | },
134 | "tags": []
135 | },
136 | "outputs": [],
137 | "source": [
138 | "text = \"men shoes\""
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": 18,
144 | "id": "6958324b-63f0-4320-9880-83d7d920e55c",
145 | "metadata": {
146 | "execution": {
147 | "iopub.execute_input": "2023-04-26T14:51:44.205272Z",
148 | "iopub.status.busy": "2023-04-26T14:51:44.204519Z",
149 | "iopub.status.idle": "2023-04-26T14:51:49.430374Z",
150 | "shell.execute_reply": "2023-04-26T14:51:49.429497Z",
151 | "shell.execute_reply.started": "2023-04-26T14:51:44.205234Z"
152 | },
153 | "tags": []
154 | },
155 | "outputs": [
156 | {
157 | "name": "stdout",
158 | "output_type": "stream",
159 | "text": [
160 | "sentence-transformers/all-MiniLM-L6-v2 {'input_ids': tensor([ 101, 2273, 6007, 102], device='cuda:0'), 'token_type_ids': tensor([0, 0, 0, 0], device='cuda:0'), 'attention_mask': tensor([1, 1, 1, 1], device='cuda:0'), 'token_embeddings': tensor([[-0.2272, 0.0027, 0.1586, ..., -0.3998, -0.4343, -0.0824],\n",
161 | " [-0.0189, 0.0759, -0.6014, ..., -0.5021, 0.3637, 0.0202],\n",
162 | " [-1.3425, 0.1124, 0.1479, ..., -1.1672, -0.9619, -0.5084],\n",
163 | " [-0.3310, 0.2602, -0.1229, ..., -0.3570, -0.1665, 0.4503]],\n",
164 | " device='cuda:0'), 'sentence_embedding': tensor([-5.8684e-02, 1.3790e-02, -1.2774e-02, 1.4896e-02, 1.0860e-02,\n",
165 | " -6.4392e-02, 6.0345e-02, -6.9710e-02, -3.2946e-02, 4.2863e-03,\n",
166 | " -5.0637e-03, 9.6656e-02, -4.2155e-02, -2.6262e-03, -3.4538e-02,\n",
167 | " -1.3516e-02, -7.5456e-02, 2.6242e-02, -2.1382e-02, 2.3227e-02,\n",
168 | " 3.1208e-02, -3.7767e-02, -4.7549e-02, 4.5558e-03, -9.1842e-02,\n",
169 | " -8.6363e-03, 1.8699e-02, 6.6197e-02, -1.1301e-02, 1.3095e-02,\n",
170 | " 3.8383e-02, 9.8821e-03, 6.1136e-02, 2.6724e-02, -4.4304e-02,\n",
171 | " -8.6510e-02, 1.6309e-02, -3.2358e-02, 3.3459e-04, 9.5446e-02,\n",
172 | " -4.2075e-02, -1.2752e-01, -4.7495e-03, 5.5425e-02, 5.1755e-02,\n",
173 | " 3.0031e-02, 2.5861e-02, 1.5485e-02, -4.3001e-02, 1.2418e-01,\n",
174 | " 9.2281e-04, 2.5372e-02, -2.9569e-02, 2.1765e-02, 5.3302e-02,\n",
175 | " -1.9069e-02, -7.0468e-03, -2.1080e-02, -1.4390e-02, -6.2965e-02,\n",
176 | " 1.2275e-01, 7.1885e-04, -6.6201e-02, 2.3436e-02, 3.0998e-02,\n",
177 | " 1.5899e-02, -4.3535e-02, 4.4102e-03, -1.0930e-02, 4.9731e-02,\n",
178 | " 2.4521e-02, -1.8387e-02, -1.5710e-02, 2.5058e-02, -2.1643e-02,\n",
179 | " 2.9889e-02, -7.1892e-02, -7.9145e-02, -6.6582e-02, -2.8073e-02,\n",
180 | " -4.3930e-02, -3.0970e-02, -9.7531e-03, -1.1777e-02, 1.0500e-02,\n",
181 | " -2.3791e-02, 6.4381e-03, -1.2262e-04, -2.8900e-02, 4.9136e-02,\n",
182 | " -1.3542e-01, -7.4252e-02, 1.3062e-02, -1.8075e-03, -6.6097e-02,\n",
183 | " -7.3589e-03, 1.3785e-02, 3.7384e-02, -8.5169e-02, 1.1559e-01,\n",
184 | " 2.1454e-02, -1.0230e-02, -3.5801e-02, -1.7123e-02, 1.9046e-02,\n",
185 | " 2.3251e-02, -1.7273e-02, 8.2963e-02, 1.8455e-02, 8.6537e-02,\n",
186 | " -2.5882e-02, 2.0791e-02, -7.3512e-02, 1.8831e-02, -5.0279e-02,\n",
187 | " -8.1372e-02, -7.6812e-03, 7.2167e-02, 7.7383e-02, 7.4222e-02,\n",
188 | " 9.1467e-03, 2.2712e-02, 3.6326e-02, -2.0288e-02, -3.1550e-02,\n",
189 | " 7.1721e-03, -3.4719e-02, -4.2586e-33, 5.7250e-03, 1.9000e-02,\n",
190 | " 1.1605e-02, -2.7531e-02, 3.0725e-02, 1.2585e-02, 2.1076e-02,\n",
191 | " -5.6845e-02, 1.6838e-03, -4.8026e-04, -2.1989e-02, 9.2065e-02,\n",
192 | " 7.0795e-03, -4.6472e-02, 5.3270e-02, -5.0890e-02, 7.3680e-02,\n",
193 | " 5.6044e-03, -7.5194e-02, -8.5179e-02, -7.4869e-03, 6.3311e-02,\n",
194 | " -1.0171e-02, 2.0384e-02, 5.0803e-02, -6.6822e-03, -1.5785e-02,\n",
195 | " -3.5164e-02, 4.9931e-02, 8.2824e-03, 9.2603e-02, -2.0116e-02,\n",
196 | " 4.5553e-02, 1.2080e-02, -5.9395e-02, 2.1852e-02, 5.8675e-02,\n",
197 | " 2.2210e-02, 9.2596e-03, -2.3409e-02, 4.5004e-02, -5.1507e-02,\n",
198 | " 2.8044e-02, -6.5380e-03, 7.4831e-03, 5.8157e-02, 3.4447e-02,\n",
199 | " 4.1475e-02, -6.1901e-02, -1.6561e-02, -4.5630e-02, 4.9367e-02,\n",
200 | " -1.6766e-02, -5.7993e-02, 5.2082e-02, -8.8322e-02, 2.9853e-02,\n",
201 | " 2.4510e-02, -9.7888e-03, 4.2248e-03, 7.1062e-03, 1.1276e-01,\n",
202 | " 7.1893e-02, 1.2149e-03, -5.7319e-02, -7.2250e-02, 5.5623e-02,\n",
203 | " 1.4792e-02, -1.4871e-02, 6.0891e-02, -3.7015e-02, -4.5080e-02,\n",
204 | " 6.8613e-02, 5.5002e-02, -1.1833e-02, 2.1440e-02, -2.9428e-02,\n",
205 | " 7.0045e-02, -5.4634e-02, -6.3302e-02, -9.2292e-02, -3.5121e-02,\n",
206 | " 2.7007e-02, 2.0125e-02, -3.6556e-02, -1.3703e-02, -6.1224e-02,\n",
207 | " -1.0986e-02, 1.5851e-02, -2.6837e-02, -1.1262e-01, -4.3161e-02,\n",
208 | " -1.4493e-02, 7.6786e-03, -7.4723e-03, 2.1595e-33, 3.5767e-02,\n",
209 | " 1.1363e-01, -1.4693e-02, 9.2684e-02, 3.3924e-02, -1.5614e-02,\n",
210 | " -1.6073e-02, -6.0525e-04, 2.5468e-03, 9.1995e-04, -2.1501e-02,\n",
211 | " 7.5731e-03, 7.2208e-02, 4.1097e-02, 2.0974e-02, -3.6957e-02,\n",
212 | " 7.8952e-02, 4.7166e-02, 1.3827e-02, 1.5595e-02, 3.9452e-02,\n",
213 | " -2.2254e-02, 2.3012e-02, -3.3751e-02, -7.7026e-02, 2.4340e-02,\n",
214 | " 1.0263e-01, -3.7329e-02, -1.6680e-01, 7.4254e-02, 2.9221e-02,\n",
215 | " -6.3063e-04, -5.0962e-02, 4.1396e-02, -1.9880e-02, 1.0247e-02,\n",
216 | " -1.7069e-01, 9.8603e-02, 5.3988e-02, 1.2776e-02, 4.2524e-02,\n",
217 | " -3.5128e-03, 3.9324e-02, 4.7423e-02, -2.8995e-02, -5.1358e-02,\n",
218 | " -9.3995e-03, 3.7709e-03, -3.8401e-02, -5.3929e-02, -1.8980e-02,\n",
219 | " 3.6103e-02, -6.3064e-02, -4.9003e-02, -5.8757e-02, -4.0821e-04,\n",
220 | " -1.0862e-01, 4.6522e-02, -5.9556e-02, 7.4809e-02, -3.2031e-02,\n",
221 | " 6.8497e-02, 2.7983e-02, -5.1754e-03, -4.0551e-02, 1.9033e-02,\n",
222 | " -1.1128e-02, 8.7901e-03, -3.8768e-02, -3.0256e-02, 9.6022e-02,\n",
223 | " -1.0334e-01, 8.3221e-02, 1.0190e-01, -2.3660e-02, -1.2010e-03,\n",
224 | " -2.6769e-02, 6.1521e-02, 2.9026e-02, -5.4091e-02, 3.6390e-03,\n",
225 | " -9.2658e-02, 7.8827e-03, 1.1917e-01, -5.4807e-02, 1.5506e-01,\n",
226 | " 4.6332e-02, 4.9145e-02, 5.1203e-03, 4.5503e-02, 1.6287e-02,\n",
227 | " 4.3451e-02, 4.7670e-02, 1.7630e-02, -1.9633e-02, -1.1029e-08,\n",
228 | " 6.5470e-02, 5.2181e-02, 3.3727e-02, 3.1607e-02, -1.7458e-02,\n",
229 | " -1.0251e-04, -1.4603e-02, 2.6509e-02, 7.8088e-02, 1.1715e-02,\n",
230 | " -7.6623e-02, -1.9881e-02, -3.8228e-02, 8.3098e-02, 3.7713e-02,\n",
231 | " 5.6128e-02, -4.2606e-02, -1.7351e-02, -4.4235e-02, -6.9647e-02,\n",
232 | " 7.4529e-03, -5.4775e-02, 4.2709e-02, 1.1223e-01, -1.0754e-02,\n",
233 | " 1.0371e-02, 2.4825e-02, -1.1770e-01, 1.0997e-02, 1.0135e-01,\n",
234 | " 1.2151e-02, 2.0672e-02, 1.5000e-02, -3.0694e-02, 7.1544e-02,\n",
235 | " 3.1737e-02, -3.7621e-03, -6.8717e-03, 5.2807e-02, -8.3122e-02,\n",
236 | " -9.0656e-02, -1.2168e-01, 4.1270e-02, -2.7971e-02, -4.6418e-02,\n",
237 | " -5.2283e-02, 3.0334e-04, 1.3859e-01, -1.0576e-01, 3.3442e-02,\n",
238 | " 8.5742e-03, -2.2963e-02, 5.4894e-02, -1.8929e-02, 1.1873e-04,\n",
239 | " -7.1593e-02, -1.7575e-02, 9.0126e-02, 2.2157e-02, -4.3015e-02,\n",
240 | " 3.9037e-02, -7.4174e-02, -3.6659e-02, -3.6811e-03], device='cuda:0')}\n",
241 | "sentence-transformers/clip-ViT-B-32 {'input_ids': tensor([49406, 1656, 4079, 49407], device='cuda:0'), 'attention_mask': tensor([1, 1, 1, 1], device='cuda:0'), 'image_text_info': 1, 'sentence_embedding': tensor([-2.6722e-01, -2.5024e-01, 1.9831e-01, -8.4449e-02, -4.5411e-01,\n",
242 | " 1.5269e-01, 1.9422e-01, -7.6131e-01, 1.6059e-02, 5.6570e-02,\n",
243 | " -1.1732e-01, -1.7434e-01, -4.8341e-02, -5.8244e-02, 1.9622e-03,\n",
244 | " 3.3086e-02, 4.6739e-01, 5.9902e-02, -1.4106e-01, -2.5187e-01,\n",
245 | " 1.1865e-01, 2.7707e-01, -2.4586e-01, 1.6738e-01, -6.5906e-02,\n",
246 | " -4.9510e-01, 7.1441e-02, 4.7636e-02, -1.1008e-01, 2.0449e-01,\n",
247 | " 1.9936e-02, -1.8740e-01, -1.1138e-02, 1.2773e-01, -5.5551e-01,\n",
248 | " -4.6653e-02, 1.8026e-01, -9.6517e-02, 1.2344e-01, -1.3597e-01,\n",
249 | " -1.9992e-01, -1.9570e-01, 1.5068e-01, -5.0438e-01, 9.0387e-02,\n",
250 | " 2.1159e-01, 5.4030e-03, -4.0513e-02, 2.7146e-01, 3.7458e-02,\n",
251 | " 8.0310e-02, -2.9841e-02, 5.1565e-02, -6.1074e-01, -2.5701e-01,\n",
252 | " 1.3316e-01, -1.1232e-01, -5.3493e-03, -6.2610e-01, -3.1894e-01,\n",
253 | " 1.9283e-01, 2.0204e-02, -7.6878e-02, -4.0941e-01, 3.9715e-01,\n",
254 | " 3.7647e-02, 1.9459e-01, 3.6569e-01, 3.9625e-01, -1.1323e-01,\n",
255 | " 4.1932e-01, -2.1839e-01, 1.6538e-01, 3.2633e-01, 2.7650e-01,\n",
256 | " 2.5525e-01, -7.0886e-02, -2.6799e-02, -2.8865e-01, -4.0908e-01,\n",
257 | " -2.3828e-01, 1.0888e-01, 1.9548e-02, 1.9067e-01, 1.2776e-01,\n",
258 | " 4.7739e-01, -2.3506e-01, -1.8611e-01, 1.3253e-01, -2.1160e-01,\n",
259 | " -5.1571e-01, -2.1936e-01, -1.3583e+00, 6.3813e-01, 1.9401e-01,\n",
260 | " -1.7399e-01, 2.3773e-02, 1.7300e-01, 2.0512e-01, 3.8955e-01,\n",
261 | " 1.2784e-02, -1.5616e-01, 2.7312e-01, 1.1615e-01, 9.6609e-02,\n",
262 | " 2.5802e-01, -1.6590e-01, -2.5717e-01, -2.7002e-01, 2.2556e-01,\n",
263 | " -5.9875e-01, 4.1399e-01, 1.0067e-01, 2.3802e-01, 1.8393e-01,\n",
264 | " -7.7338e-02, 2.8407e-01, -1.8666e-01, 1.8673e-01, -2.7489e-02,\n",
265 | " 6.1416e-02, -5.3712e-02, -3.1903e-01, 2.2104e-01, 4.5745e-01,\n",
266 | " -1.4629e-01, -2.0385e-01, -3.0613e-03, 1.0831e-01, 1.3029e-01,\n",
267 | " 7.5172e-02, 1.7098e-01, -1.7295e-01, 6.0518e+00, -4.3587e-01,\n",
268 | " -3.6123e-02, -2.3084e-01, -2.7611e-01, -2.6074e-01, -2.6225e-02,\n",
269 | " -1.2373e-01, -3.8144e-02, 3.9868e-02, 1.2663e-01, -1.5586e-02,\n",
270 | " -1.0817e-01, -1.9468e-01, -4.2929e-01, 1.9576e-01, 4.2164e-02,\n",
271 | " 2.4703e-01, 1.4054e-01, 5.2742e-01, 1.1616e-02, -1.9511e-01,\n",
272 | " 1.8861e-02, -3.4501e-02, 2.4374e-01, -1.0141e-01, -8.6388e-02,\n",
273 | " -3.6278e-02, -6.0471e-02, 1.7178e-01, 4.3191e-02, -5.6021e-02,\n",
274 | " 2.1537e-01, 2.2960e-01, -2.8191e-01, 3.6694e-01, -2.1738e-01,\n",
275 | " -1.5248e-01, -5.3858e-01, 4.8218e-04, 1.0611e-01, -4.3668e-01,\n",
276 | " 3.2565e-01, 2.1495e-01, 7.9084e-02, -1.1144e-01, -8.2238e-02,\n",
277 | " -1.0823e-02, 3.2505e-01, -1.6962e-02, 2.4763e-03, -2.1186e-01,\n",
278 | " -1.7482e-01, 5.7193e-01, 2.6038e-02, -2.0085e-01, 3.7864e-01,\n",
279 | " 2.8407e-02, -1.4909e-01, 4.4668e-02, -9.7271e-02, -7.5325e-02,\n",
280 | " 2.7872e-01, -1.5554e-01, 2.2307e-01, 9.8059e-02, -4.1579e-02,\n",
281 | " -2.8606e-02, -2.3032e-01, 3.1150e-01, -7.1310e-02, -2.4904e-01,\n",
282 | " -7.7626e-02, -1.2460e-01, -2.6306e-02, -1.9954e-01, 8.6190e-02,\n",
283 | " 3.0547e-05, 2.2495e-03, 2.2625e-01, -1.2017e-01, -1.5577e-01,\n",
284 | " 3.8977e-01, 6.9040e-03, -1.5162e-01, -1.7155e-01, 1.7012e-01,\n",
285 | " 2.9838e-01, -3.6955e-01, 1.8884e-01, -3.1549e-01, -2.8930e-01,\n",
286 | " -5.5932e-01, 3.5717e-01, 2.4953e-01, -2.4460e-01, 7.5881e-03,\n",
287 | " -1.7940e-01, -2.2461e-02, -1.8611e-01, 1.4540e-01, 3.2545e-01,\n",
288 | " 1.3929e-01, -3.1123e-01, -2.8450e-01, 4.3041e-01, -4.7437e-02,\n",
289 | " 6.9766e-02, -1.7365e-01, 5.9896e-02, 5.9521e-01, -4.6284e-02,\n",
290 | " 3.8939e-02, 1.3634e-01, -1.1272e-01, 1.6409e-01, 9.2894e-03,\n",
291 | " -4.9759e-02, -4.7318e-02, 1.4064e-01, -1.3499e-01, -3.7380e-02,\n",
292 | " -1.2251e-01, 5.5834e-01, -2.6055e-01, 9.8233e-02, -5.9575e-02,\n",
293 | " 5.2060e-02, -3.2568e-02, -2.9983e-02, 1.0168e-01, 1.8758e-01,\n",
294 | " -4.8515e-02, 2.4001e-01, -1.5750e-02, 1.0778e-01, 6.0214e-02,\n",
295 | " -2.2326e-01, -2.0359e-02, 2.0070e-01, 4.8625e-02, -1.4054e-02,\n",
296 | " 9.4663e-02, -1.7493e-01, 2.9097e-02, 1.6428e-01, -2.8810e-01,\n",
297 | " 1.3145e-01, 1.0770e-01, -3.8759e-02, -8.3538e-02, -7.6144e-02,\n",
298 | " -5.5353e-04, 3.1190e-01, -1.5708e-01, 7.7211e-02, -8.1434e-03,\n",
299 | " 2.1436e-01, -9.9840e-02, 1.8908e-01, 4.1516e-01, 1.0249e-02,\n",
300 | " -3.0407e-01, -3.6515e-01, -1.2376e-01, 2.7788e-02, 1.9021e-02,\n",
301 | " 1.8605e-01, 2.9233e-01, 5.6915e-02, 2.0724e-01, 7.0265e-02,\n",
302 | " -9.5444e-02, -2.1719e-01, 1.0457e-01, -2.2053e-01, 1.3246e-01,\n",
303 | " -1.5537e-01, -9.5731e-02, 6.0466e+00, 4.0778e-01, 1.1285e-02,\n",
304 | " 1.1125e-01, -2.3022e-01, 3.2817e-01, 3.3474e-01, 1.3995e-01,\n",
305 | " 1.5649e-01, 3.0448e-01, -1.5101e-02, -1.7316e-01, -3.8413e-01,\n",
306 | " 2.2023e-01, -4.7145e-01, 9.5377e-02, -1.6545e-01, -1.7516e+00,\n",
307 | " 2.9473e-01, 1.4115e-01, 1.8588e-01, -1.2569e-01, 6.5745e-02,\n",
308 | " -1.4898e-01, -9.3745e-02, 3.1413e-01, 1.2686e-01, -1.0292e-01,\n",
309 | " -2.1954e-01, 3.0948e-02, -4.2561e-01, -2.6715e-01, 5.5388e-02,\n",
310 | " -8.8885e-02, -9.3119e-02, 2.2464e-01, -4.0584e-02, 1.0608e-01,\n",
311 | " 2.8624e-01, -1.7813e-01, -5.1491e-02, 6.9263e-02, 1.7411e-01,\n",
312 | " -4.3583e-01, -1.5844e-01, 1.3288e-02, 1.3611e-01, 2.8650e-01,\n",
313 | " 2.2427e-01, 2.9285e-01, -8.2768e-02, -2.1313e-01, 4.6977e-01,\n",
314 | " 1.3981e-01, 4.4196e-01, 5.9897e-01, -2.0943e-01, 1.3199e-01,\n",
315 | " -1.2770e-01, 7.0080e-02, -8.7112e-02, 2.5521e-01, -3.5098e-01,\n",
316 | " 2.7341e-01, -3.3966e-02, -1.0843e-01, 4.3469e-01, -6.2516e-02,\n",
317 | " -1.0742e-01, -3.9380e-01, 2.8758e-02, -7.4800e-01, -1.1042e-01,\n",
318 | " 7.4263e-02, -1.0410e-01, -3.2008e-01, -2.6784e-01, 8.2537e-03,\n",
319 | " -3.0056e-01, 1.5119e-01, -1.6852e-01, -5.2178e-03, -1.4731e-01,\n",
320 | " 4.4588e-01, 3.7448e-01, -3.9732e-01, 1.3030e-01, 6.4875e-01,\n",
321 | " -2.1301e-01, 1.2479e-01, 2.6747e-01, -2.6366e-01, -1.2018e-01,\n",
322 | " -2.2133e-02, -1.6639e-02, -1.1777e-01, -3.0975e-01, 1.7078e-01,\n",
323 | " -1.0861e-04, 1.3197e-01, -2.0081e-01, -2.6618e-01, 3.1587e-02,\n",
324 | " 1.0479e-01, 8.0152e-02, 1.1496e-01, -1.7642e-01, -1.5247e-01,\n",
325 | " -4.1968e-01, 2.1120e-01, 1.1615e-01, -2.0123e-01, -1.9122e-01,\n",
326 | " 6.6410e-02, -2.2218e-01, 1.0440e-01, 7.5966e-03, 4.2399e-01,\n",
327 | " -9.5817e-02, -3.5184e-01, -6.2223e-02, -1.7677e-01, -4.7161e-01,\n",
328 | " -3.8534e-02, -8.0673e-02, -4.5963e-03, -6.4624e-03, -1.8014e-02,\n",
329 | " -1.1445e-01, 4.2948e-02, -2.5978e-01, 5.9180e-02, 3.8135e-02,\n",
330 | " 2.0812e-01, -2.4710e-01, -3.7079e-01, 2.0508e-01, 2.1899e-01,\n",
331 | " -9.4519e-01, 2.7558e-01, 2.7138e-01, 1.4367e-01, -1.5441e-01,\n",
332 | " -1.8515e-01, 9.0163e-02, 2.0217e-01, -1.0698e-01, -2.3912e-01,\n",
333 | " -2.2943e-01, 2.2451e-01, 1.2780e-01, 1.2775e-01, 3.1585e-01,\n",
334 | " -2.1394e-01, -1.2684e-01, -9.6884e-02, -6.5346e-01, -2.9291e-01,\n",
335 | " -5.3387e-01, -2.2139e-02, -1.9091e-01, -5.3790e-01, -1.2427e-01,\n",
336 | " 3.3499e-01, -1.1048e-02, 6.9154e-02, -4.7823e-02, -1.6247e-02,\n",
337 | " 1.7433e-01, -6.0335e-01, -1.3460e-02, -3.3694e-01, -4.4399e-01,\n",
338 | " -4.2622e-02, 3.5184e-01, -1.3890e-01, 2.1893e-01, -1.5696e-02,\n",
339 | " 1.5114e-01, 1.4282e-01, 2.9100e-01, 8.3577e-01, -7.7831e-02,\n",
340 | " -6.1096e-02, -5.0251e-02, -2.8395e-01, -1.4788e-01, -1.7162e-01,\n",
341 | " -9.0106e-02, 2.4305e-01, -1.6364e-01, 2.7089e-01, 3.6500e-01,\n",
342 | " -2.4414e-01, -4.1538e-01, 3.7322e-01, 3.0024e-01, 2.3689e-01,\n",
343 | " 7.3204e-02, -3.4812e-02], device='cuda:0')}\n",
344 | "Model name:sentence-transformers/clip-ViT-B-32 ; tokenizer doesn't exist; sentence-transformers/clip-ViT-B-32 does not appear to have a file named config.json. Checkout 'https://huggingface.co/sentence-transformers/clip-ViT-B-32/main' for available files.\n",
345 | "sentence-transformers/clip-ViT-B-32-multilingual-v1 {'input_ids': tensor([ 101, 10588, 48201, 47125, 102], device='cuda:0'), 'attention_mask': tensor([1, 1, 1, 1, 1], device='cuda:0'), 'token_embeddings': tensor([[ 0.2305, 0.0549, -0.1571, ..., 0.5232, -0.0930, -0.1172],\n",
346 | " [-0.2680, 0.2789, -0.0539, ..., 0.4740, -0.2368, -0.3543],\n",
347 | " [ 0.3758, 0.2741, -0.3703, ..., 0.7613, 0.3820, -0.0288],\n",
348 | " [ 0.4044, 0.2899, -0.4232, ..., 0.8181, 0.3925, -0.0369],\n",
349 | " [ 0.1187, 0.0554, -0.3584, ..., 0.3664, 0.1128, -0.1268]],\n",
350 | " device='cuda:0'), 'sentence_embedding': tensor([-2.0334e-01, -1.6914e-01, 7.4771e-02, -1.4223e-01, -3.0504e-01,\n",
351 | " 2.0065e-01, 9.7776e-02, -9.3886e-01, -2.3021e-03, 9.3738e-02,\n",
352 | " 1.8778e-02, -2.6235e-01, -1.2688e-01, -1.7641e-01, 9.0206e-02,\n",
353 | " -6.5903e-02, 1.5076e-01, 6.2253e-02, -3.2784e-02, -2.5361e-01,\n",
354 | " 2.5175e-01, 3.9041e-01, -9.7813e-02, 2.4359e-01, 4.1893e-02,\n",
355 | " -3.3285e-01, 9.3473e-02, 1.3778e-01, -1.5428e-01, 2.4548e-01,\n",
356 | " 1.9890e-02, -2.7075e-01, -1.8032e-02, 6.4909e-02, -4.3061e-01,\n",
357 | " -1.1657e-01, 2.4541e-01, -4.9875e-02, 2.3737e-02, -2.3621e-02,\n",
358 | " -3.1835e-02, -2.0869e-01, 5.7527e-02, -5.0633e-01, 1.3199e-01,\n",
359 | " 2.9206e-01, 2.9964e-02, -3.3747e-02, 2.3226e-01, 3.6006e-02,\n",
360 | " 1.2699e-01, 9.7488e-02, 7.3582e-02, -5.8144e-01, -2.2830e-01,\n",
361 | " 1.3345e-01, -5.1341e-02, -8.0402e-02, -5.1184e-01, -1.9752e-01,\n",
362 | " 1.4001e-01, -5.0529e-02, 2.0968e-02, -3.1461e-01, 2.1346e-01,\n",
363 | " -3.8276e-02, 1.8773e-01, 2.2569e-01, 1.9953e-01, -7.3426e-02,\n",
364 | " 4.3863e-01, -1.7246e-01, 1.5217e-01, 1.7116e-01, 2.1022e-01,\n",
365 | " 4.1116e-01, 1.4070e-01, -1.1562e-01, -2.2117e-01, -4.7348e-01,\n",
366 | " -2.1663e-01, 4.4288e-03, 1.7295e-02, 2.5705e-01, 1.1410e-01,\n",
367 | " 5.6728e-01, -2.1271e-01, -2.2724e-01, 1.2777e-01, -1.4383e-01,\n",
368 | " -3.4052e-01, -1.4664e-01, -1.4763e+00, 6.1955e-01, 1.6093e-01,\n",
369 | " -1.7167e-01, 2.3186e-02, 1.3621e-01, 9.5215e-02, 3.2798e-01,\n",
370 | " -1.8117e-02, -4.4082e-02, 2.5197e-01, 1.7477e-01, 1.9436e-01,\n",
371 | " 3.0611e-01, -8.9931e-02, -7.9702e-02, -2.5298e-01, 2.7659e-01,\n",
372 | " -5.9393e-01, 4.1021e-01, 7.9611e-02, 1.3278e-01, 1.3027e-01,\n",
373 | " -4.7478e-02, 1.4775e-01, -1.3919e-02, 3.9992e-02, 1.2915e-01,\n",
374 | " -2.0684e-02, -1.4745e-01, -2.2467e-01, 2.0943e-01, 4.3064e-01,\n",
375 | " -1.1368e-01, -1.3093e-01, 8.6813e-03, 1.4154e-02, 1.3849e-01,\n",
376 | " 1.8656e-01, 1.8730e-01, -3.0004e-01, 5.7339e+00, -3.2605e-01,\n",
377 | " -4.2595e-02, -4.1856e-01, -3.0168e-01, -8.9973e-02, 9.5640e-04,\n",
378 | " -1.4158e-02, -1.0548e-02, -1.1700e-01, 2.3475e-01, -2.6989e-02,\n",
379 | " -1.3903e-01, -9.5062e-02, -3.3901e-01, 3.1460e-01, -5.6003e-02,\n",
380 | " 2.0593e-01, 9.9136e-02, 5.2334e-01, 7.3437e-02, -1.5021e-01,\n",
381 | " -6.6951e-02, -6.3785e-02, 2.3081e-01, 2.6384e-02, -8.1484e-02,\n",
382 | " -4.4474e-02, -4.0740e-02, 2.5724e-01, 3.1777e-02, -9.1613e-02,\n",
383 | " 3.9941e-02, 4.0293e-02, -3.0676e-01, 3.4817e-01, -2.2957e-01,\n",
384 | " -5.3091e-02, -3.3662e-01, -5.1250e-02, 1.4828e-01, -2.7174e-01,\n",
385 | " 3.7047e-01, -3.9839e-02, 1.3486e-01, -5.5569e-02, -6.2613e-02,\n",
386 | " 2.6114e-02, 2.9128e-01, 6.0373e-02, -1.2501e-02, -1.6453e-01,\n",
387 | " -9.3317e-03, 4.4107e-01, 1.1232e-01, -1.3276e-01, 3.4949e-01,\n",
388 | " 9.3545e-02, 4.6813e-03, -2.4483e-02, 6.4613e-02, -1.1478e-01,\n",
389 | " 3.0463e-01, -4.4370e-02, 2.0068e-01, -4.3869e-02, -8.9185e-02,\n",
390 | " 1.1884e-01, -1.3263e-01, 3.5835e-01, -1.0064e-01, -3.2594e-01,\n",
391 | " -1.5505e-01, -5.2067e-02, -1.1747e-03, -7.0044e-02, 4.5385e-02,\n",
392 | " -6.7558e-02, 1.6928e-01, 2.6598e-01, 6.8568e-02, -1.9572e-01,\n",
393 | " 2.9449e-01, 4.5720e-03, -6.7686e-02, -3.0300e-01, 2.0569e-01,\n",
394 | " 2.2198e-01, -2.7167e-01, 2.2274e-01, -1.5873e-01, -3.3218e-01,\n",
395 | " -4.9440e-01, 3.7218e-01, 2.3206e-01, -1.8265e-01, 1.3722e-01,\n",
396 | " -2.2112e-01, -3.3449e-02, -1.0907e-01, 3.2806e-02, 2.4866e-01,\n",
397 | " 2.5990e-02, -1.8595e-01, -2.5502e-01, 2.8818e-01, -2.1293e-01,\n",
398 | " 1.0490e-01, -6.9663e-02, 1.0206e-01, 5.4036e-01, 6.0295e-03,\n",
399 | " -8.6853e-03, 7.8703e-02, -6.7371e-02, 6.5200e-02, 9.7390e-03,\n",
400 | " 3.3331e-02, -1.1727e-01, 2.7078e-02, -7.0596e-02, 2.8880e-02,\n",
401 | " -1.6717e-01, 4.1236e-01, -1.3797e-01, 1.2633e-02, 4.0977e-02,\n",
402 | " -2.9763e-02, -4.3454e-02, 2.7565e-02, -4.0571e-02, 5.7820e-02,\n",
403 | " -4.8267e-02, 2.4278e-01, -7.6660e-03, 2.9698e-02, 4.8585e-02,\n",
404 | " -1.5352e-01, 1.4553e-02, 1.5029e-01, 6.8895e-03, 1.2136e-02,\n",
405 | " 4.9711e-02, -1.0585e-01, -3.0702e-02, 9.3002e-02, -8.1609e-02,\n",
406 | " 2.3991e-01, 3.9812e-03, -1.9190e-01, -1.3853e-01, -1.6096e-01,\n",
407 | " -2.1159e-01, 2.1663e-01, -8.0867e-02, -1.8770e-02, -1.5257e-01,\n",
408 | " 1.9546e-01, -6.9405e-02, 1.4577e-01, 3.3809e-01, 6.0678e-02,\n",
409 | " -2.6670e-01, -2.0814e-01, -5.2758e-03, 5.5603e-02, 4.4693e-02,\n",
410 | " 2.0097e-01, 2.7027e-01, 1.2986e-01, 2.3777e-01, 7.5795e-02,\n",
411 | " -7.5986e-02, -3.1976e-01, 5.8500e-02, -9.6743e-02, 5.7583e-04,\n",
412 | " -1.8898e-01, -1.7358e-01, 5.7260e+00, 4.8272e-01, -8.1259e-02,\n",
413 | " 3.2061e-03, -3.5555e-01, 1.5947e-01, 3.3509e-01, 1.7805e-01,\n",
414 | " 9.1990e-02, 3.5931e-02, -7.3387e-02, -6.7740e-02, -1.9722e-01,\n",
415 | " 1.6325e-01, -5.5183e-01, 1.3599e-01, -1.9017e-01, -2.1158e+00,\n",
416 | " 1.7186e-01, 2.0626e-01, 3.9789e-02, -1.1614e-01, 1.3275e-02,\n",
417 | " -2.0151e-01, -2.5364e-02, 3.5192e-01, 1.1858e-01, -4.1850e-02,\n",
418 | " -2.7810e-01, 2.4617e-02, -3.5590e-01, -2.9734e-01, 5.7519e-02,\n",
419 | " -1.2361e-01, 7.0654e-02, 1.9895e-01, -1.0288e-01, 1.0614e-01,\n",
420 | " 2.2623e-01, -1.1561e-01, -4.4942e-02, 1.0816e-01, 1.0192e-01,\n",
421 | " -2.6893e-01, -1.7173e-01, -2.1719e-01, 1.1773e-01, 2.5459e-01,\n",
422 | " 3.0536e-01, 1.7029e-01, 2.4242e-02, -1.4146e-01, 5.0061e-01,\n",
423 | " 7.2468e-02, 3.1891e-01, 3.2688e-01, -1.5695e-01, 1.7149e-01,\n",
424 | " -2.0500e-01, 4.5032e-02, -1.7520e-01, 2.2878e-01, -1.6988e-01,\n",
425 | " 1.5373e-01, -5.7668e-02, -8.8306e-03, 3.1439e-01, -2.9895e-02,\n",
426 | " -7.8530e-02, -2.2839e-01, -5.0848e-02, -6.3631e-01, -8.8465e-02,\n",
427 | " -1.5557e-02, -2.0264e-01, -2.6219e-01, -3.1466e-01, 2.0764e-02,\n",
428 | " -3.4295e-01, 1.9736e-01, -1.6578e-01, -1.0560e-01, -2.2583e-01,\n",
429 | " 2.5710e-01, 2.4691e-01, -4.3547e-01, 1.1362e-01, 4.6921e-01,\n",
430 | " -2.3151e-01, 1.5638e-01, 2.0220e-01, -2.8883e-01, -1.3096e-01,\n",
431 | " -2.4510e-02, 7.7040e-02, -7.8399e-04, -4.7700e-01, 4.2299e-02,\n",
432 | " 1.5806e-01, 1.6993e-01, -8.3540e-02, -9.5018e-02, -1.5459e-02,\n",
433 | " 4.4254e-02, 1.1774e-01, -3.8907e-02, -1.5936e-01, -2.1897e-02,\n",
434 | " -3.0448e-01, 2.6064e-01, 1.6372e-01, -2.3275e-01, -1.8462e-01,\n",
435 | " 9.4053e-02, -1.6129e-01, -1.5300e-01, 1.7068e-01, 4.8920e-01,\n",
436 | " 2.1387e-02, -2.2186e-01, 7.1614e-02, -1.5353e-02, -3.8598e-01,\n",
437 | " -1.4085e-01, -1.0007e-01, -7.3114e-02, -8.1861e-02, -2.0652e-02,\n",
438 | " -1.9611e-01, -1.1353e-02, -1.4559e-01, 1.9196e-01, 1.0416e-01,\n",
439 | " 1.1943e-01, -2.0479e-01, -2.3482e-01, 1.6960e-01, 2.5849e-01,\n",
440 | " -8.1506e-01, 3.2976e-01, 2.7162e-01, 1.1857e-01, -1.2812e-01,\n",
441 | " -1.3794e-01, 1.0144e-01, 1.3436e-01, -8.4321e-02, -2.6041e-01,\n",
442 | " -1.9379e-01, 2.6946e-01, -2.9697e-02, -2.7816e-02, 4.6124e-01,\n",
443 | " -1.1000e-01, 2.2152e-02, -7.0317e-02, -5.6700e-01, -3.9634e-01,\n",
444 | " -5.3712e-01, -2.5599e-02, -2.3336e-01, -4.9603e-01, -1.3422e-01,\n",
445 | " 2.4102e-01, 6.0139e-02, 1.9689e-02, -2.9797e-01, 7.1472e-02,\n",
446 | " 5.7367e-02, -4.8900e-01, 3.0909e-03, -2.3196e-01, -4.4083e-01,\n",
447 | " -1.6809e-02, 4.6111e-01, -5.8852e-02, 1.3696e-01, 9.9999e-02,\n",
448 | " 1.7090e-01, 1.2402e-01, 3.0726e-01, 8.3289e-01, 5.7107e-03,\n",
449 | " -5.5376e-02, -3.5042e-02, -2.4841e-01, -1.3627e-01, -2.5289e-01,\n",
450 | " -1.9113e-01, 1.7571e-01, -2.1889e-01, 5.0180e-01, 2.4607e-01,\n",
451 | " -3.9292e-01, -3.9180e-01, 2.2482e-01, 2.2826e-01, -2.6152e-02,\n",
452 | " -3.1935e-03, -9.9769e-02], device='cuda:0')}\n"
453 | ]
454 | },
455 | {
456 | "data": {
457 | "application/vnd.jupyter.widget-view+json": {
458 | "model_id": "d966829260ec4907a130766b61833261",
459 | "version_major": 2,
460 | "version_minor": 0
461 | },
462 | "text/plain": [
463 | "Downloading (…)okenizer_config.json: 0%| | 0.00/371 [00:00, ?B/s]"
464 | ]
465 | },
466 | "metadata": {},
467 | "output_type": "display_data"
468 | },
469 | {
470 | "data": {
471 | "application/vnd.jupyter.widget-view+json": {
472 | "model_id": "2a89bf62a7304bdeb19b75a5a76a74fe",
473 | "version_major": 2,
474 | "version_minor": 0
475 | },
476 | "text/plain": [
477 | "Downloading (…)lve/main/config.json: 0%| | 0.00/572 [00:00, ?B/s]"
478 | ]
479 | },
480 | "metadata": {},
481 | "output_type": "display_data"
482 | },
483 | {
484 | "data": {
485 | "application/vnd.jupyter.widget-view+json": {
486 | "model_id": "5b38dbfeb07740e888bbb86ba760fe91",
487 | "version_major": 2,
488 | "version_minor": 0
489 | },
490 | "text/plain": [
491 | "Downloading (…)solve/main/vocab.txt: 0%| | 0.00/996k [00:00, ?B/s]"
492 | ]
493 | },
494 | "metadata": {},
495 | "output_type": "display_data"
496 | },
497 | {
498 | "data": {
499 | "application/vnd.jupyter.widget-view+json": {
500 | "model_id": "2b3f54f0f109410580f47a34e06b15c1",
501 | "version_major": 2,
502 | "version_minor": 0
503 | },
504 | "text/plain": [
505 | "Downloading (…)/main/tokenizer.json: 0%| | 0.00/1.96M [00:00, ?B/s]"
506 | ]
507 | },
508 | "metadata": {},
509 | "output_type": "display_data"
510 | },
511 | {
512 | "data": {
513 | "application/vnd.jupyter.widget-view+json": {
514 | "model_id": "0f0768a02ffa440a87c1b341f1d1d183",
515 | "version_major": 2,
516 | "version_minor": 0
517 | },
518 | "text/plain": [
519 | "Downloading (…)cial_tokens_map.json: 0%| | 0.00/112 [00:00, ?B/s]"
520 | ]
521 | },
522 | "metadata": {},
523 | "output_type": "display_data"
524 | }
525 | ],
526 | "source": [
527 | "for model_name in models:\n",
528 | " model = SentenceTransformer(model_name)\n",
529 | " resp = model.encode(text, output_value=None)\n",
530 | " \n",
531 | "\n",
532 | " print (model_name, resp)\n",
533 | " \n",
534 | " try:\n",
535 | " tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
536 | " resp = tokenizer.tokenize(text)\n",
537 | " except Exception as e:\n",
538 | " print (f\"Model name:{model_name} ; tokenizer doesn't exist; {e}\")\n"
539 | ]
540 | },
541 | {
542 | "cell_type": "code",
543 | "execution_count": null,
544 | "id": "c7d3ca6f-8a0e-45ac-a4a7-49d9d24bbefd",
545 | "metadata": {},
546 | "outputs": [],
547 | "source": []
548 | },
549 | {
550 | "cell_type": "code",
551 | "execution_count": null,
552 | "id": "386b27e7-aad9-4d50-8324-46d3673f7bdc",
553 | "metadata": {},
554 | "outputs": [],
555 | "source": []
556 | }
557 | ],
558 | "metadata": {
559 | "environment": {
560 | "kernel": "python3",
561 | "name": "pytorch-gpu.1-13.m107",
562 | "type": "gcloud",
563 | "uri": "gcr.io/deeplearning-platform-release/pytorch-gpu.1-13:m107"
564 | },
565 | "kernelspec": {
566 | "display_name": "Python 3",
567 | "language": "python",
568 | "name": "python3"
569 | },
570 | "language_info": {
571 | "codemirror_mode": {
572 | "name": "ipython",
573 | "version": 3
574 | },
575 | "file_extension": ".py",
576 | "mimetype": "text/x-python",
577 | "name": "python",
578 | "nbconvert_exporter": "python",
579 | "pygments_lexer": "ipython3",
580 | "version": "3.7.12"
581 | }
582 | },
583 | "nbformat": 4,
584 | "nbformat_minor": 5
585 | }
586 |
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # Search Engine Workshop
2 |
3 |
4 | ## About
5 |
6 | Hands-on workshop for building a semantic search engine.
7 |
8 |
9 |
10 |
11 | ## Setup
12 |
13 | If you came to this repo during a workshop, visit this custom [JupyterHub](http://hub.np.training), which has all the dependencies already set up.
14 |
15 | The repo is located at [npatta01/search-engine-workshop](https://github.com/npatta01/search-engine-workshop)
16 |
17 | To use this repo outside a workshop, please use Binder
18 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/npatta01/search-engine-workshop/main)
19 |
20 | ## Content (Notebooks)
21 |
22 |
23 | **Data Fetching**
24 |
25 | [setup notebook](notebooks/00_a_setup_dataset.ipynb)
26 | [stats notebook](notebooks/00_b_setup_stats.ipynb)
27 | [sample image notebook](notebooks/00_c_sample_images.ipynb)
28 |
29 |
30 | Notebooks to download the Unsplash dataset and save it in the Hugging Face dataset format.
31 |
32 |
33 | **Non Deep Learning Retrieval**
34 |
35 | BM25 retrieval with elastic search: [notebook](notebooks/01_bm25_elastic.ipynb)
36 |
37 |
38 | **Deep Learning Retrieval (text)**
39 |
40 |
41 | Text Deep Learning retrieval: [Link](notebooks/02_dense_retriever.ipynb)
42 |
43 |
44 | **Deep Learning Retrieval (image)**
45 |
46 |
47 | Clip Retrieval: [Link](notebooks/03_clip_embed.ipynb)
48 |
49 | **ANN**
50 |
51 | Shows how to speed up Deep Learning retrieval by exploring different ANN indexes
52 | [Link](notebooks/04_ann.ipynb)
53 |
54 |
55 |
56 |
57 | ## Slides
58 |
59 | [PyData Seattle 2023](assets/slides_pydataseattle2023.pdf)
60 |
61 | [PyData NYC 2022](assets/slides_pydatanyc2022.pdf)
62 |
63 |
64 | [ODSC 2022](assets/slides_odsc2022.pdf)
65 |
66 |
67 | ## Contact
68 |
69 | For help or feedback, please reach out to :
70 |
71 | - [Nidhin Pattaniyil](https://www.linkedin.com/in/nidhinpattaniyil/)
72 | - [Ravi Yadav](https://www.linkedin.com/in/ravi-kumar-yadav-535b268/)
73 | - [Mustafa Zengin](https://www.linkedin.com/in/mustafazengin/)
74 |
75 |
76 |
77 |
78 |
79 | ## Acknowledgments
80 |
81 | This workshop uses the Unsplash Lite Dataset 1.2.0 ([link](https://unsplash.com/data)).
82 |
83 | The hands-on portion of the workshop was made possible by the [JupyterHub Helm Chart](https://github.com/jupyterhub/helm-chart).
84 |
85 | ## Changelog
86 |
87 | **v1.1**
88 | - setup for PyDataNYC
89 | - replaced stackoverflow data with unsplash data
90 |
91 | **v1.0**
92 | - setup for ODSC
93 | - used stackoverflow data
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | elasticsearch==8.7.*
2 | pandarallel==1.6.*
3 | jupyterlab-execute-time==2.3.*
4 | ipywidgets==8.0.*
5 | datasets==2.6.*
6 | gcsfs==2023.1.*
7 | rank_bm25==0.2.*
8 | faiss_cpu==1.7.*
9 | sentence-transformers==2.2.*
10 | transformers==4.28.*
11 | papermill==2.4.*
12 | cloudpickle==2.2.*
13 | rich==13.3.*
14 | ipyplot==1.1.*
15 |
16 | # pyarrow<11.0.*
17 | # google-cloud-bigquery-storage
18 | # weaviate-client==3.8.0
19 | # datasets==2.6.*
20 | # milvus==2.1.*
21 | # rich==12.6.*
22 | # papermill==2.4.*
23 |
--------------------------------------------------------------------------------
/workshop_infra/Dockerfile:
--------------------------------------------------------------------------------
1 |
2 | #FROM jupyter/scipy-notebook:python-3.10.6
3 | FROM jupyter/scipy-notebook:python-3.7.12
4 |
5 |
6 |
7 | USER root
8 |
9 |
10 | RUN apt-get update && apt-get --yes install apt-utils && \
11 | apt-get --yes --no-install-recommends install htop tmux graphviz curl build-essential libsasl2-dev gfortran && \
12 | apt-get clean;
13 |
14 |
15 | # set the user back to original setting
16 | USER $NB_UID
17 |
18 |
19 |
20 | # Copy the dependency manifests; environment.yaml is applied to the base conda env below
21 | COPY --chown=${NB_UID}:${NB_GID} environment.yaml /tmp/
22 | COPY --chown=${NB_UID}:${NB_GID} requirements.txt /tmp/
23 |
24 | RUN mamba env update -n base -f /tmp/environment.yaml && \
25 | fix-permissions "${CONDA_DIR}" && \
26 | fix-permissions "/home/${NB_USER}"
27 |
28 | COPY --chown=${NB_UID}:${NB_GID} workshop_infra/scripts /tmp/scripts/
29 |
30 | USER root
31 |
32 | RUN bash /tmp/scripts/build_setup_root.sh
33 | USER $NB_UID
34 |
35 |
36 | COPY --chown=${NB_UID}:${NB_GID} notebooks/workshop_setup.ipynb /tmp/workshop/notebooks/
37 |
38 | RUN bash /tmp/scripts/build_setup_user.sh
39 |
40 | COPY --chown=${NB_UID}:${NB_GID} . /tmp/workshop/
41 |
42 |
43 | ENV PATH="/opt/google-cloud-sdk/bin:${PATH}"
44 |
45 |
46 | #COPY --chown=${NB_UID}:${NB_GID} docker-setup.sh /tmp/
47 |
48 | #COPY --chown=${NB_UID}:${NB_GID} setup.ipynb /tmp/
49 |
50 | # RUN papermill /tmp/setup.ipynb /tmp/setup__out.ipynb -k python3 --log-output --log-level INFO --progress-bar && \
51 | # fix-permissions "${CONDA_DIR}" && \
52 | # fix-permissions "/home/${NB_USER}"
--------------------------------------------------------------------------------
/workshop_infra/cert/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/npatta01/search-engine-workshop/d8d4d1e6234f29c3a158b6343b06701728be92ab/workshop_infra/cert/.gitkeep
--------------------------------------------------------------------------------
/workshop_infra/config.enc.yaml:
--------------------------------------------------------------------------------
1 | # https://zero-to-jupyterhub.readthedocs.io/en/latest/administrator/optimization.html#optimization
2 | scheduling:
3 | userScheduler:
4 | enabled: true
5 | podPriority:
6 | enabled: true
7 | userPlaceholder:
8 | enabled: true
9 | # Specify five dummy user pods will be used as placeholders
10 | replicas: 5
11 | userPods:
12 | nodeAffinity:
13 | matchNodePurpose: require
14 | cull:
15 | enabled: true
16 | timeout: 3600
17 | every: 300
18 | singleuser:
19 | cpu:
20 | limit: 4
21 | guarantee: 4
22 | memory:
23 | limit: 8G
24 | guarantee: 8G
25 | image:
26 | # You should replace the "latest" tag with a fixed version from:
27 | # https://hub.docker.com/r/jupyter/datascience-notebook/tags/
28 | # Inspect the Dockerfile at:
29 | # https://github.com/jupyter/docker-stacks/tree/HEAD/datascience-notebook/Dockerfile
30 | name: gcr.io/np-public-training/semantic-search-workshop
31 | tag: v1.0
32 | #defaultUrl: /lab
33 | # extraEnv:
34 | # GOOGLE_APPLICATION_CREDENTIALS: /etc/secrets/keyfile.json
35 | storage:
36 | # extraVolumes:
37 | # - name: gcsfs-creds
38 | # secret:
39 | # secretName: gcsfs-creds
40 | # items:
41 | # - key: keyfile.json
42 | # path: keyfile.json
43 | # extraVolumeMounts:
44 | # - name: gcsfs-creds
45 | # mountPath: /etc/secrets
46 | # readOnly: true
47 | type: none
48 | lifecycleHooks:
49 | postStart:
50 | exec:
51 | command:
52 | - sh
53 | - -c
54 | - bash /tmp/workshop/workshop_infra/scripts/container_startup.sh
55 | extraContainers:
56 | - name: elastic-search
57 | image: elasticsearch:8.7.0
58 | env:
59 | - name: discovery.type
60 | value: single-node
61 | - name: xpack.security.enabled
62 | value: "false"
63 | - name: ES_JAVA_OPTS
64 | value: -Xms1g -Xmx1g
65 | # - name: milvus
66 | # image: gcr.io/np-public-training/custom-milvus:v2.1.4-1
67 | # - name: weaviate
68 | # image: semitechnologies/weaviate:1.14.0
69 | # env:
70 | # - name: QUERY_DEFAULTS_LIMIT
71 | # value: "25"
72 | # - name: AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED
73 | # value: "true"
74 | # - name: PERSISTENCE_DATA_PATH
75 | # value: /var/lib/weaviate
76 | # - name: DEFAULT_VECTORIZER_MODULE
77 | # value: none
78 | # - name: ENABLE_MODULES
79 | # value: ""
80 | # - name: CLUSTER_HOSTNAME
81 | # value: node1
82 | # proxy:
83 | # https:
84 | # enabled: true
85 | # hosts:
86 | # - hub.np.training
87 | # letsencrypt:
88 | # contactEmail: npatta01@gmail.com
89 | # service:
90 | # loadBalancerIP: "34.145.156.81"
91 | # proxy:
92 | # service:
93 | # loadBalancerIP: "34.145.156.81"
94 | proxy:
95 | https:
96 | enabled: true
97 | hosts:
98 | - hub.np.training
99 | type: secret
100 | secret:
101 | name: workshop-tls
102 | service:
103 | loadBalancerIP: 34.102.71.215
104 | hub:
105 | config:
106 | Authenticator:
107 | admin_users:
108 | - npatta01
109 | - vishalkumar95
110 | - mzengin
111 | - rkyadav-ncsu
112 | GitHubOAuthenticator:
113 | client_id: ENC[AES256_GCM,data:Af0qVw8uUkPGgukNRUihE/v6Yxw=,iv:glsdWx5z0/cJ1PKZUQp+7LvDpi2pn4RGkKIza0sP7rA=,tag:bWW55KyWY+Sh4ECtdsUGlw==,type:str]
114 | client_secret: ENC[AES256_GCM,data:/JyxvL2uXAd1/I45blHoOq8cfco58SnqtLdbDfnPNRakJuch4ShW1g==,iv:+xoS5twfpnX+2xduZMQNTcffqcuaLWIIsss/8Whvn00=,tag:RG4cuZjt8TookMgnXWHL1A==,type:str]
115 | oauth_callback_url: https://hub.np.training/hub/oauth_callback
116 | JupyterHub:
117 | authenticator_class: github
118 | sops:
119 | kms: []
120 | gcp_kms:
121 | - resource_id: projects/np-public-training/locations/global/keyRings/sops/cryptoKeys/sops-key
122 | created_at: "2022-10-13T23:23:47Z"
123 | enc: CiQAtA68IX63yVjyNNzcuN6oxKMDvZI/hnlne6POMs/AToxGvoUSSQDOyIoWf1EgyIyvrp486rhLw/G2J+YuUkobdqfonbEr5Tss0E60rJY5vCtgqzes+/7aunlxPDTU5zngKhkH/vP7dz/z69G3ZmQ=
124 | azure_kv: []
125 | hc_vault: []
126 | age: []
127 | lastmodified: "2023-04-26T01:39:50Z"
128 | mac: ENC[AES256_GCM,data:cDB37QZ0UxitWQTSkpQCWJLc1lLfiiqWTbjujMAJi6FB7MryXEaQU7k/8vueuV2+/3k1Zhp++H68SB3zZWJ9XBg4UXsUkLTgqlCQ9Fv3M2rrwVL/h1LQ7wqHVlnvEySy+qODb0PoXS3QfUShOvrPNPf/ZtFHWIRYrfJEprkTys0=,iv:xMc7R4v+PNMFh9DXYxKVnPr9v81MbPqNDq5AVQcwYW4=,tag:Jp8+haROua5fHqWovZNb+Q==,type:str]
129 | pgp: []
130 | encrypted_regex: ^(client_id|client_secret)$
131 | version: 3.7.3
132 |
--------------------------------------------------------------------------------
/workshop_infra/config_public.yaml:
--------------------------------------------------------------------------------
1 | # https://zero-to-jupyterhub.readthedocs.io/en/latest/administrator/optimization.html#optimization
2 | scheduling:
3 | userScheduler:
4 | enabled: true
5 | podPriority:
6 | enabled: true
7 | userPlaceholder:
8 | enabled: true
9 | # Specify how many dummy user pods will be used as placeholders
10 | replicas: 1
11 | userPods:
12 | nodeAffinity:
13 | matchNodePurpose: require
14 | cull:
15 | enabled: true
16 | timeout: 3600
17 | every: 300
18 | singleuser:
19 | cpu:
20 | limit: 4
21 | guarantee: 4
22 | memory:
23 | limit: 8G
24 | guarantee: 8G
25 | image:
26 | # You should replace the "latest" tag with a fixed version from:
27 | # https://hub.docker.com/r/jupyter/datascience-notebook/tags/
28 | # Inspect the Dockerfile at:
29 | # https://github.com/jupyter/docker-stacks/tree/HEAD/datascience-notebook/Dockerfile
30 | name: gcr.io/np-public-training/semantic-search-workshop
31 | tag: v1.0
32 | #defaultUrl: /lab
33 | # extraEnv:
34 | # GOOGLE_APPLICATION_CREDENTIALS: /etc/secrets/keyfile.json
35 | storage:
36 | type: none
37 | # extraVolumes:
38 | # - name: gcsfs-creds
39 | # secret:
40 | # secretName: gcsfs-creds
41 | # items:
42 | # - key: keyfile.json
43 | # path: keyfile.json
44 | # extraVolumeMounts:
45 | # - name: gcsfs-creds
46 | # mountPath: /etc/secrets
47 | # readOnly: true
48 | lifecycleHooks:
49 | postStart:
50 | exec:
51 | command:
52 | - sh
53 | - -c
54 | - "bash /tmp/workshop/workshop_infra/scripts/container_startup.sh"
55 | extraContainers:
56 | - name: elastic-search
57 | image: elasticsearch:8.7.0
58 | env:
59 | - name: discovery.type
60 | value: single-node
61 | - name: xpack.security.enabled
62 | value: "false"
63 | - name: ES_JAVA_OPTS
64 | value: -Xms1g -Xmx1g
65 | # - name: milvus
66 | # image: gcr.io/np-public-training/custom-milvus:v2.1.4-1
67 | # - name: weaviate
68 | # image: semitechnologies/weaviate:1.14.0
69 | # env:
70 | # - name: QUERY_DEFAULTS_LIMIT
71 | # value: "25"
72 | # - name: AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED
73 | # value: "true"
74 | # - name: PERSISTENCE_DATA_PATH
75 | # value: /var/lib/weaviate
76 | # - name: DEFAULT_VECTORIZER_MODULE
77 | # value: none
78 | # - name: ENABLE_MODULES
79 | # value: ""
80 | # - name: CLUSTER_HOSTNAME
81 | # value: node1
82 | # proxy:
83 | # https:
84 | # enabled: true
85 | # hosts:
86 | # - hub.np.training
87 | # letsencrypt:
88 | # contactEmail: npatta01@gmail.com
89 | # service:
90 | # loadBalancerIP: "34.145.156.81"
91 | # proxy:
92 | # service:
93 | # loadBalancerIP: "34.145.156.81"
94 |
95 | hub:
96 | config:
97 | Authenticator:
98 | admin_users:
99 | - npatta01
100 | - vishalkumar95
101 | - mzengin
102 | - rkyadav-ncsu
103 |
104 |
--------------------------------------------------------------------------------
/workshop_infra/scripts/build_setup_root.sh:
--------------------------------------------------------------------------------
1 | # apt-get install --reinstall systemd --yes
2 | # wget https://github.com/milvus-io/milvus/releases/download/v2.1.4/milvus_2.1.4-1_amd64.deb
3 | # apt-get update --yes
4 | # dpkg -i milvus_2.1.4-1_amd64.deb
5 | # apt-get -f install --yes
6 | # apt-get install --yes --no-install-recommends build-essential libsasl2-dev gfortran milvus
7 | # Root-level image build step: install system build tools and pigz, then the Google Cloud SDK.
8 | #pip install milvus==2.1.*
9 | apt-get update --yes
10 | apt-get install --yes --no-install-recommends build-essential libsasl2-dev gfortran pigz
11 |
12 | #python -c "import milvus; milvus.before()"
13 |
14 | #bash /var/bin/e-milvus/lib/install_deps.sh
15 |
16 | # `export` must be lowercase: shell builtins are case-sensitive, and `EXPORT` fails
17 | # with "command not found", leaving the variable unset for the installer below.
18 | export CLOUDSDK_CORE_DISABLE_PROMPTS=1
19 | # Download the Google Cloud SDK installer and run it non-interactively, installing under /opt.
20 | curl https://sdk.cloud.google.com > install.sh
21 | bash install.sh --disable-prompts --install-dir=/opt
22 |
--------------------------------------------------------------------------------
/workshop_infra/scripts/build_setup_user.sh:
--------------------------------------------------------------------------------
1 | # Execute the workshop setup notebook with papermill; the executed copy is written to /tmp.
2 | papermill \
3 |     /tmp/workshop/notebooks/workshop_setup.ipynb \
4 |     /tmp/workshop_setup__out.ipynb \
5 |     -k python3 \
6 |     --log-output \
7 |     --log-level INFO \
8 |     --progress-bar
--------------------------------------------------------------------------------
/workshop_infra/scripts/container_startup.sh:
--------------------------------------------------------------------------------
1 | # Startup script: clone the workshop repository and download its data archive,
2 | # preferring the copy on GCS and falling back to the GitHub release asset.
3 | #gcloud auth activate-service-account --key-file=$GOOGLE_APPLICATION_CREDENTIALS || echo "skipped gcloud authentication"
4 |
5 |
6 | #cp -r /tmp/workshop /home/jovyan
7 |
8 |
9 | # Log the current working directory ($PWD; lowercase $pwd is unset and prints nothing).
10 | echo "$PWD"
11 |
12 | #GIT_BRANCH="pydata_seattle"
13 | GIT_BRANCH="main"
14 |
15 | echo "cloning repo"
16 | git clone --depth 1 https://github.com/npatta01/search-engine-workshop.git -b "$GIT_BRANCH"
17 |
18 |
19 | cd search-engine-workshop
20 |
21 |
22 | url="https://storage.googleapis.com/np-public-training-tmp/search-workshop/data.zip"
23 |
24 | # --spider only checks that the URL is reachable; nothing is downloaded by this test.
25 | if wget --spider "$url" 2>/dev/null; then
26 |     echo "getting data from gcs"
27 |     wget "$url"
28 |     unzip -q data.zip
29 | else
30 |     echo "getting from github"
31 |     wget https://github.com/npatta01/search-engine-workshop/releases/download/v1.0/data_processed.zip
32 |     unzip -q data_processed.zip
33 |
34 | fi
35 |
36 |
--------------------------------------------------------------------------------
/workshop_infra/setup.md:
--------------------------------------------------------------------------------
1 | # Workshop Setup
2 |
3 | The following commands and steps were used to create a working JupyterHub installation for the workshop.
4 |
5 | The instructions assume that you are planning to use GCP and have gcloud set up.
6 |
7 |
8 | Most of the instructions are taken from [zero-to-jupyterhub](https://zero-to-jupyterhub.readthedocs.io/en/latest/index.html) project.
9 |
10 |
11 | ## Step 1: common variables
12 |
13 | ```bash
14 | REGION="us-west2"
15 | ZONE="$REGION-a"
16 | NODE_TYPE_USER="e2-highmem-16"
17 | NODE_TYPE_DEFAULT="e2-standard-2"
18 |
19 | CLUSTER_NAME=workshop
20 | NODES_MIN=0
21 | NODES_MAX=400
22 |
23 | EMAIL="npatta01@gmail.com"
24 | GCP_PROJECT="np-public-training"
25 |
26 | HELM_NAMESPACE=$CLUSTER_NAME
27 |
28 | HELM_CHART_VERSION="2.0.0"
29 | ```
30 |
31 | ## Step 2: create static ip address
32 |
33 | ```bash
34 | gcloud compute addresses create $CLUSTER_NAME \
35 | --region $REGION \
36 | --project $GCP_PROJECT
37 |
38 | gcloud compute addresses describe $CLUSTER_NAME \
39 | --region $REGION \
40 | --project $GCP_PROJECT
41 |
42 | ```
43 |
44 | Create an `A` record with your DNS provider.
45 |
46 | I am using `hub` for my domain `np.training`
47 |
48 |
49 |
50 |
51 | ## Step 3: Create cluster
52 |
53 |
54 | ```bash
55 |
56 | gcloud container clusters create \
57 | --machine-type $NODE_TYPE_DEFAULT \
58 | --num-nodes 1 \
59 | --region $REGION \
60 | --cluster-version latest \
61 | $CLUSTER_NAME \
62 | --project $GCP_PROJECT
63 |
64 | ```
65 |
66 | Get kubectl credentials
67 |
68 | ```bash
69 | gcloud container clusters get-credentials \
70 | $CLUSTER_NAME \
71 | --region $REGION \
72 | --project $GCP_PROJECT
73 | ```
74 |
75 | Create admin access for user
76 |
77 | ```bash
78 | kubectl create clusterrolebinding cluster-admin-binding \
79 | --clusterrole=cluster-admin \
80 | --user $EMAIL
81 | ```
82 |
83 | Create separate node pool for jupyter notebook
84 |
85 | ```bash
86 | gcloud beta container node-pools create user-pool \
87 | --machine-type $NODE_TYPE_USER \
88 | --num-nodes 0 \
89 | --enable-autoscaling \
90 | --min-nodes $NODES_MIN \
91 | --max-nodes $NODES_MAX \
92 | --node-labels hub.jupyter.org/node-purpose=user \
93 | --node-taints hub.jupyter.org_dedicated=user:NoSchedule \
94 | --scopes "https://www.googleapis.com/auth/cloud-platform" \
95 | --region $REGION \
96 | --cluster $CLUSTER_NAME \
97 | --project $GCP_PROJECT
98 | ```
99 |
100 |
101 | ## Step 3b: Cert (optional)
102 |
103 | By default the Helm chart we will use supports LetsEncrypt. However, I had trouble getting it to work.
104 | So, I followed the steps below to create my own cert.
105 |
106 | create certificate signing request for "*.np.training"
107 |
108 | ```bash
109 | openssl req -nodes -newkey rsa:2048 \
110 | -keyout cert/server.key \
111 | -out cert/server.csr \
112 | -subj "/C=US/ST=New York/L=New York/O=NP Training./OU=IT/CN=*.np.training"
113 | ```
114 |
115 | I bought a wildcard cert from Namecheap
116 |
117 |
118 | Download the cert and create a Kubernetes TLS secret from it
119 | ```bash
120 |
121 | kubectl create namespace $HELM_NAMESPACE
122 |
123 |
124 |
125 |
126 | gsutil cp "gs://np-training-private/certs/_star.np.training/*" workshop_infra/cert
127 |
128 |
129 | kubectl create namespace $HELM_NAMESPACE
130 | cd workshop_infra/cert
131 | kubectl create secret tls $HELM_NAMESPACE-tls --key="tls.key" --cert="tls.crt" --namespace $HELM_NAMESPACE
132 | cd ../../
133 |
134 | ```
135 |
136 |
137 | download storage key
138 |
139 | ```bash
140 | gcloud iam service-accounts keys create workshop_infra/keyfile.json \
141 | --iam-account=public-storage-reader-sa@np-public-training.iam.gserviceaccount.com
142 |
143 | gsutil cp gs://np-training-private/service_accounts/keyfile.json workshop_infra/keyfile.json
144 |
145 | kubectl create secret generic gcsfs-creds --from-file=workshop_infra/keyfile.json --namespace $HELM_NAMESPACE
146 |
147 |
148 |
149 | ```
150 |
151 | ## Step 4: Helm setup
152 |
153 | ```bash
154 |
155 | curl https://raw.githubusercontent.com/helm/helm/HEAD/scripts/get-helm-3 | bash
156 |
157 | helm version
158 |
159 | helm repo add jupyterhub https://jupyterhub.github.io/helm-chart/
160 | helm repo update
161 |
162 | ```
163 |
164 |
165 | ## Step 5: Update config file (optional)
166 |
167 |
168 | build docker image
169 |
170 | ```bash
171 | docker build -t gcr.io/$GCP_PROJECT/semantic-search-workshop:v1.0 .
172 | docker push gcr.io/$GCP_PROJECT/semantic-search-workshop:v1.0
173 |
174 | ```
175 |
176 | build milvus
177 |
178 | ```bash
179 | cd docker_milvus
180 |
181 | docker build -t gcr.io/$GCP_PROJECT/custom-milvus:v2.1.4-1 .
182 | docker push gcr.io/$GCP_PROJECT/custom-milvus:v2.1.4-1
183 | echo "gcr.io/$GCP_PROJECT/custom-milvus:v2.1.4-1 "
184 | cd ..
185 | ```
186 |
187 | encrypt setup
188 |
189 | ```bash
190 | gcloud kms keyrings create sops --location global --project $GCP_PROJECT
191 | gcloud kms keys create sops-key --location global --keyring sops --purpose encryption --project $GCP_PROJECT
192 | gcloud kms keys list --location global --keyring sops --project $GCP_PROJECT
193 | ```
194 |
195 |
196 | ```bash
197 | sops --encrypt --gcp-kms projects/$GCP_PROJECT/locations/global/keyRings/sops/cryptoKeys/sops-key \
198 | --encrypted-regex '^(client_id|client_secret)$' \
199 | workshop_infra/config.yaml > workshop_infra/config.enc.yaml
200 | ```
201 |
202 | ```bash
203 | sops --decrypt workshop_infra/config.enc.yaml > workshop_infra/config.yaml
204 | ```
205 |
206 |
207 |
208 |
209 |
210 | replace values in [config.yaml](workshop_infra/config.yaml)
211 |
212 | - GitHubOAuthenticator
213 | - singleuser.image.name
214 | - scheduling.userPlaceholder.replicas
215 | - proxy.https.hosts
216 | - proxy.service.loadBalancerIP
217 |
218 |
219 |
220 | ## Step 6: Helm Install with authentication
221 |
222 | Set up the hub with authentication via GitHub OAuth.
223 |
224 | ```bash
225 | helm upgrade --cleanup-on-fail \
226 | --install $HELM_NAMESPACE jupyterhub/jupyterhub \
227 | --namespace $HELM_NAMESPACE \
228 | --create-namespace \
229 | --version $HELM_CHART_VERSION \
230 | --values workshop_infra/config.yaml
231 |
232 | ```
233 |
234 | ```bash
235 | kubectl --namespace=$HELM_NAMESPACE get pod
236 |
237 | kubectl --namespace=$HELM_NAMESPACE get svc proxy-public -o jsonpath='{.status.loadBalancer.ingress[].ip}'
238 | ```
239 |
240 |
241 | ## Step 6b: Helm Install with no authentication (public)
242 |
243 | ```bash
244 | helm upgrade --cleanup-on-fail \
245 | --install $HELM_NAMESPACE-public jupyterhub/jupyterhub \
246 | --namespace $HELM_NAMESPACE-public \
247 | --create-namespace \
248 | --version $HELM_CHART_VERSION \
249 | --values workshop_infra/config_public.yaml
250 |
251 |
252 | kubectl --namespace=$HELM_NAMESPACE-public get pod
253 |
254 | kubectl --namespace=$HELM_NAMESPACE-public get svc proxy-public -o jsonpath='{.status.loadBalancer.ingress[].ip}'
255 | ```
256 |
257 | Add the external IP printed above to your DNS record.
258 |
259 |
260 | ## Step 7: Cleanup (Helm Delete)
261 |
262 | ```bash
263 |
264 | helm delete $HELM_NAMESPACE --namespace $HELM_NAMESPACE
265 | kubectl delete namespace $HELM_NAMESPACE
266 |
267 | helm delete $HELM_NAMESPACE-public --namespace $HELM_NAMESPACE-public
268 | kubectl delete namespace $HELM_NAMESPACE-public
269 |
270 |
271 | gcloud container clusters delete $CLUSTER_NAME --region $REGION --project $GCP_PROJECT
272 |
273 | ```
--------------------------------------------------------------------------------