├── .dockerignore ├── .gitignore ├── archive ├── docker_milvus │ ├── Dockerfile │ ├── install.sh │ ├── readme.md │ └── supervisord.conf ├── notebooks │ ├── 01_bm25.ipynb │ ├── 02_dense_retriever_milvus.ipynb │ └── 04_ann-elastic.ipynb └── notebooks_stackoverflow │ ├── 00_data_fetch_bq.ipynb │ ├── 00_data_fetch_spark.ipynb │ ├── 01_b_setup.ipynb │ ├── 01_data_cleanup.ipynb │ ├── 01_data_subset.ipynb │ ├── 01_workshop_data_preview.ipynb │ ├── 02_retrieval_dense_milvus.ipynb │ ├── 02_retrieval_sparse.ipynb │ ├── 03_comparision.ipynb │ ├── ann_benchmark_recall.ipynb │ ├── metrics_utils.py │ ├── other__retrieve_rerank_simple_wikipedia.ipynb │ ├── test_setup.ipynb │ └── workshop_setup.ipynb ├── assets ├── all_assets.sw ├── slides_odsc2022.pdf ├── slides_pydatanyc2022.pdf └── slides_pydataseattle2023.pdf ├── docker-compose.yaml ├── docs ├── internal_notes.md └── slide_notes.md ├── environment.yaml ├── notebooks ├── 00_a_setup_dataset.ipynb ├── 00_b_setup_stats.ipynb ├── 00_c_sample_images.ipynb ├── 01_bm25_elastic.ipynb ├── 02_dense_retriever.ipynb ├── 03_clip_embed.ipynb ├── 04_ann.ipynb └── workshop_setup.ipynb ├── readme.md ├── requirements.txt └── workshop_infra ├── Dockerfile ├── cert └── .gitkeep ├── config.enc.yaml ├── config_public.yaml ├── scripts ├── build_setup_root.sh ├── build_setup_user.sh └── container_startup.sh └── setup.md /.dockerignore: -------------------------------------------------------------------------------- 1 | data/ 2 | workshop_infra/ 3 | !workshop_infra/scripts/ 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 
21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | 132 | data/ 133 | 134 | 135 | workshop_infra/cert/* 136 | workshop_infra/config.yaml 137 | workshop_infra/key_file.json 138 | 139 | *.db 140 | tmp/ 141 | .DS_Store 142 | 143 | !/**/.gitkeep 144 | workshop_infra/keyfile.json 145 | *.zip 146 | -------------------------------------------------------------------------------- /archive/docker_milvus/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | 3 | ENV MILVUS_VERSION "2.1.4-1" 4 | 5 | COPY install.sh /tmp/install.sh 6 | 7 | RUN bash /tmp/install.sh 8 | 9 | 10 | # ARG S6_OVERLAY_VERSION=3.1.2.1 11 | 12 | 13 | # ADD https://github.com/just-containers/s6-overlay/releases/download/v${S6_OVERLAY_VERSION}/s6-overlay-noarch.tar.xz /tmp 14 | 15 | # RUN tar -C / -Jxpf /tmp/s6-overlay-noarch.tar.xz 16 | 17 | # ADD https://github.com/just-containers/s6-overlay/releases/download/v${S6_OVERLAY_VERSION}/s6-overlay-x86_64.tar.xz /tmp 18 | 19 | # RUN tar -C / -Jxpf /tmp/s6-overlay-x86_64.tar.xz 20 | 21 | 22 | COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf 23 | 24 | 25 | 26 | CMD ["/usr/bin/supervisord"] 27 | -------------------------------------------------------------------------------- /archive/docker_milvus/install.sh: -------------------------------------------------------------------------------- 1 | apt-get update -y 2 | 3 | apt install 
software-properties-common -y 4 | 5 | add-apt-repository ppa:milvusdb/milvus 6 | 7 | 8 | apt-get update -y 9 | 10 | 11 | apt-get install "milvus=$MILVUS_VERSION" -y 12 | 13 | 14 | 15 | #mkdir -p /etc/services.d/system/ 16 | 17 | #cp /lib/systemd/system/milvus* /etc/services.d/system/ 18 | 19 | #cp /lib/systemd/system/milvus* /etc/services.d/system/ 20 | 21 | #COPY resources/docker/services.d /etc/services.d 22 | 23 | 24 | 25 | apt-get update && apt-get install -y supervisor 26 | mkdir -p /var/log/supervisor -------------------------------------------------------------------------------- /archive/docker_milvus/readme.md: -------------------------------------------------------------------------------- 1 | https://github.com/just-containers/s6-overlay 2 | 3 | 4 | 5 | cat /etc/services.d/system/milvus-etcd.service 6 | ExecStart=/usr/bin/milvus-etcd --data-dir /var/lib/milvus/etcd-data 7 | 8 | 9 | cat /etc/services.d/system/milvus-minio.service 10 | ExecStart=/usr/bin/milvus-minio server /var/lib/milvus/minio-data 11 | 12 | 13 | 14 | cat /etc/services.d/system/milvus.service 15 | 16 | Environment=MILVUSCONF=/etc/milvus/configs/ 17 | ExecStart=/usr/bin/milvus run standalone 18 | 19 | 20 | 21 | https://gdevillele.github.io/engine/admin/using_supervisord/ -------------------------------------------------------------------------------- /archive/docker_milvus/supervisord.conf: -------------------------------------------------------------------------------- 1 | [supervisord] 2 | nodaemon=true 3 | 4 | [program:milvus-minio] 5 | command=/usr/bin/milvus-minio server /var/lib/milvus/minio-data 6 | 7 | [program:milvus-etcd] 8 | command=/usr/bin/milvus-etcd --data-dir /var/lib/milvus/etcd-data 9 | 10 | 11 | [program:milvus] 12 | environment=MILVUSCONF=/etc/milvus/configs/ 13 | command=/usr/bin/milvus run standalone -------------------------------------------------------------------------------- /archive/notebooks_stackoverflow/00_data_fetch_bq.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "id": "2d9c002c-9ba7-48cb-83a5-3d2903056d43", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import modin.pandas as pd\n", 11 | "import re\n", 12 | "import lxml.html\n" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 3, 18 | "id": "05461bbb-02f3-4749-b6ca-dba3a02bf1e8", 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stderr", 23 | "output_type": "stream", 24 | "text": [ 25 | "UserWarning: Ray execution environment not yet initialized. Initializing...\n", 26 | "To remove this warning, run the following python code before doing dataframe operations:\n", 27 | "\n", 28 | " import ray\n", 29 | " ray.init()\n", 30 | "\n", 31 | "UserWarning: `read_gbq` defaulting to pandas implementation.\n", 32 | "To request implementation, send an email to feature_requests@modin.org.\n" 33 | ] 34 | }, 35 | { 36 | "data": { 37 | "text/html": [ 38 | "
\n", 39 | "\n", 52 | "\n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | "
IdType
01Question
12Answer
23Wiki
34TagWikiExcerpt
45TagWiki
56ModeratorNomination
67WikiPlaceholder
78PrivilegeWiki
\n", 103 | "
" 104 | ], 105 | "text/plain": [ 106 | " Id Type\n", 107 | "0 1 Question\n", 108 | "1 2 Answer\n", 109 | "2 3 Wiki\n", 110 | "3 4 TagWikiExcerpt\n", 111 | "4 5 TagWiki\n", 112 | "5 6 ModeratorNomination\n", 113 | "6 7 WikiPlaceholder\n", 114 | "7 8 PrivilegeWiki" 115 | ] 116 | }, 117 | "execution_count": 3, 118 | "metadata": {}, 119 | "output_type": "execute_result" 120 | } 121 | ], 122 | "source": [ 123 | "pd.read_gbq(f\"\"\"\n", 124 | "select *\n", 125 | "FROM`sotorrent-org.2020_12_31.PostType`\n", 126 | "\n", 127 | "\"\"\", use_bqstorage_api=True)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 4, 133 | "id": "df33775c-cdb8-4fe2-8457-e8642c8265c5", 134 | "metadata": {}, 135 | "outputs": [ 136 | { 137 | "name": "stderr", 138 | "output_type": "stream", 139 | "text": [ 140 | "UserWarning: `read_gbq` defaulting to pandas implementation.\n" 141 | ] 142 | } 143 | ], 144 | "source": [ 145 | "df_raw = pd.read_gbq (f\"\"\"\n", 146 | "\n", 147 | "with qn as (\n", 148 | " select Id , AcceptedAnswerId, Title, Body as QuestionBody, Tags , ViewCount, AnswerCount, CommentCount , Score, CreationDate\n", 149 | " FROM `sotorrent-org.2020_12_31.Posts` \n", 150 | " where PostTypeId = 1\n", 151 | "),\n", 152 | "ans as (\n", 153 | " select Id , Body as AnswerBody\n", 154 | " FROM `sotorrent-org.2020_12_31.Posts` \n", 155 | " where PostTypeId = 2\n", 156 | ")\n", 157 | "\n", 158 | "\n", 159 | "SELECT qn.*, ans.AnswerBody\n", 160 | "From qn \n", 161 | "inner join ans \n", 162 | "on qn.AcceptedAnswerId = ans.Id\n", 163 | "\n", 164 | "\"\"\", use_bqstorage_api=True)\n" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 5, 170 | "id": "51c4dd43-b3be-4253-8b1c-76eb854d4668", 171 | "metadata": {}, 172 | "outputs": [ 173 | { 174 | "data": { 175 | "text/html": [ 176 | "
\n", 177 | "\n", 190 | "\n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | "
IdAcceptedAnswerIdTitleQuestionBodyTagsViewCountAnswerCountCommentCountScoreCreationDateAnswerBody
02248646922488014Memory Mapping Large File Haskell<p>I am experimenting with the Haskell mmap pa...<haskell>56611102014-03-18 17:18:08<p>Looks like a typo. If I replace this:</p>\\n...
12090277520902933How to check if auto-rotate screen setting is ...<p>I think each android device has an abitily ...<java><android>1120133122014-01-03 11:37:35<p>Hope this code snippet helps you out:-</p>\\...
23961302339623807Understanding the FFT output<p>I'm currently occupied in a practicum and m...<java><fft>27710-42016-09-21 09:46:43<p>Computing a 512-point fourier transform aft...
327706302771563PDO::fetchAll vs. PDO::fetch in a loop<p>Just a quick question.</p>\\n\\n<p>Is there a...<php><mysql><pdo><fetch>8600671722010-05-05 04:31:40<p>Little benchmark with 200k random records. ...
43172520640180517Unable to Flash eMMC from SD Card BeagleBone B...<p>I am working on BeagleBone Black and Debian...<debian><beagleboneblack>3166483172015-07-30 13:30:39<p>Did you remember to remove the \"#\" at the b...
\n", 280 | "
" 281 | ], 282 | "text/plain": [ 283 | " Id AcceptedAnswerId \\\n", 284 | "0 22486469 22488014 \n", 285 | "1 20902775 20902933 \n", 286 | "2 39613023 39623807 \n", 287 | "3 2770630 2771563 \n", 288 | "4 31725206 40180517 \n", 289 | "\n", 290 | " Title \\\n", 291 | "0 Memory Mapping Large File Haskell \n", 292 | "1 How to check if auto-rotate screen setting is ... \n", 293 | "2 Understanding the FFT output \n", 294 | "3 PDO::fetchAll vs. PDO::fetch in a loop \n", 295 | "4 Unable to Flash eMMC from SD Card BeagleBone B... \n", 296 | "\n", 297 | " QuestionBody \\\n", 298 | "0

I am experimenting with the Haskell mmap pa... \n", 299 | "1

I think each android device has an abitily ... \n", 300 | "2

I'm currently occupied in a practicum and m... \n", 301 | "3

Just a quick question.

\\n\\n

Is there a... \n", 302 | "4

I am working on BeagleBone Black and Debian... \n", 303 | "\n", 304 | " Tags ViewCount AnswerCount CommentCount Score \\\n", 305 | "0 566 1 1 10 \n", 306 | "1 11201 3 3 12 \n", 307 | "2 277 1 0 -4 \n", 308 | "3 86006 7 1 72 \n", 309 | "4 31664 8 3 17 \n", 310 | "\n", 311 | " CreationDate AnswerBody \n", 312 | "0 2014-03-18 17:18:08

Looks like a typo. If I replace this:

\\n... \n", 313 | "1 2014-01-03 11:37:35

Hope this code snippet helps you out:-

\\... \n", 314 | "2 2016-09-21 09:46:43

Computing a 512-point fourier transform aft... \n", 315 | "3 2010-05-05 04:31:40

Little benchmark with 200k random records. ... \n", 316 | "4 2015-07-30 13:30:39

Did you remember to remove the \"#\" at the b... " 317 | ] 318 | }, 319 | "execution_count": 5, 320 | "metadata": {}, 321 | "output_type": "execute_result" 322 | } 323 | ], 324 | "source": [ 325 | "df_raw.head()" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": null, 331 | "id": "8546ce34-5cce-45e1-8b90-e178e98e7415", 332 | "metadata": {}, 333 | "outputs": [], 334 | "source": [ 335 | "df_raw.to_parquet(\"../data/df_raw\",index=False)" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": null, 341 | "id": "4abb6cc3-040d-44b5-a5a5-68161b732b9e", 342 | "metadata": {}, 343 | "outputs": [], 344 | "source": [ 345 | "df_raw = pd.read_parquet(\"../data/df_raw\")" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": null, 351 | "id": "c56f1bbd-6195-414a-adee-1075aede6aca", 352 | "metadata": {}, 353 | "outputs": [], 354 | "source": [ 355 | "len(df_raw)" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "id": "9cd18238-7318-44bb-8302-89232429028e", 362 | "metadata": {}, 363 | "outputs": [], 364 | "source": [ 365 | "def strip_html(s:str):\n", 366 | " try:\n", 367 | " return str(lxml.html.fromstring(s).text_content())\n", 368 | " except:\n", 369 | " return ''\n", 370 | "\n", 371 | "def parse_tags(content:str):\n", 372 | " return re.findall(r'<(.+?)>',content)\n" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": null, 378 | "id": "8eda050e-3ab9-4555-9f7a-fe9cb190d824", 379 | "metadata": {}, 380 | "outputs": [], 381 | "source": [ 382 | "strip_html(f\"\"\"\n", 383 | "\n", 384 | "

I was asked to create a singleton that will...\t\n", 385 | "\"\"\")" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": null, 391 | "id": "a60b2cd6-2f02-447b-99ac-2de8598bb470", 392 | "metadata": {}, 393 | "outputs": [], 394 | "source": [ 395 | "df = df_raw" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": null, 401 | "id": "f8fa3f43-d0c1-42ff-949c-cb7519e11454", 402 | "metadata": {}, 403 | "outputs": [], 404 | "source": [ 405 | "df['QuestionBody'] = df['QuestionBody'].apply(strip_html)" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": null, 411 | "id": "d957c0c3-aba6-451b-816a-67224b8e6578", 412 | "metadata": {}, 413 | "outputs": [], 414 | "source": [ 415 | "df['Tags'] = df['Tags'].apply(parse_tags)" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": null, 421 | "id": "cb4c000b-0cd1-4d21-9de3-56589a04c40d", 422 | "metadata": {}, 423 | "outputs": [], 424 | "source": [ 425 | "df.to_parquet(\"../data/df_processed/\")" 426 | ] 427 | } 428 | ], 429 | "metadata": { 430 | "environment": { 431 | "kernel": "python3", 432 | "name": "pytorch-gpu.1-11.m94", 433 | "type": "gcloud", 434 | "uri": "gcr.io/deeplearning-platform-release/pytorch-gpu.1-11:m94" 435 | }, 436 | "kernelspec": { 437 | "display_name": "Python 3", 438 | "language": "python", 439 | "name": "python3" 440 | }, 441 | "language_info": { 442 | "codemirror_mode": { 443 | "name": "ipython", 444 | "version": 3 445 | }, 446 | "file_extension": ".py", 447 | "mimetype": "text/x-python", 448 | "name": "python", 449 | "nbconvert_exporter": "python", 450 | "pygments_lexer": "ipython3", 451 | "version": "3.7.12" 452 | } 453 | }, 454 | "nbformat": 4, 455 | "nbformat_minor": 5 456 | } 457 | -------------------------------------------------------------------------------- /archive/notebooks_stackoverflow/01_b_setup.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 
| "cell_type": "code", 5 | "execution_count": null, 6 | "id": "4885264f-1d3f-4ad5-a29a-e338cf64e59c", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "id": "2d9c002c-9ba7-48cb-83a5-3d2903056d43", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import modin.pandas as pd\n", 19 | "import re\n", 20 | "import lxml.html\n", 21 | "import re" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "id": "ce8fc711-181c-4481-a6c5-fb580bf7e5d0", 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "name": "stderr", 32 | "output_type": "stream", 33 | "text": [ 34 | "UserWarning: Ray execution environment not yet initialized. Initializing...\n", 35 | "To remove this warning, run the following python code before doing dataframe operations:\n", 36 | "\n", 37 | " import ray\n", 38 | " ray.init()\n", 39 | "\n", 40 | "\u001b[2m\u001b[33m(raylet)\u001b[0m /opt/conda/envs/stackoverflow/lib/python3.8/site-packages/ray/dashboard/agent.py:152: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n", 41 | "\u001b[2m\u001b[33m(raylet)\u001b[0m if LooseVersion(aiohttp.__version__) < LooseVersion(\"4.0.0\"):\n", 42 | "\u001b[2m\u001b[33m(raylet)\u001b[0m /opt/conda/envs/stackoverflow/lib/python3.8/site-packages/ray/dashboard/agent.py:152: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n", 43 | "\u001b[2m\u001b[33m(raylet)\u001b[0m if LooseVersion(aiohttp.__version__) < LooseVersion(\"4.0.0\"):\n", 44 | "\u001b[2m\u001b[33m(raylet)\u001b[0m /opt/conda/envs/stackoverflow/lib/python3.8/site-packages/ray/dashboard/agent.py:152: DeprecationWarning: distutils Version classes are deprecated. 
Use packaging.version instead.\n", 45 | "\u001b[2m\u001b[33m(raylet)\u001b[0m if LooseVersion(aiohttp.__version__) < LooseVersion(\"4.0.0\"):\n", 46 | "\u001b[2m\u001b[33m(raylet)\u001b[0m /opt/conda/envs/stackoverflow/lib/python3.8/site-packages/ray/dashboard/agent.py:152: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n", 47 | "\u001b[2m\u001b[33m(raylet)\u001b[0m if LooseVersion(aiohttp.__version__) < LooseVersion(\"4.0.0\"):\n", 48 | "\u001b[2m\u001b[33m(raylet)\u001b[0m /opt/conda/envs/stackoverflow/lib/python3.8/site-packages/ray/dashboard/agent.py:152: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n", 49 | "\u001b[2m\u001b[33m(raylet)\u001b[0m if LooseVersion(aiohttp.__version__) < LooseVersion(\"4.0.0\"):\n", 50 | "2022-10-01 21:40:38,955\tWARNING worker.py:1257 -- (ip=10.52.136.102) The agent on node nup0013-dl failed to be restarted 5 times. There are 3 possible problems if you see this error.\n", 51 | " 1. The dashboard might not display correct information on this node.\n", 52 | " 2. Metrics on this node won't be reported.\n", 53 | " 3. 
runtime_env APIs won't work.\n", 54 | "Check out the `dashboard_agent.log` to see the detailed failure messages.\n" 55 | ] 56 | } 57 | ], 58 | "source": [ 59 | "df_raw = pd.read_parquet(\"../data/df_raw/\")" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 3, 65 | "id": "e0a22a80-c5c6-4bd7-8546-ffe39db0b7c6", 66 | "metadata": {}, 67 | "outputs": [ 68 | { 69 | "data": { 70 | "text/plain": [ 71 | "Index(['Id', 'AcceptedAnswerId', 'Title', 'Body', 'Tags', 'ViewCount',\n", 72 | " 'AnswerCount', 'CommentCount', 'Score', 'CreationDate', 'AnswerBody'],\n", 73 | " dtype='object')" 74 | ] 75 | }, 76 | "execution_count": 3, 77 | "metadata": {}, 78 | "output_type": "execute_result" 79 | } 80 | ], 81 | "source": [ 82 | "df_raw.columns" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 4, 88 | "id": "57723608-d170-419b-b1f0-85dde823485f", 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "name": "stdout", 93 | "output_type": "stream", 94 | "text": [ 95 | "00_data_fetch_bq.ipynb\t 01_b_setup_new.ipynb 02_indexing_faiss.ipynb old\n", 96 | "00_data_fetch_spark.ipynb 01_data_cleanup.ipynb 03_searching_es.ipynb\n", 97 | "01_b_setup.ipynb\t 02_indexing_es.ipynb Untitled.ipynb\n" 98 | ] 99 | } 100 | ], 101 | "source": [ 102 | "!ls" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 5, 108 | "id": "9d757c53-4328-4baf-a3d1-f26dffd00ca4", 109 | "metadata": {}, 110 | "outputs": [ 111 | { 112 | "data": { 113 | "text/html": [ 114 | "

\n", 115 | "\n", 128 | "\n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | "
IdAcceptedAnswerIdTitleBodyTagsViewCountAnswerCountCommentCountScoreCreationDateAnswerBody
01712365217124724hierarchical encryption scheme<p>I am in need of the \"hierarchical\" encrypti...<cryptography><key><hierarchical>6311022013-06-15 12:29:50.987<p>A partial solution. You own the master key...
14457713944577209Uncaught TypeError: this.source is not a function<p>I want to prelaod all the customers and giv...<ajax><jquery-ui>33961012017-06-15 21:14:46.990<p>Initialize <strong>autocomplete</strong> af...
24516288145162984Class App\\Http\\Controllers\\ does not exist<p>This is my Route: </p>\\n\\n<pre><code> Route...<php><laravel><controller><routes>322413122017-07-18 09:35:26.630<p>At the first of controller you do not need ...
318878411890092Grails startup is slow<p>Help! I'm porting a large ruby app to Grail...<grails>959250282009-12-11 12:43:03.790<p>Unfortunately, I am not sure too much can b...
481511298151158AlertDialog - trying to understand this syntax<p>This is code from the book sample:</p>\\n\\n<...<java><android>4905122011-11-16 11:46:42.137<pre><code>// Create a builder\\nAlertDialog.Bu...
\n", 218 | "
" 219 | ], 220 | "text/plain": [ 221 | " Id AcceptedAnswerId \\\n", 222 | "0 17123652 17124724 \n", 223 | "1 44577139 44577209 \n", 224 | "2 45162881 45162984 \n", 225 | "3 1887841 1890092 \n", 226 | "4 8151129 8151158 \n", 227 | "\n", 228 | " Title \\\n", 229 | "0 hierarchical encryption scheme \n", 230 | "1 Uncaught TypeError: this.source is not a function \n", 231 | "2 Class App\\Http\\Controllers\\ does not exist \n", 232 | "3 Grails startup is slow \n", 233 | "4 AlertDialog - trying to understand this syntax \n", 234 | "\n", 235 | " Body \\\n", 236 | "0

I am in need of the \"hierarchical\" encrypti... \n", 237 | "1

I want to prelaod all the customers and giv... \n", 238 | "2

This is my Route:

\\n\\n
 Route...   \n",
239 |        "3  

Help! I'm porting a large ruby app to Grail... \n", 240 | "4

This is code from the book sample:

\\n\\n<... \n", 241 | "\n", 242 | " Tags ViewCount AnswerCount CommentCount \\\n", 243 | "0 631 1 0 \n", 244 | "1 3396 1 0 \n", 245 | "2 32241 3 1 \n", 246 | "3 9592 5 0 \n", 247 | "4 490 5 1 \n", 248 | "\n", 249 | " Score CreationDate \\\n", 250 | "0 2 2013-06-15 12:29:50.987 \n", 251 | "1 1 2017-06-15 21:14:46.990 \n", 252 | "2 2 2017-07-18 09:35:26.630 \n", 253 | "3 28 2009-12-11 12:43:03.790 \n", 254 | "4 2 2011-11-16 11:46:42.137 \n", 255 | "\n", 256 | " AnswerBody \n", 257 | "0

A partial solution. You own the master key... \n", 258 | "1

Initialize autocomplete af... \n", 259 | "2

At the first of controller you do not need ... \n", 260 | "3

Unfortunately, I am not sure too much can b... \n", 261 | "4

// Create a builder\\nAlertDialog.Bu...  "
262 |       ]
263 |      },
264 |      "execution_count": 5,
265 |      "metadata": {},
266 |      "output_type": "execute_result"
267 |     }
268 |    ],
269 |    "source": [
270 |     "df_raw.head()"
271 |    ]
272 |   },
273 |   {
274 |    "cell_type": "code",
275 |    "execution_count": null,
276 |    "id": "bcf8431e-c037-429f-bc53-13a8214be375",
277 |    "metadata": {},
278 |    "outputs": [],
279 |    "source": []
280 |   },
281 |   {
282 |    "cell_type": "code",
283 |    "execution_count": 6,
284 |    "id": "72644750-923e-4c69-8ba1-581e2929540d",
285 |    "metadata": {},
286 |    "outputs": [],
287 |    "source": [
288 |     "regex = r\"\"\"\n",
289 |     "\t
.*?
\n", 290 | "\t\"\"\"\n", 291 | "\n", 292 | "def clean_text(snippet:str):\n", 293 | " snippet = re.sub(pattern=regex, repl = '[CODE]', string = snippet, flags = re.IGNORECASE | re.DOTALL | re.MULTILINE | re.VERBOSE )\n", 294 | " \n", 295 | " snippet = str(lxml.html.fromstring(snippet).text_content())\n", 296 | " \n", 297 | " return snippet\n", 298 | "\n", 299 | "def parse_tags(content:str):\n", 300 | " return re.findall(r'<(.+?)>',content)" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": null, 306 | "id": "99a5b7bf-694f-4798-bd6a-d41f4ec7dfeb", 307 | "metadata": {}, 308 | "outputs": [], 309 | "source": [ 310 | "\n", 311 | "\n" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 7, 317 | "id": "8eda050e-3ab9-4555-9f7a-fe9cb190d824", 318 | "metadata": {}, 319 | "outputs": [ 320 | { 321 | "data": { 322 | "text/plain": [ 323 | "'I was asked to create a singleton that will...\\n\\n[CODE]\\n\\n test \\n\\n'" 324 | ] 325 | }, 326 | "execution_count": 7, 327 | "metadata": {}, 328 | "output_type": "execute_result" 329 | } 330 | ], 331 | "source": [ 332 | "clean_text(f\"\"\"\n", 333 | "\n", 334 | "

I was asked to create a singleton that will..

.\n", 335 | "\n", 336 | "
KDF 
\n", 337 | "\n", 338 | "

test

\n", 339 | "\n", 340 | "\"\"\")" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": null, 346 | "id": "a9a49fef-5499-40de-9bea-01301ce4e339", 347 | "metadata": {}, 348 | "outputs": [], 349 | "source": [] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": 8, 354 | "id": "a60b2cd6-2f02-447b-99ac-2de8598bb470", 355 | "metadata": {}, 356 | "outputs": [], 357 | "source": [ 358 | "df = df_raw" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": 9, 364 | "id": "f8fa3f43-d0c1-42ff-949c-cb7519e11454", 365 | "metadata": {}, 366 | "outputs": [], 367 | "source": [ 368 | "df['QuestionBody'] = df['Body'].apply(clean_text)" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": 10, 374 | "id": "d45cc4c4-0142-4d7a-abad-7bd941a331a1", 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [ 378 | "df['AnswerBody'] = df['AnswerBody'].apply(clean_text)" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": 11, 384 | "id": "d957c0c3-aba6-451b-816a-67224b8e6578", 385 | "metadata": {}, 386 | "outputs": [], 387 | "source": [ 388 | "df['Tags'] = df['Tags'].apply(parse_tags)" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": 12, 394 | "id": "cb4c000b-0cd1-4d21-9de3-56589a04c40d", 395 | "metadata": {}, 396 | "outputs": [], 397 | "source": [ 398 | "df.to_parquet(\"../data/df_processed/\")" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": null, 404 | "id": "f7dc6fdc-8d41-4f9b-bffa-0d8c5274f9f1", 405 | "metadata": {}, 406 | "outputs": [], 407 | "source": [] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": 13, 412 | "id": "bcb300a3-53cb-4454-b472-c9cc422f6cc4", 413 | "metadata": {}, 414 | "outputs": [ 415 | { 416 | "name": "stdout", 417 | "output_type": "stream", 418 | "text": [ 419 | "part-0000.snappy.parquet part-0006.snappy.parquet part-0012.snappy.parquet\n", 420 | "part-0001.snappy.parquet 
part-0007.snappy.parquet part-0013.snappy.parquet\n", 421 | "part-0002.snappy.parquet part-0008.snappy.parquet part-0014.snappy.parquet\n", 422 | "part-0003.snappy.parquet part-0009.snappy.parquet part-0015.snappy.parquet\n", 423 | "part-0004.snappy.parquet part-0010.snappy.parquet\n", 424 | "part-0005.snappy.parquet part-0011.snappy.parquet\n" 425 | ] 426 | } 427 | ], 428 | "source": [ 429 | "!ls ../data/df_processed/" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": null, 435 | "id": "c722b250-f4ac-4523-adac-280b34dc3209", 436 | "metadata": {}, 437 | "outputs": [], 438 | "source": [] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": null, 443 | "id": "0569dbf8-8a79-42a2-b854-3f0628e8275d", 444 | "metadata": {}, 445 | "outputs": [], 446 | "source": [] 447 | } 448 | ], 449 | "metadata": { 450 | "environment": { 451 | "kernel": "stackoverflow", 452 | "name": "pytorch-gpu.1-11.m94", 453 | "type": "gcloud", 454 | "uri": "gcr.io/deeplearning-platform-release/pytorch-gpu.1-11:m94" 455 | }, 456 | "kernelspec": { 457 | "display_name": "Python 3.8.5 ('py38')", 458 | "language": "python", 459 | "name": "python3" 460 | }, 461 | "language_info": { 462 | "codemirror_mode": { 463 | "name": "ipython", 464 | "version": 3 465 | }, 466 | "file_extension": ".py", 467 | "mimetype": "text/x-python", 468 | "name": "python", 469 | "nbconvert_exporter": "python", 470 | "pygments_lexer": "ipython3", 471 | "version": "3.8.5" 472 | }, 473 | "vscode": { 474 | "interpreter": { 475 | "hash": "aefe80b7c360a2b6e560f9a0dcb6ff028291678d8b74cab0042c4a74d0e7253b" 476 | } 477 | } 478 | }, 479 | "nbformat": 4, 480 | "nbformat_minor": 5 481 | } 482 | -------------------------------------------------------------------------------- /archive/notebooks_stackoverflow/01_data_subset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": 
"776a1f38-5ec7-4478-b392-bb943274b958", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "id": "a462938f-432c-48cc-b7ae-a20f4df6c3ff", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "df_posts = pd.read_parquet(\"gs://np-training-tmp/stackoverflow/final/posts.parquet\")" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 3, 26 | "id": "f155aa49-6b8d-4056-ad64-eea6fb96cb19", 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "data": { 31 | "text/html": [ 32 | "
\n", 33 | "\n", 46 | "\n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | "
IdAcceptedAnswerIdTitleQuestionBodyTagsViewCountAnswerCountCommentCountScoreCreationDateAnswerIdAcceptedAnswerBody
033760194NaNPython How to burning discs with the monitorin...I'm writing the programm on Python with module...[python, event-handling, progressmonitor]4910202015-11-17 15:02:09.103NaNNone
115020895NaNPython int-byte efficient data structurei am currently storing key-values of type int-...[python, data-structures]1550312013-02-22 09:33:26.360NaNNone
247234657NaNconverting word into other word keeping the or...def translate(string, translations):\\n\\n[CODE]...[python, python-3.x]4821-12017-11-11 05:23:34.343NaNNone
337310210NaNCamera Calibration with OpenCV - How to adjust...I am working on a camera calibration program u...[python, python-2.7, opencv, camera, camera-ca...81642332016-05-18 21:14:34.110NaNNone
470675292NaNPython Same Period Last Year in Pandas with Gr...I have following DataFrame:\\nimport pandas as ...[python, pandas, group-by, offset, forecasting]701002022-01-12 01:19:53.640NaNNone
\n", 142 | "
" 143 | ], 144 | "text/plain": [ 145 | " Id AcceptedAnswerId \\\n", 146 | "0 33760194 NaN \n", 147 | "1 15020895 NaN \n", 148 | "2 47234657 NaN \n", 149 | "3 37310210 NaN \n", 150 | "4 70675292 NaN \n", 151 | "\n", 152 | " Title \\\n", 153 | "0 Python How to burning discs with the monitorin... \n", 154 | "1 Python int-byte efficient data structure \n", 155 | "2 converting word into other word keeping the or... \n", 156 | "3 Camera Calibration with OpenCV - How to adjust... \n", 157 | "4 Python Same Period Last Year in Pandas with Gr... \n", 158 | "\n", 159 | " QuestionBody \\\n", 160 | "0 I'm writing the programm on Python with module... \n", 161 | "1 i am currently storing key-values of type int-... \n", 162 | "2 def translate(string, translations):\\n\\n[CODE]... \n", 163 | "3 I am working on a camera calibration program u... \n", 164 | "4 I have following DataFrame:\\nimport pandas as ... \n", 165 | "\n", 166 | " Tags ViewCount AnswerCount \\\n", 167 | "0 [python, event-handling, progressmonitor] 491 0 \n", 168 | "1 [python, data-structures] 155 0 \n", 169 | "2 [python, python-3.x] 48 2 \n", 170 | "3 [python, python-2.7, opencv, camera, camera-ca... 8164 2 \n", 171 | "4 [python, pandas, group-by, offset, forecasting] 70 1 \n", 172 | "\n", 173 | " CommentCount Score CreationDate AnswerId AcceptedAnswerBody \n", 174 | "0 2 0 2015-11-17 15:02:09.103 NaN None \n", 175 | "1 3 1 2013-02-22 09:33:26.360 NaN None \n", 176 | "2 1 -1 2017-11-11 05:23:34.343 NaN None \n", 177 | "3 3 3 2016-05-18 21:14:34.110 NaN None \n", 178 | "4 0 0 2022-01-12 01:19:53.640 NaN None " 179 | ] 180 | }, 181 | "execution_count": 3, 182 | "metadata": {}, 183 | "output_type": "execute_result" 184 | } 185 | ], 186 | "source": [ 187 | "df_posts.head()" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 4, 193 | "id": "6069508b-5b2e-4572-ace5-ce01d47f9de2", 194 | "metadata": {}, 195 | "outputs": [ 196 | { 197 | "data": { 198 | "text/html": [ 199 | "
\n", 200 | "\n", 213 | "\n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | "
PostIdPostTitleRelatedPostIdsRelatedPostTitlesnum_candidates
057348742How do I simulate a Scrollbar in tkInter Canvas[57348742, 68340045][How do I simulate a Scrollbar in tkInter Canv...2
13494593Shading a kernel density plot between two points.[3494593, 14863744, 14094644, 16504452, 488531...[Shading a kernel density plot between two poi...16
237949409Dictionary in a numpy array?[37949409, 47689224, 61517741][Dictionary in a numpy array?, How to access t...3
351519086How to remove tkinter - - - - line's when crea...[51519086, 55088055][How to remove tkinter - - - - line's when cre...2
463107594How to deal with multi-level column names down...[63107594, 63107603, 62966295, 68674235, 63124...[How to deal with multi-level column names dow...6
..................
3324348536681What is the exact meaning of stride's list in ...[48536681, 47305022][What is the exact meaning of stride's list in...2
3324437814201pandas time shift from utc to local[37814201, 52390647][pandas time shift from utc to local, Convert ...2
332452316987Converting a string to a formatted date-time s...[2316987, 48848730][Converting a string to a formatted date-time ...2
3324652027033Convert datetime to another format without cha...[52027033, 52252961][Convert datetime to another format without ch...2
3324717622419Creating a namedtuple object using only a subs...[17622419, 50899076][Creating a namedtuple object using only a sub...2
\n", 315 | "

33248 rows × 5 columns

\n", 316 | "
" 317 | ], 318 | "text/plain": [ 319 | " PostId PostTitle \\\n", 320 | "0 57348742 How do I simulate a Scrollbar in tkInter Canvas \n", 321 | "1 3494593 Shading a kernel density plot between two points. \n", 322 | "2 37949409 Dictionary in a numpy array? \n", 323 | "3 51519086 How to remove tkinter - - - - line's when crea... \n", 324 | "4 63107594 How to deal with multi-level column names down... \n", 325 | "... ... ... \n", 326 | "33243 48536681 What is the exact meaning of stride's list in ... \n", 327 | "33244 37814201 pandas time shift from utc to local \n", 328 | "33245 2316987 Converting a string to a formatted date-time s... \n", 329 | "33246 52027033 Convert datetime to another format without cha... \n", 330 | "33247 17622419 Creating a namedtuple object using only a subs... \n", 331 | "\n", 332 | " RelatedPostIds \\\n", 333 | "0 [57348742, 68340045] \n", 334 | "1 [3494593, 14863744, 14094644, 16504452, 488531... \n", 335 | "2 [37949409, 47689224, 61517741] \n", 336 | "3 [51519086, 55088055] \n", 337 | "4 [63107594, 63107603, 62966295, 68674235, 63124... \n", 338 | "... ... \n", 339 | "33243 [48536681, 47305022] \n", 340 | "33244 [37814201, 52390647] \n", 341 | "33245 [2316987, 48848730] \n", 342 | "33246 [52027033, 52252961] \n", 343 | "33247 [17622419, 50899076] \n", 344 | "\n", 345 | " RelatedPostTitles num_candidates \n", 346 | "0 [How do I simulate a Scrollbar in tkInter Canv... 2 \n", 347 | "1 [Shading a kernel density plot between two poi... 16 \n", 348 | "2 [Dictionary in a numpy array?, How to access t... 3 \n", 349 | "3 [How to remove tkinter - - - - line's when cre... 2 \n", 350 | "4 [How to deal with multi-level column names dow... 6 \n", 351 | "... ... ... \n", 352 | "33243 [What is the exact meaning of stride's list in... 2 \n", 353 | "33244 [pandas time shift from utc to local, Convert ... 2 \n", 354 | "33245 [Converting a string to a formatted date-time ... 2 \n", 355 | "33246 [Convert datetime to another format without ch... 
2 \n", 356 | "33247 [Creating a namedtuple object using only a sub... 2 \n", 357 | "\n", 358 | "[33248 rows x 5 columns]" 359 | ] 360 | }, 361 | "execution_count": 4, 362 | "metadata": {}, 363 | "output_type": "execute_result" 364 | } 365 | ], 366 | "source": [ 367 | "df_related = pd.read_parquet(\"gs://np-training-tmp/stackoverflow/final/related_posts.parquet\")\n", 368 | "df_related" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": null, 374 | "id": "12502a21-39cf-4f73-b6b2-d106f446516f", 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": 15, 382 | "id": "0174520e-92ed-48de-bab5-214d04d0249e", 383 | "metadata": {}, 384 | "outputs": [], 385 | "source": [ 386 | "post_ids = set (df_posts.sample(frac=1, random_state=42).head(200_000)['Id'] )" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": 16, 392 | "id": "586796a6-f69f-4faf-bd42-5f967986dfc1", 393 | "metadata": {}, 394 | "outputs": [], 395 | "source": [ 396 | "def match_exists(related_post_ids):\n", 397 | " res = set(related_post_ids ) & post_ids\n", 398 | " return len(res) > 0" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 17, 404 | "id": "3412f27f-c39e-4fcf-9dd1-9147fbc0eac7", 405 | "metadata": {}, 406 | "outputs": [], 407 | "source": [ 408 | "df_related_subset = df_related [ df_related['RelatedPostIds'].apply(match_exists) ]\n", 409 | "post_ids_additional = set(df_related_subset['RelatedPostIds'].explode() )\n", 410 | "\n", 411 | "post_id_final = post_ids | post_ids_additional" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": null, 417 | "id": "f323527c-021a-474f-a9d0-b73aa3d55681", 418 | "metadata": {}, 419 | "outputs": [], 420 | "source": [ 421 | "len(" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": 18, 427 | "id": "649e579a-106f-4835-b646-d76e6c2e8305", 428 | "metadata": {}, 429 | "outputs": 
[], 430 | "source": [ 431 | "df_posts_subset = df_posts [ df_posts['Id'].isin(post_id_final)]" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": 19, 437 | "id": "8712592a-1549-4c4e-a508-8357f693d2eb", 438 | "metadata": {}, 439 | "outputs": [ 440 | { 441 | "data": { 442 | "text/plain": [ 443 | "219841" 444 | ] 445 | }, 446 | "execution_count": 19, 447 | "metadata": {}, 448 | "output_type": "execute_result" 449 | } 450 | ], 451 | "source": [ 452 | "len(df_posts_subset)" 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": null, 458 | "id": "916d2770-d343-47e3-9d06-f9c399e7e6a7", 459 | "metadata": {}, 460 | "outputs": [], 461 | "source": [] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": 20, 466 | "id": "a503bf07-16fe-4b2a-84d6-cbd86851067e", 467 | "metadata": {}, 468 | "outputs": [], 469 | "source": [ 470 | "df_posts_subset.to_parquet(\"gs://np-training-tmp/stackoverflow/final_subset/posts.parquet\")" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": 21, 476 | "id": "f4fbd837-2557-4b63-b263-9af66690815a", 477 | "metadata": {}, 478 | "outputs": [], 479 | "source": [ 480 | "df_related_subset.to_parquet(\"gs://np-training-tmp/stackoverflow/final_subset/related_posts.parquet\")" 481 | ] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "execution_count": null, 486 | "id": "9e5b4524-14ec-44b7-bfe4-38f95c39e15b", 487 | "metadata": {}, 488 | "outputs": [], 489 | "source": [] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": null, 494 | "id": "1809b800-0eb7-46ec-9a35-4d52070c6840", 495 | "metadata": {}, 496 | "outputs": [], 497 | "source": [] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": null, 502 | "id": "4fde525e-db57-4c32-a07a-d0cc2b32926a", 503 | "metadata": {}, 504 | "outputs": [], 505 | "source": [ 506 | "!gsutil -m cp -r gs://np-training-tmp/stackoverflow/final_subset/* ../data/final_subset/" 507 | ] 508 | }, 509 | { 510 | 
"cell_type": "code", 511 | "execution_count": null, 512 | "id": "7fd4e6ab-9c57-4fe6-a606-eaa4932f4244", 513 | "metadata": {}, 514 | "outputs": [], 515 | "source": [ 516 | "!gsutil -m cp -r gs://np-training-tmp/stackoverflow/final/* ../data/final/\n", 517 | "\n" 518 | ] 519 | } 520 | ], 521 | "metadata": { 522 | "environment": { 523 | "kernel": "stackoverflow", 524 | "name": "pytorch-gpu.1-12.m99", 525 | "type": "gcloud", 526 | "uri": "gcr.io/deeplearning-platform-release/pytorch-gpu.1-12:m99" 527 | }, 528 | "kernelspec": { 529 | "display_name": "stackoverflow", 530 | "language": "python", 531 | "name": "stackoverflow" 532 | }, 533 | "language_info": { 534 | "codemirror_mode": { 535 | "name": "ipython", 536 | "version": 3 537 | }, 538 | "file_extension": ".py", 539 | "mimetype": "text/x-python", 540 | "name": "python", 541 | "nbconvert_exporter": "python", 542 | "pygments_lexer": "ipython3", 543 | "version": "3.7.12" 544 | } 545 | }, 546 | "nbformat": 4, 547 | "nbformat_minor": 5 548 | } 549 | -------------------------------------------------------------------------------- /archive/notebooks_stackoverflow/01_workshop_data_preview.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "id": "724dc187-f812-4c97-81dd-ad527f9d8338", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "from IPython.display import JSON\n", 12 | "import metrics_utils" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "id": "37b1ba40-6527-4ec3-8180-7db66fc9d808", 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 11, 26 | "id": "dfeca3c8-2684-44a1-8497-1bf4c4c89c9d", 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "name": "stdout", 31 | "output_type": "stream", 32 | "text": [ 33 | " 1.35 GiB 2022-11-02T08:48:12Z 
gs://np-public-training-temp/stackoverflow/final/posts.parquet\n", 34 | " 5.26 MiB 2022-11-02T08:48:12Z gs://np-public-training-temp/stackoverflow/final/related_posts.parquet\n", 35 | "115.09 MiB 2022-11-02T08:48:12Z gs://np-public-training-temp/stackoverflow/final_subset/posts.parquet\n", 36 | " 1.08 GiB 2022-11-02T11:42:53Z gs://np-public-training-temp/stackoverflow/final_subset/posts_with_embedding.parquet\n", 37 | " 1.4 MiB 2022-11-02T08:48:12Z gs://np-public-training-temp/stackoverflow/final_subset/related_posts.parquet\n", 38 | "TOTAL: 5 objects, 2736956352 bytes (2.55 GiB)\n" 39 | ] 40 | } 41 | ], 42 | "source": [ 43 | "!gsutil ls -lh gs://np-public-training-temp/stackoverflow/**" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 2, 49 | "id": "e218afd6-edcb-46cc-8263-94611d54ffeb", 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "path_posts = \"gs://np-public-training-temp/stackoverflow/final_subset/posts.parquet\"\n", 54 | "path_posts_related = \"gs://np-public-training-temp/stackoverflow/final_subset/related_posts.parquet\"\n" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 4, 60 | "id": "431537d1-2701-4d8e-a3fc-22d877cd14bb", 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "data": { 65 | "text/html": [ 66 | "
\n", 67 | "\n", 80 | "\n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 
234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | "
IdAcceptedAnswerIdTitleQuestionBodyTagsViewCountAnswerCountCommentCountScoreCreationDateAnswerIdAcceptedAnswerBody
115020895NaNPython int-byte efficient data structurei am currently storing key-values of type int-...[python, data-structures]1550312013-02-22 09:33:26.360NaNNone
968487902NaNWhy does the Variance of Laplace very differen...TL;DR: How can I use skimage.filters.laplace(i...[python, opencv, image-processing, computer-vi...3910512021-07-22 15:50:34.220NaNNone
1561391327NaNWhy input never endsI have python 3.7 installed and I have this co...[python, python-3.x, input]1041632020-04-23 15:43:03.497NaNNone
2728852710NaNCrashes with piecewise linear objective for gu...We have a complex optimization problem which i...[python, crash, gurobi, piecewise]4031132015-03-04 10:58:16.370NaNNone
2924043029NaNPython TypeError: plotdatehist() got an unexpe...apologies beforehand if this is a stupid quest...[python, typeerror]4190702014-06-04 16:42:32.257NaNNone
.......................................
26613765543174955431832.0Handling exception returned by a methodI am calling a method that throws Valuerror ex...[python-3.x]261212019-03-30 13:07:07.89355431832.0You need to place call to sanitize method in t...
26613781379453213794740.0Python regular expression for Beautiful SoupI am using Beautiful Soup to pull out specific...[python, regex, beautifulsoup]107231352012-12-10 03:18:14.74313794740.0I think I've got it:\\n\\n[CODE]\\n\\nNotice that,...
26613902508394325084142.0Search has no attribute teaserI am trying to access teaser. I tried many dif...[python, regex, json, python-3.x]6212-22014-08-01 15:45:26.73325084142.0Not exactly sure what you are trying to do but...
266140182213248221764.0Is there a reason the SQLAlchemy ORM tutorial ...The SQLAlchemy ORM tutorial uses this class:\\n...[python, sqlalchemy]8343142011-11-22 02:42:24.1578221764.0Bear in mind that eval is not used too much; c...
26614155767942957679695.0How can I turn a list of column names into a p...I have a list of pandas column names (consisti...[python, string, list, patsy]1061002019-08-27 17:11:24.39057679695.0[CODE]\\n\\n[CODE]\\n
\n", 266 | "

219841 rows × 12 columns

\n", 267 | "
" 268 | ], 269 | "text/plain": [ 270 | " Id AcceptedAnswerId \\\n", 271 | "1 15020895 NaN \n", 272 | "9 68487902 NaN \n", 273 | "15 61391327 NaN \n", 274 | "27 28852710 NaN \n", 275 | "29 24043029 NaN \n", 276 | "... ... ... \n", 277 | "2661376 55431749 55431832.0 \n", 278 | "2661378 13794532 13794740.0 \n", 279 | "2661390 25083943 25084142.0 \n", 280 | "2661401 8221324 8221764.0 \n", 281 | "2661415 57679429 57679695.0 \n", 282 | "\n", 283 | " Title \\\n", 284 | "1 Python int-byte efficient data structure \n", 285 | "9 Why does the Variance of Laplace very differen... \n", 286 | "15 Why input never ends \n", 287 | "27 Crashes with piecewise linear objective for gu... \n", 288 | "29 Python TypeError: plotdatehist() got an unexpe... \n", 289 | "... ... \n", 290 | "2661376 Handling exception returned by a method \n", 291 | "2661378 Python regular expression for Beautiful Soup \n", 292 | "2661390 Search has no attribute teaser \n", 293 | "2661401 Is there a reason the SQLAlchemy ORM tutorial ... \n", 294 | "2661415 How can I turn a list of column names into a p... \n", 295 | "\n", 296 | " QuestionBody \\\n", 297 | "1 i am currently storing key-values of type int-... \n", 298 | "9 TL;DR: How can I use skimage.filters.laplace(i... \n", 299 | "15 I have python 3.7 installed and I have this co... \n", 300 | "27 We have a complex optimization problem which i... \n", 301 | "29 apologies beforehand if this is a stupid quest... \n", 302 | "... ... \n", 303 | "2661376 I am calling a method that throws Valuerror ex... \n", 304 | "2661378 I am using Beautiful Soup to pull out specific... \n", 305 | "2661390 I am trying to access teaser. I tried many dif... \n", 306 | "2661401 The SQLAlchemy ORM tutorial uses this class:\\n... \n", 307 | "2661415 I have a list of pandas column names (consisti... \n", 308 | "\n", 309 | " Tags ViewCount \\\n", 310 | "1 [python, data-structures] 155 \n", 311 | "9 [python, opencv, image-processing, computer-vi... 
391 \n", 312 | "15 [python, python-3.x, input] 104 \n", 313 | "27 [python, crash, gurobi, piecewise] 403 \n", 314 | "29 [python, typeerror] 419 \n", 315 | "... ... ... \n", 316 | "2661376 [python-3.x] 26 \n", 317 | "2661378 [python, regex, beautifulsoup] 10723 \n", 318 | "2661390 [python, regex, json, python-3.x] 62 \n", 319 | "2661401 [python, sqlalchemy] 834 \n", 320 | "2661415 [python, string, list, patsy] 106 \n", 321 | "\n", 322 | " AnswerCount CommentCount Score CreationDate AnswerId \\\n", 323 | "1 0 3 1 2013-02-22 09:33:26.360 NaN \n", 324 | "9 0 5 1 2021-07-22 15:50:34.220 NaN \n", 325 | "15 1 6 3 2020-04-23 15:43:03.497 NaN \n", 326 | "27 1 1 3 2015-03-04 10:58:16.370 NaN \n", 327 | "29 0 7 0 2014-06-04 16:42:32.257 NaN \n", 328 | "... ... ... ... ... ... \n", 329 | "2661376 1 2 1 2019-03-30 13:07:07.893 55431832.0 \n", 330 | "2661378 1 3 5 2012-12-10 03:18:14.743 13794740.0 \n", 331 | "2661390 1 2 -2 2014-08-01 15:45:26.733 25084142.0 \n", 332 | "2661401 3 1 4 2011-11-22 02:42:24.157 8221764.0 \n", 333 | "2661415 1 0 0 2019-08-27 17:11:24.390 57679695.0 \n", 334 | "\n", 335 | " AcceptedAnswerBody \n", 336 | "1 None \n", 337 | "9 None \n", 338 | "15 None \n", 339 | "27 None \n", 340 | "29 None \n", 341 | "... ... \n", 342 | "2661376 You need to place call to sanitize method in t... \n", 343 | "2661378 I think I've got it:\\n\\n[CODE]\\n\\nNotice that,... \n", 344 | "2661390 Not exactly sure what you are trying to do but... \n", 345 | "2661401 Bear in mind that eval is not used too much; c... 
\n", 346 | "2661415 [CODE]\\n\\n[CODE]\\n \n", 347 | "\n", 348 | "[219841 rows x 12 columns]" 349 | ] 350 | }, 351 | "execution_count": 4, 352 | "metadata": {}, 353 | "output_type": "execute_result" 354 | } 355 | ], 356 | "source": [ 357 | "df_posts = pd.read_parquet(path_posts)\n", 358 | "df_posts" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "id": "bd200fc0-da3e-4a72-8fd2-2004d540691a", 365 | "metadata": {}, 366 | "outputs": [], 367 | "source": [] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": null, 372 | "id": "d557f519-6249-4a00-ba28-0948db54405a", 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": 5, 380 | "id": "72242ee8-cc09-4ddd-ab0d-89f7ea0d1b78", 381 | "metadata": {}, 382 | "outputs": [ 383 | { 384 | "data": { 385 | "text/html": [ 386 | "
\n", 387 | "\n", 400 | "\n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | "
PostIdPostTitleRelatedPostIdsRelatedPostTitlesnum_candidates
13494593Shading a kernel density plot between two points.[3494593, 14863744, 14094644, 16504452, 488531...[Shading a kernel density plot between two poi...16
237949409Dictionary in a numpy array?[37949409, 47689224, 61517741][Dictionary in a numpy array?, How to access t...3
819876079Cannot find module cv2 when using OpenCV[19876079, 62443365, 64580641, 45606137, 60294...[Cannot find module cv2 when using OpenCV, How...7
1235082143Error: package or namespace load failed for ‘car’[35082143, 65941744, 68515009, 56409535][Error: package or namespace load failed for ‘...4
142673651inheritance from str or int[2673651, 48465797, 3120562, 15085917, 3238350...[inheritance from str or int, Inherited class ...15
..................
3323128419763Expand Text widget to fill the entire parent F...[28419763, 48171462][Expand Text widget to fill the entire parent ...2
3323440332743Source code for str.split?[40332743, 51355719][Source code for str.split?, where can I find ...2
3324127443414Cannot perform a backup or restore operation w...[27443414, 53216877][Cannot perform a backup or restore operation ...2
3324348536681What is the exact meaning of stride's list in ...[48536681, 47305022][What is the exact meaning of stride's list in...2
3324437814201pandas time shift from utc to local[37814201, 52390647][pandas time shift from utc to local, Convert ...2
\n", 502 | "

6114 rows × 5 columns

\n", 503 | "
" 504 | ], 505 | "text/plain": [ 506 | " PostId PostTitle \\\n", 507 | "1 3494593 Shading a kernel density plot between two points. \n", 508 | "2 37949409 Dictionary in a numpy array? \n", 509 | "8 19876079 Cannot find module cv2 when using OpenCV \n", 510 | "12 35082143 Error: package or namespace load failed for ‘car’ \n", 511 | "14 2673651 inheritance from str or int \n", 512 | "... ... ... \n", 513 | "33231 28419763 Expand Text widget to fill the entire parent F... \n", 514 | "33234 40332743 Source code for str.split? \n", 515 | "33241 27443414 Cannot perform a backup or restore operation w... \n", 516 | "33243 48536681 What is the exact meaning of stride's list in ... \n", 517 | "33244 37814201 pandas time shift from utc to local \n", 518 | "\n", 519 | " RelatedPostIds \\\n", 520 | "1 [3494593, 14863744, 14094644, 16504452, 488531... \n", 521 | "2 [37949409, 47689224, 61517741] \n", 522 | "8 [19876079, 62443365, 64580641, 45606137, 60294... \n", 523 | "12 [35082143, 65941744, 68515009, 56409535] \n", 524 | "14 [2673651, 48465797, 3120562, 15085917, 3238350... \n", 525 | "... ... \n", 526 | "33231 [28419763, 48171462] \n", 527 | "33234 [40332743, 51355719] \n", 528 | "33241 [27443414, 53216877] \n", 529 | "33243 [48536681, 47305022] \n", 530 | "33244 [37814201, 52390647] \n", 531 | "\n", 532 | " RelatedPostTitles num_candidates \n", 533 | "1 [Shading a kernel density plot between two poi... 16 \n", 534 | "2 [Dictionary in a numpy array?, How to access t... 3 \n", 535 | "8 [Cannot find module cv2 when using OpenCV, How... 7 \n", 536 | "12 [Error: package or namespace load failed for ‘... 4 \n", 537 | "14 [inheritance from str or int, Inherited class ... 15 \n", 538 | "... ... ... \n", 539 | "33231 [Expand Text widget to fill the entire parent ... 2 \n", 540 | "33234 [Source code for str.split?, where can I find ... 2 \n", 541 | "33241 [Cannot perform a backup or restore operation ... 2 \n", 542 | "33243 [What is the exact meaning of stride's list in... 
2 \n", 543 | "33244 [pandas time shift from utc to local, Convert ... 2 \n", 544 | "\n", 545 | "[6114 rows x 5 columns]" 546 | ] 547 | }, 548 | "execution_count": 5, 549 | "metadata": {}, 550 | "output_type": "execute_result" 551 | } 552 | ], 553 | "source": [ 554 | "df_posts = pd.read_parquet(path_posts_related)\n", 555 | "df_posts" 556 | ] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "execution_count": 8, 561 | "id": "ba0dc292-3101-457f-b80c-5ce061118c09", 562 | "metadata": {}, 563 | "outputs": [ 564 | { 565 | "data": { 566 | "application/json": { 567 | "PostId": 3494593, 568 | "PostTitle": "Shading a kernel density plot between two points.", 569 | "RelatedPostIds": [ 570 | 3494593, 571 | 14863744, 572 | 14094644, 573 | 16504452, 574 | 48853178, 575 | 36948624, 576 | 47308146, 577 | 34029811, 578 | 31215748, 579 | 29499914, 580 | 41484896, 581 | 7787114, 582 | 27189453, 583 | 23680729, 584 | 36224394, 585 | 18742693 586 | ], 587 | "RelatedPostTitles": [ 588 | "Shading a kernel density plot between two points.", 589 | "adding percentile lines to a density plot", 590 | "draw the following shaded area in R", 591 | "color a portion of the normal distribution", 592 | "How can I shade the area under a curve?", 593 | "Shade area under a curve", 594 | "Shading a region under a PDF", 595 | "Fill different colors for each quantile in geom_density() of ggplot", 596 | "How to shade part of a density curve in ggplot (with no y axis data)", 597 | "r density plot - fill area under curve", 598 | "Fill negative value area below geom_line", 599 | "polygon in density plot?", 600 | "Shade (fill or color) area under density curve by quantile", 601 | "Partially fill density plot for area of interest", 602 | "Shade density plot to the left of vline?", 603 | "Shade an area in a R plot" 604 | ], 605 | "num_candidates": 16 606 | }, 607 | "text/plain": [ 608 | "" 609 | ] 610 | }, 611 | "execution_count": 8, 612 | "metadata": { 613 | "application/json": { 614 | "expanded": false, 615 | 
"root": "root" 616 | } 617 | }, 618 | "output_type": "execute_result" 619 | } 620 | ], 621 | "source": [ 622 | "JSON ( df_posts.iloc[0].to_dict() )" 623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": null, 628 | "id": "9fb7ab6f-08fa-4099-939d-edcb7beca230", 629 | "metadata": {}, 630 | "outputs": [], 631 | "source": [] 632 | }, 633 | { 634 | "cell_type": "code", 635 | "execution_count": null, 636 | "id": "473e7e29-7a27-4030-aad3-c60c89dc19bd", 637 | "metadata": {}, 638 | "outputs": [], 639 | "source": [] 640 | }, 641 | { 642 | "cell_type": "code", 643 | "execution_count": null, 644 | "id": "ce7fc618-3b9c-450e-a89f-576d47fba15e", 645 | "metadata": {}, 646 | "outputs": [], 647 | "source": [] 648 | }, 649 | { 650 | "cell_type": "code", 651 | "execution_count": null, 652 | "id": "30a57006-3696-4a2d-82ca-726ee7c5b6b3", 653 | "metadata": {}, 654 | "outputs": [], 655 | "source": [] 656 | }, 657 | { 658 | "cell_type": "markdown", 659 | "id": "29ebbeec-f1de-4d07-b603-917e5aa3928b", 660 | "metadata": {}, 661 | "source": [ 662 | "## Metrics" 663 | ] 664 | }, 665 | { 666 | "cell_type": "code", 667 | "execution_count": 4, 668 | "id": "3c824225-1fe7-488a-a291-f8ade3f82a82", 669 | "metadata": {}, 670 | "outputs": [ 671 | { 672 | "data": { 673 | "text/plain": [ 674 | "\u001b[0;31mType:\u001b[0m module\n", 675 | "\u001b[0;31mString form:\u001b[0m \n", 676 | "\u001b[0;31mFile:\u001b[0m ~/projects/search-engine-workshop/notebooks/metrics_utils.py\n", 677 | "\u001b[0;31mSource:\u001b[0m \n", 678 | "\u001b[0;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m\u001b[0m\n", 679 | "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", 680 | "\u001b[0;34m\u001b[0m\u001b[0;32mdef\u001b[0m \u001b[0mprecision_at_k\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n", 681 | "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n", 
682 | "\u001b[0;34m\u001b[0m \u001b[0mr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\n", 683 | "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n", 684 | "\u001b[0;34m\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0mk\u001b[0m \u001b[0;34m\u001b[0m\n", 685 | "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n", 686 | "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", 687 | "\u001b[0;34m\u001b[0m\u001b[0;32mdef\u001b[0m \u001b[0mmean_reciprocal_rank\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n", 688 | "\u001b[0;34m\u001b[0m \u001b[0mmrr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\n", 689 | "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n", 690 | "\u001b[0;34m\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n", 691 | "\u001b[0;34m\u001b[0m \u001b[0mfirst_index\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", 692 | "\u001b[0;34m\u001b[0m \u001b[0mmrr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m1\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mfirst_index\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", 693 | "\u001b[0;34m\u001b[0m \u001b[0;32mexcept\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n", 694 | "\u001b[0;34m\u001b[0m \u001b[0;32mpass\u001b[0m\u001b[0;34m\u001b[0m\n", 695 | "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n", 696 | "\u001b[0;34m\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mmrr\u001b[0m\u001b[0;34m\u001b[0m\n", 697 | "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", 698 | 
"\u001b[0;34m\u001b[0m\u001b[0;32mdef\u001b[0m \u001b[0maverage_precision\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n", 699 | "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", 700 | "\u001b[0;34m\u001b[0m \u001b[0mout\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\n", 701 | "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n", 702 | "\u001b[0;34m\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0midx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n", 703 | "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n", 704 | "\u001b[0;34m\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0midx\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n", 705 | "\u001b[0;34m\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mprecision_at_k\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0midx\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", 706 | "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n", 707 | "\u001b[0;34m\u001b[0m \u001b[0map\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\n", 708 | "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n", 709 | "\u001b[0;34m\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n", 710 | "\u001b[0;34m\u001b[0m \u001b[0map\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", 711 | 
"\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n", 712 | "\u001b[0;34m\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0map\u001b[0m\u001b[0;34m\u001b[0m\n", 713 | "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n", 714 | "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", 715 | "\u001b[0;34m\u001b[0m\u001b[0;32mdef\u001b[0m \u001b[0mall_metrics\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n", 716 | "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n", 717 | "\u001b[0;34m\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m\u001b[0m\n", 718 | "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n", 719 | "\u001b[0;34m\u001b[0m \u001b[0mres\u001b[0m\u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m\u001b[0m\n", 720 | "\u001b[0;34m\u001b[0m \u001b[0;34m\"p@1\"\u001b[0m \u001b[0;34m:\u001b[0m \u001b[0mprecision_at_k\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", 721 | "\u001b[0;34m\u001b[0m \u001b[0;34m,\u001b[0m \u001b[0;34m\"p@5\"\u001b[0m \u001b[0;34m:\u001b[0m \u001b[0mprecision_at_k\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", 722 | "\u001b[0;34m\u001b[0m \u001b[0;34m,\u001b[0m \u001b[0;34m\"p@10\"\u001b[0m \u001b[0;34m:\u001b[0m \u001b[0mprecision_at_k\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", 723 | "\u001b[0;34m\u001b[0m \u001b[0;34m,\u001b[0m \u001b[0;34m\"mrr\"\u001b[0m \u001b[0;34m:\u001b[0m \u001b[0mmean_reciprocal_rank\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", 724 | "\u001b[0;34m\u001b[0m \u001b[0;34m,\u001b[0m 
\u001b[0;34m\"map\"\u001b[0m \u001b[0;34m:\u001b[0m \u001b[0maverage_precision\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n", 725 | "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n", 726 | "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n", 727 | "\u001b[0;34m\u001b[0m \u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\n", 728 | "\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n", 729 | "\u001b[0;34m\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mres\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n" 730 | ] 731 | }, 732 | "metadata": {}, 733 | "output_type": "display_data" 734 | } 735 | ], 736 | "source": [ 737 | "??metrics_utils" 738 | ] 739 | }, 740 | { 741 | "cell_type": "code", 742 | "execution_count": null, 743 | "id": "9b37a9b9-ab34-4152-88af-22728c8758a9", 744 | "metadata": {}, 745 | "outputs": [], 746 | "source": [] 747 | }, 748 | { 749 | "cell_type": "markdown", 750 | "id": "90285ffa-4312-4ea8-84a6-595199688140", 751 | "metadata": {}, 752 | "source": [ 753 | "relevant result at the end" 754 | ] 755 | }, 756 | { 757 | "cell_type": "code", 758 | "execution_count": 11, 759 | "id": "356a2b4a-6f3d-42df-bf65-7796bc29c7d9", 760 | "metadata": {}, 761 | "outputs": [ 762 | { 763 | "data": { 764 | "text/plain": [ 765 | "{'p@1': 0.0, 'p@5': 0.2, 'p@10': 0.1, 'mrr': 0.2, 'map': 0.2}" 766 | ] 767 | }, 768 | "execution_count": 11, 769 | "metadata": {}, 770 | "output_type": "execute_result" 771 | } 772 | ], 773 | "source": [ 774 | "metrics_utils.all_metrics([0,0,0,0,1])" 775 | ] 776 | }, 777 | { 778 | "cell_type": "markdown", 779 | "id": "59f9f574-e506-45e0-9c4c-c65a2b3827eb", 780 | "metadata": {}, 781 | "source": [ 782 | "relevant result at the beginning" 783 | ] 784 | }, 785 | { 786 | "cell_type": "code", 787 | "execution_count": 12, 788 | "id": "8252bfbc-7184-437b-91e6-b60d166a9742", 789 | "metadata": {}, 790 | "outputs": [ 791 | { 792 | "data": { 793 | "text/plain": [ 794 | "{'p@1': 1.0, 'p@5': 0.2, 'p@10': 0.1, 
'mrr': 1.0, 'map': 1.0}" 795 | ] 796 | }, 797 | "execution_count": 12, 798 | "metadata": {}, 799 | "output_type": "execute_result" 800 | } 801 | ], 802 | "source": [ 803 | "metrics_utils.all_metrics([1,0,0,0,0])" 804 | ] 805 | }, 806 | { 807 | "cell_type": "code", 808 | "execution_count": 13, 809 | "id": "196acac3-a263-4307-8ef9-075e7492870c", 810 | "metadata": {}, 811 | "outputs": [ 812 | { 813 | "data": { 814 | "text/plain": [ 815 | "0.2" 816 | ] 817 | }, 818 | "execution_count": 13, 819 | "metadata": {}, 820 | "output_type": "execute_result" 821 | } 822 | ], 823 | "source": [] 824 | }, 825 | { 826 | "cell_type": "markdown", 827 | "id": "e3b1413a-81a2-4a7d-9a46-ac6c9938b17e", 828 | "metadata": {}, 829 | "source": [ 830 | "map captures that the relevant results are shown at the beginning" 831 | ] 832 | }, 833 | { 834 | "cell_type": "code", 835 | "execution_count": 14, 836 | "id": "ad52c3ad-952a-4340-87bd-d20369cb420d", 837 | "metadata": {}, 838 | "outputs": [ 839 | { 840 | "data": { 841 | "text/plain": [ 842 | "{'p@1': 0.0,\n", 843 | " 'p@5': 0.4,\n", 844 | " 'p@10': 0.2,\n", 845 | " 'mrr': 0.3333333333333333,\n", 846 | " 'map': 0.41666666666666663}" 847 | ] 848 | }, 849 | "execution_count": 14, 850 | "metadata": {}, 851 | "output_type": "execute_result" 852 | } 853 | ], 854 | "source": [ 855 | "metrics_utils.all_metrics([0,0,1,1,0])" 856 | ] 857 | }, 858 | { 859 | "cell_type": "code", 860 | "execution_count": 15, 861 | "id": "8f61fc8e-7292-43dc-8f29-501d7fee8876", 862 | "metadata": {}, 863 | "outputs": [ 864 | { 865 | "data": { 866 | "text/plain": [ 867 | "{'p@1': 1.0, 'p@5': 0.4, 'p@10': 0.2, 'mrr': 1.0, 'map': 1.0}" 868 | ] 869 | }, 870 | "execution_count": 15, 871 | "metadata": {}, 872 | "output_type": "execute_result" 873 | } 874 | ], 875 | "source": [ 876 | "metrics_utils.all_metrics([1,1,0,0,0])" 877 | ] 878 | }, 879 | { 880 | "cell_type": "code", 881 | "execution_count": null, 882 | "id": "8505c6aa-d009-4dea-9263-38ca4d9f2c4b", 883 | "metadata": {}, 884 | 
"outputs": [], 885 | "source": [] 886 | } 887 | ], 888 | "metadata": { 889 | "environment": { 890 | "kernel": "python3", 891 | "name": "pytorch-gpu.1-12.m99", 892 | "type": "gcloud", 893 | "uri": "gcr.io/deeplearning-platform-release/pytorch-gpu.1-12:m99" 894 | }, 895 | "kernelspec": { 896 | "display_name": "Python 3", 897 | "language": "python", 898 | "name": "python3" 899 | }, 900 | "language_info": { 901 | "codemirror_mode": { 902 | "name": "ipython", 903 | "version": 3 904 | }, 905 | "file_extension": ".py", 906 | "mimetype": "text/x-python", 907 | "name": "python", 908 | "nbconvert_exporter": "python", 909 | "pygments_lexer": "ipython3", 910 | "version": "3.7.12" 911 | } 912 | }, 913 | "nbformat": 4, 914 | "nbformat_minor": 5 915 | } 916 | -------------------------------------------------------------------------------- /archive/notebooks_stackoverflow/metrics_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def precision_at_k(r, k): 4 | 5 | r = r[:k] 6 | 7 | return sum(r) / k 8 | 9 | 10 | def mean_reciprocal_rank(r): 11 | mrr = 0 12 | 13 | try: 14 | first_index = r.index(True) 15 | mrr = 1 / (first_index + 1) 16 | except: 17 | pass 18 | 19 | return mrr 20 | 21 | def average_precision(r): 22 | 23 | out = [] 24 | 25 | for idx in range(len(r)): 26 | 27 | if r[idx]: 28 | out.append (precision_at_k(r, idx + 1) ) 29 | 30 | ap = 0 31 | 32 | if out: 33 | ap = sum(out) / len(out) 34 | 35 | return ap 36 | 37 | 38 | def all_metrics(result): 39 | 40 | result = list(result) 41 | 42 | res= { 43 | "p@1" : precision_at_k(result, 1) 44 | , "p@5" : precision_at_k(result, 5) 45 | , "p@10" : precision_at_k(result, 10) 46 | , "mrr" : mean_reciprocal_rank(result) 47 | , "map" : average_precision(result) 48 | 49 | 50 | } 51 | 52 | return res -------------------------------------------------------------------------------- /archive/notebooks_stackoverflow/test_setup.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "ad224cf0-176b-4460-afc0-03f0aacdfe71", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import datetime\n", 11 | "import pickle\n", 12 | "import uuid\n", 13 | "import datetime\n", 14 | "import numpy as np\n", 15 | "import time" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "id": "9d7d74e6-1472-4f92-b582-fb74683a252e", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "id": "aae437e9-d66c-44a8-ab44-4523f7abb5b1", 29 | "metadata": {}, 30 | "source": [ 31 | "# Elastic Search" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "id": "b7831c3d-d97c-4027-bfbd-26b4f672b003", 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "import elasticsearch" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "id": "69aff282-e9b0-466b-b828-87b69e3dcbc1", 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "elasticsearch.__version__" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "id": "07c82415-1cd2-43e8-a88c-626eac3dea04", 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "from elasticsearch import Elasticsearch" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "id": "5e2563f6-5723-4aa6-a120-f4764c4d8b07", 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "es = Elasticsearch(hosts=\"http://localhost:9200\" , verify_certs=False)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "id": "4f00be9b-8c7d-4993-9a75-2ed9f5a94b37", 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "index_name = \"test-index\"" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | 
"id": "056b1007-5cc8-4a40-ab59-ffda53e269d2", 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "doc = {\n", 92 | " 'author': 'kimchy',\n", 93 | " 'text': 'Elasticsearch: cool. bonsai cool.',\n", 94 | " 'timestamp': datetime.datetime.now(),\n", 95 | "}" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "id": "e994e955-f3ea-47f4-95e6-994390e5403e", 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "resp = es.index(index=index_name, id=1, document=doc)\n" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "id": "6ecf186b-458a-4bf6-9ed1-ff1030b72f50", 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "print(resp['result'])\n", 116 | "\n", 117 | "resp = es.get(index=index_name, id=1)\n", 118 | "print(resp['_source'])\n", 119 | "\n", 120 | "es.indices.refresh(index=index_name)\n", 121 | "\n", 122 | "resp = es.search(index=index_name, query={\"match_all\": {}})\n", 123 | "print(\"Got %d Hits:\" % resp['hits']['total']['value'])\n", 124 | "for hit in resp['hits']['hits']:\n", 125 | " print(\"%(timestamp)s %(author)s: %(text)s\" % hit[\"_source\"])" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "id": "ec1b445b-6cc5-458d-bc9b-5b371c82d1a5", 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "id": "cf0dd18f-8fde-4a72-b226-f58ab22c4520", 139 | "metadata": {}, 140 | "source": [ 141 | "# Milvus" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "id": "53965619-9c81-472e-9110-7c4e2b1cc3a7", 147 | "metadata": {}, 148 | "source": [ 149 | "https://github.com/milvus-io/pymilvus/blob/master/examples/hello_milvus.ipynb" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "id": "8c384da7-3a72-4b9e-bacf-4e188af6a3b4", 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "from pymilvus import (\n", 160 
| " connections,\n", 161 | " utility,\n", 162 | " FieldSchema, CollectionSchema, DataType,\n", 163 | " Collection,\n", 164 | ")\n" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "id": "c4c65116-21fa-46da-ba1c-231bf2a7569a", 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "num_entities, dim = 3000, 8\n" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "id": "ee5a1a32-5735-4951-8724-9a773d036ea6", 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "collection_name=\"hello_milvus\"" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "id": "e92ce9b8-cbb3-42d2-ad8b-791d21729d3c", 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "!ls" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "id": "55e5a4ad-9150-4b86-9cb6-f968b7fe51fd", 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "connections.connect(\"default\", host=\"localhost\", port=\"19530\")\n" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "id": "ae3415ca-a52f-4319-b20c-d08f49bd06e7", 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "if utility.has_collection(collection_name):\n", 215 | " utility.drop_collection(collection_name)" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "id": "a4324ba0-4050-4b3c-9ba6-6e591177509d", 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "utility.list_collections()" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "id": "c070eb28-fa18-43ad-a2de-1067b2476274", 231 | "metadata": {}, 232 | "source": [ 233 | "create collection" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "id": "5e08cee9-e957-44b5-856f-c4549faa7b86", 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | 
"fields = [\n", 244 | " FieldSchema(name=\"pk\", dtype=DataType.VARCHAR, is_primary=True, auto_id=False, max_length=100),\n", 245 | " FieldSchema(name=\"random\", dtype=DataType.DOUBLE),\n", 246 | " FieldSchema(name=\"embeddings\", dtype=DataType.FLOAT_VECTOR, dim=dim)\n", 247 | "]\n", 248 | "\n", 249 | "schema = CollectionSchema(fields, \"hello_milvus is the simplest demo to introduce the APIs\")\n", 250 | "\n", 251 | "hello_milvus = Collection(collection_name, schema, consistency_level=\"Strong\")" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "id": "8eff6925-4089-46c3-ae03-00fc748c3135", 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "id": "99ee18c0-fb04-4970-8ba5-e36eca35680a", 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "rng = np.random.default_rng(seed=19530)\n", 270 | "entities = [\n", 271 | " # provide the pk field because `auto_id` is set to False\n", 272 | " [str(i) for i in range(num_entities)],\n", 273 | " rng.random(num_entities).tolist(), # field random, only supports list\n", 274 | " rng.random((num_entities, dim)), # field embeddings, supports numpy.ndarray and list\n", 275 | "]\n", 276 | "\n", 277 | "insert_result = hello_milvus.insert(entities)\n", 278 | "\n", 279 | "print(f\"Number of entities in Milvus: {hello_milvus.num_entities}\") # check the num_entites" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "id": "ede3539d-e940-458d-a7d9-6c345178f357", 286 | "metadata": {}, 287 | "outputs": [], 288 | "source": [] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": null, 293 | "id": "c272a614-8baa-4b3f-b77d-499aa30760d7", 294 | "metadata": {}, 295 | "outputs": [], 296 | "source": [ 297 | "index = {\n", 298 | " \"index_type\": \"IVF_FLAT\",\n", 299 | " \"metric_type\": \"L2\",\n", 300 | " \"params\": {\"nlist\": 128},\n", 301 | 
"}\n", 302 | "\n", 303 | "hello_milvus.create_index(\"embeddings\", index)" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "id": "b2865bb6-854a-41a3-97f1-98634f9b57e4", 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "hello_milvus.load()\n" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": null, 319 | "id": "0e8e8b2f-a00e-49a0-bbc9-cb99fecb0e7a", 320 | "metadata": {}, 321 | "outputs": [], 322 | "source": [ 323 | "vectors_to_search = entities[-1][-2:]\n", 324 | "search_params = {\n", 325 | " \"metric_type\": \"L2\",\n", 326 | " \"params\": {\"nprobe\": 10},\n", 327 | "}\n", 328 | "\n", 329 | "start_time = time.time()\n", 330 | "result = hello_milvus.search(vectors_to_search, \"embeddings\", search_params, limit=3, output_fields=[\"random\"])\n", 331 | "end_time = time.time()\n", 332 | "\n", 333 | "for hits in result:\n", 334 | " for hit in hits:\n", 335 | " print(f\"hit: {hit}, random field: {hit.entity.get('random')}\")\n", 336 | "print((end_time - start_time))" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": null, 342 | "id": "385d0057-cc9e-4ee3-834e-bec4675cbb96", 343 | "metadata": {}, 344 | "outputs": [], 345 | "source": [] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": null, 350 | "id": "70601713-0913-4ccc-9efd-9058397d1266", 351 | "metadata": {}, 352 | "outputs": [], 353 | "source": [] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": null, 358 | "id": "4ef6e0b5-1975-422f-9fdd-6e45f2e0917a", 359 | "metadata": {}, 360 | "outputs": [], 361 | "source": [] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "id": "493a4373-3de7-4cff-9b5f-c0b7d0288506", 366 | "metadata": {}, 367 | "source": [ 368 | "# weaviate" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": null, 374 | "id": "27b9d308-face-444b-bf3c-5d3bec0072bb", 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [ 378 
| "#!pip install weaviate-client==3.8.0" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "id": "8f818cf1-ed3b-478f-bf5b-b513043ebcb1", 385 | "metadata": {}, 386 | "outputs": [], 387 | "source": [ 388 | "import weaviate\n" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": null, 394 | "id": "595ed265-609c-41be-b9f9-eadfb9820a2f", 395 | "metadata": {}, 396 | "outputs": [], 397 | "source": [ 398 | "def generate_uuid(class_name: str, identifier: str,\n", 399 | " test: str = 'teststrong') -> str:\n", 400 | " \"\"\" Generate a uuid based on an identifier\n", 401 | " :param identifier: characters used to generate the uuid\n", 402 | " :type identifier: str, required\n", 403 | " :param class_name: classname of the object to create a uuid for\n", 404 | " :type class_name: str, required\n", 405 | " \"\"\"\n", 406 | " test = 'overwritten'\n", 407 | " return str(uuid.uuid5(uuid.NAMESPACE_DNS, class_name + identifier))\n", 408 | "\n", 409 | "def log(i: str) -> str:\n", 410 | " \"\"\" A simple logger\n", 411 | " :param i: the log message\n", 412 | " :type i: str\n", 413 | " \"\"\"\n", 414 | " now = datetime.datetime.utcnow()\n", 415 | " print(now, \"| \" + str(i))" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": null, 421 | "id": "0cd92cf1-a8f6-4f71-ab1b-008d17e659b3", 422 | "metadata": {}, 423 | "outputs": [], 424 | "source": [ 425 | "client = weaviate.Client(\"http://localhost:8081\")\n", 426 | "print(\"Client created\")" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": null, 432 | "id": "3645408d-9a51-41b9-aa02-afd549cacdc3", 433 | "metadata": {}, 434 | "outputs": [], 435 | "source": [ 436 | "from sentence_transformers import SentenceTransformer\n", 437 | "sbert_model = SentenceTransformer('bert-base-nli-mean-tokens') #, Initially load using this, then start using pickle to save time." 
438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": null, 443 | "id": "b1499af5-7259-4e05-afca-117a12dfb659", 444 | "metadata": {}, 445 | "outputs": [], 446 | "source": [ 447 | "# from sentence_transformers import SentenceTransformer\n", 448 | "# # sbert_model = SentenceTransformer('bert-base-nli-mean-tokens'), Initially load using this, then start using pickle to save time.\n", 449 | "# with open(\"sbert\",'rb') as f:\n", 450 | "# sbert_model = pickle.load(f)\n", 451 | "\n", 452 | "print(\"sbert loaded\")\n", 453 | "\n", 454 | "# I am adding the texts in this list,\n", 455 | "# We can also add sentences of a large text individually to get more precise results when we query.\n", 456 | "documents = [\n", 457 | " '''Taj mahal is an immense mausoleum of white marble, built in Agra between 1631 and 1648 by order of the Mughal emperor Shah Jahan in memory of his favourite wife, the Taj Mahal is the jewel of Muslim art in India and one of the universally admired masterpieces of the world's heritage.''',\n", 458 | " '''The Statue of Liberty is a 305-foot (93-metre) statue located on Liberty Island in Upper New York Bay, off the coast of New York City. The statue is a personification of liberty in the form of a woman. She holds a torch in her raised right hand and clutches a tablet in her left.''',\n", 459 | " '''The Statue of Liberty was sculpted between 1875 and 1884 under the direction of French sculptor Frédéric-Auguste Bartholdi, who began drafting designs in 1870. Bartholdi and his team hammered roughly 31 tons of copper sheets onto a steel frame. Before being mounted on its current pedestal, the statue stood over 151 feet (46 metres) tall and weighed 225 tons.''',\n", 460 | " '''Badminton is a racquet sport played using racquets to hit a shuttlecock across a net. Although it may be played with larger teams, the most common forms of the game are \"singles\" (with one player per side) and \"doubles\" (with two players per side). 
Badminton is often played as a casual outdoor activity in a yard or on a beach; formal games are played on a rectangular indoor court. Points are scored by striking the shuttlecock with the racquet and landing it within the opposing side's half of the court.''',\n", 461 | " '''James Bond is a fictional character created by novelist Ian Fleming in 1953.''',\n", 462 | " '''A British secret agent working for MI6 under the codename 007, he has been portrayed on film by actors Sean Connery, David Niven, George Lazenby, Roger Moore, Timothy Dalton, Pierce Brosnan and Daniel Craig in twenty-seven productions.'''\n", 463 | "]\n", 464 | "\n", 465 | "# A dictionary to store the document and its feature vector (the vector generated by SBERT)\n", 466 | "doc_and_vec = {}\n", 467 | "\n", 468 | "def giveVector(texts):\n", 469 | " # this function returns the vector using SBERT\n", 470 | " return sbert_model.encode(texts)\n", 471 | "\n", 472 | "vectors = giveVector(documents)\n", 473 | "\n", 474 | "for doc,vec in zip(documents,vectors):\n", 475 | " doc_and_vec[doc] = vec\n", 476 | "\n", 477 | "print(\"vectors formed\")\n", 478 | "\n", 479 | "client.schema.delete_all()\n", 480 | "class_obj = {\n", 481 | " \"class\": \"Post\",\n", 482 | " \"vectorizer\": \"none\", # we are providing the vectors ourselves through our SBERT model, so this field is none\n", 483 | " \"properties\": [{\n", 484 | " \"name\": \"content\",\n", 485 | " \"dataType\": [\"text\"],\n", 486 | " }]\n", 487 | "}\n", 488 | "\n", 489 | "client.schema.create_class(class_obj)\n", 490 | "print(\"Schema class created\")\n", 491 | "\n", 492 | "for doc,vec in doc_and_vec.items():\n", 493 | " data_obj = {\n", 494 | " \"content\": doc\n", 495 | " }\n", 496 | " client.data_object.create(\n", 497 | " data_obj,\n", 498 | " \"Post\",\n", 499 | " generate_uuid('Post',doc),\n", 500 | " vector = vec,\n", 501 | " )\n", 502 | "print(\"Finished importing data\")\n", 503 | "\n", 504 | "def process_query(vec):\n", 505 | " nearVector = 
{\"vector\": vec}\n", 506 | " res = client.query.get(\"Post\", [\"content\", \"_additional {certainty}\"]).with_near_vector(nearVector).do()\n", 507 | " print(res)\n", 508 | " print(\"------------------------------------------------------------------------------------------------\")\n", 509 | " print(\"-----------------------------------Most similar text -------------------------------------------\")\n", 510 | " print(res['data']['Get']['Post'][0]['content'])\n", 511 | " print(\"------------------------------------------------------------------------------------------------\")\n", 512 | " print(res['data']['Get']['Post'][1]['content'])\n", 513 | " print(\"------------------------------------------------------------------------------------------------\")\n", 514 | "\n", 515 | " \n" 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": null, 521 | "id": "065e4517-d430-48c7-89ae-9cd29c3a31f3", 522 | "metadata": {}, 523 | "outputs": [], 524 | "source": [ 525 | "query =\"american tourist destination\"\n", 526 | "query_vec = sbert_model.encode(query)\n", 527 | "process_query(query_vec)\n" 528 | ] 529 | }, 530 | { 531 | "cell_type": "code", 532 | "execution_count": null, 533 | "id": "1a232068-a38a-490b-a329-d5b5e773174b", 534 | "metadata": {}, 535 | "outputs": [], 536 | "source": [] 537 | } 538 | ], 539 | "metadata": { 540 | "environment": { 541 | "kernel": "python3", 542 | "name": "pytorch-gpu.1-12.m99", 543 | "type": "gcloud", 544 | "uri": "gcr.io/deeplearning-platform-release/pytorch-gpu.1-12:m99" 545 | }, 546 | "kernelspec": { 547 | "display_name": "Python 3", 548 | "language": "python", 549 | "name": "python3" 550 | }, 551 | "language_info": { 552 | "codemirror_mode": { 553 | "name": "ipython", 554 | "version": 3 555 | }, 556 | "file_extension": ".py", 557 | "mimetype": "text/x-python", 558 | "name": "python", 559 | "nbconvert_exporter": "python", 560 | "pygments_lexer": "ipython3", 561 | "version": "3.7.12" 562 | } 563 | }, 564 | "nbformat": 
4, 565 | "nbformat_minor": 5 566 | } 567 | -------------------------------------------------------------------------------- /archive/notebooks_stackoverflow/workshop_setup.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "92773bd8-4d3e-47d0-af4f-52216bb43465", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "id": "55d91d10-03a4-46ad-b011-a1cfd22ab1e7", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "from sentence_transformers import SentenceTransformer, CrossEncoder, util\n", 19 | "import os" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "id": "1cd6384a-b5e3-4d82-9a07-35b821524321", 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "id": "9451b070-e18c-46c3-a012-6e2878cc26f4", 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "model = SentenceTransformer('flax-sentence-embeddings/stackoverflow_mpnet-base')\n" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "id": "f414975a-3dc2-4eb6-9b3b-bd24be4d18a4", 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "id": "24f40c4c-a183-4d84-b70e-4a1a86a229ee", 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')\n" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "id": "8983c00d-214d-4df3-b024-154b2105ace5", 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')\n" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "id": 
"1770345e-6eff-4761-bf39-64020967cc51", 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "id": "a8834f31-56cf-4760-b0e6-c86e3b8efd39", 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "wikipedia_filepath = 'simplewiki-2020-11-01.jsonl.gz'\n", 84 | "\n", 85 | "if not os.path.exists(wikipedia_filepath):\n", 86 | " util.http_get('http://sbert.net/datasets/simplewiki-2020-11-01.jsonl.gz', wikipedia_filepath)\n" 87 | ] 88 | } 89 | ], 90 | "metadata": { 91 | "environment": { 92 | "kernel": "python3", 93 | "name": "pytorch-gpu.1-12.m99", 94 | "type": "gcloud", 95 | "uri": "gcr.io/deeplearning-platform-release/pytorch-gpu.1-12:m99" 96 | }, 97 | "kernelspec": { 98 | "display_name": "Python 3", 99 | "language": "python", 100 | "name": "python3" 101 | }, 102 | "language_info": { 103 | "codemirror_mode": { 104 | "name": "ipython", 105 | "version": 3 106 | }, 107 | "file_extension": ".py", 108 | "mimetype": "text/x-python", 109 | "name": "python", 110 | "nbconvert_exporter": "python", 111 | "pygments_lexer": "ipython3", 112 | "version": "3.7.12" 113 | } 114 | }, 115 | "nbformat": 4, 116 | "nbformat_minor": 5 117 | } 118 | -------------------------------------------------------------------------------- /assets/all_assets.sw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/npatta01/search-engine-workshop/d8d4d1e6234f29c3a158b6343b06701728be92ab/assets/all_assets.sw -------------------------------------------------------------------------------- /assets/slides_odsc2022.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/npatta01/search-engine-workshop/d8d4d1e6234f29c3a158b6343b06701728be92ab/assets/slides_odsc2022.pdf -------------------------------------------------------------------------------- /assets/slides_pydatanyc2022.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/npatta01/search-engine-workshop/d8d4d1e6234f29c3a158b6343b06701728be92ab/assets/slides_pydatanyc2022.pdf -------------------------------------------------------------------------------- /assets/slides_pydataseattle2023.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/npatta01/search-engine-workshop/d8d4d1e6234f29c3a158b6343b06701728be92ab/assets/slides_pydataseattle2023.pdf -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: "3.0" 2 | services: 3 | elasticsearch: 4 | container_name: es-container 5 | image: docker.elastic.co/elasticsearch/elasticsearch:8.7.0 6 | environment: 7 | - xpack.security.enabled=false 8 | - "discovery.type=single-node" 9 | ports: 10 | - 9200:9200 11 | volumes: 12 | - esdata:/usr/share/elasticsearch/data 13 | 14 | 15 | # milvus: 16 | # container_name: milvus 17 | # image: milvusdb/milvus:1.1.1-cpu-d061621-330cc6 18 | # ports: 19 | # - 19530:19530 20 | # - 19121:19121 21 | # volumes: 22 | # - milvusdata:/var/lib/milvus 23 | 24 | 25 | # milvus: 26 | # container_name: milvus 27 | # build: 28 | # context: docker_milvus 29 | # ports: 30 | # - 19530:19530 31 | # - 19121:19121 32 | # volumes: 33 | # - milvusdata:/var/lib/milvus 34 | 35 | 36 | # weaviate: 37 | # image: semitechnologies/weaviate:1.14.0 38 | # ports: 39 | # - 8081:8080 40 | # environment: 41 | # QUERY_DEFAULTS_LIMIT: 25 42 | # AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true' 43 | # PERSISTENCE_DATA_PATH: '/var/lib/weaviate' 44 | # DEFAULT_VECTORIZER_MODULE: 'none' 45 | # ENABLE_MODULES: '' 46 | # CLUSTER_HOSTNAME: 'node1' 47 | # volumes: 48 | # - weaviatedata:/var/lib/weaviate 49 | volumes: 50 | esdata: 51 | # weaviatedata: 52 | # milvusdata: 
-------------------------------------------------------------------------------- /docs/internal_notes.md: -------------------------------------------------------------------------------- 1 | # Internal Notes 2 | 3 | ## Setup Dep 4 | 5 | Install other deps 6 | 7 | ```bash 8 | sudo apt update && sudo apt install -y p7zip-full 9 | ``` 10 | 11 | Create conda environment 12 | 13 | ```bash 14 | conda create -n workshop python=3.7 mamba 15 | conda activate workshop 16 | mamba env update -n workshop -f environment.yaml 17 | # mamba install anaconda jupyter ipykernel nb_conda_kernels 18 | 19 | mamba install ipython ipykernel nb_conda_kernels 20 | 21 | ipython kernel install --user --name=workshop 22 | 23 | 24 | conda create --name workshop --clone base 25 | 26 | ``` 27 | 28 | Start ES/ Faiss for local dev 29 | 30 | ```bash 31 | docker-compose up 32 | ``` 33 | 34 | ```bash 35 | docker run --user root -e GRANT_SUDO=yes -it app bash 36 | ``` 37 | 38 | 39 | 40 | ``` 41 | Go to DIR: /projects/search-engine-workshop 42 | Type: docker-compose up 43 | 44 | In the notebooks test... 
checks the milvus and elastic connections 45 | 46 | 47 | ``` 48 | gsutil -m cp -r gs://np-training-tmp/stackoverflow/final* gs://np-public-training-temp/stackoverflow/ 49 | ``` 50 | 51 | ``` 52 | 53 | 54 | 55 | 56 | ``` 57 | zip -r data_processed.zip data/processed/ 58 | 59 | gh release delete v1.0 60 | 61 | gh release create v1.0 'data_processed.zip#Hugging Face Dataset of Unsplashed collection' \ 62 | --title "v1.0" --notes "initial release" 63 | 64 | 65 | ``` 66 | 67 | 68 | 69 | ``` 70 | zip -r /tmp/data.zip data/ 71 | gsutil cp /tmp/data.zip gs://np-public-training-tmp/search-workshop/data.zip 72 | 73 | 74 | ``` -------------------------------------------------------------------------------- /docs/slide_notes.md: -------------------------------------------------------------------------------- 1 | PUT /items 2 | 3 | ```json 4 | { 5 | "mappings": { 6 | "properties": { 7 | "title": { "type": "text" }, 8 | "description": { "type": "text" }, 9 | 10 | "brand": { "type": "keyword" }, 11 | "product_type": { "type": "keyword" }, 12 | 13 | "price": { "type": "double" } 14 | } 15 | } 16 | } 17 | ``` 18 | 19 | Nike shoe under 100$ 20 | 21 | GET /items/_search 22 | 23 | ```json 24 | { 25 | "query": { 26 | 27 | "multi_match": { 28 | "query": "Nike shoe under 100$", 29 | "fields": ["title^2", "Description^1"] 30 | } 31 | 32 | ,"bool": { 33 | "filter": [ 34 | { "term": { "brand": "nike" }} 35 | ] 36 | } 37 | ,"filtered": { 38 | "filter": { 39 | "range": { 40 | "price" : { "lte": 100 } 41 | } 42 | } 43 | } 44 | } 45 | 46 | ``` 47 | 48 | 49 | 50 | ## PR curve 51 | ``` 52 | Recall Perfect Classifier Baseline Classifier Good Classifier High Precision 53 | 0.1 0.95 0.5 0.9 0.91 54 | 0.2 0.95 0.5 0.85 0.91 55 | 0.3 0.95 0.5 0.85 0.91 56 | 0.4 0.95 0.5 0.8 0.9 57 | 0.5 0.95 0.5 0.8 0.4 58 | 0.6 0.95 0.5 0.8 0.4 59 | 0.7 0.95 0.5 0.8 0.4 60 | 0.8 0.95 0.5 0.8 0.2 61 | 0.9 0.95 0.5 0.7 0.2 62 | 1 0.9 0.5 0.2 0.1 63 | ``` 64 | 65 | 66 | 67 | 68 | dcg 69 | 70 | ``` 71 | Discounted\space 
Cumulative\space Gain 72 | = \sum_{i=1}^{p}\frac{ relevance (i)}{log_{2}(i+1)} 73 | 74 | 75 | \\ 76 | DCG = {\color{Green}\frac{3}{log_{2}(2)} } + \frac{1}{log_{2}(3)} + {\color{Red}\frac{0}{log_{2}(4)} }+\frac{2}{log_{2}(5)} = 4.49 77 | 78 | \\ 79 | 80 | (Ideal)\space DCG = {\color{Green}\frac{3}{log_{2}(2)} } + \frac{2}{log_{2}(3)} + \frac{1}{log_{2}(4)} + {\color{Red}\frac{0}{log_{2}(5)} } = 4.76 81 | 82 | \\ 83 | Normalized\space Discounted\space Cumulative\space Gain 84 | = \frac{ DCG}{Ideal\space DCG} = \frac{4.49}{4.76} \approx 0.94 85 | ``` -------------------------------------------------------------------------------- /environment.yaml: -------------------------------------------------------------------------------- 1 | #name: workshop 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python==3.7.* 6 | - pip 7 | - mamba 8 | - nb_conda_kernels 9 | - pyarrow==9.0.* 10 | - lxml==4.9.* 11 | - pip: 12 | - -r requirements.txt 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /notebooks/04_ann.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "d6298118-b5f8-4250-bd82-e2a3787914ca", 6 | "metadata": {}, 7 | "source": [ 8 | "# Benchmarking Approximate Nearest Neighbors" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "c2af1fec-6519-40d0-8826-f201d0acba0b", 14 | "metadata": {}, 15 | "source": [ 16 | "# About" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "id": "6a45ed78-348b-4193-8870-cbcbaff240e1", 22 | "metadata": {}, 23 | "source": [ 24 | "In order for embedding retrieval to work at scale, we need to use a vector database.\n", 25 | "We also need to use Approximate Nearest Neighbor search instead of brute force.\n", 26 | "\n", 27 | "\n", 28 | "In this notebook, we will use [FAISS](https://github.com/facebookresearch/faiss), a library from Facebook.\n", 29 | "\n", 30 | "We will compare a brute-force index with the speedup gained from `IVF`.\n", 31 | "\n", 32 
| "For a more detailed comparision, take a look here to find other solutions and benchmark data.\n", 33 | "\n", 34 | "\n", 35 | "We will look at `performance` and `recall@1`" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "id": "e45e627d-9c8c-4f19-ab91-0e64ed8677d7", 41 | "metadata": {}, 42 | "source": [ 43 | "# Setup" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 1, 49 | "id": "62298ef5-fa20-4164-8ec7-e7f43bf85c20", 50 | "metadata": { 51 | "execution": { 52 | "iopub.execute_input": "2023-04-26T14:52:36.834089Z", 53 | "iopub.status.busy": "2023-04-26T14:52:36.833681Z", 54 | "iopub.status.idle": "2023-04-26T14:52:37.774946Z", 55 | "shell.execute_reply": "2023-04-26T14:52:37.774030Z", 56 | "shell.execute_reply.started": "2023-04-26T14:52:36.834036Z" 57 | } 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "from pathlib import Path\n", 62 | "import numpy as np\n", 63 | "import pandas as pd\n", 64 | "import faiss\n", 65 | "import datasets" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "id": "6b9a73d2-aa07-46ef-b94d-8780ca9ecb68", 71 | "metadata": {}, 72 | "source": [ 73 | "## Load the embeddings of the image corpus" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 2, 79 | "id": "74c3f8aa-cb27-4ae8-a9a8-0060001d357c", 80 | "metadata": { 81 | "execution": { 82 | "iopub.execute_input": "2023-04-26T14:52:37.778006Z", 83 | "iopub.status.busy": "2023-04-26T14:52:37.776915Z", 84 | "iopub.status.idle": "2023-04-26T14:52:44.572677Z", 85 | "shell.execute_reply": "2023-04-26T14:52:44.571814Z", 86 | "shell.execute_reply.started": "2023-04-26T14:52:37.777973Z" 87 | } 88 | }, 89 | "outputs": [], 90 | "source": [ 91 | "dset = datasets.load_from_disk(\"../data/processed_embeddings\")\n", 92 | "## these embeddings will be used to create the search space.\n", 93 | "corpus = dset['embeddings']\n", 94 | "\n", 95 | "\n", 96 | "corpus = np.array(corpus).astype('float32')\n", 97 | "corpus = np.unique(corpus, axis=0)" 98 | 
] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 3, 103 | "id": "94eeb2f5-88ff-428f-8e9c-5014234427b8", 104 | "metadata": { 105 | "execution": { 106 | "iopub.execute_input": "2023-04-26T14:52:44.574012Z", 107 | "iopub.status.busy": "2023-04-26T14:52:44.573742Z", 108 | "iopub.status.idle": "2023-04-26T14:52:44.580175Z", 109 | "shell.execute_reply": "2023-04-26T14:52:44.579367Z", 110 | "shell.execute_reply.started": "2023-04-26T14:52:44.573987Z" 111 | } 112 | }, 113 | "outputs": [ 114 | { 115 | "data": { 116 | "text/plain": [ 117 | "(24954, 512)" 118 | ] 119 | }, 120 | "execution_count": 3, 121 | "metadata": {}, 122 | "output_type": "execute_result" 123 | } 124 | ], 125 | "source": [ 126 | "corpus.shape" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 4, 132 | "id": "939108fa-daf9-472c-b026-19c6d9708a77", 133 | "metadata": { 134 | "execution": { 135 | "iopub.execute_input": "2023-04-26T14:52:44.581364Z", 136 | "iopub.status.busy": "2023-04-26T14:52:44.581116Z", 137 | "iopub.status.idle": "2023-04-26T14:52:44.590669Z", 138 | "shell.execute_reply": "2023-04-26T14:52:44.589884Z", 139 | "shell.execute_reply.started": "2023-04-26T14:52:44.581340Z" 140 | } 141 | }, 142 | "outputs": [ 143 | { 144 | "data": { 145 | "text/plain": [ 146 | "array([[-0.08344752, 0.01604629, 0.03037108, ..., 0.03962855,\n", 147 | " -0.02023211, -0.01102281],\n", 148 | " [-0.07890625, 0.02533851, 0.00522987, ..., 0.02622218,\n", 149 | " -0.05418065, -0.00765004],\n", 150 | " [-0.0781679 , 0.03937826, -0.01087696, ..., 0.04282334,\n", 151 | " -0.02091636, -0.01027698],\n", 152 | " ...,\n", 153 | " [ 0.0878398 , 0.01232621, 0.00077178, ..., -0.00705758,\n", 154 | " 0.01574707, -0.01541145],\n", 155 | " [ 0.0882502 , 0.03615745, -0.00961868, ..., 0.01392467,\n", 156 | " 0.00077467, -0.02139922],\n", 157 | " [ 0.09195283, 0.04004925, -0.00255262, ..., 0.0036222 ,\n", 158 | " -0.0181689 , -0.04212729]], dtype=float32)" 159 | ] 160 | }, 161 | 
"execution_count": 4, 162 | "metadata": {}, 163 | "output_type": "execute_result" 164 | } 165 | ], 166 | "source": [ 167 | "corpus" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "id": "63581c47-4d2a-4106-883f-3f42c9070e99", 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 5, 181 | "id": "69bf8642-772f-4996-b3c7-910a1f38b1b2", 182 | "metadata": { 183 | "execution": { 184 | "iopub.execute_input": "2023-04-26T14:52:44.592214Z", 185 | "iopub.status.busy": "2023-04-26T14:52:44.591565Z", 186 | "iopub.status.idle": "2023-04-26T14:52:44.600591Z", 187 | "shell.execute_reply": "2023-04-26T14:52:44.599816Z", 188 | "shell.execute_reply.started": "2023-04-26T14:52:44.592163Z" 189 | } 190 | }, 191 | "outputs": [ 192 | { 193 | "data": { 194 | "text/plain": [ 195 | "512" 196 | ] 197 | }, 198 | "execution_count": 5, 199 | "metadata": {}, 200 | "output_type": "execute_result" 201 | } 202 | ], 203 | "source": [ 204 | "dimension = corpus.shape[-1]\n", 205 | "dimension" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "id": "b5fbf481-46ac-45d3-b277-e9e19185c214", 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "id": "06a59231-abcf-45f9-a69a-69149e73c2f8", 219 | "metadata": { 220 | "tags": [] 221 | }, 222 | "source": [ 223 | "# Flat Index / Brute Force\n" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "id": "f2b9e1b9-c75b-4700-a3c9-cb2b8ecab451", 229 | "metadata": {}, 230 | "source": [ 231 | "FAISS supports a bruteforce index. \n", 232 | "This index is good if you want perfect recall. \n", 233 | "It requires all the data to be fit in memory. 
" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "id": "926f4977-2ee0-48bd-8f50-f13e6ed82897", 239 | "metadata": {}, 240 | "source": [ 241 | "## Create the index" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 6, 247 | "id": "a447b05c-f30d-4d43-a6ed-fd29a99477a2", 248 | "metadata": { 249 | "execution": { 250 | "iopub.execute_input": "2023-04-26T14:52:44.601898Z", 251 | "iopub.status.busy": "2023-04-26T14:52:44.601570Z", 252 | "iopub.status.idle": "2023-04-26T14:52:44.609907Z", 253 | "shell.execute_reply": "2023-04-26T14:52:44.609154Z", 254 | "shell.execute_reply.started": "2023-04-26T14:52:44.601873Z" 255 | } 256 | }, 257 | "outputs": [], 258 | "source": [ 259 | "x_corpus = corpus\n", 260 | "x_corpus.shape\n", 261 | "dimension = x_corpus.shape[-1]" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "id": "cb28c78c-3156-4da9-b5c1-3b192cb4c70f", 267 | "metadata": {}, 268 | "source": [ 269 | "initialize the flat index for data dimension. 
\n", 270 | "In current example it is 512\n" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 7, 276 | "id": "5bf3d65c-c435-407a-a932-8cdd9655ff5a", 277 | "metadata": { 278 | "execution": { 279 | "iopub.execute_input": "2023-04-26T14:52:44.612718Z", 280 | "iopub.status.busy": "2023-04-26T14:52:44.612384Z", 281 | "iopub.status.idle": "2023-04-26T14:52:44.664297Z", 282 | "shell.execute_reply": "2023-04-26T14:52:44.663442Z", 283 | "shell.execute_reply.started": "2023-04-26T14:52:44.612692Z" 284 | } 285 | }, 286 | "outputs": [], 287 | "source": [ 288 | "index = faiss.IndexFlatL2(dimension)" 289 | ] 290 | }, 291 | { 292 | "cell_type": "markdown", 293 | "id": "c121ab02-4fe3-4012-af73-038ae78f872e", 294 | "metadata": {}, 295 | "source": [ 296 | "since it is a brute force index, there is no \"training\" or parameters to learn" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 8, 302 | "id": "3c2d2c60-0afb-44cf-9cf6-5032811725a7", 303 | "metadata": { 304 | "execution": { 305 | "iopub.execute_input": "2023-04-26T14:52:44.666136Z", 306 | "iopub.status.busy": "2023-04-26T14:52:44.665598Z", 307 | "iopub.status.idle": "2023-04-26T14:52:44.670880Z", 308 | "shell.execute_reply": "2023-04-26T14:52:44.670173Z", 309 | "shell.execute_reply.started": "2023-04-26T14:52:44.666092Z" 310 | } 311 | }, 312 | "outputs": [ 313 | { 314 | "data": { 315 | "text/plain": [ 316 | "True" 317 | ] 318 | }, 319 | "execution_count": 8, 320 | "metadata": {}, 321 | "output_type": "execute_result" 322 | } 323 | ], 324 | "source": [ 325 | "index.is_trained\n" 326 | ] 327 | }, 328 | { 329 | "cell_type": "markdown", 330 | "id": "a547440c-fd96-4ce5-9996-0210f00617a7", 331 | "metadata": {}, 332 | "source": [ 333 | "add data to the index. This is a CPU based index." 
334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 9, 339 | "id": "7aa739ee-42e1-4ac5-b1d8-9876ec777129", 340 | "metadata": { 341 | "execution": { 342 | "iopub.execute_input": "2023-04-26T14:52:44.672070Z", 343 | "iopub.status.busy": "2023-04-26T14:52:44.671820Z", 344 | "iopub.status.idle": "2023-04-26T14:52:44.747616Z", 345 | "shell.execute_reply": "2023-04-26T14:52:44.746751Z", 346 | "shell.execute_reply.started": "2023-04-26T14:52:44.672047Z" 347 | }, 348 | "tags": [] 349 | }, 350 | "outputs": [], 351 | "source": [ 352 | "index.add(x_corpus) " 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": 10, 358 | "id": "ab9e1b45-1cf2-4e88-960c-6999ad312e22", 359 | "metadata": { 360 | "execution": { 361 | "iopub.execute_input": "2023-04-26T14:52:44.749140Z", 362 | "iopub.status.busy": "2023-04-26T14:52:44.748763Z", 363 | "iopub.status.idle": "2023-04-26T14:52:44.754419Z", 364 | "shell.execute_reply": "2023-04-26T14:52:44.753707Z", 365 | "shell.execute_reply.started": "2023-04-26T14:52:44.749112Z" 366 | } 367 | }, 368 | "outputs": [ 369 | { 370 | "data": { 371 | "text/plain": [ 372 | "24954" 373 | ] 374 | }, 375 | "execution_count": 10, 376 | "metadata": {}, 377 | "output_type": "execute_result" 378 | } 379 | ], 380 | "source": [ 381 | "len(x_corpus)" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "id": "bfabea5d-26bd-45d8-8b7a-97f179bc4013", 388 | "metadata": {}, 389 | "outputs": [], 390 | "source": [] 391 | }, 392 | { 393 | "cell_type": "markdown", 394 | "id": "348f697d-db43-4093-aa4e-4858d4c058f0", 395 | "metadata": {}, 396 | "source": [ 397 | "number of vectors / results to retrieve" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": 11, 403 | "id": "6200be29-b926-4847-9e42-daf90d99319d", 404 | "metadata": { 405 | "execution": { 406 | "iopub.execute_input": "2023-04-26T14:52:44.755735Z", 407 | "iopub.status.busy": "2023-04-26T14:52:44.755402Z", 408 | 
"iopub.status.idle": "2023-04-26T14:52:44.764114Z", 409 | "shell.execute_reply": "2023-04-26T14:52:44.763389Z", 410 | "shell.execute_reply.started": "2023-04-26T14:52:44.755710Z" 411 | } 412 | }, 413 | "outputs": [], 414 | "source": [ 415 | "k =1" 416 | ] 417 | }, 418 | { 419 | "cell_type": "markdown", 420 | "id": "d57e8258-b7fa-49cb-a9f7-52a62a0dda17", 421 | "metadata": {}, 422 | "source": [ 423 | "#### Index Search\n", 424 | "search method returns query indices (I) similar to search query vector and their euclidean distances (D) from the search query vector." 425 | ] 426 | }, 427 | { 428 | "cell_type": "markdown", 429 | "id": "87ea66c3-ebda-4600-b5d8-34ad796cc7c2", 430 | "metadata": {}, 431 | "source": [ 432 | "search for single vector and get top 1 result" 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": 12, 438 | "id": "306e3a89-331c-4253-b7cf-ce7a0d951b42", 439 | "metadata": { 440 | "execution": { 441 | "iopub.execute_input": "2023-04-26T14:52:44.765288Z", 442 | "iopub.status.busy": "2023-04-26T14:52:44.765050Z", 443 | "iopub.status.idle": "2023-04-26T14:52:48.341486Z", 444 | "shell.execute_reply": "2023-04-26T14:52:48.340493Z", 445 | "shell.execute_reply.started": "2023-04-26T14:52:44.765265Z" 446 | } 447 | }, 448 | "outputs": [ 449 | { 450 | "name": "stdout", 451 | "output_type": "stream", 452 | "text": [ 453 | "4.38 ms ± 39.5 µs per loop (mean ± std. dev. 
of 7 runs, 100 loops each)\n" 454 | ] 455 | } 456 | ], 457 | "source": [ 458 | "%%timeit\n", 459 | "D, I = index.search(x_corpus[:1], k=1) " 460 | ] 461 | }, 462 | { 463 | "cell_type": "markdown", 464 | "id": "a2382cf6-2718-4240-b805-c96195af51f0", 465 | "metadata": {}, 466 | "source": [ 467 | "search for all vectors in corpus and get top 1 result" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": 13, 473 | "id": "badcb20e-f872-4763-baf5-1876d5dd617d", 474 | "metadata": { 475 | "execution": { 476 | "iopub.execute_input": "2023-04-26T14:52:48.342963Z", 477 | "iopub.status.busy": "2023-04-26T14:52:48.342590Z", 478 | "iopub.status.idle": "2023-04-26T14:52:58.653601Z", 479 | "shell.execute_reply": "2023-04-26T14:52:58.652643Z", 480 | "shell.execute_reply.started": "2023-04-26T14:52:48.342935Z" 481 | }, 482 | "tags": [] 483 | }, 484 | "outputs": [ 485 | { 486 | "name": "stdout", 487 | "output_type": "stream", 488 | "text": [ 489 | "CPU times: user 30.3 s, sys: 8.92 ms, total: 30.3 s\n", 490 | "Wall time: 10.3 s\n" 491 | ] 492 | } 493 | ], 494 | "source": [ 495 | "%%time\n", 496 | "D, I = index.search(x_corpus, k=1) " 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": null, 502 | "id": "acf3d26f-119c-4f5c-8c24-fd28e3fccdfc", 503 | "metadata": {}, 504 | "outputs": [], 505 | "source": [] 506 | }, 507 | { 508 | "cell_type": "markdown", 509 | "id": "1eac1d91-007d-4143-b0a3-ae19c763cc6c", 510 | "metadata": {}, 511 | "source": [ 512 | "distance of vector in corpus to query vector" 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": 14, 518 | "id": "0b555673-cb87-4b11-aa81-ed2ff69d513e", 519 | "metadata": { 520 | "execution": { 521 | "iopub.execute_input": "2023-04-26T14:52:58.660044Z", 522 | "iopub.status.busy": "2023-04-26T14:52:58.657749Z", 523 | "iopub.status.idle": "2023-04-26T14:52:58.667064Z", 524 | "shell.execute_reply": "2023-04-26T14:52:58.666198Z", 525 | "shell.execute_reply.started": 
"2023-04-26T14:52:58.660006Z" 526 | } 527 | }, 528 | "outputs": [ 529 | { 530 | "data": { 531 | "text/plain": [ 532 | "array([[0.0000000e+00],\n", 533 | " [0.0000000e+00],\n", 534 | " [3.5762787e-07],\n", 535 | " ...,\n", 536 | " [0.0000000e+00],\n", 537 | " [1.3113022e-06],\n", 538 | " [7.1525574e-07]], dtype=float32)" 539 | ] 540 | }, 541 | "execution_count": 14, 542 | "metadata": {}, 543 | "output_type": "execute_result" 544 | } 545 | ], 546 | "source": [ 547 | "D" 548 | ] 549 | }, 550 | { 551 | "cell_type": "markdown", 552 | "id": "c4d271c3-850a-4375-87a2-100ffab7a416", 553 | "metadata": {}, 554 | "source": [ 555 | "top vertex id \n", 556 | "\n" 557 | ] 558 | }, 559 | { 560 | "cell_type": "code", 561 | "execution_count": 15, 562 | "id": "65310b96-aab0-4a34-a9eb-48d0dbefc0ec", 563 | "metadata": { 564 | "execution": { 565 | "iopub.execute_input": "2023-04-26T14:52:58.668403Z", 566 | "iopub.status.busy": "2023-04-26T14:52:58.668046Z", 567 | "iopub.status.idle": "2023-04-26T14:52:58.690783Z", 568 | "shell.execute_reply": "2023-04-26T14:52:58.689783Z", 569 | "shell.execute_reply.started": "2023-04-26T14:52:58.668375Z" 570 | } 571 | }, 572 | "outputs": [ 573 | { 574 | "data": { 575 | "text/plain": [ 576 | "array([[ 0],\n", 577 | " [ 1],\n", 578 | " [ 2],\n", 579 | " ...,\n", 580 | " [24951],\n", 581 | " [24952],\n", 582 | " [24953]])" 583 | ] 584 | }, 585 | "execution_count": 15, 586 | "metadata": {}, 587 | "output_type": "execute_result" 588 | } 589 | ], 590 | "source": [ 591 | "I" 592 | ] 593 | }, 594 | { 595 | "cell_type": "markdown", 596 | "id": "90283075-8783-4607-bf28-e5ce9f55c08c", 597 | "metadata": {}, 598 | "source": [ 599 | "because we are using the entire corpus and the ids are sequential, the ideal recall would be sequential too" 600 | ] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "execution_count": 16, 605 | "id": "b3fb40a2-dd43-4676-a766-3d198943f957", 606 | "metadata": { 607 | "execution": { 608 | "iopub.execute_input": 
"2023-04-26T14:52:58.692358Z", 609 | "iopub.status.busy": "2023-04-26T14:52:58.691962Z", 610 | "iopub.status.idle": "2023-04-26T14:52:58.703731Z", 611 | "shell.execute_reply": "2023-04-26T14:52:58.702726Z", 612 | "shell.execute_reply.started": "2023-04-26T14:52:58.692330Z" 613 | } 614 | }, 615 | "outputs": [ 616 | { 617 | "data": { 618 | "text/plain": [ 619 | "array([ True, True, True, ..., True, True, True])" 620 | ] 621 | }, 622 | "execution_count": 16, 623 | "metadata": {}, 624 | "output_type": "execute_result" 625 | } 626 | ], 627 | "source": [ 628 | "res = I[:,0] == np.array( list(range(len(x_corpus))))\n", 629 | "res" 630 | ] 631 | }, 632 | { 633 | "cell_type": "code", 634 | "execution_count": 17, 635 | "id": "27641884-4563-4c1a-9d37-1b8e61ee5322", 636 | "metadata": { 637 | "execution": { 638 | "iopub.execute_input": "2023-04-26T14:52:58.705103Z", 639 | "iopub.status.busy": "2023-04-26T14:52:58.704836Z", 640 | "iopub.status.idle": "2023-04-26T14:52:58.713367Z", 641 | "shell.execute_reply": "2023-04-26T14:52:58.712403Z", 642 | "shell.execute_reply.started": "2023-04-26T14:52:58.705078Z" 643 | } 644 | }, 645 | "outputs": [ 646 | { 647 | "data": { 648 | "text/plain": [ 649 | "(array([], dtype=int64),)" 650 | ] 651 | }, 652 | "execution_count": 17, 653 | "metadata": {}, 654 | "output_type": "execute_result" 655 | } 656 | ], 657 | "source": [ 658 | "np.where(res == False)" 659 | ] 660 | }, 661 | { 662 | "cell_type": "code", 663 | "execution_count": 18, 664 | "id": "ec2181ae-7a70-4a3a-bc34-bd9899168fe5", 665 | "metadata": { 666 | "execution": { 667 | "iopub.execute_input": "2023-04-26T14:52:58.714883Z", 668 | "iopub.status.busy": "2023-04-26T14:52:58.714532Z", 669 | "iopub.status.idle": "2023-04-26T14:52:58.726900Z", 670 | "shell.execute_reply": "2023-04-26T14:52:58.725900Z", 671 | "shell.execute_reply.started": "2023-04-26T14:52:58.714856Z" 672 | } 673 | }, 674 | "outputs": [ 675 | { 676 | "data": { 677 | "text/plain": [ 678 | "{'recall@1': 24954, 'num_vectors': 
24954, 'mismatch': 0}" 679 | ] 680 | }, 681 | "execution_count": 18, 682 | "metadata": {}, 683 | "output_type": "execute_result" 684 | } 685 | ], 686 | "source": [ 687 | "{\n", 688 | " \"recall@1\": res.sum()\n", 689 | " , \"num_vectors\": len(res)\n", 690 | " , \"mismatch\": len(res) - res.sum()\n", 691 | "}\n" 692 | ] 693 | }, 694 | { 695 | "cell_type": "markdown", 696 | "id": "f52a636e-041b-43e6-9b94-5c31447f31cc", 697 | "metadata": { 698 | "execution": { 699 | "iopub.execute_input": "2023-04-26T00:02:55.003678Z", 700 | "iopub.status.busy": "2023-04-26T00:02:55.002818Z", 701 | "iopub.status.idle": "2023-04-26T00:02:55.010824Z", 702 | "shell.execute_reply": "2023-04-26T00:02:55.010099Z", 703 | "shell.execute_reply.started": "2023-04-26T00:02:55.003640Z" 704 | } 705 | }, 706 | "source": [ 707 | "For this corpus, we are able to find the query vector as position 1" 708 | ] 709 | }, 710 | { 711 | "cell_type": "code", 712 | "execution_count": null, 713 | "id": "e88e34b6-f9e0-4835-90cc-ba0e6b2c0414", 714 | "metadata": {}, 715 | "outputs": [], 716 | "source": [] 717 | }, 718 | { 719 | "cell_type": "code", 720 | "execution_count": null, 721 | "id": "02bac14d-9696-4a49-be12-9541beeb45a2", 722 | "metadata": {}, 723 | "outputs": [], 724 | "source": [] 725 | }, 726 | { 727 | "cell_type": "markdown", 728 | "id": "7e79d4fc-8191-4151-b330-01b41a2b05d4", 729 | "metadata": {}, 730 | "source": [ 731 | "# FAISS IVF" 732 | ] 733 | }, 734 | { 735 | "cell_type": "markdown", 736 | "id": "2c6f3e4c-2cad-461c-a8b7-5ffba0a5b354", 737 | "metadata": {}, 738 | "source": [ 739 | "\n", 740 | "\n", 741 | "

Image from Pinecone Faiss Tutorial

\n", 742 | "https://www.pinecone.io/learn/faiss-tutorial/\n", 743 | "\n", 744 | "\n", 745 | "**Parameters**:\n", 746 | "- nlist : number of clusters\n", 747 | "- nprobe: number of clusters to search" 748 | ] 749 | }, 750 | { 751 | "cell_type": "code", 752 | "execution_count": 19, 753 | "id": "342966dc-d361-4fec-8ebe-c3c67864736e", 754 | "metadata": { 755 | "execution": { 756 | "iopub.execute_input": "2023-04-26T14:52:58.728467Z", 757 | "iopub.status.busy": "2023-04-26T14:52:58.728210Z", 758 | "iopub.status.idle": "2023-04-26T14:52:58.785614Z", 759 | "shell.execute_reply": "2023-04-26T14:52:58.784521Z", 760 | "shell.execute_reply.started": "2023-04-26T14:52:58.728443Z" 761 | }, 762 | "tags": [] 763 | }, 764 | "outputs": [], 765 | "source": [ 766 | "nlist = 20 # number of clusters\n", 767 | "quantizer = faiss.IndexFlatL2(dimension) # the other index\n", 768 | "index = faiss.IndexIVFFlat(quantizer, dimension, nlist, faiss.METRIC_L2)" 769 | ] 770 | }, 771 | { 772 | "cell_type": "code", 773 | "execution_count": 20, 774 | "id": "fec38a91-8c3c-40fe-935d-a160474a7e4f", 775 | "metadata": { 776 | "execution": { 777 | "iopub.execute_input": "2023-04-26T14:52:58.787144Z", 778 | "iopub.status.busy": "2023-04-26T14:52:58.786846Z", 779 | "iopub.status.idle": "2023-04-26T14:52:58.883952Z", 780 | "shell.execute_reply": "2023-04-26T14:52:58.882788Z", 781 | "shell.execute_reply.started": "2023-04-26T14:52:58.787117Z" 782 | }, 783 | "tags": [] 784 | }, 785 | "outputs": [], 786 | "source": [ 787 | "assert not index.is_trained\n", 788 | "index.train(x_corpus)\n", 789 | "assert index.is_trained" 790 | ] 791 | }, 792 | { 793 | "cell_type": "code", 794 | "execution_count": 21, 795 | "id": "6b577199-2e7b-439f-93a4-2653c7545eef", 796 | "metadata": { 797 | "execution": { 798 | "iopub.execute_input": "2023-04-26T14:52:58.885895Z", 799 | "iopub.status.busy": "2023-04-26T14:52:58.885561Z", 800 | "iopub.status.idle": "2023-04-26T14:52:58.946474Z", 801 | "shell.execute_reply": 
"2023-04-26T14:52:58.945447Z", 802 | "shell.execute_reply.started": "2023-04-26T14:52:58.885865Z" 803 | } 804 | }, 805 | "outputs": [], 806 | "source": [ 807 | "index.add(x_corpus) " 808 | ] 809 | }, 810 | { 811 | "cell_type": "markdown", 812 | "id": "55f7c7b3-7298-4e53-b1e0-2c3a0f568579", 813 | "metadata": {}, 814 | "source": [ 815 | "we need to train the index first with a sample of vectors before indexing" 816 | ] 817 | }, 818 | { 819 | "cell_type": "code", 820 | "execution_count": null, 821 | "id": "7a50b8c1-ed8b-45ac-9f19-4325f42b265a", 822 | "metadata": {}, 823 | "outputs": [], 824 | "source": [] 825 | }, 826 | { 827 | "cell_type": "markdown", 828 | "id": "f8d0824c-0712-46fb-a5d4-16b32ee695c0", 829 | "metadata": {}, 830 | "source": [ 831 | "search for single vector" 832 | ] 833 | }, 834 | { 835 | "cell_type": "code", 836 | "execution_count": 22, 837 | "id": "451988b2-0e03-4e12-92e2-8df408094526", 838 | "metadata": { 839 | "execution": { 840 | "iopub.execute_input": "2023-04-26T14:52:58.956643Z", 841 | "iopub.status.busy": "2023-04-26T14:52:58.953741Z", 842 | "iopub.status.idle": "2023-04-26T14:53:08.215103Z", 843 | "shell.execute_reply": "2023-04-26T14:53:08.214024Z", 844 | "shell.execute_reply.started": "2023-04-26T14:52:58.956606Z" 845 | }, 846 | "tags": [] 847 | }, 848 | "outputs": [ 849 | { 850 | "name": "stdout", 851 | "output_type": "stream", 852 | "text": [ 853 | "114 µs ± 729 ns per loop (mean ± std. dev. 
of 7 runs, 10000 loops each)\n" 854 | ] 855 | } 856 | ], 857 | "source": [ 858 | "%%timeit\n", 859 | "\n", 860 | "index.nprobe = 1 # default nprobe is 1\n", 861 | "\n", 862 | "D, I = index.search(x_corpus[:1], k) # actual search" 863 | ] 864 | }, 865 | { 866 | "cell_type": "markdown", 867 | "id": "28e0a32e-5380-48da-9022-0be90db13e75", 868 | "metadata": { 869 | "execution": { 870 | "iopub.execute_input": "2023-04-26T00:07:36.782792Z", 871 | "iopub.status.busy": "2023-04-26T00:07:36.782538Z", 872 | "iopub.status.idle": "2023-04-26T00:07:36.788109Z", 873 | "shell.execute_reply": "2023-04-26T00:07:36.786807Z", 874 | "shell.execute_reply.started": "2023-04-26T00:07:36.782768Z" 875 | } 876 | }, 877 | "source": [ 878 | "in the above, we are only querying 1/20 of the search space" 879 | ] 880 | }, 881 | { 882 | "cell_type": "code", 883 | "execution_count": null, 884 | "id": "f7547048-000e-4f32-b1c9-fccfd83b3918", 885 | "metadata": {}, 886 | "outputs": [], 887 | "source": [] 888 | }, 889 | { 890 | "cell_type": "code", 891 | "execution_count": 23, 892 | "id": "27ef12d6-d585-4be3-9f5f-8b35ee192c38", 893 | "metadata": { 894 | "execution": { 895 | "iopub.execute_input": "2023-04-26T14:53:08.216344Z", 896 | "iopub.status.busy": "2023-04-26T14:53:08.216081Z", 897 | "iopub.status.idle": "2023-04-26T14:53:20.757164Z", 898 | "shell.execute_reply": "2023-04-26T14:53:20.755944Z", 899 | "shell.execute_reply.started": "2023-04-26T14:53:08.216319Z" 900 | }, 901 | "tags": [] 902 | }, 903 | "outputs": [ 904 | { 905 | "name": "stdout", 906 | "output_type": "stream", 907 | "text": [ 908 | "1.55 ms ± 13.8 µs per loop (mean ± std. dev. 
of 7 runs, 1000 loops each)\n" 909 | ] 910 | } 911 | ], 912 | "source": [ 913 | "%%timeit\n", 914 | "\n", 915 | "\n", 916 | "index.nprobe = 10 # probe 10 of the 20 clusters (default nprobe is 1)\n", 917 | "\n", 918 | "D, I = index.search(x_corpus[:1], k) # actual search" 919 | ] 920 | }, 921 | { 922 | "cell_type": "markdown", 923 | "id": "5e98f114-4d94-4b59-84f2-73232fc834da", 924 | "metadata": { 925 | "execution": { 926 | "iopub.status.busy": "2023-04-26T00:07:36.790059Z", 927 | "iopub.status.idle": "2023-04-26T00:07:36.790366Z", 928 | "shell.execute_reply": "2023-04-26T00:07:36.790209Z", 929 | "shell.execute_reply.started": "2023-04-26T00:07:36.790195Z" 930 | } 931 | }, 932 | "source": [ 933 | "In the above, we are querying only half of the search space (10 of the 20 clusters)." 934 | ] 935 | }, 936 | { 937 | "cell_type": "code", 938 | "execution_count": null, 939 | "id": "c1045fcc-358f-4216-a00e-e0db6e1811a4", 940 | "metadata": {}, 941 | "outputs": [], 942 | "source": [] 943 | }, 944 | { 945 | "cell_type": "code", 946 | "execution_count": null, 947 | "id": "50e83d62-b203-4048-95df-74e61fc2aa0c", 948 | "metadata": {}, 949 | "outputs": [], 950 | "source": [] 951 | }, 952 | { 953 | "cell_type": "code", 954 | "execution_count": 24, 955 | "id": "2a140a52-213a-4da1-8077-8be794a36f30", 956 | "metadata": { 957 | "execution": { 958 | "iopub.execute_input": "2023-04-26T14:53:20.758823Z", 959 | "iopub.status.busy": "2023-04-26T14:53:20.758438Z", 960 | "iopub.status.idle": "2023-04-26T14:53:24.717603Z", 961 | "shell.execute_reply": "2023-04-26T14:53:24.716457Z", 962 | "shell.execute_reply.started": "2023-04-26T14:53:20.758795Z" 963 | } 964 | }, 965 | "outputs": [ 966 | { 967 | "name": "stdout", 968 | "output_type": "stream", 969 | "text": [ 970 | "4.88 ms ± 57.5 µs per loop (mean ± std. dev. 
of 7 runs, 100 loops each)\n" 971 | ] 972 | } 973 | ], 974 | "source": [ 975 | "%%timeit\n", 976 | "\n", 977 | "\n", 978 | "index.nprobe = 20 # probe all 20 clusters (default nprobe is 1)\n", 979 | "\n", 980 | "D, I = index.search(x_corpus[:1], k) # actual search" 981 | ] 982 | }, 983 | { 984 | "cell_type": "markdown", 985 | "id": "971f14de-ec0c-4ac7-b63a-135ec7444834", 986 | "metadata": { 987 | "execution": { 988 | "iopub.status.busy": "2023-04-26T00:07:36.792692Z", 989 | "iopub.status.idle": "2023-04-26T00:07:36.792990Z", 990 | "shell.execute_reply": "2023-04-26T00:07:36.792856Z", 991 | "shell.execute_reply.started": "2023-04-26T00:07:36.792842Z" 992 | } 993 | }, 994 | "source": [ 995 | "In the above, we are querying the entire search space (nprobe equals nlist), so the search is equivalent to an exhaustive brute-force search." 996 | ] 997 | }, 998 | { 999 | "cell_type": "code", 1000 | "execution_count": null, 1001 | "id": "ec6f1eb2-a9cd-47d9-86bc-5c58ed5dbb1d", 1002 | "metadata": {}, 1003 | "outputs": [], 1004 | "source": [] 1005 | }, 1006 | { 1007 | "cell_type": "code", 1008 | "execution_count": null, 1009 | "id": "fb11b2cd-eef5-4b15-a719-e9c5cbe0fb4f", 1010 | "metadata": {}, 1011 | "outputs": [], 1012 | "source": [] 1013 | }, 1014 | { 1015 | "cell_type": "markdown", 1016 | "id": "d8524b69-4efd-4a6d-9f10-499f299ff762", 1017 | "metadata": {}, 1018 | "source": [ 1019 | "Search using every vector in the corpus as a query" 1020 | ] 1021 | }, 1022 | { 1023 | "cell_type": "code", 1024 | "execution_count": 25, 1025 | "id": "60795b88-8957-4c49-a729-22f935c4dc3f", 1026 | "metadata": { 1027 | "execution": { 1028 | "iopub.execute_input": "2023-04-26T14:53:24.719250Z", 1029 | "iopub.status.busy": "2023-04-26T14:53:24.718697Z", 1030 | "iopub.status.idle": "2023-04-26T14:53:25.774560Z", 1031 | "shell.execute_reply": "2023-04-26T14:53:25.773678Z", 1032 | "shell.execute_reply.started": "2023-04-26T14:53:24.719219Z" 1033 | } 1034 | }, 1035 | "outputs": [ 1036 | { 1037 | "name": "stdout", 1038 | "output_type": "stream", 1039 | "text": [ 1040 | "CPU times: user 8.18 s, sys: 
23.5 ms, total: 8.21 s\n", 1041 | "Wall time: 1.05 s\n" 1042 | ] 1043 | } 1044 | ], 1045 | "source": [ 1046 | "%%time\n", 1047 | "\n", 1048 | "\n", 1049 | "index.nprobe = 1 \n", 1050 | "\n", 1051 | "D, I = index.search(x_corpus, k) # actual search" 1052 | ] 1053 | }, 1054 | { 1055 | "cell_type": "code", 1056 | "execution_count": 26, 1057 | "id": "ddbabfeb-2b27-412c-b77b-1d795b00aa29", 1058 | "metadata": { 1059 | "execution": { 1060 | "iopub.execute_input": "2023-04-26T14:53:25.781104Z", 1061 | "iopub.status.busy": "2023-04-26T14:53:25.778777Z", 1062 | "iopub.status.idle": "2023-04-26T14:53:25.791108Z", 1063 | "shell.execute_reply": "2023-04-26T14:53:25.790296Z", 1064 | "shell.execute_reply.started": "2023-04-26T14:53:25.781066Z" 1065 | } 1066 | }, 1067 | "outputs": [ 1068 | { 1069 | "data": { 1070 | "text/plain": [ 1071 | "{'recall@1': 24954, 'num_vectors': 24954, 'mismatch': 0}" 1072 | ] 1073 | }, 1074 | "execution_count": 26, 1075 | "metadata": {}, 1076 | "output_type": "execute_result" 1077 | } 1078 | ], 1079 | "source": [ 1080 | "z = I[:,0] == np.array( list(range(len(x_corpus))))\n", 1081 | "{\n", 1082 | " \"recall@1\": z.sum()\n", 1083 | " , \"num_vectors\": len(z)\n", 1084 | " , \"mismatch\": len(z) - z.sum()\n", 1085 | "}\n" 1086 | ] 1087 | }, 1088 | { 1089 | "cell_type": "code", 1090 | "execution_count": null, 1091 | "id": "6c9da4ac-9d1f-480a-ac9f-ea97e1d16f9d", 1092 | "metadata": {}, 1093 | "outputs": [], 1094 | "source": [] 1095 | }, 1096 | { 1097 | "cell_type": "markdown", 1098 | "id": "2b5515dc-09b7-459c-b93c-80424faac839", 1099 | "metadata": {}, 1100 | "source": [ 1101 | "Increase the number of cells that are probed. Note: `%%timeit` does not keep variables assigned inside the cell, so the `D, I` used by the recall check that follows still hold the results of the earlier `%%time` search with nprobe = 1." 1102 | ] 1103 | }, 1104 | { 1105 | "cell_type": "code", 1106 | "execution_count": null, 1107 | "id": "687d36fe-de5d-4ee4-9941-7ca604b446fb", 1108 | "metadata": {}, 1109 | "outputs": [], 1110 | "source": [] 1111 | }, 1112 | { 1113 | "cell_type": "code", 1114 | "execution_count": 27, 1115 | "id": "31c0b86b-2d5e-480d-be94-c136e7fe07d1", 1116 
| "metadata": { 1117 | "execution": { 1118 | "iopub.execute_input": "2023-04-26T14:53:25.792414Z", 1119 | "iopub.status.busy": "2023-04-26T14:53:25.792073Z", 1120 | "iopub.status.idle": "2023-04-26T14:54:03.768762Z", 1121 | "shell.execute_reply": "2023-04-26T14:54:03.767827Z", 1122 | "shell.execute_reply.started": "2023-04-26T14:53:25.792388Z" 1123 | } 1124 | }, 1125 | "outputs": [ 1126 | { 1127 | "name": "stdout", 1128 | "output_type": "stream", 1129 | "text": [ 1130 | "4.73 s ± 230 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 1131 | ] 1132 | } 1133 | ], 1134 | "source": [ 1135 | "%%timeit\n", 1136 | "\n", 1137 | "index.nprobe = 5 # default nprobe is 1\n", 1138 | "\n", 1139 | "D, I = index.search(x_corpus, k) " 1140 | ] 1141 | }, 1142 | { 1143 | "cell_type": "code", 1144 | "execution_count": 28, 1145 | "id": "fbffb654-a3a6-4d42-956f-602dab97124e", 1146 | "metadata": { 1147 | "execution": { 1148 | "iopub.execute_input": "2023-04-26T14:54:03.777354Z", 1149 | "iopub.status.busy": "2023-04-26T14:54:03.769965Z", 1150 | "iopub.status.idle": "2023-04-26T14:54:03.787506Z", 1151 | "shell.execute_reply": "2023-04-26T14:54:03.786386Z", 1152 | "shell.execute_reply.started": "2023-04-26T14:54:03.777305Z" 1153 | } 1154 | }, 1155 | "outputs": [ 1156 | { 1157 | "data": { 1158 | "text/plain": [ 1159 | "{'recall@1': 24954, 'num_vectors': 24954, 'mismatch': 0}" 1160 | ] 1161 | }, 1162 | "execution_count": 28, 1163 | "metadata": {}, 1164 | "output_type": "execute_result" 1165 | } 1166 | ], 1167 | "source": [ 1168 | "z = I[:,0] == np.array( list(range(len(x_corpus))))\n", 1169 | "{\n", 1170 | " \"recall@1\": z.sum()\n", 1171 | " , \"num_vectors\": len(z)\n", 1172 | " , \"mismatch\": len(z) - z.sum()\n", 1173 | "}\n" 1174 | ] 1175 | }, 1176 | { 1177 | "cell_type": "code", 1178 | "execution_count": null, 1179 | "id": "fd7e1354-2002-45dc-8732-9506ef6200cf", 1180 | "metadata": {}, 1181 | "outputs": [], 1182 | "source": [] 1183 | } 1184 | ], 1185 | "metadata": { 1186 | 
"environment": { 1187 | "kernel": "python3", 1188 | "name": "pytorch-gpu.1-13.m107", 1189 | "type": "gcloud", 1190 | "uri": "gcr.io/deeplearning-platform-release/pytorch-gpu.1-13:m107" 1191 | }, 1192 | "kernelspec": { 1193 | "display_name": "Python 3", 1194 | "language": "python", 1195 | "name": "python3" 1196 | }, 1197 | "language_info": { 1198 | "codemirror_mode": { 1199 | "name": "ipython", 1200 | "version": 3 1201 | }, 1202 | "file_extension": ".py", 1203 | "mimetype": "text/x-python", 1204 | "name": "python", 1205 | "nbconvert_exporter": "python", 1206 | "pygments_lexer": "ipython3", 1207 | "version": "3.7.12" 1208 | } 1209 | }, 1210 | "nbformat": 4, 1211 | "nbformat_minor": 5 1212 | } 1213 | -------------------------------------------------------------------------------- /notebooks/workshop_setup.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 12, 6 | "id": "e9b14691-3881-4882-bae0-c46b23401f11", 7 | "metadata": { 8 | "execution": { 9 | "iopub.execute_input": "2023-04-26T14:49:50.397320Z", 10 | "iopub.status.busy": "2023-04-26T14:49:50.396553Z", 11 | "iopub.status.idle": "2023-04-26T14:49:50.401265Z", 12 | "shell.execute_reply": "2023-04-26T14:49:50.400495Z", 13 | "shell.execute_reply.started": "2023-04-26T14:49:50.397287Z" 14 | }, 15 | "tags": [] 16 | }, 17 | "outputs": [], 18 | "source": [ 19 | "import nltk\n", 20 | "from sentence_transformers import SentenceTransformer\n", 21 | "from transformers import AutoTokenizer\n" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "id": "68926c5f-4643-4470-9e76-e1284acee82a", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "id": "0d2cc671-31cd-4b00-8010-01932aa66d88", 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [] 39 | }, 40 | { 41 | "cell_type": "code", 42 | 
"execution_count": 5, 43 | "id": "229a8069-c18a-4296-819e-fcebd7398fe8", 44 | "metadata": { 45 | "execution": { 46 | "iopub.execute_input": "2023-04-26T13:32:29.510730Z", 47 | "iopub.status.busy": "2023-04-26T13:32:29.509639Z", 48 | "iopub.status.idle": "2023-04-26T13:32:30.515597Z", 49 | "shell.execute_reply": "2023-04-26T13:32:30.514710Z", 50 | "shell.execute_reply.started": "2023-04-26T13:32:29.510700Z" 51 | } 52 | }, 53 | "outputs": [ 54 | { 55 | "name": "stderr", 56 | "output_type": "stream", 57 | "text": [ 58 | "[nltk_data] Downloading package stopwords to\n", 59 | "[nltk_data] /home/jupyter/nltk_data...\n", 60 | "[nltk_data] Unzipping corpora/stopwords.zip.\n", 61 | "[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...\n", 62 | "[nltk_data] Unzipping tokenizers/punkt.zip.\n", 63 | "[nltk_data] Downloading package wordnet to /home/jupyter/nltk_data...\n", 64 | "[nltk_data] Downloading package omw-1.4 to /home/jupyter/nltk_data...\n" 65 | ] 66 | }, 67 | { 68 | "data": { 69 | "text/plain": [ 70 | "True" 71 | ] 72 | }, 73 | "execution_count": 5, 74 | "metadata": {}, 75 | "output_type": "execute_result" 76 | } 77 | ], 78 | "source": [ 79 | "nltk.download('stopwords')\n", 80 | "nltk.download('punkt')\n", 81 | "nltk.download('wordnet')\n", 82 | "nltk.download('omw-1.4')" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "id": "da25e712-759d-4fa8-89bc-ab12e4094acf", 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "id": "32a7451b-7fe5-48f2-af6e-1a9d90b63c14", 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 14, 104 | "id": "6dd5ed16-c707-4ad5-97ee-6748e74af1a9", 105 | "metadata": { 106 | "execution": { 107 | "iopub.execute_input": "2023-04-26T14:50:36.001708Z", 108 | "iopub.status.busy": "2023-04-26T14:50:36.000639Z", 109 | "iopub.status.idle": 
"2023-04-26T14:50:36.005695Z", 110 | "shell.execute_reply": "2023-04-26T14:50:36.004847Z", 111 | "shell.execute_reply.started": "2023-04-26T14:50:36.001654Z" 112 | }, 113 | "tags": [] 114 | }, 115 | "outputs": [], 116 | "source": [ 117 | "models = ['sentence-transformers/all-MiniLM-L6-v2','sentence-transformers/clip-ViT-B-32' , 'sentence-transformers/clip-ViT-B-32-multilingual-v1']\n", 118 | "\n", 119 | "\n" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 15, 125 | "id": "096eca1c-1ea2-4fa8-bc26-7c8ce41f46f1", 126 | "metadata": { 127 | "execution": { 128 | "iopub.execute_input": "2023-04-26T14:50:36.285375Z", 129 | "iopub.status.busy": "2023-04-26T14:50:36.284663Z", 130 | "iopub.status.idle": "2023-04-26T14:50:36.289042Z", 131 | "shell.execute_reply": "2023-04-26T14:50:36.288270Z", 132 | "shell.execute_reply.started": "2023-04-26T14:50:36.285345Z" 133 | }, 134 | "tags": [] 135 | }, 136 | "outputs": [], 137 | "source": [ 138 | "text = \"men shoes\"" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 18, 144 | "id": "6958324b-63f0-4320-9880-83d7d920e55c", 145 | "metadata": { 146 | "execution": { 147 | "iopub.execute_input": "2023-04-26T14:51:44.205272Z", 148 | "iopub.status.busy": "2023-04-26T14:51:44.204519Z", 149 | "iopub.status.idle": "2023-04-26T14:51:49.430374Z", 150 | "shell.execute_reply": "2023-04-26T14:51:49.429497Z", 151 | "shell.execute_reply.started": "2023-04-26T14:51:44.205234Z" 152 | }, 153 | "tags": [] 154 | }, 155 | "outputs": [ 156 | { 157 | "name": "stdout", 158 | "output_type": "stream", 159 | "text": [ 160 | "sentence-transformers/all-MiniLM-L6-v2 {'input_ids': tensor([ 101, 2273, 6007, 102], device='cuda:0'), 'token_type_ids': tensor([0, 0, 0, 0], device='cuda:0'), 'attention_mask': tensor([1, 1, 1, 1], device='cuda:0'), 'token_embeddings': tensor([[-0.2272, 0.0027, 0.1586, ..., -0.3998, -0.4343, -0.0824],\n", 161 | " [-0.0189, 0.0759, -0.6014, ..., -0.5021, 0.3637, 0.0202],\n", 162 | " 
[-1.3425, 0.1124, 0.1479, ..., -1.1672, -0.9619, -0.5084],\n", 163 | " [-0.3310, 0.2602, -0.1229, ..., -0.3570, -0.1665, 0.4503]],\n", 164 | " device='cuda:0'), 'sentence_embedding': tensor([-5.8684e-02, 1.3790e-02, -1.2774e-02, 1.4896e-02, 1.0860e-02,\n", 165 | " -6.4392e-02, 6.0345e-02, -6.9710e-02, -3.2946e-02, 4.2863e-03,\n", 166 | " -5.0637e-03, 9.6656e-02, -4.2155e-02, -2.6262e-03, -3.4538e-02,\n", 167 | " -1.3516e-02, -7.5456e-02, 2.6242e-02, -2.1382e-02, 2.3227e-02,\n", 168 | " 3.1208e-02, -3.7767e-02, -4.7549e-02, 4.5558e-03, -9.1842e-02,\n", 169 | " -8.6363e-03, 1.8699e-02, 6.6197e-02, -1.1301e-02, 1.3095e-02,\n", 170 | " 3.8383e-02, 9.8821e-03, 6.1136e-02, 2.6724e-02, -4.4304e-02,\n", 171 | " -8.6510e-02, 1.6309e-02, -3.2358e-02, 3.3459e-04, 9.5446e-02,\n", 172 | " -4.2075e-02, -1.2752e-01, -4.7495e-03, 5.5425e-02, 5.1755e-02,\n", 173 | " 3.0031e-02, 2.5861e-02, 1.5485e-02, -4.3001e-02, 1.2418e-01,\n", 174 | " 9.2281e-04, 2.5372e-02, -2.9569e-02, 2.1765e-02, 5.3302e-02,\n", 175 | " -1.9069e-02, -7.0468e-03, -2.1080e-02, -1.4390e-02, -6.2965e-02,\n", 176 | " 1.2275e-01, 7.1885e-04, -6.6201e-02, 2.3436e-02, 3.0998e-02,\n", 177 | " 1.5899e-02, -4.3535e-02, 4.4102e-03, -1.0930e-02, 4.9731e-02,\n", 178 | " 2.4521e-02, -1.8387e-02, -1.5710e-02, 2.5058e-02, -2.1643e-02,\n", 179 | " 2.9889e-02, -7.1892e-02, -7.9145e-02, -6.6582e-02, -2.8073e-02,\n", 180 | " -4.3930e-02, -3.0970e-02, -9.7531e-03, -1.1777e-02, 1.0500e-02,\n", 181 | " -2.3791e-02, 6.4381e-03, -1.2262e-04, -2.8900e-02, 4.9136e-02,\n", 182 | " -1.3542e-01, -7.4252e-02, 1.3062e-02, -1.8075e-03, -6.6097e-02,\n", 183 | " -7.3589e-03, 1.3785e-02, 3.7384e-02, -8.5169e-02, 1.1559e-01,\n", 184 | " 2.1454e-02, -1.0230e-02, -3.5801e-02, -1.7123e-02, 1.9046e-02,\n", 185 | " 2.3251e-02, -1.7273e-02, 8.2963e-02, 1.8455e-02, 8.6537e-02,\n", 186 | " -2.5882e-02, 2.0791e-02, -7.3512e-02, 1.8831e-02, -5.0279e-02,\n", 187 | " -8.1372e-02, -7.6812e-03, 7.2167e-02, 7.7383e-02, 7.4222e-02,\n", 188 | " 9.1467e-03, 
2.2712e-02, 3.6326e-02, -2.0288e-02, -3.1550e-02,\n", 189 | " 7.1721e-03, -3.4719e-02, -4.2586e-33, 5.7250e-03, 1.9000e-02,\n", 190 | " 1.1605e-02, -2.7531e-02, 3.0725e-02, 1.2585e-02, 2.1076e-02,\n", 191 | " -5.6845e-02, 1.6838e-03, -4.8026e-04, -2.1989e-02, 9.2065e-02,\n", 192 | " 7.0795e-03, -4.6472e-02, 5.3270e-02, -5.0890e-02, 7.3680e-02,\n", 193 | " 5.6044e-03, -7.5194e-02, -8.5179e-02, -7.4869e-03, 6.3311e-02,\n", 194 | " -1.0171e-02, 2.0384e-02, 5.0803e-02, -6.6822e-03, -1.5785e-02,\n", 195 | " -3.5164e-02, 4.9931e-02, 8.2824e-03, 9.2603e-02, -2.0116e-02,\n", 196 | " 4.5553e-02, 1.2080e-02, -5.9395e-02, 2.1852e-02, 5.8675e-02,\n", 197 | " 2.2210e-02, 9.2596e-03, -2.3409e-02, 4.5004e-02, -5.1507e-02,\n", 198 | " 2.8044e-02, -6.5380e-03, 7.4831e-03, 5.8157e-02, 3.4447e-02,\n", 199 | " 4.1475e-02, -6.1901e-02, -1.6561e-02, -4.5630e-02, 4.9367e-02,\n", 200 | " -1.6766e-02, -5.7993e-02, 5.2082e-02, -8.8322e-02, 2.9853e-02,\n", 201 | " 2.4510e-02, -9.7888e-03, 4.2248e-03, 7.1062e-03, 1.1276e-01,\n", 202 | " 7.1893e-02, 1.2149e-03, -5.7319e-02, -7.2250e-02, 5.5623e-02,\n", 203 | " 1.4792e-02, -1.4871e-02, 6.0891e-02, -3.7015e-02, -4.5080e-02,\n", 204 | " 6.8613e-02, 5.5002e-02, -1.1833e-02, 2.1440e-02, -2.9428e-02,\n", 205 | " 7.0045e-02, -5.4634e-02, -6.3302e-02, -9.2292e-02, -3.5121e-02,\n", 206 | " 2.7007e-02, 2.0125e-02, -3.6556e-02, -1.3703e-02, -6.1224e-02,\n", 207 | " -1.0986e-02, 1.5851e-02, -2.6837e-02, -1.1262e-01, -4.3161e-02,\n", 208 | " -1.4493e-02, 7.6786e-03, -7.4723e-03, 2.1595e-33, 3.5767e-02,\n", 209 | " 1.1363e-01, -1.4693e-02, 9.2684e-02, 3.3924e-02, -1.5614e-02,\n", 210 | " -1.6073e-02, -6.0525e-04, 2.5468e-03, 9.1995e-04, -2.1501e-02,\n", 211 | " 7.5731e-03, 7.2208e-02, 4.1097e-02, 2.0974e-02, -3.6957e-02,\n", 212 | " 7.8952e-02, 4.7166e-02, 1.3827e-02, 1.5595e-02, 3.9452e-02,\n", 213 | " -2.2254e-02, 2.3012e-02, -3.3751e-02, -7.7026e-02, 2.4340e-02,\n", 214 | " 1.0263e-01, -3.7329e-02, -1.6680e-01, 7.4254e-02, 2.9221e-02,\n", 215 | " 
-6.3063e-04, -5.0962e-02, 4.1396e-02, -1.9880e-02, 1.0247e-02,\n", 216 | " -1.7069e-01, 9.8603e-02, 5.3988e-02, 1.2776e-02, 4.2524e-02,\n", 217 | " -3.5128e-03, 3.9324e-02, 4.7423e-02, -2.8995e-02, -5.1358e-02,\n", 218 | " -9.3995e-03, 3.7709e-03, -3.8401e-02, -5.3929e-02, -1.8980e-02,\n", 219 | " 3.6103e-02, -6.3064e-02, -4.9003e-02, -5.8757e-02, -4.0821e-04,\n", 220 | " -1.0862e-01, 4.6522e-02, -5.9556e-02, 7.4809e-02, -3.2031e-02,\n", 221 | " 6.8497e-02, 2.7983e-02, -5.1754e-03, -4.0551e-02, 1.9033e-02,\n", 222 | " -1.1128e-02, 8.7901e-03, -3.8768e-02, -3.0256e-02, 9.6022e-02,\n", 223 | " -1.0334e-01, 8.3221e-02, 1.0190e-01, -2.3660e-02, -1.2010e-03,\n", 224 | " -2.6769e-02, 6.1521e-02, 2.9026e-02, -5.4091e-02, 3.6390e-03,\n", 225 | " -9.2658e-02, 7.8827e-03, 1.1917e-01, -5.4807e-02, 1.5506e-01,\n", 226 | " 4.6332e-02, 4.9145e-02, 5.1203e-03, 4.5503e-02, 1.6287e-02,\n", 227 | " 4.3451e-02, 4.7670e-02, 1.7630e-02, -1.9633e-02, -1.1029e-08,\n", 228 | " 6.5470e-02, 5.2181e-02, 3.3727e-02, 3.1607e-02, -1.7458e-02,\n", 229 | " -1.0251e-04, -1.4603e-02, 2.6509e-02, 7.8088e-02, 1.1715e-02,\n", 230 | " -7.6623e-02, -1.9881e-02, -3.8228e-02, 8.3098e-02, 3.7713e-02,\n", 231 | " 5.6128e-02, -4.2606e-02, -1.7351e-02, -4.4235e-02, -6.9647e-02,\n", 232 | " 7.4529e-03, -5.4775e-02, 4.2709e-02, 1.1223e-01, -1.0754e-02,\n", 233 | " 1.0371e-02, 2.4825e-02, -1.1770e-01, 1.0997e-02, 1.0135e-01,\n", 234 | " 1.2151e-02, 2.0672e-02, 1.5000e-02, -3.0694e-02, 7.1544e-02,\n", 235 | " 3.1737e-02, -3.7621e-03, -6.8717e-03, 5.2807e-02, -8.3122e-02,\n", 236 | " -9.0656e-02, -1.2168e-01, 4.1270e-02, -2.7971e-02, -4.6418e-02,\n", 237 | " -5.2283e-02, 3.0334e-04, 1.3859e-01, -1.0576e-01, 3.3442e-02,\n", 238 | " 8.5742e-03, -2.2963e-02, 5.4894e-02, -1.8929e-02, 1.1873e-04,\n", 239 | " -7.1593e-02, -1.7575e-02, 9.0126e-02, 2.2157e-02, -4.3015e-02,\n", 240 | " 3.9037e-02, -7.4174e-02, -3.6659e-02, -3.6811e-03], device='cuda:0')}\n", 241 | "sentence-transformers/clip-ViT-B-32 {'input_ids': 
tensor([49406, 1656, 4079, 49407], device='cuda:0'), 'attention_mask': tensor([1, 1, 1, 1], device='cuda:0'), 'image_text_info': 1, 'sentence_embedding': tensor([-2.6722e-01, -2.5024e-01, 1.9831e-01, -8.4449e-02, -4.5411e-01,\n", 242 | " 1.5269e-01, 1.9422e-01, -7.6131e-01, 1.6059e-02, 5.6570e-02,\n", 243 | " -1.1732e-01, -1.7434e-01, -4.8341e-02, -5.8244e-02, 1.9622e-03,\n", 244 | " 3.3086e-02, 4.6739e-01, 5.9902e-02, -1.4106e-01, -2.5187e-01,\n", 245 | " 1.1865e-01, 2.7707e-01, -2.4586e-01, 1.6738e-01, -6.5906e-02,\n", 246 | " -4.9510e-01, 7.1441e-02, 4.7636e-02, -1.1008e-01, 2.0449e-01,\n", 247 | " 1.9936e-02, -1.8740e-01, -1.1138e-02, 1.2773e-01, -5.5551e-01,\n", 248 | " -4.6653e-02, 1.8026e-01, -9.6517e-02, 1.2344e-01, -1.3597e-01,\n", 249 | " -1.9992e-01, -1.9570e-01, 1.5068e-01, -5.0438e-01, 9.0387e-02,\n", 250 | " 2.1159e-01, 5.4030e-03, -4.0513e-02, 2.7146e-01, 3.7458e-02,\n", 251 | " 8.0310e-02, -2.9841e-02, 5.1565e-02, -6.1074e-01, -2.5701e-01,\n", 252 | " 1.3316e-01, -1.1232e-01, -5.3493e-03, -6.2610e-01, -3.1894e-01,\n", 253 | " 1.9283e-01, 2.0204e-02, -7.6878e-02, -4.0941e-01, 3.9715e-01,\n", 254 | " 3.7647e-02, 1.9459e-01, 3.6569e-01, 3.9625e-01, -1.1323e-01,\n", 255 | " 4.1932e-01, -2.1839e-01, 1.6538e-01, 3.2633e-01, 2.7650e-01,\n", 256 | " 2.5525e-01, -7.0886e-02, -2.6799e-02, -2.8865e-01, -4.0908e-01,\n", 257 | " -2.3828e-01, 1.0888e-01, 1.9548e-02, 1.9067e-01, 1.2776e-01,\n", 258 | " 4.7739e-01, -2.3506e-01, -1.8611e-01, 1.3253e-01, -2.1160e-01,\n", 259 | " -5.1571e-01, -2.1936e-01, -1.3583e+00, 6.3813e-01, 1.9401e-01,\n", 260 | " -1.7399e-01, 2.3773e-02, 1.7300e-01, 2.0512e-01, 3.8955e-01,\n", 261 | " 1.2784e-02, -1.5616e-01, 2.7312e-01, 1.1615e-01, 9.6609e-02,\n", 262 | " 2.5802e-01, -1.6590e-01, -2.5717e-01, -2.7002e-01, 2.2556e-01,\n", 263 | " -5.9875e-01, 4.1399e-01, 1.0067e-01, 2.3802e-01, 1.8393e-01,\n", 264 | " -7.7338e-02, 2.8407e-01, -1.8666e-01, 1.8673e-01, -2.7489e-02,\n", 265 | " 6.1416e-02, -5.3712e-02, -3.1903e-01, 2.2104e-01, 
4.5745e-01,\n", 266 | " -1.4629e-01, -2.0385e-01, -3.0613e-03, 1.0831e-01, 1.3029e-01,\n", 267 | " 7.5172e-02, 1.7098e-01, -1.7295e-01, 6.0518e+00, -4.3587e-01,\n", 268 | " -3.6123e-02, -2.3084e-01, -2.7611e-01, -2.6074e-01, -2.6225e-02,\n", 269 | " -1.2373e-01, -3.8144e-02, 3.9868e-02, 1.2663e-01, -1.5586e-02,\n", 270 | " -1.0817e-01, -1.9468e-01, -4.2929e-01, 1.9576e-01, 4.2164e-02,\n", 271 | " 2.4703e-01, 1.4054e-01, 5.2742e-01, 1.1616e-02, -1.9511e-01,\n", 272 | " 1.8861e-02, -3.4501e-02, 2.4374e-01, -1.0141e-01, -8.6388e-02,\n", 273 | " -3.6278e-02, -6.0471e-02, 1.7178e-01, 4.3191e-02, -5.6021e-02,\n", 274 | " 2.1537e-01, 2.2960e-01, -2.8191e-01, 3.6694e-01, -2.1738e-01,\n", 275 | " -1.5248e-01, -5.3858e-01, 4.8218e-04, 1.0611e-01, -4.3668e-01,\n", 276 | " 3.2565e-01, 2.1495e-01, 7.9084e-02, -1.1144e-01, -8.2238e-02,\n", 277 | " -1.0823e-02, 3.2505e-01, -1.6962e-02, 2.4763e-03, -2.1186e-01,\n", 278 | " -1.7482e-01, 5.7193e-01, 2.6038e-02, -2.0085e-01, 3.7864e-01,\n", 279 | " 2.8407e-02, -1.4909e-01, 4.4668e-02, -9.7271e-02, -7.5325e-02,\n", 280 | " 2.7872e-01, -1.5554e-01, 2.2307e-01, 9.8059e-02, -4.1579e-02,\n", 281 | " -2.8606e-02, -2.3032e-01, 3.1150e-01, -7.1310e-02, -2.4904e-01,\n", 282 | " -7.7626e-02, -1.2460e-01, -2.6306e-02, -1.9954e-01, 8.6190e-02,\n", 283 | " 3.0547e-05, 2.2495e-03, 2.2625e-01, -1.2017e-01, -1.5577e-01,\n", 284 | " 3.8977e-01, 6.9040e-03, -1.5162e-01, -1.7155e-01, 1.7012e-01,\n", 285 | " 2.9838e-01, -3.6955e-01, 1.8884e-01, -3.1549e-01, -2.8930e-01,\n", 286 | " -5.5932e-01, 3.5717e-01, 2.4953e-01, -2.4460e-01, 7.5881e-03,\n", 287 | " -1.7940e-01, -2.2461e-02, -1.8611e-01, 1.4540e-01, 3.2545e-01,\n", 288 | " 1.3929e-01, -3.1123e-01, -2.8450e-01, 4.3041e-01, -4.7437e-02,\n", 289 | " 6.9766e-02, -1.7365e-01, 5.9896e-02, 5.9521e-01, -4.6284e-02,\n", 290 | " 3.8939e-02, 1.3634e-01, -1.1272e-01, 1.6409e-01, 9.2894e-03,\n", 291 | " -4.9759e-02, -4.7318e-02, 1.4064e-01, -1.3499e-01, -3.7380e-02,\n", 292 | " -1.2251e-01, 5.5834e-01, 
-2.6055e-01, 9.8233e-02, -5.9575e-02,\n", 293 | " 5.2060e-02, -3.2568e-02, -2.9983e-02, 1.0168e-01, 1.8758e-01,\n", 294 | " -4.8515e-02, 2.4001e-01, -1.5750e-02, 1.0778e-01, 6.0214e-02,\n", 295 | " -2.2326e-01, -2.0359e-02, 2.0070e-01, 4.8625e-02, -1.4054e-02,\n", 296 | " 9.4663e-02, -1.7493e-01, 2.9097e-02, 1.6428e-01, -2.8810e-01,\n", 297 | " 1.3145e-01, 1.0770e-01, -3.8759e-02, -8.3538e-02, -7.6144e-02,\n", 298 | " -5.5353e-04, 3.1190e-01, -1.5708e-01, 7.7211e-02, -8.1434e-03,\n", 299 | " 2.1436e-01, -9.9840e-02, 1.8908e-01, 4.1516e-01, 1.0249e-02,\n", 300 | " -3.0407e-01, -3.6515e-01, -1.2376e-01, 2.7788e-02, 1.9021e-02,\n", 301 | " 1.8605e-01, 2.9233e-01, 5.6915e-02, 2.0724e-01, 7.0265e-02,\n", 302 | " -9.5444e-02, -2.1719e-01, 1.0457e-01, -2.2053e-01, 1.3246e-01,\n", 303 | " -1.5537e-01, -9.5731e-02, 6.0466e+00, 4.0778e-01, 1.1285e-02,\n", 304 | " 1.1125e-01, -2.3022e-01, 3.2817e-01, 3.3474e-01, 1.3995e-01,\n", 305 | " 1.5649e-01, 3.0448e-01, -1.5101e-02, -1.7316e-01, -3.8413e-01,\n", 306 | " 2.2023e-01, -4.7145e-01, 9.5377e-02, -1.6545e-01, -1.7516e+00,\n", 307 | " 2.9473e-01, 1.4115e-01, 1.8588e-01, -1.2569e-01, 6.5745e-02,\n", 308 | " -1.4898e-01, -9.3745e-02, 3.1413e-01, 1.2686e-01, -1.0292e-01,\n", 309 | " -2.1954e-01, 3.0948e-02, -4.2561e-01, -2.6715e-01, 5.5388e-02,\n", 310 | " -8.8885e-02, -9.3119e-02, 2.2464e-01, -4.0584e-02, 1.0608e-01,\n", 311 | " 2.8624e-01, -1.7813e-01, -5.1491e-02, 6.9263e-02, 1.7411e-01,\n", 312 | " -4.3583e-01, -1.5844e-01, 1.3288e-02, 1.3611e-01, 2.8650e-01,\n", 313 | " 2.2427e-01, 2.9285e-01, -8.2768e-02, -2.1313e-01, 4.6977e-01,\n", 314 | " 1.3981e-01, 4.4196e-01, 5.9897e-01, -2.0943e-01, 1.3199e-01,\n", 315 | " -1.2770e-01, 7.0080e-02, -8.7112e-02, 2.5521e-01, -3.5098e-01,\n", 316 | " 2.7341e-01, -3.3966e-02, -1.0843e-01, 4.3469e-01, -6.2516e-02,\n", 317 | " -1.0742e-01, -3.9380e-01, 2.8758e-02, -7.4800e-01, -1.1042e-01,\n", 318 | " 7.4263e-02, -1.0410e-01, -3.2008e-01, -2.6784e-01, 8.2537e-03,\n", 319 | " -3.0056e-01, 
1.5119e-01, -1.6852e-01, -5.2178e-03, -1.4731e-01,\n", 320 | " 4.4588e-01, 3.7448e-01, -3.9732e-01, 1.3030e-01, 6.4875e-01,\n", 321 | " -2.1301e-01, 1.2479e-01, 2.6747e-01, -2.6366e-01, -1.2018e-01,\n", 322 | " -2.2133e-02, -1.6639e-02, -1.1777e-01, -3.0975e-01, 1.7078e-01,\n", 323 | " -1.0861e-04, 1.3197e-01, -2.0081e-01, -2.6618e-01, 3.1587e-02,\n", 324 | " 1.0479e-01, 8.0152e-02, 1.1496e-01, -1.7642e-01, -1.5247e-01,\n", 325 | " -4.1968e-01, 2.1120e-01, 1.1615e-01, -2.0123e-01, -1.9122e-01,\n", 326 | " 6.6410e-02, -2.2218e-01, 1.0440e-01, 7.5966e-03, 4.2399e-01,\n", 327 | " -9.5817e-02, -3.5184e-01, -6.2223e-02, -1.7677e-01, -4.7161e-01,\n", 328 | " -3.8534e-02, -8.0673e-02, -4.5963e-03, -6.4624e-03, -1.8014e-02,\n", 329 | " -1.1445e-01, 4.2948e-02, -2.5978e-01, 5.9180e-02, 3.8135e-02,\n", 330 | " 2.0812e-01, -2.4710e-01, -3.7079e-01, 2.0508e-01, 2.1899e-01,\n", 331 | " -9.4519e-01, 2.7558e-01, 2.7138e-01, 1.4367e-01, -1.5441e-01,\n", 332 | " -1.8515e-01, 9.0163e-02, 2.0217e-01, -1.0698e-01, -2.3912e-01,\n", 333 | " -2.2943e-01, 2.2451e-01, 1.2780e-01, 1.2775e-01, 3.1585e-01,\n", 334 | " -2.1394e-01, -1.2684e-01, -9.6884e-02, -6.5346e-01, -2.9291e-01,\n", 335 | " -5.3387e-01, -2.2139e-02, -1.9091e-01, -5.3790e-01, -1.2427e-01,\n", 336 | " 3.3499e-01, -1.1048e-02, 6.9154e-02, -4.7823e-02, -1.6247e-02,\n", 337 | " 1.7433e-01, -6.0335e-01, -1.3460e-02, -3.3694e-01, -4.4399e-01,\n", 338 | " -4.2622e-02, 3.5184e-01, -1.3890e-01, 2.1893e-01, -1.5696e-02,\n", 339 | " 1.5114e-01, 1.4282e-01, 2.9100e-01, 8.3577e-01, -7.7831e-02,\n", 340 | " -6.1096e-02, -5.0251e-02, -2.8395e-01, -1.4788e-01, -1.7162e-01,\n", 341 | " -9.0106e-02, 2.4305e-01, -1.6364e-01, 2.7089e-01, 3.6500e-01,\n", 342 | " -2.4414e-01, -4.1538e-01, 3.7322e-01, 3.0024e-01, 2.3689e-01,\n", 343 | " 7.3204e-02, -3.4812e-02], device='cuda:0')}\n", 344 | "Model name:sentence-transformers/clip-ViT-B-32 ; tokenizer doesn't exist; sentence-transformers/clip-ViT-B-32 does not appear to have a file named 
config.json. Checkout 'https://huggingface.co/sentence-transformers/clip-ViT-B-32/main' for available files.\n", 345 | "sentence-transformers/clip-ViT-B-32-multilingual-v1 {'input_ids': tensor([ 101, 10588, 48201, 47125, 102], device='cuda:0'), 'attention_mask': tensor([1, 1, 1, 1, 1], device='cuda:0'), 'token_embeddings': tensor([[ 0.2305, 0.0549, -0.1571, ..., 0.5232, -0.0930, -0.1172],\n", 346 | " [-0.2680, 0.2789, -0.0539, ..., 0.4740, -0.2368, -0.3543],\n", 347 | " [ 0.3758, 0.2741, -0.3703, ..., 0.7613, 0.3820, -0.0288],\n", 348 | " [ 0.4044, 0.2899, -0.4232, ..., 0.8181, 0.3925, -0.0369],\n", 349 | " [ 0.1187, 0.0554, -0.3584, ..., 0.3664, 0.1128, -0.1268]],\n", 350 | " device='cuda:0'), 'sentence_embedding': tensor([-2.0334e-01, -1.6914e-01, 7.4771e-02, -1.4223e-01, -3.0504e-01,\n", 351 | " 2.0065e-01, 9.7776e-02, -9.3886e-01, -2.3021e-03, 9.3738e-02,\n", 352 | " 1.8778e-02, -2.6235e-01, -1.2688e-01, -1.7641e-01, 9.0206e-02,\n", 353 | " -6.5903e-02, 1.5076e-01, 6.2253e-02, -3.2784e-02, -2.5361e-01,\n", 354 | " 2.5175e-01, 3.9041e-01, -9.7813e-02, 2.4359e-01, 4.1893e-02,\n", 355 | " -3.3285e-01, 9.3473e-02, 1.3778e-01, -1.5428e-01, 2.4548e-01,\n", 356 | " 1.9890e-02, -2.7075e-01, -1.8032e-02, 6.4909e-02, -4.3061e-01,\n", 357 | " -1.1657e-01, 2.4541e-01, -4.9875e-02, 2.3737e-02, -2.3621e-02,\n", 358 | " -3.1835e-02, -2.0869e-01, 5.7527e-02, -5.0633e-01, 1.3199e-01,\n", 359 | " 2.9206e-01, 2.9964e-02, -3.3747e-02, 2.3226e-01, 3.6006e-02,\n", 360 | " 1.2699e-01, 9.7488e-02, 7.3582e-02, -5.8144e-01, -2.2830e-01,\n", 361 | " 1.3345e-01, -5.1341e-02, -8.0402e-02, -5.1184e-01, -1.9752e-01,\n", 362 | " 1.4001e-01, -5.0529e-02, 2.0968e-02, -3.1461e-01, 2.1346e-01,\n", 363 | " -3.8276e-02, 1.8773e-01, 2.2569e-01, 1.9953e-01, -7.3426e-02,\n", 364 | " 4.3863e-01, -1.7246e-01, 1.5217e-01, 1.7116e-01, 2.1022e-01,\n", 365 | " 4.1116e-01, 1.4070e-01, -1.1562e-01, -2.2117e-01, -4.7348e-01,\n", 366 | " -2.1663e-01, 4.4288e-03, 1.7295e-02, 2.5705e-01, 1.1410e-01,\n", 367 | " 
5.6728e-01, -2.1271e-01, -2.2724e-01, 1.2777e-01, -1.4383e-01,\n", 368 | " -3.4052e-01, -1.4664e-01, -1.4763e+00, 6.1955e-01, 1.6093e-01,\n", 369 | " -1.7167e-01, 2.3186e-02, 1.3621e-01, 9.5215e-02, 3.2798e-01,\n", 370 | " -1.8117e-02, -4.4082e-02, 2.5197e-01, 1.7477e-01, 1.9436e-01,\n", 371 | " 3.0611e-01, -8.9931e-02, -7.9702e-02, -2.5298e-01, 2.7659e-01,\n", 372 | " -5.9393e-01, 4.1021e-01, 7.9611e-02, 1.3278e-01, 1.3027e-01,\n", 373 | " -4.7478e-02, 1.4775e-01, -1.3919e-02, 3.9992e-02, 1.2915e-01,\n", 374 | " -2.0684e-02, -1.4745e-01, -2.2467e-01, 2.0943e-01, 4.3064e-01,\n", 375 | " -1.1368e-01, -1.3093e-01, 8.6813e-03, 1.4154e-02, 1.3849e-01,\n", 376 | " 1.8656e-01, 1.8730e-01, -3.0004e-01, 5.7339e+00, -3.2605e-01,\n", 377 | " -4.2595e-02, -4.1856e-01, -3.0168e-01, -8.9973e-02, 9.5640e-04,\n", 378 | " -1.4158e-02, -1.0548e-02, -1.1700e-01, 2.3475e-01, -2.6989e-02,\n", 379 | " -1.3903e-01, -9.5062e-02, -3.3901e-01, 3.1460e-01, -5.6003e-02,\n", 380 | " 2.0593e-01, 9.9136e-02, 5.2334e-01, 7.3437e-02, -1.5021e-01,\n", 381 | " -6.6951e-02, -6.3785e-02, 2.3081e-01, 2.6384e-02, -8.1484e-02,\n", 382 | " -4.4474e-02, -4.0740e-02, 2.5724e-01, 3.1777e-02, -9.1613e-02,\n", 383 | " 3.9941e-02, 4.0293e-02, -3.0676e-01, 3.4817e-01, -2.2957e-01,\n", 384 | " -5.3091e-02, -3.3662e-01, -5.1250e-02, 1.4828e-01, -2.7174e-01,\n", 385 | " 3.7047e-01, -3.9839e-02, 1.3486e-01, -5.5569e-02, -6.2613e-02,\n", 386 | " 2.6114e-02, 2.9128e-01, 6.0373e-02, -1.2501e-02, -1.6453e-01,\n", 387 | " -9.3317e-03, 4.4107e-01, 1.1232e-01, -1.3276e-01, 3.4949e-01,\n", 388 | " 9.3545e-02, 4.6813e-03, -2.4483e-02, 6.4613e-02, -1.1478e-01,\n", 389 | " 3.0463e-01, -4.4370e-02, 2.0068e-01, -4.3869e-02, -8.9185e-02,\n", 390 | " 1.1884e-01, -1.3263e-01, 3.5835e-01, -1.0064e-01, -3.2594e-01,\n", 391 | " -1.5505e-01, -5.2067e-02, -1.1747e-03, -7.0044e-02, 4.5385e-02,\n", 392 | " -6.7558e-02, 1.6928e-01, 2.6598e-01, 6.8568e-02, -1.9572e-01,\n", 393 | " 2.9449e-01, 4.5720e-03, -6.7686e-02, -3.0300e-01, 
2.0569e-01,\n", 394 | " 2.2198e-01, -2.7167e-01, 2.2274e-01, -1.5873e-01, -3.3218e-01,\n", 395 | " -4.9440e-01, 3.7218e-01, 2.3206e-01, -1.8265e-01, 1.3722e-01,\n", 396 | " -2.2112e-01, -3.3449e-02, -1.0907e-01, 3.2806e-02, 2.4866e-01,\n", 397 | " 2.5990e-02, -1.8595e-01, -2.5502e-01, 2.8818e-01, -2.1293e-01,\n", 398 | " 1.0490e-01, -6.9663e-02, 1.0206e-01, 5.4036e-01, 6.0295e-03,\n", 399 | " -8.6853e-03, 7.8703e-02, -6.7371e-02, 6.5200e-02, 9.7390e-03,\n", 400 | " 3.3331e-02, -1.1727e-01, 2.7078e-02, -7.0596e-02, 2.8880e-02,\n", 401 | " -1.6717e-01, 4.1236e-01, -1.3797e-01, 1.2633e-02, 4.0977e-02,\n", 402 | " -2.9763e-02, -4.3454e-02, 2.7565e-02, -4.0571e-02, 5.7820e-02,\n", 403 | " -4.8267e-02, 2.4278e-01, -7.6660e-03, 2.9698e-02, 4.8585e-02,\n", 404 | " -1.5352e-01, 1.4553e-02, 1.5029e-01, 6.8895e-03, 1.2136e-02,\n", 405 | " 4.9711e-02, -1.0585e-01, -3.0702e-02, 9.3002e-02, -8.1609e-02,\n", 406 | " 2.3991e-01, 3.9812e-03, -1.9190e-01, -1.3853e-01, -1.6096e-01,\n", 407 | " -2.1159e-01, 2.1663e-01, -8.0867e-02, -1.8770e-02, -1.5257e-01,\n", 408 | " 1.9546e-01, -6.9405e-02, 1.4577e-01, 3.3809e-01, 6.0678e-02,\n", 409 | " -2.6670e-01, -2.0814e-01, -5.2758e-03, 5.5603e-02, 4.4693e-02,\n", 410 | " 2.0097e-01, 2.7027e-01, 1.2986e-01, 2.3777e-01, 7.5795e-02,\n", 411 | " -7.5986e-02, -3.1976e-01, 5.8500e-02, -9.6743e-02, 5.7583e-04,\n", 412 | " -1.8898e-01, -1.7358e-01, 5.7260e+00, 4.8272e-01, -8.1259e-02,\n", 413 | " 3.2061e-03, -3.5555e-01, 1.5947e-01, 3.3509e-01, 1.7805e-01,\n", 414 | " 9.1990e-02, 3.5931e-02, -7.3387e-02, -6.7740e-02, -1.9722e-01,\n", 415 | " 1.6325e-01, -5.5183e-01, 1.3599e-01, -1.9017e-01, -2.1158e+00,\n", 416 | " 1.7186e-01, 2.0626e-01, 3.9789e-02, -1.1614e-01, 1.3275e-02,\n", 417 | " -2.0151e-01, -2.5364e-02, 3.5192e-01, 1.1858e-01, -4.1850e-02,\n", 418 | " -2.7810e-01, 2.4617e-02, -3.5590e-01, -2.9734e-01, 5.7519e-02,\n", 419 | " -1.2361e-01, 7.0654e-02, 1.9895e-01, -1.0288e-01, 1.0614e-01,\n", 420 | " 2.2623e-01, -1.1561e-01, -4.4942e-02, 
1.0816e-01, 1.0192e-01,\n", 421 | " -2.6893e-01, -1.7173e-01, -2.1719e-01, 1.1773e-01, 2.5459e-01,\n", 422 | " 3.0536e-01, 1.7029e-01, 2.4242e-02, -1.4146e-01, 5.0061e-01,\n", 423 | " 7.2468e-02, 3.1891e-01, 3.2688e-01, -1.5695e-01, 1.7149e-01,\n", 424 | " -2.0500e-01, 4.5032e-02, -1.7520e-01, 2.2878e-01, -1.6988e-01,\n", 425 | " 1.5373e-01, -5.7668e-02, -8.8306e-03, 3.1439e-01, -2.9895e-02,\n", 426 | " -7.8530e-02, -2.2839e-01, -5.0848e-02, -6.3631e-01, -8.8465e-02,\n", 427 | " -1.5557e-02, -2.0264e-01, -2.6219e-01, -3.1466e-01, 2.0764e-02,\n", 428 | " -3.4295e-01, 1.9736e-01, -1.6578e-01, -1.0560e-01, -2.2583e-01,\n", 429 | " 2.5710e-01, 2.4691e-01, -4.3547e-01, 1.1362e-01, 4.6921e-01,\n", 430 | " -2.3151e-01, 1.5638e-01, 2.0220e-01, -2.8883e-01, -1.3096e-01,\n", 431 | " -2.4510e-02, 7.7040e-02, -7.8399e-04, -4.7700e-01, 4.2299e-02,\n", 432 | " 1.5806e-01, 1.6993e-01, -8.3540e-02, -9.5018e-02, -1.5459e-02,\n", 433 | " 4.4254e-02, 1.1774e-01, -3.8907e-02, -1.5936e-01, -2.1897e-02,\n", 434 | " -3.0448e-01, 2.6064e-01, 1.6372e-01, -2.3275e-01, -1.8462e-01,\n", 435 | " 9.4053e-02, -1.6129e-01, -1.5300e-01, 1.7068e-01, 4.8920e-01,\n", 436 | " 2.1387e-02, -2.2186e-01, 7.1614e-02, -1.5353e-02, -3.8598e-01,\n", 437 | " -1.4085e-01, -1.0007e-01, -7.3114e-02, -8.1861e-02, -2.0652e-02,\n", 438 | " -1.9611e-01, -1.1353e-02, -1.4559e-01, 1.9196e-01, 1.0416e-01,\n", 439 | " 1.1943e-01, -2.0479e-01, -2.3482e-01, 1.6960e-01, 2.5849e-01,\n", 440 | " -8.1506e-01, 3.2976e-01, 2.7162e-01, 1.1857e-01, -1.2812e-01,\n", 441 | " -1.3794e-01, 1.0144e-01, 1.3436e-01, -8.4321e-02, -2.6041e-01,\n", 442 | " -1.9379e-01, 2.6946e-01, -2.9697e-02, -2.7816e-02, 4.6124e-01,\n", 443 | " -1.1000e-01, 2.2152e-02, -7.0317e-02, -5.6700e-01, -3.9634e-01,\n", 444 | " -5.3712e-01, -2.5599e-02, -2.3336e-01, -4.9603e-01, -1.3422e-01,\n", 445 | " 2.4102e-01, 6.0139e-02, 1.9689e-02, -2.9797e-01, 7.1472e-02,\n", 446 | " 5.7367e-02, -4.8900e-01, 3.0909e-03, -2.3196e-01, -4.4083e-01,\n", 447 | " -1.6809e-02, 
4.6111e-01, -5.8852e-02, 1.3696e-01, 9.9999e-02,\n", 448 | " 1.7090e-01, 1.2402e-01, 3.0726e-01, 8.3289e-01, 5.7107e-03,\n", 449 | " -5.5376e-02, -3.5042e-02, -2.4841e-01, -1.3627e-01, -2.5289e-01,\n", 450 | " -1.9113e-01, 1.7571e-01, -2.1889e-01, 5.0180e-01, 2.4607e-01,\n", 451 | " -3.9292e-01, -3.9180e-01, 2.2482e-01, 2.2826e-01, -2.6152e-02,\n", 452 | " -3.1935e-03, -9.9769e-02], device='cuda:0')}\n" 453 | ] 454 | }, 455 | { 456 | "data": { 457 | "application/vnd.jupyter.widget-view+json": { 458 | "model_id": "d966829260ec4907a130766b61833261", 459 | "version_major": 2, 460 | "version_minor": 0 461 | }, 462 | "text/plain": [ 463 | "Downloading (…)okenizer_config.json: 0%| | 0.00/371 [00:00 install.sh 21 | bash install.sh --disable-prompts --install-dir=/opt 22 | -------------------------------------------------------------------------------- /workshop_infra/scripts/build_setup_user.sh: -------------------------------------------------------------------------------- 1 | 2 | papermill /tmp/workshop/notebooks/workshop_setup.ipynb /tmp/workshop_setup__out.ipynb -k python3 --log-output --log-level INFO --progress-bar -------------------------------------------------------------------------------- /workshop_infra/scripts/container_startup.sh: -------------------------------------------------------------------------------- 1 | 2 | 3 | #gcloud auth activate-service-account --key-file=$GOOGLE_APPLICATION_CREDENTIALS || echo "skipped gcloud authentication" 4 | 5 | 6 | #cp -r /tmp/workshop /home/jovyan 7 | 8 | 9 | 10 | echo $pwd 11 | 12 | GIT_BRANCH="pydata_seattle" 13 | GIT_BRANCH="main" 14 | 15 | echo "cloning repo" 16 | git clone --depth 1 https://github.com/npatta01/search-engine-workshop.git -b $GIT_BRANCH 17 | 18 | 19 | cd search-engine-workshop 20 | 21 | 22 | url="https://storage.googleapis.com/np-public-training-tmp/search-workshop/data.zip" 23 | 24 | if wget --spider $url 2>/dev/null; then 25 | 26 | echo "getting data from gcs" 27 | wget $url 28 | unzip -q 
data.zip 29 | else 30 | echo "getting from github" 31 | wget https://github.com/npatta01/search-engine-workshop/releases/download/v1.0/data_processed.zip 32 | unzip -q data_processed.zip 33 | 34 | fi 35 | 36 | -------------------------------------------------------------------------------- /workshop_infra/setup.md: -------------------------------------------------------------------------------- 1 | # Workshop Setup 2 | 3 | The following includes the commands and steps that were used to create a working JupyterHub installation for the workshop. 4 | 5 | The instructions assume that you are planning to use GCP and have gcloud set up. 6 | 7 | 8 | Most of the instructions are taken from the [zero-to-jupyterhub](https://zero-to-jupyterhub.readthedocs.io/en/latest/index.html) project. 9 | 10 | 11 | ## Step 1: common variables 12 | 13 | ```bash 14 | REGION="us-west2" 15 | ZONE="$REGION-a" 16 | NODE_TYPE_USER="e2-highmem-16" 17 | NODE_TYPE_DEFAULT="e2-standard-2" 18 | 19 | CLUSTER_NAME=workshop 20 | NODES_MIN=0 21 | NODES_MAX=400 22 | 23 | EMAIL="npatta01@gmail.com" 24 | GCP_PROJECT="np-public-training" 25 | 26 | HELM_NAMESPACE=$CLUSTER_NAME 27 | 28 | HELM_CHART_VERSION="2.0.0" 29 | ``` 30 | 31 | ## Step 2: create static ip address 32 | 33 | ```bash 34 | gcloud compute addresses create $CLUSTER_NAME \ 35 | --region $REGION \ 36 | --project $GCP_PROJECT 37 | 38 | gcloud compute addresses describe $CLUSTER_NAME \ 39 | --region $REGION \ 40 | --project $GCP_PROJECT 41 | 42 | ``` 43 | 44 | Create an `A` record with your DNS provider. 
45 | 46 | I am using `hub` for my domain `np.training` 47 | 48 | 49 | 50 | 51 | ## Step 3: Create cluster 52 | 53 | 54 | ```bash 55 | 56 | gcloud container clusters create \ 57 | --machine-type $NODE_TYPE_DEFAULT \ 58 | --num-nodes 1 \ 59 | --region $REGION \ 60 | --cluster-version latest \ 61 | $CLUSTER_NAME \ 62 | --project $GCP_PROJECT 63 | 64 | ``` 65 | 66 | Get kubectl credentials 67 | 68 | ```bash 69 | gcloud container clusters get-credentials \ 70 | $CLUSTER_NAME \ 71 | --region $REGION \ 72 | --project $GCP_PROJECT 73 | ``` 74 | 75 | Create admin access for user 76 | 77 | ```bash 78 | kubectl create clusterrolebinding cluster-admin-binding \ 79 | --clusterrole=cluster-admin \ 80 | --user $EMAIL 81 | ``` 82 | 83 | Create separate node pool for jupyter notebook 84 | 85 | ```bash 86 | gcloud beta container node-pools create user-pool \ 87 | --machine-type $NODE_TYPE_USER \ 88 | --num-nodes 0 \ 89 | --enable-autoscaling \ 90 | --min-nodes $NODES_MIN \ 91 | --max-nodes $NODES_MAX \ 92 | --node-labels hub.jupyter.org/node-purpose=user \ 93 | --node-taints hub.jupyter.org_dedicated=user:NoSchedule \ 94 | --scopes "https://www.googleapis.com/auth/cloud-platform" \ 95 | --region $REGION \ 96 | --cluster $CLUSTER_NAME \ 97 | --project $GCP_PROJECT 98 | ``` 99 | 100 | 101 | ## Step 3b: Cert (optional) 102 | 103 | By default the Helm chart we will use supports LetsEncrypt. However, I had trouble getting it to work. 
104 | So, I followed the steps below to create my own cert. 105 | 106 | Create a certificate signing request for "*.np.training" 107 | 108 | ```bash 109 | openssl req -nodes -newkey rsa:2048 \ 110 | -keyout cert/server.key \ 111 | -out cert/server.csr \ 112 | -subj "/C=US/ST=New York/L=New York/O=NP Training./OU=IT/CN=*.np.training" 113 | ``` 114 | 115 | I bought a wildcard cert from Namecheap. 116 | 117 | 118 | Download my cert and create a Kubernetes TLS secret from it 119 | ```bash 120 | 121 | kubectl create namespace $HELM_NAMESPACE 122 | 123 | 124 | 125 | 126 | gsutil cp "gs://np-training-private/certs/_star.np.training/*" workshop_infra/cert 127 | 128 | 129 | kubectl create namespace $HELM_NAMESPACE 130 | cd workshop_infra/cert 131 | kubectl create secret tls $HELM_NAMESPACE-tls --key="tls.key" --cert="tls.crt" --namespace $HELM_NAMESPACE 132 | cd ../../ 133 | 134 | ``` 135 | 136 | 137 | Download the storage key 138 | 139 | ``` 140 | gcloud iam service-accounts keys create workshop_infra/keyfile.json \ 141 | --iam-account=public-storage-reader-sa@np-public-training.iam.gserviceaccount.com 142 | 143 | gsutil cp gs://np-training-private/service_accounts/keyfile.json workshop_infra/keyfile.json 144 | 145 | kubectl create secret generic gcsfs-creds --from-file=workshop_infra/keyfile.json --namespace $HELM_NAMESPACE 146 | 147 | 148 | 149 | ``` 150 | 151 | ## Step 4: Helm setup 152 | 153 | ```bash 154 | 155 | curl https://raw.githubusercontent.com/helm/helm/HEAD/scripts/get-helm-3 | bash 156 | 157 | helm version 158 | 159 | helm repo add jupyterhub https://jupyterhub.github.io/helm-chart/ 160 | helm repo update 161 | 162 | ``` 163 | 164 | 165 | ## Step 5: Update config file (optional) 166 | 167 | 168 | Build the docker image 169 | 170 | ```bash 171 | docker build -t gcr.io/$GCP_PROJECT/semantic-search-workshop:v1.0 . 
172 | docker push gcr.io/$GCP_PROJECT/semantic-search-workshop:v1.0 173 | 174 | ``` 175 | 176 | build milvus 177 | 178 | ```bash 179 | cd docker_milvus 180 | 181 | docker build -t gcr.io/$GCP_PROJECT/custom-milvus:v2.1.4-1 . 182 | docker push gcr.io/$GCP_PROJECT/custom-milvus:v2.1.4-1 183 | echo "gcr.io/$GCP_PROJECT/custom-milvus:v2.1.4-1 " 184 | cd .. 185 | ``` 186 | 187 | encrypt setup 188 | 189 | ```bash 190 | gcloud kms keyrings create sops --location global --project $GCP_PROJECT 191 | gcloud kms keys create sops-key --location global --keyring sops --purpose encryption --project $GCP_PROJECT 192 | gcloud kms keys list --location global --keyring sops --project $GCP_PROJECT 193 | ``` 194 | 195 | 196 | ```bash 197 | sops --encrypt --gcp-kms projects/$GCP_PROJECT/locations/global/keyRings/sops/cryptoKeys/sops-key \ 198 | --encrypted-regex '^(client_id|client_secret)$' \ 199 | workshop_infra/config.yaml > workshop_infra/config.enc.yaml 200 | ``` 201 | 202 | ```bash 203 | sops --decrypt workshop_infra/config.enc.yaml > workshop_infra/config.yaml 204 | ``` 205 | 206 | 207 | 208 | 209 | 210 | replace values in [config.yaml](workshop_infra/config.yaml) 211 | 212 | - GitHubOAuthenticator 213 | - singleuser.image.name 214 | - scheduling.userPlaceholder.replicas 215 | - proxy.https.host 216 | - proxy.https.service.loadBalancerIP 217 | 218 | 219 | 220 | ## Step 6: Helm Install with authentication 221 | 222 | setup with authentication and git oauth 223 | 224 | ```bash 225 | helm upgrade --cleanup-on-fail \ 226 | --install $HELM_NAMESPACE jupyterhub/jupyterhub \ 227 | --namespace $HELM_NAMESPACE \ 228 | --create-namespace \ 229 | --version $HELM_CHART_VERSION \ 230 | --values workshop_infra/config.yaml 231 | 232 | ``` 233 | 234 | ```bash 235 | kubectl --namespace=$HELM_NAMESPACE get pod 236 | 237 | kubectl --namespace=$HELM_NAMESPACE get svc proxy-public -o jsonpath='{.status.loadBalancer.ingress[].ip}' 238 | ``` 239 | 240 | 241 | ## Step 6b: Helm Install with no 
authentication (no auth) 242 | 243 | ```bash 244 | helm upgrade --cleanup-on-fail \ 245 | --install $HELM_NAMESPACE-public jupyterhub/jupyterhub \ 246 | --namespace $HELM_NAMESPACE-public \ 247 | --create-namespace \ 248 | --version $HELM_CHART_VERSION \ 249 | --values workshop_infra/config_public.yaml 250 | 251 | 252 | kubectl --namespace=$HELM_NAMESPACE-public get pod 253 | 254 | kubectl --namespace=$HELM_NAMESPACE-public get svc proxy-public -o jsonpath='{.status.loadBalancer.ingress[].ip}' 255 | ``` 256 | 257 | Add the external IP printed above to your DNS records. 258 | 259 | 260 | ## Step 7: Cleanup (Helm Delete) 261 | 262 | ```bash 263 | 264 | helm delete $HELM_NAMESPACE --namespace $HELM_NAMESPACE 265 | kubectl delete namespace $HELM_NAMESPACE 266 | 267 | helm delete $HELM_NAMESPACE-public --namespace $HELM_NAMESPACE-public 268 | kubectl delete namespace $HELM_NAMESPACE-public 269 | 270 | 271 | gcloud container clusters delete $CLUSTER_NAME --region $REGION --project $GCP_PROJECT 272 | 273 | ``` --------------------------------------------------------------------------------