├── .dockerignore
├── .gitignore
├── archive
    ├── docker_milvus
    │   ├── Dockerfile
    │   ├── install.sh
    │   ├── readme.md
    │   └── supervisord.conf
    ├── notebooks
    │   ├── 01_bm25.ipynb
    │   ├── 02_dense_retriever_milvus.ipynb
    │   └── 04_ann-elastic.ipynb
    └── notebooks_stackoverflow
    │   ├── 00_data_fetch_bq.ipynb
    │   ├── 00_data_fetch_spark.ipynb
    │   ├── 01_b_setup.ipynb
    │   ├── 01_data_cleanup.ipynb
    │   ├── 01_data_subset.ipynb
    │   ├── 01_workshop_data_preview.ipynb
    │   ├── 02_retrieval_dense_milvus.ipynb
    │   ├── 02_retrieval_sparse.ipynb
    │   ├── 03_comparision.ipynb
    │   ├── ann_benchmark_recall.ipynb
    │   ├── metrics_utils.py
    │   ├── other__retrieve_rerank_simple_wikipedia.ipynb
    │   ├── test_setup.ipynb
    │   └── workshop_setup.ipynb
├── assets
    ├── all_assets.sw
    ├── slides_odsc2022.pdf
    ├── slides_pydatanyc2022.pdf
    └── slides_pydataseattle2023.pdf
├── docker-compose.yaml
├── docs
    ├── internal_notes.md
    └── slide_notes.md
├── environment.yaml
├── notebooks
    ├── 00_a_setup_dataset.ipynb
    ├── 00_b_setup_stats.ipynb
    ├── 00_c_sample_images.ipynb
    ├── 01_bm25_elastic.ipynb
    ├── 02_dense_retriever.ipynb
    ├── 03_clip_embed.ipynb
    ├── 04_ann.ipynb
    └── workshop_setup.ipynb
├── readme.md
├── requirements.txt
└── workshop_infra
    ├── Dockerfile
    ├── cert
        └── .gitkeep
    ├── config.enc.yaml
    ├── config_public.yaml
    ├── scripts
        ├── build_setup_root.sh
        ├── build_setup_user.sh
        └── container_startup.sh
    └── setup.md


/.dockerignore:
--------------------------------------------------------------------------------
1 | data/
2 | workshop_infra/
3 | !workshop_infra/scripts/
4 | 
5 | 
6 | 
7 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | #Pipfile.lock
 93 | 
 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 95 | __pypackages__/
 96 | 
 97 | # Celery stuff
 98 | celerybeat-schedule
 99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 
131 | 
132 | data/
133 | 
134 | 
135 | workshop_infra/cert/*
136 | workshop_infra/config.yaml
137 | workshop_infra/key_file.json
138 | 
139 | *.db
140 | tmp/
141 | .DS_Store
142 | 
143 | !/**/.gitkeep
144 | workshop_infra/keyfile.json
145 | *.zip
146 | 


--------------------------------------------------------------------------------
/archive/docker_milvus/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Standalone Milvus image: supervisord runs milvus-minio, milvus-etcd and milvus in one container.
 2 | FROM ubuntu:18.04
 3 | 
 4 | # Version pin consumed by install.sh ("milvus=$MILVUS_VERSION").
 5 | # Written in key=value form; the space-separated `ENV key value` syntax is legacy.
 6 | ENV MILVUS_VERSION="2.1.4-1"
 7 | 
 8 | COPY install.sh /tmp/install.sh
 9 | 
10 | # Installs the Milvus apt package and supervisor.
11 | RUN bash /tmp/install.sh
12 | 
13 | 
14 | # Earlier s6-overlay experiment, kept for reference:
15 | # ARG S6_OVERLAY_VERSION=3.1.2.1
16 | 
17 | # ADD https://github.com/just-containers/s6-overlay/releases/download/v${S6_OVERLAY_VERSION}/s6-overlay-noarch.tar.xz /tmp
18 | # RUN tar -C / -Jxpf /tmp/s6-overlay-noarch.tar.xz
19 | # ADD https://github.com/just-containers/s6-overlay/releases/download/v${S6_OVERLAY_VERSION}/s6-overlay-x86_64.tar.xz /tmp
20 | # RUN tar -C / -Jxpf /tmp/s6-overlay-x86_64.tar.xz
21 | 
22 | 
23 | COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf
24 | 
25 | # supervisord.conf sets nodaemon=true, so supervisord stays in the foreground.
26 | CMD ["/usr/bin/supervisord"]
27 | 


--------------------------------------------------------------------------------
/archive/docker_milvus/install.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # Build-time installer for the Milvus image; the Dockerfile runs it via `bash /tmp/install.sh`.
 3 | # Installs the milvus apt package pinned to $MILVUS_VERSION, plus supervisor.
 4 | 
 5 | # Abort on the first failing command, unset variable or broken pipe so that a failed
 6 | # install fails the image build instead of producing a half-provisioned image.
 7 | set -euo pipefail
 8 | 
 9 | # The version pin below needs this variable; the Dockerfile supplies it through ENV.
10 | : "${MILVUS_VERSION:?MILVUS_VERSION must be set}"
11 | 
12 | # Keep apt/debconf from waiting on prompts that nobody can answer during a build.
13 | export DEBIAN_FRONTEND=noninteractive
14 | 
15 | apt-get update -y
16 | 
17 | # Provides add-apt-repository.
18 | apt-get install -y software-properties-common
19 | 
20 | # -y answers the confirmation prompt automatically.
21 | add-apt-repository -y ppa:milvusdb/milvus
22 | 
23 | # Refresh the package index so the newly added PPA is visible.
24 | apt-get update -y
25 | 
26 | apt-get install -y "milvus=$MILVUS_VERSION"
27 | 
28 | # Earlier s6-overlay experiment, kept for reference:
29 | #mkdir -p /etc/services.d/system/
30 | #cp /lib/systemd/system/milvus* /etc/services.d/system/
31 | #COPY resources/docker/services.d /etc/services.d
32 | 
33 | # supervisord is the container's process manager (see the Dockerfile CMD).
34 | apt-get install -y supervisor
35 | mkdir -p /var/log/supervisor


--------------------------------------------------------------------------------
/archive/docker_milvus/readme.md:
--------------------------------------------------------------------------------
 1 | https://github.com/just-containers/s6-overlay
 2 | 
 3 | 
 4 | 
 5 | cat /etc/services.d/system/milvus-etcd.service 
 6 | ExecStart=/usr/bin/milvus-etcd --data-dir /var/lib/milvus/etcd-data
 7 | 
 8 | 
 9 | cat /etc/services.d/system/milvus-minio.service 
10 | ExecStart=/usr/bin/milvus-minio server /var/lib/milvus/minio-data
11 | 
12 | 
13 | 
14 | cat /etc/services.d/system/milvus.service 
15 | 
16 | Environment=MILVUSCONF=/etc/milvus/configs/
17 | ExecStart=/usr/bin/milvus run standalone
18 | 
19 | 
20 | 
21 | https://gdevillele.github.io/engine/admin/using_supervisord/


--------------------------------------------------------------------------------
/archive/docker_milvus/supervisord.conf:
--------------------------------------------------------------------------------
 1 | [supervisord]
 2 | nodaemon=true
 3 | 
 4 | [program:milvus-minio]
 5 | command=/usr/bin/milvus-minio server /var/lib/milvus/minio-data
 6 | 
 7 | [program:milvus-etcd]
 8 | command=/usr/bin/milvus-etcd --data-dir /var/lib/milvus/etcd-data
 9 | 
10 | 
11 | [program:milvus]
12 | environment=MILVUSCONF=/etc/milvus/configs/
13 | command=/usr/bin/milvus run standalone


--------------------------------------------------------------------------------
/archive/notebooks_stackoverflow/00_data_fetch_bq.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 2,
  6 |    "id": "2d9c002c-9ba7-48cb-83a5-3d2903056d43",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "import modin.pandas as pd\n",
 11 |     "import re\n",
 12 |     "import lxml.html\n"
 13 |    ]
 14 |   },
 15 |   {
 16 |    "cell_type": "code",
 17 |    "execution_count": 3,
 18 |    "id": "05461bbb-02f3-4749-b6ca-dba3a02bf1e8",
 19 |    "metadata": {},
 20 |    "outputs": [
 21 |     {
 22 |      "name": "stderr",
 23 |      "output_type": "stream",
 24 |      "text": [
 25 |       "UserWarning: Ray execution environment not yet initialized. Initializing...\n",
 26 |       "To remove this warning, run the following python code before doing dataframe operations:\n",
 27 |       "\n",
 28 |       "    import ray\n",
 29 |       "    ray.init()\n",
 30 |       "\n",
 31 |       "UserWarning: `read_gbq` defaulting to pandas implementation.\n",
 32 |       "To request implementation, send an email to feature_requests@modin.org.\n"
 33 |      ]
 34 |     },
 35 |     {
 36 |      "data": {
 37 |       "text/html": [
 38 |        "<div>\n",
 39 |        "<style scoped>\n",
 40 |        "    .dataframe tbody tr th:only-of-type {\n",
 41 |        "        vertical-align: middle;\n",
 42 |        "    }\n",
 43 |        "\n",
 44 |        "    .dataframe tbody tr th {\n",
 45 |        "        vertical-align: top;\n",
 46 |        "    }\n",
 47 |        "\n",
 48 |        "    .dataframe thead th {\n",
 49 |        "        text-align: right;\n",
 50 |        "    }\n",
 51 |        "</style>\n",
 52 |        "<table border=\"1\" class=\"dataframe\">\n",
 53 |        "  <thead>\n",
 54 |        "    <tr style=\"text-align: right;\">\n",
 55 |        "      <th></th>\n",
 56 |        "      <th>Id</th>\n",
 57 |        "      <th>Type</th>\n",
 58 |        "    </tr>\n",
 59 |        "  </thead>\n",
 60 |        "  <tbody>\n",
 61 |        "    <tr>\n",
 62 |        "      <th>0</th>\n",
 63 |        "      <td>1</td>\n",
 64 |        "      <td>Question</td>\n",
 65 |        "    </tr>\n",
 66 |        "    <tr>\n",
 67 |        "      <th>1</th>\n",
 68 |        "      <td>2</td>\n",
 69 |        "      <td>Answer</td>\n",
 70 |        "    </tr>\n",
 71 |        "    <tr>\n",
 72 |        "      <th>2</th>\n",
 73 |        "      <td>3</td>\n",
 74 |        "      <td>Wiki</td>\n",
 75 |        "    </tr>\n",
 76 |        "    <tr>\n",
 77 |        "      <th>3</th>\n",
 78 |        "      <td>4</td>\n",
 79 |        "      <td>TagWikiExcerpt</td>\n",
 80 |        "    </tr>\n",
 81 |        "    <tr>\n",
 82 |        "      <th>4</th>\n",
 83 |        "      <td>5</td>\n",
 84 |        "      <td>TagWiki</td>\n",
 85 |        "    </tr>\n",
 86 |        "    <tr>\n",
 87 |        "      <th>5</th>\n",
 88 |        "      <td>6</td>\n",
 89 |        "      <td>ModeratorNomination</td>\n",
 90 |        "    </tr>\n",
 91 |        "    <tr>\n",
 92 |        "      <th>6</th>\n",
 93 |        "      <td>7</td>\n",
 94 |        "      <td>WikiPlaceholder</td>\n",
 95 |        "    </tr>\n",
 96 |        "    <tr>\n",
 97 |        "      <th>7</th>\n",
 98 |        "      <td>8</td>\n",
 99 |        "      <td>PrivilegeWiki</td>\n",
100 |        "    </tr>\n",
101 |        "  </tbody>\n",
102 |        "</table>\n",
103 |        "</div>"
104 |       ],
105 |       "text/plain": [
106 |        "   Id                 Type\n",
107 |        "0   1             Question\n",
108 |        "1   2               Answer\n",
109 |        "2   3                 Wiki\n",
110 |        "3   4       TagWikiExcerpt\n",
111 |        "4   5              TagWiki\n",
112 |        "5   6  ModeratorNomination\n",
113 |        "6   7      WikiPlaceholder\n",
114 |        "7   8        PrivilegeWiki"
115 |       ]
116 |      },
117 |      "execution_count": 3,
118 |      "metadata": {},
119 |      "output_type": "execute_result"
120 |     }
121 |    ],
122 |    "source": [
123 |     "pd.read_gbq(f\"\"\"\n",
124 |     "select *\n",
125 |     "FROM`sotorrent-org.2020_12_31.PostType`\n",
126 |     "\n",
127 |     "\"\"\", use_bqstorage_api=True)"
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "code",
132 |    "execution_count": 4,
133 |    "id": "df33775c-cdb8-4fe2-8457-e8642c8265c5",
134 |    "metadata": {},
135 |    "outputs": [
136 |     {
137 |      "name": "stderr",
138 |      "output_type": "stream",
139 |      "text": [
140 |       "UserWarning: `read_gbq` defaulting to pandas implementation.\n"
141 |      ]
142 |     }
143 |    ],
144 |    "source": [
145 |     "df_raw = pd.read_gbq (f\"\"\"\n",
146 |     "\n",
147 |     "with qn as (\n",
148 |     " select Id , AcceptedAnswerId, Title, Body as QuestionBody, Tags , ViewCount, AnswerCount,   CommentCount , Score, CreationDate\n",
149 |     " FROM `sotorrent-org.2020_12_31.Posts` \n",
150 |     " where PostTypeId = 1\n",
151 |     "),\n",
152 |     "ans as (\n",
153 |     " select Id , Body as AnswerBody\n",
154 |     " FROM `sotorrent-org.2020_12_31.Posts` \n",
155 |     " where PostTypeId = 2\n",
156 |     ")\n",
157 |     "\n",
158 |     "\n",
159 |     "SELECT qn.*, ans.AnswerBody\n",
160 |     "From qn \n",
161 |     "inner join ans \n",
162 |     "on qn.AcceptedAnswerId = ans.Id\n",
163 |     "\n",
164 |     "\"\"\", use_bqstorage_api=True)\n"
165 |    ]
166 |   },
167 |   {
168 |    "cell_type": "code",
169 |    "execution_count": 5,
170 |    "id": "51c4dd43-b3be-4253-8b1c-76eb854d4668",
171 |    "metadata": {},
172 |    "outputs": [
173 |     {
174 |      "data": {
175 |       "text/html": [
176 |        "<div>\n",
177 |        "<style scoped>\n",
178 |        "    .dataframe tbody tr th:only-of-type {\n",
179 |        "        vertical-align: middle;\n",
180 |        "    }\n",
181 |        "\n",
182 |        "    .dataframe tbody tr th {\n",
183 |        "        vertical-align: top;\n",
184 |        "    }\n",
185 |        "\n",
186 |        "    .dataframe thead th {\n",
187 |        "        text-align: right;\n",
188 |        "    }\n",
189 |        "</style>\n",
190 |        "<table border=\"1\" class=\"dataframe\">\n",
191 |        "  <thead>\n",
192 |        "    <tr style=\"text-align: right;\">\n",
193 |        "      <th></th>\n",
194 |        "      <th>Id</th>\n",
195 |        "      <th>AcceptedAnswerId</th>\n",
196 |        "      <th>Title</th>\n",
197 |        "      <th>QuestionBody</th>\n",
198 |        "      <th>Tags</th>\n",
199 |        "      <th>ViewCount</th>\n",
200 |        "      <th>AnswerCount</th>\n",
201 |        "      <th>CommentCount</th>\n",
202 |        "      <th>Score</th>\n",
203 |        "      <th>CreationDate</th>\n",
204 |        "      <th>AnswerBody</th>\n",
205 |        "    </tr>\n",
206 |        "  </thead>\n",
207 |        "  <tbody>\n",
208 |        "    <tr>\n",
209 |        "      <th>0</th>\n",
210 |        "      <td>22486469</td>\n",
211 |        "      <td>22488014</td>\n",
212 |        "      <td>Memory Mapping Large File Haskell</td>\n",
213 |        "      <td>&lt;p&gt;I am experimenting with the Haskell mmap pa...</td>\n",
214 |        "      <td>&lt;haskell&gt;</td>\n",
215 |        "      <td>566</td>\n",
216 |        "      <td>1</td>\n",
217 |        "      <td>1</td>\n",
218 |        "      <td>10</td>\n",
219 |        "      <td>2014-03-18 17:18:08</td>\n",
220 |        "      <td>&lt;p&gt;Looks like a typo. If I replace this:&lt;/p&gt;\\n...</td>\n",
221 |        "    </tr>\n",
222 |        "    <tr>\n",
223 |        "      <th>1</th>\n",
224 |        "      <td>20902775</td>\n",
225 |        "      <td>20902933</td>\n",
226 |        "      <td>How to check if auto-rotate screen setting is ...</td>\n",
227 |        "      <td>&lt;p&gt;I think each android device has an abitily ...</td>\n",
228 |        "      <td>&lt;java&gt;&lt;android&gt;</td>\n",
229 |        "      <td>11201</td>\n",
230 |        "      <td>3</td>\n",
231 |        "      <td>3</td>\n",
232 |        "      <td>12</td>\n",
233 |        "      <td>2014-01-03 11:37:35</td>\n",
234 |        "      <td>&lt;p&gt;Hope this code snippet helps you out:-&lt;/p&gt;\\...</td>\n",
235 |        "    </tr>\n",
236 |        "    <tr>\n",
237 |        "      <th>2</th>\n",
238 |        "      <td>39613023</td>\n",
239 |        "      <td>39623807</td>\n",
240 |        "      <td>Understanding the FFT output</td>\n",
241 |        "      <td>&lt;p&gt;I'm currently occupied in a practicum and m...</td>\n",
242 |        "      <td>&lt;java&gt;&lt;fft&gt;</td>\n",
243 |        "      <td>277</td>\n",
244 |        "      <td>1</td>\n",
245 |        "      <td>0</td>\n",
246 |        "      <td>-4</td>\n",
247 |        "      <td>2016-09-21 09:46:43</td>\n",
248 |        "      <td>&lt;p&gt;Computing a 512-point fourier transform aft...</td>\n",
249 |        "    </tr>\n",
250 |        "    <tr>\n",
251 |        "      <th>3</th>\n",
252 |        "      <td>2770630</td>\n",
253 |        "      <td>2771563</td>\n",
254 |        "      <td>PDO::fetchAll vs. PDO::fetch in a loop</td>\n",
255 |        "      <td>&lt;p&gt;Just a quick question.&lt;/p&gt;\\n\\n&lt;p&gt;Is there a...</td>\n",
256 |        "      <td>&lt;php&gt;&lt;mysql&gt;&lt;pdo&gt;&lt;fetch&gt;</td>\n",
257 |        "      <td>86006</td>\n",
258 |        "      <td>7</td>\n",
259 |        "      <td>1</td>\n",
260 |        "      <td>72</td>\n",
261 |        "      <td>2010-05-05 04:31:40</td>\n",
262 |        "      <td>&lt;p&gt;Little benchmark with 200k random records. ...</td>\n",
263 |        "    </tr>\n",
264 |        "    <tr>\n",
265 |        "      <th>4</th>\n",
266 |        "      <td>31725206</td>\n",
267 |        "      <td>40180517</td>\n",
268 |        "      <td>Unable to Flash eMMC from SD Card BeagleBone B...</td>\n",
269 |        "      <td>&lt;p&gt;I am working on BeagleBone Black and Debian...</td>\n",
270 |        "      <td>&lt;debian&gt;&lt;beagleboneblack&gt;</td>\n",
271 |        "      <td>31664</td>\n",
272 |        "      <td>8</td>\n",
273 |        "      <td>3</td>\n",
274 |        "      <td>17</td>\n",
275 |        "      <td>2015-07-30 13:30:39</td>\n",
276 |        "      <td>&lt;p&gt;Did you remember to remove the \"#\" at the b...</td>\n",
277 |        "    </tr>\n",
278 |        "  </tbody>\n",
279 |        "</table>\n",
280 |        "</div>"
281 |       ],
282 |       "text/plain": [
283 |        "         Id  AcceptedAnswerId  \\\n",
284 |        "0  22486469          22488014   \n",
285 |        "1  20902775          20902933   \n",
286 |        "2  39613023          39623807   \n",
287 |        "3   2770630           2771563   \n",
288 |        "4  31725206          40180517   \n",
289 |        "\n",
290 |        "                                               Title  \\\n",
291 |        "0                  Memory Mapping Large File Haskell   \n",
292 |        "1  How to check if auto-rotate screen setting is ...   \n",
293 |        "2                       Understanding the FFT output   \n",
294 |        "3             PDO::fetchAll vs. PDO::fetch in a loop   \n",
295 |        "4  Unable to Flash eMMC from SD Card BeagleBone B...   \n",
296 |        "\n",
297 |        "                                        QuestionBody  \\\n",
298 |        "0  <p>I am experimenting with the Haskell mmap pa...   \n",
299 |        "1  <p>I think each android device has an abitily ...   \n",
300 |        "2  <p>I'm currently occupied in a practicum and m...   \n",
301 |        "3  <p>Just a quick question.</p>\\n\\n<p>Is there a...   \n",
302 |        "4  <p>I am working on BeagleBone Black and Debian...   \n",
303 |        "\n",
304 |        "                        Tags  ViewCount  AnswerCount  CommentCount  Score  \\\n",
305 |        "0                  <haskell>        566            1             1     10   \n",
306 |        "1            <java><android>      11201            3             3     12   \n",
307 |        "2                <java><fft>        277            1             0     -4   \n",
308 |        "3   <php><mysql><pdo><fetch>      86006            7             1     72   \n",
309 |        "4  <debian><beagleboneblack>      31664            8             3     17   \n",
310 |        "\n",
311 |        "         CreationDate                                         AnswerBody  \n",
312 |        "0 2014-03-18 17:18:08  <p>Looks like a typo. If I replace this:</p>\\n...  \n",
313 |        "1 2014-01-03 11:37:35  <p>Hope this code snippet helps you out:-</p>\\...  \n",
314 |        "2 2016-09-21 09:46:43  <p>Computing a 512-point fourier transform aft...  \n",
315 |        "3 2010-05-05 04:31:40  <p>Little benchmark with 200k random records. ...  \n",
316 |        "4 2015-07-30 13:30:39  <p>Did you remember to remove the \"#\" at the b...  "
317 |       ]
318 |      },
319 |      "execution_count": 5,
320 |      "metadata": {},
321 |      "output_type": "execute_result"
322 |     }
323 |    ],
324 |    "source": [
325 |     "df_raw.head()"
326 |    ]
327 |   },
328 |   {
329 |    "cell_type": "code",
330 |    "execution_count": null,
331 |    "id": "8546ce34-5cce-45e1-8b90-e178e98e7415",
332 |    "metadata": {},
333 |    "outputs": [],
334 |    "source": [
335 |     "df_raw.to_parquet(\"../data/df_raw\",index=False)"
336 |    ]
337 |   },
338 |   {
339 |    "cell_type": "code",
340 |    "execution_count": null,
341 |    "id": "4abb6cc3-040d-44b5-a5a5-68161b732b9e",
342 |    "metadata": {},
343 |    "outputs": [],
344 |    "source": [
345 |     "df_raw = pd.read_parquet(\"../data/df_raw\")"
346 |    ]
347 |   },
348 |   {
349 |    "cell_type": "code",
350 |    "execution_count": null,
351 |    "id": "c56f1bbd-6195-414a-adee-1075aede6aca",
352 |    "metadata": {},
353 |    "outputs": [],
354 |    "source": [
355 |     "len(df_raw)"
356 |    ]
357 |   },
358 |   {
359 |    "cell_type": "code",
360 |    "execution_count": null,
361 |    "id": "9cd18238-7318-44bb-8302-89232429028e",
362 |    "metadata": {},
363 |    "outputs": [],
364 |    "source": [
365 |     "def strip_html(s:str):\n",
366 |     "    try:\n",
367 |     "        return str(lxml.html.fromstring(s).text_content())\n",
368 |     "    except:\n",
369 |     "        return ''\n",
370 |     "\n",
371 |     "def parse_tags(content:str):\n",
372 |     "    return re.findall(r'<(.+?)>',content)\n"
373 |    ]
374 |   },
375 |   {
376 |    "cell_type": "code",
377 |    "execution_count": null,
378 |    "id": "8eda050e-3ab9-4555-9f7a-fe9cb190d824",
379 |    "metadata": {},
380 |    "outputs": [],
381 |    "source": [
382 |     "strip_html(f\"\"\"\n",
383 |     "\n",
384 |     "<p>I was asked to create a singleton that will...\t\n",
385 |     "\"\"\")"
386 |    ]
387 |   },
388 |   {
389 |    "cell_type": "code",
390 |    "execution_count": null,
391 |    "id": "a60b2cd6-2f02-447b-99ac-2de8598bb470",
392 |    "metadata": {},
393 |    "outputs": [],
394 |    "source": [
395 |     "df = df_raw"
396 |    ]
397 |   },
398 |   {
399 |    "cell_type": "code",
400 |    "execution_count": null,
401 |    "id": "f8fa3f43-d0c1-42ff-949c-cb7519e11454",
402 |    "metadata": {},
403 |    "outputs": [],
404 |    "source": [
405 |     "df['Body'] =  df['Body'].apply(strip_html)"
406 |    ]
407 |   },
408 |   {
409 |    "cell_type": "code",
410 |    "execution_count": null,
411 |    "id": "d957c0c3-aba6-451b-816a-67224b8e6578",
412 |    "metadata": {},
413 |    "outputs": [],
414 |    "source": [
415 |     "df['Tags'] = df['Tags'].apply(parse_tags)"
416 |    ]
417 |   },
418 |   {
419 |    "cell_type": "code",
420 |    "execution_count": null,
421 |    "id": "cb4c000b-0cd1-4d21-9de3-56589a04c40d",
422 |    "metadata": {},
423 |    "outputs": [],
424 |    "source": [
425 |     "df.to_parquet(\"../data/df_processed/\")  # df_final was never defined in this notebook; df holds the cleaned frame"
426 |    ]
427 |   }
428 |  ],
429 |  "metadata": {
430 |   "environment": {
431 |    "kernel": "python3",
432 |    "name": "pytorch-gpu.1-11.m94",
433 |    "type": "gcloud",
434 |    "uri": "gcr.io/deeplearning-platform-release/pytorch-gpu.1-11:m94"
435 |   },
436 |   "kernelspec": {
437 |    "display_name": "Python 3",
438 |    "language": "python",
439 |    "name": "python3"
440 |   },
441 |   "language_info": {
442 |    "codemirror_mode": {
443 |     "name": "ipython",
444 |     "version": 3
445 |    },
446 |    "file_extension": ".py",
447 |    "mimetype": "text/x-python",
448 |    "name": "python",
449 |    "nbconvert_exporter": "python",
450 |    "pygments_lexer": "ipython3",
451 |    "version": "3.7.12"
452 |   }
453 |  },
454 |  "nbformat": 4,
455 |  "nbformat_minor": 5
456 | }
457 | 


--------------------------------------------------------------------------------
/archive/notebooks_stackoverflow/01_b_setup.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "id": "4885264f-1d3f-4ad5-a29a-e338cf64e59c",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": []
 10 |   },
 11 |   {
 12 |    "cell_type": "code",
 13 |    "execution_count": 1,
 14 |    "id": "2d9c002c-9ba7-48cb-83a5-3d2903056d43",
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "import modin.pandas as pd\n",
 19 |     "import re\n",
 20 |     "import lxml.html\n",
 21 |     "import re"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": 2,
 27 |    "id": "ce8fc711-181c-4481-a6c5-fb580bf7e5d0",
 28 |    "metadata": {},
 29 |    "outputs": [
 30 |     {
 31 |      "name": "stderr",
 32 |      "output_type": "stream",
 33 |      "text": [
 34 |       "UserWarning: Ray execution environment not yet initialized. Initializing...\n",
 35 |       "To remove this warning, run the following python code before doing dataframe operations:\n",
 36 |       "\n",
 37 |       "    import ray\n",
 38 |       "    ray.init()\n",
 39 |       "\n",
 40 |       "\u001b[2m\u001b[33m(raylet)\u001b[0m /opt/conda/envs/stackoverflow/lib/python3.8/site-packages/ray/dashboard/agent.py:152: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n",
 41 |       "\u001b[2m\u001b[33m(raylet)\u001b[0m   if LooseVersion(aiohttp.__version__) < LooseVersion(\"4.0.0\"):\n",
 42 |       "\u001b[2m\u001b[33m(raylet)\u001b[0m /opt/conda/envs/stackoverflow/lib/python3.8/site-packages/ray/dashboard/agent.py:152: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n",
 43 |       "\u001b[2m\u001b[33m(raylet)\u001b[0m   if LooseVersion(aiohttp.__version__) < LooseVersion(\"4.0.0\"):\n",
 44 |       "\u001b[2m\u001b[33m(raylet)\u001b[0m /opt/conda/envs/stackoverflow/lib/python3.8/site-packages/ray/dashboard/agent.py:152: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n",
 45 |       "\u001b[2m\u001b[33m(raylet)\u001b[0m   if LooseVersion(aiohttp.__version__) < LooseVersion(\"4.0.0\"):\n",
 46 |       "\u001b[2m\u001b[33m(raylet)\u001b[0m /opt/conda/envs/stackoverflow/lib/python3.8/site-packages/ray/dashboard/agent.py:152: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n",
 47 |       "\u001b[2m\u001b[33m(raylet)\u001b[0m   if LooseVersion(aiohttp.__version__) < LooseVersion(\"4.0.0\"):\n",
 48 |       "\u001b[2m\u001b[33m(raylet)\u001b[0m /opt/conda/envs/stackoverflow/lib/python3.8/site-packages/ray/dashboard/agent.py:152: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n",
 49 |       "\u001b[2m\u001b[33m(raylet)\u001b[0m   if LooseVersion(aiohttp.__version__) < LooseVersion(\"4.0.0\"):\n",
 50 |       "2022-10-01 21:40:38,955\tWARNING worker.py:1257 -- (ip=10.52.136.102) The agent on node nup0013-dl failed to be restarted 5 times. There are 3 possible problems if you see this error.\n",
 51 |       "  1. The dashboard might not display correct information on this node.\n",
 52 |       "  2. Metrics on this node won't be reported.\n",
 53 |       "  3. runtime_env APIs won't work.\n",
 54 |       "Check out the `dashboard_agent.log` to see the detailed failure messages.\n"
 55 |      ]
 56 |     }
 57 |    ],
 58 |    "source": [
 59 |     "df_raw = pd.read_parquet(\"../data/df_raw/\")"
 60 |    ]
 61 |   },
 62 |   {
 63 |    "cell_type": "code",
 64 |    "execution_count": 3,
 65 |    "id": "e0a22a80-c5c6-4bd7-8546-ffe39db0b7c6",
 66 |    "metadata": {},
 67 |    "outputs": [
 68 |     {
 69 |      "data": {
 70 |       "text/plain": [
 71 |        "Index(['Id', 'AcceptedAnswerId', 'Title', 'Body', 'Tags', 'ViewCount',\n",
 72 |        "       'AnswerCount', 'CommentCount', 'Score', 'CreationDate', 'AnswerBody'],\n",
 73 |        "      dtype='object')"
 74 |       ]
 75 |      },
 76 |      "execution_count": 3,
 77 |      "metadata": {},
 78 |      "output_type": "execute_result"
 79 |     }
 80 |    ],
 81 |    "source": [
 82 |     "df_raw.columns"
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "code",
 87 |    "execution_count": 4,
 88 |    "id": "57723608-d170-419b-b1f0-85dde823485f",
 89 |    "metadata": {},
 90 |    "outputs": [
 91 |     {
 92 |      "name": "stdout",
 93 |      "output_type": "stream",
 94 |      "text": [
 95 |       "00_data_fetch_bq.ipynb\t   01_b_setup_new.ipynb   02_indexing_faiss.ipynb  old\n",
 96 |       "00_data_fetch_spark.ipynb  01_data_cleanup.ipynb  03_searching_es.ipynb\n",
 97 |       "01_b_setup.ipynb\t   02_indexing_es.ipynb   Untitled.ipynb\n"
 98 |      ]
 99 |     }
100 |    ],
101 |    "source": [
102 |     "!ls"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "code",
107 |    "execution_count": 5,
108 |    "id": "9d757c53-4328-4baf-a3d1-f26dffd00ca4",
109 |    "metadata": {},
110 |    "outputs": [
111 |     {
112 |      "data": {
113 |       "text/html": [
114 |        "<div>\n",
115 |        "<style scoped>\n",
116 |        "    .dataframe tbody tr th:only-of-type {\n",
117 |        "        vertical-align: middle;\n",
118 |        "    }\n",
119 |        "\n",
120 |        "    .dataframe tbody tr th {\n",
121 |        "        vertical-align: top;\n",
122 |        "    }\n",
123 |        "\n",
124 |        "    .dataframe thead th {\n",
125 |        "        text-align: right;\n",
126 |        "    }\n",
127 |        "</style>\n",
128 |        "<table border=\"1\" class=\"dataframe\">\n",
129 |        "  <thead>\n",
130 |        "    <tr style=\"text-align: right;\">\n",
131 |        "      <th></th>\n",
132 |        "      <th>Id</th>\n",
133 |        "      <th>AcceptedAnswerId</th>\n",
134 |        "      <th>Title</th>\n",
135 |        "      <th>Body</th>\n",
136 |        "      <th>Tags</th>\n",
137 |        "      <th>ViewCount</th>\n",
138 |        "      <th>AnswerCount</th>\n",
139 |        "      <th>CommentCount</th>\n",
140 |        "      <th>Score</th>\n",
141 |        "      <th>CreationDate</th>\n",
142 |        "      <th>AnswerBody</th>\n",
143 |        "    </tr>\n",
144 |        "  </thead>\n",
145 |        "  <tbody>\n",
146 |        "    <tr>\n",
147 |        "      <th>0</th>\n",
148 |        "      <td>17123652</td>\n",
149 |        "      <td>17124724</td>\n",
150 |        "      <td>hierarchical encryption scheme</td>\n",
151 |        "      <td>&lt;p&gt;I am in need of the \"hierarchical\" encrypti...</td>\n",
152 |        "      <td>&lt;cryptography&gt;&lt;key&gt;&lt;hierarchical&gt;</td>\n",
153 |        "      <td>631</td>\n",
154 |        "      <td>1</td>\n",
155 |        "      <td>0</td>\n",
156 |        "      <td>2</td>\n",
157 |        "      <td>2013-06-15 12:29:50.987</td>\n",
158 |        "      <td>&lt;p&gt;A partial solution.  You own the master key...</td>\n",
159 |        "    </tr>\n",
160 |        "    <tr>\n",
161 |        "      <th>1</th>\n",
162 |        "      <td>44577139</td>\n",
163 |        "      <td>44577209</td>\n",
164 |        "      <td>Uncaught TypeError: this.source is not a function</td>\n",
165 |        "      <td>&lt;p&gt;I want to prelaod all the customers and giv...</td>\n",
166 |        "      <td>&lt;ajax&gt;&lt;jquery-ui&gt;</td>\n",
167 |        "      <td>3396</td>\n",
168 |        "      <td>1</td>\n",
169 |        "      <td>0</td>\n",
170 |        "      <td>1</td>\n",
171 |        "      <td>2017-06-15 21:14:46.990</td>\n",
172 |        "      <td>&lt;p&gt;Initialize &lt;strong&gt;autocomplete&lt;/strong&gt; af...</td>\n",
173 |        "    </tr>\n",
174 |        "    <tr>\n",
175 |        "      <th>2</th>\n",
176 |        "      <td>45162881</td>\n",
177 |        "      <td>45162984</td>\n",
178 |        "      <td>Class App\\Http\\Controllers\\ does not exist</td>\n",
179 |        "      <td>&lt;p&gt;This is my Route: &lt;/p&gt;\\n\\n&lt;pre&gt;&lt;code&gt; Route...</td>\n",
180 |        "      <td>&lt;php&gt;&lt;laravel&gt;&lt;controller&gt;&lt;routes&gt;</td>\n",
181 |        "      <td>32241</td>\n",
182 |        "      <td>3</td>\n",
183 |        "      <td>1</td>\n",
184 |        "      <td>2</td>\n",
185 |        "      <td>2017-07-18 09:35:26.630</td>\n",
186 |        "      <td>&lt;p&gt;At the first of controller you do not need ...</td>\n",
187 |        "    </tr>\n",
188 |        "    <tr>\n",
189 |        "      <th>3</th>\n",
190 |        "      <td>1887841</td>\n",
191 |        "      <td>1890092</td>\n",
192 |        "      <td>Grails startup is slow</td>\n",
193 |        "      <td>&lt;p&gt;Help! I'm porting a large ruby app to Grail...</td>\n",
194 |        "      <td>&lt;grails&gt;</td>\n",
195 |        "      <td>9592</td>\n",
196 |        "      <td>5</td>\n",
197 |        "      <td>0</td>\n",
198 |        "      <td>28</td>\n",
199 |        "      <td>2009-12-11 12:43:03.790</td>\n",
200 |        "      <td>&lt;p&gt;Unfortunately, I am not sure too much can b...</td>\n",
201 |        "    </tr>\n",
202 |        "    <tr>\n",
203 |        "      <th>4</th>\n",
204 |        "      <td>8151129</td>\n",
205 |        "      <td>8151158</td>\n",
206 |        "      <td>AlertDialog - trying to understand this syntax</td>\n",
207 |        "      <td>&lt;p&gt;This is code from the book sample:&lt;/p&gt;\\n\\n&lt;...</td>\n",
208 |        "      <td>&lt;java&gt;&lt;android&gt;</td>\n",
209 |        "      <td>490</td>\n",
210 |        "      <td>5</td>\n",
211 |        "      <td>1</td>\n",
212 |        "      <td>2</td>\n",
213 |        "      <td>2011-11-16 11:46:42.137</td>\n",
214 |        "      <td>&lt;pre&gt;&lt;code&gt;// Create a builder\\nAlertDialog.Bu...</td>\n",
215 |        "    </tr>\n",
216 |        "  </tbody>\n",
217 |        "</table>\n",
218 |        "</div>"
219 |       ],
220 |       "text/plain": [
221 |        "         Id  AcceptedAnswerId  \\\n",
222 |        "0  17123652          17124724   \n",
223 |        "1  44577139          44577209   \n",
224 |        "2  45162881          45162984   \n",
225 |        "3   1887841           1890092   \n",
226 |        "4   8151129           8151158   \n",
227 |        "\n",
228 |        "                                               Title  \\\n",
229 |        "0                     hierarchical encryption scheme   \n",
230 |        "1  Uncaught TypeError: this.source is not a function   \n",
231 |        "2         Class App\\Http\\Controllers\\ does not exist   \n",
232 |        "3                             Grails startup is slow   \n",
233 |        "4     AlertDialog - trying to understand this syntax   \n",
234 |        "\n",
235 |        "                                                Body  \\\n",
236 |        "0  <p>I am in need of the \"hierarchical\" encrypti...   \n",
237 |        "1  <p>I want to prelaod all the customers and giv...   \n",
238 |        "2  <p>This is my Route: </p>\\n\\n<pre><code> Route...   \n",
239 |        "3  <p>Help! I'm porting a large ruby app to Grail...   \n",
240 |        "4  <p>This is code from the book sample:</p>\\n\\n<...   \n",
241 |        "\n",
242 |        "                                 Tags  ViewCount  AnswerCount  CommentCount  \\\n",
243 |        "0   <cryptography><key><hierarchical>        631            1             0   \n",
244 |        "1                   <ajax><jquery-ui>       3396            1             0   \n",
245 |        "2  <php><laravel><controller><routes>      32241            3             1   \n",
246 |        "3                            <grails>       9592            5             0   \n",
247 |        "4                     <java><android>        490            5             1   \n",
248 |        "\n",
249 |        "   Score            CreationDate  \\\n",
250 |        "0      2 2013-06-15 12:29:50.987   \n",
251 |        "1      1 2017-06-15 21:14:46.990   \n",
252 |        "2      2 2017-07-18 09:35:26.630   \n",
253 |        "3     28 2009-12-11 12:43:03.790   \n",
254 |        "4      2 2011-11-16 11:46:42.137   \n",
255 |        "\n",
256 |        "                                          AnswerBody  \n",
257 |        "0  <p>A partial solution.  You own the master key...  \n",
258 |        "1  <p>Initialize <strong>autocomplete</strong> af...  \n",
259 |        "2  <p>At the first of controller you do not need ...  \n",
260 |        "3  <p>Unfortunately, I am not sure too much can b...  \n",
261 |        "4  <pre><code>// Create a builder\\nAlertDialog.Bu...  "
262 |       ]
263 |      },
264 |      "execution_count": 5,
265 |      "metadata": {},
266 |      "output_type": "execute_result"
267 |     }
268 |    ],
269 |    "source": [
270 |     "df_raw.head()"
271 |    ]
272 |   },
273 |   {
274 |    "cell_type": "code",
275 |    "execution_count": null,
276 |    "id": "bcf8431e-c037-429f-bc53-13a8214be375",
277 |    "metadata": {},
278 |    "outputs": [],
279 |    "source": []
280 |   },
281 |   {
282 |    "cell_type": "code",
283 |    "execution_count": 6,
284 |    "id": "72644750-923e-4c69-8ba1-581e2929540d",
285 |    "metadata": {},
286 |    "outputs": [],
287 |    "source": [
288 |     "regex = r\"\"\"\n",
289 |     "\t<pre\\b[^>]*>.*?</pre>\n",
290 |     "\t\"\"\"\n",
291 |     "\n",
292 |     "def clean_text(snippet:str):\n",
293 |     "    snippet = re.sub(pattern=regex, repl = '[CODE]', string = snippet,  flags = re.IGNORECASE | re.DOTALL | re.MULTILINE | re.VERBOSE )\n",
294 |     "    \n",
295 |     "    snippet = str(lxml.html.fromstring(snippet).text_content())\n",
296 |     "    \n",
297 |     "    return snippet\n",
298 |     "\n",
299 |     "def parse_tags(content:str):\n",
300 |     "    return re.findall(r'<(.+?)>',content)"
301 |    ]
302 |   },
303 |   {
304 |    "cell_type": "code",
305 |    "execution_count": null,
306 |    "id": "99a5b7bf-694f-4798-bd6a-d41f4ec7dfeb",
307 |    "metadata": {},
308 |    "outputs": [],
309 |    "source": [
310 |     "\n",
311 |     "\n"
312 |    ]
313 |   },
314 |   {
315 |    "cell_type": "code",
316 |    "execution_count": 7,
317 |    "id": "8eda050e-3ab9-4555-9f7a-fe9cb190d824",
318 |    "metadata": {},
319 |    "outputs": [
320 |     {
321 |      "data": {
322 |       "text/plain": [
323 |        "'I was asked to create a singleton that will...\\n\\n[CODE]\\n\\n test \\n\\n'"
324 |       ]
325 |      },
326 |      "execution_count": 7,
327 |      "metadata": {},
328 |      "output_type": "execute_result"
329 |     }
330 |    ],
331 |    "source": [
332 |     "clean_text(f\"\"\"\n",
333 |     "\n",
334 |     "<p>I was asked to create a singleton that will..</p>.\n",
335 |     "\n",
336 |     "<pre><code>KDF </code></pre>\n",
337 |     "\n",
338 |     "<p> test </p>\n",
339 |     "\n",
340 |     "\"\"\")"
341 |    ]
342 |   },
343 |   {
344 |    "cell_type": "code",
345 |    "execution_count": null,
346 |    "id": "a9a49fef-5499-40de-9bea-01301ce4e339",
347 |    "metadata": {},
348 |    "outputs": [],
349 |    "source": []
350 |   },
351 |   {
352 |    "cell_type": "code",
353 |    "execution_count": 8,
354 |    "id": "a60b2cd6-2f02-447b-99ac-2de8598bb470",
355 |    "metadata": {},
356 |    "outputs": [],
357 |    "source": [
358 |     "df = df_raw"
359 |    ]
360 |   },
361 |   {
362 |    "cell_type": "code",
363 |    "execution_count": 9,
364 |    "id": "f8fa3f43-d0c1-42ff-949c-cb7519e11454",
365 |    "metadata": {},
366 |    "outputs": [],
367 |    "source": [
368 |     "df['QuestionBody'] =  df['Body'].apply(clean_text)"
369 |    ]
370 |   },
371 |   {
372 |    "cell_type": "code",
373 |    "execution_count": 10,
374 |    "id": "d45cc4c4-0142-4d7a-abad-7bd941a331a1",
375 |    "metadata": {},
376 |    "outputs": [],
377 |    "source": [
378 |     "df['AnswerBody'] =  df['AnswerBody'].apply(clean_text)"
379 |    ]
380 |   },
381 |   {
382 |    "cell_type": "code",
383 |    "execution_count": 11,
384 |    "id": "d957c0c3-aba6-451b-816a-67224b8e6578",
385 |    "metadata": {},
386 |    "outputs": [],
387 |    "source": [
388 |     "df['Tags'] = df['Tags'].apply(parse_tags)"
389 |    ]
390 |   },
391 |   {
392 |    "cell_type": "code",
393 |    "execution_count": 12,
394 |    "id": "cb4c000b-0cd1-4d21-9de3-56589a04c40d",
395 |    "metadata": {},
396 |    "outputs": [],
397 |    "source": [
398 |     "df.to_parquet(\"../data/df_processed/\")"
399 |    ]
400 |   },
401 |   {
402 |    "cell_type": "code",
403 |    "execution_count": null,
404 |    "id": "f7dc6fdc-8d41-4f9b-bffa-0d8c5274f9f1",
405 |    "metadata": {},
406 |    "outputs": [],
407 |    "source": []
408 |   },
409 |   {
410 |    "cell_type": "code",
411 |    "execution_count": 13,
412 |    "id": "bcb300a3-53cb-4454-b472-c9cc422f6cc4",
413 |    "metadata": {},
414 |    "outputs": [
415 |     {
416 |      "name": "stdout",
417 |      "output_type": "stream",
418 |      "text": [
419 |       "part-0000.snappy.parquet  part-0006.snappy.parquet  part-0012.snappy.parquet\n",
420 |       "part-0001.snappy.parquet  part-0007.snappy.parquet  part-0013.snappy.parquet\n",
421 |       "part-0002.snappy.parquet  part-0008.snappy.parquet  part-0014.snappy.parquet\n",
422 |       "part-0003.snappy.parquet  part-0009.snappy.parquet  part-0015.snappy.parquet\n",
423 |       "part-0004.snappy.parquet  part-0010.snappy.parquet\n",
424 |       "part-0005.snappy.parquet  part-0011.snappy.parquet\n"
425 |      ]
426 |     }
427 |    ],
428 |    "source": [
429 |     "!ls ../data/df_processed/"
430 |    ]
431 |   },
432 |   {
433 |    "cell_type": "code",
434 |    "execution_count": null,
435 |    "id": "c722b250-f4ac-4523-adac-280b34dc3209",
436 |    "metadata": {},
437 |    "outputs": [],
438 |    "source": []
439 |   },
440 |   {
441 |    "cell_type": "code",
442 |    "execution_count": null,
443 |    "id": "0569dbf8-8a79-42a2-b854-3f0628e8275d",
444 |    "metadata": {},
445 |    "outputs": [],
446 |    "source": []
447 |   }
448 |  ],
449 |  "metadata": {
450 |   "environment": {
451 |    "kernel": "stackoverflow",
452 |    "name": "pytorch-gpu.1-11.m94",
453 |    "type": "gcloud",
454 |    "uri": "gcr.io/deeplearning-platform-release/pytorch-gpu.1-11:m94"
455 |   },
456 |   "kernelspec": {
457 |    "display_name": "Python 3.8.5 ('py38')",
458 |    "language": "python",
459 |    "name": "python3"
460 |   },
461 |   "language_info": {
462 |    "codemirror_mode": {
463 |     "name": "ipython",
464 |     "version": 3
465 |    },
466 |    "file_extension": ".py",
467 |    "mimetype": "text/x-python",
468 |    "name": "python",
469 |    "nbconvert_exporter": "python",
470 |    "pygments_lexer": "ipython3",
471 |    "version": "3.8.5"
472 |   },
473 |   "vscode": {
474 |    "interpreter": {
475 |     "hash": "aefe80b7c360a2b6e560f9a0dcb6ff028291678d8b74cab0042c4a74d0e7253b"
476 |    }
477 |   }
478 |  },
479 |  "nbformat": 4,
480 |  "nbformat_minor": 5
481 | }
482 | 


--------------------------------------------------------------------------------
/archive/notebooks_stackoverflow/01_data_subset.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "id": "776a1f38-5ec7-4478-b392-bb943274b958",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "import pandas as pd"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": 2,
 16 |    "id": "a462938f-432c-48cc-b7ae-a20f4df6c3ff",
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "df_posts = pd.read_parquet(\"gs://np-training-tmp/stackoverflow/final/posts.parquet\")"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "code",
 25 |    "execution_count": 3,
 26 |    "id": "f155aa49-6b8d-4056-ad64-eea6fb96cb19",
 27 |    "metadata": {},
 28 |    "outputs": [
 29 |     {
 30 |      "data": {
 31 |       "text/html": [
 32 |        "<div>\n",
 33 |        "<style scoped>\n",
 34 |        "    .dataframe tbody tr th:only-of-type {\n",
 35 |        "        vertical-align: middle;\n",
 36 |        "    }\n",
 37 |        "\n",
 38 |        "    .dataframe tbody tr th {\n",
 39 |        "        vertical-align: top;\n",
 40 |        "    }\n",
 41 |        "\n",
 42 |        "    .dataframe thead th {\n",
 43 |        "        text-align: right;\n",
 44 |        "    }\n",
 45 |        "</style>\n",
 46 |        "<table border=\"1\" class=\"dataframe\">\n",
 47 |        "  <thead>\n",
 48 |        "    <tr style=\"text-align: right;\">\n",
 49 |        "      <th></th>\n",
 50 |        "      <th>Id</th>\n",
 51 |        "      <th>AcceptedAnswerId</th>\n",
 52 |        "      <th>Title</th>\n",
 53 |        "      <th>QuestionBody</th>\n",
 54 |        "      <th>Tags</th>\n",
 55 |        "      <th>ViewCount</th>\n",
 56 |        "      <th>AnswerCount</th>\n",
 57 |        "      <th>CommentCount</th>\n",
 58 |        "      <th>Score</th>\n",
 59 |        "      <th>CreationDate</th>\n",
 60 |        "      <th>AnswerId</th>\n",
 61 |        "      <th>AcceptedAnswerBody</th>\n",
 62 |        "    </tr>\n",
 63 |        "  </thead>\n",
 64 |        "  <tbody>\n",
 65 |        "    <tr>\n",
 66 |        "      <th>0</th>\n",
 67 |        "      <td>33760194</td>\n",
 68 |        "      <td>NaN</td>\n",
 69 |        "      <td>Python How to burning discs with the monitorin...</td>\n",
 70 |        "      <td>I'm writing the programm on Python with module...</td>\n",
 71 |        "      <td>[python, event-handling, progressmonitor]</td>\n",
 72 |        "      <td>491</td>\n",
 73 |        "      <td>0</td>\n",
 74 |        "      <td>2</td>\n",
 75 |        "      <td>0</td>\n",
 76 |        "      <td>2015-11-17 15:02:09.103</td>\n",
 77 |        "      <td>NaN</td>\n",
 78 |        "      <td>None</td>\n",
 79 |        "    </tr>\n",
 80 |        "    <tr>\n",
 81 |        "      <th>1</th>\n",
 82 |        "      <td>15020895</td>\n",
 83 |        "      <td>NaN</td>\n",
 84 |        "      <td>Python int-byte efficient data structure</td>\n",
 85 |        "      <td>i am currently storing key-values of type int-...</td>\n",
 86 |        "      <td>[python, data-structures]</td>\n",
 87 |        "      <td>155</td>\n",
 88 |        "      <td>0</td>\n",
 89 |        "      <td>3</td>\n",
 90 |        "      <td>1</td>\n",
 91 |        "      <td>2013-02-22 09:33:26.360</td>\n",
 92 |        "      <td>NaN</td>\n",
 93 |        "      <td>None</td>\n",
 94 |        "    </tr>\n",
 95 |        "    <tr>\n",
 96 |        "      <th>2</th>\n",
 97 |        "      <td>47234657</td>\n",
 98 |        "      <td>NaN</td>\n",
 99 |        "      <td>converting word into other word keeping the or...</td>\n",
100 |        "      <td>def translate(string, translations):\\n\\n[CODE]...</td>\n",
101 |        "      <td>[python, python-3.x]</td>\n",
102 |        "      <td>48</td>\n",
103 |        "      <td>2</td>\n",
104 |        "      <td>1</td>\n",
105 |        "      <td>-1</td>\n",
106 |        "      <td>2017-11-11 05:23:34.343</td>\n",
107 |        "      <td>NaN</td>\n",
108 |        "      <td>None</td>\n",
109 |        "    </tr>\n",
110 |        "    <tr>\n",
111 |        "      <th>3</th>\n",
112 |        "      <td>37310210</td>\n",
113 |        "      <td>NaN</td>\n",
114 |        "      <td>Camera Calibration with OpenCV - How to adjust...</td>\n",
115 |        "      <td>I am working on a camera calibration program u...</td>\n",
116 |        "      <td>[python, python-2.7, opencv, camera, camera-ca...</td>\n",
117 |        "      <td>8164</td>\n",
118 |        "      <td>2</td>\n",
119 |        "      <td>3</td>\n",
120 |        "      <td>3</td>\n",
121 |        "      <td>2016-05-18 21:14:34.110</td>\n",
122 |        "      <td>NaN</td>\n",
123 |        "      <td>None</td>\n",
124 |        "    </tr>\n",
125 |        "    <tr>\n",
126 |        "      <th>4</th>\n",
127 |        "      <td>70675292</td>\n",
128 |        "      <td>NaN</td>\n",
129 |        "      <td>Python Same Period Last Year in Pandas with Gr...</td>\n",
130 |        "      <td>I have following DataFrame:\\nimport pandas as ...</td>\n",
131 |        "      <td>[python, pandas, group-by, offset, forecasting]</td>\n",
132 |        "      <td>70</td>\n",
133 |        "      <td>1</td>\n",
134 |        "      <td>0</td>\n",
135 |        "      <td>0</td>\n",
136 |        "      <td>2022-01-12 01:19:53.640</td>\n",
137 |        "      <td>NaN</td>\n",
138 |        "      <td>None</td>\n",
139 |        "    </tr>\n",
140 |        "  </tbody>\n",
141 |        "</table>\n",
142 |        "</div>"
143 |       ],
144 |       "text/plain": [
145 |        "         Id  AcceptedAnswerId  \\\n",
146 |        "0  33760194               NaN   \n",
147 |        "1  15020895               NaN   \n",
148 |        "2  47234657               NaN   \n",
149 |        "3  37310210               NaN   \n",
150 |        "4  70675292               NaN   \n",
151 |        "\n",
152 |        "                                               Title  \\\n",
153 |        "0  Python How to burning discs with the monitorin...   \n",
154 |        "1           Python int-byte efficient data structure   \n",
155 |        "2  converting word into other word keeping the or...   \n",
156 |        "3  Camera Calibration with OpenCV - How to adjust...   \n",
157 |        "4  Python Same Period Last Year in Pandas with Gr...   \n",
158 |        "\n",
159 |        "                                        QuestionBody  \\\n",
160 |        "0  I'm writing the programm on Python with module...   \n",
161 |        "1  i am currently storing key-values of type int-...   \n",
162 |        "2  def translate(string, translations):\\n\\n[CODE]...   \n",
163 |        "3  I am working on a camera calibration program u...   \n",
164 |        "4  I have following DataFrame:\\nimport pandas as ...   \n",
165 |        "\n",
166 |        "                                                Tags  ViewCount  AnswerCount  \\\n",
167 |        "0          [python, event-handling, progressmonitor]        491            0   \n",
168 |        "1                          [python, data-structures]        155            0   \n",
169 |        "2                               [python, python-3.x]         48            2   \n",
170 |        "3  [python, python-2.7, opencv, camera, camera-ca...       8164            2   \n",
171 |        "4    [python, pandas, group-by, offset, forecasting]         70            1   \n",
172 |        "\n",
173 |        "   CommentCount  Score            CreationDate  AnswerId AcceptedAnswerBody  \n",
174 |        "0             2      0 2015-11-17 15:02:09.103       NaN               None  \n",
175 |        "1             3      1 2013-02-22 09:33:26.360       NaN               None  \n",
176 |        "2             1     -1 2017-11-11 05:23:34.343       NaN               None  \n",
177 |        "3             3      3 2016-05-18 21:14:34.110       NaN               None  \n",
178 |        "4             0      0 2022-01-12 01:19:53.640       NaN               None  "
179 |       ]
180 |      },
181 |      "execution_count": 3,
182 |      "metadata": {},
183 |      "output_type": "execute_result"
184 |     }
185 |    ],
186 |    "source": [
187 |     "df_posts.head()"
188 |    ]
189 |   },
190 |   {
191 |    "cell_type": "code",
192 |    "execution_count": 4,
193 |    "id": "6069508b-5b2e-4572-ace5-ce01d47f9de2",
194 |    "metadata": {},
195 |    "outputs": [
196 |     {
197 |      "data": {
198 |       "text/html": [
199 |        "<div>\n",
200 |        "<style scoped>\n",
201 |        "    .dataframe tbody tr th:only-of-type {\n",
202 |        "        vertical-align: middle;\n",
203 |        "    }\n",
204 |        "\n",
205 |        "    .dataframe tbody tr th {\n",
206 |        "        vertical-align: top;\n",
207 |        "    }\n",
208 |        "\n",
209 |        "    .dataframe thead th {\n",
210 |        "        text-align: right;\n",
211 |        "    }\n",
212 |        "</style>\n",
213 |        "<table border=\"1\" class=\"dataframe\">\n",
214 |        "  <thead>\n",
215 |        "    <tr style=\"text-align: right;\">\n",
216 |        "      <th></th>\n",
217 |        "      <th>PostId</th>\n",
218 |        "      <th>PostTitle</th>\n",
219 |        "      <th>RelatedPostIds</th>\n",
220 |        "      <th>RelatedPostTitles</th>\n",
221 |        "      <th>num_candidates</th>\n",
222 |        "    </tr>\n",
223 |        "  </thead>\n",
224 |        "  <tbody>\n",
225 |        "    <tr>\n",
226 |        "      <th>0</th>\n",
227 |        "      <td>57348742</td>\n",
228 |        "      <td>How do I simulate a Scrollbar in tkInter Canvas</td>\n",
229 |        "      <td>[57348742, 68340045]</td>\n",
230 |        "      <td>[How do I simulate a Scrollbar in tkInter Canv...</td>\n",
231 |        "      <td>2</td>\n",
232 |        "    </tr>\n",
233 |        "    <tr>\n",
234 |        "      <th>1</th>\n",
235 |        "      <td>3494593</td>\n",
236 |        "      <td>Shading a kernel density plot between two points.</td>\n",
237 |        "      <td>[3494593, 14863744, 14094644, 16504452, 488531...</td>\n",
238 |        "      <td>[Shading a kernel density plot between two poi...</td>\n",
239 |        "      <td>16</td>\n",
240 |        "    </tr>\n",
241 |        "    <tr>\n",
242 |        "      <th>2</th>\n",
243 |        "      <td>37949409</td>\n",
244 |        "      <td>Dictionary in a numpy array?</td>\n",
245 |        "      <td>[37949409, 47689224, 61517741]</td>\n",
246 |        "      <td>[Dictionary in a numpy array?, How to access t...</td>\n",
247 |        "      <td>3</td>\n",
248 |        "    </tr>\n",
249 |        "    <tr>\n",
250 |        "      <th>3</th>\n",
251 |        "      <td>51519086</td>\n",
252 |        "      <td>How to remove tkinter - - - - line's when crea...</td>\n",
253 |        "      <td>[51519086, 55088055]</td>\n",
254 |        "      <td>[How to remove tkinter - - - - line's when cre...</td>\n",
255 |        "      <td>2</td>\n",
256 |        "    </tr>\n",
257 |        "    <tr>\n",
258 |        "      <th>4</th>\n",
259 |        "      <td>63107594</td>\n",
260 |        "      <td>How to deal with multi-level column names down...</td>\n",
261 |        "      <td>[63107594, 63107603, 62966295, 68674235, 63124...</td>\n",
262 |        "      <td>[How to deal with multi-level column names dow...</td>\n",
263 |        "      <td>6</td>\n",
264 |        "    </tr>\n",
265 |        "    <tr>\n",
266 |        "      <th>...</th>\n",
267 |        "      <td>...</td>\n",
268 |        "      <td>...</td>\n",
269 |        "      <td>...</td>\n",
270 |        "      <td>...</td>\n",
271 |        "      <td>...</td>\n",
272 |        "    </tr>\n",
273 |        "    <tr>\n",
274 |        "      <th>33243</th>\n",
275 |        "      <td>48536681</td>\n",
276 |        "      <td>What is the exact meaning of stride's list in ...</td>\n",
277 |        "      <td>[48536681, 47305022]</td>\n",
278 |        "      <td>[What is the exact meaning of stride's list in...</td>\n",
279 |        "      <td>2</td>\n",
280 |        "    </tr>\n",
281 |        "    <tr>\n",
282 |        "      <th>33244</th>\n",
283 |        "      <td>37814201</td>\n",
284 |        "      <td>pandas time shift from utc to local</td>\n",
285 |        "      <td>[37814201, 52390647]</td>\n",
286 |        "      <td>[pandas time shift from utc to local, Convert ...</td>\n",
287 |        "      <td>2</td>\n",
288 |        "    </tr>\n",
289 |        "    <tr>\n",
290 |        "      <th>33245</th>\n",
291 |        "      <td>2316987</td>\n",
292 |        "      <td>Converting a string to a formatted date-time s...</td>\n",
293 |        "      <td>[2316987, 48848730]</td>\n",
294 |        "      <td>[Converting a string to a formatted date-time ...</td>\n",
295 |        "      <td>2</td>\n",
296 |        "    </tr>\n",
297 |        "    <tr>\n",
298 |        "      <th>33246</th>\n",
299 |        "      <td>52027033</td>\n",
300 |        "      <td>Convert datetime to another format without cha...</td>\n",
301 |        "      <td>[52027033, 52252961]</td>\n",
302 |        "      <td>[Convert datetime to another format without ch...</td>\n",
303 |        "      <td>2</td>\n",
304 |        "    </tr>\n",
305 |        "    <tr>\n",
306 |        "      <th>33247</th>\n",
307 |        "      <td>17622419</td>\n",
308 |        "      <td>Creating a namedtuple object using only a subs...</td>\n",
309 |        "      <td>[17622419, 50899076]</td>\n",
310 |        "      <td>[Creating a namedtuple object using only a sub...</td>\n",
311 |        "      <td>2</td>\n",
312 |        "    </tr>\n",
313 |        "  </tbody>\n",
314 |        "</table>\n",
315 |        "<p>33248 rows × 5 columns</p>\n",
316 |        "</div>"
317 |       ],
318 |       "text/plain": [
319 |        "         PostId                                          PostTitle  \\\n",
320 |        "0      57348742    How do I simulate a Scrollbar in tkInter Canvas   \n",
321 |        "1       3494593  Shading a kernel density plot between two points.   \n",
322 |        "2      37949409                       Dictionary in a numpy array?   \n",
323 |        "3      51519086  How to remove tkinter - - - - line's when crea...   \n",
324 |        "4      63107594  How to deal with multi-level column names down...   \n",
325 |        "...         ...                                                ...   \n",
326 |        "33243  48536681  What is the exact meaning of stride's list in ...   \n",
327 |        "33244  37814201                pandas time shift from utc to local   \n",
328 |        "33245   2316987  Converting a string to a formatted date-time s...   \n",
329 |        "33246  52027033  Convert datetime to another format without cha...   \n",
330 |        "33247  17622419  Creating a namedtuple object using only a subs...   \n",
331 |        "\n",
332 |        "                                          RelatedPostIds  \\\n",
333 |        "0                                   [57348742, 68340045]   \n",
334 |        "1      [3494593, 14863744, 14094644, 16504452, 488531...   \n",
335 |        "2                         [37949409, 47689224, 61517741]   \n",
336 |        "3                                   [51519086, 55088055]   \n",
337 |        "4      [63107594, 63107603, 62966295, 68674235, 63124...   \n",
338 |        "...                                                  ...   \n",
339 |        "33243                               [48536681, 47305022]   \n",
340 |        "33244                               [37814201, 52390647]   \n",
341 |        "33245                                [2316987, 48848730]   \n",
342 |        "33246                               [52027033, 52252961]   \n",
343 |        "33247                               [17622419, 50899076]   \n",
344 |        "\n",
345 |        "                                       RelatedPostTitles  num_candidates  \n",
346 |        "0      [How do I simulate a Scrollbar in tkInter Canv...               2  \n",
347 |        "1      [Shading a kernel density plot between two poi...              16  \n",
348 |        "2      [Dictionary in a numpy array?, How to access t...               3  \n",
349 |        "3      [How to remove tkinter - - - - line's when cre...               2  \n",
350 |        "4      [How to deal with multi-level column names dow...               6  \n",
351 |        "...                                                  ...             ...  \n",
352 |        "33243  [What is the exact meaning of stride's list in...               2  \n",
353 |        "33244  [pandas time shift from utc to local, Convert ...               2  \n",
354 |        "33245  [Converting a string to a formatted date-time ...               2  \n",
355 |        "33246  [Convert datetime to another format without ch...               2  \n",
356 |        "33247  [Creating a namedtuple object using only a sub...               2  \n",
357 |        "\n",
358 |        "[33248 rows x 5 columns]"
359 |       ]
360 |      },
361 |      "execution_count": 4,
362 |      "metadata": {},
363 |      "output_type": "execute_result"
364 |     }
365 |    ],
366 |    "source": [
367 |     "df_related  = pd.read_parquet(\"gs://np-training-tmp/stackoverflow/final/related_posts.parquet\")\n",
368 |     "df_related"
369 |    ]
370 |   },
371 |   {
372 |    "cell_type": "code",
373 |    "execution_count": null,
374 |    "id": "12502a21-39cf-4f73-b6b2-d106f446516f",
375 |    "metadata": {},
376 |    "outputs": [],
377 |    "source": []
378 |   },
379 |   {
380 |    "cell_type": "code",
381 |    "execution_count": 15,
382 |    "id": "0174520e-92ed-48de-bab5-214d04d0249e",
383 |    "metadata": {},
384 |    "outputs": [],
385 |    "source": [
386 |     "post_ids = set (df_posts.sample(frac=1, random_state=42).head(200_000)['Id'] )"
387 |    ]
388 |   },
389 |   {
390 |    "cell_type": "code",
391 |    "execution_count": 16,
392 |    "id": "586796a6-f69f-4faf-bd42-5f967986dfc1",
393 |    "metadata": {},
394 |    "outputs": [],
395 |    "source": [
396 |     "def match_exists(related_post_ids):\n",
397 |     "    res = set(related_post_ids ) & post_ids\n",
398 |     "    return len(res) > 0"
399 |    ]
400 |   },
401 |   {
402 |    "cell_type": "code",
403 |    "execution_count": 17,
404 |    "id": "3412f27f-c39e-4fcf-9dd1-9147fbc0eac7",
405 |    "metadata": {},
406 |    "outputs": [],
407 |    "source": [
408 |     "df_related_subset  = df_related [ df_related['RelatedPostIds'].apply(match_exists) ]\n",
409 |     "post_ids_additional = set(df_related_subset['RelatedPostIds'].explode() )\n",
410 |     "\n",
411 |     "post_id_final = post_ids | post_ids_additional"
412 |    ]
413 |   },
414 |   {
415 |    "cell_type": "code",
416 |    "execution_count": null,
417 |    "id": "f323527c-021a-474f-a9d0-b73aa3d55681",
418 |    "metadata": {},
419 |    "outputs": [],
420 |    "source": [
421 |     "len(post_id_final)"
422 |    ]
423 |   },
424 |   {
425 |    "cell_type": "code",
426 |    "execution_count": 18,
427 |    "id": "649e579a-106f-4835-b646-d76e6c2e8305",
428 |    "metadata": {},
429 |    "outputs": [],
430 |    "source": [
431 |     "df_posts_subset = df_posts [ df_posts['Id'].isin(post_id_final)]"
432 |    ]
433 |   },
434 |   {
435 |    "cell_type": "code",
436 |    "execution_count": 19,
437 |    "id": "8712592a-1549-4c4e-a508-8357f693d2eb",
438 |    "metadata": {},
439 |    "outputs": [
440 |     {
441 |      "data": {
442 |       "text/plain": [
443 |        "219841"
444 |       ]
445 |      },
446 |      "execution_count": 19,
447 |      "metadata": {},
448 |      "output_type": "execute_result"
449 |     }
450 |    ],
451 |    "source": [
452 |     "len(df_posts_subset)"
453 |    ]
454 |   },
455 |   {
456 |    "cell_type": "code",
457 |    "execution_count": null,
458 |    "id": "916d2770-d343-47e3-9d06-f9c399e7e6a7",
459 |    "metadata": {},
460 |    "outputs": [],
461 |    "source": []
462 |   },
463 |   {
464 |    "cell_type": "code",
465 |    "execution_count": 20,
466 |    "id": "a503bf07-16fe-4b2a-84d6-cbd86851067e",
467 |    "metadata": {},
468 |    "outputs": [],
469 |    "source": [
470 |     "df_posts_subset.to_parquet(\"gs://np-training-tmp/stackoverflow/final_subset/posts.parquet\")"
471 |    ]
472 |   },
473 |   {
474 |    "cell_type": "code",
475 |    "execution_count": 21,
476 |    "id": "f4fbd837-2557-4b63-b263-9af66690815a",
477 |    "metadata": {},
478 |    "outputs": [],
479 |    "source": [
480 |     "df_related_subset.to_parquet(\"gs://np-training-tmp/stackoverflow/final_subset/related_posts.parquet\")"
481 |    ]
482 |   },
483 |   {
484 |    "cell_type": "code",
485 |    "execution_count": null,
486 |    "id": "9e5b4524-14ec-44b7-bfe4-38f95c39e15b",
487 |    "metadata": {},
488 |    "outputs": [],
489 |    "source": []
490 |   },
491 |   {
492 |    "cell_type": "code",
493 |    "execution_count": null,
494 |    "id": "1809b800-0eb7-46ec-9a35-4d52070c6840",
495 |    "metadata": {},
496 |    "outputs": [],
497 |    "source": []
498 |   },
499 |   {
500 |    "cell_type": "code",
501 |    "execution_count": null,
502 |    "id": "4fde525e-db57-4c32-a07a-d0cc2b32926a",
503 |    "metadata": {},
504 |    "outputs": [],
505 |    "source": [
506 |     "!gsutil -m cp -r gs://np-training-tmp/stackoverflow/final_subset/* ../data/final_subset/"
507 |    ]
508 |   },
509 |   {
510 |    "cell_type": "code",
511 |    "execution_count": null,
512 |    "id": "7fd4e6ab-9c57-4fe6-a606-eaa4932f4244",
513 |    "metadata": {},
514 |    "outputs": [],
515 |    "source": [
516 |     "!gsutil -m cp -r gs://np-training-tmp/stackoverflow/final/* ../data/final/\n",
517 |     "\n"
518 |    ]
519 |   }
520 |  ],
521 |  "metadata": {
522 |   "environment": {
523 |    "kernel": "stackoverflow",
524 |    "name": "pytorch-gpu.1-12.m99",
525 |    "type": "gcloud",
526 |    "uri": "gcr.io/deeplearning-platform-release/pytorch-gpu.1-12:m99"
527 |   },
528 |   "kernelspec": {
529 |    "display_name": "stackoverflow",
530 |    "language": "python",
531 |    "name": "stackoverflow"
532 |   },
533 |   "language_info": {
534 |    "codemirror_mode": {
535 |     "name": "ipython",
536 |     "version": 3
537 |    },
538 |    "file_extension": ".py",
539 |    "mimetype": "text/x-python",
540 |    "name": "python",
541 |    "nbconvert_exporter": "python",
542 |    "pygments_lexer": "ipython3",
543 |    "version": "3.7.12"
544 |   }
545 |  },
546 |  "nbformat": 4,
547 |  "nbformat_minor": 5
548 | }
549 | 


--------------------------------------------------------------------------------
/archive/notebooks_stackoverflow/01_workshop_data_preview.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 2,
  6 |    "id": "724dc187-f812-4c97-81dd-ad527f9d8338",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "import pandas as pd\n",
 11 |     "from IPython.display import JSON\n",
 12 |     "import metrics_utils"
 13 |    ]
 14 |   },
 15 |   {
 16 |    "cell_type": "code",
 17 |    "execution_count": null,
 18 |    "id": "37b1ba40-6527-4ec3-8180-7db66fc9d808",
 19 |    "metadata": {},
 20 |    "outputs": [],
 21 |    "source": []
 22 |   },
 23 |   {
 24 |    "cell_type": "code",
 25 |    "execution_count": 11,
 26 |    "id": "dfeca3c8-2684-44a1-8497-1bf4c4c89c9d",
 27 |    "metadata": {},
 28 |    "outputs": [
 29 |     {
 30 |      "name": "stdout",
 31 |      "output_type": "stream",
 32 |      "text": [
 33 |       "  1.35 GiB  2022-11-02T08:48:12Z  gs://np-public-training-temp/stackoverflow/final/posts.parquet\n",
 34 |       "  5.26 MiB  2022-11-02T08:48:12Z  gs://np-public-training-temp/stackoverflow/final/related_posts.parquet\n",
 35 |       "115.09 MiB  2022-11-02T08:48:12Z  gs://np-public-training-temp/stackoverflow/final_subset/posts.parquet\n",
 36 |       "  1.08 GiB  2022-11-02T11:42:53Z  gs://np-public-training-temp/stackoverflow/final_subset/posts_with_embedding.parquet\n",
 37 |       "   1.4 MiB  2022-11-02T08:48:12Z  gs://np-public-training-temp/stackoverflow/final_subset/related_posts.parquet\n",
 38 |       "TOTAL: 5 objects, 2736956352 bytes (2.55 GiB)\n"
 39 |      ]
 40 |     }
 41 |    ],
 42 |    "source": [
 43 |     "!gsutil ls -lh gs://np-public-training-temp/stackoverflow/**"
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "code",
 48 |    "execution_count": 2,
 49 |    "id": "e218afd6-edcb-46cc-8263-94611d54ffeb",
 50 |    "metadata": {},
 51 |    "outputs": [],
 52 |    "source": [
 53 |     "path_posts = \"gs://np-public-training-temp/stackoverflow/final_subset/posts.parquet\"\n",
 54 |     "path_posts_related = \"gs://np-public-training-temp/stackoverflow/final_subset/related_posts.parquet\"\n"
 55 |    ]
 56 |   },
 57 |   {
 58 |    "cell_type": "code",
 59 |    "execution_count": 4,
 60 |    "id": "431537d1-2701-4d8e-a3fc-22d877cd14bb",
 61 |    "metadata": {},
 62 |    "outputs": [
 63 |     {
 64 |      "data": {
 65 |       "text/html": [
 66 |        "<div>\n",
 67 |        "<style scoped>\n",
 68 |        "    .dataframe tbody tr th:only-of-type {\n",
 69 |        "        vertical-align: middle;\n",
 70 |        "    }\n",
 71 |        "\n",
 72 |        "    .dataframe tbody tr th {\n",
 73 |        "        vertical-align: top;\n",
 74 |        "    }\n",
 75 |        "\n",
 76 |        "    .dataframe thead th {\n",
 77 |        "        text-align: right;\n",
 78 |        "    }\n",
 79 |        "</style>\n",
 80 |        "<table border=\"1\" class=\"dataframe\">\n",
 81 |        "  <thead>\n",
 82 |        "    <tr style=\"text-align: right;\">\n",
 83 |        "      <th></th>\n",
 84 |        "      <th>Id</th>\n",
 85 |        "      <th>AcceptedAnswerId</th>\n",
 86 |        "      <th>Title</th>\n",
 87 |        "      <th>QuestionBody</th>\n",
 88 |        "      <th>Tags</th>\n",
 89 |        "      <th>ViewCount</th>\n",
 90 |        "      <th>AnswerCount</th>\n",
 91 |        "      <th>CommentCount</th>\n",
 92 |        "      <th>Score</th>\n",
 93 |        "      <th>CreationDate</th>\n",
 94 |        "      <th>AnswerId</th>\n",
 95 |        "      <th>AcceptedAnswerBody</th>\n",
 96 |        "    </tr>\n",
 97 |        "  </thead>\n",
 98 |        "  <tbody>\n",
 99 |        "    <tr>\n",
100 |        "      <th>1</th>\n",
101 |        "      <td>15020895</td>\n",
102 |        "      <td>NaN</td>\n",
103 |        "      <td>Python int-byte efficient data structure</td>\n",
104 |        "      <td>i am currently storing key-values of type int-...</td>\n",
105 |        "      <td>[python, data-structures]</td>\n",
106 |        "      <td>155</td>\n",
107 |        "      <td>0</td>\n",
108 |        "      <td>3</td>\n",
109 |        "      <td>1</td>\n",
110 |        "      <td>2013-02-22 09:33:26.360</td>\n",
111 |        "      <td>NaN</td>\n",
112 |        "      <td>None</td>\n",
113 |        "    </tr>\n",
114 |        "    <tr>\n",
115 |        "      <th>9</th>\n",
116 |        "      <td>68487902</td>\n",
117 |        "      <td>NaN</td>\n",
118 |        "      <td>Why does the Variance of Laplace very differen...</td>\n",
119 |        "      <td>TL;DR: How can I use skimage.filters.laplace(i...</td>\n",
120 |        "      <td>[python, opencv, image-processing, computer-vi...</td>\n",
121 |        "      <td>391</td>\n",
122 |        "      <td>0</td>\n",
123 |        "      <td>5</td>\n",
124 |        "      <td>1</td>\n",
125 |        "      <td>2021-07-22 15:50:34.220</td>\n",
126 |        "      <td>NaN</td>\n",
127 |        "      <td>None</td>\n",
128 |        "    </tr>\n",
129 |        "    <tr>\n",
130 |        "      <th>15</th>\n",
131 |        "      <td>61391327</td>\n",
132 |        "      <td>NaN</td>\n",
133 |        "      <td>Why input never ends</td>\n",
134 |        "      <td>I have python 3.7 installed and I have this co...</td>\n",
135 |        "      <td>[python, python-3.x, input]</td>\n",
136 |        "      <td>104</td>\n",
137 |        "      <td>1</td>\n",
138 |        "      <td>6</td>\n",
139 |        "      <td>3</td>\n",
140 |        "      <td>2020-04-23 15:43:03.497</td>\n",
141 |        "      <td>NaN</td>\n",
142 |        "      <td>None</td>\n",
143 |        "    </tr>\n",
144 |        "    <tr>\n",
145 |        "      <th>27</th>\n",
146 |        "      <td>28852710</td>\n",
147 |        "      <td>NaN</td>\n",
148 |        "      <td>Crashes with piecewise linear objective for gu...</td>\n",
149 |        "      <td>We have a complex optimization problem which i...</td>\n",
150 |        "      <td>[python, crash, gurobi, piecewise]</td>\n",
151 |        "      <td>403</td>\n",
152 |        "      <td>1</td>\n",
153 |        "      <td>1</td>\n",
154 |        "      <td>3</td>\n",
155 |        "      <td>2015-03-04 10:58:16.370</td>\n",
156 |        "      <td>NaN</td>\n",
157 |        "      <td>None</td>\n",
158 |        "    </tr>\n",
159 |        "    <tr>\n",
160 |        "      <th>29</th>\n",
161 |        "      <td>24043029</td>\n",
162 |        "      <td>NaN</td>\n",
163 |        "      <td>Python TypeError: plotdatehist() got an unexpe...</td>\n",
164 |        "      <td>apologies beforehand if this is a stupid quest...</td>\n",
165 |        "      <td>[python, typeerror]</td>\n",
166 |        "      <td>419</td>\n",
167 |        "      <td>0</td>\n",
168 |        "      <td>7</td>\n",
169 |        "      <td>0</td>\n",
170 |        "      <td>2014-06-04 16:42:32.257</td>\n",
171 |        "      <td>NaN</td>\n",
172 |        "      <td>None</td>\n",
173 |        "    </tr>\n",
174 |        "    <tr>\n",
175 |        "      <th>...</th>\n",
176 |        "      <td>...</td>\n",
177 |        "      <td>...</td>\n",
178 |        "      <td>...</td>\n",
179 |        "      <td>...</td>\n",
180 |        "      <td>...</td>\n",
181 |        "      <td>...</td>\n",
182 |        "      <td>...</td>\n",
183 |        "      <td>...</td>\n",
184 |        "      <td>...</td>\n",
185 |        "      <td>...</td>\n",
186 |        "      <td>...</td>\n",
187 |        "      <td>...</td>\n",
188 |        "    </tr>\n",
189 |        "    <tr>\n",
190 |        "      <th>2661376</th>\n",
191 |        "      <td>55431749</td>\n",
192 |        "      <td>55431832.0</td>\n",
193 |        "      <td>Handling exception returned by a method</td>\n",
194 |        "      <td>I am calling a method that throws Valuerror ex...</td>\n",
195 |        "      <td>[python-3.x]</td>\n",
196 |        "      <td>26</td>\n",
197 |        "      <td>1</td>\n",
198 |        "      <td>2</td>\n",
199 |        "      <td>1</td>\n",
200 |        "      <td>2019-03-30 13:07:07.893</td>\n",
201 |        "      <td>55431832.0</td>\n",
202 |        "      <td>You need to place call to sanitize method in t...</td>\n",
203 |        "    </tr>\n",
204 |        "    <tr>\n",
205 |        "      <th>2661378</th>\n",
206 |        "      <td>13794532</td>\n",
207 |        "      <td>13794740.0</td>\n",
208 |        "      <td>Python regular expression for Beautiful Soup</td>\n",
209 |        "      <td>I am using Beautiful Soup to pull out specific...</td>\n",
210 |        "      <td>[python, regex, beautifulsoup]</td>\n",
211 |        "      <td>10723</td>\n",
212 |        "      <td>1</td>\n",
213 |        "      <td>3</td>\n",
214 |        "      <td>5</td>\n",
215 |        "      <td>2012-12-10 03:18:14.743</td>\n",
216 |        "      <td>13794740.0</td>\n",
217 |        "      <td>I think I've got it:\\n\\n[CODE]\\n\\nNotice that,...</td>\n",
218 |        "    </tr>\n",
219 |        "    <tr>\n",
220 |        "      <th>2661390</th>\n",
221 |        "      <td>25083943</td>\n",
222 |        "      <td>25084142.0</td>\n",
223 |        "      <td>Search has no attribute teaser</td>\n",
224 |        "      <td>I am trying to access teaser. I tried many dif...</td>\n",
225 |        "      <td>[python, regex, json, python-3.x]</td>\n",
226 |        "      <td>62</td>\n",
227 |        "      <td>1</td>\n",
228 |        "      <td>2</td>\n",
229 |        "      <td>-2</td>\n",
230 |        "      <td>2014-08-01 15:45:26.733</td>\n",
231 |        "      <td>25084142.0</td>\n",
232 |        "      <td>Not exactly sure what you are trying to do but...</td>\n",
233 |        "    </tr>\n",
234 |        "    <tr>\n",
235 |        "      <th>2661401</th>\n",
236 |        "      <td>8221324</td>\n",
237 |        "      <td>8221764.0</td>\n",
238 |        "      <td>Is there a reason the SQLAlchemy ORM tutorial ...</td>\n",
239 |        "      <td>The SQLAlchemy ORM tutorial uses this class:\\n...</td>\n",
240 |        "      <td>[python, sqlalchemy]</td>\n",
241 |        "      <td>834</td>\n",
242 |        "      <td>3</td>\n",
243 |        "      <td>1</td>\n",
244 |        "      <td>4</td>\n",
245 |        "      <td>2011-11-22 02:42:24.157</td>\n",
246 |        "      <td>8221764.0</td>\n",
247 |        "      <td>Bear in mind that eval is not used too much; c...</td>\n",
248 |        "    </tr>\n",
249 |        "    <tr>\n",
250 |        "      <th>2661415</th>\n",
251 |        "      <td>57679429</td>\n",
252 |        "      <td>57679695.0</td>\n",
253 |        "      <td>How can I turn a list of column names into a p...</td>\n",
254 |        "      <td>I have a list of pandas column names (consisti...</td>\n",
255 |        "      <td>[python, string, list, patsy]</td>\n",
256 |        "      <td>106</td>\n",
257 |        "      <td>1</td>\n",
258 |        "      <td>0</td>\n",
259 |        "      <td>0</td>\n",
260 |        "      <td>2019-08-27 17:11:24.390</td>\n",
261 |        "      <td>57679695.0</td>\n",
262 |        "      <td>[CODE]\\n\\n[CODE]\\n</td>\n",
263 |        "    </tr>\n",
264 |        "  </tbody>\n",
265 |        "</table>\n",
266 |        "<p>219841 rows × 12 columns</p>\n",
267 |        "</div>"
268 |       ],
269 |       "text/plain": [
270 |        "               Id  AcceptedAnswerId  \\\n",
271 |        "1        15020895               NaN   \n",
272 |        "9        68487902               NaN   \n",
273 |        "15       61391327               NaN   \n",
274 |        "27       28852710               NaN   \n",
275 |        "29       24043029               NaN   \n",
276 |        "...           ...               ...   \n",
277 |        "2661376  55431749        55431832.0   \n",
278 |        "2661378  13794532        13794740.0   \n",
279 |        "2661390  25083943        25084142.0   \n",
280 |        "2661401   8221324         8221764.0   \n",
281 |        "2661415  57679429        57679695.0   \n",
282 |        "\n",
283 |        "                                                     Title  \\\n",
284 |        "1                 Python int-byte efficient data structure   \n",
285 |        "9        Why does the Variance of Laplace very differen...   \n",
286 |        "15                                    Why input never ends   \n",
287 |        "27       Crashes with piecewise linear objective for gu...   \n",
288 |        "29       Python TypeError: plotdatehist() got an unexpe...   \n",
289 |        "...                                                    ...   \n",
290 |        "2661376            Handling exception returned by a method   \n",
291 |        "2661378       Python regular expression for Beautiful Soup   \n",
292 |        "2661390                     Search has no attribute teaser   \n",
293 |        "2661401  Is there a reason the SQLAlchemy ORM tutorial ...   \n",
294 |        "2661415  How can I turn a list of column names into a p...   \n",
295 |        "\n",
296 |        "                                              QuestionBody  \\\n",
297 |        "1        i am currently storing key-values of type int-...   \n",
298 |        "9        TL;DR: How can I use skimage.filters.laplace(i...   \n",
299 |        "15       I have python 3.7 installed and I have this co...   \n",
300 |        "27       We have a complex optimization problem which i...   \n",
301 |        "29       apologies beforehand if this is a stupid quest...   \n",
302 |        "...                                                    ...   \n",
303 |        "2661376  I am calling a method that throws Valuerror ex...   \n",
304 |        "2661378  I am using Beautiful Soup to pull out specific...   \n",
305 |        "2661390  I am trying to access teaser. I tried many dif...   \n",
306 |        "2661401  The SQLAlchemy ORM tutorial uses this class:\\n...   \n",
307 |        "2661415  I have a list of pandas column names (consisti...   \n",
308 |        "\n",
309 |        "                                                      Tags  ViewCount  \\\n",
310 |        "1                                [python, data-structures]        155   \n",
311 |        "9        [python, opencv, image-processing, computer-vi...        391   \n",
312 |        "15                             [python, python-3.x, input]        104   \n",
313 |        "27                      [python, crash, gurobi, piecewise]        403   \n",
314 |        "29                                     [python, typeerror]        419   \n",
315 |        "...                                                    ...        ...   \n",
316 |        "2661376                                       [python-3.x]         26   \n",
317 |        "2661378                     [python, regex, beautifulsoup]      10723   \n",
318 |        "2661390                  [python, regex, json, python-3.x]         62   \n",
319 |        "2661401                               [python, sqlalchemy]        834   \n",
320 |        "2661415                      [python, string, list, patsy]        106   \n",
321 |        "\n",
322 |        "         AnswerCount  CommentCount  Score            CreationDate    AnswerId  \\\n",
323 |        "1                  0             3      1 2013-02-22 09:33:26.360         NaN   \n",
324 |        "9                  0             5      1 2021-07-22 15:50:34.220         NaN   \n",
325 |        "15                 1             6      3 2020-04-23 15:43:03.497         NaN   \n",
326 |        "27                 1             1      3 2015-03-04 10:58:16.370         NaN   \n",
327 |        "29                 0             7      0 2014-06-04 16:42:32.257         NaN   \n",
328 |        "...              ...           ...    ...                     ...         ...   \n",
329 |        "2661376            1             2      1 2019-03-30 13:07:07.893  55431832.0   \n",
330 |        "2661378            1             3      5 2012-12-10 03:18:14.743  13794740.0   \n",
331 |        "2661390            1             2     -2 2014-08-01 15:45:26.733  25084142.0   \n",
332 |        "2661401            3             1      4 2011-11-22 02:42:24.157   8221764.0   \n",
333 |        "2661415            1             0      0 2019-08-27 17:11:24.390  57679695.0   \n",
334 |        "\n",
335 |        "                                        AcceptedAnswerBody  \n",
336 |        "1                                                     None  \n",
337 |        "9                                                     None  \n",
338 |        "15                                                    None  \n",
339 |        "27                                                    None  \n",
340 |        "29                                                    None  \n",
341 |        "...                                                    ...  \n",
342 |        "2661376  You need to place call to sanitize method in t...  \n",
343 |        "2661378  I think I've got it:\\n\\n[CODE]\\n\\nNotice that,...  \n",
344 |        "2661390  Not exactly sure what you are trying to do but...  \n",
345 |        "2661401  Bear in mind that eval is not used too much; c...  \n",
346 |        "2661415                                 [CODE]\\n\\n[CODE]\\n  \n",
347 |        "\n",
348 |        "[219841 rows x 12 columns]"
349 |       ]
350 |      },
351 |      "execution_count": 4,
352 |      "metadata": {},
353 |      "output_type": "execute_result"
354 |     }
355 |    ],
356 |    "source": [
357 |     "df_posts = pd.read_parquet(path_posts)\n",
358 |     "df_posts"
359 |    ]
360 |   },
361 |   {
362 |    "cell_type": "code",
363 |    "execution_count": null,
364 |    "id": "bd200fc0-da3e-4a72-8fd2-2004d540691a",
365 |    "metadata": {},
366 |    "outputs": [],
367 |    "source": []
368 |   },
369 |   {
370 |    "cell_type": "code",
371 |    "execution_count": null,
372 |    "id": "d557f519-6249-4a00-ba28-0948db54405a",
373 |    "metadata": {},
374 |    "outputs": [],
375 |    "source": []
376 |   },
377 |   {
378 |    "cell_type": "code",
379 |    "execution_count": 5,
380 |    "id": "72242ee8-cc09-4ddd-ab0d-89f7ea0d1b78",
381 |    "metadata": {},
382 |    "outputs": [
383 |     {
384 |      "data": {
385 |       "text/html": [
386 |        "<div>\n",
387 |        "<style scoped>\n",
388 |        "    .dataframe tbody tr th:only-of-type {\n",
389 |        "        vertical-align: middle;\n",
390 |        "    }\n",
391 |        "\n",
392 |        "    .dataframe tbody tr th {\n",
393 |        "        vertical-align: top;\n",
394 |        "    }\n",
395 |        "\n",
396 |        "    .dataframe thead th {\n",
397 |        "        text-align: right;\n",
398 |        "    }\n",
399 |        "</style>\n",
400 |        "<table border=\"1\" class=\"dataframe\">\n",
401 |        "  <thead>\n",
402 |        "    <tr style=\"text-align: right;\">\n",
403 |        "      <th></th>\n",
404 |        "      <th>PostId</th>\n",
405 |        "      <th>PostTitle</th>\n",
406 |        "      <th>RelatedPostIds</th>\n",
407 |        "      <th>RelatedPostTitles</th>\n",
408 |        "      <th>num_candidates</th>\n",
409 |        "    </tr>\n",
410 |        "  </thead>\n",
411 |        "  <tbody>\n",
412 |        "    <tr>\n",
413 |        "      <th>1</th>\n",
414 |        "      <td>3494593</td>\n",
415 |        "      <td>Shading a kernel density plot between two points.</td>\n",
416 |        "      <td>[3494593, 14863744, 14094644, 16504452, 488531...</td>\n",
417 |        "      <td>[Shading a kernel density plot between two poi...</td>\n",
418 |        "      <td>16</td>\n",
419 |        "    </tr>\n",
420 |        "    <tr>\n",
421 |        "      <th>2</th>\n",
422 |        "      <td>37949409</td>\n",
423 |        "      <td>Dictionary in a numpy array?</td>\n",
424 |        "      <td>[37949409, 47689224, 61517741]</td>\n",
425 |        "      <td>[Dictionary in a numpy array?, How to access t...</td>\n",
426 |        "      <td>3</td>\n",
427 |        "    </tr>\n",
428 |        "    <tr>\n",
429 |        "      <th>8</th>\n",
430 |        "      <td>19876079</td>\n",
431 |        "      <td>Cannot find module cv2 when using OpenCV</td>\n",
432 |        "      <td>[19876079, 62443365, 64580641, 45606137, 60294...</td>\n",
433 |        "      <td>[Cannot find module cv2 when using OpenCV, How...</td>\n",
434 |        "      <td>7</td>\n",
435 |        "    </tr>\n",
436 |        "    <tr>\n",
437 |        "      <th>12</th>\n",
438 |        "      <td>35082143</td>\n",
439 |        "      <td>Error: package or namespace load failed for ‘car’</td>\n",
440 |        "      <td>[35082143, 65941744, 68515009, 56409535]</td>\n",
441 |        "      <td>[Error: package or namespace load failed for ‘...</td>\n",
442 |        "      <td>4</td>\n",
443 |        "    </tr>\n",
444 |        "    <tr>\n",
445 |        "      <th>14</th>\n",
446 |        "      <td>2673651</td>\n",
447 |        "      <td>inheritance from str or int</td>\n",
448 |        "      <td>[2673651, 48465797, 3120562, 15085917, 3238350...</td>\n",
449 |        "      <td>[inheritance from str or int, Inherited class ...</td>\n",
450 |        "      <td>15</td>\n",
451 |        "    </tr>\n",
452 |        "    <tr>\n",
453 |        "      <th>...</th>\n",
454 |        "      <td>...</td>\n",
455 |        "      <td>...</td>\n",
456 |        "      <td>...</td>\n",
457 |        "      <td>...</td>\n",
458 |        "      <td>...</td>\n",
459 |        "    </tr>\n",
460 |        "    <tr>\n",
461 |        "      <th>33231</th>\n",
462 |        "      <td>28419763</td>\n",
463 |        "      <td>Expand Text widget to fill the entire parent F...</td>\n",
464 |        "      <td>[28419763, 48171462]</td>\n",
465 |        "      <td>[Expand Text widget to fill the entire parent ...</td>\n",
466 |        "      <td>2</td>\n",
467 |        "    </tr>\n",
468 |        "    <tr>\n",
469 |        "      <th>33234</th>\n",
470 |        "      <td>40332743</td>\n",
471 |        "      <td>Source code for str.split?</td>\n",
472 |        "      <td>[40332743, 51355719]</td>\n",
473 |        "      <td>[Source code for str.split?, where can I find ...</td>\n",
474 |        "      <td>2</td>\n",
475 |        "    </tr>\n",
476 |        "    <tr>\n",
477 |        "      <th>33241</th>\n",
478 |        "      <td>27443414</td>\n",
479 |        "      <td>Cannot perform a backup or restore operation w...</td>\n",
480 |        "      <td>[27443414, 53216877]</td>\n",
481 |        "      <td>[Cannot perform a backup or restore operation ...</td>\n",
482 |        "      <td>2</td>\n",
483 |        "    </tr>\n",
484 |        "    <tr>\n",
485 |        "      <th>33243</th>\n",
486 |        "      <td>48536681</td>\n",
487 |        "      <td>What is the exact meaning of stride's list in ...</td>\n",
488 |        "      <td>[48536681, 47305022]</td>\n",
489 |        "      <td>[What is the exact meaning of stride's list in...</td>\n",
490 |        "      <td>2</td>\n",
491 |        "    </tr>\n",
492 |        "    <tr>\n",
493 |        "      <th>33244</th>\n",
494 |        "      <td>37814201</td>\n",
495 |        "      <td>pandas time shift from utc to local</td>\n",
496 |        "      <td>[37814201, 52390647]</td>\n",
497 |        "      <td>[pandas time shift from utc to local, Convert ...</td>\n",
498 |        "      <td>2</td>\n",
499 |        "    </tr>\n",
500 |        "  </tbody>\n",
501 |        "</table>\n",
502 |        "<p>6114 rows × 5 columns</p>\n",
503 |        "</div>"
504 |       ],
505 |       "text/plain": [
506 |        "         PostId                                          PostTitle  \\\n",
507 |        "1       3494593  Shading a kernel density plot between two points.   \n",
508 |        "2      37949409                       Dictionary in a numpy array?   \n",
509 |        "8      19876079           Cannot find module cv2 when using OpenCV   \n",
510 |        "12     35082143  Error: package or namespace load failed for ‘car’   \n",
511 |        "14      2673651                        inheritance from str or int   \n",
512 |        "...         ...                                                ...   \n",
513 |        "33231  28419763  Expand Text widget to fill the entire parent F...   \n",
514 |        "33234  40332743                         Source code for str.split?   \n",
515 |        "33241  27443414  Cannot perform a backup or restore operation w...   \n",
516 |        "33243  48536681  What is the exact meaning of stride's list in ...   \n",
517 |        "33244  37814201                pandas time shift from utc to local   \n",
518 |        "\n",
519 |        "                                          RelatedPostIds  \\\n",
520 |        "1      [3494593, 14863744, 14094644, 16504452, 488531...   \n",
521 |        "2                         [37949409, 47689224, 61517741]   \n",
522 |        "8      [19876079, 62443365, 64580641, 45606137, 60294...   \n",
523 |        "12              [35082143, 65941744, 68515009, 56409535]   \n",
524 |        "14     [2673651, 48465797, 3120562, 15085917, 3238350...   \n",
525 |        "...                                                  ...   \n",
526 |        "33231                               [28419763, 48171462]   \n",
527 |        "33234                               [40332743, 51355719]   \n",
528 |        "33241                               [27443414, 53216877]   \n",
529 |        "33243                               [48536681, 47305022]   \n",
530 |        "33244                               [37814201, 52390647]   \n",
531 |        "\n",
532 |        "                                       RelatedPostTitles  num_candidates  \n",
533 |        "1      [Shading a kernel density plot between two poi...              16  \n",
534 |        "2      [Dictionary in a numpy array?, How to access t...               3  \n",
535 |        "8      [Cannot find module cv2 when using OpenCV, How...               7  \n",
536 |        "12     [Error: package or namespace load failed for ‘...               4  \n",
537 |        "14     [inheritance from str or int, Inherited class ...              15  \n",
538 |        "...                                                  ...             ...  \n",
539 |        "33231  [Expand Text widget to fill the entire parent ...               2  \n",
540 |        "33234  [Source code for str.split?, where can I find ...               2  \n",
541 |        "33241  [Cannot perform a backup or restore operation ...               2  \n",
542 |        "33243  [What is the exact meaning of stride's list in...               2  \n",
543 |        "33244  [pandas time shift from utc to local, Convert ...               2  \n",
544 |        "\n",
545 |        "[6114 rows x 5 columns]"
546 |       ]
547 |      },
548 |      "execution_count": 5,
549 |      "metadata": {},
550 |      "output_type": "execute_result"
551 |     }
552 |    ],
553 |    "source": [
554 |     "df_posts = pd.read_parquet(path_posts_related)\n",
555 |     "df_posts"
556 |    ]
557 |   },
558 |   {
559 |    "cell_type": "code",
560 |    "execution_count": 8,
561 |    "id": "ba0dc292-3101-457f-b80c-5ce061118c09",
562 |    "metadata": {},
563 |    "outputs": [
564 |     {
565 |      "data": {
566 |       "application/json": {
567 |        "PostId": 3494593,
568 |        "PostTitle": "Shading a kernel density plot between two points.",
569 |        "RelatedPostIds": [
570 |         3494593,
571 |         14863744,
572 |         14094644,
573 |         16504452,
574 |         48853178,
575 |         36948624,
576 |         47308146,
577 |         34029811,
578 |         31215748,
579 |         29499914,
580 |         41484896,
581 |         7787114,
582 |         27189453,
583 |         23680729,
584 |         36224394,
585 |         18742693
586 |        ],
587 |        "RelatedPostTitles": [
588 |         "Shading a kernel density plot between two points.",
589 |         "adding percentile lines to a density plot",
590 |         "draw the following shaded area in R",
591 |         "color a portion of the normal distribution",
592 |         "How can I shade the area under a curve?",
593 |         "Shade area under a curve",
594 |         "Shading a region under a PDF",
595 |         "Fill different colors for each quantile in geom_density() of ggplot",
596 |         "How to shade part of a density curve in ggplot (with no y axis data)",
597 |         "r density plot - fill area under curve",
598 |         "Fill negative value area below geom_line",
599 |         "polygon in density plot?",
600 |         "Shade (fill or color) area under density curve by quantile",
601 |         "Partially fill density plot for area of interest",
602 |         "Shade density plot to the left of vline?",
603 |         "Shade an area in a R plot"
604 |        ],
605 |        "num_candidates": 16
606 |       },
607 |       "text/plain": [
608 |        "<IPython.core.display.JSON object>"
609 |       ]
610 |      },
611 |      "execution_count": 8,
612 |      "metadata": {
613 |       "application/json": {
614 |        "expanded": false,
615 |        "root": "root"
616 |       }
617 |      },
618 |      "output_type": "execute_result"
619 |     }
620 |    ],
621 |    "source": [
622 |     "JSON ( df_posts.iloc[0].to_dict() )"
623 |    ]
624 |   },
625 |   {
626 |    "cell_type": "code",
627 |    "execution_count": null,
628 |    "id": "9fb7ab6f-08fa-4099-939d-edcb7beca230",
629 |    "metadata": {},
630 |    "outputs": [],
631 |    "source": []
632 |   },
633 |   {
634 |    "cell_type": "code",
635 |    "execution_count": null,
636 |    "id": "473e7e29-7a27-4030-aad3-c60c89dc19bd",
637 |    "metadata": {},
638 |    "outputs": [],
639 |    "source": []
640 |   },
641 |   {
642 |    "cell_type": "code",
643 |    "execution_count": null,
644 |    "id": "ce7fc618-3b9c-450e-a89f-576d47fba15e",
645 |    "metadata": {},
646 |    "outputs": [],
647 |    "source": []
648 |   },
649 |   {
650 |    "cell_type": "code",
651 |    "execution_count": null,
652 |    "id": "30a57006-3696-4a2d-82ca-726ee7c5b6b3",
653 |    "metadata": {},
654 |    "outputs": [],
655 |    "source": []
656 |   },
657 |   {
658 |    "cell_type": "markdown",
659 |    "id": "29ebbeec-f1de-4d07-b603-917e5aa3928b",
660 |    "metadata": {},
661 |    "source": [
662 |     "## Metrics"
663 |    ]
664 |   },
665 |   {
666 |    "cell_type": "code",
667 |    "execution_count": 4,
668 |    "id": "3c824225-1fe7-488a-a291-f8ade3f82a82",
669 |    "metadata": {},
670 |    "outputs": [
671 |     {
672 |      "data": {
673 |       "text/plain": [
674 |        "\u001b[0;31mType:\u001b[0m        module\n",
675 |        "\u001b[0;31mString form:\u001b[0m <module 'metrics_utils' from '/home/jupyter/projects/search-engine-workshop/notebooks/metrics_utils.py'>\n",
676 |        "\u001b[0;31mFile:\u001b[0m        ~/projects/search-engine-workshop/notebooks/metrics_utils.py\n",
677 |        "\u001b[0;31mSource:\u001b[0m     \n",
678 |        "\u001b[0;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m\u001b[0m\n",
679 |        "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n",
680 |        "\u001b[0;34m\u001b[0m\u001b[0;32mdef\u001b[0m \u001b[0mprecision_at_k\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n",
681 |        "\u001b[0;34m\u001b[0m    \u001b[0;34m\u001b[0m\n",
682 |        "\u001b[0;34m\u001b[0m    \u001b[0mr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\n",
683 |        "\u001b[0;34m\u001b[0m    \u001b[0;34m\u001b[0m\n",
684 |        "\u001b[0;34m\u001b[0m    \u001b[0;32mreturn\u001b[0m \u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0mk\u001b[0m \u001b[0;34m\u001b[0m\n",
685 |        "\u001b[0;34m\u001b[0m    \u001b[0;34m\u001b[0m\n",
686 |        "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n",
687 |        "\u001b[0;34m\u001b[0m\u001b[0;32mdef\u001b[0m \u001b[0mmean_reciprocal_rank\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n",
688 |        "\u001b[0;34m\u001b[0m    \u001b[0mmrr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\n",
689 |        "\u001b[0;34m\u001b[0m    \u001b[0;34m\u001b[0m\n",
690 |        "\u001b[0;34m\u001b[0m    \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n",
691 |        "\u001b[0;34m\u001b[0m        \u001b[0mfirst_index\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n",
692 |        "\u001b[0;34m\u001b[0m        \u001b[0mmrr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m1\u001b[0m \u001b[0;34m/\u001b[0m  \u001b[0;34m(\u001b[0m\u001b[0mfirst_index\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n",
693 |        "\u001b[0;34m\u001b[0m    \u001b[0;32mexcept\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n",
694 |        "\u001b[0;34m\u001b[0m        \u001b[0;32mpass\u001b[0m\u001b[0;34m\u001b[0m\n",
695 |        "\u001b[0;34m\u001b[0m    \u001b[0;34m\u001b[0m\n",
696 |        "\u001b[0;34m\u001b[0m    \u001b[0;32mreturn\u001b[0m \u001b[0mmrr\u001b[0m\u001b[0;34m\u001b[0m\n",
697 |        "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n",
698 |        "\u001b[0;34m\u001b[0m\u001b[0;32mdef\u001b[0m \u001b[0maverage_precision\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n",
699 |        "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n",
700 |        "\u001b[0;34m\u001b[0m    \u001b[0mout\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\n",
701 |        "\u001b[0;34m\u001b[0m    \u001b[0;34m\u001b[0m\n",
702 |        "\u001b[0;34m\u001b[0m    \u001b[0;32mfor\u001b[0m \u001b[0midx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n",
703 |        "\u001b[0;34m\u001b[0m        \u001b[0;34m\u001b[0m\n",
704 |        "\u001b[0;34m\u001b[0m        \u001b[0;32mif\u001b[0m \u001b[0mr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0midx\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n",
705 |        "\u001b[0;34m\u001b[0m            \u001b[0mout\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mprecision_at_k\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0midx\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n",
706 |        "\u001b[0;34m\u001b[0m    \u001b[0;34m\u001b[0m\n",
707 |        "\u001b[0;34m\u001b[0m    \u001b[0map\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\n",
708 |        "\u001b[0;34m\u001b[0m    \u001b[0;34m\u001b[0m\n",
709 |        "\u001b[0;34m\u001b[0m    \u001b[0;32mif\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n",
710 |        "\u001b[0;34m\u001b[0m        \u001b[0map\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n",
711 |        "\u001b[0;34m\u001b[0m    \u001b[0;34m\u001b[0m\n",
712 |        "\u001b[0;34m\u001b[0m    \u001b[0;32mreturn\u001b[0m \u001b[0map\u001b[0m\u001b[0;34m\u001b[0m\n",
713 |        "\u001b[0;34m\u001b[0m                           \u001b[0;34m\u001b[0m\n",
714 |        "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n",
715 |        "\u001b[0;34m\u001b[0m\u001b[0;32mdef\u001b[0m \u001b[0mall_metrics\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n",
716 |        "\u001b[0;34m\u001b[0m    \u001b[0;34m\u001b[0m\n",
717 |        "\u001b[0;34m\u001b[0m    \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m\u001b[0m\n",
718 |        "\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n",
719 |        "\u001b[0;34m\u001b[0m    \u001b[0mres\u001b[0m\u001b[0;34m=\u001b[0m  \u001b[0;34m{\u001b[0m\u001b[0;34m\u001b[0m\n",
720 |        "\u001b[0;34m\u001b[0m     \u001b[0;34m\"p@1\"\u001b[0m \u001b[0;34m:\u001b[0m  \u001b[0mprecision_at_k\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n",
721 |        "\u001b[0;34m\u001b[0m    \u001b[0;34m,\u001b[0m \u001b[0;34m\"p@5\"\u001b[0m \u001b[0;34m:\u001b[0m  \u001b[0mprecision_at_k\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n",
722 |        "\u001b[0;34m\u001b[0m    \u001b[0;34m,\u001b[0m \u001b[0;34m\"p@10\"\u001b[0m \u001b[0;34m:\u001b[0m  \u001b[0mprecision_at_k\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n",
723 |        "\u001b[0;34m\u001b[0m    \u001b[0;34m,\u001b[0m  \u001b[0;34m\"mrr\"\u001b[0m \u001b[0;34m:\u001b[0m  \u001b[0mmean_reciprocal_rank\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n",
724 |        "\u001b[0;34m\u001b[0m    \u001b[0;34m,\u001b[0m  \u001b[0;34m\"map\"\u001b[0m \u001b[0;34m:\u001b[0m \u001b[0maverage_precision\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n",
725 |        "\u001b[0;34m\u001b[0m        \u001b[0;34m\u001b[0m\n",
726 |        "\u001b[0;34m\u001b[0m        \u001b[0;34m\u001b[0m\n",
727 |        "\u001b[0;34m\u001b[0m    \u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\n",
728 |        "\u001b[0;34m\u001b[0m    \u001b[0;34m\u001b[0m\n",
729 |        "\u001b[0;34m\u001b[0m    \u001b[0;32mreturn\u001b[0m \u001b[0mres\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n"
730 |       ]
731 |      },
732 |      "metadata": {},
733 |      "output_type": "display_data"
734 |     }
735 |    ],
736 |    "source": [
737 |     "??metrics_utils"
738 |    ]
739 |   },
740 |   {
741 |    "cell_type": "code",
742 |    "execution_count": null,
743 |    "id": "9b37a9b9-ab34-4152-88af-22728c8758a9",
744 |    "metadata": {},
745 |    "outputs": [],
746 |    "source": []
747 |   },
748 |   {
749 |    "cell_type": "markdown",
750 |    "id": "90285ffa-4312-4ea8-84a6-595199688140",
751 |    "metadata": {},
752 |    "source": [
753 |     "relevant result at the end"
754 |    ]
755 |   },
756 |   {
757 |    "cell_type": "code",
758 |    "execution_count": 11,
759 |    "id": "356a2b4a-6f3d-42df-bf65-7796bc29c7d9",
760 |    "metadata": {},
761 |    "outputs": [
762 |     {
763 |      "data": {
764 |       "text/plain": [
765 |        "{'p@1': 0.0, 'p@5': 0.2, 'p@10': 0.1, 'mrr': 0.2, 'map': 0.2}"
766 |       ]
767 |      },
768 |      "execution_count": 11,
769 |      "metadata": {},
770 |      "output_type": "execute_result"
771 |     }
772 |    ],
773 |    "source": [
774 |     "metrics_utils.all_metrics([0,0,0,0,1])"
775 |    ]
776 |   },
777 |   {
778 |    "cell_type": "markdown",
779 |    "id": "59f9f574-e506-45e0-9c4c-c65a2b3827eb",
780 |    "metadata": {},
781 |    "source": [
782 |     "relevant result at the beginning"
783 |    ]
784 |   },
785 |   {
786 |    "cell_type": "code",
787 |    "execution_count": 12,
788 |    "id": "8252bfbc-7184-437b-91e6-b60d166a9742",
789 |    "metadata": {},
790 |    "outputs": [
791 |     {
792 |      "data": {
793 |       "text/plain": [
794 |        "{'p@1': 1.0, 'p@5': 0.2, 'p@10': 0.1, 'mrr': 1.0, 'map': 1.0}"
795 |       ]
796 |      },
797 |      "execution_count": 12,
798 |      "metadata": {},
799 |      "output_type": "execute_result"
800 |     }
801 |    ],
802 |    "source": [
803 |     "metrics_utils.all_metrics([1,0,0,0,0])"
804 |    ]
805 |   },
806 |   {
807 |    "cell_type": "code",
808 |    "execution_count": 13,
809 |    "id": "196acac3-a263-4307-8ef9-075e7492870c",
810 |    "metadata": {},
811 |    "outputs": [
812 |     {
813 |      "data": {
814 |       "text/plain": [
815 |        "0.2"
816 |       ]
817 |      },
818 |      "execution_count": 13,
819 |      "metadata": {},
820 |      "output_type": "execute_result"
821 |     }
822 |    ],
823 |    "source": []
824 |   },
825 |   {
826 |    "cell_type": "markdown",
827 |    "id": "e3b1413a-81a2-4a7d-9a46-ac6c9938b17e",
828 |    "metadata": {},
829 |    "source": [
830 |     "MAP rewards rankings that place the relevant results near the beginning"
831 |    ]
832 |   },
833 |   {
834 |    "cell_type": "code",
835 |    "execution_count": 14,
836 |    "id": "ad52c3ad-952a-4340-87bd-d20369cb420d",
837 |    "metadata": {},
838 |    "outputs": [
839 |     {
840 |      "data": {
841 |       "text/plain": [
842 |        "{'p@1': 0.0,\n",
843 |        " 'p@5': 0.4,\n",
844 |        " 'p@10': 0.2,\n",
845 |        " 'mrr': 0.3333333333333333,\n",
846 |        " 'map': 0.41666666666666663}"
847 |       ]
848 |      },
849 |      "execution_count": 14,
850 |      "metadata": {},
851 |      "output_type": "execute_result"
852 |     }
853 |    ],
854 |    "source": [
855 |     "metrics_utils.all_metrics([0,0,1,1,0])"
856 |    ]
857 |   },
858 |   {
859 |    "cell_type": "code",
860 |    "execution_count": 15,
861 |    "id": "8f61fc8e-7292-43dc-8f29-501d7fee8876",
862 |    "metadata": {},
863 |    "outputs": [
864 |     {
865 |      "data": {
866 |       "text/plain": [
867 |        "{'p@1': 1.0, 'p@5': 0.4, 'p@10': 0.2, 'mrr': 1.0, 'map': 1.0}"
868 |       ]
869 |      },
870 |      "execution_count": 15,
871 |      "metadata": {},
872 |      "output_type": "execute_result"
873 |     }
874 |    ],
875 |    "source": [
876 |     "metrics_utils.all_metrics([1,1,0,0,0])"
877 |    ]
878 |   },
879 |   {
880 |    "cell_type": "code",
881 |    "execution_count": null,
882 |    "id": "8505c6aa-d009-4dea-9263-38ca4d9f2c4b",
883 |    "metadata": {},
884 |    "outputs": [],
885 |    "source": []
886 |   }
887 |  ],
888 |  "metadata": {
889 |   "environment": {
890 |    "kernel": "python3",
891 |    "name": "pytorch-gpu.1-12.m99",
892 |    "type": "gcloud",
893 |    "uri": "gcr.io/deeplearning-platform-release/pytorch-gpu.1-12:m99"
894 |   },
895 |   "kernelspec": {
896 |    "display_name": "Python 3",
897 |    "language": "python",
898 |    "name": "python3"
899 |   },
900 |   "language_info": {
901 |    "codemirror_mode": {
902 |     "name": "ipython",
903 |     "version": 3
904 |    },
905 |    "file_extension": ".py",
906 |    "mimetype": "text/x-python",
907 |    "name": "python",
908 |    "nbconvert_exporter": "python",
909 |    "pygments_lexer": "ipython3",
910 |    "version": "3.7.12"
911 |   }
912 |  },
913 |  "nbformat": 4,
914 |  "nbformat_minor": 5
915 | }
916 | 


--------------------------------------------------------------------------------
/archive/notebooks_stackoverflow/metrics_utils.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | 
 3 | def precision_at_k(r, k):
 4 |     
 5 |     r = r[:k]
 6 |     
 7 |     return sum(r) / k 
 8 |     
 9 | 
def mean_reciprocal_rank(r):
    """Return the reciprocal rank of the first relevant result in ``r``.

    Despite the name, this scores a single ranked list; averaging the value
    over several queries is left to the caller.

    Any truthy entry counts as relevant, so the function works on any
    iterable (list, tuple, numpy array, iterator) and on relevance values
    other than exactly ``1``/``True``.

    Args:
        r: Iterable of relevance values in rank order (best first).

    Returns:
        ``1 / rank`` of the first truthy entry (ranks start at 1), or ``0``
        when ``r`` is empty or holds no relevant entry.

    Raises:
        TypeError: If ``r`` is not iterable.
    """
    # A single pass replaces ``r.index(True)`` inside a bare ``except``,
    # which hid every error and returned 0 for inputs lacking ``.index``.
    for rank, relevant in enumerate(r, start=1):
        if relevant:
            return 1 / rank

    return 0
20 | 
def average_precision(r):
    """Average the precision@k values taken at every relevant rank of ``r``.

    For each position whose entry is truthy, precision is measured over the
    list up to and including that position; the result is the mean of those
    values.

    Args:
        r: Indexable, sliceable sequence of relevance values in rank order.

    Returns:
        The mean of the collected precision values, or ``0`` when ``r`` has
        no truthy entry.
    """
    # One precision value per relevant position, cut off at that position.
    precisions = [
        precision_at_k(r, pos + 1)
        for pos in range(len(r))
        if r[pos]
    ]

    if not precisions:
        return 0

    return sum(precisions) / len(precisions)
36 |                            
37 | 
def all_metrics(result):
    """Compute every ranking metric in this module for one ranked result list.

    Args:
        result: Iterable of relevance values in rank order; it is copied
            into a list before scoring.

    Returns:
        A dict with the keys ``"p@1"``, ``"p@5"``, ``"p@10"``, ``"mrr"`` and
        ``"map"``, in that order.
    """
    ranked = list(result)

    metrics = dict()
    metrics["p@1"] = precision_at_k(ranked, 1)
    metrics["p@5"] = precision_at_k(ranked, 5)
    metrics["p@10"] = precision_at_k(ranked, 10)
    metrics["mrr"] = mean_reciprocal_rank(ranked)
    metrics["map"] = average_precision(ranked)

    return metrics


--------------------------------------------------------------------------------
/archive/notebooks_stackoverflow/test_setup.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "id": "ad224cf0-176b-4460-afc0-03f0aacdfe71",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "import datetime\n",
 11 |     "import pickle\n",
 12 |     "import uuid\n",
 13 |     "import datetime\n",
 14 |     "import numpy as np\n",
 15 |     "import time"
 16 |    ]
 17 |   },
 18 |   {
 19 |    "cell_type": "code",
 20 |    "execution_count": null,
 21 |    "id": "9d7d74e6-1472-4f92-b582-fb74683a252e",
 22 |    "metadata": {},
 23 |    "outputs": [],
 24 |    "source": []
 25 |   },
 26 |   {
 27 |    "cell_type": "markdown",
 28 |    "id": "aae437e9-d66c-44a8-ab44-4523f7abb5b1",
 29 |    "metadata": {},
 30 |    "source": [
 31 |     "# Elastic Search"
 32 |    ]
 33 |   },
 34 |   {
 35 |    "cell_type": "code",
 36 |    "execution_count": null,
 37 |    "id": "b7831c3d-d97c-4027-bfbd-26b4f672b003",
 38 |    "metadata": {},
 39 |    "outputs": [],
 40 |    "source": [
 41 |     "import elasticsearch"
 42 |    ]
 43 |   },
 44 |   {
 45 |    "cell_type": "code",
 46 |    "execution_count": null,
 47 |    "id": "69aff282-e9b0-466b-b828-87b69e3dcbc1",
 48 |    "metadata": {},
 49 |    "outputs": [],
 50 |    "source": [
 51 |     "elasticsearch.__version__"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": null,
 57 |    "id": "07c82415-1cd2-43e8-a88c-626eac3dea04",
 58 |    "metadata": {},
 59 |    "outputs": [],
 60 |    "source": [
 61 |     "from elasticsearch import Elasticsearch"
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "code",
 66 |    "execution_count": null,
 67 |    "id": "5e2563f6-5723-4aa6-a120-f4764c4d8b07",
 68 |    "metadata": {},
 69 |    "outputs": [],
 70 |    "source": [
 71 |     "es = Elasticsearch(hosts=\"http://localhost:9200\" , verify_certs=False)"
 72 |    ]
 73 |   },
 74 |   {
 75 |    "cell_type": "code",
 76 |    "execution_count": null,
 77 |    "id": "4f00be9b-8c7d-4993-9a75-2ed9f5a94b37",
 78 |    "metadata": {},
 79 |    "outputs": [],
 80 |    "source": [
 81 |     "index_name = \"test-index\""
 82 |    ]
 83 |   },
 84 |   {
 85 |    "cell_type": "code",
 86 |    "execution_count": null,
 87 |    "id": "056b1007-5cc8-4a40-ab59-ffda53e269d2",
 88 |    "metadata": {},
 89 |    "outputs": [],
 90 |    "source": [
 91 |     "doc = {\n",
 92 |     "    'author': 'kimchy',\n",
 93 |     "    'text': 'Elasticsearch: cool. bonsai cool.',\n",
 94 |     "    'timestamp': datetime.datetime.now(),\n",
 95 |     "}"
 96 |    ]
 97 |   },
 98 |   {
 99 |    "cell_type": "code",
100 |    "execution_count": null,
101 |    "id": "e994e955-f3ea-47f4-95e6-994390e5403e",
102 |    "metadata": {},
103 |    "outputs": [],
104 |    "source": [
105 |     "resp = es.index(index=index_name, id=1, document=doc)\n"
106 |    ]
107 |   },
108 |   {
109 |    "cell_type": "code",
110 |    "execution_count": null,
111 |    "id": "6ecf186b-458a-4bf6-9ed1-ff1030b72f50",
112 |    "metadata": {},
113 |    "outputs": [],
114 |    "source": [
115 |     "print(resp['result'])\n",
116 |     "\n",
117 |     "resp = es.get(index=index_name, id=1)\n",
118 |     "print(resp['_source'])\n",
119 |     "\n",
120 |     "es.indices.refresh(index=index_name)\n",
121 |     "\n",
122 |     "resp = es.search(index=index_name, query={\"match_all\": {}})\n",
123 |     "print(\"Got %d Hits:\" % resp['hits']['total']['value'])\n",
124 |     "for hit in resp['hits']['hits']:\n",
125 |     "    print(\"%(timestamp)s %(author)s: %(text)s\" % hit[\"_source\"])"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "code",
130 |    "execution_count": null,
131 |    "id": "ec1b445b-6cc5-458d-bc9b-5b371c82d1a5",
132 |    "metadata": {},
133 |    "outputs": [],
134 |    "source": []
135 |   },
136 |   {
137 |    "cell_type": "markdown",
138 |    "id": "cf0dd18f-8fde-4a72-b226-f58ab22c4520",
139 |    "metadata": {},
140 |    "source": [
141 |     "# Milvus"
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "markdown",
146 |    "id": "53965619-9c81-472e-9110-7c4e2b1cc3a7",
147 |    "metadata": {},
148 |    "source": [
149 |     "https://github.com/milvus-io/pymilvus/blob/master/examples/hello_milvus.ipynb"
150 |    ]
151 |   },
152 |   {
153 |    "cell_type": "code",
154 |    "execution_count": null,
155 |    "id": "8c384da7-3a72-4b9e-bacf-4e188af6a3b4",
156 |    "metadata": {},
157 |    "outputs": [],
158 |    "source": [
159 |     "from pymilvus import (\n",
160 |     "    connections,\n",
161 |     "    utility,\n",
162 |     "    FieldSchema, CollectionSchema, DataType,\n",
163 |     "    Collection,\n",
164 |     ")\n"
165 |    ]
166 |   },
167 |   {
168 |    "cell_type": "code",
169 |    "execution_count": null,
170 |    "id": "c4c65116-21fa-46da-ba1c-231bf2a7569a",
171 |    "metadata": {},
172 |    "outputs": [],
173 |    "source": [
174 |     "num_entities, dim = 3000, 8\n"
175 |    ]
176 |   },
177 |   {
178 |    "cell_type": "code",
179 |    "execution_count": null,
180 |    "id": "ee5a1a32-5735-4951-8724-9a773d036ea6",
181 |    "metadata": {},
182 |    "outputs": [],
183 |    "source": [
184 |     "collection_name=\"hello_milvus\""
185 |    ]
186 |   },
187 |   {
188 |    "cell_type": "code",
189 |    "execution_count": null,
190 |    "id": "e92ce9b8-cbb3-42d2-ad8b-791d21729d3c",
191 |    "metadata": {},
192 |    "outputs": [],
193 |    "source": [
194 |     "!ls"
195 |    ]
196 |   },
197 |   {
198 |    "cell_type": "code",
199 |    "execution_count": null,
200 |    "id": "55e5a4ad-9150-4b86-9cb6-f968b7fe51fd",
201 |    "metadata": {},
202 |    "outputs": [],
203 |    "source": [
204 |     "connections.connect(\"default\", host=\"localhost\", port=\"19530\")\n"
205 |    ]
206 |   },
207 |   {
208 |    "cell_type": "code",
209 |    "execution_count": null,
210 |    "id": "ae3415ca-a52f-4319-b20c-d08f49bd06e7",
211 |    "metadata": {},
212 |    "outputs": [],
213 |    "source": [
214 |     "if  utility.has_collection(collection_name):\n",
215 |     "    utility.drop_collection(collection_name)"
216 |    ]
217 |   },
218 |   {
219 |    "cell_type": "code",
220 |    "execution_count": null,
221 |    "id": "a4324ba0-4050-4b3c-9ba6-6e591177509d",
222 |    "metadata": {},
223 |    "outputs": [],
224 |    "source": [
225 |     "utility.list_collections()"
226 |    ]
227 |   },
228 |   {
229 |    "cell_type": "markdown",
230 |    "id": "c070eb28-fa18-43ad-a2de-1067b2476274",
231 |    "metadata": {},
232 |    "source": [
233 |     "create collection"
234 |    ]
235 |   },
236 |   {
237 |    "cell_type": "code",
238 |    "execution_count": null,
239 |    "id": "5e08cee9-e957-44b5-856f-c4549faa7b86",
240 |    "metadata": {},
241 |    "outputs": [],
242 |    "source": [
243 |     "fields = [\n",
244 |     "    FieldSchema(name=\"pk\", dtype=DataType.VARCHAR, is_primary=True, auto_id=False, max_length=100),\n",
245 |     "    FieldSchema(name=\"random\", dtype=DataType.DOUBLE),\n",
246 |     "    FieldSchema(name=\"embeddings\", dtype=DataType.FLOAT_VECTOR, dim=dim)\n",
247 |     "]\n",
248 |     "\n",
249 |     "schema = CollectionSchema(fields, \"hello_milvus is the simplest demo to introduce the APIs\")\n",
250 |     "\n",
251 |     "hello_milvus = Collection(collection_name, schema, consistency_level=\"Strong\")"
252 |    ]
253 |   },
254 |   {
255 |    "cell_type": "code",
256 |    "execution_count": null,
257 |    "id": "8eff6925-4089-46c3-ae03-00fc748c3135",
258 |    "metadata": {},
259 |    "outputs": [],
260 |    "source": []
261 |   },
262 |   {
263 |    "cell_type": "code",
264 |    "execution_count": null,
265 |    "id": "99ee18c0-fb04-4970-8ba5-e36eca35680a",
266 |    "metadata": {},
267 |    "outputs": [],
268 |    "source": [
269 |     "rng = np.random.default_rng(seed=19530)\n",
270 |     "entities = [\n",
271 |     "    # provide the pk field because `auto_id` is set to False\n",
272 |     "    [str(i) for i in range(num_entities)],\n",
273 |     "    rng.random(num_entities).tolist(),  # field random, only supports list\n",
274 |     "    rng.random((num_entities, dim)),    # field embeddings, supports numpy.ndarray and list\n",
275 |     "]\n",
276 |     "\n",
277 |     "insert_result = hello_milvus.insert(entities)\n",
278 |     "\n",
279 |     "print(f\"Number of entities in Milvus: {hello_milvus.num_entities}\")  # check the num_entites"
280 |    ]
281 |   },
282 |   {
283 |    "cell_type": "code",
284 |    "execution_count": null,
285 |    "id": "ede3539d-e940-458d-a7d9-6c345178f357",
286 |    "metadata": {},
287 |    "outputs": [],
288 |    "source": []
289 |   },
290 |   {
291 |    "cell_type": "code",
292 |    "execution_count": null,
293 |    "id": "c272a614-8baa-4b3f-b77d-499aa30760d7",
294 |    "metadata": {},
295 |    "outputs": [],
296 |    "source": [
297 |     "index = {\n",
298 |     "    \"index_type\": \"IVF_FLAT\",\n",
299 |     "    \"metric_type\": \"L2\",\n",
300 |     "    \"params\": {\"nlist\": 128},\n",
301 |     "}\n",
302 |     "\n",
303 |     "hello_milvus.create_index(\"embeddings\", index)"
304 |    ]
305 |   },
306 |   {
307 |    "cell_type": "code",
308 |    "execution_count": null,
309 |    "id": "b2865bb6-854a-41a3-97f1-98634f9b57e4",
310 |    "metadata": {},
311 |    "outputs": [],
312 |    "source": [
313 |     "hello_milvus.load()\n"
314 |    ]
315 |   },
316 |   {
317 |    "cell_type": "code",
318 |    "execution_count": null,
319 |    "id": "0e8e8b2f-a00e-49a0-bbc9-cb99fecb0e7a",
320 |    "metadata": {},
321 |    "outputs": [],
322 |    "source": [
323 |     "vectors_to_search = entities[-1][-2:]\n",
324 |     "search_params = {\n",
325 |     "    \"metric_type\": \"L2\",\n",
326 |     "    \"params\": {\"nprobe\": 10},\n",
327 |     "}\n",
328 |     "\n",
329 |     "start_time = time.time()\n",
330 |     "result = hello_milvus.search(vectors_to_search, \"embeddings\", search_params, limit=3, output_fields=[\"random\"])\n",
331 |     "end_time = time.time()\n",
332 |     "\n",
333 |     "for hits in result:\n",
334 |     "    for hit in hits:\n",
335 |     "        print(f\"hit: {hit}, random field: {hit.entity.get('random')}\")\n",
336 |     "print((end_time - start_time))"
337 |    ]
338 |   },
339 |   {
340 |    "cell_type": "code",
341 |    "execution_count": null,
342 |    "id": "385d0057-cc9e-4ee3-834e-bec4675cbb96",
343 |    "metadata": {},
344 |    "outputs": [],
345 |    "source": []
346 |   },
347 |   {
348 |    "cell_type": "code",
349 |    "execution_count": null,
350 |    "id": "70601713-0913-4ccc-9efd-9058397d1266",
351 |    "metadata": {},
352 |    "outputs": [],
353 |    "source": []
354 |   },
355 |   {
356 |    "cell_type": "code",
357 |    "execution_count": null,
358 |    "id": "4ef6e0b5-1975-422f-9fdd-6e45f2e0917a",
359 |    "metadata": {},
360 |    "outputs": [],
361 |    "source": []
362 |   },
363 |   {
364 |    "cell_type": "markdown",
365 |    "id": "493a4373-3de7-4cff-9b5f-c0b7d0288506",
366 |    "metadata": {},
367 |    "source": [
368 |     "# Weaviate"
369 |    ]
370 |   },
371 |   {
372 |    "cell_type": "code",
373 |    "execution_count": null,
374 |    "id": "27b9d308-face-444b-bf3c-5d3bec0072bb",
375 |    "metadata": {},
376 |    "outputs": [],
377 |    "source": [
378 |     "#!pip install weaviate-client==3.8.0"
379 |    ]
380 |   },
381 |   {
382 |    "cell_type": "code",
383 |    "execution_count": null,
384 |    "id": "8f818cf1-ed3b-478f-bf5b-b513043ebcb1",
385 |    "metadata": {},
386 |    "outputs": [],
387 |    "source": [
388 |     "import weaviate\n"
389 |    ]
390 |   },
391 |   {
392 |    "cell_type": "code",
393 |    "execution_count": null,
394 |    "id": "595ed265-609c-41be-b9f9-eadfb9820a2f",
395 |    "metadata": {},
396 |    "outputs": [],
397 |    "source": [
398 |     "def generate_uuid(class_name: str, identifier: str,\n",
399 |     "                  test: str = 'teststrong') -> str:\n",
400 |     "    \"\"\" Generate a uuid based on an identifier\n",
401 |     "    :param identifier: characters used to generate the uuid\n",
402 |     "    :type identifier: str, required\n",
403 |     "    :param class_name: classname of the object to create a uuid for\n",
404 |     "    :type class_name: str, required\n",
405 |     "    \"\"\"\n",
406 |     "    test = 'overwritten'\n",
407 |     "    return str(uuid.uuid5(uuid.NAMESPACE_DNS, class_name + identifier))\n",
408 |     "\n",
409 |     "def log(i: str) -> str:\n",
410 |     "    \"\"\" A simple logger\n",
411 |     "    :param i: the log message\n",
412 |     "    :type i: str\n",
413 |     "    \"\"\"\n",
414 |     "    now = datetime.datetime.utcnow()\n",
415 |     "    print(now, \"| \" + str(i))"
416 |    ]
417 |   },
418 |   {
419 |    "cell_type": "code",
420 |    "execution_count": null,
421 |    "id": "0cd92cf1-a8f6-4f71-ab1b-008d17e659b3",
422 |    "metadata": {},
423 |    "outputs": [],
424 |    "source": [
425 |     "client = weaviate.Client(\"http://localhost:8081\")\n",
426 |     "print(\"Client created\")"
427 |    ]
428 |   },
429 |   {
430 |    "cell_type": "code",
431 |    "execution_count": null,
432 |    "id": "3645408d-9a51-41b9-aa02-afd549cacdc3",
433 |    "metadata": {},
434 |    "outputs": [],
435 |    "source": [
436 |     "from sentence_transformers import SentenceTransformer\n",
437 |     "sbert_model = SentenceTransformer('bert-base-nli-mean-tokens') #, Initially load using this, then start using pickle to save time."
438 |    ]
439 |   },
440 |   {
441 |    "cell_type": "code",
442 |    "execution_count": null,
443 |    "id": "b1499af5-7259-4e05-afca-117a12dfb659",
444 |    "metadata": {},
445 |    "outputs": [],
446 |    "source": [
447 |     "# from sentence_transformers import SentenceTransformer\n",
448 |     "# # sbert_model = SentenceTransformer('bert-base-nli-mean-tokens'), Initially load using this, then start using pickle to save time.\n",
449 |     "# with open(\"sbert\",'rb') as f:\n",
450 |     "#     sbert_model = pickle.load(f)\n",
451 |     "\n",
452 |     "print(\"sbert loaded\")\n",
453 |     "\n",
454 |     "# I am adding the texts in this list,\n",
455 |     "# We can also add sentences of a large text individually to get more precise results when we query.\n",
456 |     "documents = [\n",
457 |     "    '''Taj mahal is an immense mausoleum of white marble, built in Agra between 1631 and 1648 by order of the Mughal emperor Shah Jahan in memory of his favourite wife, the Taj Mahal is the jewel of Muslim art in India and one of the universally admired masterpieces of the world's heritage.''',\n",
458 |     "    '''The Statue of Liberty is a 305-foot (93-metre) statue located on Liberty Island in Upper New York Bay, off the coast of New York City. The statue is a personification of liberty in the form of a woman. She holds a torch in her raised right hand and clutches a tablet in her left.''',\n",
459 |     "    '''The Statue of Liberty was sculpted between 1875 and 1884 under the direction of French sculptor Frédéric-Auguste Bartholdi, who began drafting designs in 1870. Bartholdi and his team hammered roughly 31 tons of copper sheets onto a steel frame. Before being mounted on its current pedestal, the statue stood over 151 feet (46 metres) tall and weighed 225 tons.''',\n",
460 |     "    '''Badminton is a racquet sport played using racquets to hit a shuttlecock across a net. Although it may be played with larger teams, the most common forms of the game are \"singles\" (with one player per side) and \"doubles\" (with two players per side). Badminton is often played as a casual outdoor activity in a yard or on a beach; formal games are played on a rectangular indoor court. Points are scored by striking the shuttlecock with the racquet and landing it within the opposing side's half of the court.''',\n",
461 |     "    '''James Bond is a fictional character created by novelist Ian Fleming in 1953.''',\n",
462 |     "    '''A British secret agent working for MI6 under the codename 007, he has been portrayed on film by actors Sean Connery, David Niven, George Lazenby, Roger Moore, Timothy Dalton, Pierce Brosnan and Daniel Craig in twenty-seven productions.'''\n",
463 |     "]\n",
464 |     "\n",
465 |     "# A dictionary to store the document and its feature vector (the vector generated by SBERT)\n",
466 |     "doc_and_vec = {}\n",
467 |     "\n",
468 |     "def giveVector(texts):\n",
469 |     "    # this function returns the vector using SBERT\n",
470 |     "    return sbert_model.encode(texts)\n",
471 |     "\n",
472 |     "vectors = giveVector(documents)\n",
473 |     "\n",
474 |     "for doc,vec in zip(documents,vectors):\n",
475 |     "    doc_and_vec[doc] = vec\n",
476 |     "\n",
477 |     "print(\"vectors formed\")\n",
478 |     "\n",
479 |     "client.schema.delete_all()\n",
480 |     "class_obj = {\n",
481 |     "    \"class\": \"Post\",\n",
482 |     "    \"vectorizer\": \"none\", # we are providing the vectors ourselves through our SBERT model, so this field is none\n",
483 |     "    \"properties\": [{\n",
484 |     "        \"name\": \"content\",\n",
485 |     "        \"dataType\": [\"text\"],\n",
486 |     "    }]\n",
487 |     "}\n",
488 |     "\n",
489 |     "client.schema.create_class(class_obj)\n",
490 |     "print(\"Schema class created\")\n",
491 |     "\n",
492 |     "for doc,vec in doc_and_vec.items():\n",
493 |     "    data_obj = {\n",
494 |     "    \"content\": doc\n",
495 |     "    }\n",
496 |     "    client.data_object.create(\n",
497 |     "    data_obj,\n",
498 |     "    \"Post\",\n",
499 |     "    generate_uuid('Post',doc),\n",
500 |     "    vector = vec,\n",
501 |     "    )\n",
502 |     "print(\"Finished importing data\")\n",
503 |     "\n",
504 |     "def process_query(vec):\n",
505 |     "    nearVector = {\"vector\": vec}\n",
506 |     "    res = client.query.get(\"Post\", [\"content\", \"_additional {certainty}\"]).with_near_vector(nearVector).do()\n",
507 |     "    print(res)\n",
508 |     "    print(\"------------------------------------------------------------------------------------------------\")\n",
509 |     "    print(\"-----------------------------------Most similar text -------------------------------------------\")\n",
510 |     "    print(res['data']['Get']['Post'][0]['content'])\n",
511 |     "    print(\"------------------------------------------------------------------------------------------------\")\n",
512 |     "    print(res['data']['Get']['Post'][1]['content'])\n",
513 |     "    print(\"------------------------------------------------------------------------------------------------\")\n",
514 |     "\n",
515 |     "    \n"
516 |    ]
517 |   },
518 |   {
519 |    "cell_type": "code",
520 |    "execution_count": null,
521 |    "id": "065e4517-d430-48c7-89ae-9cd29c3a31f3",
522 |    "metadata": {},
523 |    "outputs": [],
524 |    "source": [
525 |     "query =\"american tourist destination\"\n",
526 |     "query_vec = sbert_model.encode(query)\n",
527 |     "process_query(query_vec)\n"
528 |    ]
529 |   },
530 |   {
531 |    "cell_type": "code",
532 |    "execution_count": null,
533 |    "id": "1a232068-a38a-490b-a329-d5b5e773174b",
534 |    "metadata": {},
535 |    "outputs": [],
536 |    "source": []
537 |   }
538 |  ],
539 |  "metadata": {
540 |   "environment": {
541 |    "kernel": "python3",
542 |    "name": "pytorch-gpu.1-12.m99",
543 |    "type": "gcloud",
544 |    "uri": "gcr.io/deeplearning-platform-release/pytorch-gpu.1-12:m99"
545 |   },
546 |   "kernelspec": {
547 |    "display_name": "Python 3",
548 |    "language": "python",
549 |    "name": "python3"
550 |   },
551 |   "language_info": {
552 |    "codemirror_mode": {
553 |     "name": "ipython",
554 |     "version": 3
555 |    },
556 |    "file_extension": ".py",
557 |    "mimetype": "text/x-python",
558 |    "name": "python",
559 |    "nbconvert_exporter": "python",
560 |    "pygments_lexer": "ipython3",
561 |    "version": "3.7.12"
562 |   }
563 |  },
564 |  "nbformat": 4,
565 |  "nbformat_minor": 5
566 | }
567 | 


--------------------------------------------------------------------------------
/archive/notebooks_stackoverflow/workshop_setup.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "id": "92773bd8-4d3e-47d0-af4f-52216bb43465",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": []
 10 |   },
 11 |   {
 12 |    "cell_type": "code",
 13 |    "execution_count": 1,
 14 |    "id": "55d91d10-03a4-46ad-b011-a1cfd22ab1e7",
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "from sentence_transformers import SentenceTransformer,  CrossEncoder, util\n",
 19 |     "import os"
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "code",
 24 |    "execution_count": null,
 25 |    "id": "1cd6384a-b5e3-4d82-9a07-35b821524321",
 26 |    "metadata": {},
 27 |    "outputs": [],
 28 |    "source": []
 29 |   },
 30 |   {
 31 |    "cell_type": "code",
 32 |    "execution_count": null,
 33 |    "id": "9451b070-e18c-46c3-a012-6e2878cc26f4",
 34 |    "metadata": {},
 35 |    "outputs": [],
 36 |    "source": [
 37 |     "model = SentenceTransformer('flax-sentence-embeddings/stackoverflow_mpnet-base')\n"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "code",
 42 |    "execution_count": null,
 43 |    "id": "f414975a-3dc2-4eb6-9b3b-bd24be4d18a4",
 44 |    "metadata": {},
 45 |    "outputs": [],
 46 |    "source": []
 47 |   },
 48 |   {
 49 |    "cell_type": "code",
 50 |    "execution_count": null,
 51 |    "id": "24f40c4c-a183-4d84-b70e-4a1a86a229ee",
 52 |    "metadata": {},
 53 |    "outputs": [],
 54 |    "source": [
 55 |     "bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')\n"
 56 |    ]
 57 |   },
 58 |   {
 59 |    "cell_type": "code",
 60 |    "execution_count": null,
 61 |    "id": "8983c00d-214d-4df3-b024-154b2105ace5",
 62 |    "metadata": {},
 63 |    "outputs": [],
 64 |    "source": [
 65 |     "cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')\n"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "code",
 70 |    "execution_count": null,
 71 |    "id": "1770345e-6eff-4761-bf39-64020967cc51",
 72 |    "metadata": {},
 73 |    "outputs": [],
 74 |    "source": []
 75 |   },
 76 |   {
 77 |    "cell_type": "code",
 78 |    "execution_count": null,
 79 |    "id": "a8834f31-56cf-4760-b0e6-c86e3b8efd39",
 80 |    "metadata": {},
 81 |    "outputs": [],
 82 |    "source": [
 83 |     "wikipedia_filepath = 'simplewiki-2020-11-01.jsonl.gz'\n",
 84 |     "\n",
 85 |     "if not os.path.exists(wikipedia_filepath):\n",
 86 |     "    util.http_get('http://sbert.net/datasets/simplewiki-2020-11-01.jsonl.gz', wikipedia_filepath)\n"
 87 |    ]
 88 |   }
 89 |  ],
 90 |  "metadata": {
 91 |   "environment": {
 92 |    "kernel": "python3",
 93 |    "name": "pytorch-gpu.1-12.m99",
 94 |    "type": "gcloud",
 95 |    "uri": "gcr.io/deeplearning-platform-release/pytorch-gpu.1-12:m99"
 96 |   },
 97 |   "kernelspec": {
 98 |    "display_name": "Python 3",
 99 |    "language": "python",
100 |    "name": "python3"
101 |   },
102 |   "language_info": {
103 |    "codemirror_mode": {
104 |     "name": "ipython",
105 |     "version": 3
106 |    },
107 |    "file_extension": ".py",
108 |    "mimetype": "text/x-python",
109 |    "name": "python",
110 |    "nbconvert_exporter": "python",
111 |    "pygments_lexer": "ipython3",
112 |    "version": "3.7.12"
113 |   }
114 |  },
115 |  "nbformat": 4,
116 |  "nbformat_minor": 5
117 | }
118 | 


--------------------------------------------------------------------------------
/assets/all_assets.sw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/npatta01/search-engine-workshop/d8d4d1e6234f29c3a158b6343b06701728be92ab/assets/all_assets.sw


--------------------------------------------------------------------------------
/assets/slides_odsc2022.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/npatta01/search-engine-workshop/d8d4d1e6234f29c3a158b6343b06701728be92ab/assets/slides_odsc2022.pdf


--------------------------------------------------------------------------------
/assets/slides_pydatanyc2022.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/npatta01/search-engine-workshop/d8d4d1e6234f29c3a158b6343b06701728be92ab/assets/slides_pydatanyc2022.pdf


--------------------------------------------------------------------------------
/assets/slides_pydataseattle2023.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/npatta01/search-engine-workshop/d8d4d1e6234f29c3a158b6343b06701728be92ab/assets/slides_pydataseattle2023.pdf


--------------------------------------------------------------------------------
/docker-compose.yaml:
--------------------------------------------------------------------------------
 1 | version: "3.0"
 2 | services:
 3 |   elasticsearch:
 4 |     container_name: es-container
 5 |     image: docker.elastic.co/elasticsearch/elasticsearch:8.7.0
 6 |     environment:
 7 |       - xpack.security.enabled=false
 8 |       - "discovery.type=single-node"
 9 |     ports:
10 |       - 9200:9200
11 |     volumes:
12 |         - esdata:/usr/share/elasticsearch/data
13 |         
14 |         
15 | #   milvus:
16 | #     container_name: milvus
17 | #     image: milvusdb/milvus:1.1.1-cpu-d061621-330cc6
18 | #     ports:
19 | #       - 19530:19530 
20 | #       - 19121:19121
21 | #     volumes:
22 | #         - milvusdata:/var/lib/milvus
23 | 
24 | 
25 | #   milvus:
26 | #     container_name: milvus
27 | #     build:
28 | #       context: docker_milvus    
29 | #     ports:
30 | #       - 19530:19530 
31 | #       - 19121:19121
32 | #     volumes:
33 | #         - milvusdata:/var/lib/milvus
34 |  
35 |         
36 | #   weaviate:
37 | #     image: semitechnologies/weaviate:1.14.0
38 | #     ports:
39 | #       - 8081:8080
40 | #     environment:
41 | #         QUERY_DEFAULTS_LIMIT: 25
42 | #         AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'
43 | #         PERSISTENCE_DATA_PATH: '/var/lib/weaviate'
44 | #         DEFAULT_VECTORIZER_MODULE: 'none'
45 | #         ENABLE_MODULES: ''
46 | #         CLUSTER_HOSTNAME: 'node1'  
47 | #     volumes:
48 | #       - weaviatedata:/var/lib/weaviate
49 | volumes:
50 |   esdata:
51 |   # weaviatedata:
52 |   # milvusdata:


--------------------------------------------------------------------------------
/docs/internal_notes.md:
--------------------------------------------------------------------------------
 1 | # Internal Notes
 2 | 
 3 | ## Setup Dep
 4 | 
 5 | Install other deps
 6 | 
 7 | ```bash
 8 | sudo apt update && sudo apt install -y p7zip-full
 9 | ```
10 | 
11 | Create conda environment
12 | 
13 | ```bash
14 | conda create -n workshop python=3.7 mamba
15 | conda activate workshop
16 | mamba env update -n workshop -f environment.yaml 
17 | # mamba install anaconda jupyter ipykernel nb_conda_kernels
18 | 
19 | mamba install ipython ipykernel nb_conda_kernels
20 | 
21 | ipython kernel install --user --name=workshop
22 | 
23 | 
24 | conda create --name workshop --clone base
25 | 
26 | ```
27 | 
28 | Start ES/ Faiss for local dev
29 | 
30 | ```bash
31 | docker-compose up
32 | ```
33 | 
34 | ```bash
35 | docker run --user root -e GRANT_SUDO=yes -it app bash
36 | ```
37 | 
38 | 
39 | 
40 | ```
41 | Go to DIR: /projects/search-engine-workshop
42 | Type: docker-compose up
43 | 
44 | In the notebooks test... checks the milvus and elastic connections
45 | 
46 | 
47 | ```
48 | gsutil -m cp -r gs://np-training-tmp/stackoverflow/final* gs://np-public-training-temp/stackoverflow/
49 | ```
50 | 
51 | ```
52 | 
53 | 
54 | 
55 | 
56 | ```
57 | zip -r data_processed.zip data/processed/
58 | 
59 | gh release delete v1.0
60 | 
61 | gh release create v1.0 'data_processed.zip#Hugging Face Dataset of Unsplashed collection' \
62 | --title "v1.0" --notes "initial release"
63 | 
64 | 
65 | ```
66 | 
67 | 
68 | 
69 | ```
70 | zip -r /tmp/data.zip data/
71 | gsutil cp /tmp/data.zip gs://np-public-training-tmp/search-workshop/data.zip
72 | 
73 | 
74 | ```


--------------------------------------------------------------------------------
/docs/slide_notes.md:
--------------------------------------------------------------------------------
 1 | PUT /items
 2 | 
 3 | ```json
 4 | {
 5 |   "mappings": {
 6 |     "properties": {
 7 |       "title":          { "type": "text"    },
 8 |       "description":    { "type": "text"    },
 9 | 
10 |       "brand":          { "type": "keyword" },
11 |       "product_type":   { "type": "keyword" },
12 | 
13 |       "price":          { "type": "double"  }
14 |     }
15 |   }
16 | }
17 | ```
18 | 
19 | Nike shoe under 100$ 
20 | 
21 | GET /items/_search
22 | 
23 | ```json
24 | {
25 |   "query": {
26 |     
27 |     "multi_match": {
28 |         "query": "Nike shoe under 100$",
29 |         "fields": ["title^2", "description^1"]
30 |     }
31 | 
32 |     ,"bool": {
33 |       "filter": [
34 |         { "term": { "brand": "nike" }}
35 |       ]
36 |     }
37 |     ,"filtered": {           
38 |         "filter": {
39 |             "range":  {
40 |                 "price" : {  "lte": 100 }
41 |             }
42 |         }
43 |     }   
44 | }
45 | 
46 | ```
47 | 
48 | 
49 | 
50 | ## PR curve
51 | ```
52 | Recall	Perfect Classifier	Baseline Classifier	Good Classifier	High Precision
53 | 0.1	0.95	0.5	0.9	0.91
54 | 0.2	0.95	0.5	0.85	0.91
55 | 0.3	0.95	0.5	0.85	0.91
56 | 0.4	0.95	0.5	0.8	0.9
57 | 0.5	0.95	0.5	0.8	0.4
58 | 0.6	0.95	0.5	0.8	0.4
59 | 0.7	0.95	0.5	0.8	0.4
60 | 0.8	0.95	0.5	0.8	0.2
61 | 0.9	0.95	0.5	0.7	0.2
62 | 1	0.9	0.5	0.2	0.1
63 | ```
64 | 
65 | 
66 | 
67 | 
68 | dcg
69 | 
70 | ```
71 | Discounted\space Cumulative\space Gain
72 |  = \sum_{i=1}^{p}\frac{ relevance (i)}{log_{2}(i+1)}
73 | 
74 | 
75 | \\
76 | DCG = {\color{Green}\frac{3}{log_{2}(2)} } + \frac{1}{log_{2}(3)} + {\color{Red}\frac{0}{log_{2}(4)} }+\frac{2}{log_{2}(5)} = 4.49
77 | 
78 | \\
79 | 
80 | (Ideal)\space DCG = {\color{Green}\frac{3}{log_{2}(2)} } + \frac{2}{log_{2}(3)}  + \frac{1}{log_{2}(4)} + {\color{Red}\frac{0}{log_{2}(5)} } = 4.76
81 | 
82 | \\
83 | Normalized\space Discounted\space Cumulative\space Gain
84 |  = \frac{ DCG}{Ideal\space DCG} = \frac{4.49}{4.76} \approx 0.94
85 | ```


--------------------------------------------------------------------------------
/environment.yaml:
--------------------------------------------------------------------------------
 1 | #name: workshop
 2 | channels:
 3 |   - conda-forge
 4 | dependencies:
 5 |   - python==3.7.*
 6 |   - pip
 7 |   - mamba
 8 |   - nb_conda_kernels
 9 |   - pyarrow==9.0.* 
10 |   - lxml==4.9.*
11 |   - pip:
12 |     - -r requirements.txt
13 | 
14 | 
15 | 
16 | 


--------------------------------------------------------------------------------
/notebooks/04_ann.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "markdown",
   5 |    "id": "d6298118-b5f8-4250-bd82-e2a3787914ca",
   6 |    "metadata": {},
   7 |    "source": [
   8 |     "# Benchmarking Approximate Nearest Neighbors"
   9 |    ]
  10 |   },
  11 |   {
  12 |    "cell_type": "markdown",
  13 |    "id": "c2af1fec-6519-40d0-8826-f201d0acba0b",
  14 |    "metadata": {},
  15 |    "source": [
  16 |     "# About"
  17 |    ]
  18 |   },
  19 |   {
  20 |    "cell_type": "markdown",
  21 |    "id": "6a45ed78-348b-4193-8870-cbcbaff240e1",
  22 |    "metadata": {},
  23 |    "source": [
  24 |     "In order for embedding retrieval to work at scale, we need to use a vector database.\n",
  25 |     "We also need to use Approximate Nearest Neighbor search instead of brute force.\n",
  26 |     "\n",
  27 |     "\n",
  28 |     "In this notebook, we will use [FAISS](https://github.com/facebookresearch/faiss), a library from Facebook.\n",
  29 |     "\n",
  30 |     "We will compare a brute-force index with an `IVF` index and measure the speedup gained.\n",
  31 |     "\n",
  32 |     "For a more detailed comparison, take a look <a href=\"http://ann-benchmarks.com/\">here</a> to find other solutions and benchmark data.\n",
  33 |     "\n",
  34 |     "\n",
  35 |     "We will look at `performance` and `recall@1`"
  36 |    ]
  37 |   },
  38 |   {
  39 |    "cell_type": "markdown",
  40 |    "id": "e45e627d-9c8c-4f19-ab91-0e64ed8677d7",
  41 |    "metadata": {},
  42 |    "source": [
  43 |     "# Setup"
  44 |    ]
  45 |   },
  46 |   {
  47 |    "cell_type": "code",
  48 |    "execution_count": 1,
  49 |    "id": "62298ef5-fa20-4164-8ec7-e7f43bf85c20",
  50 |    "metadata": {
  51 |     "execution": {
  52 |      "iopub.execute_input": "2023-04-26T14:52:36.834089Z",
  53 |      "iopub.status.busy": "2023-04-26T14:52:36.833681Z",
  54 |      "iopub.status.idle": "2023-04-26T14:52:37.774946Z",
  55 |      "shell.execute_reply": "2023-04-26T14:52:37.774030Z",
  56 |      "shell.execute_reply.started": "2023-04-26T14:52:36.834036Z"
  57 |     }
  58 |    },
  59 |    "outputs": [],
  60 |    "source": [
  61 |     "from pathlib import Path\n",
  62 |     "import numpy as np\n",
  63 |     "import pandas as pd\n",
  64 |     "import faiss\n",
  65 |     "import datasets"
  66 |    ]
  67 |   },
  68 |   {
  69 |    "cell_type": "markdown",
  70 |    "id": "6b9a73d2-aa07-46ef-b94d-8780ca9ecb68",
  71 |    "metadata": {},
  72 |    "source": [
  73 |     "## Load the embeddings of the image corpus"
  74 |    ]
  75 |   },
  76 |   {
  77 |    "cell_type": "code",
  78 |    "execution_count": 2,
  79 |    "id": "74c3f8aa-cb27-4ae8-a9a8-0060001d357c",
  80 |    "metadata": {
  81 |     "execution": {
  82 |      "iopub.execute_input": "2023-04-26T14:52:37.778006Z",
  83 |      "iopub.status.busy": "2023-04-26T14:52:37.776915Z",
  84 |      "iopub.status.idle": "2023-04-26T14:52:44.572677Z",
  85 |      "shell.execute_reply": "2023-04-26T14:52:44.571814Z",
  86 |      "shell.execute_reply.started": "2023-04-26T14:52:37.777973Z"
  87 |     }
  88 |    },
  89 |    "outputs": [],
  90 |    "source": [
  91 |     "dset = datasets.load_from_disk(\"../data/processed_embeddings\")\n",
  92 |     "## these embeddings will be used to create the search space.\n",
  93 |     "corpus = dset['embeddings']\n",
  94 |     "\n",
  95 |     "\n",
  96 |     "corpus = np.array(corpus).astype('float32')\n",
  97 |     "corpus = np.unique(corpus, axis=0)"
  98 |    ]
  99 |   },
 100 |   {
 101 |    "cell_type": "code",
 102 |    "execution_count": 3,
 103 |    "id": "94eeb2f5-88ff-428f-8e9c-5014234427b8",
 104 |    "metadata": {
 105 |     "execution": {
 106 |      "iopub.execute_input": "2023-04-26T14:52:44.574012Z",
 107 |      "iopub.status.busy": "2023-04-26T14:52:44.573742Z",
 108 |      "iopub.status.idle": "2023-04-26T14:52:44.580175Z",
 109 |      "shell.execute_reply": "2023-04-26T14:52:44.579367Z",
 110 |      "shell.execute_reply.started": "2023-04-26T14:52:44.573987Z"
 111 |     }
 112 |    },
 113 |    "outputs": [
 114 |     {
 115 |      "data": {
 116 |       "text/plain": [
 117 |        "(24954, 512)"
 118 |       ]
 119 |      },
 120 |      "execution_count": 3,
 121 |      "metadata": {},
 122 |      "output_type": "execute_result"
 123 |     }
 124 |    ],
 125 |    "source": [
 126 |     "corpus.shape"
 127 |    ]
 128 |   },
 129 |   {
 130 |    "cell_type": "code",
 131 |    "execution_count": 4,
 132 |    "id": "939108fa-daf9-472c-b026-19c6d9708a77",
 133 |    "metadata": {
 134 |     "execution": {
 135 |      "iopub.execute_input": "2023-04-26T14:52:44.581364Z",
 136 |      "iopub.status.busy": "2023-04-26T14:52:44.581116Z",
 137 |      "iopub.status.idle": "2023-04-26T14:52:44.590669Z",
 138 |      "shell.execute_reply": "2023-04-26T14:52:44.589884Z",
 139 |      "shell.execute_reply.started": "2023-04-26T14:52:44.581340Z"
 140 |     }
 141 |    },
 142 |    "outputs": [
 143 |     {
 144 |      "data": {
 145 |       "text/plain": [
 146 |        "array([[-0.08344752,  0.01604629,  0.03037108, ...,  0.03962855,\n",
 147 |        "        -0.02023211, -0.01102281],\n",
 148 |        "       [-0.07890625,  0.02533851,  0.00522987, ...,  0.02622218,\n",
 149 |        "        -0.05418065, -0.00765004],\n",
 150 |        "       [-0.0781679 ,  0.03937826, -0.01087696, ...,  0.04282334,\n",
 151 |        "        -0.02091636, -0.01027698],\n",
 152 |        "       ...,\n",
 153 |        "       [ 0.0878398 ,  0.01232621,  0.00077178, ..., -0.00705758,\n",
 154 |        "         0.01574707, -0.01541145],\n",
 155 |        "       [ 0.0882502 ,  0.03615745, -0.00961868, ...,  0.01392467,\n",
 156 |        "         0.00077467, -0.02139922],\n",
 157 |        "       [ 0.09195283,  0.04004925, -0.00255262, ...,  0.0036222 ,\n",
 158 |        "        -0.0181689 , -0.04212729]], dtype=float32)"
 159 |       ]
 160 |      },
 161 |      "execution_count": 4,
 162 |      "metadata": {},
 163 |      "output_type": "execute_result"
 164 |     }
 165 |    ],
 166 |    "source": [
 167 |     "corpus"
 168 |    ]
 169 |   },
 170 |   {
 171 |    "cell_type": "code",
 172 |    "execution_count": null,
 173 |    "id": "63581c47-4d2a-4106-883f-3f42c9070e99",
 174 |    "metadata": {},
 175 |    "outputs": [],
 176 |    "source": []
 177 |   },
 178 |   {
 179 |    "cell_type": "code",
 180 |    "execution_count": 5,
 181 |    "id": "69bf8642-772f-4996-b3c7-910a1f38b1b2",
 182 |    "metadata": {
 183 |     "execution": {
 184 |      "iopub.execute_input": "2023-04-26T14:52:44.592214Z",
 185 |      "iopub.status.busy": "2023-04-26T14:52:44.591565Z",
 186 |      "iopub.status.idle": "2023-04-26T14:52:44.600591Z",
 187 |      "shell.execute_reply": "2023-04-26T14:52:44.599816Z",
 188 |      "shell.execute_reply.started": "2023-04-26T14:52:44.592163Z"
 189 |     }
 190 |    },
 191 |    "outputs": [
 192 |     {
 193 |      "data": {
 194 |       "text/plain": [
 195 |        "512"
 196 |       ]
 197 |      },
 198 |      "execution_count": 5,
 199 |      "metadata": {},
 200 |      "output_type": "execute_result"
 201 |     }
 202 |    ],
 203 |    "source": [
 204 |     "dimension = corpus.shape[-1]\n",
 205 |     "dimension"
 206 |    ]
 207 |   },
 208 |   {
 209 |    "cell_type": "code",
 210 |    "execution_count": null,
 211 |    "id": "b5fbf481-46ac-45d3-b277-e9e19185c214",
 212 |    "metadata": {},
 213 |    "outputs": [],
 214 |    "source": []
 215 |   },
 216 |   {
 217 |    "cell_type": "markdown",
 218 |    "id": "06a59231-abcf-45f9-a69a-69149e73c2f8",
 219 |    "metadata": {
 220 |     "tags": []
 221 |    },
 222 |    "source": [
 223 |     "# Flat Index / Brute Force\n"
 224 |    ]
 225 |   },
 226 |   {
 227 |    "cell_type": "markdown",
 228 |    "id": "f2b9e1b9-c75b-4700-a3c9-cb2b8ecab451",
 229 |    "metadata": {},
 230 |    "source": [
 231 |     "FAISS supports a brute-force index.    \n",
 232 |     "This index is good if you want perfect recall.   \n",
 233 |     "It requires all the data to fit in memory.  "
 234 |    ]
 235 |   },
 236 |   {
 237 |    "cell_type": "markdown",
 238 |    "id": "926f4977-2ee0-48bd-8f50-f13e6ed82897",
 239 |    "metadata": {},
 240 |    "source": [
 241 |     "## Create the index"
 242 |    ]
 243 |   },
 244 |   {
 245 |    "cell_type": "code",
 246 |    "execution_count": 6,
 247 |    "id": "a447b05c-f30d-4d43-a6ed-fd29a99477a2",
 248 |    "metadata": {
 249 |     "execution": {
 250 |      "iopub.execute_input": "2023-04-26T14:52:44.601898Z",
 251 |      "iopub.status.busy": "2023-04-26T14:52:44.601570Z",
 252 |      "iopub.status.idle": "2023-04-26T14:52:44.609907Z",
 253 |      "shell.execute_reply": "2023-04-26T14:52:44.609154Z",
 254 |      "shell.execute_reply.started": "2023-04-26T14:52:44.601873Z"
 255 |     }
 256 |    },
 257 |    "outputs": [],
 258 |    "source": [
 259 |     "x_corpus = corpus\n",
 260 |     "x_corpus.shape\n",
 261 |     "dimension = x_corpus.shape[-1]"
 262 |    ]
 263 |   },
 264 |   {
 265 |    "cell_type": "markdown",
 266 |    "id": "cb28c78c-3156-4da9-b5c1-3b192cb4c70f",
 267 |    "metadata": {},
 268 |    "source": [
 269 |     "Initialize the flat index with the dimension of the data.    \n",
 270 |     "In the current example it is 512.\n"
 271 |    ]
 272 |   },
 273 |   {
 274 |    "cell_type": "code",
 275 |    "execution_count": 7,
 276 |    "id": "5bf3d65c-c435-407a-a932-8cdd9655ff5a",
 277 |    "metadata": {
 278 |     "execution": {
 279 |      "iopub.execute_input": "2023-04-26T14:52:44.612718Z",
 280 |      "iopub.status.busy": "2023-04-26T14:52:44.612384Z",
 281 |      "iopub.status.idle": "2023-04-26T14:52:44.664297Z",
 282 |      "shell.execute_reply": "2023-04-26T14:52:44.663442Z",
 283 |      "shell.execute_reply.started": "2023-04-26T14:52:44.612692Z"
 284 |     }
 285 |    },
 286 |    "outputs": [],
 287 |    "source": [
 288 |     "index = faiss.IndexFlatL2(dimension)"
 289 |    ]
 290 |   },
 291 |   {
 292 |    "cell_type": "markdown",
 293 |    "id": "c121ab02-4fe3-4012-af73-038ae78f872e",
 294 |    "metadata": {},
 295 |    "source": [
 296 |     "since it is a brute force index, there is no \"training\" or parameters to learn"
 297 |    ]
 298 |   },
 299 |   {
 300 |    "cell_type": "code",
 301 |    "execution_count": 8,
 302 |    "id": "3c2d2c60-0afb-44cf-9cf6-5032811725a7",
 303 |    "metadata": {
 304 |     "execution": {
 305 |      "iopub.execute_input": "2023-04-26T14:52:44.666136Z",
 306 |      "iopub.status.busy": "2023-04-26T14:52:44.665598Z",
 307 |      "iopub.status.idle": "2023-04-26T14:52:44.670880Z",
 308 |      "shell.execute_reply": "2023-04-26T14:52:44.670173Z",
 309 |      "shell.execute_reply.started": "2023-04-26T14:52:44.666092Z"
 310 |     }
 311 |    },
 312 |    "outputs": [
 313 |     {
 314 |      "data": {
 315 |       "text/plain": [
 316 |        "True"
 317 |       ]
 318 |      },
 319 |      "execution_count": 8,
 320 |      "metadata": {},
 321 |      "output_type": "execute_result"
 322 |     }
 323 |    ],
 324 |    "source": [
 325 |     "index.is_trained\n"
 326 |    ]
 327 |   },
 328 |   {
 329 |    "cell_type": "markdown",
 330 |    "id": "a547440c-fd96-4ce5-9996-0210f00617a7",
 331 |    "metadata": {},
 332 |    "source": [
 333 |     "add data to the index. This is a CPU based index."
 334 |    ]
 335 |   },
 336 |   {
 337 |    "cell_type": "code",
 338 |    "execution_count": 9,
 339 |    "id": "7aa739ee-42e1-4ac5-b1d8-9876ec777129",
 340 |    "metadata": {
 341 |     "execution": {
 342 |      "iopub.execute_input": "2023-04-26T14:52:44.672070Z",
 343 |      "iopub.status.busy": "2023-04-26T14:52:44.671820Z",
 344 |      "iopub.status.idle": "2023-04-26T14:52:44.747616Z",
 345 |      "shell.execute_reply": "2023-04-26T14:52:44.746751Z",
 346 |      "shell.execute_reply.started": "2023-04-26T14:52:44.672047Z"
 347 |     },
 348 |     "tags": []
 349 |    },
 350 |    "outputs": [],
 351 |    "source": [
 352 |     "index.add(x_corpus)               "
 353 |    ]
 354 |   },
 355 |   {
 356 |    "cell_type": "code",
 357 |    "execution_count": 10,
 358 |    "id": "ab9e1b45-1cf2-4e88-960c-6999ad312e22",
 359 |    "metadata": {
 360 |     "execution": {
 361 |      "iopub.execute_input": "2023-04-26T14:52:44.749140Z",
 362 |      "iopub.status.busy": "2023-04-26T14:52:44.748763Z",
 363 |      "iopub.status.idle": "2023-04-26T14:52:44.754419Z",
 364 |      "shell.execute_reply": "2023-04-26T14:52:44.753707Z",
 365 |      "shell.execute_reply.started": "2023-04-26T14:52:44.749112Z"
 366 |     }
 367 |    },
 368 |    "outputs": [
 369 |     {
 370 |      "data": {
 371 |       "text/plain": [
 372 |        "24954"
 373 |       ]
 374 |      },
 375 |      "execution_count": 10,
 376 |      "metadata": {},
 377 |      "output_type": "execute_result"
 378 |     }
 379 |    ],
 380 |    "source": [
 381 |     "len(x_corpus)"
 382 |    ]
 383 |   },
 384 |   {
 385 |    "cell_type": "code",
 386 |    "execution_count": null,
 387 |    "id": "bfabea5d-26bd-45d8-8b7a-97f179bc4013",
 388 |    "metadata": {},
 389 |    "outputs": [],
 390 |    "source": []
 391 |   },
 392 |   {
 393 |    "cell_type": "markdown",
 394 |    "id": "348f697d-db43-4093-aa4e-4858d4c058f0",
 395 |    "metadata": {},
 396 |    "source": [
 397 |     "number of vectors / results to retrieve"
 398 |    ]
 399 |   },
 400 |   {
 401 |    "cell_type": "code",
 402 |    "execution_count": 11,
 403 |    "id": "6200be29-b926-4847-9e42-daf90d99319d",
 404 |    "metadata": {
 405 |     "execution": {
 406 |      "iopub.execute_input": "2023-04-26T14:52:44.755735Z",
 407 |      "iopub.status.busy": "2023-04-26T14:52:44.755402Z",
 408 |      "iopub.status.idle": "2023-04-26T14:52:44.764114Z",
 409 |      "shell.execute_reply": "2023-04-26T14:52:44.763389Z",
 410 |      "shell.execute_reply.started": "2023-04-26T14:52:44.755710Z"
 411 |     }
 412 |    },
 413 |    "outputs": [],
 414 |    "source": [
 415 |     "k =1"
 416 |    ]
 417 |   },
 418 |   {
 419 |    "cell_type": "markdown",
 420 |    "id": "d57e8258-b7fa-49cb-a9f7-52a62a0dda17",
 421 |    "metadata": {},
 422 |    "source": [
 423 |     "#### Index Search\n",
 424 |     "The search method returns, for each query vector, the indices (I) of the nearest corpus vectors and their squared Euclidean (L2) distances (D) from the query vector."
 425 |    ]
 426 |   },
 427 |   {
 428 |    "cell_type": "markdown",
 429 |    "id": "87ea66c3-ebda-4600-b5d8-34ad796cc7c2",
 430 |    "metadata": {},
 431 |    "source": [
 432 |     "search for single vector and get top 1 result"
 433 |    ]
 434 |   },
 435 |   {
 436 |    "cell_type": "code",
 437 |    "execution_count": 12,
 438 |    "id": "306e3a89-331c-4253-b7cf-ce7a0d951b42",
 439 |    "metadata": {
 440 |     "execution": {
 441 |      "iopub.execute_input": "2023-04-26T14:52:44.765288Z",
 442 |      "iopub.status.busy": "2023-04-26T14:52:44.765050Z",
 443 |      "iopub.status.idle": "2023-04-26T14:52:48.341486Z",
 444 |      "shell.execute_reply": "2023-04-26T14:52:48.340493Z",
 445 |      "shell.execute_reply.started": "2023-04-26T14:52:44.765265Z"
 446 |     }
 447 |    },
 448 |    "outputs": [
 449 |     {
 450 |      "name": "stdout",
 451 |      "output_type": "stream",
 452 |      "text": [
 453 |       "4.38 ms ± 39.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
 454 |      ]
 455 |     }
 456 |    ],
 457 |    "source": [
 458 |     "%%timeit\n",
 459 |     "D, I = index.search(x_corpus[:1], k=1)   "
 460 |    ]
 461 |   },
 462 |   {
 463 |    "cell_type": "markdown",
 464 |    "id": "a2382cf6-2718-4240-b805-c96195af51f0",
 465 |    "metadata": {},
 466 |    "source": [
 467 |     "search for all vectors in corpus and get top 1 result"
 468 |    ]
 469 |   },
 470 |   {
 471 |    "cell_type": "code",
 472 |    "execution_count": 13,
 473 |    "id": "badcb20e-f872-4763-baf5-1876d5dd617d",
 474 |    "metadata": {
 475 |     "execution": {
 476 |      "iopub.execute_input": "2023-04-26T14:52:48.342963Z",
 477 |      "iopub.status.busy": "2023-04-26T14:52:48.342590Z",
 478 |      "iopub.status.idle": "2023-04-26T14:52:58.653601Z",
 479 |      "shell.execute_reply": "2023-04-26T14:52:58.652643Z",
 480 |      "shell.execute_reply.started": "2023-04-26T14:52:48.342935Z"
 481 |     },
 482 |     "tags": []
 483 |    },
 484 |    "outputs": [
 485 |     {
 486 |      "name": "stdout",
 487 |      "output_type": "stream",
 488 |      "text": [
 489 |       "CPU times: user 30.3 s, sys: 8.92 ms, total: 30.3 s\n",
 490 |       "Wall time: 10.3 s\n"
 491 |      ]
 492 |     }
 493 |    ],
 494 |    "source": [
 495 |     "%%time\n",
 496 |     "D, I = index.search(x_corpus, k=1)     "
 497 |    ]
 498 |   },
 499 |   {
 500 |    "cell_type": "code",
 501 |    "execution_count": null,
 502 |    "id": "acf3d26f-119c-4f5c-8c24-fd28e3fccdfc",
 503 |    "metadata": {},
 504 |    "outputs": [],
 505 |    "source": []
 506 |   },
 507 |   {
 508 |    "cell_type": "markdown",
 509 |    "id": "1eac1d91-007d-4143-b0a3-ae19c763cc6c",
 510 |    "metadata": {},
 511 |    "source": [
 512 |     "distance of vector in corpus to query vector"
 513 |    ]
 514 |   },
 515 |   {
 516 |    "cell_type": "code",
 517 |    "execution_count": 14,
 518 |    "id": "0b555673-cb87-4b11-aa81-ed2ff69d513e",
 519 |    "metadata": {
 520 |     "execution": {
 521 |      "iopub.execute_input": "2023-04-26T14:52:58.660044Z",
 522 |      "iopub.status.busy": "2023-04-26T14:52:58.657749Z",
 523 |      "iopub.status.idle": "2023-04-26T14:52:58.667064Z",
 524 |      "shell.execute_reply": "2023-04-26T14:52:58.666198Z",
 525 |      "shell.execute_reply.started": "2023-04-26T14:52:58.660006Z"
 526 |     }
 527 |    },
 528 |    "outputs": [
 529 |     {
 530 |      "data": {
 531 |       "text/plain": [
 532 |        "array([[0.0000000e+00],\n",
 533 |        "       [0.0000000e+00],\n",
 534 |        "       [3.5762787e-07],\n",
 535 |        "       ...,\n",
 536 |        "       [0.0000000e+00],\n",
 537 |        "       [1.3113022e-06],\n",
 538 |        "       [7.1525574e-07]], dtype=float32)"
 539 |       ]
 540 |      },
 541 |      "execution_count": 14,
 542 |      "metadata": {},
 543 |      "output_type": "execute_result"
 544 |     }
 545 |    ],
 546 |    "source": [
 547 |     "D"
 548 |    ]
 549 |   },
 550 |   {
 551 |    "cell_type": "markdown",
 552 |    "id": "c4d271c3-850a-4375-87a2-100ffab7a416",
 553 |    "metadata": {},
 554 |    "source": [
 555 |     "id of the top (nearest) vector for each query\n",
 556 |     "\n"
 557 |    ]
 558 |   },
 559 |   {
 560 |    "cell_type": "code",
 561 |    "execution_count": 15,
 562 |    "id": "65310b96-aab0-4a34-a9eb-48d0dbefc0ec",
 563 |    "metadata": {
 564 |     "execution": {
 565 |      "iopub.execute_input": "2023-04-26T14:52:58.668403Z",
 566 |      "iopub.status.busy": "2023-04-26T14:52:58.668046Z",
 567 |      "iopub.status.idle": "2023-04-26T14:52:58.690783Z",
 568 |      "shell.execute_reply": "2023-04-26T14:52:58.689783Z",
 569 |      "shell.execute_reply.started": "2023-04-26T14:52:58.668375Z"
 570 |     }
 571 |    },
 572 |    "outputs": [
 573 |     {
 574 |      "data": {
 575 |       "text/plain": [
 576 |        "array([[    0],\n",
 577 |        "       [    1],\n",
 578 |        "       [    2],\n",
 579 |        "       ...,\n",
 580 |        "       [24951],\n",
 581 |        "       [24952],\n",
 582 |        "       [24953]])"
 583 |       ]
 584 |      },
 585 |      "execution_count": 15,
 586 |      "metadata": {},
 587 |      "output_type": "execute_result"
 588 |     }
 589 |    ],
 590 |    "source": [
 591 |     "I"
 592 |    ]
 593 |   },
 594 |   {
 595 |    "cell_type": "markdown",
 596 |    "id": "90283075-8783-4607-bf28-e5ce9f55c08c",
 597 |    "metadata": {},
 598 |    "source": [
 599 |     "because we query with the entire corpus and the ids are sequential, the ideal top-1 result for each query is its own id, so the returned ids should be sequential too"
 600 |    ]
 601 |   },
 602 |   {
 603 |    "cell_type": "code",
 604 |    "execution_count": 16,
 605 |    "id": "b3fb40a2-dd43-4676-a766-3d198943f957",
 606 |    "metadata": {
 607 |     "execution": {
 608 |      "iopub.execute_input": "2023-04-26T14:52:58.692358Z",
 609 |      "iopub.status.busy": "2023-04-26T14:52:58.691962Z",
 610 |      "iopub.status.idle": "2023-04-26T14:52:58.703731Z",
 611 |      "shell.execute_reply": "2023-04-26T14:52:58.702726Z",
 612 |      "shell.execute_reply.started": "2023-04-26T14:52:58.692330Z"
 613 |     }
 614 |    },
 615 |    "outputs": [
 616 |     {
 617 |      "data": {
 618 |       "text/plain": [
 619 |        "array([ True,  True,  True, ...,  True,  True,  True])"
 620 |       ]
 621 |      },
 622 |      "execution_count": 16,
 623 |      "metadata": {},
 624 |      "output_type": "execute_result"
 625 |     }
 626 |    ],
 627 |    "source": [
 628 |     "res = I[:,0] == np.array( list(range(len(x_corpus))))\n",
 629 |     "res"
 630 |    ]
 631 |   },
 632 |   {
 633 |    "cell_type": "code",
 634 |    "execution_count": 17,
 635 |    "id": "27641884-4563-4c1a-9d37-1b8e61ee5322",
 636 |    "metadata": {
 637 |     "execution": {
 638 |      "iopub.execute_input": "2023-04-26T14:52:58.705103Z",
 639 |      "iopub.status.busy": "2023-04-26T14:52:58.704836Z",
 640 |      "iopub.status.idle": "2023-04-26T14:52:58.713367Z",
 641 |      "shell.execute_reply": "2023-04-26T14:52:58.712403Z",
 642 |      "shell.execute_reply.started": "2023-04-26T14:52:58.705078Z"
 643 |     }
 644 |    },
 645 |    "outputs": [
 646 |     {
 647 |      "data": {
 648 |       "text/plain": [
 649 |        "(array([], dtype=int64),)"
 650 |       ]
 651 |      },
 652 |      "execution_count": 17,
 653 |      "metadata": {},
 654 |      "output_type": "execute_result"
 655 |     }
 656 |    ],
 657 |    "source": [
 658 |     "np.where(res == False)"
 659 |    ]
 660 |   },
 661 |   {
 662 |    "cell_type": "code",
 663 |    "execution_count": 18,
 664 |    "id": "ec2181ae-7a70-4a3a-bc34-bd9899168fe5",
 665 |    "metadata": {
 666 |     "execution": {
 667 |      "iopub.execute_input": "2023-04-26T14:52:58.714883Z",
 668 |      "iopub.status.busy": "2023-04-26T14:52:58.714532Z",
 669 |      "iopub.status.idle": "2023-04-26T14:52:58.726900Z",
 670 |      "shell.execute_reply": "2023-04-26T14:52:58.725900Z",
 671 |      "shell.execute_reply.started": "2023-04-26T14:52:58.714856Z"
 672 |     }
 673 |    },
 674 |    "outputs": [
 675 |     {
 676 |      "data": {
 677 |       "text/plain": [
 678 |        "{'recall@1': 24954, 'num_vectors': 24954, 'mismatch': 0}"
 679 |       ]
 680 |      },
 681 |      "execution_count": 18,
 682 |      "metadata": {},
 683 |      "output_type": "execute_result"
 684 |     }
 685 |    ],
 686 |    "source": [
 687 |     "{\n",
 688 |     " \"recall@1\":  res.sum()\n",
 689 |     " , \"num_vectors\":  len(res)\n",
 690 |     " , \"mismatch\":    len(res) - res.sum()\n",
 691 |     "}\n"
 692 |    ]
 693 |   },
 694 |   {
 695 |    "cell_type": "markdown",
 696 |    "id": "f52a636e-041b-43e6-9b94-5c31447f31cc",
 697 |    "metadata": {
 698 |     "execution": {
 699 |      "iopub.execute_input": "2023-04-26T00:02:55.003678Z",
 700 |      "iopub.status.busy": "2023-04-26T00:02:55.002818Z",
 701 |      "iopub.status.idle": "2023-04-26T00:02:55.010824Z",
 702 |      "shell.execute_reply": "2023-04-26T00:02:55.010099Z",
 703 |      "shell.execute_reply.started": "2023-04-26T00:02:55.003640Z"
 704 |     }
 705 |    },
 706 |    "source": [
 707 |     "For this corpus, we are able to find every query vector at position 1 (i.e. as the top result)"
 708 |    ]
 709 |   },
 710 |   {
 711 |    "cell_type": "code",
 712 |    "execution_count": null,
 713 |    "id": "e88e34b6-f9e0-4835-90cc-ba0e6b2c0414",
 714 |    "metadata": {},
 715 |    "outputs": [],
 716 |    "source": []
 717 |   },
 718 |   {
 719 |    "cell_type": "code",
 720 |    "execution_count": null,
 721 |    "id": "02bac14d-9696-4a49-be12-9541beeb45a2",
 722 |    "metadata": {},
 723 |    "outputs": [],
 724 |    "source": []
 725 |   },
 726 |   {
 727 |    "cell_type": "markdown",
 728 |    "id": "7e79d4fc-8191-4151-b330-01b41a2b05d4",
 729 |    "metadata": {},
 730 |    "source": [
 731 |     "# FAISS IVF"
 732 |    ]
 733 |   },
 734 |   {
 735 |    "cell_type": "markdown",
 736 |    "id": "2c6f3e4c-2cad-461c-a8b7-5ffba0a5b354",
 737 |    "metadata": {},
 738 |    "source": [
 739 |     "<img src=\"https://d33wubrfki0l68.cloudfront.net/44acb1425f25e30ca058daec92bdb209c6c47ad2/e92fc/images/faiss5.png\" width=\"500\"/>\n",
 740 |     "\n",
 741 |     "<p> Image from Pinecone Faiss Tutorial </p>\n",
 742 |     "https://www.pinecone.io/learn/faiss-tutorial/\n",
 743 |     "\n",
 744 |     "\n",
 745 |     "**Parameters**:\n",
 746 |     "- nlist : number of clusters\n",
 747 |     "- nprobe: number of clusters to search"
 748 |    ]
 749 |   },
 750 |   {
 751 |    "cell_type": "code",
 752 |    "execution_count": 19,
 753 |    "id": "342966dc-d361-4fec-8ebe-c3c67864736e",
 754 |    "metadata": {
 755 |     "execution": {
 756 |      "iopub.execute_input": "2023-04-26T14:52:58.728467Z",
 757 |      "iopub.status.busy": "2023-04-26T14:52:58.728210Z",
 758 |      "iopub.status.idle": "2023-04-26T14:52:58.785614Z",
 759 |      "shell.execute_reply": "2023-04-26T14:52:58.784521Z",
 760 |      "shell.execute_reply.started": "2023-04-26T14:52:58.728443Z"
 761 |     },
 762 |     "tags": []
 763 |    },
 764 |    "outputs": [],
 765 |    "source": [
 766 |     "nlist = 20 # number of clusters\n",
 767 |     "quantizer = faiss.IndexFlatL2(dimension)  # the other index\n",
 768 |     "index = faiss.IndexIVFFlat(quantizer, dimension, nlist, faiss.METRIC_L2)"
 769 |    ]
 770 |   },
 771 |   {
 772 |    "cell_type": "code",
 773 |    "execution_count": 20,
 774 |    "id": "fec38a91-8c3c-40fe-935d-a160474a7e4f",
 775 |    "metadata": {
 776 |     "execution": {
 777 |      "iopub.execute_input": "2023-04-26T14:52:58.787144Z",
 778 |      "iopub.status.busy": "2023-04-26T14:52:58.786846Z",
 779 |      "iopub.status.idle": "2023-04-26T14:52:58.883952Z",
 780 |      "shell.execute_reply": "2023-04-26T14:52:58.882788Z",
 781 |      "shell.execute_reply.started": "2023-04-26T14:52:58.787117Z"
 782 |     },
 783 |     "tags": []
 784 |    },
 785 |    "outputs": [],
 786 |    "source": [
 787 |     "assert not index.is_trained\n",
 788 |     "index.train(x_corpus)\n",
 789 |     "assert index.is_trained"
 790 |    ]
 791 |   },
 792 |   {
 793 |    "cell_type": "code",
 794 |    "execution_count": 21,
 795 |    "id": "6b577199-2e7b-439f-93a4-2653c7545eef",
 796 |    "metadata": {
 797 |     "execution": {
 798 |      "iopub.execute_input": "2023-04-26T14:52:58.885895Z",
 799 |      "iopub.status.busy": "2023-04-26T14:52:58.885561Z",
 800 |      "iopub.status.idle": "2023-04-26T14:52:58.946474Z",
 801 |      "shell.execute_reply": "2023-04-26T14:52:58.945447Z",
 802 |      "shell.execute_reply.started": "2023-04-26T14:52:58.885865Z"
 803 |     }
 804 |    },
 805 |    "outputs": [],
 806 |    "source": [
 807 |     "index.add(x_corpus)         "
 808 |    ]
 809 |   },
 810 |   {
 811 |    "cell_type": "markdown",
 812 |    "id": "55f7c7b3-7298-4e53-b1e0-2c3a0f568579",
 813 |    "metadata": {},
 814 |    "source": [
 815 |     "we need to train the index first with a sample of vectors before indexing"
 816 |    ]
 817 |   },
 818 |   {
 819 |    "cell_type": "code",
 820 |    "execution_count": null,
 821 |    "id": "7a50b8c1-ed8b-45ac-9f19-4325f42b265a",
 822 |    "metadata": {},
 823 |    "outputs": [],
 824 |    "source": []
 825 |   },
 826 |   {
 827 |    "cell_type": "markdown",
 828 |    "id": "f8d0824c-0712-46fb-a5d4-16b32ee695c0",
 829 |    "metadata": {},
 830 |    "source": [
 831 |     "search for single vector"
 832 |    ]
 833 |   },
 834 |   {
 835 |    "cell_type": "code",
 836 |    "execution_count": 22,
 837 |    "id": "451988b2-0e03-4e12-92e2-8df408094526",
 838 |    "metadata": {
 839 |     "execution": {
 840 |      "iopub.execute_input": "2023-04-26T14:52:58.956643Z",
 841 |      "iopub.status.busy": "2023-04-26T14:52:58.953741Z",
 842 |      "iopub.status.idle": "2023-04-26T14:53:08.215103Z",
 843 |      "shell.execute_reply": "2023-04-26T14:53:08.214024Z",
 844 |      "shell.execute_reply.started": "2023-04-26T14:52:58.956606Z"
 845 |     },
 846 |     "tags": []
 847 |    },
 848 |    "outputs": [
 849 |     {
 850 |      "name": "stdout",
 851 |      "output_type": "stream",
 852 |      "text": [
 853 |       "114 µs ± 729 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
 854 |      ]
 855 |     }
 856 |    ],
 857 |    "source": [
 858 |     "%%timeit\n",
 859 |     "\n",
 860 |     "index.nprobe = 1              # default nprobe is 1\n",
 861 |     "\n",
 862 |     "D, I = index.search(x_corpus[:1], k)     # actual search"
 863 |    ]
 864 |   },
 865 |   {
 866 |    "cell_type": "markdown",
 867 |    "id": "28e0a32e-5380-48da-9022-0be90db13e75",
 868 |    "metadata": {
 869 |     "execution": {
 870 |      "iopub.execute_input": "2023-04-26T00:07:36.782792Z",
 871 |      "iopub.status.busy": "2023-04-26T00:07:36.782538Z",
 872 |      "iopub.status.idle": "2023-04-26T00:07:36.788109Z",
 873 |      "shell.execute_reply": "2023-04-26T00:07:36.786807Z",
 874 |      "shell.execute_reply.started": "2023-04-26T00:07:36.782768Z"
 875 |     }
 876 |    },
 877 |    "source": [
 878 |     "in the above, we are only searching 1 of the 20 clusters (nprobe = 1, nlist = 20), i.e. roughly 1/20 of the search space"
 879 |    ]
 880 |   },
 881 |   {
 882 |    "cell_type": "code",
 883 |    "execution_count": null,
 884 |    "id": "f7547048-000e-4f32-b1c9-fccfd83b3918",
 885 |    "metadata": {},
 886 |    "outputs": [],
 887 |    "source": []
 888 |   },
 889 |   {
 890 |    "cell_type": "code",
 891 |    "execution_count": 23,
 892 |    "id": "27ef12d6-d585-4be3-9f5f-8b35ee192c38",
 893 |    "metadata": {
 894 |     "execution": {
 895 |      "iopub.execute_input": "2023-04-26T14:53:08.216344Z",
 896 |      "iopub.status.busy": "2023-04-26T14:53:08.216081Z",
 897 |      "iopub.status.idle": "2023-04-26T14:53:20.757164Z",
 898 |      "shell.execute_reply": "2023-04-26T14:53:20.755944Z",
 899 |      "shell.execute_reply.started": "2023-04-26T14:53:08.216319Z"
 900 |     },
 901 |     "tags": []
 902 |    },
 903 |    "outputs": [
 904 |     {
 905 |      "name": "stdout",
 906 |      "output_type": "stream",
 907 |      "text": [
 908 |       "1.55 ms ± 13.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n"
 909 |      ]
 910 |     }
 911 |    ],
 912 |    "source": [
 913 |     "%%timeit\n",
 914 |     "\n",
 915 |     "\n",
 916 |     "index.nprobe = 10              # default nprobe is 1\n",
 917 |     "\n",
 918 |     "D, I = index.search(x_corpus[:1], k)     # actual search"
 919 |    ]
 920 |   },
 921 |   {
 922 |    "cell_type": "markdown",
 923 |    "id": "5e98f114-4d94-4b59-84f2-73232fc834da",
 924 |    "metadata": {
 925 |     "execution": {
 926 |      "iopub.status.busy": "2023-04-26T00:07:36.790059Z",
 927 |      "iopub.status.idle": "2023-04-26T00:07:36.790366Z",
 928 |      "shell.execute_reply": "2023-04-26T00:07:36.790209Z",
 929 |      "shell.execute_reply.started": "2023-04-26T00:07:36.790195Z"
 930 |     }
 931 |    },
 932 |    "source": [
 933 |     "in the above, we are only searching 10 of the 20 clusters (nprobe = 10, nlist = 20), i.e. roughly half of the search space"
 934 |    ]
 935 |   },
 936 |   {
 937 |    "cell_type": "code",
 938 |    "execution_count": null,
 939 |    "id": "c1045fcc-358f-4216-a00e-e0db6e1811a4",
 940 |    "metadata": {},
 941 |    "outputs": [],
 942 |    "source": []
 943 |   },
 944 |   {
 945 |    "cell_type": "code",
 946 |    "execution_count": null,
 947 |    "id": "50e83d62-b203-4048-95df-74e61fc2aa0c",
 948 |    "metadata": {},
 949 |    "outputs": [],
 950 |    "source": []
 951 |   },
 952 |   {
 953 |    "cell_type": "code",
 954 |    "execution_count": 24,
 955 |    "id": "2a140a52-213a-4da1-8077-8be794a36f30",
 956 |    "metadata": {
 957 |     "execution": {
 958 |      "iopub.execute_input": "2023-04-26T14:53:20.758823Z",
 959 |      "iopub.status.busy": "2023-04-26T14:53:20.758438Z",
 960 |      "iopub.status.idle": "2023-04-26T14:53:24.717603Z",
 961 |      "shell.execute_reply": "2023-04-26T14:53:24.716457Z",
 962 |      "shell.execute_reply.started": "2023-04-26T14:53:20.758795Z"
 963 |     }
 964 |    },
 965 |    "outputs": [
 966 |     {
 967 |      "name": "stdout",
 968 |      "output_type": "stream",
 969 |      "text": [
 970 |       "4.88 ms ± 57.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
 971 |      ]
 972 |     }
 973 |    ],
 974 |    "source": [
 975 |     "%%timeit\n",
 976 |     "\n",
 977 |     "\n",
 978 |     "index.nprobe = 20              # default nprobe is 1\n",
 979 |     "\n",
 980 |     "D, I = index.search(x_corpus[:1], k)     # actual search"
 981 |    ]
 982 |   },
 983 |   {
 984 |    "cell_type": "markdown",
 985 |    "id": "971f14de-ec0c-4ac7-b63a-135ec7444834",
 986 |    "metadata": {
 987 |     "execution": {
 988 |      "iopub.status.busy": "2023-04-26T00:07:36.792692Z",
 989 |      "iopub.status.idle": "2023-04-26T00:07:36.792990Z",
 990 |      "shell.execute_reply": "2023-04-26T00:07:36.792856Z",
 991 |      "shell.execute_reply.started": "2023-04-26T00:07:36.792842Z"
 992 |     }
 993 |    },
 994 |    "source": [
 995 |     "in the above, we are querying the entire search space. This is the same as using Brute Force."
 996 |    ]
 997 |   },
 998 |   {
 999 |    "cell_type": "code",
1000 |    "execution_count": null,
1001 |    "id": "ec6f1eb2-a9cd-47d9-86bc-5c58ed5dbb1d",
1002 |    "metadata": {},
1003 |    "outputs": [],
1004 |    "source": []
1005 |   },
1006 |   {
1007 |    "cell_type": "code",
1008 |    "execution_count": null,
1009 |    "id": "fb11b2cd-eef5-4b15-a719-e9c5cbe0fb4f",
1010 |    "metadata": {},
1011 |    "outputs": [],
1012 |    "source": []
1013 |   },
1014 |   {
1015 |    "cell_type": "markdown",
1016 |    "id": "d8524b69-4efd-4a6d-9f10-499f299ff762",
1017 |    "metadata": {},
1018 |    "source": [
1019 |     "search for entire corpus"
1020 |    ]
1021 |   },
1022 |   {
1023 |    "cell_type": "code",
1024 |    "execution_count": 25,
1025 |    "id": "60795b88-8957-4c49-a729-22f935c4dc3f",
1026 |    "metadata": {
1027 |     "execution": {
1028 |      "iopub.execute_input": "2023-04-26T14:53:24.719250Z",
1029 |      "iopub.status.busy": "2023-04-26T14:53:24.718697Z",
1030 |      "iopub.status.idle": "2023-04-26T14:53:25.774560Z",
1031 |      "shell.execute_reply": "2023-04-26T14:53:25.773678Z",
1032 |      "shell.execute_reply.started": "2023-04-26T14:53:24.719219Z"
1033 |     }
1034 |    },
1035 |    "outputs": [
1036 |     {
1037 |      "name": "stdout",
1038 |      "output_type": "stream",
1039 |      "text": [
1040 |       "CPU times: user 8.18 s, sys: 23.5 ms, total: 8.21 s\n",
1041 |       "Wall time: 1.05 s\n"
1042 |      ]
1043 |     }
1044 |    ],
1045 |    "source": [
1046 |     "%%time\n",
1047 |     "\n",
1048 |     "\n",
1049 |     "index.nprobe = 1              \n",
1050 |     "\n",
1051 |     "D, I = index.search(x_corpus, k)     # actual search"
1052 |    ]
1053 |   },
1054 |   {
1055 |    "cell_type": "code",
1056 |    "execution_count": 26,
1057 |    "id": "ddbabfeb-2b27-412c-b77b-1d795b00aa29",
1058 |    "metadata": {
1059 |     "execution": {
1060 |      "iopub.execute_input": "2023-04-26T14:53:25.781104Z",
1061 |      "iopub.status.busy": "2023-04-26T14:53:25.778777Z",
1062 |      "iopub.status.idle": "2023-04-26T14:53:25.791108Z",
1063 |      "shell.execute_reply": "2023-04-26T14:53:25.790296Z",
1064 |      "shell.execute_reply.started": "2023-04-26T14:53:25.781066Z"
1065 |     }
1066 |    },
1067 |    "outputs": [
1068 |     {
1069 |      "data": {
1070 |       "text/plain": [
1071 |        "{'recall@1': 24954, 'num_vectors': 24954, 'mismatch': 0}"
1072 |       ]
1073 |      },
1074 |      "execution_count": 26,
1075 |      "metadata": {},
1076 |      "output_type": "execute_result"
1077 |     }
1078 |    ],
1079 |    "source": [
1080 |     "z = I[:,0] == np.array( list(range(len(x_corpus))))\n",
1081 |     "{\n",
1082 |     " \"recall@1\":  z.sum()\n",
1083 |     " , \"num_vectors\":  len(z)\n",
1084 |     " , \"mismatch\":    len(z) - z.sum()\n",
1085 |     "}\n"
1086 |    ]
1087 |   },
1088 |   {
1089 |    "cell_type": "code",
1090 |    "execution_count": null,
1091 |    "id": "6c9da4ac-9d1f-480a-ac9f-ea97e1d16f9d",
1092 |    "metadata": {},
1093 |    "outputs": [],
1094 |    "source": []
1095 |   },
1096 |   {
1097 |    "cell_type": "markdown",
1098 |    "id": "2b5515dc-09b7-459c-b93c-80424faac839",
1099 |    "metadata": {},
1100 |    "source": [
1101 |     "increase the number of cells that are probed"
1102 |    ]
1103 |   },
1104 |   {
1105 |    "cell_type": "code",
1106 |    "execution_count": null,
1107 |    "id": "687d36fe-de5d-4ee4-9941-7ca604b446fb",
1108 |    "metadata": {},
1109 |    "outputs": [],
1110 |    "source": []
1111 |   },
1112 |   {
1113 |    "cell_type": "code",
1114 |    "execution_count": 27,
1115 |    "id": "31c0b86b-2d5e-480d-be94-c136e7fe07d1",
1116 |    "metadata": {
1117 |     "execution": {
1118 |      "iopub.execute_input": "2023-04-26T14:53:25.792414Z",
1119 |      "iopub.status.busy": "2023-04-26T14:53:25.792073Z",
1120 |      "iopub.status.idle": "2023-04-26T14:54:03.768762Z",
1121 |      "shell.execute_reply": "2023-04-26T14:54:03.767827Z",
1122 |      "shell.execute_reply.started": "2023-04-26T14:53:25.792388Z"
1123 |     }
1124 |    },
1125 |    "outputs": [
1126 |     {
1127 |      "name": "stdout",
1128 |      "output_type": "stream",
1129 |      "text": [
1130 |       "4.73 s ± 230 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
1131 |      ]
1132 |     }
1133 |    ],
1134 |    "source": [
1135 |     "%%timeit\n",
1136 |     "\n",
1137 |     "index.nprobe = 5              # default nprobe is 1\n",
1138 |     "\n",
1139 |     "D, I = index.search(x_corpus, k)    "
1140 |    ]
1141 |   },
1142 |   {
1143 |    "cell_type": "code",
1144 |    "execution_count": 28,
1145 |    "id": "fbffb654-a3a6-4d42-956f-602dab97124e",
1146 |    "metadata": {
1147 |     "execution": {
1148 |      "iopub.execute_input": "2023-04-26T14:54:03.777354Z",
1149 |      "iopub.status.busy": "2023-04-26T14:54:03.769965Z",
1150 |      "iopub.status.idle": "2023-04-26T14:54:03.787506Z",
1151 |      "shell.execute_reply": "2023-04-26T14:54:03.786386Z",
1152 |      "shell.execute_reply.started": "2023-04-26T14:54:03.777305Z"
1153 |     }
1154 |    },
1155 |    "outputs": [
1156 |     {
1157 |      "data": {
1158 |       "text/plain": [
1159 |        "{'recall@1': 24954, 'num_vectors': 24954, 'mismatch': 0}"
1160 |       ]
1161 |      },
1162 |      "execution_count": 28,
1163 |      "metadata": {},
1164 |      "output_type": "execute_result"
1165 |     }
1166 |    ],
1167 |    "source": [
1168 |     "z = I[:,0] == np.array( list(range(len(x_corpus))))\n",
1169 |     "{\n",
1170 |     " \"recall@1\":  z.sum()\n",
1171 |     " , \"num_vectors\":  len(z)\n",
1172 |     " , \"mismatch\":    len(z) - z.sum()\n",
1173 |     "}\n"
1174 |    ]
1175 |   },
1176 |   {
1177 |    "cell_type": "code",
1178 |    "execution_count": null,
1179 |    "id": "fd7e1354-2002-45dc-8732-9506ef6200cf",
1180 |    "metadata": {},
1181 |    "outputs": [],
1182 |    "source": []
1183 |   }
1184 |  ],
1185 |  "metadata": {
1186 |   "environment": {
1187 |    "kernel": "python3",
1188 |    "name": "pytorch-gpu.1-13.m107",
1189 |    "type": "gcloud",
1190 |    "uri": "gcr.io/deeplearning-platform-release/pytorch-gpu.1-13:m107"
1191 |   },
1192 |   "kernelspec": {
1193 |    "display_name": "Python 3",
1194 |    "language": "python",
1195 |    "name": "python3"
1196 |   },
1197 |   "language_info": {
1198 |    "codemirror_mode": {
1199 |     "name": "ipython",
1200 |     "version": 3
1201 |    },
1202 |    "file_extension": ".py",
1203 |    "mimetype": "text/x-python",
1204 |    "name": "python",
1205 |    "nbconvert_exporter": "python",
1206 |    "pygments_lexer": "ipython3",
1207 |    "version": "3.7.12"
1208 |   }
1209 |  },
1210 |  "nbformat": 4,
1211 |  "nbformat_minor": 5
1212 | }
1213 | 


--------------------------------------------------------------------------------
/notebooks/workshop_setup.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 12,
  6 |    "id": "e9b14691-3881-4882-bae0-c46b23401f11",
  7 |    "metadata": {
  8 |     "execution": {
  9 |      "iopub.execute_input": "2023-04-26T14:49:50.397320Z",
 10 |      "iopub.status.busy": "2023-04-26T14:49:50.396553Z",
 11 |      "iopub.status.idle": "2023-04-26T14:49:50.401265Z",
 12 |      "shell.execute_reply": "2023-04-26T14:49:50.400495Z",
 13 |      "shell.execute_reply.started": "2023-04-26T14:49:50.397287Z"
 14 |     },
 15 |     "tags": []
 16 |    },
 17 |    "outputs": [],
 18 |    "source": [
 19 |     "import nltk\n",
 20 |     "from sentence_transformers import SentenceTransformer\n",
 21 |     "from transformers import AutoTokenizer\n"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": null,
 27 |    "id": "68926c5f-4643-4470-9e76-e1284acee82a",
 28 |    "metadata": {},
 29 |    "outputs": [],
 30 |    "source": []
 31 |   },
 32 |   {
 33 |    "cell_type": "code",
 34 |    "execution_count": null,
 35 |    "id": "0d2cc671-31cd-4b00-8010-01932aa66d88",
 36 |    "metadata": {},
 37 |    "outputs": [],
 38 |    "source": []
 39 |   },
 40 |   {
 41 |    "cell_type": "code",
 42 |    "execution_count": 5,
 43 |    "id": "229a8069-c18a-4296-819e-fcebd7398fe8",
 44 |    "metadata": {
 45 |     "execution": {
 46 |      "iopub.execute_input": "2023-04-26T13:32:29.510730Z",
 47 |      "iopub.status.busy": "2023-04-26T13:32:29.509639Z",
 48 |      "iopub.status.idle": "2023-04-26T13:32:30.515597Z",
 49 |      "shell.execute_reply": "2023-04-26T13:32:30.514710Z",
 50 |      "shell.execute_reply.started": "2023-04-26T13:32:29.510700Z"
 51 |     }
 52 |    },
 53 |    "outputs": [
 54 |     {
 55 |      "name": "stderr",
 56 |      "output_type": "stream",
 57 |      "text": [
 58 |       "[nltk_data] Downloading package stopwords to\n",
 59 |       "[nltk_data]     /home/jupyter/nltk_data...\n",
 60 |       "[nltk_data]   Unzipping corpora/stopwords.zip.\n",
 61 |       "[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...\n",
 62 |       "[nltk_data]   Unzipping tokenizers/punkt.zip.\n",
 63 |       "[nltk_data] Downloading package wordnet to /home/jupyter/nltk_data...\n",
 64 |       "[nltk_data] Downloading package omw-1.4 to /home/jupyter/nltk_data...\n"
 65 |      ]
 66 |     },
 67 |     {
 68 |      "data": {
 69 |       "text/plain": [
 70 |        "True"
 71 |       ]
 72 |      },
 73 |      "execution_count": 5,
 74 |      "metadata": {},
 75 |      "output_type": "execute_result"
 76 |     }
 77 |    ],
 78 |    "source": [
 79 |     "nltk.download('stopwords')\n",
 80 |     "nltk.download('punkt')\n",
 81 |     "nltk.download('wordnet')\n",
 82 |     "nltk.download('omw-1.4')"
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "code",
 87 |    "execution_count": null,
 88 |    "id": "da25e712-759d-4fa8-89bc-ab12e4094acf",
 89 |    "metadata": {},
 90 |    "outputs": [],
 91 |    "source": []
 92 |   },
 93 |   {
 94 |    "cell_type": "code",
 95 |    "execution_count": null,
 96 |    "id": "32a7451b-7fe5-48f2-af6e-1a9d90b63c14",
 97 |    "metadata": {},
 98 |    "outputs": [],
 99 |    "source": []
100 |   },
101 |   {
102 |    "cell_type": "code",
103 |    "execution_count": 14,
104 |    "id": "6dd5ed16-c707-4ad5-97ee-6748e74af1a9",
105 |    "metadata": {
106 |     "execution": {
107 |      "iopub.execute_input": "2023-04-26T14:50:36.001708Z",
108 |      "iopub.status.busy": "2023-04-26T14:50:36.000639Z",
109 |      "iopub.status.idle": "2023-04-26T14:50:36.005695Z",
110 |      "shell.execute_reply": "2023-04-26T14:50:36.004847Z",
111 |      "shell.execute_reply.started": "2023-04-26T14:50:36.001654Z"
112 |     },
113 |     "tags": []
114 |    },
115 |    "outputs": [],
116 |    "source": [
117 |     "models = ['sentence-transformers/all-MiniLM-L6-v2','sentence-transformers/clip-ViT-B-32' , 'sentence-transformers/clip-ViT-B-32-multilingual-v1']\n",
118 |     "\n",
119 |     "\n"
120 |    ]
121 |   },
122 |   {
123 |    "cell_type": "code",
124 |    "execution_count": 15,
125 |    "id": "096eca1c-1ea2-4fa8-bc26-7c8ce41f46f1",
126 |    "metadata": {
127 |     "execution": {
128 |      "iopub.execute_input": "2023-04-26T14:50:36.285375Z",
129 |      "iopub.status.busy": "2023-04-26T14:50:36.284663Z",
130 |      "iopub.status.idle": "2023-04-26T14:50:36.289042Z",
131 |      "shell.execute_reply": "2023-04-26T14:50:36.288270Z",
132 |      "shell.execute_reply.started": "2023-04-26T14:50:36.285345Z"
133 |     },
134 |     "tags": []
135 |    },
136 |    "outputs": [],
137 |    "source": [
138 |     "text = \"men shoes\""
139 |    ]
140 |   },
141 |   {
142 |    "cell_type": "code",
143 |    "execution_count": 18,
144 |    "id": "6958324b-63f0-4320-9880-83d7d920e55c",
145 |    "metadata": {
146 |     "execution": {
147 |      "iopub.execute_input": "2023-04-26T14:51:44.205272Z",
148 |      "iopub.status.busy": "2023-04-26T14:51:44.204519Z",
149 |      "iopub.status.idle": "2023-04-26T14:51:49.430374Z",
150 |      "shell.execute_reply": "2023-04-26T14:51:49.429497Z",
151 |      "shell.execute_reply.started": "2023-04-26T14:51:44.205234Z"
152 |     },
153 |     "tags": []
154 |    },
155 |    "outputs": [
156 |     {
157 |      "name": "stdout",
158 |      "output_type": "stream",
159 |      "text": [
160 |       "sentence-transformers/all-MiniLM-L6-v2 {'input_ids': tensor([ 101, 2273, 6007,  102], device='cuda:0'), 'token_type_ids': tensor([0, 0, 0, 0], device='cuda:0'), 'attention_mask': tensor([1, 1, 1, 1], device='cuda:0'), 'token_embeddings': tensor([[-0.2272,  0.0027,  0.1586,  ..., -0.3998, -0.4343, -0.0824],\n",
161 |       "        [-0.0189,  0.0759, -0.6014,  ..., -0.5021,  0.3637,  0.0202],\n",
162 |       "        [-1.3425,  0.1124,  0.1479,  ..., -1.1672, -0.9619, -0.5084],\n",
163 |       "        [-0.3310,  0.2602, -0.1229,  ..., -0.3570, -0.1665,  0.4503]],\n",
164 |       "       device='cuda:0'), 'sentence_embedding': tensor([-5.8684e-02,  1.3790e-02, -1.2774e-02,  1.4896e-02,  1.0860e-02,\n",
165 |       "        -6.4392e-02,  6.0345e-02, -6.9710e-02, -3.2946e-02,  4.2863e-03,\n",
166 |       "        -5.0637e-03,  9.6656e-02, -4.2155e-02, -2.6262e-03, -3.4538e-02,\n",
167 |       "        -1.3516e-02, -7.5456e-02,  2.6242e-02, -2.1382e-02,  2.3227e-02,\n",
168 |       "         3.1208e-02, -3.7767e-02, -4.7549e-02,  4.5558e-03, -9.1842e-02,\n",
169 |       "        -8.6363e-03,  1.8699e-02,  6.6197e-02, -1.1301e-02,  1.3095e-02,\n",
170 |       "         3.8383e-02,  9.8821e-03,  6.1136e-02,  2.6724e-02, -4.4304e-02,\n",
171 |       "        -8.6510e-02,  1.6309e-02, -3.2358e-02,  3.3459e-04,  9.5446e-02,\n",
172 |       "        -4.2075e-02, -1.2752e-01, -4.7495e-03,  5.5425e-02,  5.1755e-02,\n",
173 |       "         3.0031e-02,  2.5861e-02,  1.5485e-02, -4.3001e-02,  1.2418e-01,\n",
174 |       "         9.2281e-04,  2.5372e-02, -2.9569e-02,  2.1765e-02,  5.3302e-02,\n",
175 |       "        -1.9069e-02, -7.0468e-03, -2.1080e-02, -1.4390e-02, -6.2965e-02,\n",
176 |       "         1.2275e-01,  7.1885e-04, -6.6201e-02,  2.3436e-02,  3.0998e-02,\n",
177 |       "         1.5899e-02, -4.3535e-02,  4.4102e-03, -1.0930e-02,  4.9731e-02,\n",
178 |       "         2.4521e-02, -1.8387e-02, -1.5710e-02,  2.5058e-02, -2.1643e-02,\n",
179 |       "         2.9889e-02, -7.1892e-02, -7.9145e-02, -6.6582e-02, -2.8073e-02,\n",
180 |       "        -4.3930e-02, -3.0970e-02, -9.7531e-03, -1.1777e-02,  1.0500e-02,\n",
181 |       "        -2.3791e-02,  6.4381e-03, -1.2262e-04, -2.8900e-02,  4.9136e-02,\n",
182 |       "        -1.3542e-01, -7.4252e-02,  1.3062e-02, -1.8075e-03, -6.6097e-02,\n",
183 |       "        -7.3589e-03,  1.3785e-02,  3.7384e-02, -8.5169e-02,  1.1559e-01,\n",
184 |       "         2.1454e-02, -1.0230e-02, -3.5801e-02, -1.7123e-02,  1.9046e-02,\n",
185 |       "         2.3251e-02, -1.7273e-02,  8.2963e-02,  1.8455e-02,  8.6537e-02,\n",
186 |       "        -2.5882e-02,  2.0791e-02, -7.3512e-02,  1.8831e-02, -5.0279e-02,\n",
187 |       "        -8.1372e-02, -7.6812e-03,  7.2167e-02,  7.7383e-02,  7.4222e-02,\n",
188 |       "         9.1467e-03,  2.2712e-02,  3.6326e-02, -2.0288e-02, -3.1550e-02,\n",
189 |       "         7.1721e-03, -3.4719e-02, -4.2586e-33,  5.7250e-03,  1.9000e-02,\n",
190 |       "         1.1605e-02, -2.7531e-02,  3.0725e-02,  1.2585e-02,  2.1076e-02,\n",
191 |       "        -5.6845e-02,  1.6838e-03, -4.8026e-04, -2.1989e-02,  9.2065e-02,\n",
192 |       "         7.0795e-03, -4.6472e-02,  5.3270e-02, -5.0890e-02,  7.3680e-02,\n",
193 |       "         5.6044e-03, -7.5194e-02, -8.5179e-02, -7.4869e-03,  6.3311e-02,\n",
194 |       "        -1.0171e-02,  2.0384e-02,  5.0803e-02, -6.6822e-03, -1.5785e-02,\n",
195 |       "        -3.5164e-02,  4.9931e-02,  8.2824e-03,  9.2603e-02, -2.0116e-02,\n",
196 |       "         4.5553e-02,  1.2080e-02, -5.9395e-02,  2.1852e-02,  5.8675e-02,\n",
197 |       "         2.2210e-02,  9.2596e-03, -2.3409e-02,  4.5004e-02, -5.1507e-02,\n",
198 |       "         2.8044e-02, -6.5380e-03,  7.4831e-03,  5.8157e-02,  3.4447e-02,\n",
199 |       "         4.1475e-02, -6.1901e-02, -1.6561e-02, -4.5630e-02,  4.9367e-02,\n",
200 |       "        -1.6766e-02, -5.7993e-02,  5.2082e-02, -8.8322e-02,  2.9853e-02,\n",
201 |       "         2.4510e-02, -9.7888e-03,  4.2248e-03,  7.1062e-03,  1.1276e-01,\n",
202 |       "         7.1893e-02,  1.2149e-03, -5.7319e-02, -7.2250e-02,  5.5623e-02,\n",
203 |       "         1.4792e-02, -1.4871e-02,  6.0891e-02, -3.7015e-02, -4.5080e-02,\n",
204 |       "         6.8613e-02,  5.5002e-02, -1.1833e-02,  2.1440e-02, -2.9428e-02,\n",
205 |       "         7.0045e-02, -5.4634e-02, -6.3302e-02, -9.2292e-02, -3.5121e-02,\n",
206 |       "         2.7007e-02,  2.0125e-02, -3.6556e-02, -1.3703e-02, -6.1224e-02,\n",
207 |       "        -1.0986e-02,  1.5851e-02, -2.6837e-02, -1.1262e-01, -4.3161e-02,\n",
208 |       "        -1.4493e-02,  7.6786e-03, -7.4723e-03,  2.1595e-33,  3.5767e-02,\n",
209 |       "         1.1363e-01, -1.4693e-02,  9.2684e-02,  3.3924e-02, -1.5614e-02,\n",
210 |       "        -1.6073e-02, -6.0525e-04,  2.5468e-03,  9.1995e-04, -2.1501e-02,\n",
211 |       "         7.5731e-03,  7.2208e-02,  4.1097e-02,  2.0974e-02, -3.6957e-02,\n",
212 |       "         7.8952e-02,  4.7166e-02,  1.3827e-02,  1.5595e-02,  3.9452e-02,\n",
213 |       "        -2.2254e-02,  2.3012e-02, -3.3751e-02, -7.7026e-02,  2.4340e-02,\n",
214 |       "         1.0263e-01, -3.7329e-02, -1.6680e-01,  7.4254e-02,  2.9221e-02,\n",
215 |       "        -6.3063e-04, -5.0962e-02,  4.1396e-02, -1.9880e-02,  1.0247e-02,\n",
216 |       "        -1.7069e-01,  9.8603e-02,  5.3988e-02,  1.2776e-02,  4.2524e-02,\n",
217 |       "        -3.5128e-03,  3.9324e-02,  4.7423e-02, -2.8995e-02, -5.1358e-02,\n",
218 |       "        -9.3995e-03,  3.7709e-03, -3.8401e-02, -5.3929e-02, -1.8980e-02,\n",
219 |       "         3.6103e-02, -6.3064e-02, -4.9003e-02, -5.8757e-02, -4.0821e-04,\n",
220 |       "        -1.0862e-01,  4.6522e-02, -5.9556e-02,  7.4809e-02, -3.2031e-02,\n",
221 |       "         6.8497e-02,  2.7983e-02, -5.1754e-03, -4.0551e-02,  1.9033e-02,\n",
222 |       "        -1.1128e-02,  8.7901e-03, -3.8768e-02, -3.0256e-02,  9.6022e-02,\n",
223 |       "        -1.0334e-01,  8.3221e-02,  1.0190e-01, -2.3660e-02, -1.2010e-03,\n",
224 |       "        -2.6769e-02,  6.1521e-02,  2.9026e-02, -5.4091e-02,  3.6390e-03,\n",
225 |       "        -9.2658e-02,  7.8827e-03,  1.1917e-01, -5.4807e-02,  1.5506e-01,\n",
226 |       "         4.6332e-02,  4.9145e-02,  5.1203e-03,  4.5503e-02,  1.6287e-02,\n",
227 |       "         4.3451e-02,  4.7670e-02,  1.7630e-02, -1.9633e-02, -1.1029e-08,\n",
228 |       "         6.5470e-02,  5.2181e-02,  3.3727e-02,  3.1607e-02, -1.7458e-02,\n",
229 |       "        -1.0251e-04, -1.4603e-02,  2.6509e-02,  7.8088e-02,  1.1715e-02,\n",
230 |       "        -7.6623e-02, -1.9881e-02, -3.8228e-02,  8.3098e-02,  3.7713e-02,\n",
231 |       "         5.6128e-02, -4.2606e-02, -1.7351e-02, -4.4235e-02, -6.9647e-02,\n",
232 |       "         7.4529e-03, -5.4775e-02,  4.2709e-02,  1.1223e-01, -1.0754e-02,\n",
233 |       "         1.0371e-02,  2.4825e-02, -1.1770e-01,  1.0997e-02,  1.0135e-01,\n",
234 |       "         1.2151e-02,  2.0672e-02,  1.5000e-02, -3.0694e-02,  7.1544e-02,\n",
235 |       "         3.1737e-02, -3.7621e-03, -6.8717e-03,  5.2807e-02, -8.3122e-02,\n",
236 |       "        -9.0656e-02, -1.2168e-01,  4.1270e-02, -2.7971e-02, -4.6418e-02,\n",
237 |       "        -5.2283e-02,  3.0334e-04,  1.3859e-01, -1.0576e-01,  3.3442e-02,\n",
238 |       "         8.5742e-03, -2.2963e-02,  5.4894e-02, -1.8929e-02,  1.1873e-04,\n",
239 |       "        -7.1593e-02, -1.7575e-02,  9.0126e-02,  2.2157e-02, -4.3015e-02,\n",
240 |       "         3.9037e-02, -7.4174e-02, -3.6659e-02, -3.6811e-03], device='cuda:0')}\n",
241 |       "sentence-transformers/clip-ViT-B-32 {'input_ids': tensor([49406,  1656,  4079, 49407], device='cuda:0'), 'attention_mask': tensor([1, 1, 1, 1], device='cuda:0'), 'image_text_info': 1, 'sentence_embedding': tensor([-2.6722e-01, -2.5024e-01,  1.9831e-01, -8.4449e-02, -4.5411e-01,\n",
242 |       "         1.5269e-01,  1.9422e-01, -7.6131e-01,  1.6059e-02,  5.6570e-02,\n",
243 |       "        -1.1732e-01, -1.7434e-01, -4.8341e-02, -5.8244e-02,  1.9622e-03,\n",
244 |       "         3.3086e-02,  4.6739e-01,  5.9902e-02, -1.4106e-01, -2.5187e-01,\n",
245 |       "         1.1865e-01,  2.7707e-01, -2.4586e-01,  1.6738e-01, -6.5906e-02,\n",
246 |       "        -4.9510e-01,  7.1441e-02,  4.7636e-02, -1.1008e-01,  2.0449e-01,\n",
247 |       "         1.9936e-02, -1.8740e-01, -1.1138e-02,  1.2773e-01, -5.5551e-01,\n",
248 |       "        -4.6653e-02,  1.8026e-01, -9.6517e-02,  1.2344e-01, -1.3597e-01,\n",
249 |       "        -1.9992e-01, -1.9570e-01,  1.5068e-01, -5.0438e-01,  9.0387e-02,\n",
250 |       "         2.1159e-01,  5.4030e-03, -4.0513e-02,  2.7146e-01,  3.7458e-02,\n",
251 |       "         8.0310e-02, -2.9841e-02,  5.1565e-02, -6.1074e-01, -2.5701e-01,\n",
252 |       "         1.3316e-01, -1.1232e-01, -5.3493e-03, -6.2610e-01, -3.1894e-01,\n",
253 |       "         1.9283e-01,  2.0204e-02, -7.6878e-02, -4.0941e-01,  3.9715e-01,\n",
254 |       "         3.7647e-02,  1.9459e-01,  3.6569e-01,  3.9625e-01, -1.1323e-01,\n",
255 |       "         4.1932e-01, -2.1839e-01,  1.6538e-01,  3.2633e-01,  2.7650e-01,\n",
256 |       "         2.5525e-01, -7.0886e-02, -2.6799e-02, -2.8865e-01, -4.0908e-01,\n",
257 |       "        -2.3828e-01,  1.0888e-01,  1.9548e-02,  1.9067e-01,  1.2776e-01,\n",
258 |       "         4.7739e-01, -2.3506e-01, -1.8611e-01,  1.3253e-01, -2.1160e-01,\n",
259 |       "        -5.1571e-01, -2.1936e-01, -1.3583e+00,  6.3813e-01,  1.9401e-01,\n",
260 |       "        -1.7399e-01,  2.3773e-02,  1.7300e-01,  2.0512e-01,  3.8955e-01,\n",
261 |       "         1.2784e-02, -1.5616e-01,  2.7312e-01,  1.1615e-01,  9.6609e-02,\n",
262 |       "         2.5802e-01, -1.6590e-01, -2.5717e-01, -2.7002e-01,  2.2556e-01,\n",
263 |       "        -5.9875e-01,  4.1399e-01,  1.0067e-01,  2.3802e-01,  1.8393e-01,\n",
264 |       "        -7.7338e-02,  2.8407e-01, -1.8666e-01,  1.8673e-01, -2.7489e-02,\n",
265 |       "         6.1416e-02, -5.3712e-02, -3.1903e-01,  2.2104e-01,  4.5745e-01,\n",
266 |       "        -1.4629e-01, -2.0385e-01, -3.0613e-03,  1.0831e-01,  1.3029e-01,\n",
267 |       "         7.5172e-02,  1.7098e-01, -1.7295e-01,  6.0518e+00, -4.3587e-01,\n",
268 |       "        -3.6123e-02, -2.3084e-01, -2.7611e-01, -2.6074e-01, -2.6225e-02,\n",
269 |       "        -1.2373e-01, -3.8144e-02,  3.9868e-02,  1.2663e-01, -1.5586e-02,\n",
270 |       "        -1.0817e-01, -1.9468e-01, -4.2929e-01,  1.9576e-01,  4.2164e-02,\n",
271 |       "         2.4703e-01,  1.4054e-01,  5.2742e-01,  1.1616e-02, -1.9511e-01,\n",
272 |       "         1.8861e-02, -3.4501e-02,  2.4374e-01, -1.0141e-01, -8.6388e-02,\n",
273 |       "        -3.6278e-02, -6.0471e-02,  1.7178e-01,  4.3191e-02, -5.6021e-02,\n",
274 |       "         2.1537e-01,  2.2960e-01, -2.8191e-01,  3.6694e-01, -2.1738e-01,\n",
275 |       "        -1.5248e-01, -5.3858e-01,  4.8218e-04,  1.0611e-01, -4.3668e-01,\n",
276 |       "         3.2565e-01,  2.1495e-01,  7.9084e-02, -1.1144e-01, -8.2238e-02,\n",
277 |       "        -1.0823e-02,  3.2505e-01, -1.6962e-02,  2.4763e-03, -2.1186e-01,\n",
278 |       "        -1.7482e-01,  5.7193e-01,  2.6038e-02, -2.0085e-01,  3.7864e-01,\n",
279 |       "         2.8407e-02, -1.4909e-01,  4.4668e-02, -9.7271e-02, -7.5325e-02,\n",
280 |       "         2.7872e-01, -1.5554e-01,  2.2307e-01,  9.8059e-02, -4.1579e-02,\n",
281 |       "        -2.8606e-02, -2.3032e-01,  3.1150e-01, -7.1310e-02, -2.4904e-01,\n",
282 |       "        -7.7626e-02, -1.2460e-01, -2.6306e-02, -1.9954e-01,  8.6190e-02,\n",
283 |       "         3.0547e-05,  2.2495e-03,  2.2625e-01, -1.2017e-01, -1.5577e-01,\n",
284 |       "         3.8977e-01,  6.9040e-03, -1.5162e-01, -1.7155e-01,  1.7012e-01,\n",
285 |       "         2.9838e-01, -3.6955e-01,  1.8884e-01, -3.1549e-01, -2.8930e-01,\n",
286 |       "        -5.5932e-01,  3.5717e-01,  2.4953e-01, -2.4460e-01,  7.5881e-03,\n",
287 |       "        -1.7940e-01, -2.2461e-02, -1.8611e-01,  1.4540e-01,  3.2545e-01,\n",
288 |       "         1.3929e-01, -3.1123e-01, -2.8450e-01,  4.3041e-01, -4.7437e-02,\n",
289 |       "         6.9766e-02, -1.7365e-01,  5.9896e-02,  5.9521e-01, -4.6284e-02,\n",
290 |       "         3.8939e-02,  1.3634e-01, -1.1272e-01,  1.6409e-01,  9.2894e-03,\n",
291 |       "        -4.9759e-02, -4.7318e-02,  1.4064e-01, -1.3499e-01, -3.7380e-02,\n",
292 |       "        -1.2251e-01,  5.5834e-01, -2.6055e-01,  9.8233e-02, -5.9575e-02,\n",
293 |       "         5.2060e-02, -3.2568e-02, -2.9983e-02,  1.0168e-01,  1.8758e-01,\n",
294 |       "        -4.8515e-02,  2.4001e-01, -1.5750e-02,  1.0778e-01,  6.0214e-02,\n",
295 |       "        -2.2326e-01, -2.0359e-02,  2.0070e-01,  4.8625e-02, -1.4054e-02,\n",
296 |       "         9.4663e-02, -1.7493e-01,  2.9097e-02,  1.6428e-01, -2.8810e-01,\n",
297 |       "         1.3145e-01,  1.0770e-01, -3.8759e-02, -8.3538e-02, -7.6144e-02,\n",
298 |       "        -5.5353e-04,  3.1190e-01, -1.5708e-01,  7.7211e-02, -8.1434e-03,\n",
299 |       "         2.1436e-01, -9.9840e-02,  1.8908e-01,  4.1516e-01,  1.0249e-02,\n",
300 |       "        -3.0407e-01, -3.6515e-01, -1.2376e-01,  2.7788e-02,  1.9021e-02,\n",
301 |       "         1.8605e-01,  2.9233e-01,  5.6915e-02,  2.0724e-01,  7.0265e-02,\n",
302 |       "        -9.5444e-02, -2.1719e-01,  1.0457e-01, -2.2053e-01,  1.3246e-01,\n",
303 |       "        -1.5537e-01, -9.5731e-02,  6.0466e+00,  4.0778e-01,  1.1285e-02,\n",
304 |       "         1.1125e-01, -2.3022e-01,  3.2817e-01,  3.3474e-01,  1.3995e-01,\n",
305 |       "         1.5649e-01,  3.0448e-01, -1.5101e-02, -1.7316e-01, -3.8413e-01,\n",
306 |       "         2.2023e-01, -4.7145e-01,  9.5377e-02, -1.6545e-01, -1.7516e+00,\n",
307 |       "         2.9473e-01,  1.4115e-01,  1.8588e-01, -1.2569e-01,  6.5745e-02,\n",
308 |       "        -1.4898e-01, -9.3745e-02,  3.1413e-01,  1.2686e-01, -1.0292e-01,\n",
309 |       "        -2.1954e-01,  3.0948e-02, -4.2561e-01, -2.6715e-01,  5.5388e-02,\n",
310 |       "        -8.8885e-02, -9.3119e-02,  2.2464e-01, -4.0584e-02,  1.0608e-01,\n",
311 |       "         2.8624e-01, -1.7813e-01, -5.1491e-02,  6.9263e-02,  1.7411e-01,\n",
312 |       "        -4.3583e-01, -1.5844e-01,  1.3288e-02,  1.3611e-01,  2.8650e-01,\n",
313 |       "         2.2427e-01,  2.9285e-01, -8.2768e-02, -2.1313e-01,  4.6977e-01,\n",
314 |       "         1.3981e-01,  4.4196e-01,  5.9897e-01, -2.0943e-01,  1.3199e-01,\n",
315 |       "        -1.2770e-01,  7.0080e-02, -8.7112e-02,  2.5521e-01, -3.5098e-01,\n",
316 |       "         2.7341e-01, -3.3966e-02, -1.0843e-01,  4.3469e-01, -6.2516e-02,\n",
317 |       "        -1.0742e-01, -3.9380e-01,  2.8758e-02, -7.4800e-01, -1.1042e-01,\n",
318 |       "         7.4263e-02, -1.0410e-01, -3.2008e-01, -2.6784e-01,  8.2537e-03,\n",
319 |       "        -3.0056e-01,  1.5119e-01, -1.6852e-01, -5.2178e-03, -1.4731e-01,\n",
320 |       "         4.4588e-01,  3.7448e-01, -3.9732e-01,  1.3030e-01,  6.4875e-01,\n",
321 |       "        -2.1301e-01,  1.2479e-01,  2.6747e-01, -2.6366e-01, -1.2018e-01,\n",
322 |       "        -2.2133e-02, -1.6639e-02, -1.1777e-01, -3.0975e-01,  1.7078e-01,\n",
323 |       "        -1.0861e-04,  1.3197e-01, -2.0081e-01, -2.6618e-01,  3.1587e-02,\n",
324 |       "         1.0479e-01,  8.0152e-02,  1.1496e-01, -1.7642e-01, -1.5247e-01,\n",
325 |       "        -4.1968e-01,  2.1120e-01,  1.1615e-01, -2.0123e-01, -1.9122e-01,\n",
326 |       "         6.6410e-02, -2.2218e-01,  1.0440e-01,  7.5966e-03,  4.2399e-01,\n",
327 |       "        -9.5817e-02, -3.5184e-01, -6.2223e-02, -1.7677e-01, -4.7161e-01,\n",
328 |       "        -3.8534e-02, -8.0673e-02, -4.5963e-03, -6.4624e-03, -1.8014e-02,\n",
329 |       "        -1.1445e-01,  4.2948e-02, -2.5978e-01,  5.9180e-02,  3.8135e-02,\n",
330 |       "         2.0812e-01, -2.4710e-01, -3.7079e-01,  2.0508e-01,  2.1899e-01,\n",
331 |       "        -9.4519e-01,  2.7558e-01,  2.7138e-01,  1.4367e-01, -1.5441e-01,\n",
332 |       "        -1.8515e-01,  9.0163e-02,  2.0217e-01, -1.0698e-01, -2.3912e-01,\n",
333 |       "        -2.2943e-01,  2.2451e-01,  1.2780e-01,  1.2775e-01,  3.1585e-01,\n",
334 |       "        -2.1394e-01, -1.2684e-01, -9.6884e-02, -6.5346e-01, -2.9291e-01,\n",
335 |       "        -5.3387e-01, -2.2139e-02, -1.9091e-01, -5.3790e-01, -1.2427e-01,\n",
336 |       "         3.3499e-01, -1.1048e-02,  6.9154e-02, -4.7823e-02, -1.6247e-02,\n",
337 |       "         1.7433e-01, -6.0335e-01, -1.3460e-02, -3.3694e-01, -4.4399e-01,\n",
338 |       "        -4.2622e-02,  3.5184e-01, -1.3890e-01,  2.1893e-01, -1.5696e-02,\n",
339 |       "         1.5114e-01,  1.4282e-01,  2.9100e-01,  8.3577e-01, -7.7831e-02,\n",
340 |       "        -6.1096e-02, -5.0251e-02, -2.8395e-01, -1.4788e-01, -1.7162e-01,\n",
341 |       "        -9.0106e-02,  2.4305e-01, -1.6364e-01,  2.7089e-01,  3.6500e-01,\n",
342 |       "        -2.4414e-01, -4.1538e-01,  3.7322e-01,  3.0024e-01,  2.3689e-01,\n",
343 |       "         7.3204e-02, -3.4812e-02], device='cuda:0')}\n",
344 |       "Model name:sentence-transformers/clip-ViT-B-32 ; tokenizer doesn't exist; sentence-transformers/clip-ViT-B-32 does not appear to have a file named config.json. Checkout 'https://huggingface.co/sentence-transformers/clip-ViT-B-32/main' for available files.\n",
345 |       "sentence-transformers/clip-ViT-B-32-multilingual-v1 {'input_ids': tensor([  101, 10588, 48201, 47125,   102], device='cuda:0'), 'attention_mask': tensor([1, 1, 1, 1, 1], device='cuda:0'), 'token_embeddings': tensor([[ 0.2305,  0.0549, -0.1571,  ...,  0.5232, -0.0930, -0.1172],\n",
346 |       "        [-0.2680,  0.2789, -0.0539,  ...,  0.4740, -0.2368, -0.3543],\n",
347 |       "        [ 0.3758,  0.2741, -0.3703,  ...,  0.7613,  0.3820, -0.0288],\n",
348 |       "        [ 0.4044,  0.2899, -0.4232,  ...,  0.8181,  0.3925, -0.0369],\n",
349 |       "        [ 0.1187,  0.0554, -0.3584,  ...,  0.3664,  0.1128, -0.1268]],\n",
350 |       "       device='cuda:0'), 'sentence_embedding': tensor([-2.0334e-01, -1.6914e-01,  7.4771e-02, -1.4223e-01, -3.0504e-01,\n",
351 |       "         2.0065e-01,  9.7776e-02, -9.3886e-01, -2.3021e-03,  9.3738e-02,\n",
352 |       "         1.8778e-02, -2.6235e-01, -1.2688e-01, -1.7641e-01,  9.0206e-02,\n",
353 |       "        -6.5903e-02,  1.5076e-01,  6.2253e-02, -3.2784e-02, -2.5361e-01,\n",
354 |       "         2.5175e-01,  3.9041e-01, -9.7813e-02,  2.4359e-01,  4.1893e-02,\n",
355 |       "        -3.3285e-01,  9.3473e-02,  1.3778e-01, -1.5428e-01,  2.4548e-01,\n",
356 |       "         1.9890e-02, -2.7075e-01, -1.8032e-02,  6.4909e-02, -4.3061e-01,\n",
357 |       "        -1.1657e-01,  2.4541e-01, -4.9875e-02,  2.3737e-02, -2.3621e-02,\n",
358 |       "        -3.1835e-02, -2.0869e-01,  5.7527e-02, -5.0633e-01,  1.3199e-01,\n",
359 |       "         2.9206e-01,  2.9964e-02, -3.3747e-02,  2.3226e-01,  3.6006e-02,\n",
360 |       "         1.2699e-01,  9.7488e-02,  7.3582e-02, -5.8144e-01, -2.2830e-01,\n",
361 |       "         1.3345e-01, -5.1341e-02, -8.0402e-02, -5.1184e-01, -1.9752e-01,\n",
362 |       "         1.4001e-01, -5.0529e-02,  2.0968e-02, -3.1461e-01,  2.1346e-01,\n",
363 |       "        -3.8276e-02,  1.8773e-01,  2.2569e-01,  1.9953e-01, -7.3426e-02,\n",
364 |       "         4.3863e-01, -1.7246e-01,  1.5217e-01,  1.7116e-01,  2.1022e-01,\n",
365 |       "         4.1116e-01,  1.4070e-01, -1.1562e-01, -2.2117e-01, -4.7348e-01,\n",
366 |       "        -2.1663e-01,  4.4288e-03,  1.7295e-02,  2.5705e-01,  1.1410e-01,\n",
367 |       "         5.6728e-01, -2.1271e-01, -2.2724e-01,  1.2777e-01, -1.4383e-01,\n",
368 |       "        -3.4052e-01, -1.4664e-01, -1.4763e+00,  6.1955e-01,  1.6093e-01,\n",
369 |       "        -1.7167e-01,  2.3186e-02,  1.3621e-01,  9.5215e-02,  3.2798e-01,\n",
370 |       "        -1.8117e-02, -4.4082e-02,  2.5197e-01,  1.7477e-01,  1.9436e-01,\n",
371 |       "         3.0611e-01, -8.9931e-02, -7.9702e-02, -2.5298e-01,  2.7659e-01,\n",
372 |       "        -5.9393e-01,  4.1021e-01,  7.9611e-02,  1.3278e-01,  1.3027e-01,\n",
373 |       "        -4.7478e-02,  1.4775e-01, -1.3919e-02,  3.9992e-02,  1.2915e-01,\n",
374 |       "        -2.0684e-02, -1.4745e-01, -2.2467e-01,  2.0943e-01,  4.3064e-01,\n",
375 |       "        -1.1368e-01, -1.3093e-01,  8.6813e-03,  1.4154e-02,  1.3849e-01,\n",
376 |       "         1.8656e-01,  1.8730e-01, -3.0004e-01,  5.7339e+00, -3.2605e-01,\n",
377 |       "        -4.2595e-02, -4.1856e-01, -3.0168e-01, -8.9973e-02,  9.5640e-04,\n",
378 |       "        -1.4158e-02, -1.0548e-02, -1.1700e-01,  2.3475e-01, -2.6989e-02,\n",
379 |       "        -1.3903e-01, -9.5062e-02, -3.3901e-01,  3.1460e-01, -5.6003e-02,\n",
380 |       "         2.0593e-01,  9.9136e-02,  5.2334e-01,  7.3437e-02, -1.5021e-01,\n",
381 |       "        -6.6951e-02, -6.3785e-02,  2.3081e-01,  2.6384e-02, -8.1484e-02,\n",
382 |       "        -4.4474e-02, -4.0740e-02,  2.5724e-01,  3.1777e-02, -9.1613e-02,\n",
383 |       "         3.9941e-02,  4.0293e-02, -3.0676e-01,  3.4817e-01, -2.2957e-01,\n",
384 |       "        -5.3091e-02, -3.3662e-01, -5.1250e-02,  1.4828e-01, -2.7174e-01,\n",
385 |       "         3.7047e-01, -3.9839e-02,  1.3486e-01, -5.5569e-02, -6.2613e-02,\n",
386 |       "         2.6114e-02,  2.9128e-01,  6.0373e-02, -1.2501e-02, -1.6453e-01,\n",
387 |       "        -9.3317e-03,  4.4107e-01,  1.1232e-01, -1.3276e-01,  3.4949e-01,\n",
388 |       "         9.3545e-02,  4.6813e-03, -2.4483e-02,  6.4613e-02, -1.1478e-01,\n",
389 |       "         3.0463e-01, -4.4370e-02,  2.0068e-01, -4.3869e-02, -8.9185e-02,\n",
390 |       "         1.1884e-01, -1.3263e-01,  3.5835e-01, -1.0064e-01, -3.2594e-01,\n",
391 |       "        -1.5505e-01, -5.2067e-02, -1.1747e-03, -7.0044e-02,  4.5385e-02,\n",
392 |       "        -6.7558e-02,  1.6928e-01,  2.6598e-01,  6.8568e-02, -1.9572e-01,\n",
393 |       "         2.9449e-01,  4.5720e-03, -6.7686e-02, -3.0300e-01,  2.0569e-01,\n",
394 |       "         2.2198e-01, -2.7167e-01,  2.2274e-01, -1.5873e-01, -3.3218e-01,\n",
395 |       "        -4.9440e-01,  3.7218e-01,  2.3206e-01, -1.8265e-01,  1.3722e-01,\n",
396 |       "        -2.2112e-01, -3.3449e-02, -1.0907e-01,  3.2806e-02,  2.4866e-01,\n",
397 |       "         2.5990e-02, -1.8595e-01, -2.5502e-01,  2.8818e-01, -2.1293e-01,\n",
398 |       "         1.0490e-01, -6.9663e-02,  1.0206e-01,  5.4036e-01,  6.0295e-03,\n",
399 |       "        -8.6853e-03,  7.8703e-02, -6.7371e-02,  6.5200e-02,  9.7390e-03,\n",
400 |       "         3.3331e-02, -1.1727e-01,  2.7078e-02, -7.0596e-02,  2.8880e-02,\n",
401 |       "        -1.6717e-01,  4.1236e-01, -1.3797e-01,  1.2633e-02,  4.0977e-02,\n",
402 |       "        -2.9763e-02, -4.3454e-02,  2.7565e-02, -4.0571e-02,  5.7820e-02,\n",
403 |       "        -4.8267e-02,  2.4278e-01, -7.6660e-03,  2.9698e-02,  4.8585e-02,\n",
404 |       "        -1.5352e-01,  1.4553e-02,  1.5029e-01,  6.8895e-03,  1.2136e-02,\n",
405 |       "         4.9711e-02, -1.0585e-01, -3.0702e-02,  9.3002e-02, -8.1609e-02,\n",
406 |       "         2.3991e-01,  3.9812e-03, -1.9190e-01, -1.3853e-01, -1.6096e-01,\n",
407 |       "        -2.1159e-01,  2.1663e-01, -8.0867e-02, -1.8770e-02, -1.5257e-01,\n",
408 |       "         1.9546e-01, -6.9405e-02,  1.4577e-01,  3.3809e-01,  6.0678e-02,\n",
409 |       "        -2.6670e-01, -2.0814e-01, -5.2758e-03,  5.5603e-02,  4.4693e-02,\n",
410 |       "         2.0097e-01,  2.7027e-01,  1.2986e-01,  2.3777e-01,  7.5795e-02,\n",
411 |       "        -7.5986e-02, -3.1976e-01,  5.8500e-02, -9.6743e-02,  5.7583e-04,\n",
412 |       "        -1.8898e-01, -1.7358e-01,  5.7260e+00,  4.8272e-01, -8.1259e-02,\n",
413 |       "         3.2061e-03, -3.5555e-01,  1.5947e-01,  3.3509e-01,  1.7805e-01,\n",
414 |       "         9.1990e-02,  3.5931e-02, -7.3387e-02, -6.7740e-02, -1.9722e-01,\n",
415 |       "         1.6325e-01, -5.5183e-01,  1.3599e-01, -1.9017e-01, -2.1158e+00,\n",
416 |       "         1.7186e-01,  2.0626e-01,  3.9789e-02, -1.1614e-01,  1.3275e-02,\n",
417 |       "        -2.0151e-01, -2.5364e-02,  3.5192e-01,  1.1858e-01, -4.1850e-02,\n",
418 |       "        -2.7810e-01,  2.4617e-02, -3.5590e-01, -2.9734e-01,  5.7519e-02,\n",
419 |       "        -1.2361e-01,  7.0654e-02,  1.9895e-01, -1.0288e-01,  1.0614e-01,\n",
420 |       "         2.2623e-01, -1.1561e-01, -4.4942e-02,  1.0816e-01,  1.0192e-01,\n",
421 |       "        -2.6893e-01, -1.7173e-01, -2.1719e-01,  1.1773e-01,  2.5459e-01,\n",
422 |       "         3.0536e-01,  1.7029e-01,  2.4242e-02, -1.4146e-01,  5.0061e-01,\n",
423 |       "         7.2468e-02,  3.1891e-01,  3.2688e-01, -1.5695e-01,  1.7149e-01,\n",
424 |       "        -2.0500e-01,  4.5032e-02, -1.7520e-01,  2.2878e-01, -1.6988e-01,\n",
425 |       "         1.5373e-01, -5.7668e-02, -8.8306e-03,  3.1439e-01, -2.9895e-02,\n",
426 |       "        -7.8530e-02, -2.2839e-01, -5.0848e-02, -6.3631e-01, -8.8465e-02,\n",
427 |       "        -1.5557e-02, -2.0264e-01, -2.6219e-01, -3.1466e-01,  2.0764e-02,\n",
428 |       "        -3.4295e-01,  1.9736e-01, -1.6578e-01, -1.0560e-01, -2.2583e-01,\n",
429 |       "         2.5710e-01,  2.4691e-01, -4.3547e-01,  1.1362e-01,  4.6921e-01,\n",
430 |       "        -2.3151e-01,  1.5638e-01,  2.0220e-01, -2.8883e-01, -1.3096e-01,\n",
431 |       "        -2.4510e-02,  7.7040e-02, -7.8399e-04, -4.7700e-01,  4.2299e-02,\n",
432 |       "         1.5806e-01,  1.6993e-01, -8.3540e-02, -9.5018e-02, -1.5459e-02,\n",
433 |       "         4.4254e-02,  1.1774e-01, -3.8907e-02, -1.5936e-01, -2.1897e-02,\n",
434 |       "        -3.0448e-01,  2.6064e-01,  1.6372e-01, -2.3275e-01, -1.8462e-01,\n",
435 |       "         9.4053e-02, -1.6129e-01, -1.5300e-01,  1.7068e-01,  4.8920e-01,\n",
436 |       "         2.1387e-02, -2.2186e-01,  7.1614e-02, -1.5353e-02, -3.8598e-01,\n",
437 |       "        -1.4085e-01, -1.0007e-01, -7.3114e-02, -8.1861e-02, -2.0652e-02,\n",
438 |       "        -1.9611e-01, -1.1353e-02, -1.4559e-01,  1.9196e-01,  1.0416e-01,\n",
439 |       "         1.1943e-01, -2.0479e-01, -2.3482e-01,  1.6960e-01,  2.5849e-01,\n",
440 |       "        -8.1506e-01,  3.2976e-01,  2.7162e-01,  1.1857e-01, -1.2812e-01,\n",
441 |       "        -1.3794e-01,  1.0144e-01,  1.3436e-01, -8.4321e-02, -2.6041e-01,\n",
442 |       "        -1.9379e-01,  2.6946e-01, -2.9697e-02, -2.7816e-02,  4.6124e-01,\n",
443 |       "        -1.1000e-01,  2.2152e-02, -7.0317e-02, -5.6700e-01, -3.9634e-01,\n",
444 |       "        -5.3712e-01, -2.5599e-02, -2.3336e-01, -4.9603e-01, -1.3422e-01,\n",
445 |       "         2.4102e-01,  6.0139e-02,  1.9689e-02, -2.9797e-01,  7.1472e-02,\n",
446 |       "         5.7367e-02, -4.8900e-01,  3.0909e-03, -2.3196e-01, -4.4083e-01,\n",
447 |       "        -1.6809e-02,  4.6111e-01, -5.8852e-02,  1.3696e-01,  9.9999e-02,\n",
448 |       "         1.7090e-01,  1.2402e-01,  3.0726e-01,  8.3289e-01,  5.7107e-03,\n",
449 |       "        -5.5376e-02, -3.5042e-02, -2.4841e-01, -1.3627e-01, -2.5289e-01,\n",
450 |       "        -1.9113e-01,  1.7571e-01, -2.1889e-01,  5.0180e-01,  2.4607e-01,\n",
451 |       "        -3.9292e-01, -3.9180e-01,  2.2482e-01,  2.2826e-01, -2.6152e-02,\n",
452 |       "        -3.1935e-03, -9.9769e-02], device='cuda:0')}\n"
453 |      ]
454 |     },
455 |     {
456 |      "data": {
457 |       "application/vnd.jupyter.widget-view+json": {
458 |        "model_id": "d966829260ec4907a130766b61833261",
459 |        "version_major": 2,
460 |        "version_minor": 0
461 |       },
462 |       "text/plain": [
463 |        "Downloading (…)okenizer_config.json:   0%|          | 0.00/371 [00:00<?, ?B/s]"
464 |       ]
465 |      },
466 |      "metadata": {},
467 |      "output_type": "display_data"
468 |     },
469 |     {
470 |      "data": {
471 |       "application/vnd.jupyter.widget-view+json": {
472 |        "model_id": "2a89bf62a7304bdeb19b75a5a76a74fe",
473 |        "version_major": 2,
474 |        "version_minor": 0
475 |       },
476 |       "text/plain": [
477 |        "Downloading (…)lve/main/config.json:   0%|          | 0.00/572 [00:00<?, ?B/s]"
478 |       ]
479 |      },
480 |      "metadata": {},
481 |      "output_type": "display_data"
482 |     },
483 |     {
484 |      "data": {
485 |       "application/vnd.jupyter.widget-view+json": {
486 |        "model_id": "5b38dbfeb07740e888bbb86ba760fe91",
487 |        "version_major": 2,
488 |        "version_minor": 0
489 |       },
490 |       "text/plain": [
491 |        "Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]"
492 |       ]
493 |      },
494 |      "metadata": {},
495 |      "output_type": "display_data"
496 |     },
497 |     {
498 |      "data": {
499 |       "application/vnd.jupyter.widget-view+json": {
500 |        "model_id": "2b3f54f0f109410580f47a34e06b15c1",
501 |        "version_major": 2,
502 |        "version_minor": 0
503 |       },
504 |       "text/plain": [
505 |        "Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]"
506 |       ]
507 |      },
508 |      "metadata": {},
509 |      "output_type": "display_data"
510 |     },
511 |     {
512 |      "data": {
513 |       "application/vnd.jupyter.widget-view+json": {
514 |        "model_id": "0f0768a02ffa440a87c1b341f1d1d183",
515 |        "version_major": 2,
516 |        "version_minor": 0
517 |       },
518 |       "text/plain": [
519 |        "Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]"
520 |       ]
521 |      },
522 |      "metadata": {},
523 |      "output_type": "display_data"
524 |     }
525 |    ],
526 |    "source": [
527 |     "for model_name in models:\n",
528 |     "    model = SentenceTransformer(model_name)\n",
529 |     "    resp = model.encode(text, output_value=None)\n",
530 |     "    \n",
531 |     "\n",
532 |     "    print (model_name, resp)\n",
533 |     "    \n",
534 |     "    try:\n",
535 |     "        tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
536 |     "        resp =  tokenizer.tokenize(text)\n",
537 |     "    except Exception as e:\n",
538 |     "        print (f\"Model name:{model_name} ; tokenizer doesn't exist; {e}\")\n"
539 |    ]
540 |   },
541 |   {
542 |    "cell_type": "code",
543 |    "execution_count": null,
544 |    "id": "c7d3ca6f-8a0e-45ac-a4a7-49d9d24bbefd",
545 |    "metadata": {},
546 |    "outputs": [],
547 |    "source": []
548 |   },
549 |   {
550 |    "cell_type": "code",
551 |    "execution_count": null,
552 |    "id": "386b27e7-aad9-4d50-8324-46d3673f7bdc",
553 |    "metadata": {},
554 |    "outputs": [],
555 |    "source": []
556 |   }
557 |  ],
558 |  "metadata": {
559 |   "environment": {
560 |    "kernel": "python3",
561 |    "name": "pytorch-gpu.1-13.m107",
562 |    "type": "gcloud",
563 |    "uri": "gcr.io/deeplearning-platform-release/pytorch-gpu.1-13:m107"
564 |   },
565 |   "kernelspec": {
566 |    "display_name": "Python 3",
567 |    "language": "python",
568 |    "name": "python3"
569 |   },
570 |   "language_info": {
571 |    "codemirror_mode": {
572 |     "name": "ipython",
573 |     "version": 3
574 |    },
575 |    "file_extension": ".py",
576 |    "mimetype": "text/x-python",
577 |    "name": "python",
578 |    "nbconvert_exporter": "python",
579 |    "pygments_lexer": "ipython3",
580 |    "version": "3.7.12"
581 |   }
582 |  },
583 |  "nbformat": 4,
584 |  "nbformat_minor": 5
585 | }
586 | 


--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
 1 | # Search Engine Workshop
 2 | 
 3 | 
 4 | ## About
 5 | 
 6 | Hands-on workshop for building a semantic search engine.
 7 | 
 8 | 
 9 | 
10 | 
11 | ## Setup 
12 | 
13 | If you came to this repo during a workshop, visit this custom [JupyterHub](http://hub.np.training), which has all the dependencies already set up.
14 | 
15 | The repo is located at [npatta01/search-engine-workshop](https://github.com/npatta01/search-engine-workshop)
16 | 
17 | To use this repo outside a workshop, please use Binder
18 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/npatta01/search-engine-workshop/main)
19 | 
20 | ## Content (Notebooks)
21 | 
22 | 
23 | **Data Fetching**
24 | 
25 | [setup notebook](notebooks/00_a_setup_dataset.ipynb)        
26 | [stats notebook](notebooks/00_b_setup_stats.ipynb)     
27 | [sample image notebook](notebooks/00_c_sample_images.ipynb)
28 | 
29 | 
30 | Notebooks to download the Unsplash dataset and save it in the Hugging Face dataset format.
31 | 
32 | 
33 | **Non Deep Learning Retrieval**
34 | 
35 | BM25 retrieval with Elasticsearch: [notebook](notebooks/01_bm25_elastic.ipynb)
36 | 
37 | 
38 | **Deep Learning Retrieval (text)**
39 | 
40 | 
41 | Text Deep Learning retrieval: [Link](notebooks/02_dense_retriever.ipynb)
42 | 
43 | 
44 | **Deep Learning Retrieval (image)**
45 | 
46 | 
47 | Clip Retrieval: [Link](notebooks/03_clip_embed.ipynb)
48 | 
49 | **ANN**
50 | 
51 | Shows how to speed up Deep Learning retrieval by exploring different ANN indexes
52 | [Link](notebooks/04_ann.ipynb) 
53 | 
54 | 
55 | 
56 | 
57 | ## Slides
58 | 
59 | [PyData Seattle 2023](assets/slides_pydataseattle2023.pdf)
60 | 
61 | [PyData NYC 2022](assets/slides_pydatanyc2022.pdf)
62 | 
63 | 
64 | [ODSC 2022](assets/slides_odsc2022.pdf) 
65 | 
66 | 
67 | ## Contact
68 | 
69 | For help or feedback, please reach out to:
70 | 
71 | - [Nidhin Pattaniyil](https://www.linkedin.com/in/nidhinpattaniyil/)   
72 | - [Ravi Yadav](https://www.linkedin.com/in/ravi-kumar-yadav-535b268/)   
73 | - [Mustafa Zengin](https://www.linkedin.com/in/mustafazengin/)   
74 | 
75 | 
76 | 
77 | 
78 | 
79 | ## Acknowledgments
80 | 
81 | This workshop uses the Unsplash Lite Dataset 1.2.0 [link](https://unsplash.com/data)
82 | 
83 | The hands-on portion of the workshop was made possible by the [JupyterHub Helm Chart](https://github.com/jupyterhub/helm-chart)
84 | 
85 | ## Changelog
86 | 
87 | **v1.1**
88 | - setup for PyDataNYC
89 | - replaced stackoverflow data with unsplash data
90 | 
91 | **v1.0**
92 | - setup for ODSC
93 | - used stackoverflow data


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | elasticsearch==8.7.*
 2 | pandarallel==1.6.*
 3 | jupyterlab-execute-time==2.3.*
 4 | ipywidgets==8.0.*
 5 | datasets==2.6.*
 6 | gcsfs==2023.1.*
 7 | rank_bm25==0.2.*
 8 | faiss_cpu==1.7.*
 9 | sentence-transformers==2.2.*
10 | transformers==4.28.*
11 | papermill==2.4.*
12 | cloudpickle==2.2.*
13 | rich==13.3.*
14 | ipyplot==1.1.*
15 | 
16 | # pyarrow<11.0.*
17 | # google-cloud-bigquery-storage 
18 | # weaviate-client==3.8.0
19 | # datasets==2.6.*
20 | # milvus==2.1.*
21 | # rich==12.6.*
22 | # papermill==2.4.*
23 | 


--------------------------------------------------------------------------------
/workshop_infra/Dockerfile:
--------------------------------------------------------------------------------
 1 | 
 2 | #FROM jupyter/scipy-notebook:python-3.10.6
 3 | FROM jupyter/scipy-notebook:python-3.7.12
 4 | 
 5 | 
 6 | 
 7 | USER root
 8 | 
 9 | 
10 | RUN apt-get update && apt-get --yes install apt-utils && \
11 |     apt-get --yes --no-install-recommends install htop tmux graphviz curl build-essential libsasl2-dev gfortran && \
12 |     apt-get clean;
13 | 
14 | 
15 | # set the user back to original setting
16 | USER $NB_UID
17 | 
18 | 
19 | 
20 | # Copy the dependency manifests and update the base conda env from environment.yaml
21 | COPY --chown=${NB_UID}:${NB_GID} environment.yaml /tmp/
22 | COPY --chown=${NB_UID}:${NB_GID} requirements.txt /tmp/
23 | 
24 | RUN mamba env update -n base -f /tmp/environment.yaml && \
25 |     fix-permissions "${CONDA_DIR}" && \
26 |     fix-permissions "/home/${NB_USER}"
27 | 
28 | COPY --chown=${NB_UID}:${NB_GID} workshop_infra/scripts /tmp/scripts/
29 | 
30 | USER root
31 | 
32 | RUN bash /tmp/scripts/build_setup_root.sh
33 | USER $NB_UID
34 | 
35 | 
36 | COPY --chown=${NB_UID}:${NB_GID} notebooks/workshop_setup.ipynb /tmp/workshop/notebooks/
37 | 
38 | RUN bash /tmp/scripts/build_setup_user.sh
39 | 
40 | COPY --chown=${NB_UID}:${NB_GID} . /tmp/workshop/
41 | 
42 | 
43 | ENV PATH="/opt/google-cloud-sdk/bin:${PATH}" 
44 | 
45 | 
46 | #COPY --chown=${NB_UID}:${NB_GID} docker-setup.sh /tmp/
47 | 
48 | #COPY --chown=${NB_UID}:${NB_GID} setup.ipynb /tmp/
49 | 
50 | # RUN papermill /tmp/setup.ipynb /tmp/setup__out.ipynb -k python3 --log-output --log-level INFO --progress-bar && \
51 | #     fix-permissions "${CONDA_DIR}" && \
52 | #     fix-permissions "/home/${NB_USER}"


--------------------------------------------------------------------------------
/workshop_infra/cert/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/npatta01/search-engine-workshop/d8d4d1e6234f29c3a158b6343b06701728be92ab/workshop_infra/cert/.gitkeep


--------------------------------------------------------------------------------
/workshop_infra/config.enc.yaml:
--------------------------------------------------------------------------------
  1 | # https://zero-to-jupyterhub.readthedocs.io/en/latest/administrator/optimization.html#optimization
  2 | scheduling:
  3 |     userScheduler:
  4 |         enabled: true
  5 |     podPriority:
  6 |         enabled: true
  7 |     userPlaceholder:
  8 |         enabled: true
  9 |         # Specify five dummy user pods will be used as placeholders
 10 |         replicas: 5
 11 |     userPods:
 12 |         nodeAffinity:
 13 |             matchNodePurpose: require
 14 | cull:
 15 |     enabled: true
 16 |     timeout: 3600
 17 |     every: 300
 18 | singleuser:
 19 |     cpu:
 20 |         limit: 4
 21 |         guarantee: 4
 22 |     memory:
 23 |         limit: 8G
 24 |         guarantee: 8G
 25 |     image:
 26 |         # You should replace the "latest" tag with a fixed version from:
 27 |         # https://hub.docker.com/r/jupyter/datascience-notebook/tags/
 28 |         # Inspect the Dockerfile at:
 29 |         # https://github.com/jupyter/docker-stacks/tree/HEAD/datascience-notebook/Dockerfile
 30 |         name: gcr.io/np-public-training/semantic-search-workshop
 31 |         tag: v1.0
 32 |     #defaultUrl: /lab
 33 |     # extraEnv:
 34 |     #     GOOGLE_APPLICATION_CREDENTIALS: /etc/secrets/keyfile.json
 35 |     storage:
 36 |         # extraVolumes:
 37 |         #     - name: gcsfs-creds
 38 |         #       secret:
 39 |         #         secretName: gcsfs-creds
 40 |         #         items:
 41 |         #             - key: keyfile.json
 42 |         #               path: keyfile.json
 43 |         # extraVolumeMounts:
 44 |         #     - name: gcsfs-creds
 45 |         #       mountPath: /etc/secrets
 46 |         #       readOnly: true
 47 |         type: none
 48 |     lifecycleHooks:
 49 |         postStart:
 50 |             exec:
 51 |                 command:
 52 |                     - sh
 53 |                     - -c
 54 |                     - bash /tmp/workshop/workshop_infra/scripts/container_startup.sh
 55 |     extraContainers:
 56 |         - name: elastic-search
 57 |           image: elasticsearch:8.7.0
 58 |           env:
 59 |             - name: discovery.type
 60 |               value: single-node
 61 |             - name: xpack.security.enabled
 62 |               value: "false"
 63 |             - name: ES_JAVA_OPTS
 64 |               value: -Xms1g -Xmx1g
 65 |               # - name: milvus
 66 |               #   image: gcr.io/np-public-training/custom-milvus:v2.1.4-1
 67 |               # - name: weaviate
 68 |               #   image: semitechnologies/weaviate:1.14.0
 69 |               #   env:
 70 |               #     - name: QUERY_DEFAULTS_LIMIT
 71 |               #       value: "25"
 72 |               #     - name: AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED
 73 |               #       value: "true"
 74 |               #     - name: PERSISTENCE_DATA_PATH
 75 |               #       value: /var/lib/weaviate
 76 |               #     - name: DEFAULT_VECTORIZER_MODULE
 77 |               #       value: none
 78 |               #     - name: ENABLE_MODULES
 79 |               #       value: ""
 80 |               #     - name: CLUSTER_HOSTNAME
 81 |               #       value: node1
 82 | # proxy:
 83 | #   https:
 84 | #     enabled: true
 85 | #     hosts:
 86 | #       - hub.np.training
 87 | #     letsencrypt:
 88 | #       contactEmail: npatta01@gmail.com
 89 | #   service:
 90 | #     loadBalancerIP: "34.145.156.81"
 91 | # proxy:
 92 | #   service:
 93 | #     loadBalancerIP: "34.145.156.81"
 94 | proxy:
 95 |     https:
 96 |         enabled: true
 97 |         hosts:
 98 |             - hub.np.training
 99 |         type: secret
100 |         secret:
101 |             name: workshop-tls
102 |     service:
103 |         loadBalancerIP: 34.102.71.215
104 | hub:
105 |     config:
106 |         Authenticator:
107 |             admin_users:
108 |                 - npatta01
109 |                 - vishalkumar95
110 |                 - mzengin
111 |                 - rkyadav-ncsu
112 |         GitHubOAuthenticator:
113 |             client_id: ENC[AES256_GCM,data:Af0qVw8uUkPGgukNRUihE/v6Yxw=,iv:glsdWx5z0/cJ1PKZUQp+7LvDpi2pn4RGkKIza0sP7rA=,tag:bWW55KyWY+Sh4ECtdsUGlw==,type:str]
114 |             client_secret: ENC[AES256_GCM,data:/JyxvL2uXAd1/I45blHoOq8cfco58SnqtLdbDfnPNRakJuch4ShW1g==,iv:+xoS5twfpnX+2xduZMQNTcffqcuaLWIIsss/8Whvn00=,tag:RG4cuZjt8TookMgnXWHL1A==,type:str]
115 |             oauth_callback_url: https://hub.np.training/hub/oauth_callback
116 |         JupyterHub:
117 |             authenticator_class: github
118 | sops:
119 |     kms: []
120 |     gcp_kms:
121 |         - resource_id: projects/np-public-training/locations/global/keyRings/sops/cryptoKeys/sops-key
122 |           created_at: "2022-10-13T23:23:47Z"
123 |           enc: CiQAtA68IX63yVjyNNzcuN6oxKMDvZI/hnlne6POMs/AToxGvoUSSQDOyIoWf1EgyIyvrp486rhLw/G2J+YuUkobdqfonbEr5Tss0E60rJY5vCtgqzes+/7aunlxPDTU5zngKhkH/vP7dz/z69G3ZmQ=
124 |     azure_kv: []
125 |     hc_vault: []
126 |     age: []
127 |     lastmodified: "2023-04-26T01:39:50Z"
128 |     mac: ENC[AES256_GCM,data:cDB37QZ0UxitWQTSkpQCWJLc1lLfiiqWTbjujMAJi6FB7MryXEaQU7k/8vueuV2+/3k1Zhp++H68SB3zZWJ9XBg4UXsUkLTgqlCQ9Fv3M2rrwVL/h1LQ7wqHVlnvEySy+qODb0PoXS3QfUShOvrPNPf/ZtFHWIRYrfJEprkTys0=,iv:xMc7R4v+PNMFh9DXYxKVnPr9v81MbPqNDq5AVQcwYW4=,tag:Jp8+haROua5fHqWovZNb+Q==,type:str]
129 |     pgp: []
130 |     encrypted_regex: ^(client_id|client_secret)$
131 |     version: 3.7.3
132 | 


--------------------------------------------------------------------------------
/workshop_infra/config_public.yaml:
--------------------------------------------------------------------------------
  1 | # https://zero-to-jupyterhub.readthedocs.io/en/latest/administrator/optimization.html#optimization
  2 | scheduling:
  3 |     userScheduler:
  4 |         enabled: true
  5 |     podPriority:
  6 |         enabled: true
  7 |     userPlaceholder:
  8 |         enabled: true
  9 |         # Number of dummy user pods used as placeholders
 10 |         replicas: 1
 11 |     userPods:
 12 |         nodeAffinity:
 13 |             matchNodePurpose: require
 14 | cull:
 15 |     enabled: true
 16 |     timeout: 3600
 17 |     every: 300
 18 | singleuser:
 19 |     cpu:
 20 |         limit: 4
 21 |         guarantee: 4
 22 |     memory:
 23 |         limit: 8G
 24 |         guarantee: 8G
 25 |     image:
 26 |         # You should replace the "latest" tag with a fixed version from:
 27 |         # https://hub.docker.com/r/jupyter/datascience-notebook/tags/
 28 |         # Inspect the Dockerfile at:
 29 |         # https://github.com/jupyter/docker-stacks/tree/HEAD/datascience-notebook/Dockerfile
 30 |         name: gcr.io/np-public-training/semantic-search-workshop
 31 |         tag: v1.0
 32 |     #defaultUrl: /lab
 33 |     # extraEnv:
 34 |     #     GOOGLE_APPLICATION_CREDENTIALS: /etc/secrets/keyfile.json
 35 |     storage:
 36 |       type: none
 37 |         # extraVolumes:
 38 |         #     - name: gcsfs-creds
 39 |         #       secret:
 40 |         #         secretName: gcsfs-creds
 41 |         #         items:
 42 |         #             - key: keyfile.json
 43 |         #               path: keyfile.json
 44 |         # extraVolumeMounts:
 45 |         #     - name: gcsfs-creds
 46 |         #       mountPath: /etc/secrets
 47 |         #       readOnly: true
 48 |     lifecycleHooks:
 49 |         postStart:
 50 |             exec:
 51 |                 command:
 52 |                     - sh
 53 |                     - -c
 54 |                     - "bash /tmp/workshop/workshop_infra/scripts/container_startup.sh"
 55 |     extraContainers:
 56 |         - name: elastic-search
 57 |           image: elasticsearch:8.7.0
 58 |           env:
 59 |             - name: discovery.type
 60 |               value: single-node
 61 |             - name: xpack.security.enabled
 62 |               value: "false"
 63 |             - name: ES_JAVA_OPTS
 64 |               value: -Xms1g -Xmx1g
 65 |         # - name: milvus
 66 |         #   image: gcr.io/np-public-training/custom-milvus:v2.1.4-1
 67 |         # - name: weaviate
 68 |         #   image: semitechnologies/weaviate:1.14.0
 69 |         #   env:
 70 |         #     - name: QUERY_DEFAULTS_LIMIT
 71 |         #       value: "25"
 72 |         #     - name: AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED
 73 |         #       value: "true"
 74 |         #     - name: PERSISTENCE_DATA_PATH
 75 |         #       value: /var/lib/weaviate
 76 |         #     - name: DEFAULT_VECTORIZER_MODULE
 77 |         #       value: none
 78 |         #     - name: ENABLE_MODULES
 79 |         #       value: ""
 80 |         #     - name: CLUSTER_HOSTNAME
 81 |         #       value: node1
 82 | # proxy:
 83 | #   https:
 84 | #     enabled: true
 85 | #     hosts:
 86 | #       - hub.np.training
 87 | #     letsencrypt:
 88 | #       contactEmail: npatta01@gmail.com
 89 | #   service:
 90 | #     loadBalancerIP: "34.145.156.81"
 91 | # proxy:
 92 | #   service:
 93 | #     loadBalancerIP: "34.145.156.81"
 94 | 
 95 | hub:
 96 |     config:
 97 |         Authenticator:
 98 |             admin_users:
 99 |                 - npatta01
100 |                 - vishalkumar95
101 |                 - mzengin
102 |                 - rkyadav-ncsu
103 | 
104 | 


--------------------------------------------------------------------------------
/workshop_infra/scripts/build_setup_root.sh:
--------------------------------------------------------------------------------
 1 | # apt-get install --reinstall systemd --yes
 2 | # wget https://github.com/milvus-io/milvus/releases/download/v2.1.4/milvus_2.1.4-1_amd64.deb
 3 | # apt-get update --yes
 4 | # dpkg -i milvus_2.1.4-1_amd64.deb
 5 | # apt-get -f install --yes
 6 | # apt-get install --yes --no-install-recommends build-essential libsasl2-dev gfortran milvus
 7 | 
 8 | #pip install milvus==2.1.*
 9 | apt-get update --yes
10 | apt-get install --yes --no-install-recommends build-essential libsasl2-dev gfortran pigz
11 | 
12 | #python -c "import milvus; milvus.before()"
13 | 
14 | #bash /var/bin/e-milvus/lib/install_deps.sh
15 | 
16 | # Install the Google Cloud SDK into /opt without interactive prompts.
17 | # `export` must be lowercase: `EXPORT` is not a shell builtin and fails with "command not found".
18 | export CLOUDSDK_CORE_DISABLE_PROMPTS=1
19 | 
20 | curl https://sdk.cloud.google.com > install.sh
21 | bash install.sh --disable-prompts --install-dir=/opt
22 | 


--------------------------------------------------------------------------------
/workshop_infra/scripts/build_setup_user.sh:
--------------------------------------------------------------------------------
1 | # Execute the workshop setup notebook with papermill (python3 kernel); the executed copy is written to /tmp.
2 | papermill /tmp/workshop/notebooks/workshop_setup.ipynb /tmp/workshop_setup__out.ipynb -k python3 --log-output --log-level INFO --progress-bar 


--------------------------------------------------------------------------------
/workshop_infra/scripts/container_startup.sh:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | #gcloud auth activate-service-account --key-file=$GOOGLE_APPLICATION_CREDENTIALS || echo "skipped gcloud authentication"
 4 | 
 5 | 
 6 | #cp -r /tmp/workshop /home/jovyan
 7 | 
 8 | 
 9 | 
10 | echo "$PWD"  # log the working directory; the lowercase `$pwd` is unset and printed an empty line
11 | 
12 | # GIT_BRANCH="pydata_seattle"  # dead assignment: it was always overridden by the line below
13 | GIT_BRANCH="main"
14 | 
15 | echo "cloning repo"
16 | git clone --depth 1 https://github.com/npatta01/search-engine-workshop.git -b "$GIT_BRANCH"
17 | 
18 | 
19 | cd search-engine-workshop
20 | 
21 | # Prefer the GCS copy of the data; fall back to the GitHub release asset when it is not reachable.
22 | url="https://storage.googleapis.com/np-public-training-tmp/search-workshop/data.zip"
23 | 
24 | if wget --spider "$url" 2>/dev/null; then
25 |   
26 |   echo "getting data from gcs"
27 |   wget "$url"
28 |   unzip -q data.zip
29 | else
30 |   echo "getting from github"
31 |   wget https://github.com/npatta01/search-engine-workshop/releases/download/v1.0/data_processed.zip
32 |   unzip -q data_processed.zip
33 | 
34 | fi
35 | 
36 | 


--------------------------------------------------------------------------------
/workshop_infra/setup.md:
--------------------------------------------------------------------------------
  1 | # Workshop Setup
  2 | 
  3 | The following are the commands and steps that were used to create a working JupyterHub installation for the workshop.
  4 | 
  5 | The instructions assume that you are planning to use GCP and have gcloud set up.
  6 | 
  7 | 
  8 | Most of the instructions are taken from [zero-to-jupyterhub](https://zero-to-jupyterhub.readthedocs.io/en/latest/index.html) project.
  9 | 
 10 | 
 11 | ## Step 1: common variables
 12 | 
 13 | ```bash
 14 | REGION="us-west2"
 15 | ZONE="$REGION-a"
 16 | NODE_TYPE_USER="e2-highmem-16"
 17 | NODE_TYPE_DEFAULT="e2-standard-2"
 18 | 
 19 | CLUSTER_NAME=workshop
 20 | NODES_MIN=0
 21 | NODES_MAX=400
 22 | 
 23 | EMAIL="npatta01@gmail.com"
 24 | GCP_PROJECT="np-public-training"
 25 | 
 26 | HELM_NAMESPACE=$CLUSTER_NAME
 27 | 
 28 | HELM_CHART_VERSION="2.0.0"
 29 | ```
 30 | 
 31 | ## Step 2: create static ip address
 32 | 
 33 | ```bash
 34 | gcloud compute addresses create $CLUSTER_NAME \
 35 |     --region $REGION \
 36 |     --project $GCP_PROJECT
 37 | 
 38 | gcloud compute addresses describe $CLUSTER_NAME \
 39 | --region $REGION \
 40 | --project $GCP_PROJECT
 41 | 
 42 | ```
 43 | 
 44 | Create an `A` record with your DNS provider.
 45 | 
 46 | I am using `hub` for my domain `np.training`
 47 | 
 48 | 
 49 | 
 50 | 
 51 | ## Step 3: Create cluster
 52 | 
 53 | 
 54 | ```bash
 55 | 
 56 | gcloud container clusters create \
 57 |   --machine-type $NODE_TYPE_DEFAULT \
 58 |   --num-nodes 1 \
 59 |   --region $REGION \
 60 |   --cluster-version latest \
 61 |   $CLUSTER_NAME \
 62 |   --project $GCP_PROJECT
 63 | 
 64 | ```
 65 | 
 66 | Get kubectl credentials
 67 | 
 68 | ```bash
 69 | gcloud container clusters get-credentials \
 70 | $CLUSTER_NAME \
 71 | --region $REGION \
 72 | --project $GCP_PROJECT
 73 | ```
 74 | 
 75 | Create admin access for user
 76 | 
 77 | ```bash
 78 | kubectl create clusterrolebinding cluster-admin-binding \
 79 |   --clusterrole=cluster-admin \
 80 |   --user $EMAIL
 81 | ```
 82 | 
 83 | Create separate node pool for jupyter notebook
 84 | 
 85 | ```bash
 86 | gcloud beta container node-pools create user-pool \
 87 |   --machine-type $NODE_TYPE_USER \
 88 |   --num-nodes 0 \
 89 |   --enable-autoscaling \
 90 |   --min-nodes $NODES_MIN \
 91 |   --max-nodes $NODES_MAX \
 92 |   --node-labels hub.jupyter.org/node-purpose=user \
 93 |   --node-taints hub.jupyter.org_dedicated=user:NoSchedule \
 94 |   --scopes "https://www.googleapis.com/auth/cloud-platform" \
 95 |   --region $REGION \
 96 |   --cluster $CLUSTER_NAME  \
 97 |   --project $GCP_PROJECT 
 98 | ```
 99 | 
100 | 
101 | ## Step 3b: Cert (optional)
102 | 
103 | By default, the Helm chart we will use supports Let's Encrypt. However, I had trouble getting it to work.
104 | So, I followed the steps below to create my own cert.
105 | 
106 | create certificate signing request for "*.np.training"
107 | 
108 | ```bash 
109 | openssl req -nodes -newkey rsa:2048 \
110 | -keyout cert/server.key \
111 | -out cert/server.csr \
112 | -subj "/C=US/ST=New York/L=New York/O=NP Training./OU=IT/CN=*.np.training"
113 | ```
114 | 
115 | I bought a wildcard cert from Namecheap
116 | 
117 | 
118 | Download the cert and create a Kubernetes TLS secret from it
119 | ```bash
120 | 
121 | kubectl create namespace $HELM_NAMESPACE
122 | 
123 | 
124 | 
125 | 
126 | gsutil cp "gs://np-training-private/certs/_star.np.training/*" workshop_infra/cert
127 | 
128 | 
129 | kubectl create namespace $HELM_NAMESPACE 
130 | cd workshop_infra/cert  
131 | kubectl create secret tls $HELM_NAMESPACE-tls --key="tls.key" --cert="tls.crt" --namespace $HELM_NAMESPACE 
132 | cd ../../
133 | 
134 | ```
135 | 
136 | 
137 | download storage key
138 | 
139 | ```
140 | gcloud iam service-accounts keys create workshop_infra/keyfile.json \
141 |     --iam-account=public-storage-reader-sa@np-public-training.iam.gserviceaccount.com
142 | 
143 | gsutil cp gs://np-training-private/service_accounts/keyfile.json workshop_infra/keyfile.json
144 | 
145 | kubectl create secret generic gcsfs-creds --from-file=workshop_infra/keyfile.json --namespace $HELM_NAMESPACE
146 | 
147 | 
148 | 
149 | ```
150 | 
151 | ## Step 4: Helm setup
152 | 
153 | ```bash
154 | 
155 | curl https://raw.githubusercontent.com/helm/helm/HEAD/scripts/get-helm-3 | bash
156 | 
157 | helm version
158 | 
159 | helm repo add jupyterhub https://jupyterhub.github.io/helm-chart/
160 | helm repo update
161 | 
162 | ```
163 | 
164 | 
165 | ## Step 5: Update config file (optional)
166 | 
167 | 
168 | build docker image
169 | 
170 | ```bash
171 | docker build -f workshop_infra/Dockerfile -t gcr.io/$GCP_PROJECT/semantic-search-workshop:v1.0 .
172 | docker push gcr.io/$GCP_PROJECT/semantic-search-workshop:v1.0
173 | 
174 | ```
175 | 
176 | build milvus
177 | 
178 | ```bash
179 | cd archive/docker_milvus
180 | 
181 | docker build -t gcr.io/$GCP_PROJECT/custom-milvus:v2.1.4-1 .
182 | docker push gcr.io/$GCP_PROJECT/custom-milvus:v2.1.4-1 
183 | echo "gcr.io/$GCP_PROJECT/custom-milvus:v2.1.4-1 "
184 | cd ../..
185 | ```
186 | 
187 | encrypt setup
188 | 
189 | ```bash
190 | gcloud kms keyrings create sops --location global --project $GCP_PROJECT
191 | gcloud kms keys create sops-key --location global --keyring sops --purpose encryption --project $GCP_PROJECT
192 | gcloud kms keys list --location global --keyring sops --project $GCP_PROJECT
193 | ```
194 | 
195 | 
196 | ```bash
197 | sops --encrypt --gcp-kms projects/$GCP_PROJECT/locations/global/keyRings/sops/cryptoKeys/sops-key \
198 | --encrypted-regex '^(client_id|client_secret)$' \
199 | workshop_infra/config.yaml > workshop_infra/config.enc.yaml
200 | ```
201 | 
202 | ```bash
203 | sops --decrypt workshop_infra/config.enc.yaml > workshop_infra/config.yaml
204 | ```
205 | 
206 | 
207 | 
208 | 
209 | 
210 | replace values in [config.yaml](workshop_infra/config.yaml)
211 | 
212 | - GitHubOAuthenticator
213 | - singleuser.image.name
214 | - scheduling.userPlaceholder.replicas
215 | - proxy.https.hosts
216 | - proxy.service.loadBalancerIP
217 | 
218 | 
219 | 
220 | ## Step 6: Helm Install with authentication
221 | 
222 | Set up with authentication via GitHub OAuth
223 | 
224 | ```bash
225 | helm upgrade --cleanup-on-fail \
226 |   --install $HELM_NAMESPACE jupyterhub/jupyterhub \
227 |   --namespace $HELM_NAMESPACE \
228 |   --create-namespace \
229 |   --version $HELM_CHART_VERSION \
230 |   --values workshop_infra/config.yaml
231 | 
232 | ```
233 | 
234 | ```bash
235 | kubectl --namespace=$HELM_NAMESPACE get pod
236 | 
237 | kubectl --namespace=$HELM_NAMESPACE  get svc proxy-public -o jsonpath='{.status.loadBalancer.ingress[].ip}'
238 | ```
239 | 
240 | 
241 | ## Step 6b: Helm Install with no authentication (no auth)
242 | 
243 | ```bash
244 | helm upgrade --cleanup-on-fail \
245 |   --install $HELM_NAMESPACE-public jupyterhub/jupyterhub \
246 |   --namespace $HELM_NAMESPACE-public \
247 |   --create-namespace \
248 |   --version $HELM_CHART_VERSION \
249 |   --values workshop_infra/config_public.yaml
250 | 
251 | 
252 | kubectl --namespace=$HELM_NAMESPACE-public get pod
253 | 
254 | kubectl --namespace=$HELM_NAMESPACE-public  get svc proxy-public -o jsonpath='{.status.loadBalancer.ingress[].ip}'
255 | ```
256 | 
257 | add the external ip to dns
258 | 
259 | 
260 | ## Step 7: Cleanup (Helm Delete)
261 | 
262 | ```bash
263 | 
264 | helm delete $HELM_NAMESPACE --namespace $HELM_NAMESPACE 
265 | kubectl delete namespace $HELM_NAMESPACE 
266 | 
267 | helm delete $HELM_NAMESPACE-public --namespace $HELM_NAMESPACE-public
268 | kubectl delete namespace $HELM_NAMESPACE-public
269 | 
270 | 
271 | gcloud container clusters  delete $CLUSTER_NAME  --region $REGION   --project $GCP_PROJECT
272 | 
273 | ```


--------------------------------------------------------------------------------