├── .gitignore ├── Dockerfile ├── Dockerrun.aws.json ├── README.md ├── app.py ├── data └── misinformation_papers.csv ├── models └── faiss_index.pickle ├── notebooks └── 001_vector_search.ipynb ├── requirements.txt ├── setup.py └── vector_engine ├── __init__.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # /data 132 | 133 | *~ 134 | #* 135 | *# 136 | *bin 137 | *npy 138 | 139 | # OSX 140 | .DS_Store -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.8-slim-buster 2 | COPY . 
/app 3 | WORKDIR /app 4 | RUN pip install -r requirements.txt 5 | EXPOSE 8501 6 | ENTRYPOINT ["streamlit","run"] 7 | CMD ["app.py"] 8 | -------------------------------------------------------------------------------- /Dockerrun.aws.json: -------------------------------------------------------------------------------- 1 | { 2 | "AWSEBDockerrunVersion": "1", 3 | "Image": { 4 | "Name": "kstathou/vector_engine", 5 | "Update": "true" 6 | }, 7 | "Ports": [ 8 | { 9 | "ContainerPort": 8501, 10 | "HostPort": 8501 11 | } 12 | ], 13 | "Logging": "/var/log/nginx" 14 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Building a semantic search engine with Transformers and Faiss 2 | This repository contains the code for the following Medium blogs: 3 | - [How to build a semantic search engine with Transformers and Faiss](https://kstathou.medium.com/how-to-build-a-semantic-search-engine-with-transformers-and-faiss-dcbea307a0e8?source=friends_link&sk=6974c79b86e2f257c32f77d49583a524) 4 | - [How to deploy a machine learning model on AWS Elastic Beanstalk with Streamlit and Docker](https://kstathou.medium.com/how-to-deploy-a-semantic-search-engine-with-streamlit-and-docker-on-aws-elastic-beanstalk-42ddce0422f3?source=friends_link&sk=dcc7bbf8d172f2cd18aefcdf0c2c6b49) 5 | 6 | Check out the blogs if you want to learn how to create a semantic search engine with Sentence Transformers and Faiss. 7 | 8 | You can [run the notebook on Google Colab](https://colab.research.google.com/drive/11WBCrwNzbNWN7QbMEwzy-8MZROOVQFnZ?usp=sharing) and leverage the free GPU to speed up the computation! 9 | 10 | ## How to deploy the Streamlit app locally with Docker ## 11 | Assuming docker is running on your machine and you have a docker account, do the following: 12 | - Build the image 13 | 14 | ``` bash 15 | docker build -t / . 
16 | ``` 17 | 18 | - Run the image 19 | 20 | ``` bash 21 | docker run -p 8501:8501 / 22 | ``` 23 | 24 | - Open your browser and go to `http://localhost:8501/` -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import faiss 2 | import pickle 3 | import pandas as pd 4 | import streamlit as st 5 | from sentence_transformers import SentenceTransformer 6 | from vector_engine.utils import vector_search 7 | 8 | 9 | @st.cache 10 | def read_data(data="data/misinformation_papers.csv"): 11 | """Read the data from local.""" 12 | return pd.read_csv(data) 13 | 14 | 15 | @st.cache(allow_output_mutation=True) 16 | def load_bert_model(name="distilbert-base-nli-stsb-mean-tokens"): 17 | """Instantiate a sentence-level DistilBERT model.""" 18 | return SentenceTransformer(name) 19 | 20 | 21 | @st.cache(allow_output_mutation=True) 22 | def load_faiss_index(path_to_faiss="models/faiss_index.pickle"): 23 | """Load and deserialize the Faiss index.""" 24 | with open(path_to_faiss, "rb") as h: 25 | data = pickle.load(h) 26 | return faiss.deserialize_index(data) 27 | 28 | 29 | def main(): 30 | # Load data and models 31 | data = read_data() 32 | model = load_bert_model() 33 | faiss_index = load_faiss_index() 34 | 35 | st.title("Vector-based searches with Sentence Transformers and Faiss") 36 | 37 | # User search 38 | user_input = st.text_area("Search box", "covid-19 misinformation and social media") 39 | 40 | # Filters 41 | st.sidebar.markdown("**Filters**") 42 | filter_year = st.sidebar.slider("Publication year", 2010, 2021, (2010, 2021), 1) 43 | filter_citations = st.sidebar.slider("Citations", 0, 250, 0) 44 | num_results = st.sidebar.slider("Number of search results", 10, 50, 10) 45 | 46 | # Fetch results 47 | if user_input: 48 | # Get paper IDs 49 | D, I = vector_search([user_input], model, faiss_index, num_results) 50 | # Slice data on year 51 | frame = data[ 52 | (data.year >= filter_year[0]) 53 | & (data.year <= filter_year[1]) 54 | & (data.citations >= filter_citations) 55 | ] 56 | # Get individual results 57 | for id_ in I.flatten().tolist(): 58 | if id_ in set(frame.id): 59 | f = frame[(frame.id == id_)] 60 | else: 61 | continue 62 | 63 | st.write( 64 | f"""**{f.iloc[0].original_title}** 65 | **Citations**: {f.iloc[0].citations} 66 | **Publication year**: {f.iloc[0].year} 67 | **Abstract** 68 | {f.iloc[0].abstract} 69 | """ 70 | ) 71 | 72 | 73 | if __name__ == "__main__": 74 | main() 75 | -------------------------------------------------------------------------------- /models/faiss_index.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kstathou/vector_engine/2ae08673e89d3a1fe5170e14b9bea5d5e509be7d/models/faiss_index.pickle -------------------------------------------------------------------------------- /notebooks/001_vector_search.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "bJLQoimyVyQ8" 7 | }, 8 | "source": [ 9 | "### Uncomment and run the following cells if you work on Google Colab :) Don't forget to change your runtime type to GPU!" 
10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 33, 15 | "metadata": { 16 | "colab": { 17 | "base_uri": "https://localhost:8080/" 18 | }, 19 | "id": "rVV81xc3VyQ9", 20 | "outputId": "5f91275b-3caf-4588-d4bd-ec11089b1b71" 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "# !git clone https://github.com/kstathou/vector_engine" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 34, 30 | "metadata": { 31 | "colab": { 32 | "base_uri": "https://localhost:8080/" 33 | }, 34 | "id": "C0lSFLw3VyRG", 35 | "outputId": "ae5c260a-7925-4bc6-8aac-b6800017d1e0" 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "# cd vector_engine" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 35, 45 | "metadata": { 46 | "colab": { 47 | "base_uri": "https://localhost:8080/", 48 | "height": 1000 49 | }, 50 | "id": "5sOhWL6UVyRQ", 51 | "outputId": "e6f41a8c-d210-4b33-a1ac-4c3886253920" 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "# pip install -r requirements.txt" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": { 61 | "id": "vbnscDwgVyRW" 62 | }, 63 | "source": [ 64 | "### Let's begin!" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 4, 70 | "metadata": { 71 | "id": "v7ftrzzmVyRX" 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "%load_ext autoreload" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 5, 81 | "metadata": { 82 | "id": "fU2i4vlCVyRc" 83 | }, 84 | "outputs": [], 85 | "source": [ 86 | "%autoreload 2\n", 87 | "# Used to import data from S3.\n", 88 | "import pandas as pd\n", 89 | "import s3fs\n", 90 | "\n", 91 | "# Used to create the dense document vectors.\n", 92 | "import torch\n", 93 | "from sentence_transformers import SentenceTransformer\n", 94 | "\n", 95 | "# Used to create and store the Faiss index.\n", 96 | "import faiss\n", 97 | "import numpy as np\n", 98 | "import pickle\n", 99 | "from pathlib import Path\n", 100 | "\n", 101 | "# Used to do vector searches and display the results.\n", 102 | "from vector_engine.utils import vector_search, id2details" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": { 108 | "id": "Kz5YBwU5VyRi" 109 | }, 110 | "source": [ 111 | "Stored and processed data in s3" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 6, 117 | "metadata": { 118 | "id": "VEANywYAVyRi" 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "# Use pandas to read files from S3 buckets!\n", 123 | "df = pd.read_csv('s3://vector-search-blog/misinformation_papers.csv')" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 7, 129 | "metadata": { 130 | "colab": { 131 | "base_uri": "https://localhost:8080/", 132 | "height": 143 133 | }, 134 | "id": "HJXljSbYVyRn", 135 | "outputId": "1c180fbc-42a4-441a-da47-14f5cc21d826" 136 | }, 137 | "outputs": [ 138 | { 139 | "data": { 140 | "text/html": [ 141 | "
\n", 142 | "\n", 155 | "\n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | "
original_titleabstractyearcitationsidis_EN
0When Corrections Fail: The Persistence of Poli...An extensive literature addresses citizen igno...201090121325536811
1A postmodern Pandora's box: anti-vaccination m...The Internet plays a large role in disseminati...201044021174857951
2Spread of (Mis)Information in Social NetworksWe provide a model to investigate the tension ...201027821200150721
\n", 197 | "
" 198 | ], 199 | "text/plain": [ 200 | " original_title ... is_EN\n", 201 | "0 When Corrections Fail: The Persistence of Poli... ... 1\n", 202 | "1 A postmodern Pandora's box: anti-vaccination m... ... 1\n", 203 | "2 Spread of (Mis)Information in Social Networks ... 1\n", 204 | "\n", 205 | "[3 rows x 6 columns]" 206 | ] 207 | }, 208 | "execution_count": 7, 209 | "metadata": { 210 | "tags": [] 211 | }, 212 | "output_type": "execute_result" 213 | } 214 | ], 215 | "source": [ 216 | "df.head(3)" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 8, 222 | "metadata": { 223 | "colab": { 224 | "base_uri": "https://localhost:8080/" 225 | }, 226 | "id": "MljadlGpVyRs", 227 | "outputId": "8f2f5205-772f-4c6b-f445-5dd32350d45e" 228 | }, 229 | "outputs": [ 230 | { 231 | "name": "stdout", 232 | "output_type": "stream", 233 | "text": [ 234 | "Misinformation, disinformation and fake news papers: 8430\n" 235 | ] 236 | } 237 | ], 238 | "source": [ 239 | "print(f\"Misinformation, disinformation and fake news papers: {df.id.unique().shape[0]}\")" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": { 245 | "id": "VyRG1wZLVyRw" 246 | }, 247 | "source": [ 248 | "The [Sentence Transformers library](https://github.com/UKPLab/sentence-transformers) offers pretrained transformers that produce SOTA sentence embeddings. Checkout this [spreadsheet](https://docs.google.com/spreadsheets/d/14QplCdTCDwEmTqrn1LH4yrbKvdogK4oQvYO1K1aPR5M/) with all the available models.\n", 249 | "\n", 250 | "In this tutorial, we will use the `distilbert-base-nli-stsb-mean-tokens` model which has the best performance on Semantic Textual Similarity tasks among the DistilBERT versions. Moreover, although it's slightly worse than BERT, it is quite faster thanks to having a smaller size.\n", 251 | "\n", 252 | "I use the same model in [Orion's semantic search engine](https://www.orion-search.org/)!" 
253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 9, 258 | "metadata": { 259 | "colab": { 260 | "base_uri": "https://localhost:8080/" 261 | }, 262 | "id": "PjF6CrwUVyRx", 263 | "outputId": "db338335-b032-45f2-db21-8e3f53640b86" 264 | }, 265 | "outputs": [ 266 | { 267 | "name": "stderr", 268 | "output_type": "stream", 269 | "text": [ 270 | "100%|██████████| 245M/245M [00:14<00:00, 16.6MB/s]\n" 271 | ] 272 | }, 273 | { 274 | "name": "stdout", 275 | "output_type": "stream", 276 | "text": [ 277 | "cuda:0\n" 278 | ] 279 | } 280 | ], 281 | "source": [ 282 | "# Instantiate the sentence-level DistilBERT\n", 283 | "model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')\n", 284 | "# Check if GPU is available and use it\n", 285 | "if torch.cuda.is_available():\n", 286 | " model = model.to(torch.device(\"cuda\"))\n", 287 | "print(model.device)" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": 10, 293 | "metadata": { 294 | "colab": { 295 | "base_uri": "https://localhost:8080/", 296 | "height": 66, 297 | "referenced_widgets": [ 298 | "7a7e927567024c578b33b15000d5e531", 299 | "da5011a6565e45e2b5d0b37634498648", 300 | "3e9c24c8488b431fb88a0d045d85b700", 301 | "7aaf8a423e7f48bbac24d6970ed4dfe9", 302 | "f9c22552944b4e2dafe1f98170665293", 303 | "15c5bbc768db41e4bc06c9405539091c", 304 | "1e1408cddd284bc4a4b5484be0561991", 305 | "7f1cd9dbb4b742ab8d78e073fb9b0f90" 306 | ] 307 | }, 308 | "id": "Y_GS0_CWVyR1", 309 | "outputId": "4deb0814-1ce9-4ea8-8e50-72992e0ea303" 310 | }, 311 | "outputs": [ 312 | { 313 | "data": { 314 | "application/vnd.jupyter.widget-view+json": { 315 | "model_id": "7a7e927567024c578b33b15000d5e531", 316 | "version_major": 2, 317 | "version_minor": 0 318 | }, 319 | "text/plain": [ 320 | "HBox(children=(FloatProgress(value=0.0, description='Batches', max=264.0, style=ProgressStyle(description_widt…" 321 | ] 322 | }, 323 | "metadata": { 324 | "tags": [] 325 | }, 326 | "output_type": "display_data" 327 | }, 328 | { 329 | "name": "stdout", 330 | "output_type": "stream", 331 | "text": [ 332 | "\n" 333 | ] 334 | } 335 | ], 336 | "source": [ 337 | "# Convert abstracts to vectors\n", 338 | "embeddings = model.encode(df.abstract.to_list(), show_progress_bar=True)" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 11, 344 | "metadata": { 345 | "colab": { 346 | "base_uri": "https://localhost:8080/" 347 | }, 348 | "id": "gE7w-RJbVyR6", 349 | "outputId": "0451849a-88ef-4aee-be2d-e6ff173782f3" 350 | }, 351 | "outputs": [ 352 | { 353 | "name": "stdout", 354 | "output_type": "stream", 355 | "text": [ 356 | "Shape of the vectorised abstract: (768,)\n" 357 | ] 358 | } 359 | ], 360 | "source": [ 361 | "print(f'Shape of the vectorised abstract: {embeddings[0].shape}')" 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": { 367 | "id": "YGV4Je1EVyR_" 368 | }, 369 | "source": [ 370 | "## Vector similarity search with Faiss\n", 371 | "[Faiss](https://github.com/facebookresearch/faiss) is a library for efficient similarity search and clustering of dense vectors. It contains algorithms that search in sets of vectors of any size, even ones that do not fit in RAM. \n", 372 | " \n", 373 | "Faiss is built around the `Index` object which contains, and sometimes preprocesses, the searchable vectors. Faiss has a large collection of [indexes](https://github.com/facebookresearch/faiss/wiki/Faiss-indexes). 
You can even create [composite indexes](https://github.com/facebookresearch/faiss/wiki/Faiss-indexes-(composite)). Faiss handles collections of vectors of a fixed dimensionality d, typically a few 10s to 100s.\n", 374 | "\n", 375 | "**Note**: Faiss uses only 32-bit floating point matrices. This means that you will have to change the data type of the input before building the index.\n", 376 | "\n", 377 | "To learn more about Faiss, you can read their paper on [arXiv](https://arxiv.org/abs/1702.08734).\n", 378 | "\n", 379 | "Here, we will the `IndexFlatL2` index:\n", 380 | "- It's a simple index that performs a brute-force L2 distance search\n", 381 | "- It scales linearly. It will work fine with our data but you might want to try [faster indexes](https://github.com/facebookresearch/faiss/wiki/Faster-search) if you work will millions of vectors.\n", 382 | "\n", 383 | "To create an index with the `misinformation` abstract vectors, we will:\n", 384 | "1. Change the data type of the abstract vectors to float32.\n", 385 | "2. Build an index and pass it the dimension of the vectors it will operate on.\n", 386 | "3. Pass the index to IndexIDMap, an object that enables us to provide a custom list of IDs for the indexed vectors.\n", 387 | "4. Add the abstract vectors and their ID mapping to the index. In our case, we will map vectors to their paper IDs from MAG." 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": 12, 393 | "metadata": { 394 | "colab": { 395 | "base_uri": "https://localhost:8080/" 396 | }, 397 | "id": "8kkUDtwHVyR_", 398 | "outputId": "0f668d02-ef33-4123-ab77-18f72a93ab80" 399 | }, 400 | "outputs": [ 401 | { 402 | "name": "stdout", 403 | "output_type": "stream", 404 | "text": [ 405 | "Number of vectors in the Faiss index: 8430\n" 406 | ] 407 | } 408 | ], 409 | "source": [ 410 | "# Step 1: Change data type\n", 411 | "embeddings = np.array([embedding for embedding in embeddings]).astype(\"float32\")\n", 412 | "\n", 413 | "# Step 2: Instantiate the index\n", 414 | "index = faiss.IndexFlatL2(embeddings.shape[1])\n", 415 | "\n", 416 | "# Step 3: Pass the index to IndexIDMap\n", 417 | "index = faiss.IndexIDMap(index)\n", 418 | "\n", 419 | "# Step 4: Add vectors and their IDs\n", 420 | "index.add_with_ids(embeddings, df.id.values)\n", 421 | "\n", 422 | "print(f\"Number of vectors in the Faiss index: {index.ntotal}\")" 423 | ] 424 | }, 425 | { 426 | "cell_type": "markdown", 427 | "metadata": { 428 | "id": "yt1z-433VySE" 429 | }, 430 | "source": [ 431 | "### Searching the index\n", 432 | "The index we built will perform a k-nearest-neighbour search. We have to provide the number of neighbours to be returned. \n", 433 | "\n", 434 | "Let's query the index with an abstract from our dataset and retrieve the 10 most relevant documents. **The first one must be our query!**\n" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": 13, 440 | "metadata": { 441 | "colab": { 442 | "base_uri": "https://localhost:8080/", 443 | "height": 106 444 | }, 445 | "id": "eEeJt7lYVySN", 446 | "outputId": "771571b3-0200-48b8-f2de-4e8cdbd5ec98" 447 | }, 448 | "outputs": [ 449 | { 450 | "data": { 451 | "application/vnd.google.colaboratory.intrinsic+json": { 452 | "type": "string" 453 | }, 454 | "text/plain": [ 455 | "\"We address the diffusion of information about the COVID-19 with a massive data analysis on Twitter, Instagram, YouTube, Reddit and Gab. 
We analyze engagement and interest in the COVID-19 topic and provide a differential assessment on the evolution of the discourse on a global scale for each platform and their users. We fit information spreading with epidemic models characterizing the basic reproduction number [Formula: see text] for each social media platform. Moreover, we identify information spreading from questionable sources, finding different volumes of misinformation in each platform. However, information from both reliable and questionable sources do not present different spreading patterns. Finally, we provide platform-dependent numerical estimates of rumors' amplification.\"" 456 | ] 457 | }, 458 | "execution_count": 13, 459 | "metadata": { 460 | "tags": [] 461 | }, 462 | "output_type": "execute_result" 463 | } 464 | ], 465 | "source": [ 466 | "# Paper abstract\n", 467 | "df.iloc[5415, 1]" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": 14, 473 | "metadata": { 474 | "colab": { 475 | "base_uri": "https://localhost:8080/" 476 | }, 477 | "id": "BSuRcH85VySQ", 478 | "outputId": "cec93a12-e79c-4f0a-fc15-45048a3469aa" 479 | }, 480 | "outputs": [ 481 | { 482 | "name": "stdout", 483 | "output_type": "stream", 484 | "text": [ 485 | "L2 distance: [0.0, 1.267284631729126, 62.72160339355469, 63.670326232910156, 64.58393859863281, 67.47344970703125, 67.96402740478516, 69.47564697265625, 72.5633544921875, 74.62230682373047]\n", 486 | "\n", 487 | "MAG paper IDs: [3092618151, 3011345566, 3012936764, 3055557295, 3011186656, 3044429417, 3092128270, 3024620668, 3047284882, 3048848247]\n" 488 | ] 489 | } 490 | ], 491 | "source": [ 492 | "# Retrieve the 10 nearest neighbours\n", 493 | "D, I = index.search(np.array([embeddings[5415]]), k=10)\n", 494 | "print(f'L2 distance: {D.flatten().tolist()}\\n\\nMAG paper IDs: {I.flatten().tolist()}')" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": 15, 500 | "metadata": { 501 | "colab": { 502 | "base_uri": "https://localhost:8080/" 503 | }, 504 | "id": "SiO1pa4oVySU", 505 | "outputId": "24c9d1ea-3951-4b4a-e436-c077f3528d7e" 506 | }, 507 | "outputs": [ 508 | { 509 | "data": { 510 | "text/plain": [ 511 | "[['The COVID-19 social media infodemic.'],\n", 512 | " ['The COVID-19 Social Media Infodemic'],\n", 513 | " ['Understanding the perception of COVID-19 policies by mining a multilanguage Twitter dataset'],\n", 514 | " ['Covid-19 infodemic reveals new tipping point epidemiology and a revised R formula.'],\n", 515 | " ['Coronavirus Goes Viral: Quantifying the COVID-19 Misinformation Epidemic on Twitter'],\n", 516 | " ['Effects of misinformation on COVID-19 individual responses and recommendations for resilience of disastrous consequences of misinformation'],\n", 517 | " ['Analysis of online misinformation during the peak of the COVID-19 pandemics in Italy'],\n", 518 | " ['Quantifying COVID-19 Content in the Online Health Opinion War Using Machine Learning'],\n", 519 | " ['Global Infodemiology of COVID-19: Analysis of Google Web Searches and Instagram Hashtags.'],\n", 520 | " ['COVID-19-Related Infodemic and Its Impact on Public Health: A Global Social Media Analysis.']]" 521 | ] 522 | }, 523 | "execution_count": 15, 524 | "metadata": { 525 | "tags": [] 526 | }, 527 | "output_type": "execute_result" 528 | } 529 | ], 530 | "source": [ 531 | "# Fetch the paper titles based on their index\n", 532 | "id2details(df, I, 'original_title')" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": 16, 538 | "metadata": { 539 | 
"colab": { 540 | "base_uri": "https://localhost:8080/" 541 | }, 542 | "id": "p29pEtGrWUMV", 543 | "outputId": "b6448614-0a6a-4673-c841-2bfb0340607e" 544 | }, 545 | "outputs": [ 546 | { 547 | "data": { 548 | "text/plain": [ 549 | "[[\"We address the diffusion of information about the COVID-19 with a massive data analysis on Twitter, Instagram, YouTube, Reddit and Gab. We analyze engagement and interest in the COVID-19 topic and provide a differential assessment on the evolution of the discourse on a global scale for each platform and their users. We fit information spreading with epidemic models characterizing the basic reproduction number [Formula: see text] for each social media platform. Moreover, we identify information spreading from questionable sources, finding different volumes of misinformation in each platform. However, information from both reliable and questionable sources do not present different spreading patterns. Finally, we provide platform-dependent numerical estimates of rumors' amplification.\"],\n", 550 | " [\"We address the diffusion of information about the COVID-19 with a massive data analysis on Twitter, Instagram, YouTube, Reddit and Gab. We analyze engagement and interest in the COVID-19 topic and provide a differential assessment on the evolution of the discourse on a global scale for each platform and their users. We fit information spreading with epidemic models characterizing the basic reproduction numbers $R_0$ for each social media platform. Moreover, we characterize information spreading from questionable sources, finding different volumes of misinformation in each platform. However, information from both reliable and questionable sources do not present different spreading patterns. Finally, we provide platform-dependent numerical estimates of rumors' amplification.\"],\n", 551 | " ['The objective of this work is to explore popular discourse about the COVID-19 pandemic and policies implemented to manage it. Using Natural Language Processing, Text Mining, and Network Analysis to analyze corpus of tweets that relate to the COVID-19 pandemic, we identify common responses to the pandemic and how these responses differ across time. Moreover, insights as to how information and misinformation were transmitted via Twitter, starting at the early stages of this pandemic, are presented. Finally, this work introduces a dataset of tweets collected from all over the world, in multiple languages, dating back to January 22nd, when the total cases of reported COVID-19 were below 600 worldwide. The insights presented in this work could help inform decision makers in the face of future pandemics, and the dataset introduced can be used to acquire valuable knowledge to help mitigate the COVID-19 pandemic.'],\n", 552 | " [\"Many governments have managed to control their COVID-19 outbreak with a simple message: keep the effective '$R$ number' $R<1$ to prevent widespread contagion and flatten the curve. This raises the question whether a similar policy could control dangerous online 'infodemics' of information, misinformation and disinformation. Here we show, using multi-platform data from the COVID-19 infodemic, that its online spreading instead encompasses a different dynamical regime where communities and users within and across independent platforms, sporadically form temporary active links on similar timescales to the viral spreading. This allows material that might have died out, to evolve and even mutate. 
This has enabled niche networks that were already successfully spreading hate and anti-vaccination material, to rapidly become global super-spreaders of narratives featuring fake COVID-19 treatments, anti-Asian sentiment and conspiracy theories. We derive new tools that incorporate these coupled social-viral dynamics, including an online $R$, to help prevent infodemic spreading at all scales: from spreading across platforms (e.g. Facebook, 4Chan) to spreading within a given subpopulation, or community, or topic. By accounting for similar social and viral timescales, the same mathematical theory also offers a quantitative description of other unconventional infection profiles such as rumors spreading in financial markets and colds spreading in schools.\"],\n", 553 | " ['Background Since the beginning of the coronavirus disease 2019 (COVID-19) epidemic, misinformation has been spreading\\xa0uninhibited\\xa0over traditional and social media at a rapid pace. We sought to analyze the magnitude of misinformation that is being spread on Twitter\\xa0(Twitter, Inc., San Francisco, CA) regarding the coronavirus epidemic.\\xa0 Materials and methods We conducted a search on Twitter using 14 different trending hashtags and keywords related to the COVID-19 epidemic. We then summarized and assessed individual tweets for misinformation in comparison to verified and peer-reviewed resources. Descriptive statistics were used to compare\\xa0terms and hashtags, and to identify individual tweets and account characteristics. Results The study included 673 tweets. Most tweets were posted by informal individuals/groups (66%), and 129 (19.2%) belonged to verified Twitter accounts. The majority of included tweets contained serious content (91.2%); 548 tweets (81.4%) included genuine information pertaining to the COVID-19 epidemic. Around 70% of the tweets tackled medical/public health information, while the others were pertaining to sociopolitical and financial factors. In total, 153 tweets (24.8%) included misinformation, and 107 (17.4%) included unverifiable information regarding the COVID-19 epidemic. The rate of misinformation was higher among informal individual/group accounts (33.8%, p: <0.001). Tweets from unverified Twitter accounts contained more misinformation (31.0% vs 12.6% for verified accounts, p: <0.001). Tweets from healthcare/public health accounts had the lowest rate of unverifiable information (12.3%, p: 0.04). The number of likes and retweets per tweet was not associated with a difference in either false or unverifiable content. The keyword \"COVID-19\" had the lowest rate of misinformation and unverifiable information, while the keywords \"#2019_ncov\" and \"Corona\" were associated with the highest amount of misinformation and unverifiable content respectively. Conclusions Medical misinformation and unverifiable content pertaining to the global COVID-19 epidemic are being propagated at an alarming rate on social media. We provide an early quantification of the magnitude of misinformation spread and highlight the importance of early interventions in order to curb this phenomenon that endangers public safety at a time when awareness and appropriate preventive actions are paramount.'],\n", 554 | " ['Abstract The proliferation of misinformation on social media platforms is faster than the spread of Corona Virus Diseases (COVID-19) and it can generate hefty deleterious consequences on health amid a disaster like COVID-19. 
Drawing upon research on the stimulus-response theory (hypodermic needle theory) and the resilience theory, this study tested a conceptual framework considering general misinformation belief, conspiracy belief, and religious misinformation belief as the stimulus; and credibility evaluations as resilience strategy; and their effects on COVID-19 individual responses. Using a self-administered online survey during the COVID-19 pandemic, the study obtained 483 useable responses and after test, finds that all-inclusive, the propagation of misinformation on social media undermines the COVID-19 individual responses. Particularly, credibility evaluation of misinformation strongly predicts the COVID-19 individual responses with positive influences and religious misinformation beliefs as well as conspiracy beliefs and general misinformation beliefs come next and influence negatively. The findings and general recommendations will help the public, in general, to be cautious about misinformation, and the respective authority of a country, in particular, for initiating proper safety measures about disastrous misinformation to protect the public health from being exploited.'],\n", 555 | " ['During the Covid-19 pandemics, we also experience another dangerous pandemics based on misinformation. Narratives disconnected from fact-checking on the origin and cure of the disease intertwined with pre-existing political fights. We collect a database on Twitter posts and analyse the topology of the networks of retweeters (users broadcasting again the same elementary piece of information, or tweet) and validate its structure with methods of statistical physics of networks. Furthermore, by using commonly available fact checking software, we assess the reputation of the pieces of news exchanged. By using a combination of theoretical and practical weapons, we are able to track down the flow of misinformation in a snapshot of the Twitter ecosystem. Thanks to the presence of verified users, we can also assign a polarization to the network nodes (users) and see the impact of low-quality information producers and spreaders in the Twitter ecosystem.'],\n", 556 | " ['A huge amount of potentially dangerous COVID-19 misinformation is appearing online. Here we use machine learning to quantify COVID-19 content among online opponents of establishment health guidance, in particular vaccinations (“anti-vax”). We find that the anti-vax community is developing a less focused debate around COVID-19 than its counterpart, the pro-vaccination (“pro-vax”) community. However, the anti-vax community exhibits a broader range of “flavors” of COVID-19 topics, and hence can appeal to a broader cross-section of individuals seeking COVID-19 guidance online, e.g. individuals wary of a mandatory fast-tracked COVID-19 vaccine or those seeking alternative remedies. Hence the anti-vax community looks better positioned to attract fresh support going forward than the pro-vax community. This is concerning since a widespread lack of adoption of a COVID-19 vaccine will mean the world falls short of providing herd immunity, leaving countries open to future COVID-19 resurgences. We provide a mechanistic model that interprets these results and could help in assessing the likely efficacy of intervention strategies. 
Our approach is scalable and hence tackles the urgent problem facing social media platforms of having to analyze huge volumes of online health misinformation and disinformation.'],\n", 557 | " ['BACKGROUND: Although \"infodemiological\" methods have been used in research on coronavirus disease (COVID-19), an examination of the extent of infodemic moniker (misinformation) use on the internet remains limited. OBJECTIVE: The aim of this paper is to investigate internet search behaviors related to COVID-19 and examine the circulation of infodemic monikers through two platforms-Google and Instagram-during the current global pandemic. METHODS: We have defined infodemic moniker as a term, query, hashtag, or phrase that generates or feeds fake news, misinterpretations, or discriminatory phenomena. Using Google Trends and Instagram hashtags, we explored internet search activities and behaviors related to the COVID-19 pandemic from February 20, 2020, to May 6, 2020. We investigated the names used to identify the virus, health and risk perception, life during the lockdown, and information related to the adoption of COVID-19 infodemic monikers. We computed the average peak volume with a 95% CI for the monikers. RESULTS: The top six COVID-19-related terms searched in Google were \"coronavirus,\" \"corona,\" \"COVID,\" \"virus,\" \"corona virus,\" and \"COVID-19.\" Countries with a higher number of COVID-19 cases had a higher number of COVID-19 queries on Google. The monikers \"coronavirus ozone,\" \"coronavirus laboratory,\" \"coronavirus 5G,\" \"coronavirus conspiracy,\" and \"coronavirus bill gates\" were widely circulated on the internet. Searches on \"tips and cures\" for COVID-19 spiked in relation to the US president speculating about a \"miracle cure\" and suggesting an injection of disinfectant to treat the virus. Around two thirds (n=48,700,000, 66.1%) of Instagram users used the hashtags \"COVID-19\" and \"coronavirus\" to disperse virus-related information. CONCLUSIONS: Globally, there is a growing interest in COVID-19, and numerous infodemic monikers continue to circulate on the internet. Based on our findings, we hope to encourage mass media regulators and health organizers to be vigilant and diminish the use and circulation of these infodemic monikers to decrease the spread of misinformation.'],\n", 558 | " ['Infodemics, often including rumors, stigma, and conspiracy theories, have been common during the COVID-19 pandemic. Monitoring social media data has been identified as the best method for tracking rumors in real time and as a possible way to dispel misinformation and reduce stigma. However, the detection, assessment, and response to rumors, stigma, and conspiracy theories in real time are a challenge. Therefore, we followed and examined COVID-19-related rumors, stigma, and conspiracy theories circulating on online platforms, including fact-checking agency websites, Facebook, Twitter, and online newspapers, and their impacts on public health. Information was extracted between December 31, 2019 and April 5, 2020, and descriptively analyzed. We performed a content analysis of the news articles to compare and contrast data collected from other sources. We identified 2,311 reports of rumors, stigma, and conspiracy theories in 25 languages from 87 countries. Claims were related to illness, transmission and mortality (24%), control measures (21%), treatment and cure (19%), cause of disease including the origin (15%), violence (1%), and miscellaneous (20%). 
Of the 2,276 reports for which text ratings were available, 1,856 claims were false (82%). Misinformation fueled by rumors, stigma, and conspiracy theories can have potentially serious implications on the individual and community if prioritized over evidence-based guidelines. Health agencies must track misinformation associated with the COVID-19 in real time, and engage local communities and government stakeholders to debunk misinformation.']]" 559 | ] 560 | }, 561 | "execution_count": 16, 562 | "metadata": { 563 | "tags": [] 564 | }, 565 | "output_type": "execute_result" 566 | } 567 | ], 568 | "source": [ 569 | "# Fetch the paper abstracts based on their index\n", 570 | "id2details(df, I, 'abstract')" 571 | ] 572 | }, 573 | { 574 | "cell_type": "markdown", 575 | "metadata": { 576 | "id": "gFKvRb4QY-DL" 577 | }, 578 | "source": [ 579 | "\n", 580 | "## Putting all together\n", 581 | "\n", 582 | "So far, we've built a Faiss index using the misinformation abstract vectors we encoded with a sentence-DistilBERT model. That's helpful but in a real case scenario, we would have to work with unseen data. To query the index with an unseen query and retrieve its most relevant documents, we would have to do the following:\n", 583 | "\n", 584 | "1. Encode the query with the same sentence-DistilBERT model we used for the rest of the abstract vectors.\n", 585 | "2. Change its data type to float32.\n", 586 | "3. Search the index with the encoded query.\n", 587 | "\n", 588 | "Here, we will use the introduction of an article published on [HKS Misinformation Review](https://misinforeview.hks.harvard.edu/article/can-whatsapp-benefit-from-debunked-fact-checked-stories-to-reduce-misinformation/).\n" 589 | ] 590 | }, 591 | { 592 | "cell_type": "code", 593 | "execution_count": 17, 594 | "metadata": { 595 | "id": "iDhftkrhX99T" 596 | }, 597 | "outputs": [], 598 | "source": [ 599 | "user_query = \"\"\"\n", 600 | "WhatsApp was alleged to have been widely used to spread misinformation and propaganda \n", 601 | "during the 2018 elections in Brazil and the 2019 elections in India. Due to the \n", 602 | "private encrypted nature of the messages on WhatsApp, it is hard to track the dissemination \n", 603 | "of misinformation at scale. In this work, using public WhatsApp data from Brazil and India, we \n", 604 | "observe that misinformation has been largely shared on WhatsApp public groups even after they \n", 605 | "were already fact-checked by popular fact-checking agencies. This represents a significant portion \n", 606 | "of misinformation spread in both Brazil and India in the groups analyzed. We posit that such \n", 607 | "misinformation content could be prevented if WhatsApp had a means to flag already fact-checked \n", 608 | "content. To this end, we propose an architecture that could be implemented by WhatsApp to counter \n", 609 | "such misinformation. 
Our proposal respects the current end-to-end encryption architecture on WhatsApp, \n", 610 | "thus protecting users’ privacy while providing an approach to detect the misinformation that benefits \n", 611 | "from fact-checking efforts.\n", 612 | "\"\"\"" 613 | ] 614 | }, 615 | { 616 | "cell_type": "code", 617 | "execution_count": 18, 618 | "metadata": { 619 | "colab": { 620 | "base_uri": "https://localhost:8080/" 621 | }, 622 | "id": "6AFhbGnWZpWN", 623 | "outputId": "b8a02af0-2f0d-4740-984a-e804405b3e6a" 624 | }, 625 | "outputs": [ 626 | { 627 | "name": "stdout", 628 | "output_type": "stream", 629 | "text": [ 630 | "L2 distance: [7.384466171264648, 57.3224983215332, 57.3224983215332, 71.48453521728516, 72.06803131103516, 79.13472747802734, 86.0128173828125, 89.91024780273438, 90.76014709472656, 90.76422119140625]\n", 631 | "\n", 632 | "MAG paper IDs: [3047438096, 3021927925, 3037966274, 2889959140, 2791045616, 2943077655, 3014380170, 2967434249, 3028584171, 2990343632]\n" 633 | ] 634 | } 635 | ], 636 | "source": [ 637 | "# For convenience, I've wrapped all steps in the vector_search function.\n", 638 | "# It takes four arguments: \n", 639 | "# A query, the sentence-level transformer, the Faiss index and the number of requested results\n", 640 | "D, I = vector_search([user_query], model, index, num_results=10)\n", 641 | "print(f'L2 distance: {D.flatten().tolist()}\\n\\nMAG paper IDs: {I.flatten().tolist()}')" 642 | ] 643 | }, 644 | { 645 | "cell_type": "code", 646 | "execution_count": 19, 647 | "metadata": { 648 | "colab": { 649 | "base_uri": "https://localhost:8080/" 650 | }, 651 | "id": "tbanjBhBZtWZ", 652 | "outputId": "9a5d6d97-8983-4253-8095-b7d899e33ac8" 653 | }, 654 | "outputs": [ 655 | { 656 | "data": { 657 | "text/plain": [ 658 | "[['Can WhatsApp Benefit from Debunked Fact-Checked Stories to Reduce Misinformation?'],\n", 659 | " ['A Dataset of Fact-Checked Images Shared on WhatsApp During the Brazilian and Indian Elections'],\n", 660 | " ['A Dataset of Fact-Checked Images Shared on WhatsApp During the Brazilian and Indian Elections'],\n", 661 | " ['A System for Monitoring Public Political Groups in WhatsApp'],\n", 662 | " ['Politics of Fake News: How WhatsApp Became a Potent Propaganda Tool in India'],\n", 663 | " ['Characterizing Attention Cascades in WhatsApp Groups'],\n", 664 | " ['OS IMPACTOS JURÍDICOS E SOCIAIS DAS FAKE NEWS EM TERRITÓRIO BRASILEIRO'],\n", 665 | " ['Fake News and Social Media: Indian Perspective'],\n", 666 | " ['Images and Misinformation in Political Groups: Evidence from WhatsApp in India'],\n", 667 | " ['Can WhatsApp Counter Misinformation by Limiting Message Forwarding']]" 668 | ] 669 | }, 670 | "execution_count": 19, 671 | "metadata": { 672 | "tags": [] 673 | }, 674 | "output_type": "execute_result" 675 | } 676 | ], 677 | "source": [ 678 | "# Fetching the paper titles based on their index\n", 679 | "id2details(df, I, 'original_title')" 680 | ] 681 | }, 682 | { 683 | "cell_type": "code", 684 | "execution_count": 21, 685 | "metadata": { 686 | "colab": { 687 | "base_uri": "https://localhost:8080/" 688 | }, 689 | "id": "rbxFKF-DZxg0", 690 | "outputId": "d78cdc03-41f6-469d-bf67-8f32001a7415" 691 | }, 692 | "outputs": [ 693 | { 694 | "name": "stdout", 695 | "output_type": "stream", 696 | "text": [ 697 | "/content/vector_engine\n" 698 | ] 699 | } 700 | ], 701 | "source": [ 702 | "# Define project base directory\n", 703 | "# Change the index from 1 to 0 if you run this on Google Colab\n", 704 | "project_dir = Path('notebooks').resolve().parents[1]\n", 705 | 
"print(project_dir)\n", 706 | "\n", 707 | "# Serialise index and store it as a pickle\n", 708 | "with open(f\"{project_dir}/models/faiss_index.pickle\", \"wb\") as h:\n", 709 | " pickle.dump(faiss.serialize_index(index), h)" 710 | ] 711 | }, 712 | { 713 | "cell_type": "code", 714 | "execution_count": null, 715 | "metadata": { 716 | "id": "AtSC6oDDjtMA" 717 | }, 718 | "outputs": [], 719 | "source": [] 720 | } 721 | ], 722 | "metadata": { 723 | "accelerator": "GPU", 724 | "colab": { 725 | "name": "001-vector-search.ipynb", 726 | "provenance": [], 727 | "toc_visible": true 728 | }, 729 | "kernelspec": { 730 | "display_name": "Python [conda env:myenv]", 731 | "language": "python", 732 | "name": "conda-env-myenv-py" 733 | }, 734 | "language_info": { 735 | "codemirror_mode": { 736 | "name": "ipython", 737 | "version": 3 738 | }, 739 | "file_extension": ".py", 740 | "mimetype": "text/x-python", 741 | "name": "python", 742 | "nbconvert_exporter": "python", 743 | "pygments_lexer": "ipython3", 744 | "version": "3.7.6" 745 | }, 746 | "widgets": { 747 | "application/vnd.jupyter.widget-state+json": { 748 | "15c5bbc768db41e4bc06c9405539091c": { 749 | "model_module": "@jupyter-widgets/base", 750 | "model_name": "LayoutModel", 751 | "state": { 752 | "_model_module": "@jupyter-widgets/base", 753 | "_model_module_version": "1.2.0", 754 | "_model_name": "LayoutModel", 755 | "_view_count": null, 756 | "_view_module": "@jupyter-widgets/base", 757 | "_view_module_version": "1.2.0", 758 | "_view_name": "LayoutView", 759 | "align_content": null, 760 | "align_items": null, 761 | "align_self": null, 762 | "border": null, 763 | "bottom": null, 764 | "display": null, 765 | "flex": null, 766 | "flex_flow": null, 767 | "grid_area": null, 768 | "grid_auto_columns": null, 769 | "grid_auto_flow": null, 770 | "grid_auto_rows": null, 771 | "grid_column": null, 772 | "grid_gap": null, 773 | "grid_row": null, 774 | "grid_template_areas": null, 775 | "grid_template_columns": null, 776 | "grid_template_rows": null, 777 | "height": null, 778 | "justify_content": null, 779 | "justify_items": null, 780 | "left": null, 781 | "margin": null, 782 | "max_height": null, 783 | "max_width": null, 784 | "min_height": null, 785 | "min_width": null, 786 | "object_fit": null, 787 | "object_position": null, 788 | "order": null, 789 | "overflow": null, 790 | "overflow_x": null, 791 | "overflow_y": null, 792 | "padding": null, 793 | "right": null, 794 | "top": null, 795 | "visibility": null, 796 | "width": null 797 | } 798 | }, 799 | "1e1408cddd284bc4a4b5484be0561991": { 800 | "model_module": "@jupyter-widgets/controls", 801 | "model_name": "DescriptionStyleModel", 802 | "state": { 803 | "_model_module": "@jupyter-widgets/controls", 804 | "_model_module_version": "1.5.0", 805 | "_model_name": "DescriptionStyleModel", 806 | "_view_count": null, 807 | "_view_module": "@jupyter-widgets/base", 808 | "_view_module_version": "1.2.0", 809 | "_view_name": "StyleView", 810 | "description_width": "" 811 | } 812 | }, 813 | "3e9c24c8488b431fb88a0d045d85b700": { 814 | "model_module": "@jupyter-widgets/controls", 815 | "model_name": "FloatProgressModel", 816 | "state": { 817 | "_dom_classes": [], 818 | "_model_module": "@jupyter-widgets/controls", 819 | "_model_module_version": "1.5.0", 820 | "_model_name": "FloatProgressModel", 821 | "_view_count": null, 822 | "_view_module": "@jupyter-widgets/controls", 823 | "_view_module_version": "1.5.0", 824 | "_view_name": "ProgressView", 825 | "bar_style": "success", 826 | "description": "Batches: 100%", 827 | 
"description_tooltip": null, 828 | "layout": "IPY_MODEL_15c5bbc768db41e4bc06c9405539091c", 829 | "max": 264, 830 | "min": 0, 831 | "orientation": "horizontal", 832 | "style": "IPY_MODEL_f9c22552944b4e2dafe1f98170665293", 833 | "value": 264 834 | } 835 | }, 836 | "7a7e927567024c578b33b15000d5e531": { 837 | "model_module": "@jupyter-widgets/controls", 838 | "model_name": "HBoxModel", 839 | "state": { 840 | "_dom_classes": [], 841 | "_model_module": "@jupyter-widgets/controls", 842 | "_model_module_version": "1.5.0", 843 | "_model_name": "HBoxModel", 844 | "_view_count": null, 845 | "_view_module": "@jupyter-widgets/controls", 846 | "_view_module_version": "1.5.0", 847 | "_view_name": "HBoxView", 848 | "box_style": "", 849 | "children": [ 850 | "IPY_MODEL_3e9c24c8488b431fb88a0d045d85b700", 851 | "IPY_MODEL_7aaf8a423e7f48bbac24d6970ed4dfe9" 852 | ], 853 | "layout": "IPY_MODEL_da5011a6565e45e2b5d0b37634498648" 854 | } 855 | }, 856 | "7aaf8a423e7f48bbac24d6970ed4dfe9": { 857 | "model_module": "@jupyter-widgets/controls", 858 | "model_name": "HTMLModel", 859 | "state": { 860 | "_dom_classes": [], 861 | "_model_module": "@jupyter-widgets/controls", 862 | "_model_module_version": "1.5.0", 863 | "_model_name": "HTMLModel", 864 | "_view_count": null, 865 | "_view_module": "@jupyter-widgets/controls", 866 | "_view_module_version": "1.5.0", 867 | "_view_name": "HTMLView", 868 | "description": "", 869 | "description_tooltip": null, 870 | "layout": "IPY_MODEL_7f1cd9dbb4b742ab8d78e073fb9b0f90", 871 | "placeholder": "​", 872 | "style": "IPY_MODEL_1e1408cddd284bc4a4b5484be0561991", 873 | "value": " 264/264 [00:31<00:00, 8.30it/s]" 874 | } 875 | }, 876 | "7f1cd9dbb4b742ab8d78e073fb9b0f90": { 877 | "model_module": "@jupyter-widgets/base", 878 | "model_name": "LayoutModel", 879 | "state": { 880 | "_model_module": "@jupyter-widgets/base", 881 | "_model_module_version": "1.2.0", 882 | "_model_name": "LayoutModel", 883 | "_view_count": null, 884 | "_view_module": "@jupyter-widgets/base", 885 | "_view_module_version": "1.2.0", 886 | "_view_name": "LayoutView", 887 | "align_content": null, 888 | "align_items": null, 889 | "align_self": null, 890 | "border": null, 891 | "bottom": null, 892 | "display": null, 893 | "flex": null, 894 | "flex_flow": null, 895 | "grid_area": null, 896 | "grid_auto_columns": null, 897 | "grid_auto_flow": null, 898 | "grid_auto_rows": null, 899 | "grid_column": null, 900 | "grid_gap": null, 901 | "grid_row": null, 902 | "grid_template_areas": null, 903 | "grid_template_columns": null, 904 | "grid_template_rows": null, 905 | "height": null, 906 | "justify_content": null, 907 | "justify_items": null, 908 | "left": null, 909 | "margin": null, 910 | "max_height": null, 911 | "max_width": null, 912 | "min_height": null, 913 | "min_width": null, 914 | "object_fit": null, 915 | "object_position": null, 916 | "order": null, 917 | "overflow": null, 918 | "overflow_x": null, 919 | "overflow_y": null, 920 | "padding": null, 921 | "right": null, 922 | "top": null, 923 | "visibility": null, 924 | "width": null 925 | } 926 | }, 927 | "da5011a6565e45e2b5d0b37634498648": { 928 | "model_module": "@jupyter-widgets/base", 929 | "model_name": "LayoutModel", 930 | "state": { 931 | "_model_module": "@jupyter-widgets/base", 932 | "_model_module_version": "1.2.0", 933 | "_model_name": "LayoutModel", 934 | "_view_count": null, 935 | "_view_module": "@jupyter-widgets/base", 936 | "_view_module_version": "1.2.0", 937 | "_view_name": "LayoutView", 938 | "align_content": null, 939 | "align_items": null, 940 | 
"align_self": null, 941 | "border": null, 942 | "bottom": null, 943 | "display": null, 944 | "flex": null, 945 | "flex_flow": null, 946 | "grid_area": null, 947 | "grid_auto_columns": null, 948 | "grid_auto_flow": null, 949 | "grid_auto_rows": null, 950 | "grid_column": null, 951 | "grid_gap": null, 952 | "grid_row": null, 953 | "grid_template_areas": null, 954 | "grid_template_columns": null, 955 | "grid_template_rows": null, 956 | "height": null, 957 | "justify_content": null, 958 | "justify_items": null, 959 | "left": null, 960 | "margin": null, 961 | "max_height": null, 962 | "max_width": null, 963 | "min_height": null, 964 | "min_width": null, 965 | "object_fit": null, 966 | "object_position": null, 967 | "order": null, 968 | "overflow": null, 969 | "overflow_x": null, 970 | "overflow_y": null, 971 | "padding": null, 972 | "right": null, 973 | "top": null, 974 | "visibility": null, 975 | "width": null 976 | } 977 | }, 978 | "f9c22552944b4e2dafe1f98170665293": { 979 | "model_module": "@jupyter-widgets/controls", 980 | "model_name": "ProgressStyleModel", 981 | "state": { 982 | "_model_module": "@jupyter-widgets/controls", 983 | "_model_module_version": "1.5.0", 984 | "_model_name": "ProgressStyleModel", 985 | "_view_count": null, 986 | "_view_module": "@jupyter-widgets/base", 987 | "_view_module_version": "1.2.0", 988 | "_view_name": "StyleView", 989 | "bar_color": null, 990 | "description_width": "initial" 991 | } 992 | } 993 | } 994 | } 995 | }, 996 | "nbformat": 4, 997 | "nbformat_minor": 4 998 | } 999 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch==1.8.1 2 | transformers==3.3.1 3 | sentence-transformers==0.3.8 4 | pandas==1.1.2 5 | faiss-cpu==1.6.1 6 | numpy==1.19.2 7 | folium==0.2.1 8 | streamlit==0.62.0 9 | -e . 
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from setuptools import find_namespace_packages 3 | 4 | common_kwargs = dict( 5 | version="0.1.0", 6 | license="MIT", 7 | # install_requires=required, 8 | long_description=open("README.md").read(), 9 | url="https://github.com/kstathou/vector_engine", 10 | author="Kostas Stathoulopoulos", 11 | author_email="k.stathou@gmail.com", 12 | classifiers=[ 13 | "Intended Audience :: Developers", 14 | "Intended Audience :: Science/Research", 15 | "License :: OSI Approved :: MIT License", 16 | "Natural Language :: English", 17 | "Operating System :: OS Independent", 18 | "Programming Language :: Python :: 3.7", 19 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 20 | ], 21 | python_requires=">3.6", 22 | include_package_data=False, 23 | ) 24 | 25 | setup( 26 | name="vector_engine", 27 | packages=find_namespace_packages(where="vector_engine.*"), 28 | **common_kwargs 29 | ) 30 | -------------------------------------------------------------------------------- /vector_engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kstathou/vector_engine/2ae08673e89d3a1fe5170e14b9bea5d5e509be7d/vector_engine/__init__.py -------------------------------------------------------------------------------- /vector_engine/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def vector_search(query, model, index, num_results=10): 5 | """Transforms the query into a vector using a pretrained, sentence-level 6 | DistilBERT model and finds similar vectors using FAISS. 7 | Args: 8 | query (list of str): User query (wrapped in a list) that should be more than a sentence long. 9 | model (sentence_transformers.SentenceTransformer.SentenceTransformer) 10 | index (faiss.Index): Deserialized FAISS index. 11 | num_results (int): Number of results to return. 12 | Returns: 13 | D (:obj:`numpy.array` of `float`): Distances between the query and the results. 14 | I (:obj:`numpy.array` of `int`): Paper IDs of the results. 15 | 16 | """ 17 | vector = model.encode(list(query)) 18 | D, I = index.search(np.array(vector).astype("float32"), k=num_results) 19 | return D, I 20 | 21 | 22 | def id2details(df, I, column): 23 | """Returns the values of `column` for the paper IDs in I.""" 24 | return [list(df[df.id == idx][column]) for idx in I[0]] 25 | --------------------------------------------------------------------------------
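
The snippet below is a minimal usage sketch and is not part of the repository: it ties together `vector_search` and `id2details` outside of Streamlit, mirroring what `app.py` does inside its cached loaders. It assumes the pickled index at `models/faiss_index.pickle`, the CSV at `data/misinformation_papers.csv` and the `distilbert-base-nli-stsb-mean-tokens` model are available locally; the query string is the same illustrative default used in `app.py`.

``` python
import pickle

import faiss
import pandas as pd
from sentence_transformers import SentenceTransformer

from vector_engine.utils import vector_search, id2details

# Load the papers and the serialized Faiss index shipped with the repo.
df = pd.read_csv("data/misinformation_papers.csv")
with open("models/faiss_index.pickle", "rb") as handle:
    index = faiss.deserialize_index(pickle.load(handle))

# Encode queries with the same sentence-level DistilBERT model used to build the index.
model = SentenceTransformer("distilbert-base-nli-stsb-mean-tokens")

# vector_search expects the query wrapped in a list; it returns L2 distances and MAG paper IDs.
D, I = vector_search(["covid-19 misinformation and social media"], model, index, num_results=5)

# Map the returned paper IDs back to their titles.
for title in id2details(df, I, "original_title"):
    print(title)
```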