├── assets ├── app_url.png ├── indexing.png ├── var_env.png ├── cloud_run.png ├── invocations.png ├── port_value.png ├── push_image.png ├── rag-upstash.gif ├── deployed_app.png └── container_registry_region.png ├── justfile ├── src ├── prompts.py ├── callbacks.py ├── paperswithcode.py ├── rag.py ├── app.py ├── index_papers.py └── upstash.py ├── pyproject.toml ├── Dockerfile ├── README.md └── .gitignore /assets/app_url.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedbesbes/chat-with-papers/HEAD/assets/app_url.png -------------------------------------------------------------------------------- /assets/indexing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedbesbes/chat-with-papers/HEAD/assets/indexing.png -------------------------------------------------------------------------------- /assets/var_env.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedbesbes/chat-with-papers/HEAD/assets/var_env.png -------------------------------------------------------------------------------- /justfile: -------------------------------------------------------------------------------- 1 | run-app: 2 | @poetry run python -m streamlit run src/app.py --theme.primaryColor "#135aaf" -------------------------------------------------------------------------------- /assets/cloud_run.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedbesbes/chat-with-papers/HEAD/assets/cloud_run.png -------------------------------------------------------------------------------- /assets/invocations.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedbesbes/chat-with-papers/HEAD/assets/invocations.png 
-------------------------------------------------------------------------------- /assets/port_value.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedbesbes/chat-with-papers/HEAD/assets/port_value.png -------------------------------------------------------------------------------- /assets/push_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedbesbes/chat-with-papers/HEAD/assets/push_image.png -------------------------------------------------------------------------------- /assets/rag-upstash.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedbesbes/chat-with-papers/HEAD/assets/rag-upstash.gif -------------------------------------------------------------------------------- /assets/deployed_app.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedbesbes/chat-with-papers/HEAD/assets/deployed_app.png -------------------------------------------------------------------------------- /assets/container_registry_region.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedbesbes/chat-with-papers/HEAD/assets/container_registry_region.png -------------------------------------------------------------------------------- /src/prompts.py: -------------------------------------------------------------------------------- 1 | RAG_PROMPT_TEMPLATE = """ 2 | Your task is to answer questions by using a given context. 3 | 4 | Don't invent anything that is outside of the context. 5 | Answer in at least 350 characters. 6 | 7 | %CONTEXT% 8 | {context} 9 | 10 | %Question% 11 | {question} 12 | 13 | Hint: Do not copy the context. 
def extract_papers(query: str):
    """Download every Papers With Code entry matching ``query``.

    The first request returns the total hit ``count`` together with the first
    page of ``results``; the remaining pages are then fetched one by one and
    concatenated in page order.

    Args:
        query: Free-text search string. It is percent-encoded before being
            placed in the URL.

    Returns:
        A list with the ``results`` entries of every fetched page.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
    """
    # NOTE(review): assumes the API serves 50 results per page (the constant
    # the original pagination arithmetic used) -- confirm against the API.
    page_size = 50
    base_url = "https://paperswithcode.com/api/v1/papers/"
    quoted_query = urllib.parse.quote(query)

    response = requests.get(f"{base_url}?q={quoted_query}")
    response.raise_for_status()
    payload = response.json()
    results = list(payload["results"])

    # Ceiling division: a trailing partial page is still a page. The previous
    # `count // 50` combined with an exclusive `range` upper bound dropped the
    # last full page and any partial page (e.g. 120 hits fetched page 1 only).
    num_pages = -(-payload["count"] // page_size)
    for page in tqdm(range(2, num_pages + 1)):
        response = requests.get(f"{base_url}?page={page}&q={quoted_query}")
        response.raise_for_status()
        results += response.json()["results"]
    return results
class RAG:
    """Retrieval-augmented generation pipeline backed by an Upstash index.

    A query is embedded and matched against the vector store, the retrieved
    chunks are stitched into a context string, and a streaming chat model is
    prompted with that context plus the question.
    """

    def __init__(self, chat_box, embeddings):
        """Wire up the LLM, the Upstash index and the vector store.

        Args:
            chat_box: UI element exposing ``container().empty()``; the
                generated answer is streamed into it.
            embeddings: Embedding model handed to the vector store.
        """
        self.chat_box = chat_box
        # The LLM only depends on the chat box, so it is created right away.
        self.set_llm()
        self.embeddings = embeddings

        # Credentials come from the environment (a .env file is loaded at
        # import time by the module).
        upstash_url = os.environ.get("UPSTASH_URL")
        upstash_token = os.environ.get("UPSTASH_TOKEN")
        self.index = Index(url=upstash_url, token=upstash_token)
        self.vectorstore = UpstashVectorStore(self.index, self.embeddings)

    def set_llm(self):
        """Create the streaming chat model and store it on ``self.llm``."""
        placeholder = self.chat_box.container().empty()
        handler = StreamHandler(placeholder)
        self.llm = ChatOpenAI(
            max_tokens=400,
            streaming=True,
            callbacks=[handler],
        )

    def get_context(self, query):
        """Return ``(context, results)`` for ``query``.

        ``results`` is whatever the vector store's
        ``similarity_search_with_score`` returns (iterated as
        ``(document, score)`` pairs); ``context`` is every document's
        ``page_content`` followed by a ``===`` separator line.
        """
        results = self.vectorstore.similarity_search_with_score(query)
        context = "".join(doc.page_content + "\n===\n" for doc, _ in results)
        return context, results

    @staticmethod
    def get_prompt(question, context):
        """Fill the RAG prompt template with ``question`` and ``context``."""
        return RAG_PROMPT_TEMPLATE.format(context=context, question=question)

    def predict(self, query):
        """Answer ``query`` and report the documents the answer relied on.

        Returns:
            A dict with the keys ``"answer"`` (the LLM output) and
            ``"source_documents"`` (the retrieval results).
        """
        context, source_documents = self.get_context(query)
        answer = self.llm.predict(self.get_prompt(query, context))
        return {
            "answer": answer,
            "source_documents": source_documents,
        }
@click.command()
@click.option("--query", type=str, required=True)
@click.option("--batch_size", type=int, default=32)
@click.option("--limit", type=int, default=None)
def cli(query, batch_size, limit):
    """Index the abstracts of papers matching QUERY into Upstash.

    Papers are fetched with ``extract_papers``, wrapped into documents, split
    into chunks, embedded and upserted into the Upstash index configured via
    the UPSTASH_URL and UPSTASH_TOKEN environment variables.

    Args:
        query: Search string sent to Papers With Code. Required: without it
            the extraction step used to crash with a ``TypeError`` while
            URL-quoting ``None``.
        batch_size: Number of chunks embedded and upserted per request.
        limit: Maximum number of chunks to index; ``None`` indexes them all.
    """
    load_dotenv()
    click.echo(f"Extracting papers matching this query: {query}")
    papers = extract_papers(query)
    click.echo(f"Extraction complete ✅: ({len(papers)} papers)")

    # Falsy metadata values are replaced by an empty string, as before.
    metadata_keys = ("id", "arxiv_id", "url_pdf", "title", "authors", "published")
    documents = [
        Document(
            page_content=paper["abstract"],
            metadata={key: paper[key] or "" for key in metadata_keys},
        )
        for paper in papers
    ]

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1200,
        chunk_overlap=200,
        separators=["."],
    )
    splits = text_splitter.split_documents(documents)
    # Slicing with `None` keeps every chunk, so no special case is needed.
    splits = splits[:limit]

    index = Index(
        url=os.environ.get("UPSTASH_URL"),
        token=os.environ.get("UPSTASH_TOKEN"),
    )
    embeddings = VertexAIEmbeddings(model_name="textembedding-gecko@003")
    upstash_vector_store = UpstashVectorStore(index, embeddings)
    click.echo("Indexing to Upstash ...")
    ids = upstash_vector_store.add_documents(splits, batch_size=batch_size)
    click.echo(f"Successfully indexed {len(ids)} vector to Upstash")


if __name__ == "__main__":
    cli()
Deploy the application to Cloud Run 41 | 42 | Follow these steps: 43 | 44 | Build the Docker image locally: 45 | 46 | ```bash 47 | docker build -t chat-pwc . 48 | ``` 49 | 50 | Push the image to the container registry: 51 | 52 | ```bash 53 | gcloud builds submit --tag gcr.io/<PROJECT_ID>/pwc-rag --timeout=2h 54 | ``` 55 | ![](./assets/push_image.png) 56 | 57 | Connect to the GCP console, go to the Cloud Run service and hit the "Create service" button: 58 | 59 | ![](./assets/cloud_run.png) 60 | 61 | Fill in these parameters: 62 | 63 | - The container image URL and the region 64 | 65 | ![](./assets/container_registry_region.png) 66 | 67 | - The container port (it must match the value set in the Dockerfile: 80) 68 | 69 | ![](./assets/port_value.png) 70 | 71 | - The secrets that will be injected as environment variables 72 | 73 | ![](./assets/var_env.png) 74 | 75 | - Activate unauthenticated invocations 76 | 77 | ![](./assets/invocations.png) 78 | 79 | Then hit the create button: 80 | 81 | - Once the service is created, you'll see the corresponding URL 82 | 83 | ![](./assets/app_url.png) 84 | 85 | Now you can visit the app: 86 | 87 | ![](./assets/deployed_app.png) 88 | 89 | 90 | ### More details 91 | 92 | Check the Medium [post](https://towardsdatascience.com/how-to-build-an-llm-powered-app-to-chat-with-paperswithcode-09ddd9ee753a). 
class UpstashVectorStore:
    """Thin vector-store wrapper pairing an Upstash ``Index`` with an embedder.

    Each document's text is stored inside the vector's metadata under the
    ``"context"`` key, so a search can rebuild the ``Document`` from the
    metadata alone.
    """

    def __init__(self, index: Index, embeddings: Embeddings):
        """Keep references to the Upstash ``index`` and the ``embeddings`` model."""
        self.index = index
        self.embeddings = embeddings

    def delete_vectors(
        self,
        ids: Optional[Union[str, List[str]]] = None,
        delete_all: Optional[bool] = None,
    ):
        """Delete vectors from the index.

        Args:
            ids: Identifier(s) forwarded to ``index.delete`` when
                ``delete_all`` is falsy.
            delete_all: When truthy, the whole index is reset and ``ids`` is
                ignored.
        """
        if delete_all:
            self.index.reset()
        else:
            self.index.delete(ids)

    def _upsert_batch(self, texts, metadatas, ids):
        """Embed ``texts`` and upsert them with ``ids`` and ``metadatas``."""
        # Only the `embed_documents(texts)` form is used: the former
        # `batch_size=250` keyword appeared in just one of the two call sites.
        # NOTE(review): that keyword is presumably specific to the Vertex AI
        # embedder and absent from the generic `Embeddings` interface this
        # class is annotated with -- confirm.
        embeddings = self.embeddings.embed_documents(texts)
        # Materialise the triples so the client receives a real sequence.
        self.index.upsert(vectors=list(zip(ids, embeddings, metadatas)))

    def add_documents(
        self,
        documents: List[Document],
        ids: Optional[List[str]] = None,
        batch_size: int = 32,
    ):
        """Embed ``documents`` and upsert them into the index in batches.

        Args:
            documents: Documents to index.
            ids: Optional identifiers, one per document and in the same
                order. When omitted, a random UUID4 string is generated for
                each document. (This argument used to be silently overwritten
                by generated UUIDs.)
            batch_size: Number of documents embedded and upserted per request.

        Returns:
            The identifiers of the indexed vectors, in document order.

        Raises:
            ValueError: If ``ids`` is given but its length differs from the
                number of documents.
        """
        provided_ids = None if ids is None else list(ids)
        if provided_ids is not None and len(provided_ids) != len(documents):
            raise ValueError(
                f"Got {len(provided_ids)} ids for {len(documents)} documents; "
                "their lengths must match."
            )

        texts = []
        metadatas = []
        all_ids = []

        def flush():
            """Upsert the pending batch and reset the accumulators."""
            start = len(all_ids)
            if provided_ids is None:
                batch_ids = [str(uuid4()) for _ in texts]
            else:
                batch_ids = provided_ids[start : start + len(texts)]
            self._upsert_batch(texts, metadatas, batch_ids)
            all_ids.extend(batch_ids)
            texts.clear()
            metadatas.clear()

        for document in tqdm(documents):
            text = document.page_content
            texts.append(text)
            # The raw text travels with the vector so searches can return it.
            metadatas.append({"context": text, **document.metadata})
            if len(texts) >= batch_size:
                flush()

        # Whatever is left over forms a final, smaller batch.
        if texts:
            flush()

        n = len(all_ids)
        print(f"Successfully indexed {n} dense vectors to Upstash.")
        print(self.index.stats())
        return all_ids

    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
    ) -> List[Tuple[Document, float]]:
        """Return the ``k`` best matches for ``query`` with their scores.

        Args:
            query: Text to embed and search for.
            k: Number of results requested from the index.

        Returns:
            ``(document, score)`` pairs in the order the index returned them.
            Each document's ``page_content`` is the ``"context"`` entry of the
            stored metadata; the remaining entries become its ``metadata``.
        """
        query_embedding = self.embeddings.embed_query(query)
        query_results = self.index.query(
            query_embedding,
            top_k=k,
            include_metadata=True,
        )
        output = []
        for query_result in query_results:
            metadata = query_result.metadata
            # `pop` removes the text from the metadata dict in place, so it is
            # not duplicated between `page_content` and `metadata`.
            context = metadata.pop("context")
            doc = Document(page_content=context, metadata=metadata)
            output.append((doc, query_result.score))
        return output
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | --------------------------------------------------------------------------------