├── .env.azure.template
├── .env.template
├── .gitignore
├── Dockerfile
├── LICENSE
├── README.md
├── app
    ├── app.py
    ├── assets
    │   ├── RedisOpenAI-QnA-Architecture.drawio.png
    │   ├── RedisOpenAI.png
    │   ├── arxivguru.png
    │   ├── arxivguru_crop.png
    │   ├── diagram-dark.png
    │   ├── diagram.excalidraw
    │   └── diagram.png
    ├── pages
    │   ├── Stats.py
    │   └── __init__.py
    └── qna
    │   ├── __init__.py
    │   ├── arxiv.yaml
    │   ├── constants.py
    │   ├── data.py
    │   ├── db.py
    │   ├── llm.py
    │   └── prompt.py
├── docker-compose.yml
├── poetry.lock
└── pyproject.toml


/.env.azure.template:
--------------------------------------------------------------------------------
 1 | OPENAI_API_KEY=ADD_YOUR_KEY_HERE
 2 | OPENAI_API_TYPE=azure
 3 | OPENAI_API_VERSION=2022-12-01
 4 | OPENAI_API_BASE=https://YOUR_BASE_HERE.openai.azure.com/
 5 | OPENAI_COMPLETIONS_ENGINE=text-davinci-003
 6 | OPENAI_EMBEDDINGS_ENGINE=text-embedding-ada-002
 7 | REDIS_HOST=YOUR_REDIS_INSTANCE.YOUR_REGION.redisenterprise.cache.azure.net
 8 | REDIS_PORT=10000
 9 | REDIS_PASSWORD=YOUR_REDIS_ACCESS_KEY
10 | TOKENIZERS_PARALLELISM=false
11 | 


--------------------------------------------------------------------------------
/.env.template:
--------------------------------------------------------------------------------
1 | OPENAI_API_KEY=ADD_YOUR_KEY_HERE
2 | OPENAI_COMPLETIONS_ENGINE=gpt-3.5-turbo-16k
3 | OPENAI_EMBEDDINGS_ENGINE=text-embedding-ada-002
4 | REDIS_INDEX_NAME="arxiv"
5 | REDIS_URL=redis://redis:6379
6 | TOKENIZERS_PARALLELISM=false
7 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .env
2 | .ipynb_checkpoints
3 | .DS_Store
4 | __pycache__/


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.11-slim-buster
 2 | 
 3 | RUN apt-get update && apt-get install python-tk python3-tk tk-dev curl -y
 4 | 
 5 | RUN curl -sSL https://install.python-poetry.org | POETRY_HOME=/opt/poetry python && \
 6 |     cd /usr/local/bin && \
 7 |     ln -s /opt/poetry/bin/poetry && \
 8 |     poetry config virtualenvs.create false
 9 | 
10 | # Copy the deps file into the container
11 | COPY ./poetry.lock ./pyproject.toml ./
12 | 
13 | WORKDIR /app
14 | 
15 | # Copy all files and subdirectories from ./app to /app in the image
16 | COPY ./app /app
17 | 
18 | RUN poetry install --no-root
19 | 
20 | LABEL org.opencontainers.image.source https://github.com/redis-developer/ArxivChatGuru
21 | 
22 | CMD ["poetry", "run", "streamlit", "run", "app.py", "--server.fileWatcherType", "none", "--browser.gatherUsageStats", "false","--server.enableXsrfProtection", "false", "--server.address", "0.0.0.0"]
23 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 Redis Inc.
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | 
  2 | <div align="center">
  3 |     <a href="https://github.com/redis-developer/redis-ai-resources"><img src="./app/assets/arxivguru_crop.png" width="30%"><img></a>
  4 | </div>
  5 | 
  6 | # ArXiv ChatGuru
  7 | 
  8 | Welcome to **ArXiv ChatGuru**. This tool harnesses `LangChain` and `Redis` to make ArXiv's vast collection of scientific papers more interactive. Through this approach, we aim to make accessing and understanding research easier and more engaging, and also to teach how Retrieval Augmented Generation (RAG) systems work.
  9 | 
 10 | ## 📖 How it Works
 11 | 
 12 | This diagram shows how ArXiv ChatGuru works. The user submits a topic, which is used to retrieve relevant papers from ArXiv. These papers are then chunked into smaller pieces, for which embeddings are generated. These embeddings are stored in Redis, which is used as a vector database. The user can then ask questions about the papers retrieved by the topic they submitted, and the system will return the most relevant answer.
 13 | 
 14 | ![ref arch](app/assets/diagram.png#gh-light-mode-only)
 15 | ![ref arch](app/assets/diagram-dark.png#gh-dark-mode-only)
 16 | 
 17 | ## 🛠 Components
 18 | 
 19 | 1. **LangChain's ArXiv Loader**: Efficiently pull scientific literature directly from ArXiv.
 20 | 2. **Chunking + Embedding**: Using LangChain, we segment lengthy papers into manageable pieces (rather arbitrarily currently), for which we then generate embeddings.
 21 | 3. **Redis**: Demonstrating fast and efficient vector storage, indexing, and retrieval for RAG.
 22 | 4. **RetrievalQA**: Building on LangChain's RetrievalQA and OpenAI models, users can write queries about papers retrieved by the topic they submit.
 23 | 5. **Python Libraries**: Making use of tools such as [`redisvl`](https://redisvl.com), [`Langchain`](https://www.langchain.com/), [`Streamlit`](https://streamlit.io/), etc
 24 | 
 25 | ## 💡 Learning Outcomes with ArXiv ChatGuru
 26 | 
 27 | - **Context Window Exploration**: Learn about the importance of context window size and how it influences interaction results.
 28 | - **Vector Distance Insights**: Understand the role of vector distance in context retrieval for RAG and see how adjustments can change response specificity.
 29 | - **Document Retrieval Dynamics**: Observe how the number of documents retrieved can influence the performance of a RAG (Retrieval Augmented Generation) system.
 30 | - **Using Redis as a Vector DB and Semantic Cache**: Learn how to use Redis both as a vector database and as a semantic cache for RAG systems.
 31 | 
 32 | 
 33 | **Note**: This **is not** a production application. It's a learning tool more than anything. We're using Streamlit to make it easy to interact with, but it's not meant to be a scalable application. It's meant to be a learning tool for understanding how RAG systems work, and how they can be used to make scientific literature more interactive. We will continue to make this better over time.
 34 | 
 35 | 
 36 | 🌟 If you love what we're doing, give us a star! Contributions and feedback are always welcome. 🌌🔭
 37 | 
 38 | ## Up Next
 39 | 
 40 | What we want to do next (ideas welcome!):
 41 | 
 42 | - [x] Pin stable versions of dependencies using poetry
 43 | - Filters for Year, Author, etc.
 44 | - More efficient chunking
 45 | - Various LLM caching toggles
 46 | - Chat history and conversational memory in Redis
 47 | 
 48 | ____
 49 | 
 50 | ## Run the App
 51 | 
 52 | ### Run Locally
 53 | 
 54 | 1. First, clone this repo and cd into it.
 55 |     ```bash
 56 |     $ git clone https://github.com/redis-developer/ArXivChatGuru.git && cd ArXivChatGuru
 57 |     ```
 58 | 
 59 | 2. Create your env file:
 60 |     ```bash
 61 |     $ cp .env.template .env
 62 |     ```
 63 |     *fill out values, most importantly, your `OPENAI_API_KEY`.*
 64 | 
 65 | 3. Install dependencies with Poetry:
 66 |     ```bash
 67 |     $ poetry install --no-root
 68 |     ```
 69 | 
 70 | 4. Run the app:
 71 |     ```bash
 72 |     $ cd app && poetry run streamlit run app.py --server.fileWatcherType none --browser.gatherUsageStats false --server.enableXsrfProtection false --server.address 0.0.0.0
 73 |     ```
 74 | 
 75 | 5. Navigate to:
 76 |     ```
 77 |     http://localhost:8501/
 78 |     ```
 79 | 
 80 | 
 81 | ### Docker Compose
 82 | 
 83 | First, clone the repo like above.
 84 | 
 85 | 1. Create your env file:
 86 |     ```bash
 87 |     $ cp .env.template .env
 88 |     ```
 89 |     *fill out values, most importantly, your `OPENAI_API_KEY`.*
 90 | 
 91 | 2. Run with docker compose:
 92 |     ```bash
 93 |     $ docker compose up
 94 |     ```
 95 |     *add `-d` option to daemonize the processes to the background if you wish.*
 96 | 
 97 |     Issues with dependencies? Try force-building with no-cache:
 98 |     ```
 99 |     $ docker compose build --no-cache
100 |     ```
101 | 
102 | 3. Navigate to:
103 |     ```
104 |     http://localhost:8501/
105 |     ```
106 | 


--------------------------------------------------------------------------------
/app/app.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import langchain
  3 | import streamlit as st
  4 | 
  5 | from collections import defaultdict
  6 | from urllib.error import URLError
  7 | from dotenv import load_dotenv
  8 | load_dotenv()
  9 | 
 10 | if os.environ.get("QNA_DEBUG") == "true":
 11 |     langchain.debug = True
 12 | 
 13 | from qna.llm import make_qna_chain, get_llm
 14 | from qna.db import get_vectorstore#, get_cache
 15 | from qna.prompt import basic_prompt
 16 | from qna.data import get_arxiv_docs
 17 | from qna.constants import REDIS_URL
 18 | 
 19 | # @st.cache_resource
 20 | # def fetch_llm_cache():
 21 | #     return get_cache()
 22 | 
 23 | @st.cache_resource
 24 | def create_arxiv_index(topic_query, _num_papers, _prompt):
 25 |     arxiv_documents = get_arxiv_docs(topic_query, _num_papers)
 26 |     arxiv_db = get_vectorstore(arxiv_documents)
 27 |     st.session_state['arxiv_db'] = arxiv_db
 28 |     return arxiv_db
 29 | 
 30 | def is_updated(topic):
 31 |     return (
 32 |         topic != st.session_state['previous_topic']
 33 |     )
 34 | 
 35 | def reset_app():
 36 |     st.session_state['previous_topic'] = ""
 37 |     st.session_state['arxiv_topic'] = ""
 38 |     st.session_state['arxiv_query'] = ""
 39 |     st.session_state['messages'].clear()
 40 | 
 41 |     arxiv_db = st.session_state['arxiv_db']
 42 |     if arxiv_db is not None:
 43 |         #clear_cache()
 44 |         arxiv_db.index.clear()
 45 | 
 46 | 
 47 | # def clear_cache():
 48 | #     if not st.session_state["llm"]:
 49 | #         st.warning("Could not find llm to clear cache of")
 50 | #     llm = st.session_state["llm"]
 51 | #     llm_string = llm._get_llm_string()
 52 | #     langchain.llm_cache.clear(llm_string=llm_string)
 53 | 
 54 | 
 55 | try:
 56 |     #langchain.llm_cache = fetch_llm_cache()
 57 |     prompt = basic_prompt()
 58 | 
 59 |     # Defining default values
 60 |     default_question = ""
 61 |     default_answer = ""
 62 |     defaults = {
 63 |         "response": {
 64 |             "choices" :[{
 65 |                 "text" : default_answer
 66 |             }]
 67 |         },
 68 |         "question": default_question,
 69 |         "context": [],
 70 |         "chain": None,
 71 |         "previous_topic": "",
 72 |         "arxiv_topic": "",
 73 |         "arxiv_query": "",
 74 |         "arxiv_db": None,
 75 |         "llm": None,
 76 |         "messages": [],
 77 |     }
 78 | 
 79 |     # Checking if keys exist in session state, if not, initializing them
 80 |     for key, value in defaults.items():
 81 |         if key not in st.session_state:
 82 |             st.session_state[key] = value
 83 | 
 84 |     with st.sidebar:
 85 |         st.write("## LLM Settings")
 86 |         ##st.write("### Prompt") TODO make possible to change prompt
 87 |         st.write("Change these before you run the app!")
 88 |         st.slider("Number of Tokens", 100, 8000, 400, key="max_tokens")
 89 | 
 90 |         st.write("## Retrieval Settings")
 91 |         st.write("Feel free to change these anytime")
 92 |         st.slider("Number of Context Documents", 2, 20, 2, key="num_context_docs")
 93 | 
 94 |         st.write("## App Settings")
 95 |         st.button("Clear Chat", key="clear_chat", on_click=lambda: st.session_state['messages'].clear())
 96 |         #st.button("Clear Cache", key="clear_cache", on_click=clear_cache)
 97 |         st.button("New Conversation", key="reset", on_click=reset_app)
 98 | 
 99 |     col1, col2 = st.columns(2)
100 |     with col1:
101 |         st.title("Arxiv ChatGuru")
102 |         st.write("**Put in a topic area and a question within that area to get an answer!**")
103 |         topic = st.text_input("Topic Area", key="arxiv_topic")
104 |         papers = st.number_input("Number of Papers", key="num_papers", value=10, min_value=1, max_value=50, step=2)
105 |     with col2:
106 |         st.image("./assets/arxivguru_crop.png")
107 | 
108 | 
109 | 
110 |     if st.button("Chat!"):
111 |         if is_updated(topic):
112 |             st.session_state['previous_topic'] = topic
113 |             with st.spinner("Loading information from Arxiv to answer your question..."):
114 |                 create_arxiv_index(st.session_state['arxiv_topic'], st.session_state['num_papers'], prompt)
115 | 
116 |     arxiv_db = st.session_state['arxiv_db']
117 |     if st.session_state["llm"] is None:
118 |         tokens = st.session_state["max_tokens"]
119 |         st.session_state["llm"] = get_llm(max_tokens=tokens)
120 |     try:
121 |         chain = make_qna_chain(
122 |             st.session_state["llm"],
123 |             arxiv_db,
124 |             prompt=prompt,
125 |             k=st.session_state['num_context_docs'],
126 |             search_type="similarity"
127 |         )
128 |         st.session_state['chain'] = chain
129 |     except AttributeError:
130 |         st.info("Please enter a topic area")
131 |         st.stop()
132 | 
133 |     for message in st.session_state.messages:
134 |         with st.chat_message(message["role"]):
135 |             st.markdown(message["content"])
136 | 
137 |     if query := st.chat_input("What do you want to know about this topic?"):
138 |         st.session_state.messages.append({"role": "user", "content": query})
139 |         with st.chat_message("user"):
140 |             st.markdown(query)
141 | 
142 |         with st.chat_message("assistant", avatar="./assets/arxivguru_crop.png"):
143 |             message_placeholder = st.empty()
144 |             st.session_state['context'], st.session_state['response'] = [], ""
145 |             chain = st.session_state['chain']
146 | 
147 |             result = chain({"query": query})
148 |             print(result, flush=True)
149 |             st.markdown(result["result"])
150 |             st.session_state['context'], st.session_state['response'] = result['source_documents'], result['result']
151 |             if st.session_state['context']:
152 |                 with st.expander("Context"):
153 |                     context = defaultdict(list)
154 |                     for doc in st.session_state['context']:
155 |                         context[doc.metadata['title']].append(doc)
156 |                     for i, doc_tuple in enumerate(context.items(), 1):
157 |                         title, doc_list = doc_tuple[0], doc_tuple[1]
158 |                         st.write(f"{i}. **{title}**")
159 |                         for context_num, doc in enumerate(doc_list, 1):
160 |                             st.write(f" - **Context {context_num}**: {doc.page_content}")
161 | 
162 |             st.session_state.messages.append({"role": "assistant", "content": st.session_state['response']})
163 | 
164 | 
165 | except URLError as e:
166 |     st.error(
167 |         """
168 |         **This demo requires internet access.**
169 |         Connection error: %s
170 |         """
171 |         % e.reason
172 |     )
173 | 


--------------------------------------------------------------------------------
/app/assets/RedisOpenAI-QnA-Architecture.drawio.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redis-developer/ArXivChatGuru/d32aca478504b4b0032914f2b990131891b415f8/app/assets/RedisOpenAI-QnA-Architecture.drawio.png


--------------------------------------------------------------------------------
/app/assets/RedisOpenAI.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redis-developer/ArXivChatGuru/d32aca478504b4b0032914f2b990131891b415f8/app/assets/RedisOpenAI.png


--------------------------------------------------------------------------------
/app/assets/arxivguru.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redis-developer/ArXivChatGuru/d32aca478504b4b0032914f2b990131891b415f8/app/assets/arxivguru.png


--------------------------------------------------------------------------------
/app/assets/arxivguru_crop.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redis-developer/ArXivChatGuru/d32aca478504b4b0032914f2b990131891b415f8/app/assets/arxivguru_crop.png


--------------------------------------------------------------------------------
/app/assets/diagram-dark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redis-developer/ArXivChatGuru/d32aca478504b4b0032914f2b990131891b415f8/app/assets/diagram-dark.png


--------------------------------------------------------------------------------
/app/assets/diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redis-developer/ArXivChatGuru/d32aca478504b4b0032914f2b990131891b415f8/app/assets/diagram.png


--------------------------------------------------------------------------------
/app/pages/Stats.py:
--------------------------------------------------------------------------------
  1 | import streamlit as st
  2 | 
  3 | from urllib.error import URLError
  4 | from redisvl.redis.utils import make_dict
  5 | from redisvl.index import SearchIndex
  6 | from redisvl.schema import IndexSchema
  7 | from redis.exceptions import ConnectionError, ResponseError
  8 | from tabulate import tabulate
  9 | from dotenv import load_dotenv
 10 | load_dotenv()
 11 | 
 12 | from qna.constants import REDIS_URL
 13 | 
 14 | STATS_KEYS = [
 15 |     "num_docs",
 16 |     "num_records",
 17 |     "number_of_uses",
 18 |     "percent_indexed",
 19 |     "total_indexing_time",
 20 |     "bytes_per_record_avg",
 21 |     "records_per_doc_avg",
 22 |     "doc_table_size_mb",
 23 |     "vector_index_sz_mb",
 24 | ]
 25 | 
 26 | 
 27 | def display_stats(index_info, output_format="html"):
 28 |     # Extracting the statistics
 29 |     stats_data = [(key, str(index_info.get(key))) for key in STATS_KEYS]
 30 | 
 31 |     # Display the statistics in tabular format
 32 |     st.write("## Statistics:")
 33 |     st.write(
 34 |         tabulate(
 35 |             stats_data,
 36 |             headers=["Stat Key", "Value"],
 37 |             tablefmt=output_format,
 38 |             colalign=("left", "left"),
 39 |         ), unsafe_allow_html=True
 40 |     )
 41 | 
 42 | def display_index_stats(index_info, output_format="html"):
 43 |     attributes = index_info.get("attributes", [])
 44 |     definition = make_dict(index_info.get("index_definition"))
 45 |     index_info = [
 46 |         index_info.get("index_name"),
 47 |         definition.get("key_type"),
 48 |         definition.get("prefixes"),
 49 |         index_info.get("index_options"),
 50 |         index_info.get("indexing"),
 51 |     ]
 52 | 
 53 |     # Display the index information in tabular format
 54 |     st.write("## Index Information:")
 55 |     st.write(
 56 |         tabulate(
 57 |             [index_info],
 58 |             headers=[
 59 |                 "Index Name",
 60 |                 "Storage Type",
 61 |                 "Prefixes",
 62 |                 "Index Options",
 63 |                 "Indexing",
 64 |             ],
 65 |             tablefmt=output_format,
 66 |         ), unsafe_allow_html=True
 67 |     )
 68 | 
 69 |     attr_values = []
 70 |     headers = [
 71 |         "Name",
 72 |         "Attribute",
 73 |         "Type",
 74 |     ]
 75 | 
 76 |     for attrs in attributes:
 77 |         attr = make_dict(attrs)
 78 | 
 79 |         values = [attr.get("identifier"), attr.get("attribute"), attr.get("type")]
 80 |         if len(attrs) > 5:
 81 |             options = make_dict(attrs)
 82 |             for k, v in options.items():
 83 |                 if k not in ["identifier", "attribute", "type"]:
 84 |                     headers.append("Field Option")
 85 |                     headers.append("Option Value")
 86 |                     values.append(k)
 87 |                     values.append(v)
 88 |         attr_values.append(values)
 89 | 
 90 |     # Display the attributes in tabular format
 91 |     st.write("## Index Fields:")
 92 |     st.write(
 93 |         tabulate(
 94 |             attr_values,
 95 |             headers=headers,
 96 |             tablefmt=output_format,
 97 |         ), unsafe_allow_html=True
 98 |     )
 99 | 
100 | try:
101 | 
102 |     try:
103 |         schema = IndexSchema.from_yaml("qna/arxiv.yaml")
104 |         index = SearchIndex.from_existing(name=schema.index.name, redis_url=REDIS_URL)
105 |         index_info = index.info()
106 |         display_index_stats(index_info)
107 |         display_stats(index_info)
108 | 
109 |     except ConnectionError as e:
110 |         st.error(
111 |             """
112 |             **Could not connect to Redis**
113 |             Connection error: %s
114 |             """
115 |             % e
116 |         )
117 |     except ResponseError as e:
118 |         st.error(
119 |             """
120 |             **Could not connect to index for demo**
121 |             Response error: %s
122 |             """
123 |             % e
124 |         )
125 | 
126 | 
127 | except URLError as e:
128 |     st.error(
129 |         """
130 |         **This demo requires internet access.**
131 |         Connection error: %s
132 |         """
133 |         % e.reason
134 |     )
135 | 


--------------------------------------------------------------------------------
/app/pages/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redis-developer/ArXivChatGuru/d32aca478504b4b0032914f2b990131891b415f8/app/pages/__init__.py


--------------------------------------------------------------------------------
/app/qna/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/redis-developer/ArXivChatGuru/d32aca478504b4b0032914f2b990131891b415f8/app/qna/__init__.py


--------------------------------------------------------------------------------
/app/qna/arxiv.yaml:
--------------------------------------------------------------------------------
 1 | index:
 2 |   name: arxiv
 3 |   prefix: arxiv
 4 |   key_separator: ':'
 5 |   storage_type: hash
 6 | fields:
 7 | - name: text
 8 |   type: text
 9 | - name: authors
10 |   type: tag
11 | - name: title
12 |   type: text
13 | - name: links
14 |   type: tag
15 | - name: embedding
16 |   type: vector
17 |   attrs:
18 |     dims: 1536
19 |     algorithm: flat
20 |     datatype: float32
21 |     distance_metric: cosine
22 | version: 0.1.0
23 | 


--------------------------------------------------------------------------------
/app/qna/constants.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | # Env Vars and constants
 4 | CACHE_TYPE = os.getenv("CACHE_TYPE", "semantic")
 5 | OPENAI_COMPLETIONS_ENGINE = os.getenv("OPENAI_COMPLETIONS_ENGINE", "gpt-3.5-turbo-16k")
 6 | OPENAI_EMBEDDINGS_ENGINE = os.getenv("OPENAI_EMBEDDINGS_ENGINE", "text-embedding-ada-002")
 7 | 
 8 | REDIS_INDEX_NAME = os.getenv("REDIS_INDEX_NAME", "arxiv")
 9 | REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379")
10 | 


--------------------------------------------------------------------------------
/app/qna/data.py:
--------------------------------------------------------------------------------
 1 | from typing import List
 2 | 
 3 | from langchain.schema import Document
 4 | from langchain_community.document_loaders import ArxivLoader
 5 | from langchain.text_splitter import RecursiveCharacterTextSplitter
 6 | 
 7 | 
 8 | def get_arxiv_docs(paper_topic_query, num_docs=10) -> List[Document]:
 9 |     loader = ArxivLoader(
10 |         paper_topic_query,
11 |         load_max_docs=num_docs,
12 |         load_all_available_meta=True
13 |     )
14 |     raw_documents = loader.load()
15 |     text_splitter = RecursiveCharacterTextSplitter(
16 |         # Set a really small chunk size, just to show.
17 |         chunk_size = 500,
18 |         chunk_overlap  = 20,
19 |         length_function = len,
20 |         add_start_index = True,
21 |     )
22 |     documents = text_splitter.split_documents(raw_documents)
23 |     return documents


--------------------------------------------------------------------------------
/app/qna/db.py:
--------------------------------------------------------------------------------
 1 | from typing import List
 2 | 
 3 | from langchain.schema import Document
 4 | from langchain_redis import RedisVectorStore, RedisConfig
 5 | 
 6 | from qna.llm import get_embeddings
 7 | from qna.constants import CACHE_TYPE, REDIS_INDEX_NAME, REDIS_URL
 8 | 
 9 | 
10 | # def get_cache():
11 | #     # construct cache implementation based on env var
12 | #     if CACHE_TYPE == "semantic":
13 | #         from langchain_redis import RedisSemanticCache
14 | #         print("Using semantic cache")
15 | #         # TODO change to using huggingface embeddings
16 | #         # so that caching is cheaper and faster.
17 | #         return RedisSemanticCache(
18 | #             redis_url=REDIS_URL,
19 | #             embeddings=get_embeddings(),
20 | #             distance_threshold=0.1
21 | #         )
22 | #     return None
23 | 
24 | 
25 | 
26 | def get_vectorstore(documents: List[Document]=None) -> "RedisVectorStore":
27 |     """Create the Redis vectorstore."""
28 | 
29 |     config = RedisConfig.from_yaml("qna/arxiv.yaml", redis_url=REDIS_URL)
30 | 
31 |     embeddings = get_embeddings()
32 | 
33 |     cleaned_docs = [
34 |         Document(
35 |             page_content=doc.page_content,
36 |             metadata={
37 |                 "title": doc.metadata["Title"],
38 |                 "authors": doc.metadata["Authors"],
39 |                 "category": doc.metadata["primary_category"],
40 |                 "links": doc.metadata["links"]
41 |             }
42 |         ) for doc in documents
43 |     ]
44 | 
45 |     try:
46 |         vectorstore = RedisVectorStore.from_existing_index(
47 |             embedding=embeddings,
48 |             index_name=REDIS_INDEX_NAME,
49 |             redis_url=REDIS_URL,
50 |             config=config
51 |         )
52 |         return vectorstore
53 |     except:
54 |         pass
55 | 
56 |     print(REDIS_URL, flush=True)
57 |     vectorstore = RedisVectorStore.from_documents(
58 |         documents=cleaned_docs,
59 |         embedding=embeddings,
60 |         redis_url=REDIS_URL,
61 |         config=config
62 |     )
63 |     return vectorstore
64 | 


--------------------------------------------------------------------------------
/app/qna/llm.py:
--------------------------------------------------------------------------------
 1 | from typing import TYPE_CHECKING
 2 | 
 3 | from langchain.chains import RetrievalQA
 4 | from langchain_openai import ChatOpenAI, OpenAIEmbeddings
 5 | from langchain.llms.base import LLM
 6 | from langchain.embeddings.base import Embeddings
 7 | from langchain_redis import RedisVectorStore
 8 | from qna.constants import (
 9 |     OPENAI_COMPLETIONS_ENGINE,
10 |     OPENAI_EMBEDDINGS_ENGINE,
11 | )
12 | 
13 | 
14 | def get_llm(max_tokens=100) -> LLM:
15 |     llm = ChatOpenAI(model_name=OPENAI_COMPLETIONS_ENGINE, max_tokens=max_tokens)
16 |     return llm
17 | 
18 | 
19 | def get_embeddings() -> Embeddings:
20 |     embeddings = OpenAIEmbeddings(model=OPENAI_EMBEDDINGS_ENGINE)
21 |     return embeddings
22 | 
23 | 
24 | def make_qna_chain(llm: LLM, vector_db: RedisVectorStore, prompt: str = "", **kwargs):
25 |     """Create the QA chain."""
26 | 
27 |     search_type = "similarity"
28 |     if "search_type" in kwargs:
29 |         search_type = kwargs.pop("search_type")
30 | 
31 |     # Create retreival QnA Chain
32 |     chain = RetrievalQA.from_chain_type(
33 |         llm=llm,
34 |         chain_type="stuff",
35 |         retriever=vector_db.as_retriever(search_kwargs=kwargs, search_type=search_type),
36 |         return_source_documents=True,
37 |         chain_type_kwargs={"prompt": prompt},
38 |         verbose=True,
39 |     )
40 |     return chain
41 | 


--------------------------------------------------------------------------------
/app/qna/prompt.py:
--------------------------------------------------------------------------------
 1 | from langchain.prompts import PromptTemplate
 2 | 
 3 | def basic_prompt():
 4 |     # Define our prompt
 5 |     prompt_template = """You are an AI assistant for answering questions about technical topics.
 6 |     You are given the following extracted parts of long documents and a question. Provide a conversational answer.
 7 |     Use the context as a source of information, but be sure to answer the question directly. You're
 8 |     job is to provide the user a helpful summary of the information in the context if it applies to the question.
 9 |     If you don't know the answer, just say "Hmm, I'm not sure." Don't try to make up an answer.
10 | 
11 |     Question: {question}
12 |     =========
13 |     {context}
14 |     =========
15 |     Answer in Markdown:
16 |     """
17 | 
18 |     return PromptTemplate(
19 |         template=prompt_template,
20 |         input_variables=["context", "question"],
21 |     )
22 | 


--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | version: "3.9"
 2 | services:
 3 |   streamlit:
 4 |     container_name: streamlit
 5 |     build:
 6 |       context: ./
 7 |     volumes:
 8 |       - ./app:/app
 9 |     ports:
10 |       - "8501:8501"
11 |     env_file:
12 |       - .env
13 |     depends_on:
14 |       redis:
15 |         condition: service_healthy
16 |   redis:
17 |     image: redis/redis-stack:latest
18 |     ports:
19 |       - "6379:6379"
20 |     env_file:
21 |       - .env
22 |     volumes:
23 |         - redis_data:/data
24 |     healthcheck:
25 |       test: ["CMD", "redis-cli", "-h", "localhost", "-p", "6379", "ping"]
26 |       interval: 2s
27 |       timeout: 1m30s
28 |       retries: 5
29 |       start_period: 5s
30 | 
31 | volumes:
32 |   redis_data:


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.poetry]
 2 | name = "arxivchatguru"
 3 | version = "0.2.0"
 4 | description = "Interact with scientific papers from Arxiv using OpenAI, Redis, and LangChain."
 5 | authors = ["Tyler Hutcherson <tyler.hutcherson@redis.com>"]
 6 | license = "MIT"
 7 | readme = "README.md"
 8 | package-mode = false
 9 | 
10 | [tool.poetry.dependencies]
11 | python = ">=3.11,<3.14"
12 | langchain-core = "^0.3.15"
13 | langchain = "^0.3.6"
14 | langchain-openai = "^0.2.5"
15 | langchain-redis = "^0.2.0"
16 | langchain-huggingface = "^0.1.2"
17 | tenacity = "^9.0.0"
18 | streamlit = "^1.39.0"
19 | python-dotenv = "^1.0.1"
20 | pymupdf = "^1.24.13"
21 | langchain-community = "^0.3.4"
22 | arxiv = "^2.1.3"
23 | 
24 | 
25 | [build-system]
26 | requires = ["poetry-core"]
27 | build-backend = "poetry.core.masonry.api"
28 | 


--------------------------------------------------------------------------------