├── .gitattributes
├── .gitignore
├── .replit
├── .streamlit
│   └── secrets.toml.example
├── LICENSE
├── README.md
├── agent_helper.py
├── app-agent.py
├── app-agent2.py
├── app.py
├── embed_pdf.py
├── index
│   ├── Mahmoudi_Nima_202202_PhD.pdf.index.faiss
│   ├── Mahmoudi_Nima_202202_PhD.pdf.index.pkl
│   ├── NIPS-2017-attention-is-all-you-need-Paper.pdf.index.faiss
│   └── NIPS-2017-attention-is-all-you-need-Paper.pdf.index.pkl
├── llm_helper.py
├── pdf
│   └── .gitignore
├── poetry.lock
├── pyproject.toml
└── requirements.txt

--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
# Auto detect text files and perform LF normalization
* text=auto

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/


.streamlit/secrets.toml
index/*.faiss
index/*.pkl
.pythonlibs

--------------------------------------------------------------------------------
/.replit:
--------------------------------------------------------------------------------
modules = ["python-3.10:v18-20230807-322e88b"]

hidden = [".pythonlibs"]
run = "streamlit run app.py"

[nix]
channel = "stable-23_05"

[unitTest]
language = "python3"

[deployment]
run = ["sh", "-c", "streamlit run app.py"]
deploymentTarget = "cloudrun"

--------------------------------------------------------------------------------
/.streamlit/secrets.toml.example:
--------------------------------------------------------------------------------
# OPENAI_API_KEY = "sk-..."

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023 Nima Mahmoudi

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# LLM Streamlit Demo (Basic)

- [Live Demo](https://llmappdemobasic.streamlit.app/)

## Installation Process

### Without Poetry

You can use conda to create an environment and install the required packages:

```sh
conda create -n p311-llm python=3.11
conda activate p311-llm
pip install -r requirements.txt
```

### With Poetry (optional)

Alternatively, you can install the required packages using Poetry:

```sh
# (optional) to set poetry to use the project folder
# poetry config virtualenvs.in-project true
# install all dependencies
poetry install
```

If you change `requirements.txt`, run the following command to update the Poetry definitions:

```sh
cat requirements.txt | xargs poetry add
```

You can then activate the Poetry shell:

```sh
poetry shell
```

## Running the App

After installing the requirements, you'll need to add your OpenAI API key to the secrets,
or let users enter their own key in the sidebar each time they visit the page.
You can add your secrets to `.streamlit/secrets.toml` in the following format:

```toml
OPENAI_API_KEY = "sk-..."
```

Then, you can run the app using the following command:

```sh
streamlit run app.py
```

For more details about this code, you can follow [my blog posts](https://medium.com/@nima.mahmoudi).
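
## Embedding Documents

The search indexes under `index/` are generated from the PDF files in the `pdf/` folder. To index your own documents, drop them into `pdf/` and click the "Embed Documents" button in the app's sidebar, or run the embedding step yourself. A minimal sketch, assuming `OPENAI_API_KEY` is set and `pdf/` contains at least one PDF:

```python
import embed_pdf

# builds a FAISS index under index/ for every PDF found in pdf/
embed_pdf.embed_all_pdf_docs()
print(embed_pdf.get_all_index_files())
```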
--------------------------------------------------------------------------------
/agent_helper.py:
--------------------------------------------------------------------------------
from functools import wraps

# from langchain.callbacks import StreamlitCallbackHandler
from langchain.callbacks.streamlit.streamlit_callback_handler import StreamlitCallbackHandler
from tenacity import RetryError, retry, stop_after_attempt, wait_exponential


def bind_logger(toolClass):
    class newToolClass(toolClass):
        def __init__(self, tool_name: str, st_cb: StreamlitCallbackHandler, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.st_cb = st_cb
            self.tool_name = tool_name

        def run(self, *args, **kwargs):
            print(f"Running {toolClass.__name__} {[*args]}, {kwargs}")

            # make sure the callback handler has an active thought to log into
            if self.st_cb._current_thought is None:
                self.st_cb.on_llm_start({}, [])

            args_str = ' '.join(args) + ' ' + ' '.join([f'{k}=`{v}`' for k, v in kwargs.items()])
            self.st_cb.on_tool_start({'name': self.tool_name}, args_str)

            try:
                ret_val = retry(
                    wait=wait_exponential(min=2, max=20),
                    stop=stop_after_attempt(5),
                )(super().run)(*args, **kwargs)
                self.st_cb.on_tool_end(ret_val)
                return ret_val
            except RetryError as e:
                # surface the exception raised by the final attempt instead of
                # tenacity's RetryError wrapper
                original_exception = e.last_attempt.exception()
                print(f"Exception {original_exception} in {toolClass.__name__} {[*args]}, {kwargs}")
                raise original_exception

    return newToolClass


def retry_and_streamlit_callback(st_cb: StreamlitCallbackHandler, tool_name: str):
    # without a callback handler there is nothing to log to, so return the
    # tool function unchanged
    if st_cb is None:
        return lambda x: x

    def decorator(tool_func):
        @wraps(tool_func)
        def decorated_func(*args, **kwargs):
            print(f"Running {tool_name} {args}, {kwargs}")

            if st_cb._current_thought is None:
                st_cb.on_llm_start({}, [])

            args_str = ' '.join(args) + ' ' + ' '.join([f'{k}=`{v}`' for k, v in kwargs.items()])
            st_cb.on_tool_start({'name': tool_name}, args_str)

            @retry(wait=wait_exponential(min=2, max=20), stop=stop_after_attempt(5))
            def retry_wrapper():
                return tool_func(*args, **kwargs)

            try:
                ret_val = retry_wrapper()
                st_cb.on_tool_end(ret_val)
                return ret_val
            except Exception as e:
                print(f"Exception {e} in {tool_name} {args}, {kwargs}")
                raise

        return decorated_func

    return decorator
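
# Usage sketch (illustrative, not part of the app): wrap a plain function so
# each call is logged to the Streamlit callback handler and retried with
# exponential backoff. `demo_search` is a hypothetical tool; the example is
# commented out because it needs a running Streamlit app to render into.
#
# import streamlit as st
# st_cb = StreamlitCallbackHandler(st.container())
#
# @retry_and_streamlit_callback(st_cb=st_cb, tool_name="Demo Search")
# def demo_search(query: str) -> str:
#     return f"results for {query}"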
--------------------------------------------------------------------------------
/app-agent.py:
--------------------------------------------------------------------------------
import streamlit as st

from langchain.agents import initialize_agent, AgentType
from langchain.callbacks import StreamlitCallbackHandler
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder

from llm_helper import get_agent_chain, get_lc_oai_tools

with st.sidebar:
    openai_api_key = st.secrets["OPENAI_API_KEY"]
    "[Get an OpenAI API key](https://platform.openai.com/account/api-keys)"
    "[View the source code](https://github.com/streamlit/llm-examples/blob/main/pages/2_Chat_with_search.py)"
    "[![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/streamlit/llm-examples?quickstart=1)"

st.title("🔎 LangChain - Chat with search")

"""
In this example, we're using `StreamlitCallbackHandler` to display the thoughts and actions of an agent in an interactive Streamlit app.
Try more LangChain 🤝 Streamlit Agent examples at [github.com/langchain-ai/streamlit-agent](https://github.com/langchain-ai/streamlit-agent).
"""

if "messages" not in st.session_state:
    st.session_state["messages"] = [
        {"role": "assistant", "content": "Hi, I'm a chatbot who can search the web. How can I help you?"}
    ]

for msg in st.session_state.messages:
    st.chat_message(msg["role"]).write(msg["content"])

if prompt := st.chat_input(placeholder="Who won the Women's U.S. Open in 2018?"):
    st.session_state.messages.append({"role": "user", "content": prompt})
    st.chat_message("user").write(prompt)

    if not openai_api_key:
        st.info("Please add your OpenAI API key to continue.")
        st.stop()

    llm = ChatOpenAI(model_name="gpt-3.5-turbo-1106", openai_api_key=openai_api_key, streaming=True)
    lc_tools, _ = get_lc_oai_tools()
    search_agent = initialize_agent(lc_tools, llm, agent=AgentType.OPENAI_FUNCTIONS, handle_parsing_errors=True, verbose=True)

    agent_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", "You are a helpful assistant. Use the search tool to answer the user's question, and cite the page number (like [p1]) whenever you use information from the source document. Always base your answer on the content of the source document. If you need to compare multiple subjects, search them one by one."),
            ("user", "{input}"),
            MessagesPlaceholder(variable_name="agent_scratchpad"),
        ]
    )
    # initialize_agent builds its own prompt, so swap in the custom one afterwards
    search_agent.agent.prompt = agent_prompt
    with st.chat_message("assistant"):
        st_cb = StreamlitCallbackHandler(st.container(), expand_new_thoughts=False)
        response = search_agent.run(prompt, callbacks=[st_cb])
        # search_agent = get_agent_chain(callbacks=[st_cb])
        # response = search_agent.invoke({"input": prompt})
        # response = response["output"]

        st.session_state.messages.append({"role": "assistant", "content": response})
        st.write(response)
--------------------------------------------------------------------------------
/app-agent2.py:
--------------------------------------------------------------------------------
import streamlit as st

from langchain.callbacks import StreamlitCallbackHandler

from llm_helper import convert_message, get_agent_chain

with st.sidebar:
    openai_api_key = st.secrets["OPENAI_API_KEY"]
    "[Get an OpenAI API key](https://platform.openai.com/account/api-keys)"
    "[View the source code](https://github.com/streamlit/llm-examples/blob/main/pages/2_Chat_with_search.py)"
    "[![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/streamlit/llm-examples?quickstart=1)"

st.title("🔎 LangChain - Chat with search")

"""
In this example, we're using `StreamlitCallbackHandler` to display the thoughts and actions of an agent in an interactive Streamlit app.
Try more LangChain 🤝 Streamlit Agent examples at [github.com/langchain-ai/streamlit-agent](https://github.com/langchain-ai/streamlit-agent).
"""

if "messages" not in st.session_state:
    st.session_state["messages"] = [
        {"role": "assistant", "content": "Hi, I'm a chatbot who can search the web. How can I help you?"}
    ]

for msg in st.session_state.messages:
    st.chat_message(msg["role"]).write(msg["content"])

if prompt := st.chat_input(placeholder="Who won the Women's U.S. Open in 2018?"):
    st.session_state.messages.append({"role": "user", "content": prompt})
    st.chat_message("user").write(prompt)

    if not openai_api_key:
        st.info("Please add your OpenAI API key to continue.")
        st.stop()

    # everything except the newest message is prior history for the agent
    if "messages" in st.session_state:
        chat_history = [convert_message(m) for m in st.session_state.messages[:-1]]
    else:
        chat_history = []

    with st.chat_message("assistant"):
        st_cb = StreamlitCallbackHandler(st.container(), expand_new_thoughts=False)
        agent = get_agent_chain(st_cb=st_cb)

        response = agent.invoke({
            "input": prompt,
            "chat_history": chat_history,
        })
        response = response["output"]

        st.session_state.messages.append({"role": "assistant", "content": response})
        st.write(response)
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
import streamlit as st
import os
import embed_pdf

# create sidebar and ask for openai api key if not set in secrets
secrets_file_path = os.path.join(".streamlit", "secrets.toml")
if os.path.exists(secrets_file_path):
    try:
        if "OPENAI_API_KEY" in st.secrets:
            os.environ["OPENAI_API_KEY"] = st.secrets["OPENAI_API_KEY"]
        else:
            print("OpenAI API Key not found in Streamlit secrets")
    except FileNotFoundError:
        print("Secrets file not found")
else:
    print("Secrets file not found")

if not os.getenv("OPENAI_API_KEY", "").startswith("sk-"):
    os.environ["OPENAI_API_KEY"] = st.sidebar.text_input(
        "OpenAI API Key", type="password"
    )
else:
    if st.sidebar.button("Embed Documents"):
        st.sidebar.info("Embedding documents...")
        try:
            embed_pdf.embed_all_pdf_docs()
            st.sidebar.info("Done!")
        except Exception as e:
            st.sidebar.error(e)
            st.sidebar.error("Failed to embed documents.")

# create the app
st.title("Welcome to NimaGPT")

chosen_file = st.radio(
    "Choose a file to search", embed_pdf.get_all_index_files(), index=0
)

# check if openai api key is set
if not os.getenv("OPENAI_API_KEY", "").startswith("sk-"):
    st.warning("Please enter your OpenAI API key!", icon="⚠")
    st.stop()

# load the agent
from llm_helper import convert_message, get_rag_chain, get_rag_fusion_chain

rag_method_map = {
    'Basic RAG': get_rag_chain,
    'RAG Fusion': get_rag_fusion_chain
}
chosen_rag_method = st.radio(
    "Choose a RAG method", rag_method_map.keys(), index=0
)
get_rag_chain_func = rag_method_map[chosen_rag_method]
## get the chain WITHOUT the retrieval callback (not used)
# custom_chain = get_rag_chain_func(chosen_file)

# create the message history state
if "messages" not in st.session_state:
    st.session_state.messages = []

# render older messages
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# render the chat input
prompt = st.chat_input("Enter your message...")
if prompt:
    st.session_state.messages.append({"role": "user", "content": prompt})

    # render the user's new message
    with st.chat_message("user"):
        st.markdown(prompt)

    # render the assistant's response
    with st.chat_message("assistant"):
        retrieval_container = st.container()
        message_placeholder = st.empty()

        retrieval_status = retrieval_container.status("**Context Retrieval**")
        queried_questions = []
        rendered_questions = set()

        def update_retrieval_status():
            for q in queried_questions:
                if q in rendered_questions:
                    continue
                rendered_questions.add(q)
                retrieval_status.markdown(f"\n\n`- {q}`")

        def retrieval_cb(qs):
            for q in qs:
                if q not in queried_questions:
                    queried_questions.append(q)
            return qs

        # get the chain with the retrieval callback
        custom_chain = get_rag_chain_func(chosen_file, retrieval_cb=retrieval_cb)

        if "messages" in st.session_state:
            chat_history = [convert_message(m) for m in st.session_state.messages[:-1]]
        else:
            chat_history = []

        full_response = ""
        for response in custom_chain.stream(
            {"input": prompt, "chat_history": chat_history}
        ):
            # agent-style chains yield dicts with an "output" key, while plain
            # LCEL chains yield message chunks with a .content attribute
            if "output" in response:
                full_response += response["output"]
            else:
                full_response += response.content

            message_placeholder.markdown(full_response + "▌")
            update_retrieval_status()

        retrieval_status.update(state="complete")
        message_placeholder.markdown(full_response)

    # add the full response to the message history
    st.session_state.messages.append({"role": "assistant", "content": full_response})

--------------------------------------------------------------------------------
/embed_pdf.py:
--------------------------------------------------------------------------------
from langchain.document_loaders import PagedPDFSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS

import os


def embed_document(file_name, file_folder="pdf", embedding_folder="index"):
    file_path = f"{file_folder}/{file_name}"
    # PagedPDFSplitter loads the PDF and splits it into one document per page
    loader = PagedPDFSplitter(file_path)
    source_pages = loader.load_and_split()

    embedding_func = OpenAIEmbeddings()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False,
        separators=["\n\n", "\n", " ", ""],
    )
    source_chunks = text_splitter.split_documents(source_pages)
    search_index = FAISS.from_documents(source_chunks, embedding_func)
    search_index.save_local(
        folder_path=embedding_folder, index_name=file_name + ".index"
    )


def embed_all_pdf_docs():
    # Define the directory path
    pdf_directory = "pdf"

    # Check if the directory exists
    if os.path.exists(pdf_directory):
        # List all PDF files in the directory
        pdf_files = [
            file for file in os.listdir(pdf_directory) if file.endswith(".pdf")
        ]

        if pdf_files:
            for pdf_file in pdf_files:
                print(f"Embedding {pdf_file}...")
                embed_document(file_name=pdf_file, file_folder=pdf_directory)
                print("Done!")
        else:
            raise Exception("No PDF files found in the directory.")
    else:
        raise Exception(f"Directory '{pdf_directory}' does not exist.")


def get_all_index_files():
    # Define the directory path
    index_directory = "index"

    # Check if the directory exists
    if os.path.exists(index_directory):
        # List all index files in the directory
        postfix = ".index.faiss"
        index_files = [
            file.replace(postfix, "")
            for file in os.listdir(index_directory)
            if file.endswith(postfix)
        ]

        if index_files:
            return index_files
        else:
            raise Exception("No index files found in the directory.")
    else:
        raise Exception(f"Directory '{index_directory}' does not exist.")
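
# Sketch (not part of the original module): embed one file from the command
# line, e.g. `python embed_pdf.py my_paper.pdf`. `my_paper.pdf` is a
# hypothetical placeholder; the file must exist under pdf/, and
# OPENAI_API_KEY must be set for OpenAIEmbeddings to authenticate.
if __name__ == "__main__":
    import sys

    if len(sys.argv) > 1:
        embed_document(file_name=sys.argv[1])
    else:
        embed_all_pdf_docs()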
--------------------------------------------------------------------------------
/index/Mahmoudi_Nima_202202_PhD.pdf.index.faiss:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nimamahmoudi/LLMStreamlitDemoBasic/0d466282626dba8a1951534c1fbd9719caaa35fd/index/Mahmoudi_Nima_202202_PhD.pdf.index.faiss

--------------------------------------------------------------------------------
/index/Mahmoudi_Nima_202202_PhD.pdf.index.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nimamahmoudi/LLMStreamlitDemoBasic/0d466282626dba8a1951534c1fbd9719caaa35fd/index/Mahmoudi_Nima_202202_PhD.pdf.index.pkl

--------------------------------------------------------------------------------
/index/NIPS-2017-attention-is-all-you-need-Paper.pdf.index.faiss:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nimamahmoudi/LLMStreamlitDemoBasic/0d466282626dba8a1951534c1fbd9719caaa35fd/index/NIPS-2017-attention-is-all-you-need-Paper.pdf.index.faiss

--------------------------------------------------------------------------------
/index/NIPS-2017-attention-is-all-you-need-Paper.pdf.index.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nimamahmoudi/LLMStreamlitDemoBasic/0d466282626dba8a1951534c1fbd9719caaa35fd/index/NIPS-2017-attention-is-all-you-need-Paper.pdf.index.pkl

--------------------------------------------------------------------------------
/llm_helper.py:
--------------------------------------------------------------------------------
from typing import Optional

# langchain imports
from langchain.chat_models import ChatOpenAI
from langchain.schema.runnable import RunnableMap
from langchain.prompts.prompt import PromptTemplate
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from operator import itemgetter
from langchain.schema.messages import HumanMessage, SystemMessage, AIMessage
from langchain.callbacks.streamlit.streamlit_callback_handler import StreamlitCallbackHandler


def format_docs(docs):
    # render each retrieved document as an XML-like block so the model can
    # cite page numbers from the metadata (the static tags below were
    # stripped by extraction and are reconstructed here)
    res = ""
    # res = str(docs)
    for doc in docs:
        escaped_page_content = doc.page_content.replace("\n", "\\n")
        res += "<doc>\n"
        res += f"  <content>{escaped_page_content}</content>\n"
        for m in doc.metadata:
            res += f"  <{m}>{doc.metadata[m]}</{m}>\n"
        res += "</doc>\n"
    return res


def get_search_index(file_name="Mahmoudi_Nima_202202_PhD.pdf", index_folder="index"):
    # load embeddings
    from langchain.vectorstores import FAISS
    from langchain.embeddings.openai import OpenAIEmbeddings

    search_index = FAISS.load_local(
        folder_path=index_folder,
        index_name=file_name + ".index",
        embeddings=OpenAIEmbeddings(),
    )
    return search_index


def convert_message(m):
    if m["role"] == "user":
        return HumanMessage(content=m["content"])
    elif m["role"] == "assistant":
        return AIMessage(content=m["content"])
    elif m["role"] == "system":
        return SystemMessage(content=m["content"])
    else:
        raise ValueError(f"Unknown role {m['role']}")


_condense_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History:
{chat_history}
Follow Up Input: {input}
Standalone question:"""
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_condense_template)

_rag_template = """Answer the question based only on the following context, citing the page number(s) of the document(s) you used to answer the question:
{context}

Question: {question}
"""
ANSWER_PROMPT = ChatPromptTemplate.from_template(_rag_template)


def _format_chat_history(chat_history):
    def format_single_chat_message(m):
        if type(m) is HumanMessage:
            return "Human: " + m.content
        elif type(m) is AIMessage:
            return "Assistant: " + m.content
        elif type(m) is SystemMessage:
            return "System: " + m.content
        else:
            raise ValueError(f"Unknown message type {type(m)}")

    return "\n".join([format_single_chat_message(m) for m in chat_history])


def get_standalone_question_from_chat_history_chain():
    _inputs = RunnableMap(
        standalone_question=RunnablePassthrough.assign(
            chat_history=lambda x: _format_chat_history(x["chat_history"])
        )
        | CONDENSE_QUESTION_PROMPT
        | ChatOpenAI(temperature=0)
        | StrOutputParser(),
    )
    return _inputs


def get_rag_chain(file_name="Mahmoudi_Nima_202202_PhD.pdf", index_folder="index", retrieval_cb=None):
    vectorstore = get_search_index(file_name, index_folder)
    retriever = vectorstore.as_retriever()

    if retrieval_cb is None:
        retrieval_cb = lambda x: x

    def context_update_fn(q):
        retrieval_cb([q])
        return q

    _inputs = RunnableMap(
        standalone_question=RunnablePassthrough.assign(
            chat_history=lambda x: _format_chat_history(x["chat_history"])
        )
        | CONDENSE_QUESTION_PROMPT
        | ChatOpenAI(temperature=0)
        | StrOutputParser(),
    )
    _context = {
        "context": itemgetter("standalone_question") | RunnablePassthrough(context_update_fn) | retriever | format_docs,
        "question": lambda x: x["standalone_question"],
    }
    conversational_qa_chain = _inputs | _context | ANSWER_PROMPT | ChatOpenAI()
    return conversational_qa_chain


# RAG fusion chain
# source1: https://youtu.be/GchC5WxeXGc?si=6i7J0rPZI7SNwFYZ
# source2: https://towardsdatascience.com/forget-rag-the-future-is-rag-fusion-1147298d8ad1
def reciprocal_rank_fusion(results: list[list], k=60):
    from langchain.load import dumps, loads
    fused_scores = {}
    for docs in results:
        # Assumes the docs are returned in sorted order of relevance
        for rank, doc in enumerate(docs):
            doc_str = dumps(doc)
            if doc_str not in fused_scores:
                fused_scores[doc_str] = 0
            fused_scores[doc_str] += 1 / (rank + k)

    # Worked example: with k=60, a document ranked first (rank 0) in two of
    # the query result lists scores 1/60 + 1/60 ≈ 0.033, while one ranked
    # first in a single list scores 1/60 ≈ 0.017, so documents that appear
    # near the top of several lists rise to the top of the fused ranking.
    reranked_results = [
        (loads(doc), score)
        for doc, score in sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
    ]
    return reranked_results
def get_search_query_generation_chain():
    from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate
    prompt = ChatPromptTemplate(
        input_variables=['original_query'],
        messages=[
            SystemMessagePromptTemplate(
                prompt=PromptTemplate(
                    input_variables=[],
                    template='You are a helpful assistant that generates multiple search queries based on a single input query.'
                )
            ),
            HumanMessagePromptTemplate(
                prompt=PromptTemplate(
                    input_variables=['original_query'],
                    template='Generate multiple search queries related to: {original_query} \n OUTPUT (4 queries):'
                )
            )
        ]
    )

    generate_queries = (
        prompt |
        ChatOpenAI(temperature=0) |
        StrOutputParser() |
        (lambda x: x.split("\n"))
    )

    return generate_queries


def get_rag_fusion_chain(file_name="Mahmoudi_Nima_202202_PhD.pdf", index_folder="index", retrieval_cb=None):
    vectorstore = get_search_index(file_name, index_folder)
    retriever = vectorstore.as_retriever()
    query_generation_chain = get_search_query_generation_chain()
    _inputs = RunnableMap(
        standalone_question=RunnablePassthrough.assign(
            chat_history=lambda x: _format_chat_history(x["chat_history"])
        )
        | CONDENSE_QUESTION_PROMPT
        | ChatOpenAI(temperature=0)
        | StrOutputParser(),
    )

    if retrieval_cb is None:
        retrieval_cb = lambda x: x

    _context = {
        "context":
            RunnablePassthrough.assign(
                original_query=lambda x: x["standalone_question"]
            )
            | query_generation_chain
            | retrieval_cb
            | retriever.map()
            | reciprocal_rank_fusion
            | (lambda x: [item[0] for item in x])
            | format_docs,
        "question": lambda x: x["standalone_question"],
    }
    conversational_qa_chain = _inputs | _context | ANSWER_PROMPT | ChatOpenAI()
    return conversational_qa_chain


####################
# Adding agent chain with OpenAI function calling

def get_search_tool_from_index(search_index, st_cb: Optional[StreamlitCallbackHandler] = None):
    from langchain.agents import tool
    from agent_helper import retry_and_streamlit_callback

    @tool
    @retry_and_streamlit_callback(st_cb=st_cb, tool_name="Content Search Tool")
    def search(query: str) -> str:
        """Search the contents of the source document for the queries."""

        docs = search_index.similarity_search(query, k=5)
        return format_docs(docs)

    return search


def get_lc_oai_tools(file_name: str = "Mahmoudi_Nima_202202_PhD.pdf", index_folder: str = "index", st_cb: Optional[StreamlitCallbackHandler] = None):
    from langchain.tools.render import format_tool_to_openai_tool
    search_index = get_search_index(file_name, index_folder)
    lc_tools = [get_search_tool_from_index(search_index=search_index, st_cb=st_cb)]
    oai_tools = [format_tool_to_openai_tool(t) for t in lc_tools]
    return lc_tools, oai_tools


def get_agent_chain(file_name="Mahmoudi_Nima_202202_PhD.pdf", index_folder="index", callbacks=None, st_cb: Optional[StreamlitCallbackHandler] = None):
    if callbacks is None:
        callbacks = []

    from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
    from langchain.agents.format_scratchpad.openai_tools import (
        format_to_openai_tool_messages,
    )
    from langchain.agents import AgentExecutor
    from langchain.agents.output_parsers.openai_tools import OpenAIToolsAgentOutputParser

    lc_tools, oai_tools = get_lc_oai_tools(file_name, index_folder, st_cb)

    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", "You are a helpful assistant. Use the search tool to answer the user's question, and cite the page number (like [p1]) whenever you use information from the source document.\nchat history: {chat_history}"),
            ("user", "{input}"),
            MessagesPlaceholder(variable_name="agent_scratchpad"),
        ]
    )
    llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-1106")

    agent = (
        {
            "input": lambda x: x["input"],
            "agent_scratchpad": lambda x: format_to_openai_tool_messages(
                x["intermediate_steps"]
            ),
            "chat_history": lambda x: _format_chat_history(x["chat_history"]),
        }
        | prompt
        | llm.bind(tools=oai_tools)
        | OpenAIToolsAgentOutputParser()
    )

    agent_executor = AgentExecutor(agent=agent, tools=lc_tools, verbose=True, callbacks=callbacks)
    return agent_executor
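
# Usage sketch (illustrative, not from the original file): the executor
# returned by get_agent_chain() takes the new question plus prior turns
# already converted to LangChain message objects, and returns a dict with
# an "output" key, e.g.:
#
# executor = get_agent_chain()
# history = [convert_message({"role": "user", "content": "What is FaaS?"})]
# print(executor.invoke({"input": "How does it differ from BaaS?", "chat_history": history})["output"])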
if __name__ == "__main__":
    question_generation_chain = get_search_query_generation_chain()
    print('=' * 50)
    print('RAG Chain')
    chain = get_rag_chain()
    print(chain.invoke({'input': 'serverless computing', 'chat_history': []}))

    print('=' * 50)
    print('Question Generation Chain')
    print(question_generation_chain.invoke({'original_query': 'serverless computing'}))

    print('-' * 50)
    print('RAG Fusion Chain')
    chain = get_rag_fusion_chain()
    print(chain.invoke({'input': 'serverless computing', 'chat_history': []}))

    agent_executor = get_agent_chain()
    print(
        agent_executor.invoke({
            "input": "based on the source document, compare FaaS with BaaS?",
            "chat_history": [],
        })
    )

--------------------------------------------------------------------------------
/pdf/.gitignore:
--------------------------------------------------------------------------------
*.pdf

--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[tool.poetry]
name = "llm-streamlit-demo-basic"
version = "0.1.0"
description = ""
authors = ["Nima Mahmoudi "]

[tool.poetry.dependencies]
python = ">=3.10.0,<3.11"
langchain = "^0.0.321"
openai = "^0.28.1"
streamlit = "^1.27.2"
faiss-cpu = "^1.7.4"
tiktoken = "^0.5.1"
langchainhub = "^0.1.13"
pypdf = "^3.17.0"

[tool.pyright]
# https://github.com/microsoft/pyright/blob/main/docs/configuration.md
useLibraryCodeForTypes = true
exclude = [".cache"]

[tool.ruff]
# https://beta.ruff.rs/docs/configuration/
select = ['E', 'W', 'F', 'I', 'B', 'C4', 'ARG', 'SIM']
ignore = ['W291', 'W292', 'W293']

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
langchain
openai
streamlit
faiss-cpu
tiktoken
langchainhub
pypdf
--------------------------------------------------------------------------------