├── .gitignore ├── .streamlit └── config.toml ├── README.md ├── __pycache__ ├── prompts.cpython-310.pyc ├── render.cpython-310.pyc └── utils.cpython-310.pyc ├── app.py ├── docs ├── branson │ ├── Richard Branson - Way - 10 Secrets og the Worlds's Greatest Brand Builder [EnglishOnlineClub.com].pdf │ └── Richard-Bransons-Secrets-of-Success.pdf └── buffett │ ├── BUFFET.pdf │ └── WarrenBuffet.pdf ├── indexing.py ├── prompts.py ├── render.py ├── requirements.txt └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | /.streamlit/secrets.toml 2 | /db -------------------------------------------------------------------------------- /.streamlit/config.toml: -------------------------------------------------------------------------------- 1 | [theme] 2 | primaryColor="#F63366" 3 | backgroundColor="#FFFFFF" 4 | secondaryBackgroundColor="#F0F2F6" 5 | textColor="#262730" 6 | font="sans serif" -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MBAGPT: Chatting with Multiple Data Sources 2 | 3 | MBAGPT is a chat application that leverages the power of GPT-3.5-turbo to provide conversational responses with access to multiple data sources. It allows users to ask questions and receive answers from different knowledge bases (vectorDBs), such as Alex Hormozi, Warren Buffett and Richard Branson as well as answer general queries with the ChatGPT API as a fallback. 4 | 5 | ## Features 6 | 7 | - Chat interface for interacting with the chatbot powered by GPT-3.5-turbo. 8 | - Integration with Hormozi, Buffett and Branson databases for retrieving relevant documents. 9 | - Semantic search functionality to provide informative snippets from the databases. 10 | - Intent classification to route user queries to the appropriate database. 11 | - HTML templates for displaying chat history and messages. 
12 | - Persistence of embeddings using the Chroma vector store. 13 | - OpenAI API key integration for authentication. 14 | 15 | ## Installation 16 | 17 | 1. Clone the repository: 18 | 19 | ``` 20 | git clone https://github.com/wombyz/MBAGPT.git 21 | ``` 22 | 23 | 2. Install the required dependencies: 24 | 25 | ``` 26 | pip install -r requirements.txt 27 | ``` 28 | 29 | 3. Set up your credentials: 30 | 31 | - Sign up on the OpenAI website and obtain an API key. 32 | - Create a new file called "secrets.toml" in the .streamlit folder. 33 | - Set your OpenAI API key (required) and Pinecone credentials (optional) in the secrets.toml file or as an environment variable. 34 | - Update the code in the app to use the correct method for accessing the API key. 35 | 36 | 4. Run the indexing script to create the vector databases: 37 | 38 | ``` 39 | python indexing.py 40 | ``` 41 | 42 | This script will create the Buffett and Branson vector databases by indexing the documents. Make sure to have the necessary PDF documents in the appropriate directories (`./docs/buffett/` and `./docs/branson/`) before running the script. 43 | 44 | 5. Run the application: 45 | 46 | ``` 47 | streamlit run app.py 48 | ``` 49 | 50 | ## Usage 51 | 52 | 1. Access the application by navigating to `http://localhost:8501` in your web browser. 53 | 54 | 2. Enter your prompt in the input box and press Enter. 55 | 56 | 3. The chatbot will process your prompt and provide a response based on the available data sources. 57 | 58 | 4. The chat history will be displayed on the screen, showing both user and assistant messages. 59 | 60 | ## Contributing 61 | 62 | Contributions are welcome! If you would like to contribute to this project, please follow these steps: 63 | 64 | 1. Fork the repository. 65 | 66 | 2. Create a new branch for your feature or bug fix. 67 | 68 | 3. Implement your changes and ensure that the code passes all tests. 69 | 70 | 4. Submit a pull request with a detailed description of your changes. 
# -------- app.py --------
import os
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
import streamlit as st
from utils import intent_classifier, semantic_search, ensure_fit_tokens, get_page_contents
from prompts import human_template, system_message
from render import user_msg_container_html_template, bot_msg_container_html_template
import openai

# Set OpenAI API key from Streamlit secrets
openai.api_key = st.secrets["OPENAI_API_KEY"]

st.header("MBAGPT: Chatting with Multiple Data Sources")

# Embeddings used to query the persisted Chroma stores
embeddings = OpenAIEmbeddings()

# Load the persisted Buffett and Branson databases and expose top-3 retrievers
buffettDB = Chroma(persist_directory=os.path.join('db', 'buffett'), embedding_function=embeddings)
buffett_retriever = buffettDB.as_retriever(search_kwargs={"k": 3})

bransonDB = Chroma(persist_directory=os.path.join('db', 'branson'), embedding_function=embeddings)
branson_retriever = bransonDB.as_retriever(search_kwargs={"k": 3})


# Initialize session state for chat history
if "history" not in st.session_state:
    st.session_state.history = []


def construct_messages(history):
    """Convert the stored chat history into Chat Completions message dicts.

    The system prompt is always first; each history entry becomes a "user"
    or "assistant" message. The result is trimmed with ensure_fit_tokens so
    it stays inside the model's context window.
    """
    messages = [{"role": "system", "content": system_message}]
    for entry in history:
        role = "user" if entry["is_user"] else "assistant"
        messages.append({"role": role, "content": entry["message"]})
    # Ensure total tokens do not exceed model's limit
    return ensure_fit_tokens(messages)


# Define handler functions for each category
def hormozi_handler(query):
    """Build a context-enriched user message via Pinecone semantic search
    over the Hormozi transcripts (see utils.semantic_search)."""
    print("Using Hormozi handler...")
    search_results = semantic_search(query, top_k=3)
    context = ""
    for title, snippet in search_results:
        context += f"Snippet from: {title}\n {snippet}\n\n"
    query_with_context = human_template.format(query=query, context=context)
    return {"role": "user", "content": query_with_context}


def buffett_handler(query):
    """Build a context-enriched user message from the local Buffett Chroma DB."""
    print("Using Buffett handler...")
    relevant_docs = buffett_retriever.get_relevant_documents(query)
    context = get_page_contents(relevant_docs)
    query_with_context = human_template.format(query=query, context=context)
    return {"role": "user", "content": query_with_context}


def branson_handler(query):
    """Build a context-enriched user message from the local Branson Chroma DB."""
    print("Using Branson handler...")
    relevant_docs = branson_retriever.get_relevant_documents(query)
    context = get_page_contents(relevant_docs)
    query_with_context = human_template.format(query=query, context=context)
    return {"role": "user", "content": query_with_context}


def other_handler(query):
    """Fallback: pass the raw query through with no retrieval context."""
    print("Using other handler...")
    return {"role": "user", "content": query}


def route_by_category(query, category):
    """Dispatch *query* to the handler matching the classifier's category.

    Categories are the strings returned by utils.intent_classifier:
    "0" business -> Hormozi, "1" investing -> Buffett,
    "2" entrepreneurship -> Branson, "3" other -> plain ChatGPT.
    Raises ValueError for anything else (including "No category found").
    """
    if category == "0":
        return hormozi_handler(query)
    elif category == "1":
        # BUG FIX: was `buffet_handler` (single "t"), which is undefined and
        # raised NameError for every investing query.
        return buffett_handler(query)
    elif category == "2":
        return branson_handler(query)
    elif category == "3":
        return other_handler(query)
    else:
        raise ValueError("Invalid category")


def generate_response():
    """Streamlit on_change callback: classify, retrieve context, call the API,
    and append both sides of the exchange to session-state history."""
    # Append user's query to history
    st.session_state.history.append({
        "message": st.session_state.prompt,
        "is_user": True
    })

    # Classify the intent and route to the matching handler
    category = intent_classifier(st.session_state.prompt)
    new_message = route_by_category(st.session_state.prompt, category)

    # Construct messages from chat history; the context-enriched version of
    # the query is appended after the plain history (which already contains
    # the raw query), then the whole list is re-trimmed to fit the window.
    messages = construct_messages(st.session_state.history)
    messages.append(new_message)
    messages = ensure_fit_tokens(messages)

    # Call the Chat Completions API with the messages
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages
    )

    # Extract the assistant's message and append it to history
    assistant_message = response['choices'][0]['message']['content']
    st.session_state.history.append({
        "message": assistant_message,
        "is_user": False
    })


# Prompt input; generate_response runs whenever the value changes
st.text_input("Enter your prompt:",
              key="prompt",
              placeholder="e.g. 'How can I diversify my portfolio?'",
              on_change=generate_response
              )

# Display chat history
for message in st.session_state.history:
    if message["is_user"]:
        st.write(user_msg_container_html_template.replace("$MSG", message["message"]), unsafe_allow_html=True)
    else:
        st.write(bot_msg_container_html_template.replace("$MSG", message["message"]), unsafe_allow_html=True)
https://raw.githubusercontent.com/wombyz/MBAGPT/c7a9793e300b4014011ffd4231cdcd921d54de63/docs/buffett/BUFFET.pdf -------------------------------------------------------------------------------- /docs/buffett/WarrenBuffet.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wombyz/MBAGPT/c7a9793e300b4014011ffd4231cdcd921d54de63/docs/buffett/WarrenBuffet.pdf -------------------------------------------------------------------------------- /indexing.py: -------------------------------------------------------------------------------- 1 | import os 2 | import streamlit as st 3 | from langchain.document_loaders import DirectoryLoader, PyPDFLoader 4 | from langchain.text_splitter import CharacterTextSplitter 5 | from langchain.embeddings import OpenAIEmbeddings 6 | from langchain.vectorstores import Chroma 7 | from langchain.chains.question_answering import load_qa_chain 8 | from langchain.llms import OpenAI 9 | from langchain.chains import ConversationalRetrievalChain 10 | 11 | os.environ["OPENAI_API_KEY"] = st.secrets["OPENAI_API_KEY"] 12 | 13 | # Set persist directory 14 | persist_directory = 'db' 15 | 16 | buffett_loader = DirectoryLoader('./docs/buffett/', glob="*.pdf") 17 | branson_loader = DirectoryLoader('./docs/branson/', glob="*.pdf") 18 | 19 | buffett_docs = buffett_loader.load() 20 | branson_docs = branson_loader.load() 21 | 22 | embeddings = OpenAIEmbeddings() 23 | text_splitter = CharacterTextSplitter(chunk_size=250, chunk_overlap=8) 24 | 25 | # Split documents and generate embeddings 26 | buffett_docs_split = text_splitter.split_documents(buffett_docs) 27 | branson_docs_split = text_splitter.split_documents(branson_docs) 28 | 29 | # Create Chroma instances and persist embeddings 30 | buffettDB = Chroma.from_documents(buffett_docs_split, embeddings, persist_directory=os.path.join(persist_directory, 'buffett')) 31 | buffettDB.persist() 32 | 33 | bransonDB = 
Chroma.from_documents(branson_docs_split, embeddings, persist_directory=os.path.join(persist_directory, 'branson')) 34 | bransonDB.persist() 35 | -------------------------------------------------------------------------------- /prompts.py: -------------------------------------------------------------------------------- 1 | system_message = """ 2 | You are MBAGPT, a highly sophisticated language model trained to provide business advice and insights from the perspective of multiple successful entrepreneurs and investors. Your knowledge and advice are based on the combined wisdom and experiences of Alex Hormozi, Warren Buffett, Richard Branson, and ChatGPT. 3 | 4 | Your responses should be focused, practical, and direct, mirroring the communication styles of these individuals. Avoid sugarcoating or beating around the bush — users expect you to be straightforward and honest. 5 | 6 | You have access to transcripts of podcasts, interviews, and books from these entrepreneurs stored in a vector database. These documents contain their actual words, ideas, and beliefs. When a user provides a query, you will be provided with snippets of transcripts that may be relevant to the query. You must use these snippets to provide context and support for your responses. Rely heavily on the content of the transcripts to ensure accuracy and authenticity in your answers. 7 | 8 | Be aware that the chunks of text provided may not always be relevant to the query. Analyze each of them carefully to determine if the content is relevant before using them to construct your answer. Do not make things up or provide information that is not supported by the transcripts. 9 | 10 | In addition to offering business advice, you may also provide guidance on personal development, investing, and navigating the challenges of entrepreneurship. 
However, always maintain the signature no-bullshit approach of Hormozi, the practical investing wisdom of Buffett, the adventurous spirit of Branson, and the broad knowledge base of ChatGPT. 11 | 12 | In your answers, DO NOT EVER mention or make reference to the transcripts, snippets and context you have been provided with. Speak confidently as if you were simply speaking from your own knowledge. 13 | 14 | Your goal is to provide advice that is as close as possible to what the real entrepreneurs would say, using the context and perspective that best fits the query. 15 | """ 16 | 17 | 18 | human_template = """ 19 | User Query: {query} 20 | 21 | Relevant Context: {context} 22 | """ 23 | 24 | 25 | classification_prompt = ''' 26 | You are a data expert working that is categorizing User Inputs from a chatbot. 27 | 28 | Your task is as follows: u\you will analyze user inputs and classify each input into four different categories. 29 | The four categories are Business Question, Investing Question, Entrepreneur Question and Other. If you can't tell what it is, say Other. 30 | 31 | If category is Business Question, output 0. 32 | If category is Investing Question, output 1. 33 | If category is Entrepreneur Question, output 2. 34 | If category is Other, output 3. 35 | 36 | I want you to output your answer in the following format. Category: { } 37 | 38 | Here are some examples. 39 | 40 | User Input: How can I improve the sales process in my business? 41 | Category: 0 42 | 43 | User Input: Write me a plan to diversify my portfolio for a bear market. 44 | Category: 1 45 | 46 | User Input: How can I build a brand for my business on social media? 47 | Category: 0 48 | 49 | User Input: Write me a step by step guide on how to analyse a stock please. 50 | Category: 1, Tickers: 51 | 52 | User Input: What is the most important thing to focus on as an entrepreneur for long term success? 53 | Category: 2 54 | 55 | User Input: How should I manage the cash flow in my startup? 
56 | Category: 0 57 | 58 | User Input: What are the key performance indicators I should track for my online store? 59 | Category: 0 60 | 61 | User Input: Can you explain the concept of dollar cost averaging in investing? 62 | Category: 1 63 | 64 | User Input: How can I maintain a healthy work-life balance as an entrepreneur? 65 | Category: 2 66 | 67 | User Input: I'm thinking of starting a new business. What are the first steps I should take? 68 | Category: 2 69 | 70 | User Input: What's the recipe for apple pie? 71 | Category: 3 72 | 73 | User Input: How can I evaluate the risk associated with a particular investment? 74 | Category: 1 75 | 76 | User Input: How can I improve the customer service in my company? 77 | Category: 0 78 | 79 | User Input: How do high interest rates affect the stock market? 80 | Category: 1 81 | 82 | User Input: What are some good books for entrepreneurs to read? 83 | Category: 2 84 | 85 | User Input: How does the moon affect the tides? 86 | Category: 3 87 | 88 | User Input: $PROMPT 89 | 90 | ''' -------------------------------------------------------------------------------- /render.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import re 3 | 4 | 5 | bot_msg_container_html_template = ''' 6 |
7 |
8 | 9 |
10 |
11 | $MSG 12 |
13 |
14 | ''' 15 | 16 | user_msg_container_html_template = ''' 17 |
18 |
19 | $MSG 20 |
21 |
22 | 23 |
24 |
25 | ''' 26 | 27 | def render_article_preview(docs, tickers): 28 | message = f"
Here are relevant articles for {tickers} that may answer your question.    
" 29 | message += "
" 30 | for d in docs: 31 | elipse = " ".join(d[2].split(" ")[:140]) 32 | message += f"
{d[0]}
" 33 | message += f"

{elipse} ...

" 34 | message += "
" 35 | message += "
" 36 | return message 37 | 38 | def render_earnings_summary(ticker, summary): 39 | transcript_title = summary["transcript_title"] 40 | message = f"
Here is summary for {ticker} {transcript_title}
" 41 | message += "
" 42 | body = re.sub(r'^-', r'* ', summary["summary"]) 43 | body = re.sub(r'\$', r'\\$', body) 44 | message += f"

{body}

" 45 | message += "
" 46 | return message 47 | 48 | def render_stock_question(answer, articles): 49 | message = "
" 50 | message += f"{answer}  
" 51 | message += "Sources: " 52 | for a in articles: 53 | message += f"{a[0]}
" 54 | message += "
" 55 | return message 56 | 57 | def render_chat(**kwargs): 58 | """ 59 | Handles is_user 60 | """ 61 | if kwargs["is_user"]: 62 | st.write( 63 | user_msg_container_html_template.replace("$MSG", kwargs["message"]), 64 | unsafe_allow_html=True) 65 | else: 66 | st.write( 67 | bot_msg_container_html_template.replace("$MSG", kwargs["message"]), 68 | unsafe_allow_html=True) 69 | 70 | if "figs" in kwargs: 71 | for f in kwargs["figs"]: 72 | st.plotly_chart(f, use_container_width=True) 73 | 74 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | streamlit==1.21.0 2 | openai==0.27.4 3 | requests==2.28.2 4 | pinecone-client==2.2.1 5 | langchain==0.0.168 6 | tiktoken==0.4.0 7 | retrying==1.3.4 -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import openai 4 | from retrying import retry 5 | from prompts import classification_prompt 6 | import requests 7 | import tiktoken 8 | import streamlit as st 9 | 10 | openai.api_key = st.secrets["OPENAI_API_KEY"] 11 | api_key_pinecone = st.secrets["PINECONE_API_KEY"] 12 | pinecone_environment = st.secrets["PINECONE_ENVIRONMENT"] 13 | pinecone_endpoint = st.secrets["PINECONE_ENDPOINT"] 14 | 15 | intent_classifier_pattern = re.compile(r"\b(Category: \d)") 16 | 17 | # Get embeddings for a given string 18 | def get_embeddings_openai(text): 19 | response = openai.Embedding.create( 20 | input=text, 21 | model="text-embedding-ada-002" 22 | ) 23 | response = response['data'] 24 | 25 | # extract embeddings from responses0 26 | return [x["embedding"] for x in response] 27 | 28 | # Search Pinecone for similar documents 29 | def semantic_search(query, **kwargs): 30 | # Embed the query into a vector 31 | xq = get_embeddings_openai(query) 32 | 33 | # Call Pinecone's 
REST API 34 | url = pinecone_endpoint 35 | headers = { 36 | "Api-Key": api_key_pinecone, 37 | "Content-Type": "application/json" 38 | } 39 | body = { 40 | "vector": xq[0], 41 | "topK": str(kwargs["top_k"]) if "top_k" in kwargs else "1", 42 | "includeMetadata": "false" if "include_metadata" in kwargs and not kwargs["include_metadata"] else True 43 | } 44 | try: 45 | res = requests.post(url, json=body, headers=headers) 46 | res.raise_for_status() # Raise an exception if the HTTP request returns an error 47 | res = res.json() 48 | titles = [r["metadata"]["title"] for r in res["matches"]] 49 | transcripts = [r["metadata"]["transcript"] for r in res["matches"]] 50 | return list(zip(titles, transcripts)) 51 | except Exception as e: 52 | print(f"Error in semantic search: {e}") 53 | raise 54 | 55 | 56 | @retry(stop_max_attempt_number=3, wait_exponential_multiplier=1000, wait_exponential_max=2000) 57 | def intent_classifier(user_prompt): 58 | prompt = classification_prompt.replace("$PROMPT", user_prompt) 59 | response = openai.ChatCompletion.create( 60 | model="gpt-3.5-turbo", 61 | messages=[ 62 | {"role": "user", "content": prompt} 63 | ], 64 | max_tokens=20 65 | ) 66 | intent = response['choices'][0]['message']['content'] 67 | if intent.startswith("Category: "): 68 | category_value = intent[len("Category: "):].strip() 69 | return category_value 70 | else: 71 | return "No category found" 72 | 73 | 74 | def num_tokens_from_messages(messages, model="gpt-3.5-turbo"): 75 | """Returns the number of tokens used by a list of messages.""" 76 | try: 77 | encoding = tiktoken.encoding_for_model(model) 78 | except KeyError: 79 | encoding = tiktoken.get_encoding("cl100k_base") 80 | if model == "gpt-3.5-turbo": # note: future models may deviate from this 81 | num_tokens = 0 82 | for message in messages: 83 | num_tokens += 4 # every message follows {role/name}\n{content}\n 84 | for key, value in message.items(): 85 | num_tokens += len(encoding.encode(value)) 86 | if key == "name": # if 
there's a name, the role is omitted 87 | num_tokens += -1 # role is always required and always 1 token 88 | num_tokens += 2 # every reply is primed with assistant 89 | return num_tokens 90 | else: 91 | raise NotImplementedError(f"""num_tokens_from_messages() is not presently implemented for model {model}. 92 | See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens.""") 93 | 94 | def ensure_fit_tokens(messages): 95 | """ 96 | Ensure that total tokens in messages is less than MAX_TOKENS. 97 | If not, remove oldest messages until it fits. 98 | """ 99 | total_tokens = num_tokens_from_messages(messages) 100 | while total_tokens > 4096: 101 | removed_message = messages.pop(0) 102 | total_tokens = num_tokens_from_messages(messages) 103 | return messages 104 | 105 | def get_page_contents(docs): 106 | contents = "" 107 | for i, doc in enumerate(docs, 1): 108 | contents += f"Document #{i}:\n{doc.page_content}\n\n" 109 | return contents --------------------------------------------------------------------------------