├── .env.example ├── .flake8 ├── .gitignore ├── .vscode ├── extensions.json └── settings.json ├── LICENSE ├── README.md ├── dev-requirements.txt ├── pyproject.toml ├── requirements.txt └── src ├── __init__.py ├── main.py └── utils ├── __init__.py ├── chat.py └── process.py /.env.example: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY=your_openai_api_key 2 | ACTIVELOOP_TOKEN=your_activeloop_api_token 3 | ACTIVELOOP_USERNAME=your_activeloop_username 4 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E501, W503 3 | max-line-length = 88 4 | exclude = .git,__pycache__,build,dist 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .env 3 | repos 4 | venv 5 | -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": ["ms-python.python", "ms-python.vscode-pylance"] 3 | } 4 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "[python]": { 3 | "editor.defaultFormatter": null, 4 | "editor.formatOnSave": true, 5 | "editor.insertSpaces": true, 6 | "editor.tabSize": 4 7 | }, 8 | "cSpell.words": [ 9 | "activeloop", 10 | "deeplake", 11 | "dotenv", 12 | "ipynb", 13 | "langchain", 14 | "openai", 15 | "pathspec", 16 | "stcli", 17 | "streamlit", 18 | "vectorstores" 19 | ], 20 | "python.defaultInterpreterPath": "./venv/bin/python", 21 | "python.formatting.blackArgs": ["--preview", "--line-length=88"], 22 | "python.formatting.blackPath": "./venv/bin/black", 23 | "python.formatting.provider": "black", 24 | "python.linting.enabled": true, 25 | "python.linting.flake8Args": ["--max-line-length=88"], 26 | "python.linting.flake8Enabled": true, 27 | "python.linting.flake8Path": "./venv/bin/flake8", 28 | "python.linting.lintOnSave": true, 29 | "python.linting.pylintEnabled": false 30 | } 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Peter 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Chat-with-Github-Repo 2 | 3 | This repository contains Python scripts that demonstrate how to create a chatbot using Streamlit, OpenAI GPT-3.5-turbo, and Activeloop's Deep Lake. 4 | 5 | The chatbot searches a dataset stored in Deep Lake to find relevant information from any Git repository and generates responses based on the user's input. 6 | 7 | ## Files 8 | 9 | - `src/utils/process.py`: This script clones a Git repository, processes the text documents, computes embeddings using OpenAIEmbeddings, and stores the embeddings in a DeepLake instance. 10 | 11 | - `src/utils/chat.py`: This script creates a Streamlit web application that interacts with the user and the DeepLake instance to generate chatbot responses using OpenAI GPT-3.5-turbo. 12 | 13 | - `src/main.py`: This script contains the command line interface (CLI) that allows you to run the chatbot application. 14 | 15 | ## Setup 16 | 17 | Before getting started, be sure to sign up for an [Activeloop](https://www.activeloop.ai/) and [OpenAI](https://openai.com/) account and create API keys. 18 | 19 | To set up and run this project, follow these steps: 20 | 21 | 1. Clone the repository and navigate to the project directory: 22 | 23 | ```bash 24 | git clone https://github.com/peterw/Chat-with-Git-Repo.git 25 | cd Chat-with-Git-Repo 26 | ``` 27 | 28 | 2. Install the required packages with `pip`: 29 | 30 | ```bash 31 | pip install -r requirements.txt 32 | ``` 33 | 34 | For development dependencies, you can install them using the following command: 35 | 36 | ```bash 37 | pip install -r dev-requirements.txt 38 | ``` 39 | 40 | 3. Set the environment variables: 41 | 42 | Copy the `.env.example` file: 43 | 44 | ```bash 45 | cp .env.example .env 46 | ``` 47 | 48 | Provide your API keys and username: 49 | 50 | ``` 51 | OPENAI_API_KEY=your_openai_api_key 52 | ACTIVELOOP_TOKEN=your_activeloop_api_token 53 | ACTIVELOOP_USERNAME=your_activeloop_username 54 | ``` 55 | 56 | 4. Use the CLI to run the chatbot application. You can either process a Git repository or start the chat application using an existing dataset. 57 | 58 | > For complete CLI instructions run `python src/main.py --help` 59 | 60 | To process a Git repository, use the `process` subcommand: 61 | 62 | ```bash 63 | python src/main.py process --repo-url https://github.com/username/repo_name 64 | ``` 65 | 66 | You can also specify additional options, such as file extensions to include while processing the repository, the name for the Activeloop dataset, or the destination to clone the repository: 67 | 68 | ```bash 69 | python src/main.py process --repo-url https://github.com/username/repo_name --include-file-extensions .md .txt --activeloop-dataset-name my-dataset --repo-destination repos 70 | ``` 71 | 72 | To start the chat application using an existing dataset, use the `chat` subcommand: 73 | 74 | ```bash 75 | python src/main.py chat --activeloop-dataset-name my-dataset 76 | ``` 77 | 78 | The Streamlit chat app will run, and you can interact with the chatbot at `http://localhost:8501` (or the next available port) to ask questions about the repository. 79 | 80 | ## Sponsors 81 | 82 | ✨ Find profitable ideas faster: [Exploding Insights](https://explodinginsights.com/) 83 | 84 | ## License 85 | 86 | [MIT License](LICENSE) 87 | -------------------------------------------------------------------------------- /dev-requirements.txt: -------------------------------------------------------------------------------- 1 | black 2 | flake8 3 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 88 3 | target-version = ['py37', 'py38', 'py39'] 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | deeplake 2 | langchain 3 | openai 4 | pathspec 5 | python-dotenv 6 | streamlit 7 | streamlit_chat 8 | tiktoken 9 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterw/Chat-with-Github-Repo/446c41431cb985c6d99c429912e2989a16a9b882/src/__init__.py -------------------------------------------------------------------------------- /src/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | from dotenv import load_dotenv 5 | from streamlit.web import cli as stcli 6 | from utils.process import process 7 | 8 | # Load environment variables from a .env file (containing OPENAI_API_KEY) 9 | load_dotenv() 10 | 11 | 12 | def extract_repo_name(repo_url): 13 | """Extract the repository name from the given repository URL.""" 14 | repo_name = repo_url.split("/")[-1].replace(".git", "") 15 | return repo_name 16 | 17 | 18 | def process_repo(args): 19 | """ 20 | Process the git repository by cloning it, filtering files, and 21 | creating an Activeloop dataset with the contents. 22 | """ 23 | repo_name = extract_repo_name(args.repo_url) 24 | activeloop_username = os.environ.get("ACTIVELOOP_USERNAME") 25 | 26 | if not args.activeloop_dataset_name: 27 | args.activeloop_dataset_path = f"hub://{activeloop_username}/{repo_name}" 28 | else: 29 | args.activeloop_dataset_path = ( 30 | f"hub://{activeloop_username}/{args.activeloop_dataset_name}" 31 | ) 32 | 33 | process( 34 | args.repo_url, 35 | args.include_file_extensions, 36 | args.activeloop_dataset_path, 37 | args.repo_destination, 38 | ) 39 | 40 | 41 | def chat(args): 42 | """ 43 | Start the Streamlit chat application using the specified Activeloop dataset. 44 | """ 45 | activeloop_username = os.environ.get("ACTIVELOOP_USERNAME") 46 | 47 | args.activeloop_dataset_path = ( 48 | f"hub://{activeloop_username}/{args.activeloop_dataset_name}" 49 | ) 50 | 51 | sys.argv = [ 52 | "streamlit", 53 | "run", 54 | "src/utils/chat.py", 55 | "--", 56 | f"--activeloop_dataset_path={args.activeloop_dataset_path}", 57 | ] 58 | 59 | sys.exit(stcli.main()) 60 | 61 | 62 | def main(): 63 | """Define and parse CLI arguments, then execute the appropriate subcommand.""" 64 | parser = argparse.ArgumentParser(description="Chat with a git repository") 65 | subparsers = parser.add_subparsers(dest="command") 66 | 67 | # Process subcommand 68 | process_parser = subparsers.add_parser("process", help="Process a git repository") 69 | process_parser.add_argument( 70 | "--repo-url", required=True, help="The git repository URL" 71 | ) 72 | process_parser.add_argument( 73 | "--include-file-extensions", 74 | nargs="+", 75 | default=None, 76 | help=( 77 | "Exclude all files not matching these extensions. Example:" 78 | " --include-file-extensions .py .js .ts .html .css .md .txt" 79 | ), 80 | ) 81 | process_parser.add_argument( 82 | "--activeloop-dataset-name", 83 | help=( 84 | "The name for the Activeloop dataset. Defaults to the git repository name." 85 | ), 86 | ) 87 | process_parser.add_argument( 88 | "--repo-destination", 89 | default="repos", 90 | help="The destination to clone the repository. Defaults to 'repos'.", 91 | ) 92 | 93 | # Chat subcommand 94 | chat_parser = subparsers.add_parser("chat", help="Start the chat application") 95 | chat_parser.add_argument( 96 | "--activeloop-dataset-name", 97 | required=True, 98 | help="The name of one of your existing Activeloop datasets.", 99 | ) 100 | 101 | args = parser.parse_args() 102 | 103 | if args.command == "process": 104 | process_repo(args) 105 | elif args.command == "chat": 106 | chat(args) 107 | 108 | 109 | if __name__ == "__main__": 110 | main() 111 | -------------------------------------------------------------------------------- /src/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterw/Chat-with-Github-Repo/446c41431cb985c6d99c429912e2989a16a9b882/src/utils/__init__.py -------------------------------------------------------------------------------- /src/utils/chat.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from langchain.vectorstores import DeepLake 4 | from langchain.embeddings.openai import OpenAIEmbeddings 5 | from langchain.chains import RetrievalQA 6 | from langchain.chat_models import ChatOpenAI 7 | import openai 8 | import streamlit as st 9 | from streamlit_chat import message 10 | 11 | 12 | def run_chat_app(activeloop_dataset_path): 13 | """Run the chat application using the Streamlit framework.""" 14 | # Set the title for the Streamlit app 15 | st.title(f"{os.path.basename(activeloop_dataset_path)} GPT") 16 | 17 | # Set the OpenAI API key from the environment variable 18 | openai.api_key = os.environ.get("OPENAI_API_KEY") 19 | 20 | # Create an instance of OpenAIEmbeddings 21 | embeddings = OpenAIEmbeddings() 22 | 23 | # Create an instance of DeepLake with the specified dataset path and embeddings 24 | db = DeepLake( 25 | dataset_path=activeloop_dataset_path, 26 | read_only=True, 27 | embedding_function=embeddings, 28 | ) 29 | 30 | # Initialize the session state for generated responses and past inputs 31 | if "generated" not in st.session_state: 32 | st.session_state["generated"] = ["i am ready to help you ser"] 33 | 34 | if "past" not in st.session_state: 35 | st.session_state["past"] = ["hello"] 36 | 37 | # Get the user's input from the text input field 38 | user_input = get_text() 39 | 40 | # If there is user input, search for a response using the search_db function 41 | if user_input: 42 | output = search_db(db, user_input) 43 | st.session_state.past.append(user_input) 44 | st.session_state.generated.append(output) 45 | 46 | # If there are generated responses, display the conversation using Streamlit 47 | # messages 48 | if st.session_state["generated"]: 49 | for i in range(len(st.session_state["generated"])): 50 | message(st.session_state["past"][i], is_user=True, key=str(i) + "_user") 51 | message(st.session_state["generated"][i], key=str(i)) 52 | 53 | 54 | def generate_response(prompt): 55 | """ 56 | Generate a response using OpenAI's ChatCompletion API and the specified prompt. 57 | """ 58 | completion = openai.ChatCompletion.create( 59 | model="gpt-3.5-turbo", messages=[{"role": "user", "content": prompt}] 60 | ) 61 | response = completion.choices[0].message.content 62 | return response 63 | 64 | 65 | def get_text(): 66 | """Create a Streamlit input field and return the user's input.""" 67 | input_text = st.text_input("", key="input") 68 | return input_text 69 | 70 | 71 | def search_db(db, query): 72 | """Search for a response to the query in the DeepLake database.""" 73 | # Create a retriever from the DeepLake instance 74 | retriever = db.as_retriever() 75 | # Set the search parameters for the retriever 76 | retriever.search_kwargs["distance_metric"] = "cos" 77 | retriever.search_kwargs["fetch_k"] = 100 78 | retriever.search_kwargs["maximal_marginal_relevance"] = True 79 | retriever.search_kwargs["k"] = 10 80 | # Create a ChatOpenAI model instance 81 | model = ChatOpenAI(model="gpt-3.5-turbo") 82 | # Create a RetrievalQA instance from the model and retriever 83 | qa = RetrievalQA.from_llm(model, retriever=retriever) 84 | # Return the result of the query 85 | return qa.run(query) 86 | 87 | 88 | if __name__ == "__main__": 89 | parser = argparse.ArgumentParser() 90 | parser.add_argument("--activeloop_dataset_path", type=str, required=True) 91 | args = parser.parse_args() 92 | 93 | run_chat_app(args.activeloop_dataset_path) 94 | -------------------------------------------------------------------------------- /src/utils/process.py: -------------------------------------------------------------------------------- 1 | import deeplake 2 | import openai 3 | import os 4 | import pathspec 5 | import subprocess 6 | from langchain.document_loaders import TextLoader 7 | from langchain.embeddings.openai import OpenAIEmbeddings 8 | from langchain.text_splitter import CharacterTextSplitter 9 | from langchain.vectorstores import DeepLake 10 | 11 | # Set the OpenAI API key 12 | openai.api_key = os.environ.get("OPENAI_API_KEY") 13 | 14 | 15 | def clone_repository(repo_url, local_path): 16 | """Clone the specified git repository to the given local path.""" 17 | subprocess.run(["git", "clone", repo_url, local_path]) 18 | 19 | 20 | def load_docs(root_dir, file_extensions=None): 21 | """ 22 | Load documents from the specified root directory. 23 | Ignore dotfiles, dot directories, and files that match .gitignore rules. 24 | Optionally filter by file extensions. 25 | """ 26 | docs = [] 27 | 28 | # Load .gitignore rules 29 | gitignore_path = os.path.join(root_dir, ".gitignore") 30 | 31 | if os.path.isfile(gitignore_path): 32 | with open(gitignore_path, "r") as gitignore_file: 33 | gitignore = gitignore_file.read() 34 | spec = pathspec.PathSpec.from_lines( 35 | pathspec.patterns.GitWildMatchPattern, gitignore.splitlines() 36 | ) 37 | else: 38 | spec = None 39 | 40 | for dirpath, dirnames, filenames in os.walk(root_dir): 41 | # Remove dot directories from the list of directory names 42 | dirnames[:] = [d for d in dirnames if not d.startswith(".")] 43 | 44 | for file in filenames: 45 | file_path = os.path.join(dirpath, file) 46 | 47 | # Skip dotfiles 48 | if file.startswith("."): 49 | continue 50 | 51 | # Skip files that match .gitignore rules 52 | if spec and spec.match_file(file_path): 53 | continue 54 | 55 | if file_extensions and os.path.splitext(file)[1] not in file_extensions: 56 | continue 57 | 58 | try: 59 | loader = TextLoader(file_path, encoding="utf-8") 60 | docs.extend(loader.load_and_split()) 61 | except Exception: 62 | pass 63 | return docs 64 | 65 | 66 | def split_docs(docs): 67 | """Split the input documents into smaller chunks.""" 68 | text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0) 69 | return text_splitter.split_documents(docs) 70 | 71 | 72 | def create_deeplake_dataset(activeloop_dataset_path, activeloop_token): 73 | """Create an empty DeepLake dataset with the specified path and token.""" 74 | ds = deeplake.empty( 75 | activeloop_dataset_path, 76 | token=activeloop_token, 77 | overwrite=True, 78 | ) 79 | 80 | ds.create_tensor("ids") 81 | ds.create_tensor("metadata") 82 | ds.create_tensor("embedding") 83 | ds.create_tensor("text") 84 | 85 | 86 | def process( 87 | repo_url, include_file_extensions, activeloop_dataset_path, repo_destination 88 | ): 89 | """ 90 | Process a git repository by cloning it, filtering files, splitting documents, 91 | creating embeddings, and storing everything in a DeepLake dataset. 92 | """ 93 | activeloop_token = os.getenv("ACTIVELOOP_TOKEN") 94 | 95 | create_deeplake_dataset(activeloop_dataset_path, activeloop_token) 96 | 97 | clone_repository(repo_url, repo_destination) 98 | 99 | docs = load_docs(repo_destination, include_file_extensions) 100 | texts = split_docs(docs) 101 | 102 | embeddings = OpenAIEmbeddings() 103 | 104 | db = DeepLake(dataset_path=activeloop_dataset_path, embedding_function=embeddings) 105 | db.add_documents(texts) 106 | --------------------------------------------------------------------------------