├── .env.example
├── .flake8
├── .gitignore
├── .vscode
    ├── extensions.json
    └── settings.json
├── LICENSE
├── README.md
├── dev-requirements.txt
├── pyproject.toml
├── requirements.txt
└── src
    ├── __init__.py
    ├── main.py
    └── utils
        ├── __init__.py
        ├── chat.py
        └── process.py


/.env.example:
--------------------------------------------------------------------------------
1 | OPENAI_API_KEY=your_openai_api_key
2 | ACTIVELOOP_TOKEN=your_activeloop_api_token
3 | ACTIVELOOP_USERNAME=your_activeloop_username
4 | 


--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | ignore = E501, W503
3 | max-line-length = 88
4 | exclude = .git,__pycache__,build,dist
5 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | .env
3 | repos
4 | venv
5 | 


--------------------------------------------------------------------------------
/.vscode/extensions.json:
--------------------------------------------------------------------------------
1 | {
2 | 	"recommendations": ["ms-python.python", "ms-python.vscode-pylance"]
3 | }
4 | 


--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
 1 | {
 2 | 	"[python]": {
 3 | 		"editor.defaultFormatter": null,
 4 | 		"editor.formatOnSave": true,
 5 | 		"editor.insertSpaces": true,
 6 | 		"editor.tabSize": 4
 7 | 	},
 8 | 	"cSpell.words": [
 9 | 		"activeloop",
10 | 		"deeplake",
11 | 		"dotenv",
12 | 		"ipynb",
13 | 		"langchain",
14 | 		"openai",
15 | 		"pathspec",
16 | 		"stcli",
17 | 		"streamlit",
18 | 		"vectorstores"
19 | 	],
20 | 	"python.defaultInterpreterPath": "./venv/bin/python",
21 | 	"python.formatting.blackArgs": ["--preview", "--line-length=88"],
22 | 	"python.formatting.blackPath": "./venv/bin/black",
23 | 	"python.formatting.provider": "black",
24 | 	"python.linting.enabled": true,
25 | 	"python.linting.flake8Args": ["--max-line-length=88"],
26 | 	"python.linting.flake8Enabled": true,
27 | 	"python.linting.flake8Path": "./venv/bin/flake8",
28 | 	"python.linting.lintOnSave": true,
29 | 	"python.linting.pylintEnabled": false
30 | }
31 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 Peter
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Chat-with-Github-Repo
 2 | 
 3 | This repository contains Python scripts that demonstrate how to create a chatbot using Streamlit, OpenAI GPT-3.5-turbo, and Activeloop's Deep Lake.
 4 | 
 5 | The chatbot searches a dataset stored in Deep Lake to find relevant information from any Git repository and generates responses based on the user's input.
 6 | 
 7 | ## Files
 8 | 
 9 | - `src/utils/process.py`: This script clones a Git repository, processes the text documents, computes embeddings using OpenAIEmbeddings, and stores the embeddings in a DeepLake instance.
10 | 
11 | - `src/utils/chat.py`: This script creates a Streamlit web application that interacts with the user and the DeepLake instance to generate chatbot responses using OpenAI GPT-3.5-turbo.
12 | 
13 | - `src/main.py`: This script contains the command line interface (CLI) that allows you to run the chatbot application.
14 | 
15 | ## Setup
16 | 
17 | Before getting started, be sure to sign up for an [Activeloop](https://www.activeloop.ai/) and [OpenAI](https://openai.com/) account and create API keys.
18 | 
19 | To set up and run this project, follow these steps:
20 | 
21 | 1. Clone the repository and navigate to the project directory:
22 | 
23 | ```bash
24 | git clone https://github.com/peterw/Chat-with-Git-Repo.git
25 | cd Chat-with-Git-Repo
26 | ```
27 | 
28 | 2. Install the required packages with `pip`:
29 | 
30 | ```bash
31 | pip install -r requirements.txt
32 | ```
33 | 
34 | For development dependencies, you can install them using the following command:
35 | 
36 | ```bash
37 | pip install -r dev-requirements.txt
38 | ```
39 | 
40 | 3. Set the environment variables:
41 | 
42 | Copy the `.env.example` file:
43 | 
44 | ```bash
45 | cp .env.example .env
46 | ```
47 | 
48 | Provide your API keys and username:
49 | 
50 | ```
51 | OPENAI_API_KEY=your_openai_api_key
52 | ACTIVELOOP_TOKEN=your_activeloop_api_token
53 | ACTIVELOOP_USERNAME=your_activeloop_username
54 | ```
55 | 
56 | 4. Use the CLI to run the chatbot application. You can either process a Git repository or start the chat application using an existing dataset.
57 | 
58 | > For complete CLI instructions run `python src/main.py --help`
59 | 
60 | To process a Git repository, use the `process` subcommand:
61 | 
62 | ```bash
63 | python src/main.py process --repo-url https://github.com/username/repo_name
64 | ```
65 | 
66 | You can also specify additional options, such as file extensions to include while processing the repository, the name for the Activeloop dataset, or the destination to clone the repository:
67 | 
68 | ```bash
69 | python src/main.py process --repo-url https://github.com/username/repo_name --include-file-extensions .md .txt --activeloop-dataset-name my-dataset --repo-destination repos
70 | ```
71 | 
72 | To start the chat application using an existing dataset, use the `chat` subcommand:
73 | 
74 | ```bash
75 | python src/main.py chat --activeloop-dataset-name my-dataset
76 | ```
77 | 
78 | The Streamlit chat app will run, and you can interact with the chatbot at `http://localhost:8501` (or the next available port) to ask questions about the repository.
79 | 
80 | ## Sponsors
81 | 
82 | ✨ Find profitable ideas faster: [Exploding Insights](https://explodinginsights.com/)
83 | 
84 | ## License
85 | 
86 | [MIT License](LICENSE)
87 | 


--------------------------------------------------------------------------------
/dev-requirements.txt:
--------------------------------------------------------------------------------
1 | black
2 | flake8
3 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.black]
2 | line-length = 88
3 | target-version = ['py37', 'py38', 'py39']
4 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | deeplake
2 | langchain
3 | openai
4 | pathspec
5 | python-dotenv
6 | streamlit
7 | streamlit_chat
8 | tiktoken
9 | 


--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peterw/Chat-with-Github-Repo/446c41431cb985c6d99c429912e2989a16a9b882/src/__init__.py


--------------------------------------------------------------------------------
/src/main.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import os
  3 | import sys
  4 | from dotenv import load_dotenv
  5 | from streamlit.web import cli as stcli
  6 | from utils.process import process
  7 | 
  8 | # Load environment variables from a .env file (containing OPENAI_API_KEY)
  9 | load_dotenv()
 10 | 
 11 | 
 12 | def extract_repo_name(repo_url):
 13 |     """Extract the repository name from the given repository URL."""
 14 |     repo_name = repo_url.split("/")[-1].replace(".git", "")
 15 |     return repo_name
 16 | 
 17 | 
 18 | def process_repo(args):
 19 |     """
 20 |     Process the git repository by cloning it, filtering files, and
 21 |     creating an Activeloop dataset with the contents.
 22 |     """
 23 |     repo_name = extract_repo_name(args.repo_url)
 24 |     activeloop_username = os.environ.get("ACTIVELOOP_USERNAME")
 25 | 
 26 |     if not args.activeloop_dataset_name:
 27 |         args.activeloop_dataset_path = f"hub://{activeloop_username}/{repo_name}"
 28 |     else:
 29 |         args.activeloop_dataset_path = (
 30 |             f"hub://{activeloop_username}/{args.activeloop_dataset_name}"
 31 |         )
 32 | 
 33 |     process(
 34 |         args.repo_url,
 35 |         args.include_file_extensions,
 36 |         args.activeloop_dataset_path,
 37 |         args.repo_destination,
 38 |     )
 39 | 
 40 | 
 41 | def chat(args):
 42 |     """
 43 |     Start the Streamlit chat application using the specified Activeloop dataset.
 44 |     """
 45 |     activeloop_username = os.environ.get("ACTIVELOOP_USERNAME")
 46 | 
 47 |     args.activeloop_dataset_path = (
 48 |         f"hub://{activeloop_username}/{args.activeloop_dataset_name}"
 49 |     )
 50 | 
 51 |     sys.argv = [
 52 |         "streamlit",
 53 |         "run",
 54 |         "src/utils/chat.py",
 55 |         "--",
 56 |         f"--activeloop_dataset_path={args.activeloop_dataset_path}",
 57 |     ]
 58 | 
 59 |     sys.exit(stcli.main())
 60 | 
 61 | 
 62 | def main():
 63 |     """Define and parse CLI arguments, then execute the appropriate subcommand."""
 64 |     parser = argparse.ArgumentParser(description="Chat with a git repository")
 65 |     subparsers = parser.add_subparsers(dest="command")
 66 | 
 67 |     # Process subcommand
 68 |     process_parser = subparsers.add_parser("process", help="Process a git repository")
 69 |     process_parser.add_argument(
 70 |         "--repo-url", required=True, help="The git repository URL"
 71 |     )
 72 |     process_parser.add_argument(
 73 |         "--include-file-extensions",
 74 |         nargs="+",
 75 |         default=None,
 76 |         help=(
 77 |             "Exclude all files not matching these extensions. Example:"
 78 |             " --include-file-extensions .py .js .ts .html .css .md .txt"
 79 |         ),
 80 |     )
 81 |     process_parser.add_argument(
 82 |         "--activeloop-dataset-name",
 83 |         help=(
 84 |             "The name for the Activeloop dataset. Defaults to the git repository name."
 85 |         ),
 86 |     )
 87 |     process_parser.add_argument(
 88 |         "--repo-destination",
 89 |         default="repos",
 90 |         help="The destination to clone the repository. Defaults to 'repos'.",
 91 |     )
 92 | 
 93 |     # Chat subcommand
 94 |     chat_parser = subparsers.add_parser("chat", help="Start the chat application")
 95 |     chat_parser.add_argument(
 96 |         "--activeloop-dataset-name",
 97 |         required=True,
 98 |         help="The name of one of your existing Activeloop datasets.",
 99 |     )
100 | 
101 |     args = parser.parse_args()
102 | 
103 |     if args.command == "process":
104 |         process_repo(args)
105 |     elif args.command == "chat":
106 |         chat(args)
107 | 
108 | 
109 | if __name__ == "__main__":
110 |     main()
111 | 


--------------------------------------------------------------------------------
/src/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peterw/Chat-with-Github-Repo/446c41431cb985c6d99c429912e2989a16a9b882/src/utils/__init__.py


--------------------------------------------------------------------------------
/src/utils/chat.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import os
 3 | from langchain.vectorstores import DeepLake
 4 | from langchain.embeddings.openai import OpenAIEmbeddings
 5 | from langchain.chains import RetrievalQA
 6 | from langchain.chat_models import ChatOpenAI
 7 | import openai
 8 | import streamlit as st
 9 | from streamlit_chat import message
10 | 
11 | 
12 | def run_chat_app(activeloop_dataset_path):
13 |     """Run the chat application using the Streamlit framework."""
14 |     # Set the title for the Streamlit app
15 |     st.title(f"{os.path.basename(activeloop_dataset_path)} GPT")
16 | 
17 |     # Set the OpenAI API key from the environment variable
18 |     openai.api_key = os.environ.get("OPENAI_API_KEY")
19 | 
20 |     # Create an instance of OpenAIEmbeddings
21 |     embeddings = OpenAIEmbeddings()
22 | 
23 |     # Create an instance of DeepLake with the specified dataset path and embeddings
24 |     db = DeepLake(
25 |         dataset_path=activeloop_dataset_path,
26 |         read_only=True,
27 |         embedding_function=embeddings,
28 |     )
29 | 
30 |     # Initialize the session state for generated responses and past inputs
31 |     if "generated" not in st.session_state:
32 |         st.session_state["generated"] = ["i am ready to help you ser"]
33 | 
34 |     if "past" not in st.session_state:
35 |         st.session_state["past"] = ["hello"]
36 | 
37 |     # Get the user's input from the text input field
38 |     user_input = get_text()
39 | 
40 |     # If there is user input, search for a response using the search_db function
41 |     if user_input:
42 |         output = search_db(db, user_input)
43 |         st.session_state.past.append(user_input)
44 |         st.session_state.generated.append(output)
45 | 
46 |     # If there are generated responses, display the conversation using Streamlit
47 |     # messages
48 |     if st.session_state["generated"]:
49 |         for i in range(len(st.session_state["generated"])):
50 |             message(st.session_state["past"][i], is_user=True, key=str(i) + "_user")
51 |             message(st.session_state["generated"][i], key=str(i))
52 | 
53 | 
54 | def generate_response(prompt):
55 |     """
56 |     Generate a response using OpenAI's ChatCompletion API and the specified prompt.
57 |     """
58 |     completion = openai.ChatCompletion.create(
59 |         model="gpt-3.5-turbo", messages=[{"role": "user", "content": prompt}]
60 |     )
61 |     response = completion.choices[0].message.content
62 |     return response
63 | 
64 | 
65 | def get_text():
66 |     """Create a Streamlit input field and return the user's input."""
67 |     input_text = st.text_input("", key="input")
68 |     return input_text
69 | 
70 | 
71 | def search_db(db, query):
72 |     """Search for a response to the query in the DeepLake database."""
73 |     # Create a retriever from the DeepLake instance
74 |     retriever = db.as_retriever()
75 |     # Set the search parameters for the retriever
76 |     retriever.search_kwargs["distance_metric"] = "cos"
77 |     retriever.search_kwargs["fetch_k"] = 100
78 |     retriever.search_kwargs["maximal_marginal_relevance"] = True
79 |     retriever.search_kwargs["k"] = 10
80 |     # Create a ChatOpenAI model instance
81 |     model = ChatOpenAI(model="gpt-3.5-turbo")
82 |     # Create a RetrievalQA instance from the model and retriever
83 |     qa = RetrievalQA.from_llm(model, retriever=retriever)
84 |     # Return the result of the query
85 |     return qa.run(query)
86 | 
87 | 
88 | if __name__ == "__main__":
89 |     parser = argparse.ArgumentParser()
90 |     parser.add_argument("--activeloop_dataset_path", type=str, required=True)
91 |     args = parser.parse_args()
92 | 
93 |     run_chat_app(args.activeloop_dataset_path)
94 | 


--------------------------------------------------------------------------------
/src/utils/process.py:
--------------------------------------------------------------------------------
  1 | import deeplake
  2 | import openai
  3 | import os
  4 | import pathspec
  5 | import subprocess
  6 | from langchain.document_loaders import TextLoader
  7 | from langchain.embeddings.openai import OpenAIEmbeddings
  8 | from langchain.text_splitter import CharacterTextSplitter
  9 | from langchain.vectorstores import DeepLake
 10 | 
 11 | # Set the OpenAI API key
 12 | openai.api_key = os.environ.get("OPENAI_API_KEY")
 13 | 
 14 | 
 15 | def clone_repository(repo_url, local_path):
 16 |     """Clone the specified git repository to the given local path."""
 17 |     subprocess.run(["git", "clone", repo_url, local_path])
 18 | 
 19 | 
 20 | def load_docs(root_dir, file_extensions=None):
 21 |     """
 22 |     Load documents from the specified root directory.
 23 |     Ignore dotfiles, dot directories, and files that match .gitignore rules.
 24 |     Optionally filter by file extensions.
 25 |     """
 26 |     docs = []
 27 | 
 28 |     # Load .gitignore rules
 29 |     gitignore_path = os.path.join(root_dir, ".gitignore")
 30 | 
 31 |     if os.path.isfile(gitignore_path):
 32 |         with open(gitignore_path, "r") as gitignore_file:
 33 |             gitignore = gitignore_file.read()
 34 |         spec = pathspec.PathSpec.from_lines(
 35 |             pathspec.patterns.GitWildMatchPattern, gitignore.splitlines()
 36 |         )
 37 |     else:
 38 |         spec = None
 39 | 
 40 |     for dirpath, dirnames, filenames in os.walk(root_dir):
 41 |         # Remove dot directories from the list of directory names
 42 |         dirnames[:] = [d for d in dirnames if not d.startswith(".")]
 43 | 
 44 |         for file in filenames:
 45 |             file_path = os.path.join(dirpath, file)
 46 | 
 47 |             # Skip dotfiles
 48 |             if file.startswith("."):
 49 |                 continue
 50 | 
 51 |             # Skip files that match .gitignore rules
 52 |             if spec and spec.match_file(file_path):
 53 |                 continue
 54 | 
 55 |             if file_extensions and os.path.splitext(file)[1] not in file_extensions:
 56 |                 continue
 57 | 
 58 |             try:
 59 |                 loader = TextLoader(file_path, encoding="utf-8")
 60 |                 docs.extend(loader.load_and_split())
 61 |             except Exception:
 62 |                 pass
 63 |     return docs
 64 | 
 65 | 
 66 | def split_docs(docs):
 67 |     """Split the input documents into smaller chunks."""
 68 |     text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
 69 |     return text_splitter.split_documents(docs)
 70 | 
 71 | 
 72 | def create_deeplake_dataset(activeloop_dataset_path, activeloop_token):
 73 |     """Create an empty DeepLake dataset with the specified path and token."""
 74 |     ds = deeplake.empty(
 75 |         activeloop_dataset_path,
 76 |         token=activeloop_token,
 77 |         overwrite=True,
 78 |     )
 79 | 
 80 |     ds.create_tensor("ids")
 81 |     ds.create_tensor("metadata")
 82 |     ds.create_tensor("embedding")
 83 |     ds.create_tensor("text")
 84 | 
 85 | 
 86 | def process(
 87 |     repo_url, include_file_extensions, activeloop_dataset_path, repo_destination
 88 | ):
 89 |     """
 90 |     Process a git repository by cloning it, filtering files, splitting documents,
 91 |     creating embeddings, and storing everything in a DeepLake dataset.
 92 |     """
 93 |     activeloop_token = os.getenv("ACTIVELOOP_TOKEN")
 94 | 
 95 |     create_deeplake_dataset(activeloop_dataset_path, activeloop_token)
 96 | 
 97 |     clone_repository(repo_url, repo_destination)
 98 | 
 99 |     docs = load_docs(repo_destination, include_file_extensions)
100 |     texts = split_docs(docs)
101 | 
102 |     embeddings = OpenAIEmbeddings()
103 | 
104 |     db = DeepLake(dataset_path=activeloop_dataset_path, embedding_function=embeddings)
105 |     db.add_documents(texts)
106 | 


--------------------------------------------------------------------------------