├── .env-template
├── .gitattributes
├── examples
├── count.png
├── logo.png
├── repos.png
├── example.gif
├── shadcn.png
├── langgraph.png
├── llamaindex.png
└── selection.png
├── requirements.txt
├── config.py
├── config.yaml
├── token_count.py
├── LICENSE
├── useful_tool
└── metadata_extract.py
├── llm_service.py
├── .gitignore
├── README.md
├── app.py
└── repo_service.py
/.env-template:
--------------------------------------------------------------------------------
1 | OPENROUTER_API_KEY=insert_api_key_here
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
--------------------------------------------------------------------------------
/examples/count.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jw782cn/RepoChat-200k/HEAD/examples/count.png
--------------------------------------------------------------------------------
/examples/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jw782cn/RepoChat-200k/HEAD/examples/logo.png
--------------------------------------------------------------------------------
/examples/repos.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jw782cn/RepoChat-200k/HEAD/examples/repos.png
--------------------------------------------------------------------------------
/examples/example.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jw782cn/RepoChat-200k/HEAD/examples/example.gif
--------------------------------------------------------------------------------
/examples/shadcn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jw782cn/RepoChat-200k/HEAD/examples/shadcn.png
--------------------------------------------------------------------------------
/examples/langgraph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jw782cn/RepoChat-200k/HEAD/examples/langgraph.png
--------------------------------------------------------------------------------
/examples/llamaindex.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jw782cn/RepoChat-200k/HEAD/examples/llamaindex.png
--------------------------------------------------------------------------------
/examples/selection.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jw782cn/RepoChat-200k/HEAD/examples/selection.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | openai~=1.14.2
2 | pandas~=2.2.1
3 | Pygments~=2.17.2
4 | streamlit~=1.32.2
5 | tiktoken~=0.6.0
6 | nbformat~=5.10.3
7 | PyYAML~=6.0.1
8 | GitPython~=3.1.42
9 | python-dotenv~=1.0.1
10 | loguru~=0.7.2
11 | send2trash
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
1 | # config.py
2 | import yaml
3 | from dotenv import load_dotenv
4 |
5 | def load_config(config_file="config.yaml"):
6 | with open(config_file, "r") as f:
7 | config = yaml.safe_load(f)
8 | return config
9 |
10 | Config = load_config()
11 |
12 | load_dotenv()
--------------------------------------------------------------------------------
/config.yaml:
--------------------------------------------------------------------------------
1 | # config.yaml
2 | repos_dir: "./repos" # Directory to store the downloaded repositories
3 |
4 | # logging
5 | log_level: "INFO" # Log level
6 | log_file: "repo_stats.log" # Log file
7 |
8 | # download method
9 | download_method: "auto" # Download method: auto (try git, then http) / git / http
10 |
--------------------------------------------------------------------------------
/token_count.py:
--------------------------------------------------------------------------------
1 | import tiktoken
2 |
3 | def num_tokens_from_string(string: str, model="gpt-3.5-turbo-0613") -> int:
4 | """Returns the number of tokens in a text string based on the specified model's encoding."""
5 | try:
6 | encoding = tiktoken.encoding_for_model(model) # Attempt to get encoding for the specified model
7 | except KeyError:
8 | print("Warning: model not found. Using cl100k_base encoding.")
9 | encoding = tiktoken.get_encoding("cl100k_base") # Fallback encoding if model's encoding not found
10 |
11 | num_tokens = len(encoding.encode(string, disallowed_special=())) # Calculate number of tokens based on encoding
12 | return num_tokens
13 |
14 | def num_messages(messages: dict, model="gpt-3.5-turbo-0613") -> int:
15 | """Returns the number of tokens in a chat message based on the specified model's encoding."""
16 | num_tokens = 0
17 | for msg in messages:
18 | num_tokens += num_tokens_from_string(msg["content"], model=model)
19 | return num_tokens
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 chty627
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/useful_tool/metadata_extract.py:
--------------------------------------------------------------------------------
1 | """
2 | Extract metadata from a file and summarize it into xml format.
3 | NOTE: This file is not used in the project. You can use this as a reference to create your own meta extraction.
4 | """
5 |
6 | # from langchain_core.output_parsers import StrOutputParser, XMLOutputParser, JsonOutputParser
7 | from langchain_core.prompts import ChatPromptTemplate
8 | from typing import Optional
9 | from langchain_openai import ChatOpenAI
10 | from langchain_core.messages import AIMessage
11 | import re
12 |
13 | class ChatOpenRouter(ChatOpenAI):
14 | openai_api_base: str
15 | openai_api_key: str
16 | model_name: str
17 |
18 | def __init__(self,
19 | model_name: str,
20 | openai_api_key: Optional[str] = None,
21 | openai_api_base: str = "https://openrouter.ai/api/v1",
22 | **kwargs):
23 | openai_api_key = openai_api_key or os.getenv('OPENROUTER_API_KEY')
24 | super().__init__(openai_api_base=openai_api_base,
25 | openai_api_key=openai_api_key,
26 | model_name=model_name, **kwargs)
27 |
28 |
29 | template = """\
30 | ===
31 | Langgraph README.md:
32 | {readme_content}
33 | ===
34 | File Content:
35 | {file_content}
36 | ===
37 | Summarize the above file "{file_name}" into xml with attributes of description and graph abstract
38 |
39 | description: describe the file in a few words, concisely;
40 | graph: use langgraph syntax to describe the high level structure of the graph mentioned in the file, only output python code of the graph.
41 |
42 | Please enclose the output in xml format. Only output xml. Do not output any prefix or suffix.
43 |
44 | Output Format:
45 | insert_description_here
46 | insert_graph_abstract_here\
47 | """
48 | prompt = ChatPromptTemplate.from_template(template)
49 | print(prompt)
50 |
51 |
52 | def custom_parse(ai_message: AIMessage) -> str:
53 | """Parse the AI message."""
54 | content = ai_message.content
55 |
56 | # create a dictionary to store the parsed data
57 | data = {}
58 |
59 | # use regular expressions to extract elements and their content
60 | pattern = r"<(\w+)>(.*?)</\1>"
61 | matches = re.findall(pattern, content, re.DOTALL)
62 |
63 | for match in matches:
64 | tag, text = match
65 | data[tag] = text.strip()
66 |
67 | return data
68 |
69 | llm = ChatOpenRouter(
70 | model_name="anthropic/claude-3-haiku",
71 | temperature=0.7,
72 | )
73 |
74 | llm_chain = (
75 | prompt
76 | | llm
77 | | custom_parse
78 | )
--------------------------------------------------------------------------------
/llm_service.py:
--------------------------------------------------------------------------------
1 | import os
2 | from enum import Enum
3 | from loguru import logger
4 | from openai import OpenAI
5 |
6 | class ProviderType(str, Enum):
7 | OPENAI = "OPENAI"
8 | OPENROUTER = "OPENROUTER"
9 |
10 |
11 | BASE_URL_MAP = {
12 | ProviderType.OPENAI: "https://api.openai.com/v1",
13 | ProviderType.OPENROUTER: "https://openrouter.ai/api/v1",
14 | }
15 | MODEL_MAP = {
16 | ProviderType.OPENAI: [
17 | "gpt-4-1106-preview",
18 | "gpt-3.5-turbo-16k",
19 | ],
20 | ProviderType.OPENROUTER: [
21 | "anthropic/claude-3-haiku",
22 | "anthropic/claude-3-haiku:beta",
23 | "anthropic/claude-3-opus",
24 | "anthropic/claude-3-sonnet",
25 | "anthropic/claude-3-sonnet:beta",
26 | "anthropic/claude-3-opus:beta",
27 | ],
28 | }
29 | MODELS = [*MODEL_MAP[ProviderType.OPENROUTER], *MODEL_MAP[ProviderType.OPENAI]]
30 |
31 |
32 | def get_base_url(selected_model: str) -> str:
33 | """Get the base url for the selected model.
34 | Args:
35 | selected_model(str): selected model
36 |
37 | Returns:
38 | str: base url for the selected model
39 | """
40 | if selected_model in MODEL_MAP[ProviderType.OPENAI]:
41 | return BASE_URL_MAP[ProviderType.OPENAI]
42 | elif selected_model in MODEL_MAP[ProviderType.OPENROUTER]:
43 | return BASE_URL_MAP[ProviderType.OPENROUTER]
44 | else:
45 | raise ValueError(f"Model {selected_model} not found.")
46 |
47 |
48 | def get_api_key(selected_model: str) -> str:
49 | """Get the api key for the selected model.
50 | Args:
51 | selected_model(str): selected model
52 |
53 | Returns:
54 | str: api key for the selected model
55 | """
56 | if selected_model in MODEL_MAP[ProviderType.OPENAI]:
57 | return os.getenv("OPENAI_API_KEY")
58 | elif selected_model in MODEL_MAP[ProviderType.OPENROUTER]:
59 | return os.getenv("OPENROUTER_API_KEY")
60 | else:
61 | raise ValueError(f"Model {selected_model} not found.")
62 |
63 |
64 | class ChatClient:
65 | def __init__(self, base_url: str, api_key: str):
66 | logger.info(
67 | f"Initializing ChatClient, base_url: {base_url} and api_key: {api_key[:5]}..."
68 | )
69 | self.client = OpenAI(base_url=base_url, api_key=api_key)
70 |
71 | def chat(
72 | self, messages, model="anthropic/claude-3-opus", temperature=0.7, stream=True
73 | ):
74 | return self.client.chat.completions.create(
75 | model=model, messages=messages, stream=stream, temperature=temperature
76 | )
77 |
78 |
79 | def create_client_for_model(selected_model: str):
80 | """Create a client for the selected model.
81 | Args:
82 | selected_model(str): selected model
83 |
84 | Returns:
85 | ChatClient: ChatClient for the selected model
86 | """
87 | base_url = get_base_url(selected_model)
88 | api_key = get_api_key(selected_model)
89 |
90 | if api_key is None:
91 | raise ValueError(f"API Key not found for model: {selected_model}")
92 |
93 | return ChatClient(base_url, api_key)
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | repos/
2 | test.ipynb
3 |
4 | # Byte-compiled / optimized / DLL files
5 | __pycache__/
6 | *.py[cod]
7 | *$py.class
8 |
9 | # C extensions
10 | *.so
11 |
12 | # Distribution / packaging
13 | .Python
14 | build/
15 | develop-eggs/
16 | dist/
17 | downloads/
18 | eggs/
19 | .eggs/
20 | lib/
21 | lib64/
22 | parts/
23 | sdist/
24 | var/
25 | wheels/
26 | share/python-wheels/
27 | *.egg-info/
28 | .installed.cfg
29 | *.egg
30 | MANIFEST
31 |
32 | # PyInstaller
33 | # Usually these files are written by a python script from a template
34 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
35 | *.manifest
36 | *.spec
37 |
38 | # Installer logs
39 | pip-log.txt
40 | pip-delete-this-directory.txt
41 |
42 | # Unit test / coverage reports
43 | htmlcov/
44 | .tox/
45 | .nox/
46 | .coverage
47 | .coverage.*
48 | .cache
49 | nosetests.xml
50 | coverage.xml
51 | *.cover
52 | *.py,cover
53 | .hypothesis/
54 | .pytest_cache/
55 | cover/
56 |
57 | # Translations
58 | *.mo
59 | *.pot
60 |
61 | # Django stuff:
62 | *.log
63 | local_settings.py
64 | db.sqlite3
65 | db.sqlite3-journal
66 |
67 | # Flask stuff:
68 | instance/
69 | .webassets-cache
70 |
71 | # Scrapy stuff:
72 | .scrapy
73 |
74 | # Sphinx documentation
75 | docs/_build/
76 |
77 | # PyBuilder
78 | .pybuilder/
79 | target/
80 |
81 | # Jupyter Notebook
82 | .ipynb_checkpoints
83 |
84 | # IPython
85 | profile_default/
86 | ipython_config.py
87 |
88 | # pyenv
89 | # For a library or package, you might want to ignore these files since the code is
90 | # intended to run in multiple environments; otherwise, check them in:
91 | # .python-version
92 |
93 | # pipenv
94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
97 | # install all needed dependencies.
98 | #Pipfile.lock
99 |
100 | # poetry
101 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
102 | # This is especially recommended for binary packages to ensure reproducibility, and is more
103 | # commonly ignored for libraries.
104 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
105 | #poetry.lock
106 |
107 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
108 | __pypackages__/
109 |
110 | # Celery stuff
111 | celerybeat-schedule
112 | celerybeat.pid
113 |
114 | # SageMath parsed files
115 | *.sage.py
116 |
117 | # Environments
118 | .env
119 | .venv
120 | env/
121 | venv/
122 | ENV/
123 | env.bak/
124 | venv.bak/
125 |
126 | # Spyder project settings
127 | .spyderproject
128 | .spyproject
129 |
130 | # Rope project settings
131 | .ropeproject
132 |
133 | # mkdocs documentation
134 | /site
135 |
136 | # mypy
137 | .mypy_cache/
138 | .dmypy.json
139 | dmypy.json
140 |
141 | # Pyre type checker
142 | .pyre/
143 |
144 | # pytype static type analyzer
145 | .pytype/
146 |
147 | # Cython debug symbols
148 | cython_debug/
149 |
150 | # PyCharm
151 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
152 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
153 | # and can be added to the global gitignore or merged into this file. For a more nuclear
154 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
155 | .idea/
156 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # RepoChat in 200k - No RAG!
2 |
3 | ⚡Chat with GitHub Repo Using 200k context window of Claude instead of RAG!⚡
4 |
5 |
6 |
7 |
8 | ## Why RepoChat - No RAG?
9 |
10 | Take advantage of Claude's 200k context window! Put all the examples and code into the context!
11 |
12 | **We need copilot rather than agent sometimes!**
13 |
14 | ### 1. Not Just Chat — Write Code with the Latest Documents
15 |
16 | Having trouble memorizing all the APIs in llama-index or langchain?
17 |
18 | No worries, just include the components examples and the documents from the repo and let Claude Opus - the strongest model and long context window of 200k to write your agent for you!
19 |
20 | ### 2. No copy paste, Just Select
21 |
22 |
23 |
24 | Download/Clone your Repo from Github then just select the files you'd like, I got you covered on constructing the prompt.
25 |
26 | ### 3. No RAG, You decide
27 |
28 | I've seen many Chat with Repo projects, and they all share the same pain point:
29 |
30 | `Which files do this query need?`
31 |
32 | They use **embedding search** over a code database, but most of the time I already know which documents I'm referring to... So make your own choice each time you are coding.
33 |
34 | - Coding Frontend? Just select components and examples.
35 |
36 | - Coding Agents? Just select Jupyter Notebook of langgraph.
37 |
38 | - Coding RAG? Just select Jupyter Notebook of llamaindex.
39 |
40 |
41 |
42 |
43 | ### 4. More Use Cases!
44 |
45 | #### Use llamaindex doc
46 |
47 | select llamaindex example of pipeline to write RAG graph.
48 |
49 |
50 |
51 | #### Use Shadcn doc to write frontend
52 |
53 | select examples and components definition.
54 |
55 |
56 |
57 |
58 | ## Suggestions!
59 |
60 | 1. You can use Haiku for most of the case.
61 | 2. Change models based on tasks.
62 | 3. Change files based on tasks.
63 | 4. Clone Repos You like!
64 | 5. Usually I will include the repo's README to help Claude understand the project better
65 | 6. USE `COUNT TOKENS` on the sidebar to see how many tokens you will send!!!
66 |
67 |
68 |
69 | ## Features
70 |
71 | 1. **Repository Download**: Users can provide a GitHub repository URL, and the application will automatically download and analyze the repository.
72 | 2. **File and Folder Selection**: Users can select specific files or folders from the repository to include in the LLM's input.
73 | 3. **Language Filtering**: Users can filter the files by programming language to focus the LLM's understanding on specific parts of the codebase.
74 | 4. **Token Limit**: Users can set a token limit to control the amount of information sent to the LLM, which can be useful for performance or cost considerations.
75 | 5. **Chat Interface**: Users can interact with the LLM through a chat-style interface, allowing them to ask questions or request code generation based on the repository contents.
76 | 6. **Streaming Output**: The LLM's responses are displayed in a streaming fashion, providing a more engaging and real-time user experience.
77 |
78 | Currently only OpenRouter is fully supported. Planning to add more providers and refactor someday.
79 |
80 | ## Get Started
81 |
82 | 1. **Environment Settings**: Run `pip install -r requirements.txt` to set up environment.
83 |
84 | 2. **Create a .env file**: Create a `.env` file in the root directory of the project and add your OpenRouter API key (Recommended):
85 | ```bash
86 | OPENROUTER_API_KEY=your_openrouter_api_key_here
87 | ```
88 | I recommend [OpenRouter](https://openrouter.ai/) because it has all models!
89 |
90 | If you want to use OpenAI GPT models, add your `openai api key` as well.
91 |
92 | ```bash
93 | OPENAI_API_KEY=your_openai_api_key_here
94 | ```
95 |
96 | 3. **Run the application**: Run the `app.py` script using Streamlit:
97 | ```bash
98 | streamlit run app.py
99 | ```
100 | 4. **Use the application**: Follow the instructions in the application to download a GitHub repository, select files and folders, and chat with the LLM.
101 |
102 | If you encounter issues with a repo, you can always delete its directory under ./repos and download it again.
103 |
104 | ## Configuration
105 |
106 | The application's behavior can be customized through the following configuration options:
107 |
108 | - **Model**: The specific LLM model to use (e.g., "anthropic/claude-3-haiku", "anthropic/claude-3-opus").
109 | - **Temperature**: The temperature parameter that controls the "creativity" of the LLM's responses.
110 | - **System Prompt**: The initial prompt given to the LLM to set the desired behavior.
111 |
112 | These settings can be adjusted in the sidebar of the Streamlit application.
113 |
114 | ## Contributing
115 |
116 | If you'd like to contribute to the RepoChat-200k project, please feel free to submit issues or pull requests on the [GitHub repository](https://github.com/jw782cn/RepoChat-200k).
117 |
118 | ## License
119 |
120 | This project is licensed under the [MIT License](LICENSE).
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 | import streamlit as st
4 | from loguru import logger
5 | from openai import OpenAI
6 | from token_count import num_messages, num_tokens_from_string
7 | from llm_service import MODELS, create_client_for_model
8 | from repo_service import RepoManager
9 |
10 |
11 | class StreamHandler:
12 | def __init__(self, container, initial_text=""):
13 | self.container = container
14 | self.text = initial_text
15 |
16 | def process_token(self, token: str):
17 | self.text += token
18 | self.container.markdown(self.text)
19 |
20 | def refresh_repos():
21 | logger.info("Refreshing repositories")
22 | if 'repoManager' not in st.session_state:
23 | st.session_state['repoManager'] = RepoManager()
24 | st.session_state['repoManager'].load_repos()
25 | st.success("Refreshed repositories")
26 |
27 | def create_app():
28 | st.set_page_config(page_title="ChatWithRepo", page_icon="🤖")
29 |
30 | if 'repoManager' not in st.session_state:
31 | st.session_state['repoManager'] = RepoManager()
32 | if "messages" not in st.session_state:
33 | st.session_state["messages"] = []
34 |
35 | repoManager: RepoManager = st.session_state['repoManager']
36 | with st.sidebar:
37 | st.title("Settings for Repo")
38 | custom_repo_url = st.text_input("Custom Repository URL")
39 | col1, col2 = st.columns(2)
40 | with col1:
41 | if st.button("Add Custom Repository"):
42 | if repoManager.add_repo(custom_repo_url):
43 | st.success(f"Added custom repository: {custom_repo_url}")
44 | else:
45 | st.error(f"Repository add failed: {custom_repo_url}")
46 | repo_url = custom_repo_url
47 | with col2:
48 | if st.button("Refresh Repositories"):
49 | refresh_repos()
50 |
51 | repo_url = st.selectbox(
52 | "Repository URL", options=repoManager.get_repo_urls())
53 | if repoManager.check_if_repo_exists(repo_url):
54 | repo = repoManager.get_repo_service(repo_url)
55 | selected_folder = st.multiselect(
56 | "Select Folder", options=repo.get_folders_options())
57 | selected_files = st.multiselect(
58 | "Select Files", options=repo.get_files_options(), default="README.md")
59 | selected_languages = st.multiselect(
60 | "Filtered by Language", options=repo.get_languages_options())
61 | limit = st.number_input("Limit", value=100000, step=10000)
62 | col1, col2, col3 = st.columns(3)
63 | with col1:
64 | if st.button("Count Tokens"):
65 | file_string = repo.get_filtered_files(
66 | selected_folders=selected_folder,
67 | selected_files=selected_files,
68 | selected_languages=selected_languages,
69 | limit=limit,
70 | )
71 | st.write(
72 | f"Total Tokens: {num_tokens_from_string(file_string)}")
73 | with col2:
74 | if st.button("Update Repo"):
75 | if repo.update_repo():
76 | st.success(f"Updated repository: {repo_url}")
77 | else:
78 | st.error(f"Repository update failed: {repo_url}")
79 | st.rerun()
80 | with col3:
81 | if st.button("Delete Repo"):
82 | if repo.delete_repo():
83 | st.success(f"Deleted repository: {repo_url}")
84 | else:
85 | st.error(f"Repository delete failed: {repo_url}")
86 | refresh_repos()
87 | st.rerun()
88 |
89 | st.title("Settings for LLM")
90 |
91 | selected_model = st.selectbox("Model", options=MODELS)
92 | temperature = st.slider(
93 | "Temperature", min_value=0.0, max_value=1.0, value=0.7, step=0.1
94 | )
95 | system_prompt = st.text_area(
96 | "System Prompt",
97 | value="You are a helpful assistant. You are provided with a repo information and files from the repo. Answer the user's questions based on the information and files provided.",
98 | )
99 |
100 | if st.button("Clear Chat"):
101 | st.session_state["messages"] = []
102 |
103 | if "client" not in st.session_state:
104 | st.session_state["client"] = create_client_for_model(selected_model)
105 |
106 | if repoManager.isEmpty():
107 | st.info("Copy the repository URL and click the download button.")
108 | st.stop()
109 |
110 | if not repoManager.check_if_repo_exists(repo_url):
111 | st.info(f"{repo_url} does not exist. Please add the repository first.")
112 | st.stop()
113 |
114 | repo = repoManager.get_repo_service(repo_url)
115 | st.title(f"Repo: {repo.repo_name}")
116 | st.write(
117 | "Chat with LLM using the repository information and files. You can change model settings anytime during the chat."
118 | )
119 | st.info(
120 | f"""
121 | Files : {selected_files}
122 | Folder: {selected_folder}
123 | Languages: {selected_languages}
124 | Limit: {limit}
125 | """
126 | )
127 | for msg in st.session_state.messages:
128 | st.chat_message(msg["role"]).write(msg["content"])
129 |
130 | if prompt := st.chat_input():
131 | st.session_state.messages.append({"role": "user", "content": prompt})
132 | st.chat_message("user").write(prompt)
133 | logger.info(f"User: {prompt}, received at {pd.Timestamp.now()}")
134 |
135 | start_time = pd.Timestamp.now()
136 | # Check if the selected model has changed
137 | if "selected_model" not in st.session_state:
138 | st.session_state.selected_model = None
139 |
140 | if st.session_state.selected_model != selected_model:
141 | st.session_state.client = create_client_for_model(selected_model)
142 | st.session_state.selected_model = selected_model
143 |
144 | file_string = repo.get_filtered_files(
145 | selected_folders=selected_folder,
146 | selected_files=selected_files,
147 | selected_languages=selected_languages,
148 | limit=limit,
149 | )
150 | end_time = pd.Timestamp.now()
151 | logger.info(
152 | f"Time taken to get filtered files: {end_time - start_time}")
153 |
154 | with st.chat_message("assistant"):
155 | stream_handler = StreamHandler(st.empty())
156 | # only add file content to the system prompt
157 | messages = (
158 | [{"role": "system", "content": system_prompt}]
159 | + [{"role": "user", "content": file_string}]
160 | + st.session_state.messages
161 | )
162 | client = st.session_state["client"]
163 |
164 | # log the information
165 | total_tokens = num_messages(messages)
166 | logger.info(
167 | f"Information: {selected_files}, {selected_folder}, {selected_languages}")
168 | logger.info(f"Using settings: {selected_model}, {temperature}")
169 | logger.info(f"File token: {num_tokens_from_string(file_string)}")
170 | logger.info(f"Total Messages Token: {total_tokens}")
171 | st.sidebar.write(
172 | f"Sending file content: {selected_files} and filter folder: {selected_folder} to the assistant.")
173 | st.sidebar.write(f"total messages token: {total_tokens}")
174 |
175 | # send to llm
176 | completion = client.chat(
177 | messages, stream=True, temperature=temperature, model=selected_model
178 | )
179 |
180 | for chunk in completion:
181 | content = chunk.choices[0].delta.content
182 | stream_handler.process_token(content)
183 |
184 | st.session_state.messages.append(
185 | {"role": "assistant", "content": stream_handler.text}
186 | )
187 |
188 |
189 | if __name__ == "__main__":
190 | create_app()
191 |
--------------------------------------------------------------------------------
/repo_service.py:
--------------------------------------------------------------------------------
1 | import os
2 | import subprocess
3 | import zipfile
4 | import time
5 | import json
6 | import nbformat
7 | import requests
8 | from git import Repo, GitCommandError, NoSuchPathError, InvalidGitRepositoryError
9 | from loguru import logger
10 | from send2trash import send2trash
11 | import pandas as pd
12 | from pygments.lexers import guess_lexer_for_filename, TextLexer
13 | from functools import wraps
14 | from pygments.util import ClassNotFound
15 | from token_count import num_tokens_from_string
16 | from config import Config
17 |
18 |
19 | def convert_ipynb_to_text(ipynb_content):
20 | notebook = json.loads(ipynb_content)
21 | text = ""
22 | for cell in notebook['cells']:
23 | if cell['cell_type'] == 'markdown':
24 | text += ''.join(cell['source']) + '\n\n'
25 | elif cell['cell_type'] == 'code':
26 | text += '```python\n'
27 | text += ''.join(cell['source']) + '\n'
28 | text += '```\n\n'
29 | if len(cell['outputs']) > 0:
30 | text += '\n\n'
39 |
40 | return text.strip()
41 |
42 |
43 | def retry(max_retries=3, retry_delay=5):
44 | def decorator(func):
45 | @wraps(func)
46 | def wrapper(*args, **kwargs):
47 | last_exception = None # last exception that occurred
48 | retries = 0
49 | while retries < max_retries:
50 | try:
51 | return func(*args, **kwargs)
52 | except (subprocess.CalledProcessError, requests.exceptions.RequestException, zipfile.BadZipFile) as e:
53 | retries += 1
54 | last_exception = e # update last exception
55 | logger.error(f"Error in {func.__name__}. Retrying ({retries}/{max_retries})...")
56 | time.sleep(retry_delay)
57 | logger.error(f"Failed to execute {func.__name__} after {max_retries} retries.")
58 | if last_exception:
59 | raise last_exception # if an exception occurred, raise it
60 | else:
61 | # usually this should not happen
62 | raise Exception(f"Failed to execute {func.__name__} after {max_retries} retries without catching an exception.")
63 | return wrapper
64 | return decorator
65 |
66 |
67 |
class RepoService:
    """Manage a single local copy of a remote repository.

    Responsibilities: cloning (git CLI or HTTP zip download), updating,
    deleting, collecting per-file statistics into ``repo_stats.csv``, and
    exporting filtered, concatenated file contents for LLM prompts.
    """

    def __init__(self, repo_url, repo_name=None):
        """Bind local paths for the repo and set it up if not already present.

        Args:
            repo_url: remote URL of the repository.
            repo_name: optional local name; derived from the URL when omitted.
        """
        self.repo_url = repo_url
        self.repo_name = repo_name if repo_name else repo_url.split(
            "/")[-1].replace(".git", "")
        self.repo_path = os.path.join(Config["repos_dir"], self.repo_name)
        # Sources live in a "<name>-main" subfolder, matching the folder
        # layout of a GitHub default-branch zip download.
        self.clone_path = os.path.join(
            self.repo_path, self.repo_name + "-main")

        if self.check_if_exist():
            logger.info(
                f"Repository {self.repo_name} already exists at {self.repo_path}")
        else:
            self.set_up()

    def check_if_exist(self):
        """Return True when a complete, matching local copy already exists.

        A copy is complete when repo_info.json records the same URL, a
        non-empty repo_stats.csv is present, and the clone directory
        contains at least one entry.
        """
        repo_info_path = os.path.join(self.repo_path, "repo_info.json")
        csv_path = os.path.join(self.repo_path, "repo_stats.csv")

        if not os.path.exists(repo_info_path) or not os.path.exists(csv_path):
            return False
        if pd.read_csv(csv_path).empty:
            return False

        with open(repo_info_path, "r") as f:
            repo_info = json.load(f)
        if "repo_url" not in repo_info or repo_info["repo_url"] != self.repo_url:
            return False

        # Fix: os.listdir() raises FileNotFoundError when the clone directory
        # is missing; this used to crash instead of reporting "not present".
        if not os.path.isdir(self.clone_path) or not os.listdir(self.clone_path):
            return False
        return True

    def set_up(self):
        """Create the repo directory, record its URL, clone it and build stats."""
        os.makedirs(self.repo_path, exist_ok=True)
        # Fix: always (re)write repo_info.json. Previously it was only written
        # when the directory was freshly created, so a half-initialized
        # directory (present but missing repo_info.json) could never pass
        # check_if_exist().
        repo_info = {"repo_url": self.repo_url}
        with open(os.path.join(self.repo_path, "repo_info.json"), "w") as f:
            json.dump(repo_info, f)
        self.clone_repo()
        if not os.path.exists(os.path.join(self.repo_path, "repo_stats.csv")):
            self.get_repo_stats()
        logger.info(
            f"Repository {self.repo_name} set up successfully at {self.repo_path}")
        stats_path = os.path.join(self.repo_path, 'repo_stats.csv')
        # A failed clone may have trashed the whole directory; only report the
        # timestamp when the stats file actually exists.
        if os.path.exists(stats_path):
            logger.info(
                f"Last updated: {time.ctime(os.path.getmtime(stats_path))}")

    def clone_repo(self):
        """Fetch the repository using the method configured in ``download_method``.

        "git" and "http" force one method; "auto" tries git first and falls
        back to HTTP download.

        Returns:
            True on success, False on failure or bad configuration.
        """
        if os.path.exists(self.clone_path) and os.listdir(self.clone_path):
            logger.info(
                f"The repository {self.repo_name} already exists at {self.clone_path}.")
            return True

        os.makedirs(self.clone_path, exist_ok=True)
        download_method = Config.get("download_method", "auto").lower()

        if download_method == "git":
            return self.try_clone_using_git()
        elif download_method == "http":
            return self.try_clone_using_http()
        elif download_method == "auto":
            if self.try_clone_using_git():
                return True
            logger.info("Git clone failed. Trying HTTP download.")
            return self.try_clone_using_http()

        logger.error(
            f"Invalid download method specified in config: {download_method}")
        return False

    def try_clone_using_git(self):
        """Clone with the git CLI; on failure trash the repo dir and return False."""
        try:
            # Probe for a usable git binary before attempting the clone.
            subprocess.run(["git", "--version"], check=True,
                           stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            self._clone_using_git()
            return True
        except (subprocess.CalledProcessError, subprocess.TimeoutExpired,
                FileNotFoundError) as e:
            # Fix: _clone_using_git() runs with timeout=60; TimeoutExpired
            # previously escaped this handler and crashed the caller.
            logger.error(
                f"Failed to clone repository {self.repo_name} using Git. {e}")
            self.delete_repo()
            return False

    def try_clone_using_http(self):
        """Download a zip snapshot over HTTP; on failure trash the repo dir."""
        try:
            self._clone_using_download()
            return True
        except (requests.exceptions.RequestException, zipfile.BadZipFile) as e:
            logger.error(
                f"Failed to clone repository {self.repo_name} using HTTP download method. {e}")
            self.delete_repo()
            return False

    @retry(max_retries=1, retry_delay=5)
    def _clone_using_git(self):
        """Run ``git clone`` into the clone path (retried once on failure)."""
        logger.info(f"Cloning repository {self.repo_name} using Git...")
        subprocess.run(["git", "clone", self.repo_url,
                        self.clone_path], check=True, timeout=60)

    @retry(max_retries=1, retry_delay=5)
    def _clone_using_download(self):
        """Download the repository as a zip archive and unpack it in place.

        Raises:
            requests.exceptions.RequestException: on a non-200 response.
            zipfile.BadZipFile: when the downloaded payload is not a zip.
        """
        logger.info(f"Cloning repository {self.repo_name} using download...")
        # Fix: a failed git attempt in "auto" mode trashes repo_path via
        # delete_repo(); recreate it so the zip download has somewhere to land.
        os.makedirs(self.repo_path, exist_ok=True)
        response = requests.get(self.repo_url, timeout=60)
        if response.status_code == 200:
            zip_path = os.path.join(self.repo_path, "repo.zip")
            with open(zip_path, "wb") as f:
                f.write(response.content)
            with zipfile.ZipFile(zip_path, "r") as zip_ref:
                zip_ref.extractall(self.repo_path)
            os.remove(zip_path)
        else:
            raise requests.exceptions.RequestException(
                f"Failed to download repository {self.repo_name}")

    def update_repo(self):
        """Fetch and pull the latest changes; refresh stats when they land.

        Returns:
            True when up-to-date or updated successfully, False on git errors
            (including repos obtained via HTTP download, which have no .git).
        """
        try:
            logger.info(f"Updating repository {self.repo_name}...")
            repo = Repo(self.clone_path)
            origin = repo.remotes.origin
            origin.fetch()  # fetch remote state without merging

            current_commit = repo.head.commit  # local HEAD
            # tip of the tracked remote branch
            remote_commit = origin.refs[repo.active_branch.name].commit

            if current_commit.hexsha == remote_commit.hexsha:
                logger.info(
                    f"Repository {self.repo_name} is already up-to-date.")
                return True  # nothing to pull

            origin.pull()
            logger.info(f"Repository {self.repo_name} updated successfully.")

            # stats are stale after a pull; rebuild them
            self.get_repo_stats()
            return True
        except (GitCommandError, NoSuchPathError, InvalidGitRepositoryError) as e:
            logger.error(f"Failed to update repository {self.repo_name}: {e}")
            return False

    def delete_repo(self):
        """Move the whole repo directory to the trash. Returns True if removed."""
        if os.path.exists(self.repo_path):
            send2trash(self.repo_path)
            logger.info(
                f"Deleted repository {self.repo_name} at {self.repo_path}")
            return True
        else:
            logger.info(
                f"Repository {self.repo_name} does not exist at {self.repo_path}")
            return False

    def get_repo_stats(self):
        """Walk the clone, record per-file stats and save repo_stats.csv.

        ``.git`` directories are skipped. Notebooks are normalized through
        nbformat; other files are read as UTF-8 text (errors ignored) and
        their language is guessed from the filename/content.

        Returns:
            The DataFrame that was written to disk.
        """
        data = []
        for root, dirs, files in os.walk(self.clone_path):
            if '.git' in dirs:
                dirs.remove('.git')  # don't visit .git directories

            for file in files:
                file_path = os.path.join(root, file)
                rel_path = os.path.relpath(file_path, self.clone_path)
                content = ''
                language = None
                if file.endswith('.ipynb'):
                    try:
                        with open(file_path, 'r', encoding='utf-8') as f:
                            notebook = nbformat.read(f, as_version=4)
                        content = nbformat.writes(notebook)
                        language = 'Jupyter Notebook'
                    except Exception as e:
                        # Fix: a malformed notebook used to abort the whole
                        # walk; fall back to treating it as plain text.
                        logger.warning(
                            f"Failed to parse notebook {file_path}: {e}")
                        with open(file_path, 'r', encoding='utf-8',
                                  errors='ignore') as f:
                            content = f.read()
                else:
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                        content = f.read()

                if language is None:
                    try:
                        lexer = guess_lexer_for_filename(file_path, content)
                        language = lexer.name
                        # Pygments falls back to TextLexer; treat that as
                        # "language unknown" rather than a real detection.
                        if isinstance(lexer, TextLexer):
                            language = None
                    except ClassNotFound:
                        language = None

                data.append({
                    'file_content': content,
                    'language': language,
                    'line_count': len(content.split('\n')),
                    'file_size': os.path.getsize(file_path),
                    'file_name': file,
                    'file_path': rel_path,
                    'token_count': num_tokens_from_string(content),
                    'description': None,
                    'graph': None
                })

        df = pd.DataFrame(data)
        csv_path = os.path.join(self.repo_path, "repo_stats.csv")
        df.to_csv(csv_path, index=False, escapechar='\\')
        logger.info(f"Saved repo stats to {csv_path}")
        return df

    def filter_files(self, selected_files=None, selected_folders=None, selected_languages=None):
        """Return the stats rows matching the given files/folders/languages.

        Paths are compared case-insensitively with '/' separators. With no
        file or folder selection the result is empty (nothing is selected
        by default); the language filter is applied afterwards.
        """
        csv_path = os.path.join(self.repo_path, "repo_stats.csv")
        df = pd.read_csv(csv_path)
        # normalize stored paths so selections match regardless of OS
        df['file_path'] = df['file_path'].apply(
            lambda x: x.replace(os.sep, '/').replace('\\', '/').lower())

        final_condition = pd.Series([False] * len(df))

        if selected_files:
            selected_files = [path.replace(
                os.sep, '/').replace('\\', '/').lower() for path in selected_files]
            final_condition |= df['file_path'].isin(selected_files)

        if selected_folders:
            selected_folders = [folder.replace(
                os.sep, '/').replace('\\', '/').lower() for folder in selected_folders]
            # a file matches when its path starts with any selected folder
            folder_condition = pd.Series([any(df['file_path'].iloc[i].startswith(
                folder) for folder in selected_folders) for i in range(len(df))])
            final_condition |= folder_condition

        df = df[final_condition]

        if selected_languages:
            df = df[df['language'].isin(selected_languages)]

        return df

    def get_language_percentage(self):
        """Return per-language percentage of total lines, or None if unknown."""
        csv_path = os.path.join(self.repo_path, "repo_stats.csv")
        df = pd.read_csv(csv_path)

        if df['language'].isna().all():
            logger.warning(
                "Warning: 'language' column is empty. Please make sure the 'language' column is populated.")
            return None

        language_counts = df.groupby('language')['line_count'].sum()
        total_lines = language_counts.sum()

        if total_lines == 0:
            logger.warning(
                "Warning: Total line count is zero. Cannot calculate language percentage.")
            return None

        language_percentage = language_counts / total_lines * 100
        return language_percentage

    def print_directory_structure(self):
        """Log the repo's directory tree, one indented entry per line."""
        directory_structure = {}

        for root, dirs, files in os.walk(self.repo_path):
            for file in files:
                file_path = os.path.relpath(
                    os.path.join(root, file), self.repo_path)
                parts = file_path.split(os.sep)
                current_level = directory_structure
                # build a nested dict keyed by path components
                for part in parts:
                    if part not in current_level:
                        current_level[part] = {}
                    current_level = current_level[part]

        def print_structure(structure, level=0):
            # depth-first walk of the nested dict, indenting by depth
            for key, value in structure.items():
                logger.info(' ' * level + '- ' + key)
                print_structure(value, level + 1)

        print_structure(directory_structure)

    def preprocess_dataframe(self, df, limit=None, concat_method='xml', include_directory=True, metadata_list=None):
        """Concatenate the files in ``df`` into one prompt-ready string.

        Args:
            df: DataFrame with file_path / file_content / language columns.
            limit: optional token budget; once a file would push the running
                total past it, that file is rolled back and iteration stops.
            concat_method: 'xml' wraps each file in tags; anything else uses
                a plain "File:/Metadata:/Content:" layout.
            include_directory: prepend an indented directory listing.
            metadata_list: optional column names to emit for each file.

        Returns:
            The concatenated string, stripped of surrounding whitespace.
        """
        result = ''

        if include_directory:
            directory_structure = {}
            for _, row in df.iterrows():
                file_path = row['file_path']
                parts = file_path.split('/')
                current_level = directory_structure
                for part in parts:
                    if part not in current_level:
                        current_level[part] = {}
                    current_level = current_level[part]

            def flatten_directory(structure, prefix=''):
                # render the nested dict as indented lines
                flattened = []
                for key, value in structure.items():
                    flattened.append(prefix + key)
                    flattened.extend(flatten_directory(value, prefix + ' '))
                return flattened

            directory_lines = flatten_directory(directory_structure)
            result += 'Directory Structure:\n' + \
                '\n'.join(directory_lines) + '\n\n'

        for _, row in df.iterrows():
            # snapshot so we can roll back if this file busts the token limit
            checkpoint = result
            result += '\n\n' + '=' * 10 + '\n\n'
            content = row['file_content']
            if row['language'] == 'Jupyter Notebook':
                content = convert_ipynb_to_text(content)

            if metadata_list:
                metadata = [str(row[col]) for col in metadata_list]
            else:
                metadata = ""

            if concat_method == 'xml':
                # Fix: the original emitted f-strings whose XML tags had been
                # stripped (e.g. "result += f'\n'" and a no-op "result += ''"),
                # so the 'xml' mode produced no markup; restore per-file tags.
                result += f'<file path="{row["file_path"]}">\n'
                if metadata:
                    result += f'<metadata>{", ".join(metadata)}</metadata>\n'
                result += f'<content>\n{content}\n</content>\n'
                result += '</file>'
            else:
                result += f'File: {row["file_path"]}\n'
                if metadata:
                    result += f'Metadata: {", ".join(metadata)}\n'
                result += f'Content:\n{content}'
            result += '\n\n' + '=' * 10 + '\n\n'
            if limit and num_tokens_from_string(result) > limit:
                result = checkpoint
                break

        return result.strip()

    def get_filtered_files(self, selected_folders=None, selected_files=None, selected_languages=None, limit=None, concat_method='xml', include_directory=True, metadata_list=None):
        """Filter the repo's files and return them as one concatenated string."""
        filtered_files = self.filter_files(
            selected_folders=selected_folders, selected_files=selected_files, selected_languages=selected_languages)
        file_string = self.preprocess_dataframe(filtered_files, limit=limit, concat_method=concat_method,
                                                include_directory=include_directory, metadata_list=metadata_list)
        return file_string

    def get_content_from_file_name(self, file_name):
        """Return the stored content of the first file matching ``file_name``.

        Raises:
            IndexError: when no row in repo_stats.csv matches the name.
        """
        csv_path = os.path.join(self.repo_path, "repo_stats.csv")
        df = pd.read_csv(csv_path)
        df = df[df["file_name"] == file_name]
        row = df.iloc[0]
        return row["file_content"]

    def get_folders_options(self):
        """Return the sorted list of folders containing tracked files."""
        csv_path = os.path.join(self.repo_path, "repo_stats.csv")
        df = pd.read_csv(csv_path)
        file_paths = df['file_path'].dropna().unique()
        # filter out files start with .git
        file_paths = [
            file for file in file_paths if not file.startswith('.git')]
        folders = list(set([os.path.dirname(file) for file in file_paths]))
        return sorted(folders)

    def get_files_options(self):
        """Return the sorted list of tracked file paths."""
        csv_path = os.path.join(self.repo_path, "repo_stats.csv")
        df = pd.read_csv(csv_path)
        # filter out files start with .git
        files = df['file_path'].dropna().unique()
        files = [file for file in files if not file.startswith('.git')]
        return sorted(files)

    def get_languages_options(self):
        """Return the sorted list of detected languages."""
        csv_path = os.path.join(self.repo_path, "repo_stats.csv")
        df = pd.read_csv(csv_path)
        languages = df['language'].dropna().unique()
        return sorted(languages)
427 |
428 |
def singleton(cls):
    """Class decorator: every call yields the same lazily created instance.

    Constructor arguments are honored only on the first call; subsequent
    calls return the cached instance regardless of arguments.
    """
    _cache = {}

    def get_instance(*args, **kwargs):
        try:
            return _cache[cls]
        except KeyError:
            _cache[cls] = cls(*args, **kwargs)
            return _cache[cls]
    return get_instance
437 |
438 |
@singleton
class RepoManager:
    """Singleton registry mapping repository URLs to RepoService instances."""

    def __init__(self):
        """Ensure the repos directory exists and load every repo found in it."""
        logger.info("Initializing RepoManager...")
        self.repos = {}
        # if no repo dir
        if not os.path.exists(Config["repos_dir"]):
            os.makedirs(Config["repos_dir"], exist_ok=True)
        self.load_repos()
        logger.info(f"Loaded {len(self.repos)} repositories.")

    def _find_repos(self):
        """Scan the repos directory and return metadata for valid repos.

        A subdirectory counts as a repo when it contains repo_stats.csv and
        a resolvable URL — either from repo_info.json (rewritten to strip
        stray quotes) or from a legacy repo_url.txt, which is migrated to
        repo_info.json and then removed.
        """
        repos = []
        top_level = Config["repos_dir"]
        for repo_dir in os.listdir(top_level):
            repo_path = os.path.join(top_level, repo_dir)
            if not os.path.isdir(repo_path):
                continue
            if 'repo_stats.csv' not in os.listdir(repo_path):
                continue

            root = repo_path
            repo_info_path = os.path.join(root, "repo_info.json")
            repo_url_txt_path = os.path.join(root, "repo_url.txt")
            # Fix: reset repo_url for every directory. It used to be unbound
            # on the first iteration (NameError) or leak from the previous
            # directory when repo_info.json failed to parse.
            repo_url = ""

            if os.path.exists(repo_info_path):
                try:
                    with open(repo_info_path, "r") as f:
                        repo_info = json.load(f)
                except json.JSONDecodeError as e:
                    logger.error(
                        f"Error decoding JSON from {repo_info_path}: {e}")
                else:
                    # fix repo_url if it has extra quotes
                    repo_url = repo_info.get("repo_url", "").strip('"')
                    repo_info['repo_url'] = repo_url
                    # Rewrite after the read handle is closed (the original
                    # opened the same file for writing while still reading it).
                    with open(repo_info_path, "w") as f_update:
                        json.dump(repo_info, f_update)
            elif os.path.exists(repo_url_txt_path):
                with open(repo_url_txt_path, "r") as f:
                    repo_url = f.read().strip().strip('"')  # legacy support
                repo_info = {"repo_url": repo_url}
                with open(repo_info_path, "w") as f:
                    json.dump(repo_info, f)
                os.remove(repo_url_txt_path)  # delete legacy file

            if repo_url:
                repos.append({
                    "repo_name": os.path.basename(root),
                    "repo_url": repo_url,
                    "last_updated": time.ctime(os.path.getmtime(os.path.join(root, "repo_stats.csv")))
                })

        return repos

    def load_repos(self):
        """Instantiate a RepoService for every repo found on disk."""
        repo_details = self._find_repos()
        for repo in repo_details:
            repo_url = repo["repo_url"]
            repo_name = repo["repo_name"]
            self.repos[repo_url] = RepoService(
                repo_url=repo_url, repo_name=repo_name)

    def add_repo(self, repo_url):
        """Clone and register a repo by URL. Returns True unless setup failed."""
        if repo_url not in self.repos:
            repo_service = RepoService(repo_url=repo_url)
            if repo_service.check_if_exist():
                self.repos[repo_url] = repo_service
                logger.info(f"Added repository: {repo_url}")
            else:
                logger.error(f"Failed to add repository: {repo_url}")
                return False
        else:
            logger.warning(f"Repository already exists: {repo_url}")
        return True

    def delete_repo(self, repo_url):
        """Remove a registered repo from disk and from the registry."""
        if repo_url in self.repos:
            self.repos[repo_url].delete_repo()
            del self.repos[repo_url]
            logger.info(f"Deleted repository: {repo_url}")
        else:
            logger.warning(f"Repository does not exist: {repo_url}")

    def update_all_repos(self):
        """Pull the latest changes for every registered repo."""
        for repo_service in self.repos.values():
            repo_service.update_repo()

    def get_repo_service(self, repo_url) -> RepoService:
        """Return the RepoService for ``repo_url``, or None if unknown."""
        return self.repos.get(repo_url)

    def get_repo_urls(self):
        """Return the list of registered repository URLs."""
        return list(self.repos.keys())

    def check_if_repo_exists(self, repo_url):
        """Return True when ``repo_url`` is registered."""
        return repo_url in self.repos

    def isEmpty(self):
        """Return True when no repositories are registered."""
        return len(self.repos) == 0
538 |
539 |
if __name__ == "__main__":
    # Manual smoke test: instantiating the singleton manager loads every
    # repository already present on disk (url -> RepoService).
    manager = RepoManager()
547 |
--------------------------------------------------------------------------------