├── .env-template
├── .gitattributes
├── examples
├── count.png
├── logo.png
├── repos.png
├── example.gif
├── shadcn.png
├── langgraph.png
├── llamaindex.png
└── selection.png
├── requirements.txt
├── config.py
├── config.yaml
├── token_count.py
├── LICENSE
├── useful_tool
└── metadata_extract.py
├── llm_service.py
├── .gitignore
├── README.md
├── app.py
└── repo_service.py
/.env-template:
--------------------------------------------------------------------------------
1 | OPENROUTER_API_KEY=insert_api_key_here
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
--------------------------------------------------------------------------------
/examples/count.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jw782cn/RepoChat-200k/HEAD/examples/count.png
--------------------------------------------------------------------------------
/examples/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jw782cn/RepoChat-200k/HEAD/examples/logo.png
--------------------------------------------------------------------------------
/examples/repos.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jw782cn/RepoChat-200k/HEAD/examples/repos.png
--------------------------------------------------------------------------------
/examples/example.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jw782cn/RepoChat-200k/HEAD/examples/example.gif
--------------------------------------------------------------------------------
/examples/shadcn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jw782cn/RepoChat-200k/HEAD/examples/shadcn.png
--------------------------------------------------------------------------------
/examples/langgraph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jw782cn/RepoChat-200k/HEAD/examples/langgraph.png
--------------------------------------------------------------------------------
/examples/llamaindex.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jw782cn/RepoChat-200k/HEAD/examples/llamaindex.png
--------------------------------------------------------------------------------
/examples/selection.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jw782cn/RepoChat-200k/HEAD/examples/selection.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | openai~=1.14.2
2 | pandas~=2.2.1
3 | Pygments~=2.17.2
4 | streamlit~=1.32.2
5 | tiktoken~=0.6.0
6 | nbformat~=5.10.3
7 | PyYAML~=6.0.1
8 | GitPython~=3.1.42
9 | python-dotenv~=1.0.1
10 | loguru~=0.7.2
11 | send2trash
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
1 | # config.py
2 | import yaml
3 | from dotenv import load_dotenv
4 |
5 | def load_config(config_file="config.yaml"):
6 | with open(config_file, "r") as f:
7 | config = yaml.safe_load(f)
8 | return config
9 |
10 | Config = load_config()
11 |
12 | load_dotenv()
--------------------------------------------------------------------------------
/config.yaml:
--------------------------------------------------------------------------------
1 | # config.yaml
2 | repos_dir: "./repos" # Directory to store the downloaded repositories
3 |
4 | # logging
5 | log_level: "INFO" # Log level
6 | log_file: "repo_stats.log" # Log file
7 |
8 | # download method
9 | download_method: "auto" # Download method: auto (try git, then http) / git / http
10 |
--------------------------------------------------------------------------------
/token_count.py:
--------------------------------------------------------------------------------
1 | import tiktoken
2 |
3 | def num_tokens_from_string(string: str, model="gpt-3.5-turbo-0613") -> int:
4 | """Returns the number of tokens in a text string based on the specified model's encoding."""
5 | try:
6 | encoding = tiktoken.encoding_for_model(model) # Attempt to get encoding for the specified model
7 | except KeyError:
8 | print("Warning: model not found. Using cl100k_base encoding.")
9 | encoding = tiktoken.get_encoding("cl100k_base") # Fallback encoding if model's encoding not found
10 |
11 | num_tokens = len(encoding.encode(string, disallowed_special=())) # Calculate number of tokens based on encoding
12 | return num_tokens
13 |
14 | def num_messages(messages: dict, model="gpt-3.5-turbo-0613") -> int:
15 | """Returns the number of tokens in a chat message based on the specified model's encoding."""
16 | num_tokens = 0
17 | for msg in messages:
18 | num_tokens += num_tokens_from_string(msg["content"], model=model)
19 | return num_tokens
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 chty627
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/useful_tool/metadata_extract.py:
--------------------------------------------------------------------------------
1 | """
2 | Extract metadata from a file and summarize it into xml format.
3 | NOTE: This file is not used in the project. You can use this as a reference to create your own meta extraction.
4 | """
5 |
6 | # from langchain_core.output_parsers import StrOutputParser, XMLOutputParser, JsonOutputParser
7 | from langchain_core.prompts import ChatPromptTemplate
8 | from typing import Optional
9 | from langchain_openai import ChatOpenAI
10 | from langchain_core.messages import AIMessage
11 | import re
12 |
13 | class ChatOpenRouter(ChatOpenAI):
14 | openai_api_base: str
15 | openai_api_key: str
16 | model_name: str
17 |
18 | def __init__(self,
19 | model_name: str,
20 | openai_api_key: Optional[str] = None,
21 | openai_api_base: str = "https://openrouter.ai/api/v1",
22 | **kwargs):
23 | openai_api_key = openai_api_key or os.getenv('OPENROUTER_API_KEY')
24 | super().__init__(openai_api_base=openai_api_base,
25 | openai_api_key=openai_api_key,
26 | model_name=model_name, **kwargs)
27 |
28 |
29 | template = """\
30 | ===
31 | Langgraph README.md:
32 | {readme_content}
33 | ===
34 | File Content:
35 | {file_content}
36 | ===
37 | Summarize the above file "{file_name}" into xml with attributes of description and graph abstract
38 |
39 | description: describe the file in a few words, concisely;
40 | graph: use langgraph syntax to describe the high level structure of the graph mentioned in the file, only output python code of the graph.
41 |
42 | Please enclose the output in xml format. Only output xml. Do not output any prefix or suffix.
43 |
44 | Output Format:
45 | insert_description_here
46 | insert_graph_abstract_here\
47 | """
48 | prompt = ChatPromptTemplate.from_template(template)
49 | print(prompt)
50 |
51 |
52 | def custom_parse(ai_message: AIMessage) -> str:
53 | """Parse the AI message."""
54 | content = ai_message.content
55 |
56 | # create a dictionary to store the parsed data
57 | data = {}
58 |
59 | # use regular expressions to extract elements and their content
60 | pattern = r"<(\w+)>(.*?)</\1>"
61 | matches = re.findall(pattern, content, re.DOTALL)
62 |
63 | for match in matches:
64 | tag, text = match
65 | data[tag] = text.strip()
66 |
67 | return data
68 |
69 | llm = ChatOpenRouter(
70 | model_name="anthropic/claude-3-haiku",
71 | temperature=0.7,
72 | )
73 |
74 | llm_chain = (
75 | prompt
76 | | llm
77 | | custom_parse
78 | )
--------------------------------------------------------------------------------
/llm_service.py:
--------------------------------------------------------------------------------
1 | import os
2 | from enum import Enum
3 | from loguru import logger
4 | from openai import OpenAI
5 |
6 | class ProviderType(str, Enum):
7 | OPENAI = "OPENAI"
8 | OPENROUTER = "OPENROUTER"
9 |
10 |
11 | BASE_URL_MAP = {
12 | ProviderType.OPENAI: "https://api.openai.com/v1",
13 | ProviderType.OPENROUTER: "https://openrouter.ai/api/v1",
14 | }
15 | MODEL_MAP = {
16 | ProviderType.OPENAI: [
17 | "gpt-4-1106-preview",
18 | "gpt-3.5-turbo-16k",
19 | ],
20 | ProviderType.OPENROUTER: [
21 | "anthropic/claude-3-haiku",
22 | "anthropic/claude-3-haiku:beta",
23 | "anthropic/claude-3-opus",
24 | "anthropic/claude-3-sonnet",
25 | "anthropic/claude-3-sonnet:beta",
26 | "anthropic/claude-3-opus:beta",
27 | ],
28 | }
29 | MODELS = [*MODEL_MAP[ProviderType.OPENROUTER], *MODEL_MAP[ProviderType.OPENAI]]
30 |
31 |
32 | def get_base_url(selected_model: str) -> str:
33 | """Get the base url for the selected model.
34 | Args:
35 | selected_model(str): selected model
36 |
37 | Returns:
38 | str: base url for the selected model
39 | """
40 | if selected_model in MODEL_MAP[ProviderType.OPENAI]:
41 | return BASE_URL_MAP[ProviderType.OPENAI]
42 | elif selected_model in MODEL_MAP[ProviderType.OPENROUTER]:
43 | return BASE_URL_MAP[ProviderType.OPENROUTER]
44 | else:
45 | raise ValueError(f"Model {selected_model} not found.")
46 |
47 |
48 | def get_api_key(selected_model: str) -> str:
49 | """Get the api key for the selected model.
50 | Args:
51 | selected_model(str): selected model
52 |
53 | Returns:
54 | str: api key for the selected model
55 | """
56 | if selected_model in MODEL_MAP[ProviderType.OPENAI]:
57 | return os.getenv("OPENAI_API_KEY")
58 | elif selected_model in MODEL_MAP[ProviderType.OPENROUTER]:
59 | return os.getenv("OPENROUTER_API_KEY")
60 | else:
61 | raise ValueError(f"Model {selected_model} not found.")
62 |
63 |
64 | class ChatClient:
65 | def __init__(self, base_url: str, api_key: str):
66 | logger.info(
67 | f"Initializing ChatClient, base_url: {base_url} and api_key: {api_key[:5]}..."
68 | )
69 | self.client = OpenAI(base_url=base_url, api_key=api_key)
70 |
71 | def chat(
72 | self, messages, model="anthropic/claude-3-opus", temperature=0.7, stream=True
73 | ):
74 | return self.client.chat.completions.create(
75 | model=model, messages=messages, stream=stream, temperature=temperature
76 | )
77 |
78 |
79 | def create_client_for_model(selected_model: str):
80 | """Create a client for the selected model.
81 | Args:
82 | selected_model(str): selected model
83 |
84 | Returns:
85 | ChatClient: ChatClient for the selected model
86 | """
87 | base_url = get_base_url(selected_model)
88 | api_key = get_api_key(selected_model)
89 |
90 | if api_key is None:
91 | raise ValueError(f"API Key not found for model: {selected_model}")
92 |
93 | return ChatClient(base_url, api_key)
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | repos/
2 | test.ipynb
3 |
4 | # Byte-compiled / optimized / DLL files
5 | __pycache__/
6 | *.py[cod]
7 | *$py.class
8 |
9 | # C extensions
10 | *.so
11 |
12 | # Distribution / packaging
13 | .Python
14 | build/
15 | develop-eggs/
16 | dist/
17 | downloads/
18 | eggs/
19 | .eggs/
20 | lib/
21 | lib64/
22 | parts/
23 | sdist/
24 | var/
25 | wheels/
26 | share/python-wheels/
27 | *.egg-info/
28 | .installed.cfg
29 | *.egg
30 | MANIFEST
31 |
32 | # PyInstaller
33 | # Usually these files are written by a python script from a template
34 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
35 | *.manifest
36 | *.spec
37 |
38 | # Installer logs
39 | pip-log.txt
40 | pip-delete-this-directory.txt
41 |
42 | # Unit test / coverage reports
43 | htmlcov/
44 | .tox/
45 | .nox/
46 | .coverage
47 | .coverage.*
48 | .cache
49 | nosetests.xml
50 | coverage.xml
51 | *.cover
52 | *.py,cover
53 | .hypothesis/
54 | .pytest_cache/
55 | cover/
56 |
57 | # Translations
58 | *.mo
59 | *.pot
60 |
61 | # Django stuff:
62 | *.log
63 | local_settings.py
64 | db.sqlite3
65 | db.sqlite3-journal
66 |
67 | # Flask stuff:
68 | instance/
69 | .webassets-cache
70 |
71 | # Scrapy stuff:
72 | .scrapy
73 |
74 | # Sphinx documentation
75 | docs/_build/
76 |
77 | # PyBuilder
78 | .pybuilder/
79 | target/
80 |
81 | # Jupyter Notebook
82 | .ipynb_checkpoints
83 |
84 | # IPython
85 | profile_default/
86 | ipython_config.py
87 |
88 | # pyenv
89 | # For a library or package, you might want to ignore these files since the code is
90 | # intended to run in multiple environments; otherwise, check them in:
91 | # .python-version
92 |
93 | # pipenv
94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
97 | # install all needed dependencies.
98 | #Pipfile.lock
99 |
100 | # poetry
101 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
102 | # This is especially recommended for binary packages to ensure reproducibility, and is more
103 | # commonly ignored for libraries.
104 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
105 | #poetry.lock
106 |
107 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
108 | __pypackages__/
109 |
110 | # Celery stuff
111 | celerybeat-schedule
112 | celerybeat.pid
113 |
114 | # SageMath parsed files
115 | *.sage.py
116 |
117 | # Environments
118 | .env
119 | .venv
120 | env/
121 | venv/
122 | ENV/
123 | env.bak/
124 | venv.bak/
125 |
126 | # Spyder project settings
127 | .spyderproject
128 | .spyproject
129 |
130 | # Rope project settings
131 | .ropeproject
132 |
133 | # mkdocs documentation
134 | /site
135 |
136 | # mypy
137 | .mypy_cache/
138 | .dmypy.json
139 | dmypy.json
140 |
141 | # Pyre type checker
142 | .pyre/
143 |
144 | # pytype static type analyzer
145 | .pytype/
146 |
147 | # Cython debug symbols
148 | cython_debug/
149 |
150 | # PyCharm
151 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
152 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
153 | # and can be added to the global gitignore or merged into this file. For a more nuclear
154 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
155 | .idea/
156 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # RepoChat in 200k - No RAG!
2 |
3 | ⚡Chat with GitHub Repo Using 200k context window of Claude instead of RAG!⚡
4 |
5 |
6 |
7 |
8 | ## Why RepoChat - No RAG?
9 |
10 | Take advantage of Claude's 200k context window! Put all the examples and code into the context!
11 |
12 | **We need copilot rather than agent sometimes!**
13 |
14 | ### 1. Not Just Chat — Write Code with the Latest Documents
15 |
16 | Having trouble memorizing all the APIs in llama-index or langchain?
17 |
18 | No worries, just include the components examples and the documents from the repo and let Claude Opus - the strongest model and long context window of 200k to write your agent for you!
19 |
20 | ### 2. No copy paste, Just Select
21 |
22 |
23 |
24 | Download/Clone your Repo from Github then just select the files you'd like, I got you covered on constructing the prompt.
25 |
26 | ### 3. No RAG, You decide
27 |
28 | I've seen many Chat with Repo projects, and they all share the same pain point:
29 |
30 | `Which files do this query need?`
31 |
32 | They use **embedding search** over a code database, but most of the time I already know which documents I'm referring to... So make your own choice each time you are coding.
33 |
34 | - Coding Frontend? Just select components and examples.
35 |
36 | - Coding Agents? Just select Jupyter Notebook of langgraph.
37 |
38 | - Coding RAG? Just select Jupyter Notebook of llamaindex.
39 |
40 |
41 |
42 |
43 | ### 4. More Use Cases!
44 |
45 | #### Use llamaindex doc
46 |
47 | select llamaindex example of pipeline to write RAG graph.
48 |
49 |
50 |
51 | #### Use Shadcn doc to write frontend
52 |
53 | select examples and components definition.
54 |
55 |
56 |
57 |
58 | ## Suggestions!
59 |
60 | 1. You can use Haiku for most of the case.
61 | 2. Change models based on tasks.
62 | 3. Change files based on tasks.
63 | 4. Clone Repos You like!
64 | 5. Usually I will include the repo's README to help Claude understand the project better
65 | 6. USE `COUNT TOKENS` on the sidebar to see how many tokens you will send!!!
66 |
67 |
68 |
69 | ## Features
70 |
71 | 1. **Repository Download**: Users can provide a GitHub repository URL, and the application will automatically download and analyze the repository.
72 | 2. **File and Folder Selection**: Users can select specific files or folders from the repository to include in the LLM's input.
73 | 3. **Language Filtering**: Users can filter the files by programming language to focus the LLM's understanding on specific parts of the codebase.
74 | 4. **Token Limit**: Users can set a token limit to control the amount of information sent to the LLM, which can be useful for performance or cost considerations.
75 | 5. **Chat Interface**: Users can interact with the LLM through a chat-style interface, allowing them to ask questions or request code generation based on the repository contents.
76 | 6. **Streaming Output**: The LLM's responses are displayed in a streaming fashion, providing a more engaging and real-time user experience.
77 |
78 | Currently only OpenRouter is fully supported. Planning to add more providers and refactor someday.
79 |
80 | ## Get Started
81 |
82 | 1. **Environment Settings**: Run `pip install -r requirements.txt` to set up environment.
83 |
84 | 2. **Create a .env file**: Create a `.env` file in the root directory of the project and add your OpenRouter API key (Recommended):
85 | ```bash
86 | OPENROUTER_API_KEY=your_openrouter_api_key_here
87 | ```
88 | I recommend [OpenRouter](https://openrouter.ai/) because it has all models!
89 |
90 | If you want to use OpenAI GPT models, add your `openai api key` as well.
91 |
92 | ```bash
93 | OPENAI_API_KEY=your_openai_api_key_here
94 | ```
95 |
96 | 3. **Run the application**: Run the `app.py` script using Streamlit:
97 | ```bash
98 | streamlit run app.py
99 | ```
100 | 4. **Use the application**: Follow the instructions in the application to download a GitHub repository, select files and folders, and chat with the LLM.
101 |
102 | If you encounter issues with a repo, you can always delete its directory under ./repos and download it again.
103 |
104 | ## Configuration
105 |
106 | The application's behavior can be customized through the following configuration options:
107 |
108 | - **Model**: The specific LLM model to use (e.g., "anthropic/claude-3-haiku", "anthropic/claude-3-opus").
109 | - **Temperature**: The temperature parameter that controls the "creativity" of the LLM's responses.
110 | - **System Prompt**: The initial prompt given to the LLM to set the desired behavior.
111 |
112 | These settings can be adjusted in the sidebar of the Streamlit application.
113 |
114 | ## Contributing
115 |
116 | If you'd like to contribute to the RepoChat-200k project, please feel free to submit issues or pull requests on the [GitHub repository](https://github.com/jw782cn/RepoChat-200k).
117 |
118 | ## License
119 |
120 | This project is licensed under the [MIT License](LICENSE).
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 | import streamlit as st
4 | from loguru import logger
5 | from openai import OpenAI
6 | from token_count import num_messages, num_tokens_from_string
7 | from llm_service import MODELS, create_client_for_model
8 | from repo_service import RepoManager
9 |
10 |
11 | class StreamHandler:
12 | def __init__(self, container, initial_text=""):
13 | self.container = container
14 | self.text = initial_text
15 |
16 | def process_token(self, token: str):
17 | self.text += token
18 | self.container.markdown(self.text)
19 |
20 | def refresh_repos():
21 | logger.info("Refreshing repositories")
22 | if 'repoManager' not in st.session_state:
23 | st.session_state['repoManager'] = RepoManager()
24 | st.session_state['repoManager'].load_repos()
25 | st.success("Refreshed repositories")
26 |
27 | def create_app():
28 | st.set_page_config(page_title="ChatWithRepo", page_icon="🤖")
29 |
30 | if 'repoManager' not in st.session_state:
31 | st.session_state['repoManager'] = RepoManager()
32 | if "messages" not in st.session_state:
33 | st.session_state["messages"] = []
34 |
35 | repoManager: RepoManager = st.session_state['repoManager']
36 | with st.sidebar:
37 | st.title("Settings for Repo")
38 | custom_repo_url = st.text_input("Custom Repository URL")
39 | col1, col2 = st.columns(2)
40 | with col1:
41 | if st.button("Add Custom Repository"):
42 | if repoManager.add_repo(custom_repo_url):
43 | st.success(f"Added custom repository: {custom_repo_url}")
44 | else:
45 | st.error(f"Repository add failed: {custom_repo_url}")
46 | repo_url = custom_repo_url
47 | with col2:
48 | if st.button("Refresh Repositories"):
49 | refresh_repos()
50 |
51 | repo_url = st.selectbox(
52 | "Repository URL", options=repoManager.get_repo_urls())
53 | if repoManager.check_if_repo_exists(repo_url):
54 | repo = repoManager.get_repo_service(repo_url)
55 | selected_folder = st.multiselect(
56 | "Select Folder", options=repo.get_folders_options())
57 | selected_files = st.multiselect(
58 | "Select Files", options=repo.get_files_options(), default="README.md")
59 | selected_languages = st.multiselect(
60 | "Filtered by Language", options=repo.get_languages_options())
61 | limit = st.number_input("Limit", value=100000, step=10000)
62 | col1, col2, col3 = st.columns(3)
63 | with col1:
64 | if st.button("Count Tokens"):
65 | file_string = repo.get_filtered_files(
66 | selected_folders=selected_folder,
67 | selected_files=selected_files,
68 | selected_languages=selected_languages,
69 | limit=limit,
70 | )
71 | st.write(
72 | f"Total Tokens: {num_tokens_from_string(file_string)}")
73 | with col2:
74 | if st.button("Update Repo"):
75 | if repo.update_repo():
76 | st.success(f"Updated repository: {repo_url}")
77 | else:
78 | st.error(f"Repository update failed: {repo_url}")
79 | st.rerun()
80 | with col3:
81 | if st.button("Delete Repo"):
82 | if repo.delete_repo():
83 | st.success(f"Deleted repository: {repo_url}")
84 | else:
85 | st.error(f"Repository delete failed: {repo_url}")
86 | refresh_repos()
87 | st.rerun()
88 |
89 | st.title("Settings for LLM")
90 |
91 | selected_model = st.selectbox("Model", options=MODELS)
92 | temperature = st.slider(
93 | "Temperature", min_value=0.0, max_value=1.0, value=0.7, step=0.1
94 | )
95 | system_prompt = st.text_area(
96 | "System Prompt",
97 | value="You are a helpful assistant. You are provided with a repo information and files from the repo. Answer the user's questions based on the information and files provided.",
98 | )
99 |
100 | if st.button("Clear Chat"):
101 | st.session_state["messages"] = []
102 |
103 | if "client" not in st.session_state:
104 | st.session_state["client"] = create_client_for_model(selected_model)
105 |
106 | if repoManager.isEmpty():
107 | st.info("Copy the repository URL and click the download button.")
108 | st.stop()
109 |
110 | if not repoManager.check_if_repo_exists(repo_url):
111 | st.info(f"{repo_url} does not exist. Please add the repository first.")
112 | st.stop()
113 |
114 | repo = repoManager.get_repo_service(repo_url)
115 | st.title(f"Repo: {repo.repo_name}")
116 | st.write(
117 | "Chat with LLM using the repository information and files. You can change model settings anytime during the chat."
118 | )
119 | st.info(
120 | f"""
121 | Files : {selected_files}
122 | Folder: {selected_folder}
123 | Languages: {selected_languages}
124 | Limit: {limit}
125 | """
126 | )
127 | for msg in st.session_state.messages:
128 | st.chat_message(msg["role"]).write(msg["content"])
129 |
130 | if prompt := st.chat_input():
131 | st.session_state.messages.append({"role": "user", "content": prompt})
132 | st.chat_message("user").write(prompt)
133 | logger.info(f"User: {prompt}, received at {pd.Timestamp.now()}")
134 |
135 | start_time = pd.Timestamp.now()
136 | # Check if the selected model has changed
137 | if "selected_model" not in st.session_state:
138 | st.session_state.selected_model = None
139 |
140 | if st.session_state.selected_model != selected_model:
141 | st.session_state.client = create_client_for_model(selected_model)
142 | st.session_state.selected_model = selected_model
143 |
144 | file_string = repo.get_filtered_files(
145 | selected_folders=selected_folder,
146 | selected_files=selected_files,
147 | selected_languages=selected_languages,
148 | limit=limit,
149 | )
150 | end_time = pd.Timestamp.now()
151 | logger.info(
152 | f"Time taken to get filtered files: {end_time - start_time}")
153 |
154 | with st.chat_message("assistant"):
155 | stream_handler = StreamHandler(st.empty())
156 | # only add file content to the system prompt
157 | messages = (
158 | [{"role": "system", "content": system_prompt}]
159 | + [{"role": "user", "content": file_string}]
160 | + st.session_state.messages
161 | )
162 | client = st.session_state["client"]
163 |
164 | # log the information
165 | total_tokens = num_messages(messages)
166 | logger.info(
167 | f"Information: {selected_files}, {selected_folder}, {selected_languages}")
168 | logger.info(f"Using settings: {selected_model}, {temperature}")
169 | logger.info(f"File token: {num_tokens_from_string(file_string)}")
170 | logger.info(f"Total Messages Token: {total_tokens}")
171 | st.sidebar.write(
172 | f"Sending file content: {selected_files} and filter folder: {selected_folder} to the assistant.")
173 | st.sidebar.write(f"total messages token: {total_tokens}")
174 |
175 | # send to llm
176 | completion = client.chat(
177 | messages, stream=True, temperature=temperature, model=selected_model
178 | )
179 |
180 | for chunk in completion:
181 | content = chunk.choices[0].delta.content
182 | stream_handler.process_token(content)
183 |
184 | st.session_state.messages.append(
185 | {"role": "assistant", "content": stream_handler.text}
186 | )
187 |
188 |
189 | if __name__ == "__main__":
190 | create_app()
191 |
--------------------------------------------------------------------------------
/repo_service.py:
--------------------------------------------------------------------------------
1 | import os
2 | import subprocess
3 | import zipfile
4 | import time
5 | import json
6 | import nbformat
7 | import requests
8 | from git import Repo, GitCommandError, NoSuchPathError, InvalidGitRepositoryError
9 | from loguru import logger
10 | from send2trash import send2trash
11 | import pandas as pd
12 | from pygments.lexers import guess_lexer_for_filename, TextLexer
13 | from functools import wraps
14 | from pygments.util import ClassNotFound
15 | from token_count import num_tokens_from_string
16 | from config import Config
17 |
18 |
19 | def convert_ipynb_to_text(ipynb_content):
20 | notebook = json.loads(ipynb_content)
21 | text = ""
22 | for cell in notebook['cells']:
23 | if cell['cell_type'] == 'markdown':
24 | text += ''.join(cell['source']) + '\n\n'
25 | elif cell['cell_type'] == 'code':
26 | text += '```python\n'
27 | text += ''.join(cell['source']) + '\n'
28 | text += '```\n\n'
29 | if len(cell['outputs']) > 0:
30 | text += '\n\n'
39 |
40 | return text.strip()
41 |
42 |
43 | def retry(max_retries=3, retry_delay=5):
44 | def decorator(func):
45 | @wraps(func)
46 | def wrapper(*args, **kwargs):
47 | last_exception = None # last exception that occurred
48 | retries = 0
49 | while retries < max_retries:
50 | try:
51 | return func(*args, **kwargs)
52 | except (subprocess.CalledProcessError, requests.exceptions.RequestException, zipfile.BadZipFile) as e:
53 | retries += 1
54 | last_exception = e # update last exception
55 | logger.error(f"Error in {func.__name__}. Retrying ({retries}/{max_retries})...")
56 | time.sleep(retry_delay)
57 | logger.error(f"Failed to execute {func.__name__} after {max_retries} retries.")
58 | if last_exception:
59 | raise last_exception # if an exception occurred, raise it
60 | else:
61 | # usually this should not happen
62 | raise Exception(f"Failed to execute {func.__name__} after {max_retries} retries without catching an exception.")
63 | return wrapper
64 | return decorator
65 |
66 |
67 |
class RepoService:
    """Manage a single local copy of a remote repository.

    Responsibilities: cloning (git CLI or HTTP zip download), updating,
    deleting, collecting per-file statistics into ``repo_stats.csv``, and
    exporting filtered, concatenated file contents for LLM prompts.
    """

    def __init__(self, repo_url, repo_name=None):
        """Bind local paths for the repo and set it up if not already present.

        Args:
            repo_url: remote URL of the repository.
            repo_name: optional local name; derived from the URL when omitted.
        """
        self.repo_url = repo_url
        self.repo_name = repo_name if repo_name else repo_url.split(
            "/")[-1].replace(".git", "")
        self.repo_path = os.path.join(Config["repos_dir"], self.repo_name)
        # Sources live in a "<name>-main" subfolder, matching the folder
        # layout of a GitHub default-branch zip download.
        self.clone_path = os.path.join(
            self.repo_path, self.repo_name + "-main")

        if self.check_if_exist():
            logger.info(
                f"Repository {self.repo_name} already exists at {self.repo_path}")
        else:
            self.set_up()

    def check_if_exist(self):
        """Return True when a complete, matching local copy already exists.

        A copy is complete when repo_info.json records the same URL, a
        non-empty repo_stats.csv is present, and the clone directory
        contains at least one entry.
        """
        repo_info_path = os.path.join(self.repo_path, "repo_info.json")
        csv_path = os.path.join(self.repo_path, "repo_stats.csv")

        if not os.path.exists(repo_info_path) or not os.path.exists(csv_path):
            return False
        if pd.read_csv(csv_path).empty:
            return False

        with open(repo_info_path, "r") as f:
            repo_info = json.load(f)
        if "repo_url" not in repo_info or repo_info["repo_url"] != self.repo_url:
            return False

        # Fix: os.listdir() raises FileNotFoundError when the clone directory
        # is missing; this used to crash instead of reporting "not present".
        if not os.path.isdir(self.clone_path) or not os.listdir(self.clone_path):
            return False
        return True

    def set_up(self):
        """Create the repo directory, record its URL, clone it and build stats."""
        os.makedirs(self.repo_path, exist_ok=True)
        # Fix: always (re)write repo_info.json. Previously it was only written
        # when the directory was freshly created, so a half-initialized
        # directory (present but missing repo_info.json) could never pass
        # check_if_exist().
        repo_info = {"repo_url": self.repo_url}
        with open(os.path.join(self.repo_path, "repo_info.json"), "w") as f:
            json.dump(repo_info, f)
        self.clone_repo()
        if not os.path.exists(os.path.join(self.repo_path, "repo_stats.csv")):
            self.get_repo_stats()
        logger.info(
            f"Repository {self.repo_name} set up successfully at {self.repo_path}")
        stats_path = os.path.join(self.repo_path, 'repo_stats.csv')
        # A failed clone may have trashed the whole directory; only report the
        # timestamp when the stats file actually exists.
        if os.path.exists(stats_path):
            logger.info(
                f"Last updated: {time.ctime(os.path.getmtime(stats_path))}")

    def clone_repo(self):
        """Fetch the repository using the method configured in ``download_method``.

        "git" and "http" force one method; "auto" tries git first and falls
        back to HTTP download.

        Returns:
            True on success, False on failure or bad configuration.
        """
        if os.path.exists(self.clone_path) and os.listdir(self.clone_path):
            logger.info(
                f"The repository {self.repo_name} already exists at {self.clone_path}.")
            return True

        os.makedirs(self.clone_path, exist_ok=True)
        download_method = Config.get("download_method", "auto").lower()

        if download_method == "git":
            return self.try_clone_using_git()
        elif download_method == "http":
            return self.try_clone_using_http()
        elif download_method == "auto":
            if self.try_clone_using_git():
                return True
            logger.info("Git clone failed. Trying HTTP download.")
            return self.try_clone_using_http()

        logger.error(
            f"Invalid download method specified in config: {download_method}")
        return False

    def try_clone_using_git(self):
        """Clone with the git CLI; on failure trash the repo dir and return False."""
        try:
            # Probe for a usable git binary before attempting the clone.
            subprocess.run(["git", "--version"], check=True,
                           stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            self._clone_using_git()
            return True
        except (subprocess.CalledProcessError, subprocess.TimeoutExpired,
                FileNotFoundError) as e:
            # Fix: _clone_using_git() runs with timeout=60; TimeoutExpired
            # previously escaped this handler and crashed the caller.
            logger.error(
                f"Failed to clone repository {self.repo_name} using Git. {e}")
            self.delete_repo()
            return False

    def try_clone_using_http(self):
        """Download a zip snapshot over HTTP; on failure trash the repo dir."""
        try:
            self._clone_using_download()
            return True
        except (requests.exceptions.RequestException, zipfile.BadZipFile) as e:
            logger.error(
                f"Failed to clone repository {self.repo_name} using HTTP download method. {e}")
            self.delete_repo()
            return False

    @retry(max_retries=1, retry_delay=5)
    def _clone_using_git(self):
        """Run ``git clone`` into the clone path (retried once on failure)."""
        logger.info(f"Cloning repository {self.repo_name} using Git...")
        subprocess.run(["git", "clone", self.repo_url,
                        self.clone_path], check=True, timeout=60)

    @retry(max_retries=1, retry_delay=5)
    def _clone_using_download(self):
        """Download the repository as a zip archive and unpack it in place.

        Raises:
            requests.exceptions.RequestException: on a non-200 response.
            zipfile.BadZipFile: when the downloaded payload is not a zip.
        """
        logger.info(f"Cloning repository {self.repo_name} using download...")
        # Fix: a failed git attempt in "auto" mode trashes repo_path via
        # delete_repo(); recreate it so the zip download has somewhere to land.
        os.makedirs(self.repo_path, exist_ok=True)
        response = requests.get(self.repo_url, timeout=60)
        if response.status_code == 200:
            zip_path = os.path.join(self.repo_path, "repo.zip")
            with open(zip_path, "wb") as f:
                f.write(response.content)
            with zipfile.ZipFile(zip_path, "r") as zip_ref:
                zip_ref.extractall(self.repo_path)
            os.remove(zip_path)
        else:
            raise requests.exceptions.RequestException(
                f"Failed to download repository {self.repo_name}")

    def update_repo(self):
        """Fetch and pull the latest changes; refresh stats when they land.

        Returns:
            True when up-to-date or updated successfully, False on git errors
            (including repos obtained via HTTP download, which have no .git).
        """
        try:
            logger.info(f"Updating repository {self.repo_name}...")
            repo = Repo(self.clone_path)
            origin = repo.remotes.origin
            origin.fetch()  # fetch remote state without merging

            current_commit = repo.head.commit  # local HEAD
            # tip of the tracked remote branch
            remote_commit = origin.refs[repo.active_branch.name].commit

            if current_commit.hexsha == remote_commit.hexsha:
                logger.info(
                    f"Repository {self.repo_name} is already up-to-date.")
                return True  # nothing to pull

            origin.pull()
            logger.info(f"Repository {self.repo_name} updated successfully.")

            # stats are stale after a pull; rebuild them
            self.get_repo_stats()
            return True
        except (GitCommandError, NoSuchPathError, InvalidGitRepositoryError) as e:
            logger.error(f"Failed to update repository {self.repo_name}: {e}")
            return False

    def delete_repo(self):
        """Move the whole repo directory to the trash. Returns True if removed."""
        if os.path.exists(self.repo_path):
            send2trash(self.repo_path)
            logger.info(
                f"Deleted repository {self.repo_name} at {self.repo_path}")
            return True
        else:
            logger.info(
                f"Repository {self.repo_name} does not exist at {self.repo_path}")
            return False

    def get_repo_stats(self):
        """Walk the clone, record per-file stats and save repo_stats.csv.

        ``.git`` directories are skipped. Notebooks are normalized through
        nbformat; other files are read as UTF-8 text (errors ignored) and
        their language is guessed from the filename/content.

        Returns:
            The DataFrame that was written to disk.
        """
        data = []
        for root, dirs, files in os.walk(self.clone_path):
            if '.git' in dirs:
                dirs.remove('.git')  # don't visit .git directories

            for file in files:
                file_path = os.path.join(root, file)
                rel_path = os.path.relpath(file_path, self.clone_path)
                content = ''
                language = None
                if file.endswith('.ipynb'):
                    try:
                        with open(file_path, 'r', encoding='utf-8') as f:
                            notebook = nbformat.read(f, as_version=4)
                        content = nbformat.writes(notebook)
                        language = 'Jupyter Notebook'
                    except Exception as e:
                        # Fix: a malformed notebook used to abort the whole
                        # walk; fall back to treating it as plain text.
                        logger.warning(
                            f"Failed to parse notebook {file_path}: {e}")
                        with open(file_path, 'r', encoding='utf-8',
                                  errors='ignore') as f:
                            content = f.read()
                else:
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                        content = f.read()

                if language is None:
                    try:
                        lexer = guess_lexer_for_filename(file_path, content)
                        language = lexer.name
                        # Pygments falls back to TextLexer; treat that as
                        # "language unknown" rather than a real detection.
                        if isinstance(lexer, TextLexer):
                            language = None
                    except ClassNotFound:
                        language = None

                data.append({
                    'file_content': content,
                    'language': language,
                    'line_count': len(content.split('\n')),
                    'file_size': os.path.getsize(file_path),
                    'file_name': file,
                    'file_path': rel_path,
                    'token_count': num_tokens_from_string(content),
                    'description': None,
                    'graph': None
                })

        df = pd.DataFrame(data)
        csv_path = os.path.join(self.repo_path, "repo_stats.csv")
        df.to_csv(csv_path, index=False, escapechar='\\')
        logger.info(f"Saved repo stats to {csv_path}")
        return df

    def filter_files(self, selected_files=None, selected_folders=None, selected_languages=None):
        """Return the stats rows matching the given files/folders/languages.

        Paths are compared case-insensitively with '/' separators. With no
        file or folder selection the result is empty (nothing is selected
        by default); the language filter is applied afterwards.
        """
        csv_path = os.path.join(self.repo_path, "repo_stats.csv")
        df = pd.read_csv(csv_path)
        # normalize stored paths so selections match regardless of OS
        df['file_path'] = df['file_path'].apply(
            lambda x: x.replace(os.sep, '/').replace('\\', '/').lower())

        final_condition = pd.Series([False] * len(df))

        if selected_files:
            selected_files = [path.replace(
                os.sep, '/').replace('\\', '/').lower() for path in selected_files]
            final_condition |= df['file_path'].isin(selected_files)

        if selected_folders:
            selected_folders = [folder.replace(
                os.sep, '/').replace('\\', '/').lower() for folder in selected_folders]
            # a file matches when its path starts with any selected folder
            folder_condition = pd.Series([any(df['file_path'].iloc[i].startswith(
                folder) for folder in selected_folders) for i in range(len(df))])
            final_condition |= folder_condition

        df = df[final_condition]

        if selected_languages:
            df = df[df['language'].isin(selected_languages)]

        return df

    def get_language_percentage(self):
        """Return per-language percentage of total lines, or None if unknown."""
        csv_path = os.path.join(self.repo_path, "repo_stats.csv")
        df = pd.read_csv(csv_path)

        if df['language'].isna().all():
            logger.warning(
                "Warning: 'language' column is empty. Please make sure the 'language' column is populated.")
            return None

        language_counts = df.groupby('language')['line_count'].sum()
        total_lines = language_counts.sum()

        if total_lines == 0:
            logger.warning(
                "Warning: Total line count is zero. Cannot calculate language percentage.")
            return None

        language_percentage = language_counts / total_lines * 100
        return language_percentage

    def print_directory_structure(self):
        """Log the repo's directory tree, one indented entry per line."""
        directory_structure = {}

        for root, dirs, files in os.walk(self.repo_path):
            for file in files:
                file_path = os.path.relpath(
                    os.path.join(root, file), self.repo_path)
                parts = file_path.split(os.sep)
                current_level = directory_structure
                # build a nested dict keyed by path components
                for part in parts:
                    if part not in current_level:
                        current_level[part] = {}
                    current_level = current_level[part]

        def print_structure(structure, level=0):
            # depth-first walk of the nested dict, indenting by depth
            for key, value in structure.items():
                logger.info(' ' * level + '- ' + key)
                print_structure(value, level + 1)

        print_structure(directory_structure)

    def preprocess_dataframe(self, df, limit=None, concat_method='xml', include_directory=True, metadata_list=None):
        """Concatenate the files in ``df`` into one prompt-ready string.

        Args:
            df: DataFrame with file_path / file_content / language columns.
            limit: optional token budget; once a file would push the running
                total past it, that file is rolled back and iteration stops.
            concat_method: 'xml' wraps each file in tags; anything else uses
                a plain "File:/Metadata:/Content:" layout.
            include_directory: prepend an indented directory listing.
            metadata_list: optional column names to emit for each file.

        Returns:
            The concatenated string, stripped of surrounding whitespace.
        """
        result = ''

        if include_directory:
            directory_structure = {}
            for _, row in df.iterrows():
                file_path = row['file_path']
                parts = file_path.split('/')
                current_level = directory_structure
                for part in parts:
                    if part not in current_level:
                        current_level[part] = {}
                    current_level = current_level[part]

            def flatten_directory(structure, prefix=''):
                # render the nested dict as indented lines
                flattened = []
                for key, value in structure.items():
                    flattened.append(prefix + key)
                    flattened.extend(flatten_directory(value, prefix + ' '))
                return flattened

            directory_lines = flatten_directory(directory_structure)
            result += 'Directory Structure:\n' + \
                '\n'.join(directory_lines) + '\n\n'

        for _, row in df.iterrows():
            # snapshot so we can roll back if this file busts the token limit
            checkpoint = result
            result += '\n\n' + '=' * 10 + '\n\n'
            content = row['file_content']
            if row['language'] == 'Jupyter Notebook':
                content = convert_ipynb_to_text(content)

            if metadata_list:
                metadata = [str(row[col]) for col in metadata_list]
            else:
                metadata = ""

            if concat_method == 'xml':
                # Fix: the original emitted f-strings whose XML tags had been
                # stripped (e.g. "result += f'\n'" and a no-op "result += ''"),
                # so the 'xml' mode produced no markup; restore per-file tags.
                result += f'<file path="{row["file_path"]}">\n'
                if metadata:
                    result += f'<metadata>{", ".join(metadata)}</metadata>\n'
                result += f'<content>\n{content}\n</content>\n'
                result += '</file>'
            else:
                result += f'File: {row["file_path"]}\n'
                if metadata:
                    result += f'Metadata: {", ".join(metadata)}\n'
                result += f'Content:\n{content}'
            result += '\n\n' + '=' * 10 + '\n\n'
            if limit and num_tokens_from_string(result) > limit:
                result = checkpoint
                break

        return result.strip()

    def get_filtered_files(self, selected_folders=None, selected_files=None, selected_languages=None, limit=None, concat_method='xml', include_directory=True, metadata_list=None):
        """Filter the repo's files and return them as one concatenated string."""
        filtered_files = self.filter_files(
            selected_folders=selected_folders, selected_files=selected_files, selected_languages=selected_languages)
        file_string = self.preprocess_dataframe(filtered_files, limit=limit, concat_method=concat_method,
                                                include_directory=include_directory, metadata_list=metadata_list)
        return file_string

    def get_content_from_file_name(self, file_name):
        """Return the stored content of the first file matching ``file_name``.

        Raises:
            IndexError: when no row in repo_stats.csv matches the name.
        """
        csv_path = os.path.join(self.repo_path, "repo_stats.csv")
        df = pd.read_csv(csv_path)
        df = df[df["file_name"] == file_name]
        row = df.iloc[0]
        return row["file_content"]

    def get_folders_options(self):
        """Return the sorted list of folders containing tracked files."""
        csv_path = os.path.join(self.repo_path, "repo_stats.csv")
        df = pd.read_csv(csv_path)
        file_paths = df['file_path'].dropna().unique()
        # filter out files start with .git
        file_paths = [
            file for file in file_paths if not file.startswith('.git')]
        folders = list(set([os.path.dirname(file) for file in file_paths]))
        return sorted(folders)

    def get_files_options(self):
        """Return the sorted list of tracked file paths."""
        csv_path = os.path.join(self.repo_path, "repo_stats.csv")
        df = pd.read_csv(csv_path)
        # filter out files start with .git
        files = df['file_path'].dropna().unique()
        files = [file for file in files if not file.startswith('.git')]
        return sorted(files)

    def get_languages_options(self):
        """Return the sorted list of detected languages."""
        csv_path = os.path.join(self.repo_path, "repo_stats.csv")
        df = pd.read_csv(csv_path)
        languages = df['language'].dropna().unique()
        return sorted(languages)
427 |
428 |
def singleton(cls):
    """Class decorator: every call yields the same lazily created instance.

    Constructor arguments are honored only on the first call; subsequent
    calls return the cached instance regardless of arguments.
    """
    _cache = {}

    def get_instance(*args, **kwargs):
        try:
            return _cache[cls]
        except KeyError:
            _cache[cls] = cls(*args, **kwargs)
            return _cache[cls]
    return get_instance
437 |
438 |
@singleton
class RepoManager:
    """Singleton registry mapping repository URLs to RepoService instances."""

    def __init__(self):
        """Ensure the repos directory exists and load every repo found in it."""
        logger.info("Initializing RepoManager...")
        self.repos = {}
        # if no repo dir
        if not os.path.exists(Config["repos_dir"]):
            os.makedirs(Config["repos_dir"], exist_ok=True)
        self.load_repos()
        logger.info(f"Loaded {len(self.repos)} repositories.")

    def _find_repos(self):
        """Scan the repos directory and return metadata for valid repos.

        A subdirectory counts as a repo when it contains repo_stats.csv and
        a resolvable URL — either from repo_info.json (rewritten to strip
        stray quotes) or from a legacy repo_url.txt, which is migrated to
        repo_info.json and then removed.
        """
        repos = []
        top_level = Config["repos_dir"]
        for repo_dir in os.listdir(top_level):
            repo_path = os.path.join(top_level, repo_dir)
            if not os.path.isdir(repo_path):
                continue
            if 'repo_stats.csv' not in os.listdir(repo_path):
                continue

            root = repo_path
            repo_info_path = os.path.join(root, "repo_info.json")
            repo_url_txt_path = os.path.join(root, "repo_url.txt")
            # Fix: reset repo_url for every directory. It used to be unbound
            # on the first iteration (NameError) or leak from the previous
            # directory when repo_info.json failed to parse.
            repo_url = ""

            if os.path.exists(repo_info_path):
                try:
                    with open(repo_info_path, "r") as f:
                        repo_info = json.load(f)
                except json.JSONDecodeError as e:
                    logger.error(
                        f"Error decoding JSON from {repo_info_path}: {e}")
                else:
                    # fix repo_url if it has extra quotes
                    repo_url = repo_info.get("repo_url", "").strip('"')
                    repo_info['repo_url'] = repo_url
                    # Rewrite after the read handle is closed (the original
                    # opened the same file for writing while still reading it).
                    with open(repo_info_path, "w") as f_update:
                        json.dump(repo_info, f_update)
            elif os.path.exists(repo_url_txt_path):
                with open(repo_url_txt_path, "r") as f:
                    repo_url = f.read().strip().strip('"')  # legacy support
                repo_info = {"repo_url": repo_url}
                with open(repo_info_path, "w") as f:
                    json.dump(repo_info, f)
                os.remove(repo_url_txt_path)  # delete legacy file

            if repo_url:
                repos.append({
                    "repo_name": os.path.basename(root),
                    "repo_url": repo_url,
                    "last_updated": time.ctime(os.path.getmtime(os.path.join(root, "repo_stats.csv")))
                })

        return repos

    def load_repos(self):
        """Instantiate a RepoService for every repo found on disk."""
        repo_details = self._find_repos()
        for repo in repo_details:
            repo_url = repo["repo_url"]
            repo_name = repo["repo_name"]
            self.repos[repo_url] = RepoService(
                repo_url=repo_url, repo_name=repo_name)

    def add_repo(self, repo_url):
        """Clone and register a repo by URL. Returns True unless setup failed."""
        if repo_url not in self.repos:
            repo_service = RepoService(repo_url=repo_url)
            if repo_service.check_if_exist():
                self.repos[repo_url] = repo_service
                logger.info(f"Added repository: {repo_url}")
            else:
                logger.error(f"Failed to add repository: {repo_url}")
                return False
        else:
            logger.warning(f"Repository already exists: {repo_url}")
        return True

    def delete_repo(self, repo_url):
        """Remove a registered repo from disk and from the registry."""
        if repo_url in self.repos:
            self.repos[repo_url].delete_repo()
            del self.repos[repo_url]
            logger.info(f"Deleted repository: {repo_url}")
        else:
            logger.warning(f"Repository does not exist: {repo_url}")

    def update_all_repos(self):
        """Pull the latest changes for every registered repo."""
        for repo_service in self.repos.values():
            repo_service.update_repo()

    def get_repo_service(self, repo_url) -> RepoService:
        """Return the RepoService for ``repo_url``, or None if unknown."""
        return self.repos.get(repo_url)

    def get_repo_urls(self):
        """Return the list of registered repository URLs."""
        return list(self.repos.keys())

    def check_if_repo_exists(self, repo_url):
        """Return True when ``repo_url`` is registered."""
        return repo_url in self.repos

    def isEmpty(self):
        """Return True when no repositories are registered."""
        return len(self.repos) == 0
538 |
539 |
if __name__ == "__main__":
    # Manual smoke test: instantiating the singleton manager loads every
    # repository already present on disk (url -> RepoService).
    manager = RepoManager()
547 |
--------------------------------------------------------------------------------