├── .github ├── ISSUE_TEMPLATE.md ├── PULL_REQUEST_TEMPLATE.md ├── gitleaks.toml ├── workflows │ ├── secret-scan.yml │ └── publish.yml └── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md ├── docs ├── examples │ ├── simple_query.md │ └── advanced_integration.md ├── high_level_architecture.gif ├── data_collection_policy.md ├── installation.md ├── api_reference.md └── usage.md ├── llama_github ├── config │ ├── __init__.py │ ├── config.py │ └── config.json ├── features │ ├── __init__.py │ ├── feature_flags.py │ └── insider_features.py ├── data_retrieval │ ├── __init__.py │ └── github_api.py ├── llm_integration │ ├── __init__.py │ ├── llm_handler.py │ └── initial_load.py ├── rag_processing │ ├── __init__.py │ └── rag_processor.py ├── version.py ├── __init__.py ├── github_integration │ ├── __init__.py │ └── github_auth_manager.py ├── logger.py ├── utils.py └── github_rag.py ├── tests ├── __init__.py ├── test_logger.py ├── test_llm_handler.py ├── conftest.py ├── test_initial_load.py ├── test_utils.py ├── test_github_auth_manager.py ├── test_rag_processor.py └── test_data_retrieval.py ├── .vscode └── settings.json ├── MANIFEST.in ├── requirements.txt ├── setup.py ├── setup.cfg ├── VISION_AND_ROADMAP.md ├── CODE_OF_CONDUCT.md ├── .gitignore ├── CONTRIBUTING.md ├── CHANGELOG.md ├── README.md └── LICENSE /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/examples/simple_query.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llama_github/config/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llama_github/features/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llama_github/features/feature_flags.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llama_github/data_retrieval/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llama_github/features/insider_features.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llama_github/llm_integration/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llama_github/rag_processing/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llama_github/version.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.3.3' 2 | 
-------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # tests/__init__.py 2 | # Marker file for test package -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.REPL.enableREPLSmartSend": false 3 | } -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include llama_github/config/config.json 2 | include CHANGELOG.md -------------------------------------------------------------------------------- /docs/high_level_architecture.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JetXu-LLM/llama-github/HEAD/docs/high_level_architecture.gif -------------------------------------------------------------------------------- /llama_github/__init__.py: -------------------------------------------------------------------------------- 1 | from .version import __version__ 2 | from .github_rag import GithubRAG 3 | from .logger import configure_logging -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp 2 | asyncio 3 | PyGithub 4 | langchain 5 | numpy 6 | pydantic 7 | requests 8 | transformers 9 | urllib3 10 | pydantic_core 11 | langchain_openai 12 | langchain_mistralai 13 | httpx_sse 14 | tokenizers -------------------------------------------------------------------------------- /llama_github/github_integration/__init__.py: -------------------------------------------------------------------------------- 1 | # Import the AuthManager for easy access 2 | from .github_auth_manager import GitHubAuthManager 3 | 4 | # Define what is available for import 5 | __all__ = ["GitHubAuthManager"] 6 | 7 | # Any initialization code specific to GitHub integration can go here 8 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | from configparser import ConfigParser 3 | 4 | # Read the requirements from requirements.txt 5 | with open("requirements.txt", "r", encoding="utf-8") as fh: 6 | requirements = fh.readlines() 7 | 8 | # Read version from setup.cfg 9 | config = ConfigParser() 10 | config.read('setup.cfg') 11 | version = config['metadata']['version'] 12 | 13 | setup( 14 | version=version, 15 | install_requires=[req.strip() for req in requirements], 16 | ) -------------------------------------------------------------------------------- /.github/gitleaks.toml: -------------------------------------------------------------------------------- 1 | title = "gitleaks config" 2 | 3 | [[rules]] 4 | description = "OpenAI API Key" 5 | regex = '''sk-[a-zA-Z0-9]{32}''' 6 | tags = ["apikey"] 7 | 8 | [[rules]] 9 | description = "Hugging Face Token" 10 | regex = '''hf_[a-zA-Z0-9]{40}''' 11 | tags = ["apikey"] 12 | 13 | [[rules]] 14 | description = "GitHub Token" 15 | regex = '''ghp_[a-zA-Z0-9]{36}''' 16 | tags = ["apikey"] 17 | 18 | [[rules]] 19 | description = "Jina AI API Key" 20 | regex = '''jina_[a-zA-Z0-9]{32}[a-zA-Z0-9]{16}''' 21 | tags = ["apikey"] 22 
| -------------------------------------------------------------------------------- /llama_github/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger = logging.getLogger('llama_github') 4 | 5 | def configure_logging(level=logging.INFO, handler=None): 6 | logger.setLevel(level) 7 | if handler: 8 | logger.addHandler(handler) 9 | else: 10 | # default handler output to console 11 | ch = logging.StreamHandler() 12 | ch.setLevel(level) 13 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 14 | ch.setFormatter(formatter) 15 | logger.addHandler(ch) -------------------------------------------------------------------------------- /.github/workflows/secret-scan.yml: -------------------------------------------------------------------------------- 1 | name: Secret Scan 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | gitleaks: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - name: Checkout code 10 | uses: actions/checkout@v3 11 | 12 | - name: Install Gitleaks 13 | run: | 14 | curl -sSL https://github.com/zricethezav/gitleaks/releases/download/v8.2.0/gitleaks_8.2.0_linux_x64.tar.gz | tar -xz -C /usr/local/bin gitleaks 15 | 16 | - name: Run Gitleaks 17 | run: | 18 | gitleaks detect --source . --config .github/gitleaks.toml 19 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish Python Package 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | permissions: 11 | id-token: write 12 | steps: 13 | - uses: actions/checkout@v3 14 | - name: Set up Python 15 | uses: actions/setup-python@v4 16 | with: 17 | python-version: '3.x' 18 | - name: Install dependencies 19 | run: | 20 | python -m pip install --upgrade pip 21 | pip install setuptools wheel 22 | - name: Build package 23 | run: python setup.py sdist bdist_wheel 24 | - name: List distribution files 25 | run: ls -l dist/ 26 | - name: Publish package 27 | uses: pypa/gh-action-pypi-publish@release/v1 28 | with: 29 | packages-dir: dist/ -------------------------------------------------------------------------------- /llama_github/config/config.py: -------------------------------------------------------------------------------- 1 | # config.py 2 | import json 3 | from importlib import resources 4 | from llama_github.logger import logger 5 | 6 | # utils.py 7 | class SingletonMeta(type): 8 | _instances = {} 9 | def __call__(cls, *args, **kwargs): 10 | if cls not in cls._instances: 11 | instance = super().__call__(*args, **kwargs) 12 | cls._instances[cls] = instance 13 | return cls._instances[cls] 14 | 15 | class Config(metaclass=SingletonMeta): 16 | _config = None 17 | 18 | def __init__(self): 19 | if Config._config is None: 20 | with resources.open_text('llama_github.config', 'config.json') as file: 21 | Config._config = json.load(file) 22 | 23 | @classmethod 24 | def get(cls, key, default=None): 25 | # Ensure the singleton instance is created 26 | if cls._config is None: 27 | cls() 28 | return cls._config.get(key, default) 29 | 30 | config = Config() -------------------------------------------------------------------------------- /tests/test_logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pytest 3 | from llama_github.logger import configure_logging, logger 4 | 5 | def test_configure_logging_defaults(): 6 | """Test default logging configuration.""" 7 | # Reset handlers 8 | logger.handlers = [] 9 | 10 | configure_logging() 11 | 12 | assert logger.level == logging.INFO 13 | assert len(logger.handlers) == 1 14 | assert isinstance(logger.handlers[0], logging.StreamHandler) 15 | 16 | def test_configure_logging_custom_level(): 17 | """Test logging with custom level.""" 18 | logger.handlers = [] 19 | configure_logging(level=logging.DEBUG) 20 | assert logger.level == logging.DEBUG 21 | 22 | def test_configure_logging_custom_handler(): 23 | """Test logging with a custom handler.""" 24 | logger.handlers = [] 25 | custom_handler = logging.NullHandler() 26 | configure_logging(handler=custom_handler) 27 | 28 | assert len(logger.handlers) == 1 29 | assert logger.handlers[0] == custom_handler -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. 
Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = llama-github 3 | version = 0.3.3 4 | author = Jet Xu 5 | author_email = Voldemort.xu@foxmail.com 6 | description = Llama-github is an open-source Python library that empowers LLM Chatbots, AI Agents, and Auto-dev Agents to conduct Retrieval from actively selected GitHub public projects. It Augments through LLMs and Generates context for any coding question, in order to streamline the development of sophisticated AI-driven applications. 7 | long_description = file: README.md, CHANGELOG.md 8 | long_description_content_type = text/markdown 9 | url = https://github.com/JetXu-LLM/llama-github 10 | classifiers = 11 | Programming Language :: Python :: 3 12 | License :: OSI Approved :: Apache Software License 13 | Operating System :: OS Independent 14 | 15 | [options] 16 | packages = find: 17 | python_requires = >=3.6 18 | include_package_data = True 19 | 20 | [options.packages.find] 21 | include = 22 | llama_github 23 | llama_github.* 24 | 25 | [options.package_data] 26 | llama_github = config/config.json 27 | 28 | [options.extras_require] 29 | dev = 30 | pytest 31 | black 32 | flake8 33 | -------------------------------------------------------------------------------- /tests/test_llm_handler.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from unittest.mock import MagicMock, AsyncMock 3 | from llama_github.llm_integration.llm_handler import LLMHandler 4 | from langchain_core.messages import HumanMessage, AIMessage, SystemMessage 5 | 6 | @pytest.mark.asyncio 7 | async def test_ainvoke_basic(): 8 | mock_manager = MagicMock() 9 | mock_llm = MagicMock() 10 | mock_llm.ainvoke = AsyncMock(return_value="AI Response") 11 | mock_manager.get_llm.return_value = mock_llm 12 | mock_manager.model_type = "OpenAI" 13 | 14 | handler = LLMHandler(llm_manager=mock_manager) 15 | 16 | response = await handler.ainvoke("Hello") 17 | assert response == "AI Response" 18 | 19 | def test_compose_chat_history(): 20 | handler = LLMHandler(MagicMock()) 21 | history = ["Hi", "Hello"] 22 | messages = handler._compose_chat_history_messages(history) 23 | 24 | assert len(messages) == 2 25 | assert isinstance(messages[0], HumanMessage) 26 | assert messages[0].content == "Hi" 27 | assert isinstance(messages[1], AIMessage) 28 | assert messages[1].content == "Hello" 29 | 30 | def test_compose_context_messages(): 31 | handler = LLMHandler(MagicMock()) 32 | context = ["ctx1", "ctx2"] 33 | messages = handler._compose_context_messages(context) 34 | 35 | assert len(messages) == 2 36 | assert isinstance(messages[0], SystemMessage) 
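
# Additional sketch (not part of the original suite): both helper methods guard
# their input with `or []` in llm_handler.py, so empty or missing history/context
# should simply yield no messages. Assumes LLMHandler accepts a mocked manager,
# as in the tests above.
def test_compose_messages_empty_inputs():
    handler = LLMHandler(MagicMock())
    assert handler._compose_chat_history_messages(None) == []
    assert handler._compose_context_messages([]) == []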
-------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from unittest.mock import MagicMock, AsyncMock 3 | import sys 4 | from datetime import datetime, timezone 5 | 6 | # Mock external dependencies that might try to connect to internet or load heavy models 7 | sys.modules['langchain_openai'] = MagicMock() 8 | sys.modules['langchain_mistralai'] = MagicMock() 9 | sys.modules['transformers'] = MagicMock() 10 | sys.modules['sentence_transformers'] = MagicMock() 11 | 12 | @pytest.fixture 13 | def mock_github_instance(): 14 | """Mocks the ExtendedGithub instance.""" 15 | mock = MagicMock() 16 | mock.get_user.return_value.login = "test_user" 17 | return mock 18 | 19 | @pytest.fixture 20 | def mock_repo_object(): 21 | """Mocks a PyGithub Repository object.""" 22 | mock_repo = MagicMock() 23 | mock_repo.id = 12345 24 | mock_repo.name = "test-repo" 25 | mock_repo.full_name = "owner/test-repo" 26 | mock_repo.description = "A test repository" 27 | mock_repo.html_url = "https://github.com/owner/test-repo" 28 | mock_repo.stargazers_count = 100 29 | mock_repo.subscribers_count = 10 30 | mock_repo.language = "Python" 31 | mock_repo.default_branch = "main" 32 | mock_repo.updated_at = datetime.now(timezone.utc) 33 | return mock_repo 34 | 35 | @pytest.fixture 36 | def mock_content_file(): 37 | """Mocks a PyGithub ContentFile object.""" 38 | mock_file = MagicMock() 39 | mock_file.name = "test.py" 40 | mock_file.path = "src/test.py" 41 | mock_file.encoding = "base64" 42 | mock_file.content = "cHJpbnQoImhlbGxvIik=" # print("hello") in base64 43 | mock_file.decoded_content = b'print("hello")' 44 | return mock_file 45 | 46 | @pytest.fixture 47 | def mock_llm_handler(): 48 | """Mocks the LLMHandler.""" 49 | handler = MagicMock() 50 | handler.ainvoke = AsyncMock() 51 | return handler -------------------------------------------------------------------------------- /tests/test_initial_load.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from unittest.mock import patch, MagicMock 3 | from llama_github.llm_integration.initial_load import LLMManager 4 | 5 | @pytest.fixture(autouse=True) 6 | def reset_singleton(): 7 | LLMManager._instance = None 8 | LLMManager._initialized = False 9 | yield 10 | LLMManager._instance = None 11 | 12 | class TestLLMManager: 13 | @patch('llama_github.llm_integration.initial_load.ChatOpenAI') 14 | def test_init_openai(self, mock_chat_openai): 15 | manager = LLMManager(openai_api_key="sk-test", simple_mode=True) 16 | assert manager.model_type == "OpenAI" 17 | assert manager.llm is not None 18 | mock_chat_openai.assert_called() 19 | 20 | @patch('llama_github.llm_integration.initial_load.AutoTokenizer') 21 | @patch('llama_github.llm_integration.initial_load.AutoModel') 22 | @patch('llama_github.llm_integration.initial_load.AutoModelForSequenceClassification') 23 | def test_init_huggingface_full_mode(self, mock_seq, mock_model, mock_tokenizer): 24 | # Mock system checks 25 | with patch('sys.platform', 'linux'), \ 26 | patch('subprocess.run'): 27 | 28 | manager = LLMManager( 29 | open_source_models_hg_dir="/tmp", 30 | simple_mode=False, 31 | embedding_model="emb-model", 32 | rerank_model="rerank-model" 33 | ) 34 | 35 | assert manager.tokenizer is not None 36 | assert manager.embedding_model is not None 37 | assert manager.rerank_model is not None 38 | 
mock_tokenizer.from_pretrained.assert_called_with("emb-model") 39 | 40 | def test_simple_mode_skips_heavy_models(self): 41 | manager = LLMManager(simple_mode=True) 42 | assert manager.embedding_model is None 43 | assert manager.rerank_model is None -------------------------------------------------------------------------------- /VISION_AND_ROADMAP.md: -------------------------------------------------------------------------------- 1 | # Vision and Roadmap 2 | 3 | ## Vision 4 | 5 | Our vision is to transform Llama-github into a cornerstone module for AI-driven development solutions. By seamlessly integrating with GitHub, Llama-github will empower Large Language Models (LLMs) to autonomously resolve complex coding tasks. This involves efficiently retrieving relevant code snippets, issues, and repository information, and transforming them into valuable knowledge contexts that enhance the capabilities of LLM Chatbots, AI Agents, and Auto-dev Agents. 6 | 7 | ### Future Vision: Llama-github in Automated AI-Driven Development 8 | 9 | The future vision for Llama-github is to be an integral part of an automated AI-driven development solution. This involves: 10 | 11 | - **Efficient Coding Knowledge Retrieval**: Leveraging advanced question analysis and contextual answer generation. 12 | - **Repository Pool Caching**: Enhancing retrieval efficiency through asynchronous processing and flexible GitHub API integration. 13 | - **Contextual Answer Generation**: Utilizing advanced language models to generate comprehensive and contextually relevant answers. 14 | 15 | ![Vision Architecture](./docs/vision.drawio.svg) 16 | 17 | ## Roadmap 18 | 19 | Our roadmap outlines the key phases and tasks needed to achieve our vision. Each phase builds upon the previous one, ensuring a structured and methodical approach to development. 20 | 21 | ### Phase 1: In-depth Analysis of a Single Repository 22 | - **Task 1.1**: Initial Repository Content Analysis 23 | - **Task 1.2**: Integrate Advanced Algorithms for In-depth Analysis 24 | - **Task 1.3**: Optimize Retrieval Results 25 | 26 | ### Phase 2: Predefined Repositories Feature 27 | - **Task 2.1**: Implement User-defined Repository Feature 28 | - **Task 2.2**: Optimize Loading and Analysis of Predefined Repositories 29 | - **Task 2.3**: Enhance Retrieval Speed and Accuracy 30 | 31 | ### Phase 3: Integration with Vector Database for Persistent Caching 32 | - **Task 3.1**: Integrate Vector Database 33 | - **Task 3.2**: Implement Persistent Caching 34 | - **Task 3.3**: Enhance Large-scale Production Deployment Capability 35 | 36 | ### Additional Features 37 | - **Add User-defined Retrieval Strategy Feature** 38 | - **Implement Multi-language Support (e.g., Chinese support based on QWen2 model)** 39 | - **Integrate More LLM Providers** 40 | 41 | For a detailed view of our project roadmap, please visit our [Project Roadmap](https://github.com/users/JetXu-LLM/projects/2). 
-------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from llama_github.utils import DiffGenerator, DataAnonymizer, CodeAnalyzer 3 | 4 | class TestDiffGenerator: 5 | def test_generate_custom_diff_simple(self): 6 | base = "line1\nline2\nline3" 7 | head = "line1\nline2 modified\nline3" 8 | diff = DiffGenerator.generate_custom_diff(base, head, context_lines=1) 9 | assert "line2 modified" in diff 10 | assert "line1" in diff 11 | 12 | def test_generate_custom_diff_new_file(self): 13 | base = None 14 | head = "new line" 15 | diff = DiffGenerator.generate_custom_diff(base, head, context_lines=1) 16 | assert "+ new line" in diff 17 | 18 | def test_generate_custom_diff_deleted_file(self): 19 | base = "old line" 20 | head = None 21 | diff = DiffGenerator.generate_custom_diff(base, head, context_lines=1) 22 | assert "- old line" in diff 23 | 24 | def test_find_context_python(self): 25 | lines = [ 26 | "def my_func():", 27 | " x = 1", 28 | " y = 2" 29 | ] 30 | context = DiffGenerator._find_context(2, lines) 31 | assert context == "def my_func():" 32 | 33 | class TestDataAnonymizer: 34 | def setup_method(self): 35 | self.anonymizer = DataAnonymizer() 36 | 37 | def test_anonymize_api_key(self): 38 | text = "api_key = 'sk-1234567890abcdef1234567890abcdef'" 39 | anonymized = self.anonymizer.anonymize_sensitive_data(text) 40 | assert "sk-" not in anonymized 41 | assert " pool.max_idle_time: 88 | del pool._locks_registry["expired/repo"] 89 | mock_repo.clear_cache() 90 | 91 | mock_repo.clear_cache.assert_called() 92 | 93 | class TestGitHubAPIHandler: 94 | def test_search_code_integration(self, mock_github_instance): 95 | handler = GitHubAPIHandler(mock_github_instance) 96 | 97 | # Mock search_code response 98 | mock_code_result = MagicMock() 99 | mock_code_result.name = "test.py" 100 | mock_code_result.path = "test.py" 101 | mock_code_result.repository.full_name = "owner/repo" 102 | mock_code_result.html_url = "http://url" 103 | 104 | mock_github_instance.search_code.return_value = [mock_code_result] 105 | 106 | # Mock RepositoryPool to return a mock repo that returns content 107 | with patch.object(handler, 'get_repository') as mock_get_repo: 108 | mock_repo = MagicMock() 109 | mock_repo.get_file_content.return_value = "content" 110 | mock_get_repo.return_value = mock_repo 111 | 112 | results = handler.search_code("query") 113 | 114 | assert len(results) == 1 115 | assert results[0]['content'] == "content" -------------------------------------------------------------------------------- /llama_github/llm_integration/llm_handler.py: -------------------------------------------------------------------------------- 1 | # llm_handler.py 2 | # to do list 3 | # 1. add streaming output for invoke. 
4 | 5 | from llama_github.llm_integration.initial_load import LLMManager 6 | from langchain_core.prompts import ChatPromptTemplate, ChatMessagePromptTemplate, MessagesPlaceholder 7 | from langchain_core.output_parsers import StrOutputParser 8 | from llama_github.config.config import config 9 | from llama_github.logger import logger 10 | from langchain_core.messages import AIMessage, HumanMessage, SystemMessage 11 | from pydantic import BaseModel 12 | from typing import Optional 13 | from langchain_openai import output_parsers 14 | 15 | class LLMHandler: 16 | def __init__(self, llm_manager: Optional[LLMManager] = None): 17 | """ 18 | Initializes the LLMHandler class which is responsible for handling the interaction 19 | with a language model (LLM) using the LangChain framework. 20 | 21 | Attributes: 22 | llm_manager (LLMManager): Manages interactions with the language model. 23 | """ 24 | if llm_manager is not None: 25 | self.llm_manager = llm_manager 26 | else: 27 | self.llm_manager = LLMManager() 28 | 29 | async def ainvoke(self, human_question: str, chat_history: Optional[list[str]] = None, context: Optional[list[str]] = None, output_structure: Optional[BaseModel] = None, prompt: str = config.get("general_prompt"), simple_llm=False) -> str: 30 | """ 31 | Asynchronously invokes the language model with a given question, chat history, and context, 32 | and returns the model's response. 33 | 34 | Parameters: 35 | human_question (str): The question or input from the human user. 36 | chat_history (list[str]): A list of strings representing the chat history, where each 37 | string is a message. This parameter is optional. 38 | context (list[str]): A list of strings representing additional context for the model. 39 | This parameter is optional. 40 | output_structure: A pydantic.BaseModel object to control desired 41 | structure of the output from the language model. 42 | This parameter is optional and allows for more detailed control over 43 | the model's responses. 44 | prompt (str): A template for the prompt to be used with the language model. Defaults 45 | to a general prompt defined in the configuration. 46 | 47 | Returns: 48 | str: The response from the language model. 49 | """ 50 | try: 51 | if simple_llm and self.llm_manager.get_llm_simple() is not None: 52 | llm = self.llm_manager.get_llm_simple() 53 | else: 54 | llm = self.llm_manager.get_llm() 55 | if self.llm_manager.model_type == "OpenAI": 56 | # Create a prompt template with placeholders for dynamic content. 57 | prompt_template = ChatMessagePromptTemplate.from_template( 58 | role="system", template=prompt) 59 | chat_prompt = ChatPromptTemplate.from_messages([ 60 | prompt_template, 61 | MessagesPlaceholder( 62 | variable_name="history_messages", optional=True), 63 | MessagesPlaceholder(variable_name="human_message"), 64 | MessagesPlaceholder( 65 | variable_name="context_messages", optional=True) 66 | ]) 67 | 68 | # Convert chat_history and context from [str] to their respective message types. 69 | chat_history_messages = self._compose_chat_history_messages( 70 | chat_history) 71 | context_messages = self._compose_context_messages(context) 72 | human_question_message = HumanMessage(content=human_question) 73 | 74 | prompt_params = { 75 | "history_messages": chat_history_messages, 76 | "human_message": [human_question_message], 77 | "context_messages": context_messages 78 | } 79 | 80 | # Format the prompt with the provided parameters. 
81 | formatted_prompt = chat_prompt.format_prompt(**prompt_params) 82 | # Determine the processing chain based on the presence of an output structure. 83 | if output_structure is not None: 84 | chain = llm.with_structured_output(output_structure) 85 | else: 86 | chain = llm 87 | 88 | # Invoke the chain and return the model's response. 89 | try: 90 | response = await chain.ainvoke(formatted_prompt.to_messages()) 91 | except Exception as e: 92 | logger.exception( 93 | f"Call {'simple ' if simple_llm else ''}llm with #{human_question}# generated an exception:{e}") 94 | if output_structure is not None: 95 | response = await chain.ainvoke(formatted_prompt.to_messages()) 96 | return response 97 | except Exception as e: 98 | logger.exception( 99 | f"Call llm with #{human_question}# generated an exception:{e}") 100 | return "An error occurred during processing." 101 | 102 | def _compose_chat_history_messages(self, chat_history: list[str]) -> list: 103 | """ 104 | Converts chat history from a list of strings to a list of alternating HumanMessage 105 | and AIMessage objects, starting with HumanMessage. 106 | 107 | Parameters: 108 | chat_history (list[str]): The chat history as a list of strings. 109 | 110 | Returns: 111 | list: A list of alternating HumanMessage and AIMessage objects. 112 | """ 113 | messages = [] 114 | for i, message in enumerate(chat_history or []): 115 | message_class = HumanMessage if i % 2 == 0 else AIMessage 116 | messages.append(message_class(content=message)) 117 | return messages 118 | 119 | def _compose_context_messages(self, context: list[str]) -> list: 120 | """ 121 | Converts context from a list of strings to a list of SystemMessage objects. 122 | 123 | Parameters: 124 | context (list[str]): The context as a list of strings. 125 | 126 | Returns: 127 | list: A list of SystemMessage objects. 128 | """ 129 | return [SystemMessage(content=message) for message in context or []] 130 | -------------------------------------------------------------------------------- /llama_github/llm_integration/initial_load.py: -------------------------------------------------------------------------------- 1 | # initial_load.py 2 | from typing import Optional, Any 3 | from threading import Lock 4 | 5 | from llama_github.config.config import config 6 | from llama_github.logger import logger 7 | 8 | class LLMManager: 9 | """ 10 | Singleton class for managing Language Models and related components. 11 | This class handles initialization and access to various models including LLMs, 12 | embedding models, and reranking models. 13 | """ 14 | _instance_lock = Lock() 15 | _instance = None 16 | llm = None 17 | rerank_model = None 18 | _initialized = False 19 | llm_simple = None 20 | tokenizer = None 21 | embedding_model = None 22 | 23 | def __new__(cls, *args, **kwargs): 24 | """ 25 | Ensure only one instance of LLMManager is created (Singleton pattern). 
26 | """ 27 | if cls._instance is None: # First check (unlocked) 28 | with cls._instance_lock: # Acquire lock 29 | if cls._instance is None: # Second check (locked) 30 | cls._instance = super(LLMManager, cls).__new__(cls) 31 | return cls._instance 32 | 33 | def __init__(self, 34 | openai_api_key: Optional[str] = None, 35 | mistral_api_key: Optional[str] = None, 36 | huggingface_token: Optional[str] = None, 37 | open_source_models_hg_dir: Optional[str] = None, 38 | embedding_model: Optional[str] = config.get( 39 | "default_embedding"), 40 | rerank_model: Optional[str] = config.get("default_reranker"), 41 | llm: Any = None, 42 | simple_mode: bool = False): 43 | """ 44 | Initialize the LLMManager with specified models and API keys. 45 | 46 | Args: 47 | openai_api_key (Optional[str]): API key for OpenAI. 48 | mistral_api_key (Optional[str]): API key for Mistral AI. 49 | huggingface_token (Optional[str]): Token for Hugging Face. 50 | open_source_models_hg_dir (Optional[str]): Directory for open-source models. 51 | embedding_model (Optional[str]): Name or path of the embedding model. 52 | rerank_model (Optional[str]): Name or path of the reranking model. 53 | llm (Any): Custom LLM instance if provided. 54 | simple_mode (bool): If True, skip initialization of embedding and reranking models. 55 | """ 56 | with self._instance_lock: # Prevent re-initialization 57 | if self._initialized: 58 | return 59 | self._initialized = True 60 | 61 | self.simple_mode = simple_mode 62 | 63 | # Initialize LLM based on provided API keys or custom LLM 64 | if llm is not None: 65 | self.llm = llm 66 | self.model_type = "Custom_langchain_llm" 67 | elif mistral_api_key is not None and mistral_api_key != "" and self.llm is None: 68 | logger.info("Initializing Codestral API...") 69 | from langchain_mistralai.chat_models import ChatMistralAI 70 | self.llm = ChatMistralAI(mistral_api_key=mistral_api_key, model="mistral-medium-latest", temperature=0.3) 71 | self.llm_simple = ChatMistralAI( 72 | mistral_api_key=mistral_api_key, 73 | model="devstral-small-latest", 74 | temperature=0.2 75 | ) 76 | self.model_type = "OpenAI" 77 | elif openai_api_key is not None and openai_api_key != "" and self.llm is None: 78 | from langchain_openai import ChatOpenAI 79 | logger.info("Initializing OpenAI API...") 80 | self.llm = ChatOpenAI(api_key=openai_api_key, model="gpt-4-turbo") 81 | self.llm_simple = ChatOpenAI( 82 | api_key=openai_api_key, model="gpt-4o-mini") 83 | self.model_type = "OpenAI" 84 | # Initialize for Open Source Models 85 | elif open_source_models_hg_dir is not None and open_source_models_hg_dir != "" and self.llm is None: 86 | logger.info(f"Initializing {open_source_models_hg_dir}...") 87 | # load huggingface models 88 | self.model_type = "Hubgingface" 89 | elif self.llm is None: 90 | # default model is phi3_mini_128k 91 | self.model_type = "Hubgingface" 92 | 93 | if not self.simple_mode: 94 | import sys 95 | import platform 96 | import subprocess 97 | 98 | def get_device(): 99 | if sys.platform.startswith('darwin'): # macOS 100 | # Check for Apple Silicon (M1/M2) 101 | if platform.machine() == 'arm64': 102 | return 'mps' 103 | elif sys.platform.startswith('linux') or sys.platform.startswith('win'): 104 | # Check for NVIDIA GPU 105 | try: 106 | subprocess.run(['nvidia-smi'], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) 107 | return 'cuda' 108 | except (subprocess.CalledProcessError, FileNotFoundError): 109 | pass 110 | 111 | # Default to CPU 112 | return 'cpu' 113 | 114 | # Usage 115 | self.device = 
get_device() 116 | 117 | from transformers import AutoModel 118 | from transformers import AutoModelForSequenceClassification 119 | from transformers import AutoTokenizer 120 | # Initialize embedding model 121 | if self.tokenizer is None: 122 | logger.info(f"Initializing {embedding_model}...") 123 | self.tokenizer = AutoTokenizer.from_pretrained(embedding_model) 124 | self.embedding_model = AutoModel.from_pretrained( 125 | embedding_model, trust_remote_code=True).to(self.device) 126 | 127 | # Initialize reranking model 128 | if self.rerank_model is None: 129 | logger.info(f"Initializing {rerank_model}...") 130 | self.rerank_model = AutoModelForSequenceClassification.from_pretrained( 131 | rerank_model, num_labels=1, trust_remote_code=True 132 | ).to(self.device) 133 | else: 134 | logger.info("Simple mode enabled. Skipping embedding and rerank model initialization.") 135 | 136 | def get_llm(self): 137 | """ 138 | Get the main Language Model. 139 | 140 | Returns: 141 | The initialized Language Model. 142 | """ 143 | return self.llm 144 | 145 | def get_llm_simple(self): 146 | """ 147 | Get the simplified Language Model. 148 | 149 | Returns: 150 | The initialized simplified Language Model. 151 | """ 152 | return self.llm_simple 153 | 154 | def get_tokenizer(self): 155 | """ 156 | Get the tokenizer for the embedding model. 157 | 158 | Returns: 159 | The initialized tokenizer. 160 | """ 161 | return self.tokenizer 162 | 163 | def get_rerank_model(self): 164 | """ 165 | Get the reranking model. 166 | 167 | Returns: 168 | The initialized reranking model. 169 | """ 170 | return self.rerank_model 171 | 172 | def get_embedding_model(self): 173 | """ 174 | Get the embedding model. 175 | 176 | Returns: 177 | The initialized embedding model. 178 | """ 179 | return self.embedding_model 180 | -------------------------------------------------------------------------------- /docs/usage.md: -------------------------------------------------------------------------------- 1 | # Usage 2 | 3 | This document provides a comprehensive guide on how to use the `llama-github` library effectively. It covers various aspects of the library, including initialization, context retrieval, and advanced usage. 4 | 5 | ## Initialization 6 | 7 | To start using `llama-github`, you need to initialize the `GithubRAG` class with the necessary credentials. Here's an example of how to initialize `GithubRAG`: 8 | 9 | ```python 10 | from llama_github import GithubRAG 11 | 12 | # Initialize GithubRAG with your credentials 13 | github_rag = GithubRAG( 14 | github_access_token="your_github_access_token", 15 | openai_api_key="your_openai_api_key", # Optional in Simple Mode 16 | jina_api_key="your_jina_api_key" # Optional - unless you want high concurrency production deployment (s.jina.ai API will be used in llama-github) 17 | ) 18 | ``` 19 | 20 | Make sure to replace `"your_github_access_token"`, `"your_openai_api_key"`, and `"your_jina_api_key"` with your actual credentials. 21 | 22 | ## Context Retrieval 23 | 24 | The primary functionality of `llama-github` is to retrieve relevant context based on a given query. You can use the `retrieve_context` method to achieve this: 25 | 26 | ```python 27 | query = "How to create a NumPy array in Python?" 28 | context = github_rag.retrieve_context(query) 29 | print(context) 30 | ``` 31 | 32 | The `retrieve_context` method takes a query string as input and returns a list of relevant context strings retrieved from GitHub. 
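
Each item in the returned list is a context string (for example, a code snippet or issue excerpt). A common next step is to fold these snippets into a prompt for whatever LLM or agent framework you use downstream. The following is a minimal sketch in plain Python; it assumes `query` and `context` are the variables from the example above:

```python
# Minimal sketch: stitch the retrieved snippets into a single prompt string.
prompt_sections = [
    f"[Context {i + 1}]\n{snippet}" for i, snippet in enumerate(context)
]
prompt = "\n\n".join(prompt_sections) + f"\n\nQuestion: {query}"

# Hand `prompt` to the LLM client or agent framework of your choice.
print(prompt[:500])  # preview the assembled prompt
```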
33 | 34 | ### Simple Mode 35 | 36 | By default, `retrieve_context` operates in professional mode, which performs a comprehensive search across code, issues, and repositories on GitHub. However, you can enable simple mode by setting the `simple_mode` parameter to `True`: 37 | 38 | ```python 39 | context = github_rag.retrieve_context(query, simple_mode=True) 40 | ``` 41 | 42 | In simple mode, only a Google search is conducted based on the user's question. This mode is suitable for shorter queries (less than 20 words). 43 | 44 | ## Advanced Usage 45 | 46 | ### Asynchronous Processing 47 | 48 | `llama-github` is built to leverage asynchronous programming for efficient processing. You can use the `async_retrieve_context` method to retrieve context asynchronously: 49 | 50 | ```python 51 | import asyncio 52 | 53 | async def retrieve_context_async(): 54 | context = await github_rag.async_retrieve_context(query) 55 | print(context) 56 | 57 | asyncio.run(retrieve_context_async()) 58 | ``` 59 | 60 | This allows you to handle multiple requests concurrently and boost overall performance. 61 | 62 | ### Customizing LLM Integration 63 | 64 | `llama-github` provides flexibility in integrating with different LLM providers, embedding models, and reranking models. You can customize these integrations during initialization: 65 | 66 | ```python 67 | github_rag = GithubRAG( 68 | github_access_token="your_github_access_token", 69 | openai_api_key="your_openai_api_key", 70 | huggingface_token="your_huggingface_token", 71 | open_source_models_hg_dir="path/to/open_source_models", 72 | embedding_model="custom_embedding_model", 73 | rerank_model="custom_rerank_model", 74 | llm=custom_llm_object 75 | ) 76 | ``` 77 | 78 | - `openai_api_key`: API key for OpenAI services (recommended for using GPT-4-turbo). 79 | - `huggingface_token`: Token for Hugging Face services (recommended). 80 | - `open_source_models_hg_dir`: Path to open-source models from Hugging Face to replace OpenAI. 81 | - `embedding_model`: Name of the custom embedding model from Hugging Face. 82 | - `rerank_model`: Name of the custom reranking model from Hugging Face. 83 | - `llm`: Custom LangChain LLM chat object to replace OpenAI or open-source models from Hugging Face. 84 | 85 | ### Authentication Options 86 | 87 | `llama-github` supports both personal access tokens and GitHub App authentication. You can provide the necessary credentials during initialization: 88 | 89 | ```python 90 | # Personal access token authentication 91 | github_rag = GithubRAG(github_access_token="your_github_access_token") 92 | 93 | # GitHub App authentication 94 | github_app_credentials = GitHubAppCredentials( 95 | app_id=your_app_id, 96 | private_key="your_private_key", 97 | installation_id=your_installation_id 98 | ) 99 | github_rag = GithubRAG(github_app_credentials=github_app_credentials) 100 | ``` 101 | 102 | Make sure to replace the placeholders with your actual credentials. 103 | 104 | 105 | 106 | 107 | 108 | ## Logging 109 | 110 | `llama-github` follows the best practices for logging in Python libraries by seamlessly integrating with the developer's main application logger. This approach ensures that the library's logging behavior aligns with the overall logging strategy of the application, providing a consistent and unified logging experience. 
111 | 112 | By default, `llama-github` does not configure its own logging settings to avoid interfering with the application's existing logging configuration. Instead, it respects the log levels and handlers set up by the developer in their main application. 113 | 114 | To enable logging in `llama-github`, you simply need to configure the logging in your main application using Python's built-in `logging` module. For example: 115 | 116 | ```python 117 | import logging 118 | 119 | # Configure the main application's logger 120 | logging.basicConfig(level=logging.INFO) 121 | 122 | # Your application code goes here 123 | ``` 124 | 125 | In this example, the main application's logger is configured with a log level of `logging.INFO`. `llama-github` will automatically inherit this log level and emit log messages accordingly. 126 | 127 | If you wish to have more control over the logging behavior specific to `llama-github`, you can use the `configure_logging` function provided by the library: 128 | 129 | ```python 130 | from llama_github import configure_logging 131 | 132 | # Configure llama-github's logger 133 | configure_logging(level=logging.DEBUG) 134 | ``` 135 | 136 | By leveraging the flexibility and configurability of Python's `logging` module, `llama-github` provides developers with the tools necessary to gain valuable insights into the library's behavior and quickly identify and resolve any issues that may arise. 137 | 138 | ## Repository Pool Caching 139 | 140 | `llama-github` utilizes an innovative repository pool caching mechanism to optimize performance and minimize GitHub API token consumption. The caching mechanism is automatically enabled and requires no additional configuration. 141 | 142 | The repository pool caching works as follows: 143 | - When a repository is accessed for the first time, it is fetched from the GitHub API and stored in the cache. 144 | - Subsequent requests for the same repository retrieve the cached version, eliminating the need for additional API calls. 145 | - The cache is thread-safe, allowing concurrent access from multiple threads without data inconsistencies. 146 | - Cached repositories are periodically cleaned up based on their last access time to prevent the cache from growing indefinitely. 147 | 148 | You can customize the caching behavior by providing additional parameters during initialization: 149 | 150 | ```python 151 | github_rag = GithubRAG( 152 | github_access_token="your_github_access_token", 153 | repo_cleanup_interval=3600, # Cache cleanup interval in seconds (default: 3600) 154 | repo_max_idle_time=7200 # Maximum idle time for a cached repository in seconds (default: 7200) 155 | ) 156 | ``` 157 | 158 | - `repo_cleanup_interval`: Specifies how often the cache cleanup process runs (default: 3600 seconds, i.e., 1 hour). 159 | - `repo_max_idle_time`: Determines the maximum idle time for a cached repository before it is considered for removal (default: 7200 seconds, i.e., 2 hours). 160 | 161 | The repository pool caching mechanism significantly improves performance by reducing the number of API calls made to GitHub, especially in scenarios where the same repositories are accessed frequently. 162 | 163 | ## Conclusion 164 | 165 | `llama-github` provides a powerful and flexible solution for retrieving relevant context from GitHub based on user queries. 
By leveraging advanced retrieval techniques, LLM-powered question analysis, comprehensive context generation, and asynchronous processing, `llama-github` empowers developers to find the information they need quickly and efficiently. 166 | 167 | With its support for different authentication methods, customizable LLM integrations, and robust logging capabilities, `llama-github` can be easily integrated into various development environments and tailored to specific requirements. 168 | 169 | By following the usage guidelines outlined in this document and exploring the advanced features provided by `llama-github`, you can unlock the full potential of the library and enhance your development workflow. 170 | 171 | For more information and examples, please refer to the [README](../README.md) and the [API documentation](api_reference.md). -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 |
3 |
4 | 🌐 Language 5 |
6 |
7 | English 8 | | 简体中文 9 | | 繁體中文 10 | | 日本語 11 | | 한국어 12 | | हिन्दी 13 | | ไทย 14 | | Français 15 | | Deutsch 16 | | Español 17 | | Italiano 18 | | Русский 19 | | Português 20 | | Nederlands 21 | | Polski 22 | | العربية 23 | | فارسی 24 | | Türkçe 25 | | Tiếng Việt 26 | | Bahasa Indonesia 27 | | অসমীয়া 29 |
30 |
31 |
32 | 33 | # llama-github 34 | 35 | [Detail Document] https://deepwiki.com/JetXu-LLM/llama-github 36 | 37 | [![PyPI version](https://badge.fury.io/py/llama-github.svg)](https://badge.fury.io/py/llama-github) 38 | [![Downloads](https://static.pepy.tech/badge/Llama-github)](https://pepy.tech/project/Llama-github) 39 | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) 40 | 41 | Llama-github is a powerful tool that helps you retrieve(based on Agentic RAG) the most relevant code snippets, issues, and repository information from GitHub based on your queries, transforming them into valuable knowledge context. It empowers LLM Chatbots, AI Agents, and Auto-dev Agents to solve complex coding tasks. Whether you're a developer looking for quick solutions or an engineer implementing advanced Auto Dev AI Agents, llama-github makes it easy and efficient. 42 | 43 | If you like this project or believe it has potential, please give it a ⭐️. Your support is our greatest motivation! 44 | 45 | ## Architecture 46 | ![High Level Architecture](./docs/high_level_architecture.drawio.svg) 47 | 48 | ## Installation 49 | ``` 50 | pip install llama-github 51 | ``` 52 | 53 | ## Usage 54 | 55 | Here's a simple example of how to use llama-github: 56 | 57 | ```python 58 | from llama_github import GithubRAG 59 | 60 | # Initialize GithubRAG with your credentials 61 | github_rag = GithubRAG( 62 | github_access_token="your_github_access_token", 63 | openai_api_key="your_openai_api_key", # Optional in Simple Mode 64 | jina_api_key="your_jina_api_key" # Optional - unless you want high concurrency production deployment (s.jina.ai API will be used in llama-github) 65 | ) 66 | 67 | # Retrieve context for a coding question (simple_mode is default set to False) 68 | query = "How to create a NumPy array in Python?" 69 | context = github_rag.retrieve_context( 70 | query, # In professional mode, one query will take nearly 1 min to generate final contexts. You could set log level to INFO to monitor the retrieval progress 71 | # simple_mode = True 72 | ) 73 | 74 | print(context) 75 | ``` 76 | 77 | For more advanced usage and examples, please refer to the [documentation](docs/usage.md). 78 | 79 | ## Key Features 80 | 81 | - **🔍 Intelligent GitHub Retrieval**: Harness the power of llama-github to retrieve highly relevant code snippets, issues, and repository information from GitHub based on user queries. Our advanced retrieval techniques ensure you find the most pertinent information quickly and efficiently. 82 | 83 | - **⚡ Repository Pool Caching**: Llama-github has an innovative repository pool caching mechanism. By caching repositories (including READMEs, structures, code, and issues) across threads, llama-github significantly accelerates GitHub search retrieval efficiency and minimizes the consumption of GitHub API tokens. Deploy llama-github in multi-threaded production environments with confidence, knowing that it will perform optimally and save you valuable resources. 84 | 85 | - **🧠 LLM-Powered Question Analysis**: Leverage state-of-the-art language models to analyze user questions and generate highly effective search strategies and criteria. Llama-github intelligently breaks down complex queries, ensuring that you retrieve the most relevant information from GitHub's vast repository network. 
86 | 87 | - **📚 Comprehensive Context Generation**: Generate rich, contextually relevant answers by seamlessly combining information retrieved from GitHub with the reasoning capabilities of advanced language models. Llama-github excels at handling even the most complex and lengthy questions, providing comprehensive and insightful responses that include extensive context to support your development needs. 88 | 89 | - **🚀 Asynchronous Processing Excellence**: Llama-github is built from the ground up to leverage the full potential of asynchronous programming. With meticulously implemented asynchronous mechanisms woven throughout the codebase, llama-github can handle multiple requests concurrently, significantly boosting overall performance. Experience the difference as llama-github efficiently manages high-volume workloads without compromising on speed or quality. 90 | 91 | - **🔧 Flexible LLM Integration**: Easily integrate llama-github with various LLM providers, embedding models, and reranking models to tailor the library's capabilities to your specific requirements. Our extensible architecture allows you to customize and enhance llama-github's functionality, ensuring that it adapts seamlessly to your unique development environment. 92 | 93 | - **🔒 Robust Authentication Options**: Llama-github supports both personal access tokens and GitHub App authentication, providing you with the flexibility to integrate it into different development setups. Whether you're an individual developer or working within an organizational context, llama-github has you covered with secure and reliable authentication mechanisms. 94 | 95 | - **🛠️ Logging and Error Handling**: We understand the importance of smooth operations and easy troubleshooting. That's why llama-github comes equipped with comprehensive logging and error handling mechanisms. Gain deep insights into the library's behavior, quickly diagnose issues, and maintain a stable and reliable development workflow. 96 | 97 | ## 🤖 Try Our AI-Powered PR Review Assistant: LlamaPReview 98 | 99 | If you find llama-github useful, you might also be interested in our AI-powered GitHub PR review assistant, LlamaPReview. It's designed to complement your development workflow and further enhance code quality. 100 | 101 | ### Key Features of LlamaPReview: 102 | - 🚀 One-click installation, zero configuration required, fully auto-run 103 | - 💯 Currently free to use - no credit card or payment info needed 104 | - 🧠 AI-powered, automatic PR reviews with deep code understanding 105 | - 🌐 Supports multiple programming languages 106 | 107 | **LlamaPReview utilizes llama-github's advanced context retrieval and LLM-powered analysis** to provide intelligent, context-aware code reviews. It's like having a senior developer, armed with the full context of your repository, review every PR automatically! 108 | 109 | 👉 [Install LlamaPReview Now](https://github.com/marketplace/llamapreview/) (Free) 110 | 111 | By using llama-github for context retrieval and LlamaPReview for code reviews, you can create a powerful, AI-enhanced development environment. 112 | 113 | ## Vision and Roadmap 114 | 115 | ### Vision 116 | 117 | Our vision is to become a pivotal module in the future of AI-driven development solutions, seamlessly integrating with GitHub to empower LLMs in automatically resolving complex coding tasks. 
118 | 119 | ![Vision Architecture](./docs/vision.drawio.svg) 120 | 121 | ### Roadmap 122 | 123 | For a detailed view of our project roadmap, please visit our [Project Roadmap](https://github.com/users/JetXu-LLM/projects/2). 124 | 125 | ## Acknowledgments 126 | 127 | We would like to express our gratitude to the following open-source projects for their support and contributions: 128 | 129 | - **[LangChain](https://github.com/langchain-ai/langchain)**: For providing the foundational framework that empowers the LLM prompting and processing capabilities in llama-github. 130 | - **[Jina.ai](https://github.com/jina-ai/reader)**: For offering s.jina.ai API and open source reranker and embedding models that enhance the accuracy and relevance of the generated contexts in llama-github. 131 | 132 | Their contributions have been instrumental in the development of llama-github, and we highly recommend checking out their projects for more innovative solutions. 133 | 134 | ## Contributing 135 | 136 | We welcome contributions to llama-github! Please see our [contributing guidelines](CONTRIBUTING.md) for more information. 137 | 138 | ## License 139 | 140 | This project is licensed under the terms of the Apache 2.0 license. See the [LICENSE](LICENSE) file for more details. 141 | 142 | ## Contact 143 | 144 | If you have any questions, suggestions, or feedback, please feel free to reach out to us at [Jet Xu's email](mailto:Voldemort.xu@foxmail.com). 145 | 146 | --- 147 | 148 | Thank you for choosing llama-github! We hope this library enhances your AI development experience and helps you build powerful applications with ease. 149 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [2024] [Jet Xu] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /llama_github/config/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "general_prompt": "You are a highly intelligent assistant with expertise in GitHub repositories and coding practices. Your primary task is to provide comprehensive and accurate answers to questions related to GitHub projects, coding issues, or programming concepts. When analyzing queries, focus on delivering a complete response that directly addresses the original question. 
While you may be provided with additional context, use this information judiciously to enhance your answer without deviating from the main point. Your extensive knowledge base, combined with your ability to understand complex coding queries and retrieve pertinent information, should be the foundation of your responses. When referencing provided context, integrate it seamlessly into your answer without explicitly evaluating or critiquing it. Your goal is to guide developers towards solutions, explain concepts clearly, or provide the information they seek about GitHub projects and software development, always ensuring that your final response is a cohesive and complete answer to the original question.", 3 | "always_answer_prompt": "**Instructions:**\nAs an advanced AI assistant with deep expertise in GitHub repositories, coding practices, and programming concepts, your primary goal is to provide concise, accurate, and contextually relevant answers to complex coding queries. When presented with a question, your first step is to analyze the query and generate a succinct abstraction that captures its core essence by using only one sentence, especially if the original question is lengthy or convoluted.\n\nNext, leverage your extensive knowledge base and reasoning capabilities to craft a coherent and informative response. If possible, enhance your answer with sample code snippets that demonstrate the practical application of the concepts discussed. Remember, your responses should guide developers towards solving their problems, understanding new concepts, or finding the information they seek related to GitHub projects and software development. Please keep your responses concise and to the point, focusing on the most essential information needed to address the query. Avoid generating long articles or overly detailed explanations.\n\nIn addition to the answer itself, provide a brief analysis of how you would approach searching for relevant code and issues within GitHub repositories. This analysis should outline your thought process and the key factors you would consider when conducting these searches. However, keep this analysis concise and focused on the high-level logic rather than delving into specific search criteria or keywords.\n\nThroughout your responses, prioritize clarity and brevity. Focus on delivering the most essential information needed to address the query effectively. Even if certain details are unknown, ensure that your answers are plausible, useful, and serve as a foundation for further exploration and context generation.\n\nRemember, your ultimate aim is to empower developers with the knowledge and guidance they need to overcome challenges, expand their understanding, and navigate the vast landscape of GitHub repositories and software development practices.", 4 | "code_search_criteria_prompt": "**Instructions:**\n- **Expertise-Driven Github Code Search Criteria Generation:** Generate GitHub code search criteria strings based on the provided question and its draft answer. Analyze both the question and answer to identify key concepts, technologies, and coding practices that can help locate relevant code snippets on GitHub. 
Always include the `language:` qualifier to focus your search on language-related content.\n\n**Output Format:** Present each search criteria string on a new line, formatted for immediate use in GitHub's code search, without additional explanations or commentary.\n\n**Optimization Considerations:**\n- **Keyword Relevance:** Extract keywords and phrases tightly related to the question from the question and answer that are likely to appear in relevant code and code comments. Prioritize terms that reflect specific coding concepts, libraries, or techniques. Avoid generic terms like \"example\" or \"integration\" that may not be present in actual code.\n- **Contextual Understanding:** Use the provided answer as additional context to inform your keyword selection. Identify key insights, technologies, or approaches mentioned in the answer tightly related to the question that can help refine the search criteria.\n- **Language and Platform Specificity:** If the question is specific to a certain programming language or platform, ensure to include relevant language or platform-specific keywords, libraries, or frameworks in the search criteria. This helps filter out irrelevant results from other languages or platforms.\n- **Simplicity and Effectiveness:** Craft search criteria with simple and limited keywords which could lead to precise search results to relevant code snippets tightly related to original question. Strike a balance between specificity and breadth to ensure the criteria capture the essential aspects of the question and answer. The search criteria should be neither too narrow that no results are returned, nor too broad that many irrelevant results are included.\n- **Multiple Perspectives:** Generate multiple search criteria strings that approach the question from different angles or emphasize different aspects mentioned in the question and answer. This increases the chances of finding relevant code snippets.", 5 | "issue_search_criteria_prompt": "**Instructions:**\n- **Question-Driven GitHub Issue Search Criteria Generation:** Generate GitHub issue search criteria strings based on the provided question. Analyze the question to identify key concepts, technologies, and problem-solving approaches that can help locate relevant issues on GitHub. Consider using relevant `label:` or `is:` qualifiers when applicable.\n\n**Output Format:** Present each search criteria string on a new line, formatted for immediate use in GitHub's issue search, without additional explanations or commentary.\n\n**Optimization Considerations:**\n- **Keyword Relevance:** Extract keywords and phrases tightly related to the question that are likely to appear in issue titles, descriptions, and discussions. Prioritize terms that reflect specific problems, error messages, or technologies. Avoid generic terms like \"help\" or \"problem\" that may not effectively narrow down the search results.\n- **Contextual Understanding:** Use the question's draft answer to inform your keyword selection. Identify key aspects, technologies, or potential troubleshooting areas tightly related to the question but not only specific aspects of answers that can help refine the search criteria.\n- **Simplicity and Effectiveness:** Craft search criteria with simple and limited keywords which could lead to precise search results relevant to the original question. 
Strike a balance between specificity and breadth to ensure the criteria capture the essential aspects of the question without being overly restrictive.\n- **Multiple Perspectives:** Generate multiple search criteria strings that approach the question from different angles or emphasize different aspects mentioned in the question. This increases the chances of finding relevant issues that discuss similar problems or solutions.\n- **Leveraging Labels:** When appropriate, include relevant `label:` qualifiers in the search criteria to narrow down the results to issues with specific labels, such as \"bug,\" \"enhancement,\" or \"documentation.\" This can help focus the search on issues that align with the nature of the question.\n- **Considering Issue Discussions:** Keep in mind that issue discussions often contain valuable information, experiences, and workarounds shared by other developers. Craft search criteria that not only match the issue title and description but also consider the likelihood of the keywords appearing in the issue's comments and discussions.", 6 | "repo_search_criteria_prompt": "**Instructions:**\n- **Expertise-Driven Github Repository Search Criteria Generation:** Generate GitHub repo search criteria strings based on the provided question. Analyze the question leverage your expertise for related key concepts, technologies, and problem-solving approaches that can help locate relevant repositories on GitHub. Focus on practical keywords and phrases likely to be present in repository names, descriptions, and topics. Use the `language:` qualifier to direct your search toward repositories written in a specific language, keeping the criteria simple and effective.\n- **Necessity Score Determination:** Evaluate the necessity of conducting a GitHub repository search based on the difficulty of question. Determine if repository-level information is essential to comprehensively address the question. Assign a necessity score indicating the importance of performing a repository search.\n\n**Output Format:**\n- **Necessity Score:** Begin your output with a necessity score (0-100) indicating the importance of performing a separate GitHub repository search. Use the following scale:\n - 0-59: Low necessity - Only code and issue search results is sufficient.\n - 60-79: Medium necessity - One repository search may offer additional insights and context.\n - 80-100: High necessity - Two repository searches are crucial to gather comprehensive information, such as project structure, documentation, or community engagement, to thoroughly address the question.\n\n- **Search Criteria:** Present each search criteria string on a new line, formatted for immediate use in GitHub's repository search, without additional explanations or commentary.\n**Optimization Considerations:**\n- **Keyword Relevance:** Generate search criteria keywords and phrases from the question that are uniquely relevant to repository names, descriptions, and topics. Prioritize terms that reflect the broader context, expertise, and strategic thinking required to address the question effectively. Avoid generic terms that may lead to irrelevant search results.\n- **Simplicity and Effectiveness:** Craft search criteria that are simple yet effective in narrowing down the repository search results to the most relevant and informative ones. Strike a balance between specificity and breadth, ensuring that the criteria capture the essential aspects of the question without being overly restrictive. 
Aim for criteria that yield a manageable number of high-quality repository results.\n- **Language and Platform Specificity:** If the question pertains to a specific programming language or platform, incorporate relevant language or platform-specific keywords in the search criteria. Use the `language:` qualifier to filter repositories based on the language of interest. This helps focus the search on repositories that are more likely to contain relevant code, documentation, and community expertise.\n- **Multiple Criteria Flexibility:** Generate multiple search criteria strings that approach the question from different angles or emphasize different aspects mentioned in the question. This flexibility allows for a more comprehensive repository search, increasing the chances of discovering relevant repositories that may offer valuable insights, code samples, or best practices related to the question at hand.", 7 | "scoring_context_prompt": "You are an expert in evaluating the relevance of coding-related contexts to given questions. Your primary function is to analyze the provided context and question, and output a single integer score between 0 and 100, indicating how well the context supports answering the question.\n\nScoring criteria:\n0-20: The context is completely irrelevant to the question and provides no useful information to answer it.\n21-40: The context is slightly relevant to the question but lacks crucial information to provide a complete answer.\n41-60: The context is somewhat relevant to the question and provides some useful information, but it may not be sufficient to fully answer the question.\n61-80: The context is highly relevant to the question and provides most of the necessary information to answer it, but some minor details may be missing.\n81-100: The context is extremely relevant to the question and provides all the necessary information to comprehensively answer it.\n\nRemember, your output should consist of only a single integer score without any additional text or explanation. Analyze the context and question carefully, and provide a score that accurately reflects the relevance of the context in answering the question.", 8 | "default_embedding": "jinaai/jina-embeddings-v2-base-code", 9 | "default_reranker": "jinaai/jina-reranker-v2-base-multilingual", 10 | "min_stars_to_keep_result": 20, 11 | "max_workers": 8, 12 | "code_search_max_hits": 30, 13 | "issue_search_max_hits": 30, 14 | "repo_search_max_hits": 10, 15 | "chunk_size": 2000, 16 | "issue_chunk_size": 7000, 17 | "repo_chunk_size": 7000, 18 | "google_chunk_size": 7000, 19 | "top_n_contexts": 4 20 | } -------------------------------------------------------------------------------- /llama_github/github_integration/github_auth_manager.py: -------------------------------------------------------------------------------- 1 | # To do list: 2 | # 1. add the mechanism for installation_access_token and github_instance refreshment in authenticate_with_app model 3 | # 2. add re-try mechanism for the API calls 4 | # 3. add the mechanism for the rate limit handling 5 | # 4. add the mechanism for the error handling 6 | # 5. add the mechanism for the logging 7 | # 6. add search issues functionality 8 | # 7. 
add search discussions functionality through Github GraphQL API 9 | 10 | from github import Github, GithubIntegration 11 | import requests 12 | from requests.adapters import HTTPAdapter 13 | from requests.exceptions import HTTPError, RequestException 14 | from urllib3.util.retry import Retry 15 | from llama_github.logger import logger 16 | 17 | 18 | class GitHubAuthManager: 19 | def __init__(self): 20 | self.github_instance = None 21 | self.access_token = None 22 | self.app_id = None 23 | self.private_key = None 24 | self.installation_id = None 25 | 26 | def authenticate_with_token(self, access_token): 27 | """ 28 | Authenticate using a personal access token or an OAuth token. 29 | Suitable for individual developers and applications using OAuth for authorization. 30 | """ 31 | self.access_token = access_token 32 | self.github_instance = ExtendedGithub(login_or_token=access_token) 33 | return self.github_instance 34 | 35 | def authenticate_with_app(self, app_id, private_key, installation_id): 36 | """ 37 | Authenticate using a GitHub App. 38 | Suitable for integrations in organizational or enterprise environments. 39 | """ 40 | self.app_id = app_id 41 | self.private_key = private_key 42 | self.installation_id = installation_id 43 | integration = GithubIntegration(app_id, private_key) 44 | installation_access_token = integration.get_access_token( 45 | installation_id).token 46 | self.access_token = installation_access_token 47 | self.github_instance = ExtendedGithub( 48 | login_or_token=installation_access_token) 49 | return self.github_instance 50 | 51 | def close_connection(self): 52 | """ 53 | Close the connection to GitHub to free up resources. 54 | """ 55 | if self.github_instance: 56 | self.github_instance = None 57 | 58 | # Extended Github Class for powerful API calls - e.g. recursive call to get repo structure 59 | 60 | 61 | class ExtendedGithub(Github): 62 | def __init__(self, login_or_token): 63 | self.access_token = login_or_token 64 | super().__init__(login_or_token=login_or_token) 65 | 66 | def get_repo_structure(self, repo_full_name, branch='main') -> dict: 67 | """ 68 | Get the structure of a repository (files and directories) recursively. 69 | """ 70 | owner, repo_name = repo_full_name.split('/') 71 | headers = {'Authorization': f'token {self.access_token}'} 72 | 73 | # Function to convert the flat list to a hierarchical structure 74 | def list_to_tree(items): 75 | """ 76 | Convert the flat list to a hierarchical structure with full paths. 77 | Include size metadata for files and remove 'type' attributes. 
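            Illustrative example (not taken from a real API response): a tree item
            {'path': 'src/app.py', 'type': 'blob', 'size': 120} ends up at
            tree['src']['children']['app.py'] == {'path': 'src/app.py', 'size': 120}.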
78 | """ 79 | tree = {} 80 | for item in items: 81 | path_parts = item['path'].split('/') 82 | current_level = tree 83 | for part in path_parts[:-1]: 84 | # Ensure 'children' dictionary exists for directories without explicitly adding 'type' 85 | current_level = current_level.setdefault( 86 | part, {'children': {}}) 87 | # Ensure we don't inadvertently create a 'type' key for directories 88 | current_level = current_level.get('children') 89 | 90 | # For the last part of the path, decide if it's a file or directory and add appropriate information 91 | if item['type'] == 'blob': # It's a file 92 | current_level[path_parts[-1]] = { 93 | 'path': item['path'], # Include full path 94 | # Include size if available 95 | 'size': item.get('size', 0) 96 | } 97 | else: # It's a directory 98 | # Initialize the directory if not already present, without adding 'type' 99 | if path_parts[-1] not in current_level: 100 | current_level[path_parts[-1]] = {'children': {}} 101 | return tree 102 | 103 | # Directly use the Trees API to get the full directory structure of the "main" branch 104 | tree_url = f'https://api.github.com/repos/{owner}/{repo_name}/git/trees/{branch}?recursive=1' 105 | tree_response = requests.get(tree_url, headers=headers) 106 | 107 | # Check if the request was successful 108 | if tree_response.status_code == 200: 109 | tree_data = tree_response.json() 110 | # Convert the flat list of items to a hierarchical tree structure 111 | repo_structure = list_to_tree(tree_data['tree']) 112 | return repo_structure 113 | else: 114 | print( 115 | f"Error fetching tree structure: {tree_response.status_code}") 116 | print("Details:", tree_response.json()) 117 | 118 | def search_code(self, query: str, per_page: int = 30) -> dict: 119 | """ 120 | Search for code on GitHub using the GitHub API. 121 | 122 | Parameters: 123 | query (str): The search query. 124 | per_page (int): The number of results per page. 125 | 126 | Returns: 127 | dict: The search result in dict format. 128 | """ 129 | url = 'https://api.github.com/search/code' 130 | headers = { 131 | 'Accept': 'application/vnd.github.v3+json', 132 | 'Authorization': f'token {self.access_token}' 133 | } 134 | params = { 135 | 'q': query, 136 | 'per_page': per_page 137 | } 138 | 139 | # Retry strategy 140 | retry_strategy = Retry( 141 | total=3, # Total number of retries 142 | # Retry on these HTTP status codes 143 | status_forcelist=[429, 500, 502, 503, 504], 144 | # Retry on these HTTP methods 145 | allowed_methods=["HEAD", "GET", "OPTIONS"], 146 | backoff_factor=1 # Exponential backoff factor 147 | ) 148 | adapter = HTTPAdapter(max_retries=retry_strategy) 149 | http = requests.Session() 150 | http.mount("https://", adapter) 151 | 152 | try: 153 | response = http.get(url, headers=headers, params=params) 154 | response.raise_for_status() # Raise HTTPError for bad responses 155 | return response.json().get('items', []) 156 | except HTTPError as http_err: 157 | logger.error(f"HTTP error occurred: {http_err}") 158 | except RequestException as req_err: 159 | logger.error(f"Request error occurred: {req_err}") 160 | except Exception as err: 161 | logger.error(f"An error occurred: {err}") 162 | 163 | def search_issues(self, query: str, per_page: int = 30) -> dict: 164 | """ 165 | Search for code on GitHub using the GitHub API. 166 | 167 | Parameters: 168 | query (str): The search query. 169 | per_page (int): The number of results per page. 170 | 171 | Returns: 172 | dict: The search result in dict format. 
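
        Note: this hits the https://api.github.com/search/issues endpoint, so the returned
        items cover both issues and pull requests.

        Example (illustrative; the token and query are placeholders):
            gh = ExtendedGithub(login_or_token="<your_personal_access_token>")
            issues = gh.search_issues('"rate limit" label:bug language:python', per_page=10)
            for issue in issues:
                print(issue["title"], issue["html_url"])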
173 | """ 174 | url = 'https://api.github.com/search/issues' 175 | headers = { 176 | 'Accept': 'application/vnd.github.v3+json', 177 | 'Authorization': f'token {self.access_token}' 178 | } 179 | params = { 180 | 'q': query, 181 | 'per_page': per_page 182 | } 183 | 184 | # Retry strategy 185 | retry_strategy = Retry( 186 | total=3, # Total number of retries 187 | # Retry on these HTTP status codes 188 | status_forcelist=[429, 500, 502, 503, 504], 189 | # Retry on these HTTP methods 190 | allowed_methods=["HEAD", "GET", "OPTIONS"], 191 | backoff_factor=1 # Exponential backoff factor 192 | ) 193 | adapter = HTTPAdapter(max_retries=retry_strategy) 194 | http = requests.Session() 195 | http.mount("https://", adapter) 196 | 197 | try: 198 | response = http.get(url, headers=headers, params=params) 199 | response.raise_for_status() # Raise HTTPError for bad responses 200 | return response.json().get('items', []) 201 | except HTTPError as http_err: 202 | logger.error(f"HTTP error occurred: {http_err}") 203 | except RequestException as req_err: 204 | logger.error(f"Request error occurred: {req_err}") 205 | except Exception as err: 206 | logger.error(f"An error occurred: {err}") 207 | 208 | def get_issue_comments(self, repo_full_name: str, issue_number: int) -> dict: 209 | """ 210 | Get comments of an issue on GitHub using the GitHub API. 211 | 212 | Parameters: 213 | repo_full_name (str): The full name of the repository (e.g., 'octocat/Hello-World'). 214 | issue_number (int): The issue number. 215 | 216 | Returns: 217 | dict: The comments of the issue in dict format. 218 | """ 219 | url = f'https://api.github.com/repos/{repo_full_name}/issues/{issue_number}/comments' 220 | headers = { 221 | 'Accept': 'application/vnd.github.v3+json', 222 | 'Authorization': f'token {self.access_token}' 223 | } 224 | # Retry strategy 225 | retry_strategy = Retry( 226 | total=3, # Total number of retries 227 | # Retry on these HTTP status codes 228 | status_forcelist=[429, 500, 502, 503, 504], 229 | # Retry on these HTTP methods 230 | allowed_methods=["HEAD", "GET", "OPTIONS"], 231 | backoff_factor=1 # Exponential backoff factor 232 | ) 233 | adapter = HTTPAdapter(max_retries=retry_strategy) 234 | http = requests.Session() 235 | http.mount("https://", adapter) 236 | 237 | try: 238 | response = http.get(url, headers=headers) 239 | response.raise_for_status() # Raise HTTPError for bad responses 240 | return response.json() 241 | except HTTPError as http_err: 242 | logger.error(f"HTTP error occurred: {http_err}") 243 | except RequestException as req_err: 244 | logger.error(f"Request error occurred: {req_err}") 245 | except Exception as err: 246 | logger.error(f"An error occurred: {err}") 247 | 248 | def get_pr_files(self, repo_full_name: str, pr_number: int) -> list: 249 | """ 250 | Get the files of a pull request on GitHub using the GitHub API. 251 | 252 | Parameters: 253 | repo_full_name (str): The full name of the repository (e.g., 'octocat/Hello-World'). 254 | pr_number (int): The pull request number. 255 | 256 | Returns: 257 | list: The files of the pull request in list format. 
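
        Example (illustrative; assumes ``gh`` is an authenticated ExtendedGithub instance):
            files = gh.get_pr_files("octocat/Hello-World", 42)
            for f in files:
                print(f["filename"], f["status"], f["additions"], f["deletions"])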
258 | """ 259 | url = f'https://api.github.com/repos/{repo_full_name}/pulls/{pr_number}/files' 260 | headers = { 261 | 'Accept': 'application/vnd.github.v3+json', 262 | 'Authorization': f'token {self.access_token}' 263 | } 264 | 265 | # Retry strategy 266 | retry_strategy = Retry( 267 | total=3, 268 | status_forcelist=[429, 500, 502, 503, 504], 269 | allowed_methods=["HEAD", "GET", "OPTIONS"], 270 | backoff_factor=1 271 | ) 272 | adapter = HTTPAdapter(max_retries=retry_strategy) 273 | http = requests.Session() 274 | http.mount("https://", adapter) 275 | 276 | try: 277 | response = http.get(url, headers=headers) 278 | response.raise_for_status() 279 | return response.json() 280 | except HTTPError as http_err: 281 | logger.error(f"HTTP error occurred: {http_err}") 282 | except RequestException as req_err: 283 | logger.error(f"Request error occurred: {req_err}") 284 | except Exception as err: 285 | logger.error(f"An error occurred: {err}") 286 | return [] 287 | 288 | def get_pr_comments(self, repo_full_name: str, pr_number: int) -> list: 289 | """ 290 | Get the comments of a pull request on GitHub using the GitHub API. 291 | 292 | Parameters: 293 | repo_full_name (str): The full name of the repository (e.g., 'octocat/Hello-World'). 294 | pr_number (int): The pull request number. 295 | 296 | Returns: 297 | list: The comments of the pull request in list format. 298 | """ 299 | url = f'https://api.github.com/repos/{repo_full_name}/issues/{pr_number}/comments' 300 | headers = { 301 | 'Accept': 'application/vnd.github.v3+json', 302 | 'Authorization': f'token {self.access_token}' 303 | } 304 | 305 | # Retry strategy 306 | retry_strategy = Retry( 307 | total=3, 308 | status_forcelist=[429, 500, 502, 503, 504], 309 | allowed_methods=["HEAD", "GET", "OPTIONS"], 310 | backoff_factor=1 311 | ) 312 | adapter = HTTPAdapter(max_retries=retry_strategy) 313 | http = requests.Session() 314 | http.mount("https://", adapter) 315 | 316 | try: 317 | response = http.get(url, headers=headers) 318 | response.raise_for_status() 319 | return response.json() 320 | except HTTPError as http_err: 321 | logger.error(f"HTTP error occurred: {http_err}") 322 | except RequestException as req_err: 323 | logger.error(f"Request error occurred: {req_err}") 324 | except Exception as err: 325 | logger.error(f"An error occurred: {err}") 326 | return [] 327 | 328 | # Example usage: 329 | if __name__ == "__main__": 330 | auth_manager = GitHubAuthManager() 331 | 332 | # For developers using a personal access token or an OAuth token 333 | github_instance = auth_manager.authenticate_with_token( 334 | "your_personal_access_token_or_oauth_token_here") 335 | 336 | # For organizational or enterprise environments using GitHub App 337 | # github_instance = auth_manager.authenticate_with_app("app_id", "private_key", "installation_id") 338 | 339 | # Example action: List all repositories for the authenticated user 340 | if github_instance: 341 | for repo in github_instance.get_user().get_repos(): 342 | print(repo.name) 343 | 344 | # Close the connection when done 345 | auth_manager.close_connection() 346 | -------------------------------------------------------------------------------- /llama_github/data_retrieval/github_api.py: -------------------------------------------------------------------------------- 1 | from github import GithubException 2 | from .github_entities import Repository, RepositoryPool 3 | from concurrent.futures import ThreadPoolExecutor, as_completed 4 | from llama_github.logger import logger 5 | from 
llama_github.github_integration.github_auth_manager import ExtendedGithub 6 | from llama_github.config.config import config 7 | import re 8 | from typing import Any, Dict, List 9 | 10 | 11 | class GitHubAPIHandler: 12 | def __init__(self, github_instance: ExtendedGithub): 13 | """ 14 | Initializes the GitHubAPIHandler with a GitHub instance. 15 | 16 | :param github_instance: Authenticated instance of a Github client. 17 | """ 18 | self._github = github_instance 19 | self.pool = RepositoryPool(github_instance) 20 | 21 | def search_repositories(self, query, sort="best match", order="desc"): 22 | """ 23 | Searches for repositories on GitHub based on a query. 24 | 25 | :param query: The search query string. 26 | :param sort: The field to sort the results by. Default is 'stars'. 27 | :param order: The order of sorting, 'asc' or 'desc'. Default is 'desc'. 28 | :return: A list of Repository objects or None if an error occurs. 29 | """ 30 | try: 31 | if sort not in ['stars', 'forks', 'updated']: 32 | repositories = self._github.search_repositories( 33 | query=query, order=order) 34 | else: 35 | repositories = self._github.search_repositories( 36 | query=query, sort=sort, order=order) 37 | result = [] 38 | for i, repo in enumerate(repositories): 39 | if i >= config.get("repo_search_max_hits"): 40 | break 41 | result.append( 42 | Repository( 43 | repo.full_name, 44 | self._github, 45 | **{ 46 | 'id': repo.id, 47 | 'name': repo.name, 48 | 'description': repo.description, 49 | 'html_url': repo.html_url, 50 | 'stargazers_count': repo.stargazers_count, 51 | 'language': repo.language, 52 | 'default_branch': repo.default_branch, 53 | 'updated_at': repo.updated_at, 54 | } 55 | ) 56 | ) 57 | return result 58 | except GithubException as e: 59 | logger.exception( 60 | f"Error searching repositories with query '{query}':") 61 | return None 62 | 63 | def get_repository(self, full_repo_name): 64 | """ 65 | Retrieves a single repository by its full name. 66 | 67 | :param full_repo_name: The full name of the repository (e.g., 'octocat/Hello-World'). 68 | :return: A Repository object or None if an error occurs. 69 | """ 70 | return self.pool.get_repository(full_repo_name) 71 | 72 | def _get_file_content_through_repository(self, code_search_result): 73 | """ 74 | Helper method to get file content through a Repository object. 75 | 76 | :param code_search_result: A single code search result. 77 | :return: Tuple containing the Repository object and the file content. 
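
        The expected input shape (illustrative) mirrors a GitHub code-search result item, e.g.
        {'repository': {'full_name': 'octocat/Hello-World'}, 'path': 'src/app.py'}.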
78 | """ 79 | # Assuming RepositoryPool is accessible and initialized somewhere in this class 80 | repository_obj = self.get_repository( 81 | code_search_result['repository']['full_name']) 82 | file_content = repository_obj.get_file_content( 83 | code_search_result['path']) 84 | return repository_obj, file_content 85 | 86 | async def get_pr_files(self, repo: Repository, pr_number: int) -> List[Dict[str, Any]]: 87 | url = f"{self.base_url}/repos/{repo.full_name}/pulls/{pr_number}/files" 88 | headers = {"Authorization": f"token {self.token}"} 89 | async with self.session.get(url, headers=headers) as response: 90 | if response.status == 200: 91 | return await response.json() 92 | else: 93 | logger.error(f"Failed to get PR files: {response.status}") 94 | return [] 95 | 96 | async def get_pr_comments(self, repo: Repository, pr_number: int) -> List[Dict[str, Any]]: 97 | url = f"{self.base_url}/repos/{repo.full_name}/issues/{pr_number}/comments" 98 | headers = {"Authorization": f"token {self.token}"} 99 | async with self.session.get(url, headers=headers) as response: 100 | if response.status == 200: 101 | return await response.json() 102 | else: 103 | logger.error(f"Failed to get PR comments: {response.status}") 104 | return [] 105 | 106 | def search_code(self, query, repo_full_name=None): 107 | """ 108 | Searches for code on GitHub based on a query, optionally within a specific repository. 109 | 110 | :param query: The search query string. 111 | :param repo_full_name: Optional. The full name of the repository (e.g., 'octocat/Hello-World') to restrict the search to. 112 | :return: A list of code search results or None if an error occurs. 113 | """ 114 | try: 115 | logger.debug(f"Searching code with query '{query}'...") 116 | # If a repository full name is provided, include it in the query 117 | if repo_full_name: 118 | query = f"{query} repo:{repo_full_name}" 119 | 120 | # Perform the search 121 | code_results = self._github.search_code( 122 | query=query, per_page=config.get("code_search_max_hits")) 123 | 124 | results_with_index = [] 125 | with ThreadPoolExecutor(max_workers=config.get("max_workers")) as executor: 126 | # Concurrently fetch the file content for each code search result 127 | future_to_index = {executor.submit( 128 | self._get_file_content_through_repository, code_result): index for index, code_result in enumerate(code_results)} 129 | for future in as_completed(future_to_index): 130 | index = future_to_index[future] 131 | code_result = code_results[index] 132 | try: 133 | repository_obj, file_content = future.result() 134 | if repository_obj and file_content: 135 | results_with_index.append({ 136 | 'index': index, 137 | 'name': code_result['name'], 138 | 'path': code_result['path'], 139 | 'repository_full_name': code_result['repository']['full_name'], 140 | 'url': code_result['html_url'], 141 | 'content': file_content, 142 | 'stargazers_count': repository_obj.stargazers_count, 143 | 'watchers_count': repository_obj.watchers_count, 144 | 'language': repository_obj.language, 145 | 'description': repository_obj.description, 146 | 'updated_at': repository_obj.updated_at, 147 | }) 148 | except Exception as e: 149 | logger.exception( 150 | f"{code_result['name']} generated an exception:") 151 | 152 | # Sort the results by index to maintain the original order 153 | sorted_results = sorted( 154 | results_with_index, key=lambda x: x['index']) 155 | logger.debug( 156 | f"Code search retrieved successfully with {len(sorted_results)} results.") 157 | return sorted_results 158 | except 
GithubException as e: 159 | logger.exception(f"Error searching code with query '{query}':") 160 | return None 161 | 162 | def _get_issue_content_through_repository(self, issue): 163 | """ 164 | Helper method to get issue content through issue url. 165 | 166 | :param code_result: A single code search result. 167 | :return: Tuple containing the Repository object and the file content. 168 | """ 169 | # Assuming RepositoryPool is accessible and initialized somewhere in this clas 170 | issue_content = '' 171 | issue_url = issue['url'] 172 | # Use regular expressions to extract repo_full_name and issue_number 173 | match = re.search( 174 | r'https://api.github.com/repos/([^/]+/[^/]+)/issues/(\d+)', issue_url) 175 | if match: 176 | repo_full_name = match.group(1) 177 | issue_number = int(match.group(2)) 178 | repository_obj = self.get_repository(repo_full_name) 179 | issue_content = repository_obj.get_issue_content( 180 | number=issue_number, issue=issue) 181 | else: 182 | logger.warning( 183 | f"Failed to extract repo_full_name and issue_number from issue url: {issue_url}") 184 | return issue_content 185 | 186 | def search_issues(self, query, repo_full_name=None): 187 | """ 188 | Searches for issues on GitHub based on a query, optionally within a specific repository. 189 | 190 | :param query: The search query string. 191 | :param repo_full_name: Optional. The full name of the repository (e.g., 'octocat/Hello-World') to restrict the search to. 192 | :return: A list of issue search results or None if an error occurs. 193 | """ 194 | try: 195 | logger.debug(f"Searching issue with query '{query}'...") 196 | # If a repository full name is provided, include it in the query 197 | if repo_full_name: 198 | query = f"{query} repo:{repo_full_name}" 199 | 200 | # Perform the search 201 | issue_results = self._github.search_issues( 202 | query=query, per_page=config.get("issue_search_max_hits")) 203 | 204 | issue_results = [issue for issue in issue_results if issue['body'] 205 | is not None and issue['body'] != 'null'] 206 | 207 | results_with_index = [] 208 | with ThreadPoolExecutor(max_workers=config.get("max_workers")) as executor: 209 | # Concurrently fetch the issue content for each issue search result 210 | future_to_index = {executor.submit( 211 | self._get_issue_content_through_repository, issue): index for index, issue in enumerate(issue_results)} 212 | for future in as_completed(future_to_index): 213 | index = future_to_index[future] 214 | issue_result = issue_results[index] 215 | try: 216 | issue_content = future.result() 217 | results_with_index.append({ 218 | 'index': index, 219 | 'url': issue_result['url'], 220 | 'created_at': issue_result['created_at'], 221 | 'updated_at': issue_result['updated_at'], 222 | 'issue_content': issue_content, 223 | }) 224 | except Exception as e: 225 | logger.exception( 226 | f"{issue_result['url']} generated an exception:") 227 | 228 | # Sort the results by index to maintain the original order 229 | sorted_results = sorted( 230 | results_with_index, key=lambda x: x['index']) 231 | logger.debug( 232 | f"Issue search retrieved successfully with {len(sorted_results)} results.") 233 | return sorted_results 234 | except GithubException as e: 235 | logger.exception(f"Error searching issue with query '{query}':") 236 | return None 237 | 238 | def _categorize_github_url(url): 239 | repo_pattern = r'^https://github\.com/[^/]+/[^/]+$' 240 | issue_pattern = r'^https://github\.com/[^/]+/[^/]+/issues/\d+$' 241 | repo_file_pattern = 
r'^https://github\.com/[^/]+/[^/]+/(?:blob|tree)/[^/]+/.+$' 242 | readme_pattern = r'^https://github\.com/[^/]+/[^/]+#readme$' 243 | 244 | if re.match(repo_pattern, url): 245 | return "repo" 246 | elif re.match(issue_pattern, url): 247 | return "issue" 248 | elif re.match(repo_file_pattern, url): 249 | return "file" 250 | elif re.match(readme_pattern, url): 251 | return "readme" 252 | else: 253 | return "other" 254 | 255 | def get_github_url_content(self, url): 256 | """ 257 | Retrieves the content of a GitHub URL. 258 | 259 | :param url: The GitHub URL to retrieve content from. 260 | :return: The content of the URL or None if an error occurs. 261 | """ 262 | try: 263 | logger.debug(f"Retrieving content from GitHub URL '{url}'...") 264 | content = None 265 | category = GitHubAPIHandler._categorize_github_url(url) 266 | if category == "repo": 267 | # Extract the repository full name from the URL 268 | match = re.search(r'https://github\.com/([^/]+/[^/]+)', url) 269 | if match: 270 | repo_full_name = match.group(1) 271 | repository_obj = self.get_repository(repo_full_name) 272 | content = repository_obj.get_readme() 273 | else: 274 | logger.warning( 275 | f"Failed to extract repository full name from URL: {url}") 276 | elif category == "issue": 277 | # Use regular expressions to extract repo_full_name and issue_number 278 | match = re.search( 279 | r'https://github\.com/([^/]+/[^/]+)/issues/(\d+)', url) 280 | if match: 281 | repo_full_name = match.group(1) 282 | issue_number = int(match.group(2)) 283 | repository_obj = self.get_repository(repo_full_name) 284 | content = repository_obj.get_issue_content( 285 | number=issue_number) 286 | else: 287 | logger.warning( 288 | f"Failed to extract repo_full_name and issue_number from URL: {url}") 289 | elif category == "file": 290 | # Extract the repository full name and file path from the URL 291 | match = re.search( 292 | r'https://github\.com/([^/]+/[^/]+)/(?:blob|tree)/([^/]+)/(.+)', url) 293 | if match: 294 | repo_full_name = match.group(1) 295 | file_path = match.group(3) 296 | repository_obj = self.get_repository(repo_full_name) 297 | content = repository_obj.get_file_content( 298 | file_path) 299 | else: 300 | logger.warning( 301 | f"Failed to extract repository full name and file path from URL: {url}") 302 | elif category == "readme": 303 | # Extract the repository full name from the URL 304 | match = re.search(r'https://github\.com/([^/]+/[^/]+)#readme', url) 305 | if match: 306 | repo_full_name = match.group(1) 307 | repository_obj = self.get_repository(repo_full_name) 308 | content = repository_obj.get_readme() 309 | else: 310 | logger.warning( 311 | f"Failed to extract repository full name from URL: {url}") 312 | else: 313 | logger.warning(f"Unsupported GitHub URL category: {category}") 314 | return content 315 | except GithubException as e: 316 | logger.exception(f"Error retrieving content from GitHub URL '{url}':") 317 | return None -------------------------------------------------------------------------------- /llama_github/utils.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import base64 3 | import re 4 | import asyncio 5 | import aiohttp 6 | from typing import Optional, Dict, Any, List, Tuple 7 | from llama_github.logger import logger 8 | import difflib 9 | import ast 10 | 11 | class DiffGenerator: 12 | """ 13 | A class for generating custom diffs between two pieces of content. 
14 | It enhances the standard unified diff by adding function/class context to hunk headers, 15 | similar to `git diff`, in a fail-safe manner. 16 | """ 17 | 18 | # A pre-compiled list of regex patterns to find function/class definitions. 19 | # This is the core mechanism that mimics Git's `xfuncname` feature. 20 | # It covers a wide range of common languages to provide broad, out-of-the-box support. 21 | _FUNC_CONTEXT_PATTERNS = [ 22 | re.compile(r'^\s*(def|class)\s+.*', re.IGNORECASE), # Python 23 | re.compile(r'^\s*(public|private|protected|static|final|native|synchronized|abstract|transient|volatile|strictfp|async|function|class|interface|enum|@|implements|extends)'), # Java, JS, TS, PHP, C# 24 | re.compile(r'^\s*(func|fn|impl|trait|struct|enum|mod)\s+.*', re.IGNORECASE), # Go, Rust 25 | re.compile(r'^\s*(def|class|module)\s+.*', re.IGNORECASE), # Ruby 26 | re.compile(r'^\s*([a-zA-Z_][a-zA-Z0-9_]*\s+)*[a-zA-Z_][a-zA-Z0-9_]*\s*\(.*\)\s*\{'), # C, C++ style function definitions 27 | re.compile(r'^sub\s+.*'), # Perl 28 | ] 29 | 30 | @staticmethod 31 | def _find_context(line_index: int, lines: List[str]) -> str: 32 | """ 33 | Search upwards from a given line index to find the nearest function/class context. 34 | 35 | Args: 36 | line_index (int): The 0-based index to start searching upwards from. 37 | lines (List[str]): The content of the file, as a list of lines. 38 | 39 | Returns: 40 | str: The found context line, stripped of whitespace, or an empty string if not found. 41 | """ 42 | # Search from the target line upwards to the beginning of the file. 43 | for i in range(line_index, -1, -1): 44 | line = lines[i] 45 | # Check the line against all our predefined patterns. 46 | for pattern in DiffGenerator._FUNC_CONTEXT_PATTERNS: 47 | if pattern.search(line): 48 | return line.strip() 49 | return "" # Return empty string if no context is found. 50 | 51 | @staticmethod 52 | def generate_custom_diff(base_content: str, head_content: str, context_lines: int) -> str: 53 | """ 54 | Generate a custom diff between two pieces of content with specified context lines, 55 | and automatically add function/class context to hunk headers, similar to `git diff`. 56 | This method is designed to be fail-safe; if context addition fails, it returns the standard diff. 57 | 58 | Args: 59 | base_content (str): The original content. 60 | head_content (str): The new content to compare against the base. 61 | context_lines (int): The number of context lines to include in the diff. 62 | 63 | Returns: 64 | str: A string representation of the unified diff, preferably with hunk headers. 65 | 66 | Raises: 67 | ValueError: If context_lines is negative. 68 | """ 69 | if context_lines < 0: 70 | raise ValueError("context_lines must be non-negative") 71 | if base_content is None and head_content is None: 72 | return "" # Both contents are None, no diff to generate 73 | elif base_content is None: 74 | # File is newly added 75 | return "".join(f"+ {line}\n" for line in head_content.splitlines()) 76 | elif head_content is None: 77 | # File is deleted 78 | return "".join(f"- {line}\n" for line in base_content.splitlines()) 79 | 80 | # Use empty strings for None content to ensure difflib handles them correctly 81 | # as file additions or deletions. This is more robust and aligns with difflib's expectations. 
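        # Note: the branches above already returned for every None case, so neither
        # argument can be None here; the fallbacks below are purely defensive.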
82 | base_content = base_content or "" 83 | head_content = head_content or "" 84 | 85 | base_lines: List[str] = base_content.splitlines() 86 | head_lines: List[str] = head_content.splitlines() 87 | 88 | # Generate the standard unified diff. This part is considered stable. 89 | diff: List[str] = list(difflib.unified_diff( 90 | base_lines, 91 | head_lines, 92 | n=context_lines, 93 | lineterm='' 94 | )) 95 | 96 | if not diff: 97 | return "" # No differences found, return early. 98 | 99 | # --- Start of the fail-safe enhancement logic --- 100 | # This entire block attempts to add context to hunk headers. 101 | # If any exception occurs here, we catch it and return the original, un-enhanced diff. 102 | # This ensures the function is always reliable (Pareto improvement). 103 | try: 104 | enhanced_diff = [] 105 | # Regex to parse the original line number from a hunk header. 106 | # e.g., from "@@ -35,7 +35,7 @@" it captures "35". 107 | hunk_header_re = re.compile(r'^@@ -(\d+)(?:,\d+)? .*') 108 | 109 | for line in diff: 110 | match = hunk_header_re.match(line) 111 | if match: 112 | # This is a hunk header line. 113 | # The line number from the regex is 1-based. 114 | start_line_num = int(match.group(1)) 115 | 116 | # The index is 0-based, so we subtract 1. 117 | # We search from the line where the change starts, or the line before it. 118 | context_line_index = max(0, start_line_num - 1) 119 | context = DiffGenerator._find_context(context_line_index, base_lines) 120 | 121 | if context: 122 | # If context was found, append it to the hunk header. 123 | enhanced_diff.append(f"{line} {context}") 124 | else: 125 | # Otherwise, use the original hunk header. 126 | enhanced_diff.append(line) 127 | else: 128 | # This is not a hunk header, just a regular diff line (+, -, ' '). 129 | enhanced_diff.append(line) 130 | 131 | # If the enhancement process completes successfully, return the result. 132 | return '\n'.join(enhanced_diff) 133 | 134 | except Exception as e: 135 | # If any error occurred during the enhancement, log a warning and fall back. 136 | logger.warning( 137 | f"Could not add hunk header context due to an unexpected error: {str(e)}. " 138 | "Falling back to standard diff output." 139 | ) 140 | # --- Fallback mechanism --- 141 | # Return the original, unmodified diff generated by difflib. 
142 | return '\n'.join(diff) 143 | 144 | 145 | class DataAnonymizer: 146 | def __init__(self): 147 | self.patterns = { 148 | 'api_key': r'(?i)(api[_-]?key|sk[_-]live|sk[_-]test|sk[_-]prod|sk[_-]sandbox|openai[_-]?key)\s*[:=]\s*[\'"]?([a-zA-Z0-9-_]{20,})[\'"]?', 149 | 'token': r'(?i)(token|access[_-]?token|auth[_-]?token|github[_-]?token|ghp_[a-zA-Z0-9]{36}|gho_[a-zA-Z0-9]{36}|ghu_[a-zA-Z0-9]{36}|ghr_[a-zA-Z0-9]{36}|ghs_[a-zA-Z0-9]{36})\s*[:=]\s*[\'"]?([a-zA-Z0-9-_]{20,})[\'"]?', 150 | 'password': r'(?i)password\s*[:=]\s*[\'"]?([^\'"]+)[\'"]?', 151 | 'email': r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', 152 | 'ip_address': r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b', 153 | 'jwt': r'eyJ[a-zA-Z0-9-_]+\.eyJ[a-zA-Z0-9-_]+\.[a-zA-Z0-9-_]+', 154 | 'phone_number': r'\+?[0-9]{1,4}?[-.\s]?(\(?\d{1,3}?\)?[-.\s]?)?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}', 155 | 'url': r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', 156 | 'credit_card': r'\b(?:\d[ -]*?){13,16}\b', 157 | 'ssn': r'\b(?:\d[ -]*?){9}\b', 158 | 'ipv6': r'(?i)([0-9a-f]{1,4}:){7}([0-9a-f]{1,4}|:)', 159 | 'mac_address': r'(?i)([0-9a-f]{2}([:-]|$)){6}', 160 | 'latitude_longitude': r'(?i)(lat|latitude|lon|longitude)\s*[:=]\s*[-+]?([0-9]*\.[0-9]+|[0-9]+),\s*[-+]?([0-9]*\.[0-9]+|[0-9]+)', 161 | 'driver_license': r'(?i)([A-Z0-9]{1,20})\s*[:=]\s*([A-Z0-9]{1,20})', 162 | 'date_of_birth': r'(?i)(dob|date[_-]?of[_-]?birth)\s*[:=]\s*([0-9]{4}-[0-9]{2}-[0-9]{2})', 163 | 'name': r'(?i)(name|first[_-]?name|last[_-]?name)\s*[:=]\s*([a-zA-Z]{2,})', 164 | 'address': r'(?i)(address|street[_-]?address)\s*[:=]\s*([a-zA-Z0-9\s,]{10,})', 165 | 'zipcode': r'(?i)(zip|zipcode)\s*[:=]\s*([0-9]{5})', 166 | 'company': r'(?i)(company|organization)\s*[:=]\s*([a-zA-Z\s]{2,})', 167 | 'job_title': r'(?i)(job[_-]?title)\s*[:=]\s*([a-zA-Z\s]{2,})', 168 | 'domain': r'(?i)(domain)\s*[:=]\s*([a-zA-Z0-9.-]{2,})', 169 | 'hostname': r'(?i)(hostname)\s*[:=]\s*([a-zA-Z0-9.-]{2,})', 170 | 'port': r'(?i)(port)\s*[:=]\s*([0-9]{2,})', 171 | } 172 | 173 | def hash_replacement(match): 174 | sensitive_data = match.group(0) 175 | hash_object = hashlib.sha256(sensitive_data.encode()) 176 | hashed_data = base64.urlsafe_b64encode( 177 | hash_object.digest()).decode('utf-8') 178 | return f'' 179 | 180 | def anonymize_sensitive_data(self, question): 181 | anonymized_question = question 182 | for pattern_name, pattern in self.patterns.items(): 183 | anonymized_question = re.sub( 184 | pattern, self.hash_replacement, anonymized_question) 185 | return anonymized_question 186 | 187 | class AsyncHTTPClient: 188 | """ 189 | Asynchronous HTTP client class for sending asynchronous HTTP requests. 190 | """ 191 | 192 | @staticmethod 193 | async def request( 194 | url: str, 195 | method: str = "GET", 196 | headers: Optional[Dict[str, str]] = None, 197 | data: Optional[Dict[str, Any]] = None, 198 | retry_count: int = 1, 199 | retry_delay: int = 1, 200 | ) -> Optional[aiohttp.ClientResponse]: 201 | """ 202 | Send an asynchronous HTTP request. 203 | 204 | :param url: The URL to send the request to. 205 | :param method: The HTTP request method, default is "GET". 206 | :param headers: The request headers, default is None. 207 | :param data: The request data, default is None. 208 | :param retry_count: The number of retries, default is 1. 209 | :param retry_delay: The delay in seconds between each retry, default is 1. 210 | :return: The response object if the request is successful, otherwise None. 
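
        Note: on HTTP 200 the parsed JSON body (the result of ``response.json()``) is returned
        rather than the raw response object; if every attempt fails, None is returned.

        Example (illustrative; the URL and payload are placeholders, and the call must be
        awaited from inside an async function):
            result = await AsyncHTTPClient.request(
                "https://api.example.com/search",
                method="POST",
                headers={"Content-Type": "application/json"},
                data={"q": "llama"},
                retry_count=3,
                retry_delay=2,
            )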
211 | """ 212 | async with aiohttp.ClientSession() as session: 213 | for attempt in range(retry_count): 214 | try: 215 | async with session.request( 216 | method, url, headers=headers, json=data 217 | ) as response: 218 | if response.status == 200: 219 | return await response.json() 220 | else: 221 | logger.error( 222 | f"Request failed with status code: {response.status}. " 223 | f"Retrying ({attempt + 1}/{retry_count})..." 224 | ) 225 | except aiohttp.ClientError as e: 226 | logger.error( 227 | f"Request failed with error: {str(e)}. " 228 | f"Retrying ({attempt + 1}/{retry_count})..." 229 | ) 230 | 231 | if attempt < retry_count - 1: 232 | await asyncio.sleep(retry_delay) 233 | 234 | return None 235 | 236 | class CodeAnalyzer: 237 | """ 238 | A utility class for analyzing Python code. 239 | 240 | This class provides methods for extracting abstract syntax trees, 241 | analyzing imports, and categorizing code changes. 242 | """ 243 | 244 | @staticmethod 245 | def get_ast_representation(code_str: str) -> Optional[str]: 246 | """ 247 | Parses code into an Abstract Syntax Tree (AST) representation. 248 | 249 | :param code_str: The code string to parse. 250 | :return: String representation of the AST or None if parsing fails. 251 | """ 252 | if not code_str: 253 | return None 254 | try: 255 | tree = ast.parse(code_str) 256 | return ast.dump(tree) 257 | except Exception: 258 | logger.error("Syntax error in the provided code") 259 | return None 260 | 261 | @staticmethod 262 | def extract_imports(code_str: str) -> Dict[str, Any]: 263 | """ 264 | Extracts detailed import information from the given code string. 265 | 266 | :param code_str: The code string to analyze. 267 | :return: A dictionary containing detailed import information. 268 | """ 269 | import_info = { 270 | "standard_imports": [], 271 | "third_party_imports": [], 272 | "local_imports": [], 273 | "from_imports": [], 274 | "errors": [] 275 | } 276 | 277 | if not code_str: 278 | return import_info 279 | 280 | try: 281 | tree = ast.parse(code_str) 282 | except Exception as e: 283 | logger.error(f"Syntax error in the provided code: {e}") 284 | import_info["errors"].append(str(e)) 285 | return import_info 286 | 287 | for node in ast.walk(tree): 288 | if isinstance(node, ast.Import): 289 | for alias in node.names: 290 | CodeAnalyzer._categorize_import(alias.name, import_info) 291 | elif isinstance(node, ast.ImportFrom): 292 | if node.module: 293 | from_import = { 294 | "module": node.module, 295 | "names": [n.name for n in node.names], 296 | "level": node.level 297 | } 298 | import_info["from_imports"].append(from_import) 299 | CodeAnalyzer._categorize_import(node.module, import_info) 300 | 301 | return import_info 302 | 303 | @staticmethod 304 | def _categorize_import(module_name: str, import_info: Dict[str, List[str]]) -> None: 305 | """ 306 | Categorizes an import as standard library, third-party, or local. 307 | 308 | :param module_name: The name of the module to categorize. 309 | :param import_info: The dictionary to update with the categorized import. 310 | """ 311 | std_libs = set(CodeAnalyzer._get_standard_library_modules()) 312 | 313 | if module_name in std_libs: 314 | import_info["standard_imports"].append(module_name) 315 | elif '.' in module_name: 316 | import_info["local_imports"].append(module_name) 317 | else: 318 | import_info["third_party_imports"].append(module_name) 319 | 320 | @staticmethod 321 | def _get_standard_library_modules() -> List[str]: 322 | """ 323 | Returns a list of Python standard library module names. 
324 | 325 | :return: List of standard library module names. 326 | """ 327 | import sys 328 | import pkgutil 329 | 330 | std_lib = [] 331 | for module in pkgutil.iter_modules(): 332 | if module.name not in sys.builtin_module_names: 333 | try: 334 | spec = pkgutil.find_loader(module.name) 335 | if spec is not None: 336 | if hasattr(spec, 'get_filename'): 337 | pathname = spec.get_filename() 338 | elif hasattr(spec, 'origin'): 339 | pathname = spec.origin 340 | else: 341 | pathname = None 342 | 343 | if pathname and 'site-packages' not in pathname: 344 | std_lib.append(module.name) 345 | except Exception as e: 346 | logger.warning(f"Error processing module {module.name}: {e}") 347 | continue 348 | 349 | return std_lib + list(sys.builtin_module_names) 350 | 351 | @staticmethod 352 | def analyze_imports(code_str: str) -> Tuple[Dict[str, Any], str]: 353 | """ 354 | Analyzes imports and provides a summary. 355 | 356 | :param code_str: The code string to analyze. 357 | :return: A tuple containing the import information dictionary and a summary string. 358 | """ 359 | import_info = CodeAnalyzer.extract_imports(code_str) 360 | 361 | summary = [ 362 | f"Standard library imports: {len(import_info['standard_imports'])}", 363 | f"Third-party imports: {len(import_info['third_party_imports'])}", 364 | f"Local imports: {len(import_info['local_imports'])}", 365 | f"From imports: {len(import_info['from_imports'])}" 366 | ] 367 | 368 | if import_info['errors']: 369 | summary.append(f"Errors encountered: {len(import_info['errors'])}") 370 | 371 | return import_info, "\n".join(summary) 372 | 373 | @staticmethod 374 | def categorize_change(diff_text: str) -> List[str]: 375 | """ 376 | Categorizes the type of code changes based on diff text. 377 | 378 | :param diff_text: The diff text to analyze. 379 | :return: A list of change categories. 
380 | """ 381 | categories = [] 382 | 383 | if not diff_text: 384 | categories.append('general_change') 385 | return categories 386 | 387 | patterns = { 388 | 'function_added': r'^\+.*def\s+\w+\(', 389 | 'function_removed': r'^-.*def\s+\w+\(', 390 | 'class_added': r'^\+.*class\s+\w+\(', 391 | 'class_removed': r'^-.*class\s+\w+\(', 392 | 'import_added': r'^\+.*import\s+\w+', 393 | 'import_removed': r'^-.*import\s+\w+' 394 | } 395 | 396 | for category, pattern in patterns.items(): 397 | if re.search(pattern, diff_text, re.MULTILINE): 398 | categories.append(category) 399 | 400 | if not categories: 401 | categories.append('general_change') 402 | 403 | return categories -------------------------------------------------------------------------------- /llama_github/rag_processing/rag_processor.py: -------------------------------------------------------------------------------- 1 | # rag_processor.py 2 | from llama_github.config.config import config 3 | from llama_github.data_retrieval.github_api import GitHubAPIHandler 4 | from llama_github.data_retrieval.github_entities import Repository 5 | from llama_github.llm_integration.llm_handler import LLMManager, LLMHandler 6 | from llama_github.logger import logger 7 | from langchain_core.pydantic_v1 import BaseModel, Field 8 | from typing import List, Optional, Dict 9 | from langchain_text_splitters import Language, RecursiveCharacterTextSplitter 10 | import json 11 | import math 12 | from numpy.linalg import norm 13 | import numpy as np 14 | import asyncio 15 | 16 | 17 | class RAGProcessor: 18 | def __init__(self, github_api_handler: GitHubAPIHandler, llm_manager: LLMManager = None, llm_handler: LLMHandler = None): 19 | if llm_manager: 20 | self.llm_manager = llm_manager 21 | else: 22 | self.llm_manager = LLMManager() 23 | 24 | if llm_handler: 25 | self.llm_handler = llm_handler 26 | else: 27 | self.llm_handler = LLMHandler(llm_manager=self.llm_manager) 28 | 29 | self.github_api_handler = github_api_handler 30 | 31 | class _LLMFirstGenenralAnswer(BaseModel): 32 | question: str = Field( 33 | ..., 34 | description="The abstraction of user's question, only one sentence no more than 20 words.", 35 | example="How to create a NumPy array in Python?" 36 | ) 37 | answer: str = Field( 38 | ..., 39 | description="The answer to the user's question, better with sample code.", 40 | example="You can use the `numpy.array` function to create a NumPy array in Python. The sample code is as follows:\n\n```python\nimport numpy as np\n\narray = np.array([1, 2, 3])\nprint(array)\n```" 41 | ) 42 | code_search_logic: str = Field( 43 | ..., 44 | description="Simple logic analyze on how to search for Github code related to the user's question without detail search criteria nor keywords.", 45 | ) 46 | issue_search_logic: str = Field( 47 | ..., 48 | description="Simple logic analyze on how to search for Github issues related to the user's question without detail search criteria nor keywords.", 49 | ) 50 | 51 | async def analyze_question(self, query: str) -> List[str]: 52 | """ 53 | analyze user's question and generate strategy for code search and issue search 54 | 55 | Args: 56 | query (str): user's initial question. 57 | 58 | Returns: 59 | str: the answer of question. 
60 |         """
61 |         try:
62 |             logger.debug(
63 |                 "Analyzing question and generating strategy")
64 |             prompt = config.get("always_answer_prompt")
65 |             response = await self.llm_handler.ainvoke(human_question=query, prompt=prompt, output_structure=self._LLMFirstGenenralAnswer)
66 |             return [response.question, response.answer, response.code_search_logic, response.issue_search_logic]
67 |         except Exception as e:
68 |             logger.error(f"Error in analyzing question: {e}")
69 |             return [query, "", "", ""]  # fall back to the raw query with empty draft answer and search logic
70 | 
71 |     class _GitHubCodeSearchCriteria(BaseModel):
72 |         search_criteria: List[str] = Field(
73 |             ...,
74 |             description="A list of search criteria strings for GitHub code search, each following GitHub's search syntax.",
75 |             example=["NumPy Array language:python",
76 |                      "log4j LoggingUtil language:java"],
77 |             min_items=1,
78 |             max_items=2
79 |         )
80 | 
81 |     async def get_code_search_criteria(self, query: str, draft_answer: Optional[str] = None) -> List[str]:
82 |         """
83 |         Generate GitHub code search criteria based on the user's question.
84 | 
85 |         Args:
86 |             query (str): user's initial question.
87 | 
88 |         Returns:
89 |             List[str]: the search criteria for GitHub code search.
90 |         """
91 |         try:
92 |             logger.debug(
93 |                 f"Generating code search criteria for question: {query}")
94 |             prompt = config.get("code_search_criteria_prompt")
95 |             response = await self.llm_handler.ainvoke(human_question=query, prompt=prompt, context=[draft_answer] if draft_answer is not None else None, output_structure=self._GitHubCodeSearchCriteria)
96 |             logger.debug(
97 |                 f"For {query}, the search criteria for code search are: {response.search_criteria}")
98 |             return response.search_criteria
99 |         except Exception as e:
100 |             logger.error(f"Error in get_code_search_criteria: {e}")
101 |             return []
102 | 
103 |     class _GitHubRepoSearchCriteria(BaseModel):
104 |         necessity_score: int = Field(
105 |             ...,
106 |             description="Given that GitHub code search and issue search are already used to retrieve question-related context, how necessary is a separate repository search on GitHub? 0-59: no necessity, 60-79: medium necessity, 80-100: high necessity",
107 |             example=65
108 |         )
109 |         search_criteria: List[str] = Field(
110 |             ...,
111 |             description="A list of search criteria strings for GitHub repository search, each following GitHub's search syntax. The sorting of the list should be based on the necessity of the search criteria.",
112 |             example=["NumPy Array language:python",
113 |                      "spring-boot log4j language:java"],
114 |             min_items=0,
115 |             max_items=2
116 |         )
117 | 
118 |     async def get_repo_search_criteria(self, query: str, draft_answer: Optional[str] = None) -> List[str]:
119 |         """
120 |         Generate GitHub repository search criteria based on the user's question.
121 | 
122 |         Args:
123 |             query (str): user's initial question.
124 | 
125 |         Returns:
126 |             List[str]: the search criteria for GitHub repository search.
127 | """ 128 | search_criteria = [] 129 | try: 130 | logger.debug( 131 | f"Generating repo search criteria for question: {query}") 132 | prompt = config.get("repo_search_criteria_prompt") 133 | response = await self.llm_handler.ainvoke(human_question=query, prompt=prompt, context=[draft_answer] if draft_answer is not None else None, output_structure=self._GitHubRepoSearchCriteria) 134 | if response.necessity_score >= 80: 135 | search_criteria = response.search_criteria 136 | elif response.necessity_score >= 60 and len(response.search_criteria) >= 1: 137 | search_criteria = response.search_criteria[:1] 138 | logger.debug( 139 | f"For {query}, the search_criterias for repo search is: {search_criteria} and repo search necessity score is {response.necessity_score}") 140 | except Exception as e: 141 | logger.error(f"Error in get_repo_search_criteria: {e}") 142 | return search_criteria 143 | return search_criteria 144 | 145 | def get_repo_simple_structure(self, repo: Repository) -> str: 146 | """ 147 | get a simple structure of a repository, only contains first 3 levels of repo folder/file structure. 148 | 149 | Args: 150 | repo (Repository): the repository object. 151 | 152 | Returns: 153 | json: the simple structure of the repository. 154 | """ 155 | full_structure = repo.get_structure() 156 | 157 | if not full_structure: 158 | return json.dumps({}) 159 | 160 | def simplify_tree(tree, level=1): 161 | """ 162 | Simplify the tree structure to keep only three levels deep. 163 | """ 164 | if level > 3: 165 | return '...' 166 | 167 | simplified_tree = {} 168 | for key, value in tree.items(): 169 | if 'children' in value: 170 | simplified_tree[key] = { 171 | 'children': simplify_tree(value['children'], level + 1) 172 | } 173 | else: 174 | simplified_tree[key] = value 175 | return simplified_tree 176 | 177 | simplified_structure = simplify_tree(full_structure) 178 | return json.dumps(simplified_structure, indent=4) 179 | 180 | class _GitHubIssueSearchCriteria(BaseModel): 181 | search_criteria: List[str] = Field( 182 | ..., 183 | description="A list of search criteria strings for GitHub issue search, each following GitHub's search syntax.", 184 | example=["is:open label:bug Numpy Array", 185 | "is:closed label:documentation langchain ollama"], 186 | min_items=1, 187 | max_items=2 188 | ) 189 | 190 | async def get_issue_search_criteria(self, query: str, draft_answer: Optional[str] = None) -> List[str]: 191 | """ 192 | generate Github search criteria based on user's question 193 | 194 | Args: 195 | query (str): user's initial question. 196 | 197 | Returns: 198 | str[]: the search criteria for Github issue search. 199 | """ 200 | try: 201 | logger.debug( 202 | f"Generating issue search criteria for question: {query}") 203 | prompt = config.get("issue_search_criteria_prompt") 204 | response = await self.llm_handler.ainvoke(human_question=query, prompt=prompt, context=[draft_answer] if draft_answer is not None else None, output_structure=self._GitHubIssueSearchCriteria) 205 | logger.debug( 206 | f"For {query}, the search_criterias for issue search is: {response.search_criteria}") 207 | return response.search_criteria 208 | except Exception as e: 209 | logger.error(f"Error in get_issue_search_criteria: {e}") 210 | return [] 211 | 212 | def _arrange_code_search_result(self, code_search_result: List[Dict]) -> List[Dict[str, str]]: 213 | """ 214 | Arrange the result of Code search with metadata. 215 | 216 | Args: 217 | _arrange_code_search_result (dict): The result of Code search. 
218 | 219 | Returns: 220 | List[Dict[str, str]]: The arranged result of Code search with metadata. 221 | """ 222 | arranged_results = [] 223 | 224 | for result in code_search_result: 225 | content = result['content'] 226 | url = result.get('url', '') 227 | 228 | # Split content into chunks 229 | chunks = self._split_content_into_chunks( 230 | content, language=result['language'] if 'language' in result else None) 231 | 232 | for chunk in chunks: 233 | repository_full_name = result.get( 234 | 'repository_full_name', 'None') 235 | description = result.get('description', 'None') 236 | stargazers_count = result.get('stargazers_count', 'None') 237 | updated_at = result.get('updated_at', 'None') 238 | path = result.get('path', 'None') 239 | language = result.get('language', 'None') 240 | 241 | if updated_at != 'None': 242 | updated_at = updated_at.strftime('%Y-%m-%d %H:%M:%S %Z') 243 | 244 | chunk_text = ( 245 | f"Sample code from repository: {repository_full_name}\n" 246 | f"repository description: {description}\n" 247 | f"repository stars: {stargazers_count}\n" 248 | f"repository last updated: {updated_at}\n" 249 | f"code path in repository: {path}\n" 250 | f"programming language is: {language}\n\n" 251 | f"{chunk}" 252 | ) 253 | arranged_results.append({'context': chunk_text, 'url': url}) 254 | 255 | return arranged_results 256 | 257 | def _split_content_into_chunks(self, content: str, language: Optional[str] = None, max_tokens: Optional[int] = config.get('chunk_size')) -> List[str]: 258 | """ 259 | Split the content into chunks of maximum token length using LangChain's RecursiveCharacterTextSplitter. 260 | 261 | Args: 262 | content (str): The content to be split. 263 | language (Optional[str]): The programming language of the code. Defaults to None. 264 | 265 | Returns: 266 | list: A list of content chunks. 267 | """ 268 | chunk_overlap = math.ceil(max_tokens * 0.15 / 100) * 100 269 | 270 | if language is None or language.lower() not in [e.value for e in Language] or language.lower() in ['markdown', 'html', 'c', 'perl']: 271 | splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer( 272 | separators=[ 273 | "\n\n", 274 | "\n", 275 | "\r\n", 276 | ], 277 | chunk_size=max_tokens, 278 | chunk_overlap=chunk_overlap, 279 | tokenizer=self.llm_manager.tokenizer 280 | ) 281 | else: 282 | max_tokens = max_tokens * 5 283 | chunk_overlap = chunk_overlap * 5 284 | language_enum = Language[language.upper()] 285 | splitter = RecursiveCharacterTextSplitter.from_language( 286 | language=language_enum, 287 | chunk_size=max_tokens, 288 | chunk_overlap=chunk_overlap, 289 | ) 290 | 291 | chunks = splitter.split_text(content) 292 | return chunks 293 | 294 | def _arrange_issue_search_result(self, issue_search_result: dict) -> List[Dict[str, str]]: 295 | """ 296 | Arrange the result of Issue search with metadata. 297 | 298 | Args: 299 | _arrange_issue_search_result (dict): The result of Issue search. 300 | 301 | Returns: 302 | List[Dict[str, str]]: The arranged result of Issue search with metadata. 
303 | """ 304 | arranged_results = [] 305 | 306 | for result in issue_search_result: 307 | content = result['issue_content'] 308 | url = result.get('url', '') 309 | 310 | # Split content into chunks 311 | chunks = self._split_content_into_chunks( 312 | content, max_tokens=config.get('issue_chunk_size')) 313 | 314 | for chunk in chunks: 315 | arranged_results.append({'context': chunk, 'url': url}) 316 | 317 | return arranged_results 318 | 319 | def _arrange_repo_search_result(self, repo_search_result: dict) -> List[Dict[str, str]]: 320 | """ 321 | Arrange the result of Repo search with metadata. 322 | 323 | Args: 324 | _arrange_repo_search_result (dict): The result of Repo search. 325 | 326 | Returns: 327 | List[Dict[str, str]]: The arranged result of Repo search with metadata. 328 | """ 329 | arranged_results = [] 330 | 331 | for result in repo_search_result: 332 | content = result['content'] 333 | url = result.get('url', '') 334 | 335 | # Split content into chunks 336 | chunks = self._split_content_into_chunks( 337 | content, max_tokens=config.get('repo_chunk_size')) 338 | 339 | for chunk in chunks: 340 | arranged_results.append({'context': chunk, 'url': url}) 341 | 342 | return arranged_results 343 | 344 | def _arrange_google_search_result(self, google_search_result: dict) -> List[Dict[str, str]]: 345 | """ 346 | Arrange the result of Google search with metadata. 347 | 348 | Args: 349 | google_search_result (dict): The result of Google search. 350 | 351 | Returns: 352 | List[Dict[str, str]]: The arranged result of Google search with metadata. 353 | """ 354 | arranged_results = [] 355 | 356 | for result in google_search_result: 357 | content = result['content'] 358 | url = result.get('url', '') # Extract the URL if available 359 | 360 | # Split content into chunks 361 | chunks = self._split_content_into_chunks(content, max_tokens=config.get('google_chunk_size')) 362 | 363 | for chunk in chunks: 364 | arranged_results.append({'context': chunk, 'url': url}) 365 | 366 | return arranged_results 367 | 368 | def arrange_context(self, code_search_result: Optional[dict] = None, issue_search_result: Optional[dict] = None, 369 | repo_search_result: Optional[dict] = None, google_search_result: Optional[dict] = None) -> List[ 370 | Dict[str, str]]: 371 | """ 372 | Arrange the context before RAG with metadata. 373 | 374 | Args: 375 | code_search_result (dict, optional): The result of code search. Defaults to None. 376 | issue_search_result (dict, optional): The result of issue search. Defaults to None. 377 | repo_search_result (dict, optional): The result of repo search. Defaults to None. 378 | google_search_result (dict, optional): The result of Google search. Defaults to None. 379 | 380 | Returns: 381 | List[Dict[str, str]]: The arranged context with metadata. 382 | """ 383 | context = [] 384 | if code_search_result: 385 | context.extend(self._arrange_code_search_result(code_search_result)) 386 | if issue_search_result: 387 | context.extend(self._arrange_issue_search_result(issue_search_result)) 388 | if repo_search_result: 389 | context.extend(self._arrange_repo_search_result(repo_search_result)) 390 | if google_search_result: 391 | context.extend(self._arrange_google_search_result(google_search_result)) 392 | return context 393 | 394 | async def retrieve_topn_contexts(self, context_list: List[Dict[str, str]], query: str, answer: Optional[str] = None, 395 | top_n: Optional[int] = 5) -> List[Dict[str, str]]: 396 | """ 397 | Retrieve top n context dictionaries from the context list. 
398 | 
399 |         Args:
400 |             context_list (List[Dict[str, str]]): List of context dictionaries to retrieve top n from.
401 |                 Each dictionary should have at least 'context' and 'url' keys.
402 |             query (str): The query string.
403 |             answer (Optional[str]): The answer string (optional).
404 |             top_n (Optional[int]): Number of top context strings to retrieve (default: 5).
405 | 
406 |         Returns:
407 |             List[Dict[str, str]]: A list of top n context dictionaries.
408 |         """
409 |         top_contexts = []
410 |         try:
411 |             reranker = self.llm_manager.get_rerank_model()
412 | 
413 |             # Extract contexts from the dictionaries
414 |             contexts = [context_item['context'] for context_item in context_list]
415 | 
416 |             # Create sentence pairs for reranking
417 |             sentence_pairs = [[query, doc] for doc in contexts]
418 |             rerank_scores = reranker.compute_score(sentence_pairs)
419 | 
420 |             # Zip scores with context dictionaries
421 |             scored_contexts = list(zip(rerank_scores, context_list))
422 |             sorted_scored_contexts = sorted(
423 |                 scored_contexts, key=lambda x: x[0], reverse=True)
424 | 
425 |             # Extract top 3*top_n context dictionaries after rerank
426 |             selected_contexts = [context for score, context in
427 |                                  sorted_scored_contexts[:min(top_n * 3, len(sorted_scored_contexts))]]
428 | 
429 |             # If there are too few contexts, skip embedding comparison step
430 |             if len(selected_contexts) < top_n * 2:
431 |                 return selected_contexts[:min(top_n, len(selected_contexts))]
432 | 
433 |             # Calculate embeddings to select top 2*top_n
434 |             logger.debug("Embedding start...")
435 |             embedding_model = self.llm_manager.get_embedding_model()
436 |             query_embedding = embedding_model.encode(query + "\n" + (answer if answer is not None else ""))  # keep the query even when no draft answer is provided
437 |             context_embeddings = [embedding_model.encode(context_item['context']) for context_item in selected_contexts]
438 | 
439 |             # Calculate cosine similarities
440 |             cos_similarities = [
441 |                 (query_embedding @ context_embedding.T) / (norm(query_embedding) * norm(context_embedding))
442 |                 for context_embedding in context_embeddings]
443 | 
444 |             # Get top indices based on cosine similarities
445 |             top_indices = np.argsort(cos_similarities)[-(top_n * 2):][::-1]
446 |             top_contexts = [selected_contexts[i] for i in top_indices]
447 |             top_cos_similarities = [cos_similarities[i] for i in top_indices]
448 |             top_rerank_scores = [rerank_scores[contexts.index(context_item['context'])] for context_item in
449 |                                  top_contexts]
450 | 
451 |             # Use simple LLM to calculate context relevance scores
452 |             llm_scores = await asyncio.gather(
453 |                 *[self.get_context_relevance_score(query, context_item['context']) for context_item in top_contexts])
454 |             logger.debug(f"Simple LLM scores: {llm_scores}")
455 | 
456 |             # Combine scores for final ranking
457 |             combined_scores = [llm_score * cos_sim * rerank_score
458 |                                for llm_score, cos_sim, rerank_score in
459 |                                zip(llm_scores, top_cos_similarities, top_rerank_scores)]
460 | 
461 |             combined_context_scores = list(zip(top_contexts, combined_scores))
462 |             sorted_combined_context_scores = sorted(
463 |                 combined_context_scores, key=lambda x: x[1], reverse=True)
464 |             logger.debug(f"Combined sorted context scores: {sorted_combined_context_scores}")
465 | 
466 |             # Extract top n context dictionaries
467 |             top_contexts = [context for context, _ in sorted_combined_context_scores[:top_n]]
468 |             logger.debug(f"Final top contexts: {top_contexts}")
469 | 
470 |         except Exception as e:
471 |             logger.error(f"Error retrieving top n context: {e}")
472 | 
473 |         return top_contexts
474 | 
475 |     class
_ContextRelevanceScore(BaseModel): 476 | score: int = Field( 477 | ..., 478 | description="This is a Context Relevance Score, ranging from 0 to 100, indicates how well a given coding-related context supports answering a specific question, with higher scores signifying greater relevance." 479 | ) 480 | 481 | async def get_context_relevance_score(self, query: str, context: str) -> int: 482 | """ 483 | generate context relevance score based on user's question and provided context 484 | 485 | Args: 486 | query (str): user's initial question. 487 | context (str): context fetched from Github 488 | 489 | Returns: 490 | int: context relevance score, from 0-100. 491 | """ 492 | try: 493 | prompt = config.get("scoring_context_prompt") 494 | response = await self.llm_handler.ainvoke(human_question=query, prompt=prompt, context=[context], output_structure=self._ContextRelevanceScore, simple_llm=True) 495 | logger.debug( 496 | f"For {context[:20]}, the context relevance score is: {response.score}") 497 | return response.score 498 | except Exception as e: 499 | logger.error(f"Error in get_context_relevance_score: {e}") 500 | return -1 501 | -------------------------------------------------------------------------------- /llama_github/github_rag.py: -------------------------------------------------------------------------------- 1 | from llama_github.logger import logger 2 | from llama_github.config.config import config 3 | from typing import List, Optional, Any, Dict 4 | from dataclasses import dataclass 5 | from pprint import pformat 6 | 7 | from llama_github.llm_integration.initial_load import LLMManager 8 | from llama_github.rag_processing.rag_processor import RAGProcessor 9 | 10 | from llama_github.github_integration.github_auth_manager import GitHubAuthManager 11 | from llama_github.data_retrieval.github_api import GitHubAPIHandler 12 | from llama_github.data_retrieval.github_entities import Repository, RepositoryPool 13 | from llama_github.utils import AsyncHTTPClient 14 | 15 | import asyncio 16 | from IPython import get_ipython 17 | from concurrent.futures import ThreadPoolExecutor, as_completed 18 | from urllib.parse import quote 19 | 20 | 21 | @dataclass 22 | class GitHubAppCredentials: 23 | app_id: int 24 | private_key: str 25 | installation_id: int 26 | 27 | 28 | class GithubRAG: 29 | rag_processor: RAGProcessor = None 30 | simple_mode: bool = False 31 | 32 | def __init__(self, 33 | github_access_token: Optional[str] = None, 34 | github_app_credentials: Optional[GitHubAppCredentials] = None, 35 | openai_api_key: Optional[str] = None, 36 | mistral_api_key: Optional[str] = None, 37 | huggingface_token: Optional[str] = None, 38 | jina_api_key: Optional[str] = None, 39 | open_source_models_hg_dir: Optional[str] = None, 40 | embedding_model: Optional[str] = config.get( 41 | "default_embedding"), 42 | rerank_model: Optional[str] = config.get("default_reranker"), 43 | llm: Any = None, 44 | simple_mode: bool = False, 45 | **kwargs) -> None: 46 | """ 47 | Initialize the GithubRAG with the provided credentials and configuration. 48 | 49 | This constructor sets up the necessary components for GitHub integration, 50 | RAG processing, and LLM capabilities. It handles authentication, initializes 51 | the repository pool, and sets up the LLM manager. 52 | 53 | Args: 54 | github_access_token (Optional[str]): GitHub access token for authentication. 55 | github_app_credentials (Optional[GitHubAppCredentials]): Credentials for GitHub App authentication. 
56 | openai_api_key (Optional[str]): API key for OpenAI services (GPT-4-turbo will be used). 57 | mistral_api_key (Optional[str]): API key for Mistral AI services. 58 | huggingface_token (Optional[str]): Token for Hugging Face services (recommended). 59 | jina_api_key (Optional[str]): API key for Jina AI services (s.jina.ai API will be used). 60 | open_source_models_hg_dir (Optional[str]): Directory for open-source models from Hugging Face. 61 | embedding_model (Optional[str]): Name of the preferred embedding model from Hugging Face. 62 | rerank_model (Optional[str]): Name of the preferred rerank model from Hugging Face. 63 | llm (Any): Any LangChain LLM chat object to replace OpenAI or open-source models. 64 | simple_mode (bool): If True, skip embedding and rerank model initialization in LLMManager. 65 | **kwargs: Additional keyword arguments for repository pool configuration. 66 | 67 | Raises: 68 | Exception: If there's an error during initialization. 69 | """ 70 | try: 71 | logger.info("Initializing GithubRAG...") 72 | logger.debug("Initializing Github Instance...") 73 | 74 | self.simple_mode = simple_mode 75 | 76 | self.auth_manager = GitHubAuthManager() 77 | if github_access_token: 78 | self.github_instance = self.auth_manager.authenticate_with_token( 79 | github_access_token) 80 | elif github_app_credentials: 81 | self.github_instance = self.auth_manager.authenticate_with_app( 82 | github_app_credentials.app_id, github_app_credentials.private_key, github_app_credentials.installation_id) 83 | else: 84 | logger.error("GitHub credentials not provided.") 85 | logger.debug("Github Instance Initialized.") 86 | 87 | logger.debug("Initializing Repository Pool...") 88 | param_mapping = { 89 | "repo_cleanup_interval": "cleanup_interval", 90 | "repo_max_idle_time": "max_idle_time" 91 | } 92 | repo_pool_kwargs = { 93 | param_mapping[k]: v for k, v in kwargs.items() if k in param_mapping} 94 | self.RepositoryPool = RepositoryPool( 95 | self.github_instance, **repo_pool_kwargs) 96 | self.github_api_handler = GitHubAPIHandler(self.github_instance) 97 | logger.debug("Repository Pool Initialized.") 98 | 99 | self.jina_api_key = jina_api_key 100 | 101 | logger.debug( 102 | "Initializing llm manager, embedding model & reranker model...") 103 | self.llm_manager = LLMManager( 104 | openai_api_key, mistral_api_key, huggingface_token, open_source_models_hg_dir, embedding_model, rerank_model, llm, simple_mode=self.simple_mode) 105 | logger.debug( 106 | "LLM Manager, Embedding model & Reranker model Initialized.") 107 | 108 | self.rag_processor = RAGProcessor( 109 | self.github_api_handler, self.llm_manager) 110 | logger.info("GithubRAG initialization completed.") 111 | except Exception as e: 112 | logger.error(f"Error during GithubRAG initialization: {str(e)}") 113 | raise 114 | 115 | async def async_retrieve_context(self, query, simple_mode: Optional[bool] = None) -> List[str]: 116 | """ 117 | Asynchronously retrieve context based on the given query. 118 | 119 | This method orchestrates the context retrieval process, including Google search, 120 | code search, issue search, and repository search. It uses the RAG processor to 121 | analyze the query and retrieve the most relevant contexts. 122 | 123 | Args: 124 | query (str): The query to retrieve context for. 125 | simple_mode (Optional[bool]): If provided, overrides the instance's simple_mode setting. 126 | 127 | Returns: 128 | List[str]: A list of the most relevant context strings. 
129 | 130 | Raises: 131 | Exception: If there's an error during context retrieval. 132 | """ 133 | 134 | if simple_mode is None: 135 | simple_mode = self.simple_mode 136 | 137 | topn_contexts = [] # This will be the list of context strings 138 | try: 139 | logger.info("Retrieving context...") 140 | if simple_mode: 141 | # In simple mode, only a Google search will be conducted based on the user's question. 142 | # This model is not suitable for long questions (e.g., questions with more than 20 words). 143 | task_google_search = asyncio.create_task( 144 | self.google_search_retrieval(query=query)) 145 | await asyncio.gather(task_google_search) 146 | logger.debug( 147 | f"Google search: {str(len(task_google_search.result()))}") 148 | context_list = self.rag_processor.arrange_context( 149 | google_search_result=task_google_search.result()) 150 | if len(context_list) > 0: 151 | topn_contexts = await self.rag_processor.retrieve_topn_contexts( 152 | context_list=context_list, query=query, top_n=config.get("top_n_contexts")) 153 | else: 154 | # Analyzing question and generating strategy 155 | analyze_strategy = asyncio.create_task( 156 | self.rag_processor.analyze_question(query)) 157 | # wait for generate analyze strategy 158 | await analyze_strategy 159 | analyzed_strategy = analyze_strategy.result() 160 | logger.debug(f"Analyze strategy: {analyzed_strategy}") 161 | 162 | # google search from GitHub 163 | tokens = self.llm_manager.get_tokenizer().encode(query) 164 | query_tokens = len(tokens) 165 | task_google_search = asyncio.create_task( 166 | self.google_search_retrieval(query=query if query_tokens < 20 else analyzed_strategy[0])) 167 | # code search from GitHub 168 | task_code_search = asyncio.create_task( 169 | self.code_search_retrieval(query=analyzed_strategy[0], draft_answer=analyzed_strategy[1]+"\n\n"+analyzed_strategy[2])) 170 | # issue search from GitHub 171 | task_issue_search = asyncio.create_task( 172 | self.issue_search_retrieval(query=analyzed_strategy[0], draft_answer=analyzed_strategy[1]+"\n\n"+analyzed_strategy[3])) 173 | # repo search from GitHub 174 | task_repo_search = asyncio.create_task( 175 | self.repo_search_retrieval(query=analyzed_strategy[0])) 176 | 177 | # wait for all tasks to complete 178 | await asyncio.gather(task_google_search, task_code_search, task_issue_search, task_repo_search) 179 | 180 | logger.debug( 181 | f"Google search: {str(len(task_google_search.result()))}") 182 | logger.debug( 183 | f"Code search: {str(len(task_code_search.result()))}") 184 | logger.debug( 185 | f"Issue search: {str(len(task_issue_search.result()))}") 186 | logger.debug( 187 | f"Repo search: {str(len(task_repo_search.result()))}") 188 | 189 | context_list = self.rag_processor.arrange_context( 190 | code_search_result=task_code_search.result(), 191 | issue_search_result=task_issue_search.result(), 192 | repo_search_result=task_repo_search.result(), 193 | google_search_result=task_google_search.result()) 194 | 195 | if len(context_list) > 0: 196 | topn_contexts = await self.rag_processor.retrieve_topn_contexts( 197 | context_list=context_list, query=query, answer=analyzed_strategy[1], top_n=config.get("top_n_contexts")) 198 | 199 | logger.info("Context retrieved successfully.") 200 | except Exception as e: 201 | logger.error(f"Error retrieving context: {e}") 202 | raise e 203 | return topn_contexts 204 | 205 | def retrieve_context(self, query, simple_mode: Optional[bool] = None) -> List[str]: 206 | """ 207 | Retrieve context from GitHub code, issue and repo search based on the input 
query. 208 | 209 | This method serves as a wrapper for the async_retrieve_context method, 210 | handling the asynchronous call in different runtime environments (e.g., Jupyter notebook, 211 | asyncio event loop). 212 | 213 | Args: 214 | query (str): The query or question to retrieve context for. 215 | simple_mode (Optional[bool]): If provided, overrides the instance's simple_mode setting. 216 | 217 | Returns: 218 | List[str]: A list of context strings retrieved from the specified GitHub repositories. 219 | """ 220 | effective_simple_mode = self.simple_mode if simple_mode is None else simple_mode 221 | 222 | self.loop = asyncio.get_event_loop() 223 | ipython = get_ipython() 224 | if ipython and ipython.has_trait('kernel'): 225 | logger.debug("Running in Jupyter notebook, nest_asyncio applied.") 226 | import nest_asyncio 227 | nest_asyncio.apply() 228 | return asyncio.run(self.async_retrieve_context(query, simple_mode=effective_simple_mode)) 229 | if self.loop.is_running(): 230 | return asyncio.ensure_future(self.async_retrieve_context(query, simple_mode=effective_simple_mode)) 231 | return self.loop.run_until_complete(self.async_retrieve_context(query, simple_mode=effective_simple_mode)) 232 | 233 | async def code_search_retrieval(self, query, draft_answer: Optional[str] = None): 234 | """ 235 | Perform a code search on GitHub based on the given query and draft answer. 236 | 237 | This method uses the RAG processor to generate search criteria, then performs 238 | a code search using the GitHub API. It filters and deduplicates the results 239 | based on star count and relevance. 240 | 241 | Args: 242 | query (str): The main query for the code search. 243 | draft_answer (Optional[str]): A draft answer to refine the search criteria. 244 | 245 | Returns: 246 | list: A list of unique, relevant code search results. 247 | 248 | Raises: 249 | Exception: If there's an error during the code search retrieval. 250 | """ 251 | 252 | result = [] 253 | try: 254 | logger.info("Retrieving code search...") 255 | search_criterias = await self.rag_processor.get_code_search_criteria(query, draft_answer) 256 | for search_criteria in search_criterias: 257 | single_search_result = self.github_api_handler.search_code( 258 | search_criteria.replace('"', '')) 259 | for d in single_search_result: 260 | result.append(d) 261 | # deduplicate results 262 | seen = set() 263 | unique_list = [] 264 | for d in result: 265 | value = d["url"] 266 | if value not in seen and d["stargazers_count"] + config.get("code_search_max_hits") - d["index"] >= config.get("min_stars_to_keep_result"): 267 | seen.add(value) 268 | unique_list.append(d) 269 | result = unique_list 270 | 271 | logger.info("Code search retrieved successfully.") 272 | except Exception as e: 273 | logger.error(f"Error retrieving code search: {e}") 274 | 275 | return result 276 | 277 | async def issue_search_retrieval(self, query, draft_answer: Optional[str] = None): 278 | """ 279 | Perform an issue search on GitHub based on the given query and draft answer. 280 | 281 | This method uses the RAG processor to generate search criteria, then performs 282 | an issue search using the GitHub API. It deduplicates the results and transforms 283 | the API URLs to official GitHub issue webpage URLs. 284 | 285 | Args: 286 | query (str): The main query for the issue search. 287 | draft_answer (Optional[str]): A draft answer to refine the search criteria. 288 | 289 | Returns: 290 | list: A list of unique, relevant issue search results with transformed URLs. 
291 | 292 | Raises: 293 | Exception: If there's an error during the issue search retrieval. 294 | """ 295 | 296 | result = [] 297 | try: 298 | logger.info("Retrieving issue search...") 299 | search_criterias = await self.rag_processor.get_issue_search_criteria(query, draft_answer) 300 | for search_criteria in search_criterias: 301 | single_search_result = self.github_api_handler.search_issues( 302 | search_criteria.replace('"', '')) 303 | for d in single_search_result: 304 | result.append(d) 305 | # deduplicate results 306 | seen = set() 307 | unique_list = [] 308 | for d in result: 309 | api_url = d["url"] 310 | if api_url not in seen: 311 | seen.add(api_url) 312 | # Transform the API URL to the official GitHub issue webpage URL 313 | html_url = api_url.replace( 314 | 'api.github.com/repos', 'github.com').replace('issues/', 'issues/') 315 | d["url"] = html_url 316 | unique_list.append(d) 317 | result = unique_list 318 | 319 | logger.info("Issue search retrieved successfully.") 320 | except Exception as e: 321 | logger.error(f"Error retrieving issue search: {e}") 322 | 323 | return result 324 | 325 | async def google_search_retrieval(self, query): 326 | """ 327 | Perform a Google search for GitHub-related content based on the given query. 328 | 329 | This method uses the Jina AI search API to perform a Google search limited to 330 | GitHub.com. It then retrieves the content of the resulting GitHub URLs using 331 | the GitHub API. 332 | 333 | Args: 334 | query (str): The query to search for on Google. 335 | 336 | Returns: 337 | list: A list of dictionaries containing the GitHub URL and its content. 338 | 339 | Raises: 340 | Exception: If there's an error during the Google search retrieval. 341 | """ 342 | 343 | result = [] 344 | try: 345 | logger.info("Retrieving google search...") 346 | encoded_query = quote("site:github.com "+query) 347 | url = f"https://s.jina.ai/{encoded_query}" 348 | if self.jina_api_key is not None and self.jina_api_key != "": 349 | headers = { 350 | "Accept": "application/json", 351 | "Authorization": f"Bearer {self.jina_api_key}" 352 | } 353 | retry_delay = 1 354 | else: 355 | headers = { 356 | "Accept": "application/json" 357 | } 358 | retry_delay = 20 359 | 360 | response = await AsyncHTTPClient.request(url, headers=headers, retry_count=2, retry_delay=retry_delay) 361 | urls = [] 362 | urls = [item["url"] for item in response["data"] if "url" in item] 363 | 364 | for github_url in urls: 365 | content = self.github_api_handler.get_github_url_content( 366 | github_url) 367 | if content and content != "": 368 | result.append({ 369 | 'url': github_url, 370 | 'content': content 371 | }) 372 | logger.info(f"Google search retrieved successfully:{urls}") 373 | except Exception as e: 374 | logger.error(f"Error retrieving google search: {e}") 375 | return result 376 | 377 | def _get_repository_rag_info(self, repository: Repository): 378 | """ 379 | Retrieve RAG-related information for a given repository. 380 | 381 | This helper method fetches the README content and a simple structure 382 | of the repository using the Repository object. 383 | 384 | Args: 385 | repository (Repository): The Repository object to get information from. 386 | 387 | Returns: 388 | tuple: A tuple containing the repository's README content and simple structure. 
389 | """ 390 | 391 | return repository.get_readme(), self.rag_processor.get_repo_simple_structure(repository) 392 | 393 | async def repo_search_retrieval(self, query, draft_answer: Optional[str] = None): 394 | """ 395 | Perform a repository search on GitHub based on the given query and draft answer. 396 | 397 | This method uses the RAG processor to generate search criteria, then performs 398 | a repository search using the GitHub API. It retrieves README content and 399 | simple structure for each relevant repository concurrently. 400 | 401 | Args: 402 | query (str): The main query for the repository search. 403 | draft_answer (Optional[str]): A draft answer to refine the search criteria. 404 | 405 | Returns: 406 | list: A list of dictionaries containing repository information and content. 407 | 408 | Raises: 409 | Exception: If there's an error during the repository search retrieval. 410 | """ 411 | 412 | result = [] 413 | results_with_index = [] 414 | try: 415 | logger.info("Retrieving repo search...") 416 | search_criterias = await self.rag_processor.get_repo_search_criteria(query, draft_answer) 417 | for search_criteria in search_criterias: 418 | single_search_result = self.github_api_handler.search_repositories( 419 | search_criteria.replace('"', '')) 420 | for d in single_search_result: 421 | result.append(d) 422 | # deduplicate results 423 | seen = set() 424 | unique_list = [] 425 | for d in result: 426 | value = d.full_name 427 | if value not in seen: 428 | seen.add(value) 429 | unique_list.append(d) 430 | repositories = unique_list 431 | 432 | with ThreadPoolExecutor(max_workers=config.get("max_workers")) as executor: 433 | # Concurrently fetch the file content for each code search result 434 | future_to_index = {executor.submit( 435 | self._get_repository_rag_info, repository): index for index, repository in enumerate(repositories)} 436 | for future in as_completed(future_to_index): 437 | index = future_to_index[future] 438 | repository = repositories[index] 439 | try: 440 | repo_readme, repo_simple_structure = future.result() 441 | if repo_readme is None or repository.description is None or repository.description == "" or repo_simple_structure == "{}": 442 | continue 443 | if repo_readme: 444 | results_with_index.append({ 445 | 'index': index, 446 | 'full_name': repository.full_name, 447 | 'url': repository.html_url, 448 | 'content': repo_readme, 449 | }) 450 | # if repo_simple_structure: 451 | # results_with_index.append({ 452 | # 'index': index, 453 | # 'full_name': repository.full_name, 454 | # 'content': "The repository "+repository.full_name+" with description:" + repository.description+" has below repo simple structure:\n"+repo_simple_structure, 455 | # }) 456 | except Exception as e: 457 | logger.error( 458 | f"Error getting repository info: {e}") 459 | 460 | logger.info("Repo search retrieved successfully.") 461 | except Exception as e: 462 | logger.error(f"Error retrieving repos search: {e}") 463 | return results_with_index 464 | 465 | def answer_with_context(self, query: str, contexts: Optional[List[Dict[str, Any]]] = None, simple_mode=False) -> str: 466 | """ 467 | Generate an answer based on the given query and optional contexts. 468 | 469 | This method serves as a wrapper for the async_answer_with_context method, 470 | handling the asynchronous call in different runtime environments (e.g., Jupyter notebook, 471 | asyncio event loop). 472 | 473 | Args: 474 | query (str): The user's query. 
475 | contexts (Optional[List[Dict[str, Any]]]): Optional list of context dictionaries. 476 | Each dictionary should contain 'content' and 'url' keys. 477 | simple_mode (bool): Whether to use simple mode for context retrieval. 478 | 479 | Returns: 480 | str: The generated answer. 481 | """ 482 | 483 | self.loop = asyncio.get_event_loop() 484 | ipython = get_ipython() 485 | if ipython and ipython.has_trait('kernel'): 486 | logger.debug("Running in Jupyter notebook, nest_asyncio applied.") 487 | import nest_asyncio 488 | nest_asyncio.apply() 489 | return asyncio.run(self.async_answer_with_context(query, contexts, simple_mode)) 490 | if self.loop.is_running(): 491 | return asyncio.ensure_future(self.async_answer_with_context(query, contexts, simple_mode)) 492 | return self.loop.run_until_complete(self.async_answer_with_context(query, contexts, simple_mode)) 493 | 494 | async def async_answer_with_context(self, query: str, contexts: Optional[List[Dict[str, Any]]] = None, simple_mode=False) -> str: 495 | """ 496 | Asynchronously generate an answer based on the given query and optional contexts. 497 | 498 | This method retrieves contexts if not provided, extracts relevant information, 499 | and uses the RAG processor's LLM handler to generate an answer. 500 | 501 | Args: 502 | query (str): The user's query. 503 | contexts (Optional[List[Dict[str, Any]]]): Optional list of context dictionaries. 504 | Each dictionary should contain 'content' and 'url' keys. 505 | simple_mode (bool): Whether to use simple mode for context retrieval. 506 | 507 | Returns: 508 | str: The generated answer. 509 | """ 510 | 511 | if contexts is None: 512 | contexts = await self.async_retrieve_context(query, simple_mode) 513 | logger.debug(f"Retrieved contexts: {contexts}") 514 | context_contents = [context['context'] for context in contexts] 515 | context_urls = [context['url'] for context in contexts] 516 | 517 | answer = await self.rag_processor.llm_handler.ainvoke( 518 | human_question=query, 519 | context=context_contents, 520 | # context_urls=context_urls 521 | ) 522 | 523 | return answer 524 | --------------------------------------------------------------------------------
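A minimal end-to-end sketch of the `GithubRAG` API defined in `llama_github/github_rag.py` above. It is illustrative only: the environment variable names are assumptions, valid GitHub and OpenAI credentials plus network access are required, and `simple_mode=True` is chosen so no embedding or rerank models need to be loaded.

```python
import os

from llama_github.github_rag import GithubRAG

# Assumed environment variable names; adjust to your own setup.
rag = GithubRAG(
    github_access_token=os.environ["GITHUB_ACCESS_TOKEN"],
    openai_api_key=os.environ.get("OPENAI_API_KEY"),
    jina_api_key=os.environ.get("JINA_API_KEY"),  # optional; shortens Google-search retry delays
    simple_mode=True,                             # skip embedding/rerank model initialization
)

question = "How to create a NumPy array in Python?"

# Retrieve the top-ranked GitHub contexts; each returned item carries
# 'context' and 'url' fields (see retrieve_topn_contexts above).
contexts = rag.retrieve_context(question)
for item in contexts:
    print(item["url"])

# Or let the library retrieve contexts internally and generate an answer.
answer = rag.answer_with_context(question, simple_mode=True)
print(answer)
```

In simple mode only a Google (Jina) search is performed, which keeps startup and per-query cost low; the full mode additionally runs code, issue, and repository searches and reranks the combined results.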
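The utility classes in `llama_github/utils.py` can also be exercised on their own. The snippet below is a hedged illustration (the sample strings and the package being importable from the repository root are assumptions): it scrubs sensitive values from a prompt with `DataAnonymizer` and summarizes the imports of a code snippet with `CodeAnalyzer.analyze_imports`.

```python
from llama_github.utils import CodeAnalyzer, DataAnonymizer

anonymizer = DataAnonymizer()
scrubbed = anonymizer.anonymize_sensitive_data(
    "password = 'hunter2' and contact me at dev@example.com"
)
# Matched spans are replaced with hashed placeholders, so no raw secrets leave the process.
print(scrubbed)

snippet = "import os\nimport numpy\nfrom llama_github.utils import DataAnonymizer\n"
import_info, summary = CodeAnalyzer.analyze_imports(snippet)
print(summary)  # counts of standard-library / third-party / local / from-imports
```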