├── github
│   ├── __init__.py
│   └── api.py
├── utils
│   ├── __init__.py
│   └── rate_limiter.py
├── analyzers
│   ├── __init__.py
│   ├── base.py
│   ├── claude.py
│   └── gemini.py
├── visualization
│   ├── __init__.py
│   └── visualizer.py
├── requirements.txt
├── __init__.py
├── models.py
├── LICENSE
├── setup.py
├── README.md
├── prompts.py
├── notebooks
│   ├── analyze_code_reviews.py
│   └── analyze_code_reviews.ipynb
└── main.py

/github/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/analyzers/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/visualization/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
aiohttp>=3.8.0
google-generativeai>=0.3.0
anthropic>=0.7.0
openai>=1.0.0
pandas>=2.0.0
matplotlib>=3.7.0
seaborn>=0.12.0
python-dotenv>=1.0.0
pydantic>=2.0.0
typing-extensions>=4.5.0
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
"""Code Review Evaluator - A tool for analyzing code review comments from different AI models"""

from .github.api import GitHubAPI
from .analyzers.gemini import GeminiAnalyzer
from .visualization.visualizer import ResultsVisualizer
from .models import ReviewComment, PRDiff, CommentAnalysis, ReviewCommentResponse

__version__ = "0.1.0"
__all__ = [
    'GitHubAPI',
    'GeminiAnalyzer',
    'ResultsVisualizer',
    'ReviewComment',
    'PRDiff',
    'CommentAnalysis',
    'ReviewCommentResponse'
]
--------------------------------------------------------------------------------
/analyzers/base.py:
--------------------------------------------------------------------------------
from abc import ABC, abstractmethod
from typing import List, Dict
from models import ReviewComment, PRDiff

class BaseAnalyzer(ABC):
    """Base class for all review analyzers"""

    @abstractmethod
    async def analyze_diff(self, diff: PRDiff) -> List[ReviewComment]:
        """Analyze a PR diff to find potential issues"""
        pass

    @abstractmethod
    async def analyze_comment_quality(self, comments: List[ReviewComment]) -> Dict[str, Dict[str, float]]:
        """Analyze the quality of bot comments"""
        pass

    @abstractmethod
    async def analyze_comment_quality_in_batch(self, comments: List[ReviewComment]) -> Dict[str, Dict]:
        """Analyze comments in batches for more detailed analysis"""
        pass
--------------------------------------------------------------------------------
/models.py:
--------------------------------------------------------------------------------
from dataclasses import dataclass
from enum import Enum
from typing import List, Optional
from pydantic import BaseModel

@dataclass
class ReviewComment:
    file_name: str
    chunk: str
    comment: str
    line_nums: str
    bot_name: str
    pr_number: int
    category: Optional[str] = None

@dataclass
class PRDiff:
    pr_number: int
    diff_content: str
    files_changed: List[str]

class ImpactLevel(str, Enum):
    HIGH = "High"
    MEDIUM = "Medium"
    LOW = "Low"

class ReviewCommentCategory(str, Enum):
    CRITICAL_BUG = "Critical Bug"
    NITPICK = "Nitpick"
    OTHER = "Other"

class CommentAnalysis(BaseModel):
    comment: str
    category: ReviewCommentCategory
    impact: ImpactLevel
    reasoning: str
    file_name: Optional[str] = None
    line_numbers: Optional[str] = None

class ReviewCommentResponse(BaseModel):
    bot: str
    comments: List[CommentAnalysis]
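
# Usage sketch (illustrative values only): the review inputs are plain
# dataclasses, while the analysis models are pydantic and validate their
# enum-valued fields on construction.
if __name__ == "__main__":
    analysis = CommentAnalysis(
        comment="Shared cache is mutated without a lock",
        category="Critical Bug",  # coerced to ReviewCommentCategory.CRITICAL_BUG
        impact="High",            # coerced to ImpactLevel.HIGH
        reasoning="Two workers can interleave writes and corrupt the cache",
    )
    print(ReviewCommentResponse(bot="example-bot", comments=[analysis]).model_dump())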
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2025 Entelligence.ai

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup, find_packages

with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

with open("requirements.txt", "r", encoding="utf-8") as fh:
    requirements = [line.strip() for line in fh if line.strip() and not line.startswith("#")]

setup(
    name="code_review_evals",
    version="0.1.0",
    author="Entelligence AI",
    description="A tool to analyze and evaluate code review comments from different AI models",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/Entelligence-AI/code_review_evals",
    packages=find_packages(),
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
    ],
    python_requires=">=3.8",
    install_requires=requirements,
)
--------------------------------------------------------------------------------
/utils/rate_limiter.py:
--------------------------------------------------------------------------------
import asyncio
import time
import random
import logging
from typing import Callable, Any

logger = logging.getLogger(__name__)

class RateLimiter:
    def __init__(self, requests_per_minute=60):
        self.rate_limit = requests_per_minute
        self.tokens = requests_per_minute
        self.last_updated = time.time()
        self.interval = 60.0 / requests_per_minute
        self.lock = asyncio.Lock()

    async def acquire(self):
        async with self.lock:
            now = time.time()
            time_passed = now - self.last_updated
            self.tokens = min(
                self.rate_limit,
                self.tokens + time_passed * (self.rate_limit / 60.0)
            )
            self.last_updated = now

            if self.tokens < 1:
                sleep_time = (1 - self.tokens) * self.interval
                await asyncio.sleep(sleep_time)
                self.tokens = 0
                self.last_updated = time.time()
            else:
                self.tokens -= 1


async def make_api_call_with_backoff(func: Callable, *args, max_retries=5, initial_delay=1) -> Any:
    """Make API call with exponential backoff retry logic"""
    delay = initial_delay
    last_exception = None

    for retry in range(max_retries):
        try:
            return await asyncio.to_thread(func, *args)
        except Exception as e:
            last_exception = e

            if '429' in str(e):
                sleep_time = delay * (2 ** retry) + random.uniform(0, 0.1)
                logger.warning(f"Rate limit hit, retrying in {sleep_time:.2f} seconds...")
                await asyncio.sleep(sleep_time)
                continue
            else:
                raise

    logger.error(f"Failed after {max_retries} retries. Last error: {last_exception}")
    raise last_exception
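
# Usage sketch: RateLimiter is a token bucket (one token per request, refilled
# at requests_per_minute / 60 tokens per second), and make_api_call_with_backoff
# runs a *blocking* callable in a worker thread, retrying only on errors whose
# message contains "429". `sdk_call` below is hypothetical:
#
#     limiter = RateLimiter(requests_per_minute=30)
#     await limiter.acquire()
#     result = await make_api_call_with_backoff(sdk_call, prompt)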
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Code Review Evaluator

A tool to analyze and evaluate code review comments from different AI code review bots using LLMs.

## Features

- Fetches and analyzes Pull Request data from GitHub repositories
- Evaluates code review comments using Google's Gemini model
- Categorizes comments into:
  - Critical Bugs
  - Nitpicks
  - Other feedback
- Generates visual analysis and detailed reports

## Quick Start

1. Clone the repository:
```bash
git clone https://github.com/Entelligence-AI/code_review_evals.git
cd code_review_evals
```

2. Install dependencies:
```bash
pip install -r requirements.txt
```

3. Set up environment variables:
```bash
cp .env.example .env
# Edit .env and add your API keys
```

4. Run the analysis:
```bash
python main.py
```

## Environment Setup

Required environment variables in your `.env` file:
```
GITHUB_TOKEN=your_github_personal_access_token_here
GOOGLE_API_KEY=your_gemini_api_key_here
GITHUB_REPO=owner/repo  # default: microsoft/typescript
NUM_PRS=5               # number of PRs to analyze
```

To get the required API keys:
- GitHub Token: https://github.com/settings/tokens
  - Needs `repo` scope access
- Google API Key: https://makersuite.google.com/app/apikey
  - Enable Gemini API access

## Output

The tool generates several outputs in the `analysis_results` directory:
1. `comment_distribution.png` - Visual breakdown of comment categories
2. `bot_comparison.png` - Comparison of different bot performances
3. `analysis_report.txt` - Detailed metrics and analysis
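
## Programmatic Usage

The same pipeline that `main.py` drives can be called from your own code. A minimal sketch (it assumes `GITHUB_TOKEN` and `GOOGLE_API_KEY` are set, and that `owner/repo` is replaced with a real repository):

```python
import asyncio
import os

from github.api import GitHubAPI
from analyzers.gemini import GeminiAnalyzer
from visualization.visualizer import ResultsVisualizer

async def run():
    github = GitHubAPI(os.environ["GITHUB_TOKEN"], "owner/repo")
    analyzer = GeminiAnalyzer(os.environ["GOOGLE_API_KEY"])

    comments = []
    for pr in await github.fetch_recent_prs(limit=2):
        # Collect existing bot comments plus Gemini's own findings on the diff
        comments += await github.fetch_pr_comments(pr["number"])
        comments += await analyzer.analyze_diff(await github.fetch_pr_diff(pr["number"]))

    # Returns {'metrics': ..., 'classifications': ...}
    results = await analyzer.analyze_comment_quality_in_batch(comments)
    ResultsVisualizer().save_detailed_report(results, "analysis_report.txt")

asyncio.run(run())
```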

## Alternative Usage: Jupyter Notebook

For interactive analysis, you can use the provided notebook:
```bash
jupyter notebook notebooks/analyze_code_reviews.ipynb
```

## Development

Project structure:
```
code_review_evals/
├── analyzers/       # Analysis modules for different LLMs
├── github/          # GitHub API interaction
├── utils/           # Utility functions
├── visualization/   # Visualization tools
├── models.py        # Data models
├── prompts.py       # LLM prompts
├── main.py          # Main execution script
└── requirements.txt
```

## Contributing

1. Fork the repository
2. Create a feature branch
3. Submit a pull request

## License

This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
--------------------------------------------------------------------------------
/github/api.py:
--------------------------------------------------------------------------------
import aiohttp
import logging
from typing import List
from models import ReviewComment, PRDiff

logger = logging.getLogger(__name__)

class GitHubAPI:
    def __init__(self, token: str, repo: str):
        self.token = token
        self.repo = repo
        self.headers = {
            "Authorization": f"Bearer {token}",
            "Accept": "application/vnd.github.v3+json"
        }
        self.diff_headers = {
            "Authorization": f"Bearer {token}",
            "Accept": "application/vnd.github.v3.diff"
        }

    async def fetch_recent_prs(self, limit: int = 10) -> List[dict]:
        """Fetch recent PRs from the repository"""
        async with aiohttp.ClientSession(headers=self.headers) as session:
            url = f"https://api.github.com/repos/{self.repo}/pulls"
            prs = []
            page = 1

            while len(prs) < limit:
                params = {
                    "state": "all",
                    "per_page": min(10, limit - len(prs)),
                    "page": page,
                    "sort": "created",
                    "direction": "desc"
                }

                async with session.get(url, params=params) as response:
                    if response.status != 200:
                        raise Exception(f"Failed to fetch PRs: {await response.text()}")

                    batch = await response.json()
                    if not batch:
                        break

                    prs.extend(batch)
                    page += 1

            return prs[:limit]


    async def fetch_pr_diff(self, pr_number: int) -> PRDiff:
        """Fetch the diff content for a PR"""
        logger.info(f"Fetching PR {pr_number}")
        async with aiohttp.ClientSession(headers=self.diff_headers) as session:
            url = f"https://api.github.com/repos/{self.repo}/pulls/{pr_number}"

            async with session.get(url) as response:
                if response.status != 200:
                    raise Exception(f"Failed to fetch PR diff: {await response.text()}")

                diff_content = await response.text()
                files_changed = []

                # More robust file path extraction
                for line in diff_content.split("\n"):
                    if line.startswith("+++ b/"):
                        try:
                            # Remove the "+++ b/" prefix to get the file path
                            file_path = line[6:]  # "+++ b/" is 6 characters
                            if file_path and file_path != '/dev/null':  # Skip deleted files
                                files_changed.append(file_path)
                        except Exception as e:
                            logger.warning(f"Could not parse file path from line: {line}")
                            continue

                logger.debug(f"Found {len(files_changed)} changed files in PR {pr_number}")
                return PRDiff(
                    pr_number=pr_number,
                    diff_content=diff_content,
                    files_changed=files_changed
                )


    async def fetch_pr_comments(self, pr_number: int) -> List[ReviewComment]:
        """Fetch review comments for a PR"""
        async with aiohttp.ClientSession(headers=self.headers) as session:
            url = f"https://api.github.com/repos/{self.repo}/pulls/{pr_number}/comments"

            async with session.get(url) as response:
                if response.status != 200:
                    raise Exception(f"Failed to fetch PR comments: {await response.text()}")

                comments = await response.json()
                return [
                    ReviewComment(
                        file_name=comment['path'],
                        chunk=comment.get('diff_hunk', ''),
                        comment=comment['body'],
                        line_nums=f"{comment.get('line', '')}-{comment.get('original_line', '')}",
                        bot_name=comment['user']['login'],
                        pr_number=pr_number
                    )
                    for comment in comments
                    if 'bot' in comment['user']['type'].lower()
                ]
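
# For reference, the GitHub diff media type returns standard git unified-diff
# output, e.g.:
#
#   diff --git a/src/app.py b/src/app.py
#   --- a/src/app.py
#   +++ b/src/app.py
#   @@ -10,6 +10,8 @@
#
# which is why fetch_pr_diff keys on the "+++ b/" prefix to collect changed
# file paths and skips "/dev/null" (file deletions).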
--------------------------------------------------------------------------------
/prompts.py:
--------------------------------------------------------------------------------
"""
Centralized storage for all prompts used in code review analysis.
Each prompt is organized by analyzer type and purpose.
"""

# Common templates that can be used across different analyzers
DIFF_ANALYSIS_TEMPLATE = """You are a senior staff principal engineer performing a security and functionality focused code review.
Go through this PR diff line by line, focusing ONLY on bugs that could cause:
1. Runtime errors or crashes
2. Race conditions
3. Memory leaks
4. State management issues
5. Security vulnerabilities
6. Data loss or corruption
7. Performance issues
8. Resource leaks

Analyze this diff:
{diff}

Remember: Only report issues that could actually break functionality or corrupt data at runtime."""

COMMENT_CATEGORIES = """
1. CRITICAL_BUG: Comments identifying serious issues that could cause crashes, data loss, security vulnerabilities, etc.
2. NITPICK: Minor suggestions about style, formatting, variable names, or trivial changes that don't affect functionality
3. OTHER: Everything else - general suggestions, questions, or feedback that don't fit the above"""

# Gemini-specific prompts
GEMINI_PROMPTS = {
    "diff_analysis": f"""{DIFF_ANALYSIS_TEMPLATE}

The output format should be the following JSON EXACTLY:
{{{{
    "issues": [
        {{{{
            "bug_description": "1-2 line description of how this bug impacts runtime behavior and how to fix it",
            "severity": "HIGH|MEDIUM|LOW based on potential user impact",
            "bug_type": "RACE_CONDITION|STATE_MANAGEMENT|MEMORY_LEAK|SECURITY|CRASH|CORRUPTION",
            "file_name": "Affected file",
            "line_numbers": "Relevant line numbers",
            "snippet": "Code showing the bug"
        }}}}
    ]
}}}}""",

    "comment_categorization": f"""As a senior engineer, analyze these code review comments and categorize each one into exactly ONE of:
{COMMENT_CATEGORIES}

PR #{{pr_number}} by {{bot_name}}:
{{comments}}

Respond with a JSON array where each object has:
{{{{
    "comment_index": "",
    "category": "CRITICAL_BUG|NITPICK|OTHER",
    "reasoning": "Brief explanation of why this category was chosen"
}}}}
IMPORTANT: Each comment MUST be categorized. The category field MUST be exactly one of CRITICAL_BUG, NITPICK, or OTHER."""
}

# Claude-specific prompts. Note: the shared template already embeds the diff
# via its {diff} placeholder, so it is not repeated here, and literal JSON
# braces are quadrupled so they survive both the f-string and str.format().
CLAUDE_PROMPTS = {
    "diff_analysis": f"""{DIFF_ANALYSIS_TEMPLATE}

For each issue found, provide detailed analysis following this structure:
{{{{
    "description": "Clear explanation of the bug and its impact",
    "severity": "HIGH|MEDIUM|LOW",
    "category": "SECURITY|RACE_CONDITION|MEMORY_LEAK|PERFORMANCE|CRASH",
    "file": "affected_file.ext",
    "lines": "line numbers",
    "code": "relevant code snippet",
    "fix": "suggested fix approach"
}}}}""",

    "comment_categorization": f"""Analyze these code review comments and categorize each as either:
{COMMENT_CATEGORIES}

PR #{{pr_number}} comments from {{bot_name}}:
{{comments}}

Respond with a JSON array of objects:
[
    {{{{
        "comment_index": number,
        "category": "CRITICAL_BUG|NITPICK|OTHER",
        "reasoning": "Brief explanation"
    }}}}
]"""
}

# GPT-4-specific prompts
GPT4_PROMPTS = {
    "diff_analysis": f"""{DIFF_ANALYSIS_TEMPLATE}

Provide analysis in this JSON format:
{{{{
    "issues": [
        {{{{
            "description": "Clear explanation of the bug",
            "severity": "HIGH|MEDIUM|LOW",
            "category": "SECURITY|RACE_CONDITION|MEMORY_LEAK|PERFORMANCE|CRASH",
            "file": "affected_file",
            "lines": "line numbers",
            "code": "relevant snippet",
            "fix": "suggested fix"
        }}}}
    ]
}}}}""",

    "comment_categorization": f"""Analyze code review comments and categorize each one.
Categories:
{COMMENT_CATEGORIES}

Analyze these code review comments from PR #{{pr_number}} by {{bot_name}}:

{{comments}}

Categorize each comment and explain your reasoning. Respond in JSON format:
{{{{
    "comments": [
        {{{{
            "index": number,
            "category": "CRITICAL_BUG|NITPICK|OTHER",
            "reasoning": "Brief explanation"
        }}}}
    ]
}}}}"""
}
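
# Self-check sketch of the escaping convention: literal JSON braces are written
# doubled ('{{'/'}}'; quadrupled inside the f-strings above) so that exactly one
# level survives for str.format(). "PLACEHOLDER_DIFF" is just example input.
if __name__ == "__main__":
    rendered = GEMINI_PROMPTS["diff_analysis"].format(diff="PLACEHOLDER_DIFF")
    assert "PLACEHOLDER_DIFF" in rendered and "{{" not in rendered
    print(rendered)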
--------------------------------------------------------------------------------
/notebooks/analyze_code_reviews.py:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Code Review Analysis with AI Models\n",
    "\n",
    "This notebook analyzes code reviews from any GitHub repository using AI models.\n",
    "\n",
    "## Setup\n",
    "First, let's install requirements and clone the repository:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Clone the repository\n",
    "!git clone https://github.com/Entelligence-AI/code_review_evals.git\n",
    "%cd code_review_evals\n",
    "\n",
    "# Install requirements\n",
    "!pip install -r requirements.txt\n",
    "!pip install nest-asyncio  # Required for running async code in notebooks"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration\n",
    "Enter your API keys and repository details below:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "cellView": "form"
   },
   "source": [
    "#@title API Keys and Repository Settings\n",
    "GITHUB_TOKEN = \"\" #@param {type:\"string\"}\n",
    "GOOGLE_API_KEY = \"\" #@param {type:\"string\"}\n",
    "GITHUB_REPO = \"microsoft/typescript\" #@param {type:\"string\"}\n",
    "NUM_PRS = 5 #@param {type:\"slider\", min:1, max:100, step:1}\n",
    "\n",
    "import os\n",
    "os.environ['GITHUB_TOKEN'] = GITHUB_TOKEN\n",
    "os.environ['GOOGLE_API_KEY'] = GOOGLE_API_KEY\n",
    "\n",
    "if not GITHUB_TOKEN or not GOOGLE_API_KEY:\n",
    "    raise ValueError(\"Please provide both GITHUB_TOKEN and GOOGLE_API_KEY\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Run Analysis\n",
    "Now let's analyze the repository using our code review evaluation tools:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "import asyncio\n",
    "import nest_asyncio\n",
    "nest_asyncio.apply()\n",
    "\n",
    "from github.api import GitHubAPI\n",
    "from analyzers.gemini import GeminiAnalyzer\n",
    "from visualization.visualizer import ResultsVisualizer\n",
    "\n",
    "async def analyze_repository():\n",
    "    # Initialize components\n",
    "    github = GitHubAPI(GITHUB_TOKEN, GITHUB_REPO)\n",
    "    analyzer = GeminiAnalyzer(GOOGLE_API_KEY)\n",
    "    visualizer = ResultsVisualizer()\n",
    "\n",
    "    print(f\"Analyzing {GITHUB_REPO}...\")\n",
    "\n",
    "    # Fetch and analyze PRs\n",
    "    prs = await github.fetch_recent_prs(NUM_PRS)\n",
    "    print(f\"Fetched {len(prs)} PRs\")\n",
    "\n",
    "    comments = []\n",
    "    for pr in prs:\n",
    "        pr_number = pr['number']\n",
    "        print(f\"Processing PR #{pr_number}...\")\n",
    "\n",
    "        try:\n",
    "            # Fetch PR data\n",
    "            pr_comments = await github.fetch_pr_comments(pr_number)\n",
    "            diff = await github.fetch_pr_diff(pr_number)\n",
    "\n",
    "            # Analyze with Gemini\n",
    "            gemini_comments = await analyzer.analyze_diff(diff)\n",
    "\n",
    "            comments.extend(pr_comments)\n",
    "            comments.extend(gemini_comments)\n",
    "\n",
    "        except Exception as e:\n",
    "            print(f\"Error processing PR #{pr_number}: {str(e)}\")\n",
    "            continue\n",
    "\n",
    "    # Analyze quality and generate visualizations\n",
    "    print(\"\\nAnalyzing comment quality...\")\n",
    "    analysis_results = await analyzer.analyze_comment_quality_in_batch(comments)\n",
    "\n",
    "    # Create visualizations\n",
    "    print(\"\\nGenerating visualizations...\")\n",
    "    visualizer.create_impact_distribution_chart(\n",
    "        analysis_results['metrics'],\n",
    "        'comment_distribution.png'\n",
    "    )\n",
    "\n",
    "    visualizer.create_bot_comparison_chart(\n",
    "        analysis_results['metrics'],\n",
    "        'bot_comparison.png'\n",
    "    )\n",
    "\n",
    "    visualizer.save_detailed_report(\n",
    "        analysis_results,\n",
    "        'analysis_report.txt'\n",
    "    )\n",
    "\n",
    "    return analysis_results\n",
    "\n",
    "# Run the analysis\n",
    "results = await analyze_repository()\n",
    "\n",
    "# Display results\n",
    "from IPython.display import Image, Markdown\n",
    "display(Markdown(\"## Comment Distribution\"))\n",
    "display(Image('comment_distribution.png'))\n",
    "display(Markdown(\"## Bot Comparison\"))\n",
    "display(Image('bot_comparison.png'))\n",
    "\n",
    "# Print summary metrics\n",
    "print(\"\\nSummary Metrics:\")\n",
    "for bot, metrics in results['metrics'].items():\n",
    "    print(f\"\\nBot: {bot}\")\n",
    "    print(f\"Total Comments: {metrics['total_comments']}\")\n",
    "    print(f\"Critical Bug Ratio: {metrics['critical_bug_ratio']:.1%}\")\n",
    "    print(f\"Nitpick Ratio: {metrics['nitpick_ratio']:.1%}\")\n",
    "    print(f\"Other Ratio: {metrics['other_ratio']:.1%}\")"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "name": "Code Review Analysis",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
--------------------------------------------------------------------------------
/analyzers/claude.py:
--------------------------------------------------------------------------------
import json
import logging
from collections import defaultdict
import anthropic
from typing import List, Dict

from .base import BaseAnalyzer
from models import ReviewComment, PRDiff
from utils.rate_limiter import RateLimiter, make_api_call_with_backoff

logger = logging.getLogger(__name__)

class ClaudeReviewAnalyzer(BaseAnalyzer):
    def __init__(self, api_key: str, requests_per_minute: int = 60):
        self.client = anthropic.Anthropic(api_key=api_key)
        self.rate_limiter = RateLimiter(requests_per_minute)

    async def analyze_diff(self, diff: PRDiff) -> List[ReviewComment]:
        """Analyze a PR diff using Claude"""
        # Literal JSON braces are doubled so they survive str.format() below.
        prompt = """You are a senior staff principal engineer performing a security and functionality focused code review.
Analyze this PR diff for potential bugs and issues:

{diff}

Focus on identifying serious issues that could cause:
1. Runtime errors or crashes
2. Race conditions
3. Memory leaks
4. State management issues
5. Security vulnerabilities
6. Data loss or corruption
7. Performance issues
8. Resource leaks

For each issue found, provide detailed analysis following this structure:
{{
    "description": "Clear explanation of the bug and its impact",
    "severity": "HIGH|MEDIUM|LOW",
    "category": "SECURITY|RACE_CONDITION|MEMORY_LEAK|PERFORMANCE|CRASH",
    "file": "affected_file.ext",
    "lines": "line numbers",
    "code": "relevant code snippet",
    "fix": "suggested fix approach"
}}

Only include serious technical issues that could affect runtime behavior or security."""

        try:
            def make_api_call():
                return self.client.messages.create(
                    model="claude-3-opus-20240229",
                    max_tokens=4096,
                    messages=[{
                        "role": "user",
                        "content": prompt.format(diff=diff.diff_content)
                    }]
                )

            response = await make_api_call_with_backoff(make_api_call)

            try:
                results = json.loads(response.content[0].text)
                if not isinstance(results, list):
                    results = [results]

                return [
                    ReviewComment(
                        file_name=result['file'],
                        chunk=result['code'],
                        comment=f"{result['description']}\n\nSuggested fix: {result['fix']}",
                        line_nums=result['lines'],
                        bot_name='claude',
                        pr_number=diff.pr_number,
                        category=result['category']
                    )
                    for result in results
                ]
            except json.JSONDecodeError as e:
                logger.error(f"Error parsing Claude response: {str(e)}")
                return []

        except Exception as e:
            logger.error(f"Error analyzing diff with Claude: {str(e)}")
            return []

    async def analyze_comment_quality(self, comments: List[ReviewComment]) -> Dict[str, Dict[str, float]]:
        """Analyze the quality of bot comments using Claude"""
        bot_metrics = defaultdict(lambda: {
            'critical_bug_ratio': 0.0,
            'nitpick_ratio': 0.0,
            'other_ratio': 0.0,
            'total_comments': 0
        })

        # Group comments by bot and PR
        bot_pr_comments = defaultdict(lambda: defaultdict(list))
        for comment in comments:
            bot_pr_comments[comment.bot_name][comment.pr_number].append(comment)

        categorization_prompt = """Analyze these code review comments and categorize each as either:
1. CRITICAL_BUG: Comments identifying serious technical issues
2. NITPICK: Minor style/formatting suggestions
3. OTHER: General feedback or questions

PR #{pr_number} comments from {bot_name}:
{comments}

Respond with a JSON array of objects:
[
    {{
        "comment_index": number,
        "category": "CRITICAL_BUG|NITPICK|OTHER",
        "reasoning": "Brief explanation"
    }}
]"""

        for bot_name, pr_comments in bot_pr_comments.items():
            for pr_number, comment_list in pr_comments.items():
                try:
                    formatted_comments = "\n\n".join([
                        f"Comment {i}:\nFile: {c.file_name}\nLines: {c.line_nums}\nComment: {c.comment}\nCode:\n{c.chunk}"
                        for i, c in enumerate(comment_list)
                    ])

                    def make_api_call():
                        return self.client.messages.create(
                            model="claude-3-opus-20240229",
                            max_tokens=4096,
                            messages=[{
                                "role": "user",
                                "content": categorization_prompt.format(
                                    pr_number=pr_number,
                                    bot_name=bot_name,
                                    comments=formatted_comments
                                )
                            }]
                        )

                    response = await make_api_call_with_backoff(make_api_call)

                    try:
                        results = json.loads(response.content[0].text)
                        if not isinstance(results, list):
                            results = [results]

                        bot_metrics[bot_name]['total_comments'] += len(comment_list)
                        for result in results:
                            category = result['category']
                            if category == 'CRITICAL_BUG':
                                bot_metrics[bot_name]['critical_bug_ratio'] += 1
                            elif category == 'NITPICK':
                                bot_metrics[bot_name]['nitpick_ratio'] += 1
                            else:
                                bot_metrics[bot_name]['other_ratio'] += 1

                    except json.JSONDecodeError as e:
                        logger.error(f"Error parsing Claude categorization response: {str(e)}")
                        continue

                except Exception as e:
                    logger.error(f"Error processing comments with Claude for {bot_name} PR #{pr_number}: {str(e)}")
                    continue

        # Convert counts to ratios
        for bot in bot_metrics:
            total = bot_metrics[bot]['total_comments']
            if total > 0:
                for metric in ['critical_bug_ratio', 'nitpick_ratio', 'other_ratio']:
                    bot_metrics[bot][metric] /= total

        return dict(bot_metrics)

    async def analyze_comment_quality_in_batch(self, comments: List[ReviewComment]) -> Dict[str, Dict]:
        """Batch analysis entry point required by BaseAnalyzer.

        Per-comment classifications are not implemented for Claude yet, so this
        reuses analyze_comment_quality and returns empty classifications.
        """
        metrics = await self.analyze_comment_quality(comments)
        return {'metrics': metrics, 'classifications': {}}
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import asyncio
import logging
import os
import re
from typing import List

from github.api import GitHubAPI
from analyzers.gemini import GeminiAnalyzer
from visualization.visualizer import ResultsVisualizer
from models import ReviewComment
from dotenv import load_dotenv

# Load environment variables at the start of the script
load_dotenv()

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


def parse_comments_from_log(comments_log_path: str) -> List[ReviewComment]:
    """Parse review comments from log file"""
    comments = []
    current_comment = None
    current_pr = None
    current_section = None
    current_content = []

    def save_current_comment():
        nonlocal current_comment
        if current_comment:
            if current_comment.comment:
                # NOTE: assumed pattern -- strips hidden HTML comments (such as
                # bot metadata blocks) from the comment body
                current_comment.comment = re.sub(r'<!--.*?-->', '',
                    current_comment.comment, flags=re.DOTALL).strip()
            if current_comment.chunk:
                current_comment.chunk = current_comment.chunk.strip()
            comments.append(current_comment)
            current_comment = None


    def process_section_content():
        nonlocal current_content, current_section, current_comment
        if not current_comment or not current_section:
            return

        content = '\n'.join(current_content).strip()
        if current_section == 'comment':
            current_comment.comment = content
        elif current_section == 'code':
            current_comment.chunk = content

        current_content = []
        current_section = None


    with open(comments_log_path, 'r') as log_file:
        for line in log_file:
            line = line.rstrip('\n')

            if line.startswith('=== PR'):
                process_section_content()
                save_current_comment()
                try:
                    current_pr = int(re.search(r'PR #(\d+)', line).group(1))
                except (AttributeError, ValueError):
                    logger.warning(f"Could not parse PR number from line: {line}")
                continue

            # "*********" is the separator write_comment_to_log emits between comments
            if line.startswith('*' * 9):
                process_section_content()
                save_current_comment()
                continue

            if line.startswith('Bot:'):
                process_section_content()
                save_current_comment()
                current_comment = ReviewComment(
                    file_name='',
                    chunk='',
                    comment='',
                    line_nums='',
                    bot_name=line.split('Bot: ')[1].strip(),
                    pr_number=current_pr or 0
                )
                continue

            if current_comment:
                if line.startswith('File:'):
                    process_section_content()
                    current_comment.file_name = line.split('File: ')[1].strip()
                elif line.startswith('Lines:'):
                    process_section_content()
                    current_comment.line_nums = line.split('Lines: ')[1].strip()
                elif line.startswith('Comment:'):
                    process_section_content()
                    current_section = 'comment'
                    if len(line) > 8:
                        current_content.append(line.split('Comment: ')[1])
                elif line.startswith('Code Snippet:'):
                    process_section_content()
                    current_section = 'code'
                elif current_section:
                    current_content.append(line)

        process_section_content()
        save_current_comment()

    return [c for c in comments if c.comment.strip() and not re.match(r'^[:;][\w-]+[:;]$', c.comment.strip())]


def write_comment_to_log(file_handle, comment: ReviewComment):
    """Write a comment to the log file"""
    file_handle.write(f"Bot: {comment.bot_name}\n")
    if comment.file_name:
        file_handle.write(f"File: {comment.file_name}\n")
    if comment.line_nums:
        file_handle.write(f"Lines: {comment.line_nums}\n")
    file_handle.write(f"Comment: {comment.comment}\n")
    if comment.chunk:
        file_handle.write("Code Snippet:\n")
        file_handle.write(f"{comment.chunk}\n")
    file_handle.write("*" * 9 + "\n")


async def main():
    # Load configuration from environment
    GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
    REPO = os.getenv("GITHUB_REPO", "microsoft/typescript")
    GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
    NUM_PRS = int(os.getenv("NUM_PRS", "100"))  # number of PRs to analyze

    if not all([GITHUB_TOKEN, GOOGLE_API_KEY]):
        raise ValueError("Missing required environment variables. Please set GITHUB_TOKEN and GOOGLE_API_KEY")

    # Initialize components
    github = GitHubAPI(GITHUB_TOKEN, REPO)
    analyzer = GeminiAnalyzer(GOOGLE_API_KEY)
    visualizer = ResultsVisualizer()

    try:
        output_dir = 'analysis_results'
        os.makedirs(output_dir, exist_ok=True)
        comments_log_path = os.path.join(output_dir, 'pr_comments.txt')

        # Either load existing comments or fetch new ones
        if os.path.exists(comments_log_path) and os.path.getsize(comments_log_path) > 0:
            logger.info("Reading comments from existing log file...")
            comments = parse_comments_from_log(comments_log_path)
            logger.info(f"Loaded {len(comments)} comments from log file")
        else:
            logger.info("Fetching new PR comments...")
            prs = await github.fetch_recent_prs(limit=NUM_PRS)
            comments = []

            with open(comments_log_path, 'w') as comments_log:
                for pr in prs:
                    pr_number = pr['number']
                    logger.info(f"Processing PR #{pr_number}...")

                    comments_log.write(f"=== PR #{pr_number} Comments ===\n")
                    comments_log.write(f"PR Title: {pr.get('title', 'No Title')}\n")
                    comments_log.write(f"PR URL: {pr.get('html_url', 'No URL')}\n\n")

                    try:
                        logger.info(f"Fetching PR {pr_number}")
                        diff = await github.fetch_pr_diff(pr_number)
                        logger.info(f"Fetching comments for PR {pr_number}")
                        bot_comments = await github.fetch_pr_comments(pr_number)
                        logger.info(f"Analyzing PR for {pr_number}")
                        gemini_comments = await analyzer.analyze_diff(diff)

                        for comment in bot_comments + gemini_comments:
                            write_comment_to_log(comments_log, comment)
                            comments.append(comment)

                    except Exception as e:
                        logger.error(f"Error processing PR #{pr_number}: {str(e)}")
                        comments_log.write(f"Error processing PR #{pr_number}: {str(e)}\n\n")
                        continue

        # Analyze comments and generate reports
        logger.info("Analyzing comment quality...")
        analysis_results = await analyzer.analyze_comment_quality_in_batch(comments)

        logger.info("Generating visualizations and reports...")

        # Create visualizations
        visualizer.create_impact_distribution_chart(
            analysis_results['metrics'],
            os.path.join(output_dir, 'comment_distribution.png')
        )

        visualizer.create_bot_comparison_chart(
            analysis_results['metrics'],
            os.path.join(output_dir, 'bot_comparison.png')
        )

        # Generate reports
        visualizer.save_detailed_report(
            analysis_results,
            os.path.join(output_dir, 'analysis_report.txt')
        )

        logger.info(f"""
        Analysis complete! Results saved in {output_dir}:
        1. comment_distribution.png - Visual breakdown of comment categories
        2. bot_comparison.png - Radar chart comparing bot performance
        3. analysis_report.txt - Detailed metrics and analysis
        """)

    except Exception as e:
        logger.error(f"Error in main execution: {str(e)}")
        raise

if __name__ == "__main__":
    asyncio.run(main())
--------------------------------------------------------------------------------
/visualization/visualizer.py:
--------------------------------------------------------------------------------
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime
from typing import Dict, Any
from collections import defaultdict

class ResultsVisualizer:
    @staticmethod
    def create_impact_distribution_chart(metrics: Dict[str, Dict[str, float]], output_file: str):
        """Create a stacked bar chart showing comment category distribution by bot"""
        data = []
        for bot, scores in metrics.items():
            data.append({
                'Bot': bot,
                'Critical Bugs': scores['critical_bug_ratio'],
                'Nitpicks': scores['nitpick_ratio'],
                'Other': scores['other_ratio']
            })

        df = pd.DataFrame(data)

        ax = df.plot(
            x='Bot',
            y=['Critical Bugs', 'Nitpicks', 'Other'],
            kind='bar',
            stacked=True,
            color=['#ff6b6b', '#4ecdc4', '#45b7d1'],
            figsize=(12, 6)
        )

        plt.title('Comment Category Distribution by Code Review Bot', pad=20, fontsize=14)
        plt.xlabel('Bot', fontsize=12)
        plt.ylabel('Ratio of Comments', fontsize=12)
        plt.legend(title='Category', bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)
        plt.grid(axis='y', linestyle='--', alpha=0.7)

        # Add percentage labels (values are ratios, so render them as percentages)
        for c in ax.containers:
            ax.bar_label(c, fmt=lambda v: f'{v:.1%}', label_type='center')

        plt.tight_layout()
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()

    @staticmethod
    def create_bot_comparison_chart(metrics: Dict[str, Dict[str, float]], output_file: str):
        """Create a radar chart comparing different aspects of bot performance"""
        bots = list(metrics.keys())
        metrics_list = ['critical_bug_ratio', 'nitpick_ratio', 'other_ratio']

        angles = np.linspace(0, 2*np.pi, len(metrics_list), endpoint=False)
        angles = np.concatenate((angles, [angles[0]]))

        fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(projection='polar'))

        for idx, bot in enumerate(bots):
            values = [metrics[bot][m] for m in metrics_list]
            values = np.concatenate((values, [values[0]]))

            ax.plot(angles, values, 'o-', linewidth=2, label=bot)
            ax.fill(angles, values, alpha=0.25)

        ax.set_xticks(angles[:-1])
        ax.set_xticklabels(['Critical Bugs', 'Nitpicks', 'Other'])

        plt.title('Bot Performance Comparison', pad=20, fontsize=14)
        plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))

        plt.tight_layout()
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()

    @staticmethod
    def save_metrics_report(metrics: Dict[str, Dict[str, float]], output_file: str):
        """Generate basic metrics report"""
        with open(output_file, 'w') as f:
            ResultsVisualizer._write_report_header(f)
            ResultsVisualizer._write_overall_stats(f, metrics)
            ResultsVisualizer._write_per_bot_analysis(f, metrics)
            ResultsVisualizer._write_summary_table(f, metrics)

    @staticmethod
    def save_detailed_report(analysis_results: Dict[str, Any], output_file: str):
        """Generate detailed report including per-comment analysis"""
        metrics = analysis_results['metrics']
        classifications = analysis_results['classifications']

        with open(output_file, 'w') as f:
            ResultsVisualizer._write_report_header(f)
            ResultsVisualizer._write_overall_stats(f, metrics)
            ResultsVisualizer._write_per_bot_analysis(f, metrics)
            ResultsVisualizer._write_detailed_classifications(f, classifications)
            ResultsVisualizer._write_summary_table(f, metrics)

    @staticmethod
    def _write_report_header(f):
        f.write("Code Review Bot Analysis Report\n")
        f.write("=" * 80 + "\n")
        f.write(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")

    @staticmethod
    def _write_overall_stats(f, metrics):
        total_comments = sum(m['total_comments'] for m in metrics.values())
        total_critical = sum(m['critical_bug_ratio'] * m['total_comments'] for m in metrics.values())
        total_nitpicks = sum(m['nitpick_ratio'] * m['total_comments'] for m in metrics.values())
        total_other = sum(m['other_ratio'] * m['total_comments'] for m in metrics.values())

        f.write("Overall Statistics\n")
        f.write("-" * 30 + "\n")
        f.write(f"Total Comments Analyzed: {total_comments}\n")
        f.write(f"Total Critical Bugs Found: {total_critical:.0f} ({total_critical/total_comments:.1%})\n")
        f.write(f"Total Nitpicks Made: {total_nitpicks:.0f} ({total_nitpicks/total_comments:.1%})\n")
        f.write(f"Total Other Comments: {total_other:.0f} ({total_other/total_comments:.1%})\n\n")

    @staticmethod
    def _write_per_bot_analysis(f, metrics):
        f.write("\nPer-Bot Analysis\n")
        f.write("=" * 80 + "\n")

        for bot, scores in metrics.items():
            f.write(f"\nBot: {bot}\n")
            f.write(f"{'-' * (len(bot) + 5)}\n")
            f.write(f"Total Comments: {scores['total_comments']}\n")
            f.write(f"Critical Bug Ratio: {scores['critical_bug_ratio']:.1%}\n")
            f.write(f"Nitpick Ratio: {scores['nitpick_ratio']:.1%}\n")
            f.write(f"Other Feedback Ratio: {scores['other_ratio']:.1%}\n")

            # Calculate raw numbers
            critical_count = int(scores['critical_bug_ratio'] * scores['total_comments'])
            nitpick_count = int(scores['nitpick_ratio'] * scores['total_comments'])
            other_count = int(scores['other_ratio'] * scores['total_comments'])

            f.write("\nRaw Numbers:\n")
            f.write(f"- Critical Bugs: {critical_count}\n")
            f.write(f"- Nitpicks: {nitpick_count}\n")
            f.write(f"- Other Comments: {other_count}\n\n")

    @staticmethod
    def _write_detailed_classifications(f, classifications):
        f.write("\nDetailed Classifications\n")
        f.write("=" * 80 + "\n")

        for bot_name, pr_data in classifications.items():
            f.write(f"\nBot: {bot_name}\n")
            f.write(f"{'-' * (len(bot_name) + 5)}\n")

            for pr_number, comments in pr_data.items():
                f.write(f"\nPR #{pr_number}\n")
                f.write("~" * 20 + "\n")

                # Group comments by category
                grouped_comments = defaultdict(list)
                for comment in comments:
                    grouped_comments[comment['category']].append(comment)

                for category in ['CRITICAL_BUG', 'NITPICK', 'OTHER']:
                    if category in grouped_comments:
                        f.write(f"\n{category} Comments:\n")
                        f.write("-" * 20 + "\n")

                        for comment in grouped_comments[category]:
                            f.write(f"\nComment {comment['comment_index']} ")
                            f.write(f"(File: {comment['file_name']}, Lines: {comment['line_nums']})\n")
                            f.write("Comment: " + comment['comment'].strip() + "\n")
                            f.write("Code:\n" + comment['code_chunk'].strip() + "\n")
                            f.write("Reasoning: " + comment['reasoning'].strip() + "\n")
                            f.write("-" * 40 + "\n")

                f.write("\n")

    @staticmethod
    def _write_summary_table(f, metrics):
        f.write("\nSummary Table\n")
        f.write("=" * 80 + "\n")
        f.write(f"{'Bot Name':<20} {'Total':<10} {'Critical':<15} {'Nitpicks':<15} {'Other':<15}\n")
        f.write("-" * 80 + "\n")

        for bot, scores in metrics.items():
            total = scores['total_comments']
            critical = scores['critical_bug_ratio'] * total
            nitpicks = scores['nitpick_ratio'] * total
            other = scores['other_ratio'] * total

            f.write(f"{bot:<20} {total:<10d} {critical:>6.0f} ({scores['critical_bug_ratio']:>3.0%}) "
                    f"{nitpicks:>6.0f} ({scores['nitpick_ratio']:>3.0%}) "
                    f"{other:>6.0f} ({scores['other_ratio']:>3.0%})\n")

        f.write("\nNote: Percentages may not sum to 100% due to rounding\n")
--------------------------------------------------------------------------------
/notebooks/analyze_code_reviews.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "13d7e989",
   "metadata": {},
   "source": [
    "# Code Review Analyzer Demo\n",
    "\n",
    "This notebook demonstrates how to use the Code Review Analyzer to analyze and evaluate code review comments from different AI code review bots.\n",
    "\n",
    "## Setup\n",
    "First, let's install the required packages and configure our environment."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "72f70bfe",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install required packages\n",
    "!pip install aiohttp google-generativeai anthropic openai pandas matplotlib seaborn python-dotenv pydantic typing-extensions\n",
    "\n",
    "# Clone the repository (if running in Colab)\n",
    "!git clone https://github.com/Entelligence-AI/code_review_evals.git\n",
    "%cd code_review_evals"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "48520a48",
   "metadata": {},
   "source": [
    "## Configuration\n",
    "Enter your API keys below. These will be stored only for this session."
38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "id": "e69bfc8b", 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "import os\n", 48 | "from dotenv import load_dotenv\n", 49 | "\n", 50 | "# Load from .env file if it exists\n", 51 | "load_dotenv()\n", 52 | "\n", 53 | "# Set your API keys here (or in .env file)\n", 54 | "os.environ['GITHUB_TOKEN'] = os.getenv('GITHUB_TOKEN', 'your_github_token')\n", 55 | "os.environ['GOOGLE_API_KEY'] = os.getenv('GOOGLE_API_KEY', 'your_gemini_api_key')\n", 56 | "os.environ['ANTHROPIC_KEY'] = os.getenv('ANTHROPIC_KEY', 'your_claude_api_key') \n", 57 | "os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your_openai_api_key')" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "id": "01d39269", 63 | "metadata": {}, 64 | "source": [ 65 | "## Importing Required Modules" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "id": "897ddbdb", 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "import asyncio\n", 76 | "import logging\n", 77 | "from datetime import datetime\n", 78 | "\n", 79 | "from code_review_analyzer.github import GitHubAPI\n", 80 | "from code_review_analyzer.analyzers import GeminiReviewAnalyzer, ClaudeReviewAnalyzer, GPT4ReviewAnalyzer\n", 81 | "from code_review_analyzer.visualization import ResultsVisualizer\n", 82 | "\n", 83 | "# Configure logging\n", 84 | "logging.basicConfig(\n", 85 | " level=logging.INFO,\n", 86 | " format='%(asctime)s - %(levelname)s - %(message)s'\n", 87 | ")\n", 88 | "logger = logging.getLogger(__name__)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "id": "ec9b6db1", 94 | "metadata": {}, 95 | "source": [ 96 | "## Analyzing Code Reviews\n", 97 | "Now let's analyze some code reviews from a GitHub repository." 
98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "id": "2ba0aa42", 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "async def analyze_reviews(repo_name: str, num_prs: int = 5):\n", 108 | " \"\"\"\n", 109 | " Analyze code reviews from a GitHub repository\n", 110 | " \n", 111 | " Args:\n", 112 | " repo_name: GitHub repository in format 'owner/repo'\n", 113 | " num_prs: Number of most recent PRs to analyze\n", 114 | " \"\"\"\n", 115 | " # Initialize components\n", 116 | " github = GitHubAPI(os.environ['GITHUB_TOKEN'], repo_name)\n", 117 | " \n", 118 | " # Initialize analyzers\n", 119 | " analyzers = {\n", 120 | " 'gemini': GeminiReviewAnalyzer(os.environ['GOOGLE_API_KEY']),\n", 121 | " 'claude': ClaudeReviewAnalyzer(os.environ['ANTHROPIC_KEY']),\n", 122 | " 'gpt4': GPT4ReviewAnalyzer(os.environ['OPENAI_API_KEY'])\n", 123 | " }\n", 124 | " \n", 125 | " visualizer = ResultsVisualizer()\n", 126 | " \n", 127 | " try:\n", 128 | " # Fetch recent PRs\n", 129 | " logger.info(f\"Fetching {num_prs} recent PRs from {repo_name}...\")\n", 130 | " prs = await github.fetch_recent_prs(limit=num_prs)\n", 131 | " \n", 132 | " # Collect and analyze comments\n", 133 | " all_comments = []\n", 134 | " for pr in prs:\n", 135 | " pr_number = pr['number']\n", 136 | " logger.info(f\"Processing PR #{pr_number}...\")\n", 137 | " \n", 138 | " # Fetch PR data\n", 139 | " comments = await github.fetch_pr_comments(pr_number)\n", 140 | " diff = await github.fetch_pr_diff(pr_number)\n", 141 | " \n", 142 | " # Analyze with each bot\n", 143 | " for name, analyzer in analyzers.items():\n", 144 | " bot_comments = await analyzer.analyze_diff(diff)\n", 145 | " all_comments.extend(bot_comments)\n", 146 | " \n", 147 | " # Add existing bot comments\n", 148 | " all_comments.extend(comments)\n", 149 | " \n", 150 | " # Analyze comment quality\n", 151 | " logger.info(\"Analyzing comment quality...\")\n", 152 | " metrics = {}\n", 153 | " for name, analyzer in analyzers.items():\n", 154 | " metrics[name] = await analyzer.analyze_comment_quality(all_comments)\n", 155 | " \n", 156 | " # Generate visualizations\n", 157 | " logger.info(\"Generating visualizations...\")\n", 158 | " output_dir = 'analysis_results'\n", 159 | " os.makedirs(output_dir, exist_ok=True)\n", 160 | " \n", 161 | " visualizer.create_impact_distribution_chart(\n", 162 | " metrics,\n", 163 | " os.path.join(output_dir, 'comment_distribution.png')\n", 164 | " )\n", 165 | " \n", 166 | " visualizer.create_bot_comparison_chart(\n", 167 | " metrics,\n", 168 | " os.path.join(output_dir, 'bot_comparison.png')\n", 169 | " )\n", 170 | " \n", 171 | " visualizer.save_metrics_report(\n", 172 | " metrics,\n", 173 | " os.path.join(output_dir, 'analysis_report.txt')\n", 174 | " )\n", 175 | " \n", 176 | " visualizer.export_to_excel(\n", 177 | " metrics,\n", 178 | " os.path.join(output_dir, 'metrics.xlsx')\n", 179 | " )\n", 180 | " \n", 181 | " logger.info(f\"\"\"\n", 182 | " Analysis complete! Results saved in {output_dir}:\n", 183 | " 1. comment_distribution.png - Visual breakdown of comment categories\n", 184 | " 2. bot_comparison.png - Radar chart comparing bot performance\n", 185 | " 3. analysis_report.txt - Detailed metrics and statistics\n", 186 | " 4. 
metrics.xlsx - Excel file with all metrics\n", 187 | " \"\"\")\n", 188 | " \n", 189 | " return metrics\n", 190 | " \n", 191 | " except Exception as e:\n", 192 | " logger.error(f\"Error during analysis: {str(e)}\")\n", 193 | " raise" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "id": "5bf7edc5", 199 | "metadata": {}, 200 | "source": [ 201 | "## Running the Analysis\n", 202 | "Let's analyze a repository. You can modify the repository name and number of PRs to analyze." 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "id": "1609135f", 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "# Repository to analyze\n", 213 | "REPO_NAME = \"microsoft/typescript\" # Example repository\n", 214 | "NUM_PRS = 5 # Number of PRs to analyze\n", 215 | "\n", 216 | "# Run the analysis\n", 217 | "metrics = await analyze_reviews(REPO_NAME, NUM_PRS)\n", 218 | "\n", 219 | "# Display results\n", 220 | "from IPython.display import Image\n", 221 | "Image('analysis_results/comment_distribution.png')" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "id": "3d8095cb", 227 | "metadata": {}, 228 | "source": [ 229 | "## Viewing the Results\n", 230 | "The analysis results are saved in the `analysis_results` directory. You can also explore the metrics dictionary directly:" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "id": "3e5e7b97", 237 | "metadata": { 238 | "lines_to_next_cell": 3 239 | }, 240 | "outputs": [], 241 | "source": [ 242 | "# Print metrics summary\n", 243 | "for bot, data in metrics.items():\n", 244 | " print(f\"\\nBot: {bot}\")\n", 245 | " print(\"-\" * 20)\n", 246 | " for metric, value in data.items():\n", 247 | " if metric == 'total_comments':\n", 248 | " print(f\"{metric}: {value}\")\n", 249 | " else:\n", 250 | " print(f\"{metric}: {value:.1%}\")" 251 | ] 252 | } 253 | ], 254 | "metadata": { 255 | "jupytext": { 256 | "cell_metadata_filter": "-all", 257 | "main_language": "python", 258 | "notebook_metadata_filter": "-all" 259 | } 260 | }, 261 | "nbformat": 4, 262 | "nbformat_minor": 5 263 | } 264 | -------------------------------------------------------------------------------- /analyzers/gemini.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from collections import defaultdict 4 | import google.generativeai as genai 5 | from typing import List, Dict, Any 6 | 7 | from .base import BaseAnalyzer 8 | from models import ReviewComment, PRDiff 9 | from utils.rate_limiter import RateLimiter, make_api_call_with_backoff 10 | from prompts import GEMINI_PROMPTS 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | class GeminiAnalyzer(BaseAnalyzer): 15 | def __init__(self, api_key: str, requests_per_minute: int = 60): 16 | genai.configure(api_key=api_key) 17 | self.model = genai.GenerativeModel("gemini-1.5-flash-002") 18 | self.rate_limiter = RateLimiter(requests_per_minute) 19 | 20 | 21 | async def analyze_diff(self, diff: PRDiff) -> List[ReviewComment]: 22 | """Analyze a PR diff using Gemini""" 23 | 24 | try: 25 | def make_api_call(): 26 | logger.info("Make API call") 27 | generation_config = genai.GenerationConfig( 28 | response_mime_type="application/json" 29 | ) 30 | 31 | print(diff.diff_content) 32 | prompt = GEMINI_PROMPTS["diff_analysis"].format(diff=diff.diff_content) 33 | logger.info(f"Prompt: {prompt}") 34 | return self.model.generate_content( 35 | 
GEMINI_PROMPTS["diff_analysis"].format(diff=diff.diff_content), 36 | generation_config=generation_config 37 | ) 38 | 39 | response = await make_api_call_with_backoff(make_api_call) 40 | response_text = response.text if hasattr(response, 'text') else response.parts[0].text 41 | 42 | # Log the raw response for debugging 43 | logger.debug(f"Raw Gemini response: {response_text}") 44 | 45 | try: 46 | parsed_response = json.loads(response_text) 47 | # Check if response has 'issues' key 48 | if isinstance(parsed_response, dict) and 'issues' in parsed_response: 49 | results = parsed_response['issues'] 50 | else: 51 | # If no 'issues' key, treat the whole response as the results 52 | results = [parsed_response] if isinstance(parsed_response, dict) else parsed_response 53 | 54 | # Filter out any malformed results 55 | valid_results = [] 56 | for result in results: 57 | if all(key in result for key in ['file_name', 'snippet', 'bug_description', 'line_numbers']): 58 | valid_results.append(result) 59 | else: 60 | logger.warning(f"Skipping malformed result: {result}") 61 | 62 | return [ 63 | ReviewComment( 64 | file_name=result['file_name'], 65 | chunk=result['snippet'], 66 | comment=result['bug_description'], 67 | line_nums=result['line_numbers'], 68 | bot_name='gemini', 69 | pr_number=diff.pr_number, 70 | ) 71 | for result in valid_results 72 | ] 73 | 74 | except json.JSONDecodeError as e: 75 | logger.error(f"Failed to parse Gemini response as JSON: {str(e)}") 76 | logger.error(f"Response text: {response_text}") 77 | return [] 78 | 79 | except Exception as e: 80 | logger.error(f"Error analyzing diff: {str(e)}") 81 | logger.debug("Full error:", exc_info=True) 82 | return [] 83 | 84 | 85 | async def analyze_comment_quality_in_batch(self, comments: List[ReviewComment]) -> Dict[str, Dict]: 86 | """Analyze comments in batches with detailed classification""" 87 | BATCH_SIZE = 25 88 | 89 | bot_metrics = defaultdict(lambda: { 90 | 'critical_bug_ratio': 0.0, 91 | 'nitpick_ratio': 0.0, 92 | 'other_ratio': 0.0, 93 | 'total_comments': 0 94 | }) 95 | 96 | classifications = defaultdict(lambda: defaultdict(list)) 97 | 98 | # Group comments by bot and PR 99 | bot_pr_comments = defaultdict(lambda: defaultdict(list)) 100 | for comment in comments: 101 | bot_pr_comments[comment.bot_name][comment.pr_number].append(comment) 102 | 103 | for bot_name, pr_comments in bot_pr_comments.items(): 104 | for pr_number, comment_list in pr_comments.items(): 105 | for i in range(0, len(comment_list), BATCH_SIZE): 106 | batch = comment_list[i:i + BATCH_SIZE] 107 | 108 | try: 109 | formatted_comments = self._format_comments_for_analysis(batch) 110 | analysis_results = await self._analyze_batch( 111 | bot_name, pr_number, formatted_comments 112 | ) 113 | 114 | self._update_metrics_and_classifications( 115 | bot_metrics, classifications, bot_name, pr_number, 116 | analysis_results, batch, i 117 | ) 118 | 119 | except Exception as e: 120 | logger.error(f"Error processing batch for {bot_name} PR #{pr_number}: {str(e)}") 121 | continue 122 | 123 | return { 124 | 'metrics': self._finalize_metrics(bot_metrics), 125 | 'classifications': dict(classifications) 126 | } 127 | 128 | 129 | async def analyze_comment_quality(self, comments: List[ReviewComment]) -> Dict[str, Dict[str, float]]: 130 | """Simple comment quality analysis without detailed classifications""" 131 | result = await self.analyze_comment_quality_in_batch(comments) 132 | return result['metrics'] 133 | 134 | 135 | def _format_comments_for_analysis(self, comments: 
List[ReviewComment]) -> str: 136 | return "\n\n".join([ 137 | f"Comment {i}:\nFile: {c.file_name}\nLines: {c.line_nums}\n" 138 | f"Comment: {c.comment}\nCode:\n{c.chunk}" 139 | for i, c in enumerate(comments) 140 | ]) 141 | 142 | 143 | async def _analyze_batch(self, bot_name: str, pr_number: int, formatted_comments: str) -> List[Dict]: 144 | def make_api_call(): 145 | return self.model.generate_content( 146 | GEMINI_PROMPTS["comment_categorization"].format( 147 | pr_number=pr_number, 148 | bot_name=bot_name, 149 | comments=formatted_comments 150 | ), 151 | generation_config=genai.GenerationConfig( 152 | response_mime_type="application/json" 153 | ) 154 | ) 155 | 156 | response = await make_api_call_with_backoff(make_api_call) 157 | response_text = response.text if hasattr(response, 'text') else response.parts[0].text 158 | 159 | try: 160 | results = json.loads(response_text) 161 | return results if isinstance(results, list) else [results] 162 | except json.JSONDecodeError as e: 163 | logger.error(f"Error parsing Gemini response: {str(e)}") 164 | return [] 165 | 166 | def _update_metrics_and_classifications( 167 | self, 168 | bot_metrics: dict, 169 | classifications: dict, 170 | bot_name: str, 171 | pr_number: int, 172 | analysis_results: List[Dict], 173 | batch: List[ReviewComment], 174 | batch_start_idx: int 175 | ): 176 | bot_metrics[bot_name]['total_comments'] += len(batch) 177 | 178 | for idx, result in enumerate(analysis_results): 179 | category = result['category'] 180 | comment = batch[idx] 181 | 182 | # Update metrics 183 | if category == 'CRITICAL_BUG': 184 | bot_metrics[bot_name]['critical_bug_ratio'] += 1 185 | elif category == 'NITPICK': 186 | bot_metrics[bot_name]['nitpick_ratio'] += 1 187 | else: 188 | bot_metrics[bot_name]['other_ratio'] += 1 189 | 190 | # Store classification 191 | classifications[bot_name][pr_number].append({ 192 | 'file_name': comment.file_name, 193 | 'line_nums': comment.line_nums, 194 | 'comment': comment.comment, 195 | 'code_chunk': comment.chunk, 196 | 'category': category, 197 | 'reasoning': result.get('reasoning', 'No reasoning provided'), 198 | 'comment_index': batch_start_idx + idx 199 | }) 200 | 201 | def _finalize_metrics(self, bot_metrics: dict) -> Dict[str, Dict[str, float]]: 202 | final_metrics = {} 203 | 204 | for bot, metrics in bot_metrics.items(): 205 | total = metrics['total_comments'] 206 | if total > 0: 207 | final_metrics[bot] = { 208 | 'critical_bug_ratio': metrics['critical_bug_ratio'] / total, 209 | 'nitpick_ratio': metrics['nitpick_ratio'] / total, 210 | 'other_ratio': metrics['other_ratio'] / total, 211 | 'total_comments': total 212 | } 213 | else: 214 | final_metrics[bot] = { 215 | 'critical_bug_ratio': 0.0, 216 | 'nitpick_ratio': 0.0, 217 | 'other_ratio': 0.0, 218 | 'total_comments': 0 219 | } 220 | 221 | return final_metrics 222 | 223 | --------------------------------------------------------------------------------