├── assets └── pipeline.png ├── data_synthesis ├── synthesized_data │ └── eval_50k.jsonl ├── config.example.json ├── __init__.py ├── config.py ├── validate_data.py ├── cli.py ├── inconsistency_rules.py ├── rule_applicability_checker.py ├── llm_interface.py ├── data_validator.py └── data_generator.py ├── LEGAL.md ├── README.md ├── evaluation ├── clone_repos.py ├── fewshot │ └── consistency_checker.py ├── pure_llm │ └── consistency_checker.py ├── CoT │ └── consistency_checker.py └── evaluate_main.py └── LICENSE /assets/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codefuse-ai/CodeFuse-CommitEval/main/assets/pipeline.png -------------------------------------------------------------------------------- /data_synthesis/synthesized_data/eval_50k.jsonl: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:2232f1293dbbb794e5705a2af11813240d4e3ac0b7cdc1297221e33fdf112078 3 | size 280641409 4 | -------------------------------------------------------------------------------- /LEGAL.md: -------------------------------------------------------------------------------- 1 | 法律免责声明 2 | 关于代码注释部分,中文注释为官方版本,其它语言注释仅做参考。中文注释可能与其它语言注释存在不一致,当中文注释与其它语言注释存在不一致时,请以中文注释为准。 3 | 4 | Legal Disclaimer 5 | Within this source code, the comments in Chinese shall be the original, governing version. Any comment in other languages are for reference only. In the event of any conflict between the Chinese language version comments and other language version comments, the Chinese language version shall prevail. 6 | -------------------------------------------------------------------------------- /data_synthesis/config.example.json: -------------------------------------------------------------------------------- 1 | { 2 | "llm": { 3 | "provider": "openai", 4 | "api_key": "your_openai_api_key_here", 5 | "base_url": "", 6 | "model": "gpt-3.5-turbo", 7 | "max_retries": 3 8 | }, 9 | "generation": { 10 | "inconsistency_ratio": 1.0, 11 | "random_seed": 42, 12 | "batch_size": 10, 13 | "max_samples": null 14 | }, 15 | "logging": { 16 | "level": "INFO", 17 | "file": null 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /data_synthesis/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data synthesis package - for generating test data with inconsistent commit messages and code diffs 3 | """ 4 | 5 | from .data_generator import InconsistentDataGenerator, setup_logging 6 | from .inconsistency_rules import InconsistencyRuleManager, InconsistencyType, InconsistencyRule 7 | from .llm_interface import LLMManager, LangChainOpenAIInterface, MockLLMInterface 8 | from .config import DataSynthesisConfig, default_config 9 | 10 | __version__ = "1.0.0" 11 | __author__ = "Research Team" 12 | 13 | __all__ = [ 14 | 'InconsistentDataGenerator', 15 | 'InconsistencyRuleManager', 16 | 'InconsistencyType', 17 | 'InconsistencyRule', 18 | 'LLMManager', 19 | 'LangChainOpenAIInterface', 20 | 'MockLLMInterface', 21 | 'DataSynthesisConfig', 22 | 'default_config', 23 | 'setup_logging' 24 | ] 25 | -------------------------------------------------------------------------------- /data_synthesis/config.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data synthesis configuration 3 | """ 4 | 5 | import os 6 | from typing import Dict, Any 7 | 8 | 9 | class 
DataSynthesisConfig: 10 | """Data synthesis configuration class""" 11 | 12 | def __init__(self): 13 | # LLM configuration 14 | self.llm_config = { 15 | 'provider': os.getenv('LLM_PROVIDER', 'openai'), # openai 16 | 'api_key': os.getenv('LLM_API_KEY', 'empty'), 17 | 'base_url': os.getenv('LLM_BASE_URL', ''), 18 | 'model': os.getenv('LLM_MODEL', 'gpt-3.5-turbo'), 19 | 'max_retries': int(os.getenv('LLM_MAX_RETRIES', '3')) 20 | } 21 | 22 | # Data generation configuration 23 | self.generation_config = { 24 | 'inconsistency_ratio': float(os.getenv('INCONSISTENCY_RATIO', '1.0')), 25 | 'random_seed': int(os.getenv('RANDOM_SEED', '777')), 26 | 'batch_size': int(os.getenv('BATCH_SIZE', '10')), 27 | 'max_samples': int(os.getenv('MAX_SAMPLES', '0')) or None 28 | } 29 | 30 | # Logging configuration 31 | self.logging_config = { 32 | 'level': os.getenv('LOG_LEVEL', 'INFO'), 33 | 'file': os.getenv('LOG_FILE', None) 34 | } 35 | 36 | def get_llm_config(self) -> Dict[str, Any]: 37 | """Get LLM configuration""" 38 | return self.llm_config.copy() 39 | 40 | def get_generation_config(self) -> Dict[str, Any]: 41 | """Get data generation configuration""" 42 | return self.generation_config.copy() 43 | 44 | def get_logging_config(self) -> Dict[str, Any]: 45 | """Get logging configuration""" 46 | return self.logging_config.copy() 47 | 48 | def update_config(self, config_dict: Dict[str, Any]) -> None: 49 | """Update configuration""" 50 | if 'llm' in config_dict: 51 | self.llm_config.update(config_dict['llm']) 52 | if 'generation' in config_dict: 53 | self.generation_config.update(config_dict['generation']) 54 | if 'logging' in config_dict: 55 | self.logging_config.update(config_dict['logging']) 56 | 57 | def to_dict(self) -> Dict[str, Any]: 58 | """Convert to dictionary format""" 59 | return { 60 | 'llm': self.llm_config, 61 | 'generation': self.generation_config, 62 | 'logging': self.logging_config 63 | } 64 | 65 | 66 | # Default configuration instance 67 | default_config = DataSynthesisConfig() 68 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg) 2 | ![Python](https://img.shields.io/badge/python-3.9%2B-blue) 3 | ![arXiv](https://img.shields.io/badge/arXiv-2511.19875-b31b1b.svg) 4 | 5 | 6 | # CodeFuse-CommitEval 7 | 8 | CodeFuse-CommitEval is the first benchmark tailored to commit Message-Code Inconsistency (MCI) detection with large language models (LLMs). Building on the ApacheCM dataset for diversity and quality, we synthesize seven types of inconsistent messages via rule-guided mutations of originally consistent commits and apply two-fold validation to verify both positive (inconsistent) and negative (consistent) samples. Using this rich and labeled dataset of message–diff pairs, we then evaluate six state-of-the-art open-source LLMs under a vanilla setting and with three augmentation strategies: few-shot prompting, chain-of-thought (CoT), and extended context. 
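To make the task concrete, the pair below shows what an operation-type mutation can produce: the diff is left untouched while the message is rewritten so it no longer describes the change. This is a purely hypothetical illustration, not a sample from the released dataset.

```python
# Hypothetical illustration of a message-code inconsistency (MCI).
# Neither the messages nor the diff are taken from eval_50k.jsonl.
original_message = "fix: guard against a missing config file in load_config()"
mutated_message = "refactor: simplify config caching in load_config()"  # no longer matches the diff

diff = """\
 def load_config(path):
+    if not os.path.exists(path):
+        raise FileNotFoundError(path)
     with open(path) as f:
         return json.load(f)
"""
```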
9 |
10 | ![](assets/pipeline.png)
11 |
12 | ## Features
13 |
14 | - Multilingual & large-scale dataset
15 | - Even distribution of samples
16 | - Rich inconsistent commit types
17 | - Modular commit mutation rules
18 | - Effective verification for synthesized samples
19 |
20 | ## Related Project
21 |
22 | - [ApacheCM Dataset](https://arxiv.org/html/2507.17690v1) - Contextual Code Retrieval for Commit Message Generation: A Preliminary Study
23 |
24 | ## Documentation
25 | ### Environment Setup
26 |
27 | Tested under Python 3.9.6. Install the dependencies:
28 |
29 | ```shell
30 | python3 -m pip install langchain langchain_openai langchain_community
31 | ```
32 |
33 | ### Benchmarking
34 | First, download all the repositories needed for contextual code retrieval:
35 |
36 | ```shell
37 | python3 evaluation/clone_repos.py
38 | ```
39 |
40 | Then, deploy the target models yourself or use public APIs. In our paper, we evaluated the following models:
41 |
42 | - DeepSeek-V3.1 (Remote API)
43 | - gpt-oss-20b (Local deployment)
44 | - Qwen3-30B-A3B (Local deployment)
45 | - Llama-3.1-8B (Local deployment)
46 | - Mistral-Small-3.2-24B (Local deployment)
47 | - Kimi-K2-Instruct (Remote API)
48 |
49 | Run benchmarking:
50 | ```shell
51 | python3 evaluation/evaluate_main.py \
52 |     -s {pure_llm,fewshot_llm,cot_llm} \
53 |     --ctx \
54 |     -d \
55 |     -r \
56 |     --api_key \
57 |     --api-base \
58 |     --model \
59 |     -o \
60 |     --worker
61 | ```
62 |
63 | ## Contribution
64 |
65 | We welcome and encourage contributions from the community! If you're interested in contributing to this project, please follow these guidelines:
66 |
67 | 1. **Identify a Need**: Before submitting a pull request (PR), ensure that your contribution addresses a real need or improvement for the project.
68 |
69 | 2. **Submit a PR**: Create a pull request with a clear description of:
70 |    - The problem or feature request you're addressing
71 |    - How your changes solve the problem or implement the feature
72 |    - Any relevant test cases or documentation updates
73 |
74 | 3. **Review Process**: Our team will review your PR based on:
75 |    - Whether the contribution addresses a genuine need for the project
76 |    - The quality and correctness of the implementation
77 |    - Adherence to the project's coding standards and architecture
78 |
79 | We appreciate your interest in making CodeFuse-CommitEval better.
80 |
81 | ## Citation
82 | ```
83 | @misc{zhang2025codefusecommitevalbenchmarkingllmspower,
84 |       title={CodeFuse-CommitEval: Towards Benchmarking LLM's Power on Commit Message and Code Change Inconsistency Detection},
85 |       author={Qingyu Zhang and Puzhuo Liu and Peng Di and Chenxiong Qian},
86 |       year={2025},
87 |       eprint={2511.19875},
88 |       archivePrefix={arXiv},
89 |       primaryClass={cs.SE},
90 |       url={https://arxiv.org/abs/2511.19875},
91 | }
92 | ```
93 |
94 | ## License
95 |
96 | CodeFuse-CommitEval is licensed under the [Apache License 2.0](./LICENSE).
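For reference, here is a minimal sketch of how the synthesized records in `data_synthesis/synthesized_data/eval_50k.jsonl` can be loaded for inspection. The field names (`message`, `diff`, `files`, `git_url`) are assumptions inferred from the synthesis and evaluation code; consult the released file for the authoritative schema.

```python
import json

# Minimal loading sketch. Field names are assumptions inferred from the
# data_synthesis / evaluation code, not a guaranteed schema.
with open("data_synthesis/synthesized_data/eval_50k.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        message = record.get("message", "")  # commit message (possibly mutated)
        diff = record.get("diff", "")        # unified code diff
        files = record.get("files", [])      # files touched by the commit
        git_url = record.get("git_url", "")  # source repository URL
        # ... pass (message, diff) to a consistency checker here
        break  # inspect just the first record
```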
97 | 98 | -------------------------------------------------------------------------------- /data_synthesis/validate_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | CLI tool for validating generated inconsistent data 4 | """ 5 | 6 | import argparse 7 | import json 8 | import sys 9 | from pathlib import Path 10 | 11 | from data_validator import DataValidator 12 | from llm_interface import LLMManager 13 | from config import DataSynthesisConfig 14 | 15 | 16 | def main(): 17 | """Main function for data validation CLI""" 18 | 19 | parser = argparse.ArgumentParser( 20 | description="Validate quality of generated inconsistent commit message data", 21 | formatter_class=argparse.RawDescriptionHelpFormatter, 22 | epilog=""" 23 | Examples: 24 | # Basic validation 25 | python validate_data.py input.jsonl output.jsonl 26 | 27 | # Validate with custom LLM settings 28 | python validate_data.py input.jsonl output.jsonl --api-key your_key 29 | 30 | # Process only first 50 entries 31 | python validate_data.py input.jsonl output.jsonl --max-entries 50 32 | 33 | # Save detailed validation report 34 | python validate_data.py input.jsonl output.jsonl --report validation_report.json 35 | """ 36 | ) 37 | 38 | # Required arguments 39 | parser.add_argument('input_file', help='Input JSONL file with generated data') 40 | parser.add_argument('output_file', help='Output JSONL file for valid entries') 41 | 42 | # LLM configuration 43 | llm_group = parser.add_argument_group('LLM Configuration') 44 | llm_group.add_argument('--provider', choices=['openai'], 45 | default='openai', help='LLM provider (default: openai)') 46 | llm_group.add_argument('--api-key', default='empty', 47 | help='API key (default: empty, will use mock response)') 48 | llm_group.add_argument('--base-url', help='API base URL') 49 | llm_group.add_argument('--model', help='Model name') 50 | llm_group.add_argument('--max-retries', type=int, default=3, 51 | help='Maximum retry attempts (default: 3)') 52 | 53 | # Validation configuration 54 | val_group = parser.add_argument_group('Validation Configuration') 55 | val_group.add_argument('--max-entries', type=int, 56 | help='Maximum number of entries to process') 57 | val_group.add_argument('--report', 58 | help='Save detailed validation report to JSON file') 59 | 60 | # Other options 61 | parser.add_argument('--log-level', choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'], 62 | default='INFO', help='Log level (default: INFO)') 63 | parser.add_argument('--config', help='Load configuration from JSON file') 64 | 65 | args = parser.parse_args() 66 | 67 | # Setup logging 68 | import logging 69 | logging.basicConfig( 70 | level=getattr(logging, args.log_level), 71 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' 72 | ) 73 | 74 | # Create configuration 75 | config = DataSynthesisConfig() 76 | 77 | # Override with command line arguments if provided 78 | llm_config_dict = config.llm_config 79 | if args.api_key != 'empty': 80 | llm_config_dict['api_key'] = args.api_key 81 | if args.base_url: 82 | llm_config_dict['base_url'] = args.base_url 83 | if args.model: 84 | llm_config_dict['model'] = args.model 85 | if args.max_retries: 86 | llm_config_dict['max_retries'] = args.max_retries 87 | llm_config_dict['provider'] = args.provider 88 | 89 | # Load from config file if provided 90 | if args.config: 91 | import json 92 | with open(args.config, 'r') as f: 93 | config_data = json.load(f) 94 | config.update_config(config_data) 95 | 96 | 
# Validate files 97 | input_path = Path(args.input_file) 98 | if not input_path.exists(): 99 | print(f"❌ Input file not found: {args.input_file}", file=sys.stderr) 100 | sys.exit(1) 101 | 102 | # Create validator and run validation 103 | try: 104 | print(f"🔍 Starting validation of {args.input_file}") 105 | print(f"📝 Valid entries will be saved to {args.output_file}") 106 | if args.max_entries: 107 | print(f"📊 Processing maximum {args.max_entries} entries") 108 | print() 109 | 110 | llm_manager = LLMManager(config.get_llm_config()) 111 | validator = DataValidator(llm_manager) 112 | 113 | stats = validator.validate_file( 114 | input_file=args.input_file, 115 | output_file=args.output_file, 116 | max_entries=args.max_entries 117 | ) 118 | 119 | # Print statistics 120 | print() 121 | validator.print_validation_statistics(stats) 122 | 123 | # Save detailed report if requested 124 | if args.report: 125 | with open(args.report, 'w', encoding='utf-8') as f: 126 | json.dump(stats, f, indent=2, ensure_ascii=False) 127 | print(f"\n📄 Detailed validation report saved to {args.report}") 128 | 129 | print(f"\n✅ Validation complete! Valid data saved to {args.output_file}") 130 | 131 | except Exception as e: 132 | print(f"❌ Error during validation: {e}", file=sys.stderr) 133 | sys.exit(1) 134 | 135 | 136 | if __name__ == "__main__": 137 | main() 138 | -------------------------------------------------------------------------------- /evaluation/clone_repos.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Git Repository Cloner for Consistency Dataset 4 | 5 | This script parses the filtered_inconsistency.jsonl file, extracts unique git URLs, 6 | and clones them to the specified repository collection directory. 7 | """ 8 | 9 | import json 10 | import os 11 | import subprocess 12 | import sys 13 | import threading 14 | from pathlib import Path 15 | from typing import Set, List 16 | from urllib.parse import urlparse 17 | from concurrent.futures import ThreadPoolExecutor, as_completed 18 | 19 | def parse_jsonl_file(file_path: str) -> List[dict]: 20 | """ 21 | Parse JSONL file and return list of records. 22 | 23 | Args: 24 | file_path (str): Path to the JSONL file 25 | 26 | Returns: 27 | List[dict]: List of parsed JSON records 28 | """ 29 | records = [] 30 | try: 31 | with open(file_path, 'r', encoding='utf-8') as f: 32 | for line_num, line in enumerate(f, 1): 33 | line = line.strip() 34 | if not line: 35 | continue 36 | try: 37 | record = json.loads(line) 38 | records.append(record) 39 | except json.JSONDecodeError as e: 40 | print(f"Warning: Failed to parse JSON on line {line_num}: {e}") 41 | continue 42 | except FileNotFoundError: 43 | print(f"Error: File not found: {file_path}") 44 | sys.exit(1) 45 | except Exception as e: 46 | print(f"Error reading file {file_path}: {e}") 47 | sys.exit(1) 48 | 49 | return records 50 | 51 | def extract_git_urls(records: List[dict]) -> Set[str]: 52 | """ 53 | Extract unique git URLs from records. 
54 |
55 |     Args:
56 |         records (List[dict]): List of parsed records
57 |
58 |     Returns:
59 |         Set[str]: Set of unique git URLs
60 |     """
61 |     git_urls = set()
62 |
63 |     for record in records:
64 |         git_url = record.get('git_url')
65 |         if git_url and isinstance(git_url, str):
66 |             # Clean up the URL (remove .git suffix if present for consistency)
67 |             if git_url.endswith('.git'):
68 |                 git_url = git_url[:-4]
69 |             git_urls.add(git_url + '.git')  # Add .git back for cloning
70 |
71 |     return git_urls
72 |
73 | def get_repo_name_from_url(git_url: str) -> str:
74 |     """
75 |     Extract repository name from git URL.
76 |
77 |     Args:
78 |         git_url (str): Git repository URL
79 |
80 |     Returns:
81 |         str: Repository name
82 |     """
83 |     parsed = urlparse(git_url)
84 |     path = parsed.path.strip('/')
85 |
86 |     if path.endswith('.git'):
87 |         path = path[:-4]
88 |
89 |     # Handle GitHub URLs
90 |     parts = path.split('/')
91 |     if len(parts) >= 2:
92 |         return f"{parts[-2]}_{parts[-1]}"
93 |     else:
94 |         return parts[-1] if parts else "unknown_repo"
95 |
96 | def clone_repository(git_url: str, target_dir: str) -> tuple[bool, str]:
97 |     """
98 |     Clone a git repository to the target directory.
99 |
100 |     Args:
101 |         git_url (str): Git repository URL to clone
102 |         target_dir (str): Target directory for cloning
103 |
104 |     Returns:
105 |         tuple[bool, str]: (success, repo_name)
106 |     """
107 |     repo_name = get_repo_name_from_url(git_url)
108 |     repo_path = os.path.join(target_dir, repo_name)
109 |
110 |     # Thread-safe printing
111 |     thread_id = threading.current_thread().name
112 |
113 |     # Check if repository already exists
114 |     if os.path.exists(repo_path):
115 |         print(f"[{thread_id}] Repository already exists: {repo_name}")
116 |         return True, repo_name
117 |
118 |     try:
119 |         print(f"[{thread_id}] Cloning {git_url} to {repo_name}...")
120 |
121 |         # Run a plain `git clone` (full history; no shallow clone is used)
122 |         cmd = [
123 |             'git', 'clone',
124 |             git_url,
125 |             repo_path
126 |         ]
127 |
128 |         result = subprocess.run(
129 |             cmd,
130 |             capture_output=True,
131 |             text=True,
132 |             timeout=3000  # 50-minute timeout per clone
133 |         )
134 |
135 |         if result.returncode == 0:
136 |             print(f"[{thread_id}] ✅ Successfully cloned: {repo_name}")
137 |             return True, repo_name
138 |         else:
139 |             print(f"[{thread_id}] ❌ Failed to clone {repo_name}")
140 |             print(f"[{thread_id}] Error: {result.stderr}")
141 |             return False, repo_name
142 |
143 |     except subprocess.TimeoutExpired:
144 |         print(f"[{thread_id}] ❌ Timeout while cloning {repo_name}")
145 |         return False, repo_name
146 |     except Exception as e:
147 |         print(f"[{thread_id}] ❌ Error cloning {repo_name}: {e}")
148 |         return False, repo_name
149 |
150 | def main():
151 |     """Main function to orchestrate the cloning process."""
152 |
153 |     # File paths
154 |     jsonl_file = sys.argv[1]
155 |     target_dir = sys.argv[2]
156 |
157 |     # Configuration
158 |     max_workers = 4  # Number of concurrent cloning processes
159 |
160 |     print("Git Repository Cloner for Consistency Dataset")
161 |     print("=" * 50)
162 |
163 |     # Create target directory if it doesn't exist
164 |     os.makedirs(target_dir, exist_ok=True)
165 |     print(f"Target directory: {target_dir}")
166 |     print(f"Max concurrent workers: {max_workers}")
167 |
168 |     # Parse JSONL file
169 |     print(f"Parsing JSONL file: {jsonl_file}")
170 |     records = parse_jsonl_file(jsonl_file)
171 |     print(f"Loaded {len(records)} records")
172 |
173 |     # Extract unique git URLs
174 |     print("Extracting unique git URLs...")
175 |     git_urls = extract_git_urls(records)
176 |     print(f"Found {len(git_urls)} unique git
URLs") 177 | 178 | # Display URLs to be cloned 179 | print("\nURLs to be cloned:") 180 | for i, url in enumerate(sorted(git_urls), 1): 181 | print(f" {i:3d}. {url}") 182 | 183 | # Ask for confirmation 184 | print(f"\nThis will clone {len(git_urls)} repositories to {target_dir}") 185 | print(f"Using {max_workers} concurrent workers") 186 | response = input("Do you want to continue? (y/N): ").strip().lower() 187 | 188 | if response not in ['y', 'yes']: 189 | print("Operation cancelled.") 190 | return 191 | 192 | # Clone repositories concurrently 193 | print("\nStarting concurrent repository cloning...") 194 | print("=" * 50) 195 | 196 | successful_clones = 0 197 | failed_clones = 0 198 | completed_count = 0 199 | 200 | # Use ThreadPoolExecutor for concurrent cloning 201 | with ThreadPoolExecutor(max_workers=max_workers) as executor: 202 | # Submit all clone tasks 203 | future_to_url = { 204 | executor.submit(clone_repository, url, target_dir): url 205 | for url in sorted(git_urls) 206 | } 207 | 208 | # Process completed tasks as they finish 209 | for future in as_completed(future_to_url): 210 | git_url = future_to_url[future] 211 | completed_count += 1 212 | 213 | try: 214 | success, repo_name = future.result() 215 | if success: 216 | successful_clones += 1 217 | print(f"[MAIN] Progress: {completed_count}/{len(git_urls)} - ✅ {repo_name}") 218 | else: 219 | failed_clones += 1 220 | print(f"[MAIN] Progress: {completed_count}/{len(git_urls)} - ❌ {repo_name}") 221 | 222 | except Exception as e: 223 | failed_clones += 1 224 | repo_name = get_repo_name_from_url(git_url) 225 | print(f"[MAIN] Progress: {completed_count}/{len(git_urls)} - ❌ {repo_name} (Exception: {e})") 226 | 227 | # Summary 228 | print("\n" + "=" * 50) 229 | print("CLONING SUMMARY") 230 | print("=" * 50) 231 | print(f"Total repositories: {len(git_urls)}") 232 | print(f"Successfully cloned: {successful_clones}") 233 | print(f"Failed to clone: {failed_clones}") 234 | print(f"Concurrent workers used: {max_workers}") 235 | 236 | if failed_clones > 0: 237 | print(f"\n⚠️ {failed_clones} repositories failed to clone.") 238 | print("You may want to retry these manually or check your network connection.") 239 | else: 240 | print("\n🎉 All repositories cloned successfully!") 241 | 242 | if __name__ == "__main__": 243 | main() 244 | -------------------------------------------------------------------------------- /evaluation/fewshot/consistency_checker.py: -------------------------------------------------------------------------------- 1 | from langchain_community.chat_models import ChatOpenAI 2 | from langchain_core.prompts import ChatPromptTemplate 3 | from typing import Dict, Any, Optional 4 | import logging 5 | import json 6 | import re 7 | 8 | import time 9 | import os 10 | 11 | 12 | class ConsistencyChecker: 13 | """ 14 | A consistency checker that uses few-shot learning with examples. 15 | """ 16 | 17 | def __init__(self, openai_api_key: str, openai_api_base: str, model: str = "gpt-3.5-turbo"): 18 | """ 19 | Initialize the few-shot consistency checker with LLM configuration. 
20 | 21 | Args: 22 | openai_api_key (str): OpenAI API key 23 | openai_api_base (str): OpenAI API base URL 24 | model (str): Model name to use 25 | """ 26 | from langchain_community.chat_models import ChatOpenAI 27 | from langchain_core.prompts import ChatPromptTemplate 28 | 29 | # Validate input parameters 30 | if not openai_api_key or not openai_api_key.strip(): 31 | raise ValueError("OpenAI API key cannot be empty") 32 | if not openai_api_base or not openai_api_base.strip(): 33 | raise ValueError("OpenAI API base URL cannot be empty") 34 | if not model or not model.strip(): 35 | raise ValueError("Model name cannot be empty") 36 | 37 | self.chat_model = ChatOpenAI( 38 | openai_api_key=openai_api_key, 39 | openai_api_base=openai_api_base, 40 | model=model, 41 | temperature=0 42 | ) 43 | 44 | # Define the system prompt with few-shot examples 45 | self.system_prompt = """You are an expert code reviewer tasked with evaluating the consistency between commit messages and their corresponding code changes. 46 | 47 | Your job is to analyze whether a commit message accurately describes the actual code changes made in the diff. 48 | 49 | Consider the following aspects: 50 | 1. Does the commit message describe the actual changes made? 51 | 2. Are the mentioned components/files/functions actually modified? 52 | 3. Is the scope of changes (major/minor) consistent with the message? 53 | 4. Are any important changes missing from the commit message? 54 | 5. Does the commit message contain any false or misleading information? 55 | 56 | Here are two examples to guide your analysis: 57 | 58 | **Example 1 (Consistent):** 59 | Commit Message: "Fix memory leak in buffer allocation" 60 | Code Diff: 61 | ``` 62 | void allocate_buffer() {{ 63 | - char* buf = malloc(1024); 64 | + char* buf = malloc(1024); 65 | + if (!buf) return; 66 | // process buffer 67 | + free(buf); 68 | }} 69 | ``` 70 | Analysis: {{ 71 | "consistent": true, 72 | "confidence": 0.95, 73 | "reasoning": "The commit message accurately describes the change. A memory leak was indeed fixed by adding proper error checking and freeing the allocated buffer.", 74 | "issues": [] 75 | }} 76 | 77 | **Example 2 (Inconsistent):** 78 | Commit Message: "Add new sorting algorithm implementation" 79 | Code Diff: 80 | ``` 81 | int compare_strings(const char* a, const char* b) {{ 82 | - return strcmp(a, b); 83 | + return strcasecmp(a, b); 84 | }} 85 | ``` 86 | Analysis: {{ 87 | "consistent": false, 88 | "confidence": 0.9, 89 | "reasoning": "The commit message claims to add a new sorting algorithm, but the actual change only modifies a string comparison function to be case-insensitive. 
No sorting algorithm was added.", 90 | "issues": ["Commit message mentions adding sorting algorithm but only string comparison was changed", "Scope mismatch: minor change vs claimed major addition"] 91 | }} 92 | 93 | Respond with a JSON object containing: 94 | - "consistent": true/false, 95 | - "confidence": 0.0-1.0, 96 | - "reasoning": "detailed explanation of your analysis", 97 | - "issues": ["list of specific inconsistencies found, if any"] 98 | 99 | Be thorough in your analysis and provide clear reasoning for your decision.""" 100 | 101 | self.user_prompt = """Please analyze the consistency between this commit message and the corresponding code diff: 102 | 103 | **Commit Message:** 104 | {commit_message} 105 | 106 | **Code Diff:** 107 | {code_diff} 108 | 109 | Evaluate whether the commit message accurately describes the code changes and respond with the requested JSON format.""" 110 | 111 | def check_consistency(self, commit_message: str, code_diff: str) -> Dict[str, Any]: 112 | """ 113 | Check consistency between a commit message and code diff using few-shot learning. 114 | 115 | Args: 116 | commit_message (str): The commit message to analyze 117 | code_diff (str): The code diff to analyze 118 | 119 | Returns: 120 | Dict[str, Any]: Analysis result containing consistency, confidence, reasoning, and issues 121 | """ 122 | try: 123 | from langchain_core.prompts import ChatPromptTemplate 124 | 125 | prompt = ChatPromptTemplate.from_messages([ 126 | ("system", self.system_prompt), 127 | ("user", self.user_prompt) 128 | ]) 129 | 130 | chain = prompt | self.chat_model 131 | 132 | response = chain.invoke({ 133 | "commit_message": commit_message, 134 | "code_diff": code_diff 135 | }) 136 | 137 | 138 | # Extract token usage from response 139 | prompt_tokens = 0 140 | completion_tokens = 0 141 | total_tokens = 0 142 | 143 | try: 144 | if hasattr(response, 'response_metadata') and response.response_metadata: 145 | token_usage = response.response_metadata.get('token_usage', {}) 146 | prompt_tokens = token_usage.get('prompt_tokens', 0) 147 | completion_tokens = token_usage.get('completion_tokens', 0) 148 | total_tokens = token_usage.get('total_tokens', 0) 149 | elif hasattr(response, 'usage_metadata'): 150 | prompt_tokens = getattr(response.usage_metadata, 'input_tokens', 0) 151 | completion_tokens = getattr(response.usage_metadata, 'output_tokens', 0) 152 | total_tokens = getattr(response.usage_metadata, 'total_tokens', 0) 153 | except Exception as e: 154 | pass 155 | 156 | # Parse the JSON response 157 | result = self._parse_response(response.content) 158 | 159 | # Add token usage to result 160 | result['prompt_tokens'] = prompt_tokens 161 | result['completion_tokens'] = completion_tokens 162 | result['total_tokens'] = total_tokens 163 | 164 | return result 165 | 166 | except Exception as e: 167 | # logger.error(f"Error in few-shot consistency check: {e}") 168 | import traceback 169 | traceback.print_exc() 170 | return { 171 | "consistent": False, 172 | "confidence": 0.0, 173 | "reasoning": f"Error occurred during analysis: {str(e)}", 174 | "issues": ["Analysis failed due to technical error"], 175 | "prompt_tokens": 0, 176 | "completion_tokens": 0, 177 | "total_tokens": 0 178 | } 179 | 180 | def _parse_response(self, content: str) -> Dict[str, Any]: 181 | """ 182 | Parse the LLM response to extract the JSON result. 
183 | 184 | Args: 185 | content (str): Raw LLM response content 186 | 187 | Returns: 188 | Dict[str, Any]: Parsed analysis result 189 | """ 190 | try: 191 | import json 192 | import re 193 | 194 | content = content.strip() 195 | 196 | # Try to find JSON object in the response 197 | json_match = re.search(r'\{.*\}', content, re.DOTALL) 198 | if json_match: 199 | json_str = json_match.group() 200 | result = json.loads(json_str) 201 | 202 | # Validate required fields 203 | required_fields = ['consistent', 'confidence', 'reasoning', 'issues'] 204 | for field in required_fields: 205 | if field not in result: 206 | raise ValueError(f"Missing required field: {field}") 207 | 208 | # Ensure types are correct 209 | result['consistent'] = bool(result['consistent']) 210 | result['confidence'] = float(result['confidence']) 211 | result['reasoning'] = str(result['reasoning']) 212 | result['issues'] = list(result['issues']) if isinstance(result['issues'], list) else [] 213 | 214 | return result 215 | else: 216 | raise ValueError("No JSON object found in response") 217 | 218 | except Exception as e: 219 | # logger.error(f"Error parsing LLM response: {e}") 220 | # logger.error(f"Raw response: {content}") 221 | 222 | # Return a default response for parsing errors 223 | return { 224 | "consistent": False, 225 | "confidence": 0.0, 226 | "reasoning": f"Failed to parse LLM response: {str(e)}", 227 | "issues": ["Response parsing error"] 228 | } 229 | 230 | 231 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2025 CodeFuse-CommitEval Contributors 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /data_synthesis/cli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Command line tool for inconsistent data generation 4 | """ 5 | 6 | import argparse 7 | import json 8 | import sys 9 | import multiprocessing 10 | from pathlib import Path 11 | from typing import Optional 12 | 13 | from config import DataSynthesisConfig 14 | from data_generator import InconsistentDataGenerator, setup_logging 15 | 16 | 17 | def main(): 18 | """Main function""" 19 | parser = argparse.ArgumentParser( 20 | description="Generate test data with inconsistent commit messages and code diffs", 21 | formatter_class=argparse.RawDescriptionHelpFormatter, 22 | epilog=""" 23 | Examples: 24 | # Basic usage 25 | python -m data_synthesis.cli input.jsonl output.jsonl 26 | 27 | # Specify LLM configuration 28 | python -m data_synthesis.cli input.jsonl output.jsonl --provider openai --api-key your_key 29 | 30 | # Process first 100 samples with 50% inconsistency ratio 31 | python -m data_synthesis.cli input.jsonl output.jsonl --max-samples 100 --ratio 0.5 32 | 33 | # Use parallel processing with 4 workers 34 | python -m data_synthesis.cli input.jsonl output.jsonl --workers 4 35 | 36 | # List all available rules 37 | python -m data_synthesis.cli --list-rules 38 | """ 39 | ) 40 | 41 | # Input/output parameters 42 | parser.add_argument('input_file', nargs='?', help='Input JSONL file path') 43 | parser.add_argument('output_file', nargs='?', help='Output JSONL file path') 44 | 45 | # LLM configuration 46 | llm_group = parser.add_argument_group('LLM Configuration') 47 | llm_group.add_argument('--provider', choices=['openai'], 48 | default='openai', help='LLM provider (default: openai)') 49 | llm_group.add_argument('--api-key', default='empty', 50 | help='API key (default: empty, will use mock response)') 51 | llm_group.add_argument('--base-url', help='API base URL') 52 | llm_group.add_argument('--model', help='Model name') 53 | llm_group.add_argument('--max-retries', type=int, default=3, 54 | help='Maximum retry attempts (default: 3)') 55 | 56 | # Generation configuration 57 | gen_group = parser.add_argument_group('Generation Configuration') 58 | gen_group.add_argument('--ratio', type=float, default=1.0, 59 | help='Inconsistent data ratio 0.0-1.0 (default: 1.0)') 60 | gen_group.add_argument('--max-samples', type=int, 61 | help='Maximum number of samples to process') 62 | gen_group.add_argument('--seed', type=int, default=42, 63 | help='Random seed (default: 42)') 64 | gen_group.add_argument('--batch-size', type=int, default=10, 65 | help='Batch size (default: 10)') 66 | gen_group.add_argument('--no-applicability-check', action='store_true', 67 | help='Disable intelligent rule applicability checking (use random selection)') 68 | gen_group.add_argument('--test-applicability', 69 | help='Test rule applicability for a specific commit (provide JSON file)') 70 | gen_group.add_argument('--save-analysis', 71 | help='Save detailed applicability analysis to file') 72 | gen_group.add_argument('--workers', '-w', type=int, default=1, 73 | help='Number of worker processes for parallel processing (default: 1)') 74 | 75 | # Other options 76 | parser.add_argument('--log-level', choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'], 77 | default='INFO', help='Log level (default: INFO)') 78 | parser.add_argument('--log-file', help='Log file path') 79 | parser.add_argument('--list-rules', 
action='store_true', 80 | help='List all available inconsistency rules') 81 | parser.add_argument('--analyze', help='Analyze rule distribution in output file') 82 | parser.add_argument('--config', help='Load configuration from JSON file') 83 | 84 | args = parser.parse_args() 85 | 86 | # Setup logging 87 | setup_logging(args.log_level, args.log_file) 88 | 89 | # Handle test applicability request 90 | if args.test_applicability: 91 | test_applicability(args.test_applicability, create_config(args)) 92 | return 93 | 94 | # Handle list rules request 95 | if args.list_rules: 96 | list_rules() 97 | return 98 | 99 | # Handle analyze request 100 | if args.analyze: 101 | analyze_output(args.analyze) 102 | return 103 | 104 | # Check required parameters 105 | if not args.input_file or not args.output_file: 106 | parser.error("Input and output file paths are required, or use --list-rules or --analyze options") 107 | 108 | # Validate workers parameter 109 | max_cpu_count = multiprocessing.cpu_count() 110 | if args.workers < 1: 111 | parser.error("Number of workers must be at least 1") 112 | elif args.workers > max_cpu_count: 113 | print(f"⚠️ Number of workers ({args.workers}) exceeds CPU count ({max_cpu_count}), using {max_cpu_count}") 114 | args.workers = max_cpu_count 115 | 116 | # Create configuration 117 | config = create_config(args) 118 | 119 | # Create generator and process file 120 | try: 121 | # print("==============") 122 | # print(config.get_llm_config()) 123 | generator = InconsistentDataGenerator( 124 | llm_config=config.get_llm_config(), 125 | seed=args.seed, 126 | use_applicability_check=not args.no_applicability_check 127 | ) 128 | 129 | print(f"🚀 Starting data generation with {args.workers} worker(s)...") 130 | if args.workers > 1: 131 | print(f"📊 Using multiprocessing for parallel data generation") 132 | 133 | # Use enhanced processing if analysis is requested 134 | if args.save_analysis: 135 | generator.process_jsonl_file_with_analysis( 136 | input_file=args.input_file, 137 | output_file=args.output_file, 138 | analysis_file=args.save_analysis, 139 | num_samples=args.max_samples, 140 | inconsistency_ratio=args.ratio, 141 | max_workers=args.workers 142 | ) 143 | else: 144 | generator.process_jsonl_file( 145 | input_file=args.input_file, 146 | output_file=args.output_file, 147 | num_samples=args.max_samples, 148 | inconsistency_ratio=args.ratio, 149 | max_workers=args.workers 150 | ) 151 | 152 | print(f"✅ Processing complete! 
Output file: {args.output_file}") 153 | 154 | # Show rule distribution statistics 155 | rule_distribution = generator.analyze_rules_distribution(args.output_file) 156 | if rule_distribution: 157 | print("\n📊 Rule Distribution Statistics:") 158 | for rule_type, count in sorted(rule_distribution.items()): 159 | print(f" {rule_type}: {count}") 160 | 161 | except Exception as e: 162 | print(f"❌ Processing failed: {e}", file=sys.stderr) 163 | sys.exit(1) 164 | 165 | 166 | def create_config(args) -> DataSynthesisConfig: 167 | """Create configuration based on command line arguments""" 168 | config = DataSynthesisConfig() 169 | 170 | # Load configuration from file 171 | if args.config: 172 | try: 173 | with open(args.config, 'r', encoding='utf-8') as f: 174 | config_dict = json.load(f) 175 | config.update_config(config_dict) 176 | except Exception as e: 177 | print(f"⚠️ Failed to load configuration file: {e}", file=sys.stderr) 178 | 179 | # Update LLM configuration 180 | llm_updates = {} 181 | if args.provider: 182 | llm_updates['provider'] = args.provider 183 | if args.api_key: 184 | llm_updates['api_key'] = args.api_key 185 | if args.base_url: 186 | llm_updates['base_url'] = args.base_url 187 | if args.model: 188 | llm_updates['model'] = args.model 189 | if args.max_retries: 190 | llm_updates['max_retries'] = args.max_retries 191 | 192 | if llm_updates: 193 | config.update_config({'llm': llm_updates}) 194 | 195 | return config 196 | 197 | 198 | def list_rules(): 199 | """List all available inconsistency rules""" 200 | config = DataSynthesisConfig() 201 | generator = InconsistentDataGenerator(config.get_llm_config()) 202 | 203 | rules = generator.get_available_rules() 204 | 205 | print("📋 Available Inconsistency Rules:") 206 | print("=" * 60) 207 | 208 | for i, rule in enumerate(rules, 1): 209 | print(f"\n{i}. 
{rule['name']} ({rule['type']})") 210 | print(f" Weight: {rule['weight']}") 211 | print(f" Description: {rule['description']}") 212 | 213 | print(f"\nTotal: {len(rules)} rules") 214 | 215 | 216 | def test_applicability(json_file: str, config: DataSynthesisConfig): 217 | """Test rule applicability for a specific commit""" 218 | import json 219 | from rule_applicability_checker import EnhancedInconsistencyRuleManager 220 | from llm_interface import LLMManager 221 | 222 | if not Path(json_file).exists(): 223 | print(f"❌ File does not exist: {json_file}", file=sys.stderr) 224 | sys.exit(1) 225 | 226 | # Load commit data 227 | try: 228 | with open(json_file, 'r', encoding='utf-8') as f: 229 | commit_data = json.load(f) 230 | except (json.JSONDecodeError, IOError) as e: 231 | print(f"❌ Error reading JSON file: {e}", file=sys.stderr) 232 | sys.exit(1) 233 | 234 | # Test applicability 235 | llm_manager = LLMManager(config.get_llm_config()) 236 | enhanced_manager = EnhancedInconsistencyRuleManager(llm_manager) 237 | 238 | print(f"🔍 Testing Rule Applicability for Commit:") 239 | print(f"Message: {commit_data.get('message', 'N/A')}") 240 | print(f"Files: {commit_data.get('files', [])}") 241 | print("=" * 60) 242 | 243 | try: 244 | # Get applicability analysis 245 | applicability = enhanced_manager.analyze_commit_applicability(commit_data) 246 | 247 | print("📊 Applicability Analysis:") 248 | for rule_key, rule_info in applicability.items(): 249 | status = "✅ Applicable" if rule_info["applicable"] else "❌ Not Applicable" 250 | print(f" {rule_key}: {status}") 251 | print(f" Reasoning: {rule_info['reasoning']}") 252 | 253 | # Get best rule 254 | best_rule, selection_info = enhanced_manager.get_best_rule_for_commit(commit_data) 255 | print(f"\n🏆 Best Rule Selected: {best_rule.name} (weight: {best_rule.weight})") 256 | print(f"📝 Selection Reasoning: {selection_info.get('reasoning', 'N/A')}") 257 | 258 | except Exception as e: 259 | print(f"❌ Error during applicability testing: {e}") 260 | sys.exit(1) 261 | 262 | 263 | def analyze_output(output_file: str): 264 | """Analyze rule distribution in output file""" 265 | if not Path(output_file).exists(): 266 | print(f"❌ File does not exist: {output_file}", file=sys.stderr) 267 | sys.exit(1) 268 | 269 | config = DataSynthesisConfig() 270 | generator = InconsistentDataGenerator(config.get_llm_config()) 271 | 272 | rule_distribution = generator.analyze_rules_distribution(output_file) 273 | 274 | if not rule_distribution: 275 | print("📊 No inconsistent data found") 276 | return 277 | 278 | print(f"📊 Rule Distribution Statistics for {output_file}:") 279 | print("=" * 60) 280 | 281 | total_inconsistent = sum(rule_distribution.values()) 282 | 283 | for rule_type, count in sorted(rule_distribution.items(), key=lambda x: x[1], reverse=True): 284 | percentage = (count / total_inconsistent) * 100 285 | print(f"{rule_type:30} {count:6d} ({percentage:5.1f}%)") 286 | 287 | print("-" * 60) 288 | print(f"{'Total':30} {total_inconsistent:6d} (100.0%)") 289 | 290 | 291 | if __name__ == '__main__': 292 | main() 293 | -------------------------------------------------------------------------------- /data_synthesis/inconsistency_rules.py: -------------------------------------------------------------------------------- 1 | """ 2 | Define rules for generating inconsistent commit messages 3 | """ 4 | 5 | from enum import Enum 6 | from dataclasses import dataclass 7 | from typing import List, Dict, Any 8 | import random 9 | 10 | 11 | class InconsistencyType(Enum): 12 | """Inconsistency 
type enumeration""" 13 | FUNCTION_NAME_MISMATCH = "function_name_mismatch" 14 | FILE_PATH_MISMATCH = "file_path_mismatch" 15 | OPERATION_TYPE_MISMATCH = "operation_type_mismatch" 16 | PURPOSE_MISMATCH = "purpose_mismatch" 17 | COMPONENT_MISMATCH = "component_mismatch" 18 | FEATURE_MISSING = "feature_missing" 19 | EXTRA_FEATURE = "extra_feature" 20 | 21 | 22 | @dataclass 23 | class InconsistencyRule: 24 | """Inconsistency rule definition""" 25 | rule_type: InconsistencyType 26 | name: str 27 | description: str 28 | prompt_template: str 29 | weight: float = 1.0 # Weight for rule selection 30 | 31 | 32 | class InconsistencyRuleManager: 33 | """Inconsistency rule manager""" 34 | 35 | def __init__(self): 36 | self.rules = self._initialize_rules() 37 | 38 | def _initialize_rules(self) -> List[InconsistencyRule]: 39 | """Initialize all inconsistency rules""" 40 | return [ 41 | InconsistencyRule( 42 | rule_type=InconsistencyType.FUNCTION_NAME_MISMATCH, 43 | name="Function Name Mismatch", 44 | description="Modify function names mentioned in commit message to make them inconsistent with actual code diff", 45 | prompt_template=""" 46 | Given the following commit message and code diff, generate a COMPLETE inconsistent commit message by changing the function names mentioned to different but plausible function names that don't actually appear in the diff. 47 | 48 | Original commit message: {message} 49 | Code diff: {diff} 50 | 51 | Requirements: 52 | 1. Keep the overall structure and tone of the original message 53 | 2. Replace function names with different but realistic function names 54 | 3. Ensure the new function names are NOT present in the actual diff 55 | 4. Output a COMPLETE, standalone commit message (not just the changed parts) 56 | 5. The resulting message should seem plausible but be factually incorrect about function names 57 | 58 | Return only a JSON object with a single "message" field containing the COMPLETE inconsistent commit message. 59 | """, 60 | weight=1.5 61 | ), 62 | 63 | InconsistencyRule( 64 | rule_type=InconsistencyType.FILE_PATH_MISMATCH, 65 | name="File Path Mismatch", 66 | description="Modify file paths mentioned in commit message to make them inconsistent with actual modified files", 67 | prompt_template=""" 68 | Given the following commit message and code diff, generate a COMPLETE inconsistent commit message by changing the file paths or module names mentioned to different but plausible paths that don't match the actual files changed. 69 | 70 | Original commit message: {message} 71 | Code diff: {diff} 72 | Actual files changed: {files} 73 | 74 | Requirements: 75 | 1. Keep the overall structure and purpose of the original message 76 | 2. Replace file paths or module names with different but realistic alternatives 77 | 3. Ensure the new paths are NOT in the actual files changed list 78 | 4. Output a COMPLETE, standalone commit message (not just the changed parts) 79 | 5. The resulting message should seem plausible but be factually incorrect about which files were modified 80 | 81 | Return only a JSON object with a single "message" field containing the COMPLETE inconsistent commit message. 
82 | """, 83 | weight=1.3 84 | ), 85 | 86 | InconsistencyRule( 87 | rule_type=InconsistencyType.OPERATION_TYPE_MISMATCH, 88 | name="Operation Type Mismatch", 89 | description="Modify operation type described in commit message (add/remove/fix/refactor) to make it inconsistent with actual code changes", 90 | prompt_template=""" 91 | Given the following commit message and code diff, generate a COMPLETE inconsistent commit message by changing the operation type (add/remove/fix/refactor/update/etc.) to a different operation that doesn't match what was actually done in the code. 92 | 93 | Original commit message: {message} 94 | Code diff: {diff} 95 | 96 | Requirements: 97 | 1. Identify the actual operation performed in the diff (adding, removing, fixing, refactoring, etc.) 98 | 2. Change the commit message to describe a different type of operation 99 | 3. Keep other details relatively consistent 100 | 4. Output a COMPLETE, standalone commit message (not just the changed operation) 101 | 5. The new operation should be plausible but factually incorrect 102 | 103 | Common operation changes: 104 | - "fix" → "add" or "remove" 105 | - "add" → "fix" or "refactor" 106 | - "remove" → "update" or "add" 107 | - "refactor" → "fix" or "optimize" 108 | 109 | Return only a JSON object with a single "message" field containing the COMPLETE inconsistent commit message. 110 | """, 111 | weight=1.4 112 | ), 113 | 114 | InconsistencyRule( 115 | rule_type=InconsistencyType.PURPOSE_MISMATCH, 116 | name="Purpose Mismatch", 117 | description="Modify the purpose described in commit message to make it inconsistent with actual code change purpose", 118 | prompt_template=""" 119 | Given the following commit message and code diff, generate a COMPLETE inconsistent commit message by changing the stated purpose or goal of the changes while keeping the technical action similar. 120 | 121 | Original commit message: {message} 122 | Code diff: {diff} 123 | 124 | Requirements: 125 | 1. Keep the technical action (what was changed) relatively accurate 126 | 2. Change the reason/purpose/goal for making the change 127 | 3. Output a COMPLETE, standalone commit message (not just the changed purpose) 128 | 4. The new purpose should be plausible but different from what the code actually achieves 129 | 5. Maintain professional commit message style 130 | 131 | Examples of purpose changes: 132 | - "Fix memory leak in parser" → "Improve performance in parser" 133 | - "Add logging for debugging" → "Add logging for compliance" 134 | - "Refactor for better readability" → "Refactor for performance optimization" 135 | 136 | Return only a JSON object with a single "message" field containing the COMPLETE inconsistent commit message. 137 | """, 138 | weight=1.1 139 | ), 140 | 141 | InconsistencyRule( 142 | rule_type=InconsistencyType.COMPONENT_MISMATCH, 143 | name="Component Mismatch", 144 | description="Modify component or module names mentioned in commit message to make them inconsistent with actually modified components", 145 | prompt_template=""" 146 | Given the following commit message and code diff, generate a COMPLETE inconsistent commit message by changing the component, module, or system names mentioned to different but plausible components that aren't actually modified. 147 | 148 | Original commit message: {message} 149 | Code diff: {diff} 150 | 151 | Requirements: 152 | 1. Identify components/modules mentioned in the original message 153 | 2. Replace them with different but realistic component names 154 | 3. 
Keep the action and technical details consistent 155 | 4. Output a COMPLETE, standalone commit message (not just the changed components) 156 | 5. The new components should be plausible for the codebase but factually incorrect 157 | 158 | Examples of component changes: 159 | - "Update database connection pool" → "Update cache connection pool" 160 | - "Fix bug in user service" → "Fix bug in notification service" 161 | - "Optimize search algorithm" → "Optimize sorting algorithm" 162 | 163 | Return only a JSON object with a single "message" field containing the COMPLETE inconsistent commit message. 164 | """, 165 | weight=1.2 166 | ), 167 | 168 | InconsistencyRule( 169 | rule_type=InconsistencyType.FEATURE_MISSING, 170 | name="Missing Feature Description", 171 | description="Generate a commit message that describes only part of the actual changes, omitting important code modifications", 172 | prompt_template=""" 173 | Given the following commit message and code diff, generate a COMPLETE inconsistent commit message by describing only a subset of the actual changes made, omitting significant modifications. 174 | 175 | Original commit message: {message} 176 | Code diff: {diff} 177 | 178 | Requirements: 179 | 1. Identify multiple distinct changes in the diff 180 | 2. Create a COMPLETE, standalone commit message that only mentions some of the changes 181 | 3. Omit important or significant modifications from the original message 182 | 4. The described changes should be accurate but incomplete compared to what was actually done 183 | 5. Make it seem like a legitimate but incomplete commit message 184 | 6. Do NOT just output the missing parts - output a FULL commit message that describes less than what was actually changed 185 | 186 | Example: If the original message is "Fix login bug and add user validation" and both changes exist in the diff, 187 | output a complete message like "Fix login bug" (omitting the validation part). 188 | 189 | Return only a JSON object with a single "message" field containing the COMPLETE inconsistent commit message. 190 | """, 191 | weight=1.0 192 | ), 193 | 194 | InconsistencyRule( 195 | rule_type=InconsistencyType.EXTRA_FEATURE, 196 | name="Extra Feature Description", 197 | description="Generate a commit message that describes additional features not present in the actual code changes", 198 | prompt_template=""" 199 | Given the following commit message and code diff, generate a COMPLETE inconsistent commit message by describing the actual changes PLUS an additional plausible but non-existent change. 200 | 201 | Original commit message: {message} 202 | Code diff: {diff} 203 | 204 | Requirements: 205 | 1. Start with accurate descriptions of the actual changes from the original message 206 | 2. Add a description of an additional plausible change that does not exist in the diff 207 | 3. The extra change should be realistic and related to the actual changes 208 | 4. Create a COMPLETE, standalone commit message that combines both real and fictional changes 209 | 5. Make it seem like a legitimate commit that describes more than what was actually done 210 | 6.
Do NOT just output the extra parts - output a FULL commit message that includes both real and fictional changes 211 | 212 | Examples of how to add extra features: 213 | - Original: "Fix login bug" → Full message: "Fix login bug and add comprehensive test coverage" 214 | - Original: "Add user registration" → Full message: "Add user registration with email validation and update API documentation" 215 | - Original: "Refactor database connection" → Full message: "Refactor database connection and optimize query performance" 216 | 217 | Return only a JSON object with a single "message" field containing the COMPLETE inconsistent commit message. 218 | """, 219 | weight=1.0 220 | ) 221 | ] 222 | 223 | def get_random_rule(self) -> InconsistencyRule: 224 | """Select a rule randomly based on weights""" 225 | weights = [rule.weight for rule in self.rules] 226 | return random.choices(self.rules, weights=weights, k=1)[0] 227 | 228 | def get_rule_by_type(self, rule_type: InconsistencyType) -> InconsistencyRule: 229 | """Get specific rule by type""" 230 | for rule in self.rules: 231 | if rule.rule_type == rule_type: 232 | return rule 233 | raise ValueError(f"Rule type {rule_type} not found") 234 | 235 | def get_all_rules(self) -> List[InconsistencyRule]: 236 | """Get all rules""" 237 | return self.rules.copy() 238 | 239 | def format_prompt(self, rule: InconsistencyRule, commit_data: Dict[str, Any]) -> str: 240 | """Format rule prompt template""" 241 | return rule.prompt_template.format( 242 | message=commit_data.get('message', ''), 243 | diff=commit_data.get('diff', ''), 244 | files=commit_data.get('files', []) 245 | ) 246 | -------------------------------------------------------------------------------- /data_synthesis/rule_applicability_checker.py: -------------------------------------------------------------------------------- 1 | """ 2 | Rule applicability checker using LLM to determine which inconsistency rules are suitable for given commit data 3 | """ 4 | 5 | import json 6 | import logging 7 | from typing import Dict, Any, Optional, List, Tuple 8 | from inconsistency_rules import InconsistencyRuleManager, InconsistencyRule, InconsistencyType 9 | 10 | 11 | class RuleApplicabilityChecker: 12 | """Check which inconsistency rules are applicable to given commit message and diff""" 13 | 14 | def __init__(self, llm_manager): 15 | self.llm_manager = llm_manager 16 | self.rule_manager = InconsistencyRuleManager() 17 | self.logger = logging.getLogger(__name__) 18 | self.used_rule_count = {rule_type.value: 0 for rule_type in InconsistencyType} 19 | 20 | def _create_applicability_prompt(self, commit_data: Dict[str, Any]) -> str: 21 | """Create prompt for LLM to judge rule applicability based only on commit message""" 22 | 23 | message = commit_data.get('message', '') 24 | 25 | prompt = f""" 26 | You are an expert analyzing commit messages. Your task is to determine which inconsistency rules are applicable based on the commit message content. 27 | 28 | Given commit message: {message} 29 | 30 | **IMPORTANT**: For rule 3 below, IGNORE any content within square brackets []. Content in brackets will be ignored in subsequent processing, so mutation of such content would have no effect. 31 | 32 | Please analyze whether each of the following 7 inconsistency rules would be appropriate to apply to this commit message: 33 | 34 | 1. 
**FUNCTION_NAME_MISMATCH**: Modify function names mentioned in commit message 35 | - Requires: Commit message explicitly mentions specific function names (e.g., "fix bug in authenticate_user()", "update process_data function") 36 | 37 | 2. **FILE_PATH_MISMATCH**: Modify file paths mentioned in commit message 38 | - Requires: Commit message explicitly mentions file paths, module names, or specific files (e.g., "update config/database.py", "fix bug in user_service.py") 39 | 40 | 3. **OPERATION_TYPE_MISMATCH**: Modify operation type (add/remove/fix/refactor/update) mentioned in message 41 | - Requires: Commit message contains clear operation verbs OUTSIDE of brackets (e.g., "add feature", "fix bug", "remove deprecated", "refactor code") 42 | - Ignore: Operation verbs within brackets like "[add] feature" or "[fix] bug" 43 | 44 | 4. **PURPOSE_MISMATCH**: Modify the stated purpose/goal mentioned in message 45 | - Requires: Commit message states a clear purpose/reason (e.g., "for performance", "to fix memory leak", "for security", "to improve readability") 46 | 47 | 5. **COMPONENT_MISMATCH**: Modify component/module/system names mentioned in message 48 | - Requires: Commit message mentions specific components, modules, systems, or architectural elements (e.g., "auth service", "database layer", "user management") 49 | 50 | 6. **FEATURE_MISSING**: Generate message describing only partial changes 51 | - Requires: Commit message describes multiple distinct changes that could be described separately (e.g., "fix bug and add validation", "update API and add tests") 52 | 53 | 7. **EXTRA_FEATURE**: Add descriptions of additional changes not mentioned in original message 54 | - Requires: Commit message describes actual changes that could realistically be accompanied by related changes (e.g., a bug fix could also mention adding tests) 55 | 56 | For each rule, determine based ONLY on the commit message content: 57 | - Whether it's applicable (true/false) 58 | - Reasoning for your decision 59 | 60 | Return ONLY a JSON object in this exact format: 61 | {{ 62 | "function_name_mismatch": {{ 63 | "applicable": true/false, 64 | "reasoning": "explanation based on commit message content" 65 | }}, 66 | "file_path_mismatch": {{ 67 | "applicable": true/false, 68 | "reasoning": "explanation based on commit message content" 69 | }}, 70 | "operation_type_mismatch": {{ 71 | "applicable": true/false, 72 | "reasoning": "explanation based on commit message content" 73 | }}, 74 | "purpose_mismatch": {{ 75 | "applicable": true/false, 76 | "reasoning": "explanation based on commit message content" 77 | }}, 78 | "component_mismatch": {{ 79 | "applicable": true/false, 80 | "reasoning": "explanation based on commit message content" 81 | }}, 82 | "feature_missing": {{ 83 | "applicable": true/false, 84 | "reasoning": "explanation based on commit message content" 85 | }}, 86 | "extra_feature": {{ 87 | "applicable": true/false, 88 | "reasoning": "explanation based on commit message content" 89 | }} 90 | }} 91 | """ 92 | return prompt 93 | 94 | def check_rule_applicability(self, commit_data: Dict[str, Any]) -> Dict[str, Dict[str, Any]]: 95 | """Check which rules are applicable to the given commit data""" 96 | 97 | try: 98 | prompt = self._create_applicability_prompt(commit_data) 99 | response = self.llm_manager.query(prompt) 100 | 101 | if not response: 102 | self.logger.warning("No response from LLM for applicability check") 103 | return self._get_fallback_applicability() 104 | 105 | # Parse the JSON response 106 | try: 107 | 
applicability_result = json.loads(response) 108 | 109 | # Validate the response structure 110 | expected_keys = { 111 | "function_name_mismatch", "file_path_mismatch", "operation_type_mismatch", 112 | "purpose_mismatch", "component_mismatch", "feature_missing", "extra_feature" 113 | } 114 | 115 | if not isinstance(applicability_result, dict): 116 | raise ValueError("Response is not a dictionary") 117 | 118 | if not expected_keys.issubset(applicability_result.keys()): 119 | missing_keys = expected_keys - applicability_result.keys() 120 | raise ValueError(f"Missing keys in response: {missing_keys}") 121 | 122 | # Validate each rule entry 123 | for rule_key in expected_keys: 124 | rule_data = applicability_result[rule_key] 125 | if not isinstance(rule_data, dict): 126 | raise ValueError(f"Rule {rule_key} data is not a dictionary") 127 | if "applicable" not in rule_data or "reasoning" not in rule_data: 128 | raise ValueError(f"Rule {rule_key} missing required fields") 129 | if not isinstance(rule_data["applicable"], bool): 130 | raise ValueError(f"Rule {rule_key} 'applicable' field is not boolean") 131 | 132 | self.logger.info("Successfully parsed rule applicability response") 133 | return applicability_result 134 | 135 | except (json.JSONDecodeError, ValueError) as e: 136 | self.logger.error(f"Error parsing applicability response: {e}") 137 | self.logger.debug(f"Raw response: {response}") 138 | return self._get_fallback_applicability() 139 | 140 | except Exception as e: 141 | self.logger.error(f"Error in rule applicability check: {e}") 142 | return self._get_fallback_applicability() 143 | 144 | def _get_fallback_applicability(self) -> Dict[str, Dict[str, Any]]: 145 | """Return fallback applicability when LLM check fails""" 146 | 147 | fallback = {} 148 | for rule_type in InconsistencyType: 149 | fallback[rule_type.value] = { 150 | "applicable": True, # Default to applicable 151 | "reasoning": "LLM applicability check failed, defaulting to applicable" 152 | } 153 | 154 | return fallback 155 | 156 | def select_best_rule(self, commit_data: Dict[str, Any]) -> Tuple[InconsistencyRule, Dict[str, Any]]: 157 | """Select the best applicable rule based on applicability and weight""" 158 | 159 | # Get applicability results 160 | applicability = self.check_rule_applicability(commit_data) 161 | 162 | # Filter applicable rules and calculate weighted scores 163 | applicable_rules = [] 164 | 165 | for rule in self.rule_manager.get_all_rules(): 166 | rule_key = rule.rule_type.value 167 | 168 | if rule_key in applicability and applicability[rule_key]["applicable"]: 169 | # Calculate score based on weight (could be enhanced with other factors) 170 | score = rule.weight 171 | applicable_rules.append((rule, score, applicability[rule_key])) 172 | 173 | if not applicable_rules: 174 | # If no rules are applicable, fall back to the highest weighted rule 175 | self.logger.warning("No applicable rules found, falling back to highest weighted rule") 176 | best_rule = max(self.rule_manager.get_all_rules(), key=lambda r: r.weight) 177 | fallback_info = { 178 | "applicable": False, 179 | "reasoning": "No rules were deemed applicable, using fallback" 180 | } 181 | return best_rule, fallback_info 182 | 183 | # Select rule with highest score 184 | best_rule, best_score, best_applicability = max(applicable_rules, key=lambda x: x[1]) 185 | 186 | # Balance rule usage by selecting from least used applicable rules 187 | applicable_rule_types = [rule[0].rule_type.value for rule in applicable_rules] 188 | min_usage_count = 
min(self.used_rule_count[rule_type] for rule_type in applicable_rule_types) 189 | least_used_applicable_rules = [rule for rule in applicable_rules 190 | if self.used_rule_count[rule[0].rule_type.value] == min_usage_count] 191 | 192 | if least_used_applicable_rules: 193 | # Select from least used rules with highest score 194 | best_rule, best_score, best_applicability = max(least_used_applicable_rules, key=lambda x: x[1]) 195 | self.used_rule_count[best_rule.rule_type.value] += 1 196 | self.logger.info(f"Selected least used rule: {best_rule.name} (usage count: {self.used_rule_count[best_rule.rule_type.value]})") 197 | 198 | self.logger.info(f"Selected rule: {best_rule.name} (score: {best_score})") 199 | self.logger.info(f"Reasoning: {best_applicability['reasoning']}") 200 | 201 | return best_rule, best_applicability 202 | 203 | def get_applicability_summary(self, commit_data: Dict[str, Any]) -> str: 204 | """Get a human-readable summary of rule applicability""" 205 | 206 | applicability = self.check_rule_applicability(commit_data) 207 | 208 | summary_lines = ["Rule Applicability Analysis:"] 209 | 210 | for rule in self.rule_manager.get_all_rules(): 211 | rule_key = rule.rule_type.value 212 | if rule_key in applicability: 213 | rule_data = applicability[rule_key] 214 | status = "✅ Applicable" if rule_data["applicable"] else "❌ Not Applicable" 215 | summary_lines.append(f" {rule.name}: {status}") 216 | summary_lines.append(f" Reasoning: {rule_data['reasoning']}") 217 | 218 | return "\n".join(summary_lines) 219 | 220 | 221 | class EnhancedInconsistencyRuleManager(InconsistencyRuleManager): 222 | """Enhanced rule manager with applicability checking""" 223 | 224 | def __init__(self, llm_manager): 225 | super().__init__() 226 | self.applicability_checker = RuleApplicabilityChecker(llm_manager) 227 | 228 | def get_best_rule_for_commit(self, commit_data: Dict[str, Any]) -> Tuple[InconsistencyRule, Dict[str, Any]]: 229 | """Get the best rule for given commit data based on applicability and weight""" 230 | return self.applicability_checker.select_best_rule(commit_data) 231 | 232 | def analyze_commit_applicability(self, commit_data: Dict[str, Any]) -> Dict[str, Dict[str, Any]]: 233 | """Analyze which rules are applicable to the commit""" 234 | return self.applicability_checker.check_rule_applicability(commit_data) 235 | 236 | def get_applicability_summary(self, commit_data: Dict[str, Any]) -> str: 237 | """Get human-readable summary of rule applicability""" 238 | return self.applicability_checker.get_applicability_summary(commit_data) 239 | -------------------------------------------------------------------------------- /evaluation/pure_llm/consistency_checker.py: -------------------------------------------------------------------------------- 1 | from langchain_community.chat_models import ChatOpenAI 2 | from langchain_core.prompts import ChatPromptTemplate 3 | from typing import Dict, Any, Optional 4 | import logging 5 | import json 6 | import re 7 | 8 | class ConsistencyChecker: 9 | """ 10 | A class to check consistency between commit messages and code changes using LLM. 11 | """ 12 | 13 | def __init__(self, openai_api_key: str, openai_api_base: str, model: str = "gpt-3.5-turbo"): 14 | """ 15 | Initialize the consistency checker with LLM configuration. 
16 | 17 | Args: 18 | openai_api_key (str): OpenAI API key 19 | openai_api_base (str): OpenAI API base URL 20 | model (str): Model name to use 21 | """ 22 | # Validate input parameters 23 | if not openai_api_key or not openai_api_key.strip(): 24 | raise ValueError("OpenAI API key cannot be empty") 25 | if not openai_api_base or not openai_api_base.strip(): 26 | raise ValueError("OpenAI API base URL cannot be empty") 27 | if not model or not model.strip(): 28 | raise ValueError("Model name cannot be empty") 29 | 30 | self.chat_model = ChatOpenAI( 31 | openai_api_key=openai_api_key, 32 | openai_api_base=openai_api_base, 33 | model=model, 34 | temperature=0 35 | ) 36 | 37 | # Define the system prompt for consistency checking 38 | self.system_prompt = """You are an expert code reviewer tasked with evaluating the consistency between commit messages and their corresponding code changes. 39 | 40 | Your job is to analyze whether a commit message accurately describes the actual code changes made in the diff. 41 | 42 | Consider the following aspects: 43 | 1. Does the commit message describe the actual changes made? 44 | 2. Are the mentioned components/files/functions actually modified? 45 | 3. Is the scope of changes (major/minor) consistent with the message? 46 | 4. Are any important changes missing from the commit message? 47 | 5. Does the commit message contain any false or misleading information? 48 | 49 | Respond with a JSON object containing: 50 | - "consistent": true/false, 51 | - "confidence": 0.0-1.0, 52 | - "reasoning": "detailed explanation of your analysis", 53 | - "issues": ["list of specific inconsistencies found, if any"] 54 | 55 | Be thorough in your analysis and provide clear reasoning for your decision.""" 56 | 57 | self.user_prompt = """Please analyze the consistency between this commit message and the corresponding code diff: 58 | 59 | **Commit Message:** 60 | {commit_message} 61 | 62 | **Code Diff:** 63 | {code_diff} 64 | 65 | Evaluate whether the commit message accurately describes the code changes and respond with the requested JSON format.""" 66 | 67 | def check_consistency(self, commit_message: str, code_diff: str) -> Dict[str, Any]: 68 | """ 69 | Check consistency between a commit message and code diff. 
70 | 71 | Args: 72 | commit_message (str): The commit message to analyze 73 | code_diff (str): The code diff to analyze 74 | 75 | Returns: 76 | Dict[str, Any]: Analysis result containing consistency, confidence, reasoning, and issues 77 | """ 78 | try: 79 | prompt = ChatPromptTemplate.from_messages([ 80 | ("system", self.system_prompt), 81 | ("user", self.user_prompt) 82 | ]) 83 | 84 | # print(prompt) 85 | 86 | chain = prompt | self.chat_model 87 | 88 | response = chain.invoke({ 89 | "commit_message": commit_message, 90 | "code_diff": code_diff 91 | }) 92 | 93 | logging.info(f"LLM Consistency Check Response: {response.content}") 94 | 95 | # Extract token usage from response 96 | prompt_tokens = 0 97 | completion_tokens = 0 98 | total_tokens = 0 99 | 100 | try: 101 | if hasattr(response, 'response_metadata') and response.response_metadata: 102 | token_usage = response.response_metadata.get('token_usage', {}) 103 | prompt_tokens = token_usage.get('prompt_tokens', 0) 104 | completion_tokens = token_usage.get('completion_tokens', 0) 105 | total_tokens = token_usage.get('total_tokens', 0) 106 | elif hasattr(response, 'usage_metadata'): 107 | prompt_tokens = getattr(response.usage_metadata, 'input_tokens', 0) 108 | completion_tokens = getattr(response.usage_metadata, 'output_tokens', 0) 109 | total_tokens = getattr(response.usage_metadata, 'total_tokens', 0) 110 | except Exception as e: 111 | logging.warning(f"Could not extract token usage: {e}") 112 | 113 | # Parse the JSON response 114 | result = self._parse_response(response.content) 115 | 116 | # Add token usage to result 117 | result['prompt_tokens'] = prompt_tokens 118 | result['completion_tokens'] = completion_tokens 119 | result['total_tokens'] = total_tokens 120 | 121 | return result 122 | 123 | except Exception as e: 124 | logging.error(f"Error in consistency check: {e}") 125 | import traceback 126 | traceback.print_exc() 127 | return { 128 | "consistent": False, 129 | "confidence": 0.0, 130 | "reasoning": f"Error occurred during analysis: {str(e)}", 131 | "issues": ["Analysis failed due to technical error"], 132 | "prompt_tokens": 0, 133 | "completion_tokens": 0, 134 | "total_tokens": 0 135 | } 136 | 137 | def _parse_response(self, content: str) -> Dict[str, Any]: 138 | """ 139 | Parse the LLM response to extract the JSON result. 
140 | 141 | Args: 142 | content (str): Raw LLM response content 143 | 144 | Returns: 145 | Dict[str, Any]: Parsed analysis result 146 | """ 147 | try: 148 | content = content.strip() 149 | 150 | # Try to find JSON object in the response 151 | # Look for patterns like: ```json {...} ``` or just {...} 152 | json_pattern = r'```json\s*(\{.*?\})\s*```' 153 | match = re.search(json_pattern, content, re.DOTALL) 154 | 155 | if match: 156 | json_str = match.group(1) 157 | else: 158 | # Try to find JSON object without code blocks 159 | obj_pattern = r'(\{.*?\})' 160 | match = re.search(obj_pattern, content, re.DOTALL) 161 | if match: 162 | json_str = match.group(1) 163 | else: 164 | # Try to extract from the raw content 165 | json_str = content 166 | 167 | # Parse the JSON 168 | result = json.loads(json_str) 169 | 170 | # Validate required fields 171 | required_fields = ["consistent", "confidence", "reasoning"] 172 | for field in required_fields: 173 | if field not in result: 174 | raise ValueError(f"Missing required field: {field}") 175 | 176 | # Ensure issues field exists 177 | if "issues" not in result: 178 | result["issues"] = [] 179 | 180 | # Validate data types 181 | if not isinstance(result["consistent"], bool): 182 | result["consistent"] = str(result["consistent"]).lower() in ["true", "yes", "1"] 183 | 184 | if not isinstance(result["confidence"], (int, float)): 185 | try: 186 | result["confidence"] = float(result["confidence"]) 187 | except: 188 | result["confidence"] = 0.5 189 | 190 | # Clamp confidence to [0, 1] range 191 | result["confidence"] = max(0.0, min(1.0, float(result["confidence"]))) 192 | 193 | if not isinstance(result["reasoning"], str): 194 | result["reasoning"] = str(result["reasoning"]) 195 | 196 | if not isinstance(result["issues"], list): 197 | result["issues"] = [str(result["issues"])] if result["issues"] else [] 198 | 199 | logging.info(f"Parsed consistency result: consistent={result['consistent']}, confidence={result['confidence']}") 200 | return result 201 | 202 | except json.JSONDecodeError as e: 203 | logging.error(f"JSON parsing error: {e}") 204 | logging.error(f"Response content: {content}") 205 | 206 | # Fallback: try to extract information manually 207 | return self._fallback_parse(content) 208 | 209 | except Exception as e: 210 | logging.error(f"Error parsing response: {e}") 211 | return { 212 | "consistent": False, 213 | "confidence": 0.0, 214 | "reasoning": f"Failed to parse LLM response: {str(e)}", 215 | "issues": ["Response parsing failed"], 216 | "prompt_tokens": 0, 217 | "completion_tokens": 0, 218 | "total_tokens": 0 219 | } 220 | 221 | def _fallback_parse(self, content: str) -> Dict[str, Any]: 222 | """ 223 | Fallback method to extract information when JSON parsing fails. 
224 | 225 | Args: 226 | content (str): Raw LLM response content 227 | 228 | Returns: 229 | Dict[str, Any]: Best-effort parsed result 230 | """ 231 | try: 232 | content_lower = content.lower() 233 | 234 | # Try to determine consistency; check inconsistency phrases first, since "inconsistent" contains the substring "consistent" 235 | consistent = False 236 | if any(phrase in content_lower for phrase in ["inconsistent", "mismatch", "inaccurate", "incorrect", "false"]): 237 | consistent = False 238 | elif any(phrase in content_lower for phrase in ["consistent", "matches", "accurate", "correct"]): 239 | consistent = True 240 | 241 | # Try to extract confidence 242 | confidence = 0.5 243 | confidence_patterns = [ 244 | r'confidence[:\s]+([0-9]*\.?[0-9]+)', 245 | r'([0-9]*\.?[0-9]+)\s*confidence', 246 | r'confidence.*?([0-9]*\.?[0-9]+)' 247 | ] 248 | 249 | for pattern in confidence_patterns: 250 | match = re.search(pattern, content_lower) 251 | if match: 252 | try: 253 | conf_val = float(match.group(1)) 254 | if conf_val <= 1.0: 255 | confidence = conf_val 256 | elif conf_val <= 100: 257 | confidence = conf_val / 100 258 | break 259 | except: 260 | continue 261 | 262 | return { 263 | "consistent": consistent, 264 | "confidence": confidence, 265 | "reasoning": content.strip(), 266 | "issues": ["Response format was not JSON, used fallback parsing"], 267 | "prompt_tokens": 0, 268 | "completion_tokens": 0, 269 | "total_tokens": 0 270 | } 271 | 272 | except Exception as e: 273 | logging.error(f"Fallback parsing failed: {e}") 274 | return { 275 | "consistent": False, 276 | "confidence": 0.0, 277 | "reasoning": content.strip() if content else "No response content", 278 | "issues": ["Both JSON and fallback parsing failed"], 279 | "prompt_tokens": 0, 280 | "completion_tokens": 0, 281 | "total_tokens": 0 282 | } 283 | 284 | def batch_check(self, data: list) -> list: 285 | """ 286 | Check consistency for multiple commit-diff pairs.
287 | 288 | Args: 289 | data (list): List of dictionaries with 'commit_message' and 'code_diff' keys 290 | 291 | Returns: 292 | list: List of analysis results 293 | """ 294 | results = [] 295 | 296 | for i, item in enumerate(data): 297 | try: 298 | commit_message = item.get('commit_message', '') 299 | code_diff = item.get('code_diff', '') 300 | 301 | if not commit_message or not code_diff: 302 | logging.warning(f"Skipping item {i}: missing commit_message or code_diff") 303 | results.append({ 304 | "consistent": False, 305 | "confidence": 0.0, 306 | "reasoning": "Missing commit message or code diff", 307 | "issues": ["Incomplete input data"], 308 | "prompt_tokens": 0, 309 | "completion_tokens": 0, 310 | "total_tokens": 0 311 | }) 312 | continue 313 | 314 | result = self.check_consistency(commit_message, code_diff) 315 | results.append(result) 316 | 317 | logging.info(f"Processed item {i+1}/{len(data)}") 318 | 319 | except Exception as e: 320 | logging.error(f"Error processing item {i}: {e}") 321 | results.append({ 322 | "consistent": False, 323 | "confidence": 0.0, 324 | "reasoning": f"Error processing item: {str(e)}", 325 | "issues": ["Processing error"], 326 | "prompt_tokens": 0, 327 | "completion_tokens": 0, 328 | "total_tokens": 0 329 | }) 330 | 331 | return results 332 | -------------------------------------------------------------------------------- /evaluation/CoT/consistency_checker.py: -------------------------------------------------------------------------------- 1 | from langchain_community.chat_models import ChatOpenAI 2 | from langchain_core.prompts import ChatPromptTemplate 3 | from typing import Dict, Any, Optional 4 | import logging 5 | import json 6 | import re 7 | 8 | class ConsistencyChecker: 9 | """ 10 | A class to check consistency between commit messages and code changes using LLM with Chain of Thought reasoning. 11 | """ 12 | 13 | def __init__(self, openai_api_key: str, openai_api_base: str, model: str = "gpt-3.5-turbo"): 14 | """ 15 | Initialize the consistency checker with LLM configuration. 16 | 17 | Args: 18 | openai_api_key (str): OpenAI API key 19 | openai_api_base (str): OpenAI API base URL 20 | model (str): Model name to use 21 | """ 22 | # Validate input parameters 23 | if not openai_api_key or not openai_api_key.strip(): 24 | raise ValueError("OpenAI API key cannot be empty") 25 | if not openai_api_base or not openai_api_base.strip(): 26 | raise ValueError("OpenAI API base URL cannot be empty") 27 | if not model or not model.strip(): 28 | raise ValueError("Model name cannot be empty") 29 | 30 | self.chat_model = ChatOpenAI( 31 | openai_api_key=openai_api_key, 32 | openai_api_base=openai_api_base, 33 | model=model, 34 | temperature=0 35 | ) 36 | 37 | # Define the system prompt with Chain of Thought reasoning for consistency checking 38 | self.system_prompt = """You are an expert code reviewer tasked with evaluating the consistency between commit messages and their corresponding code changes. 39 | 40 | Your job is to analyze whether a commit message accurately describes the actual code changes made in the diff using a step-by-step Chain of Thought approach. 41 | 42 | Please follow this structured reasoning process: 43 | 44 | **Step 1: Understand the Commit Message** 45 | - What is the main purpose/goal described in the commit message? 46 | - What specific changes or fixes does it claim to make? 47 | - What components, files, or functionality does it mention? 48 | 49 | **Step 2: Analyze the Code Diff** 50 | - What files are actually modified? 
51 | - What specific code changes are made (additions, deletions, modifications)? 52 | - What functionality is actually being changed or implemented? 53 | 54 | **Step 3: Compare Message vs Reality** 55 | - Do the claimed changes in the message match the actual code changes? 56 | - Are the mentioned components/files actually modified? 57 | - Is the scope of changes (major/minor) consistent with the message? 58 | - Are there any important changes in the code that are not mentioned in the message? 59 | - Are there any claims in the message that are not supported by the code changes? 60 | 61 | **Step 4: Identify Inconsistencies (if any)** 62 | - List specific discrepancies between the message and code 63 | - Note any misleading or false information in the commit message 64 | - Identify missing information that should have been mentioned 65 | 66 | **Step 5: Make Final Decision** 67 | - Based on the analysis above, determine if the message is consistent with the code changes 68 | - Assign a confidence level (0.0-1.0) based on the clarity and strength of the evidence 69 | - Provide a clear reasoning for the decision 70 | 71 | Consider the following aspects in your analysis: 72 | 1. Does the commit message describe the actual changes made? 73 | 2. Are the mentioned components/files/functions actually modified? 74 | 3. Is the scope of changes (major/minor) consistent with the message? 75 | 4. Are any important changes missing from the commit message? 76 | 5. Does the commit message contain any false or misleading information? 77 | 78 | Respond with a JSON object containing: 79 | - "consistent": true/false, 80 | - "confidence": 0.0-1.0, 81 | - "reasoning": "detailed step-by-step explanation following the Chain of Thought process above", 82 | - "issues": ["list of specific inconsistencies found, if any"] 83 | 84 | Be thorough in your step-by-step analysis and provide clear reasoning for your decision.""" 85 | 86 | self.user_prompt = """Please analyze the consistency between this commit message and the corresponding code diff using the Chain of Thought approach: 87 | 88 | **Commit Message:** 89 | {commit_message} 90 | 91 | **Code Diff:** 92 | {code_diff} 93 | 94 | Follow the 5-step Chain of Thought process outlined in the system prompt to evaluate whether the commit message accurately describes the code changes, and respond with the requested JSON format.""" 95 | 96 | def check_consistency(self, commit_message: str, code_diff: str) -> Dict[str, Any]: 97 | """ 98 | Check consistency between a commit message and code diff using Chain of Thought reasoning. 
99 | 100 | Args: 101 | commit_message (str): The commit message to analyze 102 | code_diff (str): The code diff to analyze 103 | 104 | Returns: 105 | Dict[str, Any]: Analysis result containing consistency, confidence, reasoning, and issues 106 | """ 107 | try: 108 | prompt = ChatPromptTemplate.from_messages([ 109 | ("system", self.system_prompt), 110 | ("user", self.user_prompt) 111 | ]) 112 | 113 | # print(prompt) 114 | 115 | chain = prompt | self.chat_model 116 | 117 | response = chain.invoke({ 118 | "commit_message": commit_message, 119 | "code_diff": code_diff 120 | }) 121 | 122 | logging.info(f"LLM CoT Consistency Check Response: {response.content}") 123 | 124 | # Extract token usage from response 125 | prompt_tokens = 0 126 | completion_tokens = 0 127 | total_tokens = 0 128 | 129 | try: 130 | if hasattr(response, 'response_metadata') and response.response_metadata: 131 | token_usage = response.response_metadata.get('token_usage', {}) 132 | prompt_tokens = token_usage.get('prompt_tokens', 0) 133 | completion_tokens = token_usage.get('completion_tokens', 0) 134 | total_tokens = token_usage.get('total_tokens', 0) 135 | elif hasattr(response, 'usage_metadata'): 136 | prompt_tokens = getattr(response.usage_metadata, 'input_tokens', 0) 137 | completion_tokens = getattr(response.usage_metadata, 'output_tokens', 0) 138 | total_tokens = getattr(response.usage_metadata, 'total_tokens', 0) 139 | except Exception as e: 140 | logging.warning(f"Could not extract token usage: {e}") 141 | 142 | # Parse the JSON response 143 | result = self._parse_response(response.content) 144 | 145 | # Add token usage to result 146 | result['prompt_tokens'] = prompt_tokens 147 | result['completion_tokens'] = completion_tokens 148 | result['total_tokens'] = total_tokens 149 | 150 | return result 151 | 152 | except Exception as e: 153 | logging.error(f"Error in CoT consistency check: {e}") 154 | import traceback 155 | traceback.print_exc() 156 | return { 157 | "consistent": False, 158 | "confidence": 0.0, 159 | "reasoning": f"Error occurred during CoT analysis: {str(e)}", 160 | "issues": ["Analysis failed due to technical error"], 161 | "prompt_tokens": 0, 162 | "completion_tokens": 0, 163 | "total_tokens": 0 164 | } 165 | 166 | def _parse_response(self, content: str) -> Dict[str, Any]: 167 | """ 168 | Parse the LLM response to extract the JSON result. 
169 | 170 | Args: 171 | content (str): Raw LLM response content 172 | 173 | Returns: 174 | Dict[str, Any]: Parsed analysis result 175 | """ 176 | try: 177 | content = content.strip() 178 | 179 | # Try to find JSON object in the response 180 | # Look for patterns like: ```json {...} ``` or just {...} 181 | json_pattern = r'```json\s*(\{.*?\})\s*```' 182 | match = re.search(json_pattern, content, re.DOTALL) 183 | 184 | if match: 185 | json_str = match.group(1) 186 | else: 187 | # Try to find JSON object without code blocks 188 | obj_pattern = r'(\{.*?\})' 189 | match = re.search(obj_pattern, content, re.DOTALL) 190 | if match: 191 | json_str = match.group(1) 192 | else: 193 | # Try to extract from the raw content 194 | json_str = content 195 | 196 | # Parse the JSON 197 | result = json.loads(json_str) 198 | 199 | # Validate required fields 200 | required_fields = ["consistent", "confidence", "reasoning"] 201 | for field in required_fields: 202 | if field not in result: 203 | raise ValueError(f"Missing required field: {field}") 204 | 205 | # Ensure issues field exists 206 | if "issues" not in result: 207 | result["issues"] = [] 208 | 209 | # Validate data types 210 | if not isinstance(result["consistent"], bool): 211 | result["consistent"] = str(result["consistent"]).lower() in ["true", "yes", "1"] 212 | 213 | if not isinstance(result["confidence"], (int, float)): 214 | try: 215 | result["confidence"] = float(result["confidence"]) 216 | except: 217 | result["confidence"] = 0.5 218 | 219 | # Clamp confidence to [0, 1] range 220 | result["confidence"] = max(0.0, min(1.0, float(result["confidence"]))) 221 | 222 | if not isinstance(result["reasoning"], str): 223 | result["reasoning"] = str(result["reasoning"]) 224 | 225 | if not isinstance(result["issues"], list): 226 | result["issues"] = [str(result["issues"])] if result["issues"] else [] 227 | 228 | logging.info(f"Parsed CoT consistency result: consistent={result['consistent']}, confidence={result['confidence']}") 229 | return result 230 | 231 | except json.JSONDecodeError as e: 232 | logging.error(f"JSON parsing error: {e}") 233 | logging.error(f"Response content: {content}") 234 | 235 | # Fallback: try to extract information manually 236 | return self._fallback_parse(content) 237 | 238 | except Exception as e: 239 | logging.error(f"Error parsing response: {e}") 240 | return { 241 | "consistent": False, 242 | "confidence": 0.0, 243 | "reasoning": f"Failed to parse LLM response: {str(e)}", 244 | "issues": ["Response parsing failed"], 245 | "prompt_tokens": 0, 246 | "completion_tokens": 0, 247 | "total_tokens": 0 248 | } 249 | 250 | def _fallback_parse(self, content: str) -> Dict[str, Any]: 251 | """ 252 | Fallback method to extract information when JSON parsing fails. 
253 | 254 | Args: 255 | content (str): Raw LLM response content 256 | 257 | Returns: 258 | Dict[str, Any]: Best-effort parsed result 259 | """ 260 | try: 261 | content_lower = content.lower() 262 | 263 | # Try to determine consistency; check inconsistency phrases first, since "inconsistent" contains the substring "consistent" 264 | consistent = False 265 | if any(phrase in content_lower for phrase in ["inconsistent", "mismatch", "inaccurate", "incorrect", "false"]): 266 | consistent = False 267 | elif any(phrase in content_lower for phrase in ["consistent", "matches", "accurate", "correct"]): 268 | consistent = True 269 | 270 | # Try to extract confidence 271 | confidence = 0.5 272 | confidence_patterns = [ 273 | r'confidence[:\s]+([0-9]*\.?[0-9]+)', 274 | r'([0-9]*\.?[0-9]+)\s*confidence', 275 | r'confidence.*?([0-9]*\.?[0-9]+)' 276 | ] 277 | 278 | for pattern in confidence_patterns: 279 | match = re.search(pattern, content_lower) 280 | if match: 281 | try: 282 | conf_val = float(match.group(1)) 283 | if conf_val <= 1.0: 284 | confidence = conf_val 285 | elif conf_val <= 100: 286 | confidence = conf_val / 100 287 | break 288 | except: 289 | continue 290 | 291 | return { 292 | "consistent": consistent, 293 | "confidence": confidence, 294 | "reasoning": content.strip(), 295 | "issues": ["Response format was not JSON, used fallback parsing"], 296 | "prompt_tokens": 0, 297 | "completion_tokens": 0, 298 | "total_tokens": 0 299 | } 300 | 301 | except Exception as e: 302 | logging.error(f"Fallback parsing failed: {e}") 303 | return { 304 | "consistent": False, 305 | "confidence": 0.0, 306 | "reasoning": content.strip() if content else "No response content", 307 | "issues": ["Both JSON and fallback parsing failed"], 308 | "prompt_tokens": 0, 309 | "completion_tokens": 0, 310 | "total_tokens": 0 311 | } 312 | 313 | def batch_check(self, data: list) -> list: 314 | """ 315 | Check consistency for multiple commit-diff pairs using Chain of Thought reasoning.
316 | 317 | Args: 318 | data (list): List of dictionaries with 'commit_message' and 'code_diff' keys 319 | 320 | Returns: 321 | list: List of analysis results 322 | """ 323 | results = [] 324 | 325 | for i, item in enumerate(data): 326 | try: 327 | commit_message = item.get('commit_message', '') 328 | code_diff = item.get('code_diff', '') 329 | 330 | if not commit_message or not code_diff: 331 | logging.warning(f"Skipping item {i}: missing commit_message or code_diff") 332 | results.append({ 333 | "consistent": False, 334 | "confidence": 0.0, 335 | "reasoning": "Missing commit message or code diff", 336 | "issues": ["Incomplete input data"], 337 | "prompt_tokens": 0, 338 | "completion_tokens": 0, 339 | "total_tokens": 0 340 | }) 341 | continue 342 | 343 | result = self.check_consistency(commit_message, code_diff) 344 | results.append(result) 345 | 346 | logging.info(f"Processed item {i+1}/{len(data)} with CoT reasoning") 347 | 348 | except Exception as e: 349 | logging.error(f"Error processing item {i}: {e}") 350 | results.append({ 351 | "consistent": False, 352 | "confidence": 0.0, 353 | "reasoning": f"Error processing item: {str(e)}", 354 | "issues": ["Processing error"], 355 | "prompt_tokens": 0, 356 | "completion_tokens": 0, 357 | "total_tokens": 0 358 | }) 359 | 360 | return results 361 | -------------------------------------------------------------------------------- /data_synthesis/llm_interface.py: -------------------------------------------------------------------------------- 1 | """ 2 | LLM interface module for calling large language models to generate inconsistent commit messages 3 | """ 4 | 5 | import json 6 | import logging 7 | import re 8 | from typing import Dict, Any, Optional 9 | from abc import ABC, abstractmethod 10 | 11 | try: 12 | from langchain_community.chat_models import ChatOpenAI 13 | from langchain_core.prompts import ChatPromptTemplate 14 | from langchain_core.messages import HumanMessage, SystemMessage 15 | LANGCHAIN_AVAILABLE = True 16 | except ImportError: 17 | LANGCHAIN_AVAILABLE = False 18 | logging.warning("Langchain not available, using mock responses") 19 | 20 | 21 | class LLMInterface(ABC): 22 | """LLM interface abstract base class""" 23 | 24 | @abstractmethod 25 | def generate_inconsistent_message(self, prompt: str) -> Optional[str]: 26 | """Generate inconsistent commit message""" 27 | pass 28 | 29 | @abstractmethod 30 | def query(self, prompt: str) -> Optional[str]: 31 | """General LLM query method for any type of analysis""" 32 | pass 33 | 34 | 35 | class LangChainOpenAIInterface(LLMInterface): 36 | """LangChain OpenAI interface matching model_manager.py style""" 37 | 38 | def __init__(self, api_key: str, base_url: str = "", 39 | model: str = "gpt-3.5-turbo", max_retries: int = 3): 40 | # Validate input parameters 41 | if not api_key or not api_key.strip(): 42 | raise ValueError("OpenAI API key cannot be empty") 43 | if not base_url or not base_url.strip(): 44 | raise ValueError("OpenAI API base URL cannot be empty") 45 | if not model or not model.strip(): 46 | raise ValueError("Model name cannot be empty") 47 | 48 | self.api_key = api_key 49 | self.base_url = base_url 50 | self.model = model 51 | self.max_retries = max_retries 52 | self.logger = logging.getLogger(__name__) 53 | 54 | if not LANGCHAIN_AVAILABLE: 55 | self.logger.warning("Langchain not available, will use mock responses") 56 | self.chat_model = None 57 | elif api_key.lower() == "empty": 58 | self.logger.warning("API key is empty, will use mock responses") 59 | self.chat_model = None 
60 | else: 61 | try: 62 | self.chat_model = ChatOpenAI( 63 | openai_api_key=api_key, 64 | openai_api_base=base_url, 65 | model=model, 66 | temperature=0 67 | ) 68 | except Exception as e: 69 | self.logger.error(f"Failed to initialize ChatOpenAI: {e}") 70 | self.chat_model = None 71 | 72 | def generate_inconsistent_message(self, prompt: str) -> Optional[str]: 73 | """Call LangChain OpenAI to generate inconsistent commit message""" 74 | if not self.chat_model: 75 | self.logger.warning("Chat model not available, returning mock response") 76 | return self._get_mock_response() 77 | 78 | try: 79 | # Use direct message construction to avoid template parsing issues with special characters 80 | # print(f"🔍 Generating inconsistent message with prompt: {prompt}") 81 | 82 | # Create messages directly without template parsing to handle special characters like {} 83 | system_message = SystemMessage(content="You are a helpful assistant that generates inconsistent commit messages for testing purposes. Always return valid JSON with a single 'message' field.") 84 | human_message = HumanMessage(content=prompt) 85 | 86 | # Call the model directly with messages 87 | print("send message...") 88 | response = self.chat_model.invoke([system_message, human_message]) 89 | print("finish send message") 90 | 91 | self.logger.info(f"LLM Response: {response.content}") 92 | 93 | # Parse the response content to extract message 94 | try: 95 | content = response.content.strip() 96 | 97 | # Try to find JSON in the response (similar to model_manager.py parsing logic) 98 | # Look for patterns like: ```json {...} ``` or just {...} 99 | json_pattern = r'```json\s*(\{.*?\})\s*```' 100 | match = re.search(json_pattern, content, re.DOTALL) 101 | 102 | if match: 103 | json_str = match.group(1) 104 | else: 105 | # Try to find object without code blocks 106 | object_pattern = r'(\{.*?\})' 107 | match = re.search(object_pattern, content, re.DOTALL) 108 | if match: 109 | json_str = match.group(1) 110 | else: 111 | # If no JSON found, try to extract from quotes 112 | quote_pattern = r'"message"\s*:\s*"([^"]+)"' 113 | match = re.search(quote_pattern, content) 114 | if match: 115 | return match.group(1) 116 | else: 117 | self.logger.warning("Could not find JSON or message in response") 118 | return self._extract_message_from_text(content) 119 | 120 | # Parse the JSON 121 | parsed_response = json.loads(json_str) 122 | 123 | # Extract message field 124 | if isinstance(parsed_response, dict) and 'message' in parsed_response: 125 | message = parsed_response['message'] 126 | if isinstance(message, str) and message.strip(): 127 | return message.strip() 128 | else: 129 | self.logger.warning("Message field is empty or not a string") 130 | return self._extract_message_from_text(content) 131 | else: 132 | self.logger.warning("Parsed JSON does not contain 'message' field") 133 | return self._extract_message_from_text(content) 134 | 135 | except json.JSONDecodeError as e: 136 | self.logger.error(f"Error parsing JSON from LLM response: {e}") 137 | self.logger.debug(f"Response content: {content}") 138 | 139 | # Fallback: try to extract meaningful text 140 | return self._extract_message_from_text(content) 141 | 142 | except Exception as e: 143 | self.logger.error(f"Error in LLM request: {e}") 144 | import traceback 145 | traceback.print_exc() 146 | return self._get_mock_response() 147 | 148 | def _extract_message_from_text(self, text: str) -> str: 149 | """Extract possible commit message from text (fallback method)""" 150 | try: 151 | # Clean up the text 
152 | text = text.strip() 153 | 154 | # Look for quoted strings that could be commit messages 155 | quote_patterns = [ 156 | r'"([^"]+)"', 157 | r"'([^']+)'", 158 | r'`([^`]+)`' 159 | ] 160 | 161 | for pattern in quote_patterns: 162 | matches = re.findall(pattern, text) 163 | for match in matches: 164 | if len(match.strip()) > 10: # Reasonable commit message length 165 | return match.strip() 166 | 167 | # If no quoted strings, look for lines that could be commit messages 168 | lines = text.split('\n') 169 | for line in lines: 170 | line = line.strip() 171 | if (line and 172 | not line.startswith('{') and 173 | not line.startswith('[') and 174 | not line.startswith('```') and 175 | len(line) > 10 and 176 | len(line) < 200): # Reasonable commit message bounds 177 | return line 178 | 179 | # Last resort: return the first reasonable line 180 | if text and len(text) > 10: 181 | return text.split('\n')[0][:100] # Truncate if too long 182 | 183 | return "Generated inconsistent commit message" 184 | 185 | except Exception as e: 186 | self.logger.error(f"Error extracting message from text: {e}") 187 | return "Generated inconsistent commit message" 188 | 189 | def _get_mock_response(self) -> str: 190 | """Return mock response for testing""" 191 | return "Mock inconsistent commit message for testing" 192 | 193 | def query(self, prompt: str) -> Optional[str]: 194 | """General LLM query method for any type of analysis""" 195 | if not self.chat_model: 196 | self.logger.warning("Chat model not available, returning mock response") 197 | return self._get_mock_query_response() 198 | 199 | try: 200 | # Use direct message construction to avoid template parsing issues with special characters 201 | self.logger.debug(f"Sending general query to LLM") 202 | 203 | # Create messages directly without template parsing 204 | system_message = SystemMessage(content="You are a helpful assistant that provides detailed analysis and responds in the requested format.") 205 | human_message = HumanMessage(content=prompt) 206 | 207 | # Call the model directly with messages 208 | response = self.chat_model.invoke([system_message, human_message]) 209 | 210 | self.logger.info(f"LLM Query Response received") 211 | 212 | # Process the response to extract JSON 213 | if response.content: 214 | raw_content = response.content.strip() 215 | return self._extract_json_from_response(raw_content) 216 | else: 217 | return None 218 | 219 | except Exception as e: 220 | self.logger.error(f"Error in LLM query: {e}") 221 | import traceback 222 | traceback.print_exc() 223 | return self._get_mock_query_response() 224 | 225 | def _extract_json_from_response(self, raw_response: str) -> Optional[str]: 226 | """Extract JSON string from LLM response""" 227 | import re 228 | 229 | try: 230 | # First, try to parse the response directly as JSON 231 | json.loads(raw_response) 232 | return raw_response 233 | except json.JSONDecodeError: 234 | pass 235 | 236 | # Try to find JSON within the response 237 | # Look for content between { and } 238 | json_match = re.search(r'\{.*\}', raw_response, re.DOTALL) 239 | if json_match: 240 | json_str = json_match.group(0) 241 | try: 242 | # Validate that it's proper JSON 243 | json.loads(json_str) 244 | return json_str 245 | except json.JSONDecodeError: 246 | pass 247 | 248 | # Try to find content between ```json and ``` or ``` 249 | code_block_match = re.search(r'```(?:json)?\s*(.*?)\s*```', raw_response, re.DOTALL) 250 | if code_block_match: 251 | json_str = code_block_match.group(1).strip() 252 | try: 253 | 
json.loads(json_str) 254 | return json_str 255 | except json.JSONDecodeError: 256 | pass 257 | 258 | # If no valid JSON found, log warning and return mock response 259 | self.logger.warning(f"Could not extract valid JSON from response: {raw_response[:200]}...") 260 | return self._get_mock_query_response() 261 | 262 | def _get_mock_query_response(self) -> str: 263 | """Return mock response for general queries""" 264 | return '{"function_name_mismatch": {"applicable": true, "reasoning": "Mock reasoning"}, "file_path_mismatch": {"applicable": false, "reasoning": "Mock reasoning"}, "operation_type_mismatch": {"applicable": true, "reasoning": "Mock reasoning"}, "purpose_mismatch": {"applicable": false, "reasoning": "Mock reasoning"}, "component_mismatch": {"applicable": false, "reasoning": "Mock reasoning"}, "feature_missing": {"applicable": false, "reasoning": "Mock reasoning"}, "extra_feature": {"applicable": true, "reasoning": "Mock reasoning"}}' 265 | 266 | 267 | class MockLLMInterface(LLMInterface): 268 | """Mock LLM interface for testing when LangChain is not available""" 269 | 270 | def __init__(self, **kwargs): 271 | self.logger = logging.getLogger(__name__) 272 | 273 | def generate_inconsistent_message(self, prompt: str) -> Optional[str]: 274 | """Generate a mock inconsistent commit message""" 275 | self.logger.info("Using mock LLM interface") 276 | 277 | # Extract some context from prompt to make mock more realistic 278 | if "function" in prompt.lower(): 279 | return "Fix bug in login_handler() method" 280 | elif "file" in prompt.lower(): 281 | return "Update database_config.py settings" 282 | elif "operation" in prompt.lower(): 283 | return "Add new user registration feature" 284 | else: 285 | return "Mock inconsistent commit message for testing" 286 | 287 | def query(self, message: str) -> Optional[str]: 288 | """Return mock response for general queries""" 289 | self.logger.info("Using mock LLM interface for query") 290 | return '{"function_name_mismatch": {"applicable": true, "reasoning": "Mock reasoning"}, "file_path_mismatch": {"applicable": false, "reasoning": "Mock reasoning"}, "operation_type_mismatch": {"applicable": true, "reasoning": "Mock reasoning"}, "purpose_mismatch": {"applicable": false, "reasoning": "Mock reasoning"}, "component_mismatch": {"applicable": false, "reasoning": "Mock reasoning"}, "feature_missing": {"applicable": false, "reasoning": "Mock reasoning"}, "extra_feature": {"applicable": true, "reasoning": "Mock reasoning"}}' 291 | 292 | 293 | class LLMManager: 294 | """LLM manager supporting multiple LLM interfaces""" 295 | 296 | def __init__(self, llm_config: Dict[str, Any]): 297 | self.logger = logging.getLogger(__name__) 298 | self.llm_interface = self._create_llm_interface(llm_config) 299 | self.llm_config_test = llm_config 300 | 301 | def _create_llm_interface(self, config: Dict[str, Any]) -> LLMInterface: 302 | """Create LLM interface based on configuration""" 303 | provider = config.get('provider', 'openai').lower() 304 | 305 | if provider == 'openai': 306 | if not LANGCHAIN_AVAILABLE: 307 | self.logger.warning("LangChain not available, using mock interface") 308 | return MockLLMInterface(**config) 309 | 310 | return LangChainOpenAIInterface( 311 | api_key=config.get('api_key', 'empty'), 312 | base_url=config.get('base_url', ''), 313 | model=config.get('model', 'gpt-3.5-turbo'), 314 | max_retries=config.get('max_retries', 3) 315 | ) 316 | else: 317 | raise ValueError(f"Unsupported LLM provider: {provider}") 318 | 319 | def 
generate_inconsistent_message(self, prompt: str) -> Optional[str]: 320 | """Generate inconsistent commit message""" 321 | try: 322 | return self.llm_interface.generate_inconsistent_message(prompt) 323 | except Exception as e: 324 | self.logger.error(f"Failed to generate inconsistent message: {str(e)}") 325 | return "Error generating inconsistent commit message" 326 | 327 | def query(self, message: str) -> Optional[str]: 328 | """Perform general query to LLM""" 329 | try: 330 | return self.llm_interface.query(message) 331 | except Exception as e: 332 | self.logger.error(f"Failed to query LLM: {str(e)}") 333 | return "Error querying LLM" 334 | -------------------------------------------------------------------------------- /data_synthesis/data_validator.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data validation module for verifying the quality of generated inconsistent commit messages 3 | """ 4 | 5 | import json 6 | import logging 7 | from pathlib import Path 8 | from typing import Dict, Any, Tuple, Optional 9 | from dataclasses import dataclass 10 | 11 | from llm_interface import LLMManager 12 | 13 | 14 | @dataclass 15 | class ValidationResult: 16 | """Validation result for a single data entry""" 17 | line_number: int 18 | original_entry: Dict[str, Any] 19 | groundtruth_message: str 20 | consistency_with_new_message: bool 21 | consistency_with_original_message: bool 22 | new_message_reasoning: str 23 | original_message_reasoning: str 24 | is_valid: bool # True if inconsistent with new but consistent with original 25 | 26 | 27 | class DataValidator: 28 | """Validate generated inconsistent data quality""" 29 | 30 | def __init__(self, llm_manager: LLMManager, log_file: str = "valid.log"): 31 | self.llm_manager = llm_manager 32 | self.logger = logging.getLogger(__name__) 33 | self.log_file = log_file 34 | 35 | # Set up validation log file 36 | self.validation_logger = logging.getLogger("validation") 37 | self.validation_logger.setLevel(logging.INFO) 38 | 39 | # Create file handler for validation results 40 | file_handler = logging.FileHandler(self.log_file, mode='w', encoding='utf-8') 41 | file_handler.setLevel(logging.INFO) 42 | 43 | # Create formatter 44 | formatter = logging.Formatter('%(asctime)s - %(message)s') 45 | file_handler.setFormatter(formatter) 46 | 47 | # Add handler to validation logger 48 | self.validation_logger.addHandler(file_handler) 49 | 50 | # Log header 51 | self.validation_logger.info("=" * 80) 52 | self.validation_logger.info("Data Validation Results Log") 53 | self.validation_logger.info("=" * 80) 54 | 55 | def _generate_groundtruth_message(self, diff: str) -> Optional[str]: 56 | """Generate groundtruth commit message from diff""" 57 | 58 | prompt = f""" 59 | You are an expert software developer. Given the following code diff, generate a clear and accurate commit message that precisely describes the changes made. 60 | 61 | The commit message should: 62 | 1. Be concise but descriptive 63 | 2. Use conventional commit format if applicable 64 | 3. Accurately reflect what was actually changed in the code 65 | 4. 
Focus on the main purpose/intent of the changes 66 | 67 | Code diff: 68 | {diff} 69 | 70 | Please respond with a JSON object in this exact format: 71 | {{ 72 | "message": "your generated commit message here" 73 | }} 74 | """ 75 | 76 | try: 77 | response = self.llm_manager.query(prompt) 78 | if response: 79 | # Parse JSON response 80 | result = json.loads(response) 81 | return result.get("message", "").strip() 82 | return None 83 | except Exception as e: 84 | self.logger.error(f"Error generating groundtruth message: {e}") 85 | return None 86 | 87 | def _check_message_consistency(self, groundtruth: str, test_message: str, message_type: str) -> Tuple[bool, str]: 88 | """Check if two messages are consistent""" 89 | 90 | prompt = f""" 91 | You are an expert at analyzing commit messages. Compare the following two commit messages and determine if they describe the same changes with the same intent. 92 | 93 | Groundtruth message (generated from actual code diff): {groundtruth} 94 | {message_type} message: {test_message} 95 | 96 | Please analyze: 97 | 1. Do both messages describe the same type of operation (e.g., fix, add, update, refactor)? 98 | 2. Do both messages refer to the same components/files/functions? 99 | 3. Do both messages convey the same purpose/intent? 100 | 4. Are the key details consistent between them? 101 | 102 | Consider the messages CONSISTENT if they describe the same changes with the same intent, even if wording differs slightly. 103 | Consider them INCONSISTENT if they describe different changes, different purposes, or contradictory information. 104 | 105 | Respond with a JSON object in this exact format: 106 | {{ 107 | "consistent": true/false, 108 | "reasoning": "detailed explanation of your analysis and decision" 109 | }} 110 | """ 111 | 112 | try: 113 | response = self.llm_manager.query(prompt) 114 | if response: 115 | result = json.loads(response) 116 | return result.get("consistent", False), result.get("reasoning", "No reasoning provided") 117 | return False, "Failed to get LLM response" 118 | except Exception as e: 119 | self.logger.error(f"Error checking message consistency: {e}") 120 | return False, f"Error in consistency check: {str(e)}" 121 | 122 | def validate_entry(self, entry: Dict[str, Any], line_number: int) -> ValidationResult: 123 | """Validate a single data entry""" 124 | 125 | # Extract required fields 126 | new_message = entry.get('message', '') 127 | original_message = entry.get('original_message', '') 128 | diff = entry.get('diff', '') 129 | 130 | self.logger.info(f"Validating entry {line_number}") 131 | 132 | # Generate groundtruth message from diff 133 | groundtruth = self._generate_groundtruth_message(diff) 134 | if not groundtruth: 135 | self.logger.warning(f"Failed to generate groundtruth for entry {line_number}") 136 | return ValidationResult( 137 | line_number=line_number, 138 | original_entry=entry, 139 | groundtruth_message="", 140 | consistency_with_new_message=False, 141 | consistency_with_original_message=False, 142 | new_message_reasoning="Failed to generate groundtruth", 143 | original_message_reasoning="Failed to generate groundtruth", 144 | is_valid=False 145 | ) 146 | 147 | # Check consistency with new message 148 | consistent_with_new, new_reasoning = self._check_message_consistency( 149 | groundtruth, new_message, "New" 150 | ) 151 | 152 | # Check consistency with original message 153 | consistent_with_original, original_reasoning = self._check_message_consistency( 154 | groundtruth, original_message, "Original" 155 | ) 156 | 157 | # 
Determine if entry is valid 158 | # Valid if: inconsistent with new message AND consistent with original message 159 | is_valid = (not consistent_with_new) and consistent_with_original 160 | 161 | # Create validation result 162 | result = ValidationResult( 163 | line_number=line_number, 164 | original_entry=entry, 165 | groundtruth_message=groundtruth, 166 | consistency_with_new_message=consistent_with_new, 167 | consistency_with_original_message=consistent_with_original, 168 | new_message_reasoning=new_reasoning, 169 | original_message_reasoning=original_reasoning, 170 | is_valid=is_valid 171 | ) 172 | 173 | # Log validation result to file 174 | self._log_validation_result(result) 175 | 176 | return result 177 | 178 | def _log_validation_result(self, result: ValidationResult) -> None: 179 | """Log validation result to file""" 180 | 181 | # Format validation result 182 | status = "✅ VALID" if result.is_valid else "❌ INVALID" 183 | 184 | self.validation_logger.info(f"\nEntry {result.line_number}: {status}") 185 | self.validation_logger.info("-" * 60) 186 | 187 | # Log messages 188 | self.validation_logger.info(f"Original Message: {result.original_entry.get('original_message', 'N/A')}") 189 | self.validation_logger.info(f"Generated Message: {result.original_entry.get('message', 'N/A')}") 190 | self.validation_logger.info(f"Groundtruth Message: {result.groundtruth_message}") 191 | 192 | # Log consistency analysis 193 | new_status = "✅ CONSISTENT" if result.consistency_with_new_message else "❌ INCONSISTENT" 194 | original_status = "✅ CONSISTENT" if result.consistency_with_original_message else "❌ INCONSISTENT" 195 |
196 | self.validation_logger.info(f"\nConsistency Analysis:") 197 | self.validation_logger.info(f"  Generated vs Groundtruth: {new_status}") 198 | self.validation_logger.info(f"  Reasoning: {result.new_message_reasoning}") 199 | self.validation_logger.info(f"  Original vs Groundtruth: {original_status}") 200 | self.validation_logger.info(f"  Reasoning: {result.original_message_reasoning}") 201 | 202 | # Log final decision 203 | self.validation_logger.info(f"\nFinal Decision: {status}") 204 | if result.is_valid: 205 | self.validation_logger.info("  ✓ Generated message is inconsistent with groundtruth (as expected)") 206 | self.validation_logger.info("  ✓ Original message is consistent with groundtruth (as expected)") 207 | else: 208 | if result.consistency_with_new_message: 209 | self.validation_logger.info("  ✗ Generated message is consistent with groundtruth (should be inconsistent)") 210 | if not result.consistency_with_original_message: 211 | self.validation_logger.info("  ✗ Original message is inconsistent with groundtruth (should be consistent)") 212 | 213 | self.validation_logger.info("=" * 80) 214 |
249 | def validate_file(self, input_file: str, output_file: str, max_entries: Optional[int] = None) -> Dict[str, Any]: 250 | """ 251 | Validate entire file and create filtered output 252 | 253 | Args: 254 | input_file: Input JSONL file with generated data 255 | output_file: Output file for valid entries 256 | max_entries: Maximum number of entries to process 257 | 258 | Returns: 259 | Statistics dictionary 260 | """ 261 | 262 | input_path = Path(input_file) 263 | output_path = Path(output_file) 264 | 265 | if not input_path.exists(): 266 | raise FileNotFoundError(f"Input file not found: {input_file}") 267 | 268 | # Ensure output directory exists 269 | output_path.parent.mkdir(parents=True, exist_ok=True) 270 | 271 | # Statistics tracking 272 | stats = { 273 | 'total_processed': 0, 274 | 'valid_entries': 0, 275 | 'invalid_entries': 0, 276 | 'groundtruth_generation_failures': 0, 277 | 'consistent_with_new': 0, 278 | 'inconsistent_with_new': 0, 279 | 'consistent_with_original': 0, 280 | 'inconsistent_with_original': 0, 281 | 'validation_details': [] 282 | } 283 | 284 | valid_entries = [] 285 |
286 | with open(input_path, 'r', encoding='utf-8') as infile: 287 | for line_num, line in enumerate(infile, 1): 288 | if max_entries and stats['total_processed'] >= max_entries: 289 | break 290 | 291 | try: 292 | entry = json.loads(line.strip()) 293 | 294 | # Skip entries that don't have required fields 295 | if not all(key in entry for key in ['message', 'original_message', 'diff']): 296 | self.logger.warning(f"Skipping entry {line_num}: missing required fields") 297 | continue 298 | 299 | # Validate entry 300 | result = self.validate_entry(entry, line_num) 301 | 302 | # Update statistics 303 | stats['total_processed'] += 1 304 | 305 | if not result.groundtruth_message: 306 | stats['groundtruth_generation_failures'] += 1 307 | continue 308 | 309 | if result.consistency_with_new_message: 310 | stats['consistent_with_new'] += 1 311 | else: 312 | stats['inconsistent_with_new'] += 1 313 | 314 | if result.consistency_with_original_message: 315 | stats['consistent_with_original'] += 1 316 | else: 317 | stats['inconsistent_with_original'] += 1 318 | 319 | if result.is_valid: 320 | stats['valid_entries'] += 1 321 | valid_entries.append(entry) 322 | self.logger.info(f"Entry {line_num} is valid") 323 | else: 324 | stats['invalid_entries'] += 1 325 | self.logger.info(f"Entry {line_num} is invalid") 326 | 327 | # Store 
detailed validation info 328 | stats['validation_details'].append({ 329 | 'line_number': line_num, 330 | 'groundtruth': result.groundtruth_message, 331 | 'consistent_with_new': result.consistency_with_new_message, 332 | 'consistent_with_original': result.consistency_with_original_message, 333 | 'new_reasoning': result.new_message_reasoning, 334 | 'original_reasoning': result.original_message_reasoning, 335 | 'is_valid': result.is_valid 336 | }) 337 | 338 | # Log progress 339 | if stats['total_processed'] % 10 == 0: 340 | self.logger.info(f"Processed {stats['total_processed']} entries...") 341 | 342 | except json.JSONDecodeError as e: 343 | self.logger.error(f"Failed to parse JSON on line {line_num}: {e}") 344 | except Exception as e: 345 | self.logger.error(f"Error processing line {line_num}: {e}") 346 | 347 | # Write valid entries to output file 348 | with open(output_path, 'w', encoding='utf-8') as outfile: 349 | for entry in valid_entries: 350 | outfile.write(json.dumps(entry, ensure_ascii=False) + '\n') 351 | 352 | self.logger.info(f"Validation complete. {stats['valid_entries']} valid entries written to {output_file}") 353 | 354 | return stats 355 | 356 | def print_validation_statistics(self, stats: Dict[str, Any]) -> None: 357 | """Print detailed validation statistics""" 358 | 359 | total = stats['total_processed'] 360 | if total == 0: 361 | print("📊 No entries processed") 362 | return 363 | 364 | print("📊 Data Validation Statistics") 365 | print("=" * 60) 366 | print(f"Total entries processed: {total}") 367 | print(f"Valid entries (inconsistent with new, consistent with original): {stats['valid_entries']} ({stats['valid_entries']/total*100:.1f}%)") 368 | print(f"Invalid entries: {stats['invalid_entries']} ({stats['invalid_entries']/total*100:.1f}%)") 369 | 370 | if stats['groundtruth_generation_failures'] > 0: 371 | print(f"Groundtruth generation failures: {stats['groundtruth_generation_failures']}") 372 | 373 | print("\n🔍 Consistency Analysis:") 374 | print(f"Consistent with new message: {stats['consistent_with_new']} ({stats['consistent_with_new']/total*100:.1f}%)") 375 | print(f"Inconsistent with new message: {stats['inconsistent_with_new']} ({stats['inconsistent_with_new']/total*100:.1f}%)") 376 | print(f"Consistent with original message: {stats['consistent_with_original']} ({stats['consistent_with_original']/total*100:.1f}%)") 377 | print(f"Inconsistent with original message: {stats['inconsistent_with_original']} ({stats['inconsistent_with_original']/total*100:.1f}%)") 378 | 379 | print("\n📋 Validation Criteria:") 380 | print("✅ Valid: Inconsistent with new message AND consistent with original message") 381 | print("❌ Invalid: Either consistent with new message OR inconsistent with original message") 382 | 383 | # Calculate quality metrics 384 | if stats['inconsistent_with_new'] > 0: 385 | inconsistency_rate = stats['inconsistent_with_new'] / total * 100 386 | print(f"\n🎯 Data Quality Metrics:") 387 | print(f"Inconsistency generation rate: {inconsistency_rate:.1f}%") 388 | 389 | if stats['valid_entries'] > 0: 390 | validity_rate = stats['valid_entries'] / total * 100 391 | print(f"Overall data validity rate: {validity_rate:.1f}%") 392 | -------------------------------------------------------------------------------- /evaluation/evaluate_main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Evaluation Framework for Commit Message Consistency Systems 4 | 5 | This script provides a unified evaluation 
framework for testing different 6 | commit message consistency checking systems. 7 | 8 | Supported systems: 9 | - commaster: System 1 for commit message consistency checking 10 | - pure_llm / fewshot_llm / cot_llm: LLM-based approaches (vanilla, few-shot, and chain-of-thought prompting) 11 | """ 12 | 13 | import argparse 14 | import json 15 | import os 16 | import subprocess 17 | import sys 18 | import tempfile 19 | import time 20 | from pathlib import Path 21 | from typing import Dict, Any, List, Optional 22 | import logging 23 | from concurrent.futures import ProcessPoolExecutor, as_completed 24 | from functools import partial 25 | import multiprocessing 26 | 27 | # Import the baseline consistency checkers 28 | import pure_llm.consistency_checker as pure_llm_checker 29 | import fewshot.consistency_checker as fewshot_checker 30 | import CoT.consistency_checker as cot_checker 31 | 32 | # Import commaster system for direct function calls 33 | import sys 34 | import os 35 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 36 | from commit_analyzer import evaluation as commaster_evaluation 37 | 38 | # Entry point of the commaster system (kept for reference) 39 | COMMASTER_MAIN = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "commit_analyzer.py") 40 | 41 | # Configure logging 42 | logging.basicConfig( 43 | level=logging.INFO, 44 | format='%(asctime)s - %(levelname)s - %(message)s' 45 | ) 46 | logger = logging.getLogger(__name__) 47 |
48 | def parse_jsonl_file(file_path: str) -> List[Dict[str, Any]]: 49 | """ 50 | Parse JSONL file and return list of records. 51 | 52 | Args: 53 | file_path (str): Path to the JSONL file 54 | 55 | Returns: 56 | List[Dict[str, Any]]: List of parsed JSON records 57 | """ 58 | records = [] 59 | try: 60 | with open(file_path, 'r', encoding='utf-8') as f: 61 | for line_num, line in enumerate(f, 1): 62 | line = line.strip() 63 | if not line: 64 | continue 65 | try: 66 | record = json.loads(line) 67 | records.append(record) 68 | except json.JSONDecodeError as e: 69 | logger.warning(f"Failed to parse JSON on line {line_num}: {e}") 70 | continue 71 | except FileNotFoundError: 72 | logger.error(f"File not found: {file_path}") 73 | sys.exit(1) 74 | except Exception as e: 75 | logger.error(f"Error reading file {file_path}: {e}") 76 | sys.exit(1) 77 | 78 | logger.info(f"Successfully loaded {len(records)} records from {file_path}") 79 | return records 80 |
81 | def extract_data_fields(record: Dict[str, Any], repo_collect_dir: Optional[str] = None) -> Dict[str, Any]: 82 | """ 83 | Extract required fields from a data record. 84 | 85 | Args: 86 | record (Dict[str, Any]): Single data record from JSONL file 87 | repo_collect_dir (str, optional): Base directory joined with the record's repo path 88 | Returns: 89 | Dict[str, Any]: Extracted fields (repo, commit_sha, message, files, diff, gt_consistent, loc) 90 | """ 91 | 92 | extracted = { 93 | 'repo': os.path.join(repo_collect_dir, record.get('repo', '')), 94 | 'commit_sha': record.get('commit_sha', ''), 95 | 'message': record.get('message', ''), 96 | 'files': record.get('files', []), 97 | 'diff': record.get('diff', ''), 98 | 'gt_consistent': 'original_message' not in record, 99 | 'loc': record.get('loc', 0) 100 | } 101 | 102 | return extracted 103 | 104 | def test_commaster_system(data: Dict[str, Any], api_key: str, api_base: str, model: str) -> Dict[str, Any]: 105 | """ 106 | Test the commaster system (System 1) by directly calling the evaluation function. 
107 | 108 | Args: 109 | data (Dict[str, Any]): Extracted data fields 110 | api_key (str): OpenAI API key 111 | api_base (str): OpenAI API base URL 112 | model (str): Model name to use 113 | 114 | Returns: 115 | Dict[str, Any]: Test results 116 | """ 117 | repo_path = os.path.join(os.path.dirname(data['repo']), "apache_" + os.path.basename(data['repo'])) 118 | logger.info(f"Testing commaster system with repo: {repo_path}, commit_sha: {data['commit_sha']}") 119 | 120 | try: 121 | # Create a temporary work directory for this evaluation 122 | import tempfile 123 | with tempfile.TemporaryDirectory() as work_dir: 124 | 125 | logger.info(f"Running commaster evaluation function with repo: {repo_path}, commit: {data['commit_sha']}") 126 | 127 | # Run the evaluation function and capture output 128 | start_time = time.time() 129 | try: 130 | exit_code, result = commaster_evaluation( 131 | message=data['message'], 132 | repo_path=repo_path, 133 | work_dir=work_dir, 134 | commit=data['commit_sha'], 135 | openai_api_key=api_key, 136 | openai_api_base=api_base, 137 | model=model 138 | ) 139 | end_time = time.time() 140 | 141 | except Exception as e: 142 | end_time = time.time() 143 | logger.error(f"Error during commaster evaluation: {e}") 144 | exit_code = 1 145 | json_output = json.dumps({ 146 | 'consistent': False, 147 | 'prompt_tokens': 0, 148 | 'completion_tokens': 0, 149 | 'total_tokens': 0 150 | }) 151 | 152 | try: 153 | import glob 154 | import shutil 155 | commaster_result_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "commaster_result") 156 | os.makedirs(commaster_result_dir, exist_ok=True) 157 | for f in glob.glob(os.path.join(work_dir, "*.analysis")) + glob.glob(os.path.join(work_dir, "*.log")): 158 | shutil.move(f, commaster_result_dir) 159 | except Exception as e: 160 | logger.warning(f"Failed to move .analysis/.log files: {e}") 161 | 162 | if exit_code == 0: 163 | try: 164 | # Parse JSON output 165 | commaster_result = result 166 | 167 | # Format result to match expected output 168 | formatted_result = { 169 | 'system': 'commaster', 170 | 'repo': data['repo'], 171 | 'commit_sha': data['commit_sha'], 172 | 'consistent': commaster_result["consistency_analysis"]["consistent"], 173 | 'confidence': 1.0, 174 | 'reasoning': 'IGNORE', 175 | 'gt_consistent': data.get('gt_consistent', True), 176 | 'loc': data.get('loc', 0), 177 | 'processing_time_ms': round((end_time - start_time) * 1000, 2), 178 | 'error': None, 179 | "prompt_tokens": commaster_result["token_usage"]["grand_total"]["prompt_tokens"], 180 | "completion_tokens": commaster_result["token_usage"]["grand_total"].get("completion_tokens", 0), 181 | "total_tokens": commaster_result["token_usage"]["grand_total"].get("total_tokens", 0) 182 | } 183 | 184 | # Add any additional fields from commaster result 185 | for key, value in commaster_result.items(): 186 | if key not in formatted_result: 187 | formatted_result[key] = value 188 | 189 | logger.debug(f"Commaster result: consistent={formatted_result['consistent']}, confidence={formatted_result['confidence']}") 190 | return formatted_result 191 | 192 | except json.JSONDecodeError as e: 193 | logger.error(f"Failed to parse commaster JSON output: {e}") 194 | logger.error(f"Raw output: {json_output}") 195 | return { 196 | 'system': 'commaster', 197 | 'repo': data['repo'], 198 | 'commit_sha': data['commit_sha'], 199 | 'consistent': False, 200 | 'confidence': 0.0, 201 | 'reasoning': f"Failed to parse JSON output: {str(e)}", 202 | 'gt_consistent': data.get('gt_consistent', True), 203 | 
'loc': data.get('loc', 0), 204 | 'processing_time_ms': round((end_time - start_time) * 1000, 2), 205 | 'error': f"JSON parse error: {str(e)}" 206 | } 207 | else: 208 | logger.error(f"Commaster evaluation failed with exit code {exit_code}") 209 | return { 210 | 'system': 'commaster', 211 | 'repo': data['repo'], 212 | 'commit_sha': data['commit_sha'], 213 | 'consistent': False, 214 | 'confidence': 0.0, 215 | 'reasoning': f"Evaluation failed with exit code {exit_code}", 216 | 'gt_consistent': data.get('gt_consistent', True), 217 | 'loc': data.get('loc', 0), 218 | 'processing_time_ms': round((end_time - start_time) * 1000, 2), 219 | 'error': f"Function error: exit code {exit_code}" 220 | } 221 | 222 | 223 | 224 | except Exception as e: 225 | logger.error(f"Error running commaster system: {e}") 226 | return { 227 | 'system': 'commaster', 228 | 'repo': data['repo'], 229 | 'commit_sha': data['commit_sha'], 230 | 'consistent': False, 231 | 'confidence': 0.0, 232 | 'reasoning': f"System error: {str(e)}", 233 | 'gt_consistent': data.get('gt_consistent', True), 234 | 'loc': data.get('loc', 0), 235 | 'processing_time_ms': 0, 236 | 'error': str(e) 237 | } 238 | 239 | def get_git_diff(repo_path: str, commit_sha: str, ctx = 0) -> str: 240 | """ 241 | Get git diff for a specific commit using git command. 242 | 243 | Args: 244 | repo_path (str): Path to the git repository 245 | commit_sha (str): Commit hash to get diff for 246 | ctx (int): Number of context lines around changes (0 or 20) 247 | 248 | Returns: 249 | str: Git diff content 250 | """ 251 | try: 252 | # Define file extensions to include 253 | # file_extensions = [ 254 | # "*.c", "*.h", "*.cpp", "*.hpp", "*.cxx", "*.hxx", "*.cc", "*.hh", 255 | # "*.c++", "*.h++", "*.cxx++", "*.hxx++", "*.C", "*.java", "*.py", "*.pyx", "*.pyi" 256 | # ] 257 | 258 | # Build git diff command 259 | cmd = [ 260 | 'git', '-C', repo_path, 'diff', f'--unified={ctx}', 261 | f'{commit_sha}^', commit_sha] 262 | 263 | # Execute git command 264 | result = subprocess.run( 265 | cmd, 266 | capture_output=True, 267 | text=True, 268 | check=True 269 | ) 270 | 271 | return result.stdout 272 | 273 | except subprocess.CalledProcessError as e: 274 | logger.error(f"Git diff command failed for repo {repo_path}, commit {commit_sha}: {e}") 275 | if e.stderr: 276 | logger.error(f"Git error output: {e.stderr}") 277 | return "" 278 | except Exception as e: 279 | logger.error(f"Error getting git diff: {e}") 280 | return "" 281 | 282 | def test_pure_llm_system(data: Dict[str, Any], api_key: str, api_base: str, model: str, repo_collect_dir: str, ctx: int = 0) -> Dict[str, Any]: 283 | """ 284 | Test the pure_llm system (System 2). 
285 | · 286 | Args: 287 | data (Dict[str, Any]): Extracted data fields 288 | api_key (str): OpenAI API key 289 | api_base (str): OpenAI API base URL 290 | model (str): Model name to use 291 | repo_collect_dir (str): Directory containing cloned repositories 292 | ctx (int): Number of context lines for git diff (0 or 20) 293 | 294 | Returns: 295 | Dict[str, Any]: Test results 296 | """ 297 | logger.debug(f"Testing pure_llm system with repo: {data['repo']}, hash: {data['commit_sha']}") 298 | 299 | try: 300 | # Initialize the consistency checker with provided API configuration 301 | checker = pure_llm_checker.ConsistencyChecker( 302 | openai_api_key=api_key, 303 | openai_api_base=api_base, 304 | model=model 305 | ) 306 | 307 | # Get git diff dynamically using git command 308 | # Build repo path based on repo_collections structure 309 | repo_base_name = os.path.basename(data['repo']) 310 | repo_path = os.path.join(repo_collect_dir, "apache_" + repo_base_name) 311 | 312 | logger.debug(f"Getting git diff for repo: {repo_path}, commit: {data['commit_sha']}") 313 | git_diff = get_git_diff(repo_path, data['commit_sha'], ctx) 314 | 315 | if not git_diff: 316 | logger.warning(f"No git diff found for repo {repo_path}, commit {data['commit_sha']}") 317 | logger.warning(f"Falling back to original diff data") 318 | # Fall back to original diff if git command fails 319 | git_diff = data.get('diff', '') 320 | if not git_diff: 321 | logger.error(f"No diff data available for commit {data['commit_sha']}") 322 | else: 323 | logger.debug(f"Successfully obtained git diff ({len(git_diff)} characters)") 324 | 325 | # Run consistency check with dynamically obtained diff 326 | start_time = time.time() 327 | llm_result = checker.check_consistency(data['message'], git_diff) 328 | end_time = time.time() 329 | 330 | # Format result to match expected output 331 | result = { 332 | 'system': 'pure_llm', 333 | 'repo': data['repo'], 334 | 'commit_sha': data['commit_sha'], 335 | 'consistent': llm_result.get('consistent', False), 336 | 'confidence': llm_result.get('confidence', 0.0), 337 | 'reasoning': llm_result.get('reasoning', ''), 338 | 'issues': llm_result.get('issues', []), 339 | 'gt_consistent': data.get('gt_consistent', True), 340 | 'loc': data.get('loc', 0), 341 | 'processing_time_ms': round((end_time - start_time) * 1000, 2), 342 | 'error': None, 343 | "prompt_tokens": llm_result.get("prompt_tokens", 0), 344 | "completion_tokens": llm_result.get("completion_tokens", 0), 345 | "total_tokens": llm_result.get("total_tokens", 0) 346 | } 347 | 348 | logger.debug(f"Pure LLM result: consistent={result['consistent']}, confidence={result['confidence']}") 349 | return result 350 | 351 | except Exception as e: 352 | logger.error(f"Error in pure_llm system: {e}") 353 | return { 354 | 'system': 'pure_llm', 355 | 'repo': data['repo'], 356 | 'commit_sha': data['commit_sha'], 357 | 'consistent': False, 358 | 'confidence': 0.0, 359 | 'reasoning': f"Error occurred: {str(e)}", 360 | 'issues': ["System error"], 361 | 'gt_consistent': data.get('gt_consistent', True), 362 | 'loc': data.get('loc', 0), 363 | 'processing_time_ms': 0, 364 | 'error': str(e), 365 | "prompt_tokens": 0, 366 | "completion_tokens": 0, 367 | "total_tokens": 0 368 | } 369 | 370 | def test_fewshot_llm_system(data: Dict[str, Any], api_key: str, api_base: str, model: str, repo_collect_dir: str) -> Dict[str, Any]: 371 | """ 372 | Test the fewshot_llm system (System 3) using few-shot learning with examples. 
373 | · 374 | Args: 375 | data (Dict[str, Any]): Extracted data fields 376 | api_key (str): OpenAI API key 377 | api_base (str): OpenAI API base URL 378 | model (str): Model name to use 379 | repo_collect_dir (str): Directory containing cloned repositories 380 | 381 | Returns: 382 | Dict[str, Any]: Test results 383 | """ 384 | logger.debug(f"Testing fewshot_llm system with repo: {data['repo']}, hash: {data['commit_sha']}") 385 | 386 | try: 387 | # Initialize the consistency checker with provided API configuration 388 | checker = fewshot_checker.ConsistencyChecker( 389 | openai_api_key=api_key, 390 | openai_api_base=api_base, 391 | model=model 392 | ) 393 | 394 | # Get git diff dynamically using git command with ctx=0 (no context) 395 | # Build repo path based on repo_collections structure 396 | repo_base_name = os.path.basename(data['repo']) 397 | repo_path = os.path.join(repo_collect_dir, "apache_" + repo_base_name) 398 | 399 | logger.debug(f"Getting git diff for repo: {repo_path}, commit: {data['commit_sha']}") 400 | git_diff = get_git_diff(repo_path, data['commit_sha'], ctx=0) # Always use ctx=0 for fewshot 401 | 402 | if not git_diff: 403 | logger.warning(f"No git diff found for repo {repo_path}, commit {data['commit_sha']}") 404 | logger.warning(f"Falling back to original diff data") 405 | # Fall back to original diff if git command fails 406 | git_diff = data.get('diff', '') 407 | if not git_diff: 408 | logger.error(f"No diff data available for commit {data['commit_sha']}") 409 | else: 410 | logger.debug(f"Successfully obtained git diff ({len(git_diff)} characters)") 411 | 412 | # Run consistency check with dynamically obtained diff 413 | start_time = time.time() 414 | llm_result = checker.check_consistency(data['message'], git_diff) 415 | end_time = time.time() 416 | 417 | # Format result to match expected output 418 | result = { 419 | 'system': 'fewshot_llm', 420 | 'repo': data['repo'], 421 | 'commit_sha': data['commit_sha'], 422 | 'consistent': llm_result.get('consistent', False), 423 | 'confidence': llm_result.get('confidence', 0.0), 424 | 'reasoning': llm_result.get('reasoning', ''), 425 | 'issues': llm_result.get('issues', []), 426 | 'gt_consistent': data.get('gt_consistent', True), 427 | 'loc': data.get('loc', 0), 428 | 'processing_time_ms': round((end_time - start_time) * 1000, 2), 429 | 'error': None, 430 | "prompt_tokens": llm_result.get("prompt_tokens", 0), 431 | "completion_tokens": llm_result.get("completion_tokens", 0), 432 | "total_tokens": llm_result.get("total_tokens", 0) 433 | } 434 | 435 | logger.debug(f"Fewshot LLM result: consistent={result['consistent']}, confidence={result['confidence']}") 436 | return result 437 | 438 | except Exception as e: 439 | logger.error(f"Error in fewshot_llm system: {e}") 440 | return { 441 | 'system': 'fewshot_llm', 442 | 'repo': data['repo'], 443 | 'commit_sha': data['commit_sha'], 444 | 'consistent': False, 445 | 'confidence': 0.0, 446 | 'reasoning': f"Error occurred: {str(e)}", 447 | 'issues': ["System error"], 448 | 'gt_consistent': data.get('gt_consistent', True), 449 | 'loc': data.get('loc', 0), 450 | 'processing_time_ms': 0, 451 | 'error': str(e), 452 | "prompt_tokens": 0, 453 | "completion_tokens": 0, 454 | "total_tokens": 0 455 | } 456 | 457 | 458 | def test_cot_llm_system(data: Dict[str, Any], api_key: str, api_base: str, model: str, repo_collect_dir: str) -> Dict[str, Any]: 459 | """ 460 | Test the cot_llm system (System 4) using Chain of Thought reasoning. 
461 | · 462 | Args: 463 | data (Dict[str, Any]): Extracted data fields 464 | api_key (str): OpenAI API key 465 | api_base (str): OpenAI API base URL 466 | model (str): Model name to use 467 | repo_collect_dir (str): Directory containing cloned repositories 468 | 469 | Returns: 470 | Dict[str, Any]: Test results 471 | """ 472 | logger.debug(f"Testing cot_llm system with repo: {data['repo']}, hash: {data['commit_sha']}") 473 | 474 | try: 475 | # Initialize the consistency checker with provided API configuration 476 | checker = cot_checker.ConsistencyChecker( 477 | openai_api_key=api_key, 478 | openai_api_base=api_base, 479 | model=model 480 | ) 481 | 482 | # Get git diff dynamically using git command with ctx=0 (no context) 483 | # Build repo path based on repo_collections structure 484 | repo_base_name = os.path.basename(data['repo']) 485 | repo_path = os.path.join(repo_collect_dir, "apache_" + repo_base_name) 486 | 487 | logger.debug(f"Getting git diff for repo: {repo_path}, commit: {data['commit_sha']}") 488 | git_diff = get_git_diff(repo_path, data['commit_sha'], ctx=0) # Always use ctx=0 for CoT 489 | 490 | if not git_diff: 491 | logger.warning(f"No git diff found for repo {repo_path}, commit {data['commit_sha']}") 492 | logger.warning(f"Falling back to original diff data") 493 | # Fall back to original diff if git command fails 494 | git_diff = data.get('diff', '') 495 | if not git_diff: 496 | logger.error(f"No diff data available for commit {data['commit_sha']}") 497 | else: 498 | logger.debug(f"Successfully obtained git diff ({len(git_diff)} characters)") 499 | 500 | # Run consistency check with Chain of Thought reasoning 501 | start_time = time.time() 502 | llm_result = checker.check_consistency(data['message'], git_diff) 503 | end_time = time.time() 504 | 505 | # Format result to match expected output 506 | result = { 507 | 'system': 'cot_llm', 508 | 'repo': data['repo'], 509 | 'commit_sha': data['commit_sha'], 510 | 'consistent': llm_result.get('consistent', False), 511 | 'confidence': llm_result.get('confidence', 0.0), 512 | 'reasoning': llm_result.get('reasoning', ''), 513 | 'issues': llm_result.get('issues', []), 514 | 'gt_consistent': data.get('gt_consistent', True), 515 | 'loc': data.get('loc', 0), 516 | 'processing_time_ms': round((end_time - start_time) * 1000, 2), 517 | 'error': None, 518 | "prompt_tokens": llm_result.get("prompt_tokens", 0), 519 | "completion_tokens": llm_result.get("completion_tokens", 0), 520 | "total_tokens": llm_result.get("total_tokens", 0) 521 | } 522 | 523 | logger.debug(f"CoT LLM result: consistent={result['consistent']}, confidence={result['confidence']}") 524 | return result 525 | 526 | except Exception as e: 527 | logger.error(f"Error in cot_llm system: {e}") 528 | return { 529 | 'system': 'cot_llm', 530 | 'repo': data['repo'], 531 | 'commit_sha': data['commit_sha'], 532 | 'consistent': False, 533 | 'confidence': 0.0, 534 | 'reasoning': f"Error occurred: {str(e)}", 535 | 'issues': ["System error"], 536 | 'gt_consistent': data.get('gt_consistent', True), 537 | 'loc': data.get('loc', 0), 538 | 'processing_time_ms': 0, 539 | 'error': str(e), 540 | "prompt_tokens": 0, 541 | "completion_tokens": 0, 542 | "total_tokens": 0 543 | } 544 | 545 | 546 | def process_single_record(record_data: tuple, system_name: str, api_key: str, api_base: str, model: str, repo_collect_dir: Optional[str] = None, ctx: int = 0) -> Dict[str, Any]: 547 | """ 548 | Process a single record for evaluation. This function is designed to be used with multiprocessing. 
549 | 550 | Args: 551 | record_data (tuple): Tuple containing (index, record) 552 | system_name (str): Name of the system to test ('commaster' or 'pure_llm') 553 | api_key (str): OpenAI API key 554 | api_base (str): OpenAI API base URL 555 | model (str): Model name to use 556 | repo_collect_dir (str, optional): Directory containing cloned repositories 557 | ctx (int): Number of context lines for git diff (0 or 20) 558 | 559 | Returns: 560 | Dict[str, Any]: Evaluation result for this record 561 | """ 562 | i, record = record_data 563 | 564 | try: 565 | # Extract required fields 566 | data = extract_data_fields(record, repo_collect_dir) 567 | 568 | # Validate extracted data 569 | if not data['repo'] or not data['commit_sha']: 570 | return { 571 | 'system': system_name, 572 | 'record_index': i, 573 | 'repo': data.get('repo', 'unknown'), 574 | 'commit_sha': data.get('commit_sha', 'unknown'), 575 | 'consistent': None, 576 | 'confidence': None, 577 | 'gt_consistent': data.get('gt_consistent', True), 578 | 'loc': data.get('loc', 0), 579 | 'error': 'Missing repo or commit_sha', 580 | 'timestamp': time.strftime('%Y-%m-%d %H:%M:%S') 581 | } 582 | 583 | # Select the appropriate test function 584 | if system_name == 'commaster': 585 | test_function = lambda data: test_commaster_system(data, api_key, api_base, model) 586 | elif system_name == 'pure_llm': 587 | test_function = lambda data: test_pure_llm_system(data, api_key, api_base, model, repo_collect_dir, ctx) 588 | elif system_name == 'fewshot_llm': 589 | test_function = lambda data: test_fewshot_llm_system(data, api_key, api_base, model, repo_collect_dir) 590 | else: # cot_llm 591 | test_function = lambda data: test_cot_llm_system(data, api_key, api_base, model, repo_collect_dir) 592 | 593 | # Run test on the selected system 594 | start_time = time.time() 595 | result = test_function(data) 596 | end_time = time.time() 597 | 598 | # Add metadata to result 599 | result.update({ 600 | 'record_index': i, 601 | 'actual_processing_time_ms': round((end_time - start_time) * 1000, 2), 602 | 'timestamp': time.strftime('%Y-%m-%d %H:%M:%S') 603 | }) 604 | 605 | return result 606 | 607 | except Exception as e: 608 | return { 609 | 'system': system_name, 610 | 'record_index': i, 611 | 'repo': data.get('repo', 'unknown') if 'data' in locals() else 'unknown', 612 | 'commit_sha': data.get('commit_sha', 'unknown') if 'data' in locals() else 'unknown', 613 | 'consistent': None, 614 | 'confidence': None, 615 | 'gt_consistent': data.get('gt_consistent', True) if 'data' in locals() else True, 616 | 'loc': data.get('loc', 0) if 'data' in locals() else 0, 617 | 'error': str(e), 618 | 'timestamp': time.strftime('%Y-%m-%d %H:%M:%S') 619 | } 620 | 621 | def run_evaluation(system_name: str, data_file: str, api_key: str, api_base: str, model: str, 622 | output_file: Optional[str] = None, repo_collect_dir: Optional[str] = None, max_workers: int = 1, ctx: int = 0) -> List[Dict[str, Any]]: 623 | """ 624 | Run evaluation on the specified system with optional parallel processing. 
625 | 626 | Args: 627 | system_name (str): Name of the system to test ('commaster', 'pure_llm', 'fewshot_llm', or 'cot_llm') 628 | data_file (str): Path to the JSONL data file 629 | api_key (str): OpenAI API key 630 | api_base (str): OpenAI API base URL 631 | model (str): Model name to use 632 | output_file (str, optional): Path to save results 633 | repo_collect_dir (str, optional): Directory containing cloned repositories 634 | max_workers (int): Maximum number of parallel workers (default: 1 for sequential processing) 635 | ctx (int): Number of context lines for git diff (0 or 20) 636 | 637 | Returns: 638 | List[Dict[str, Any]]: List of evaluation results 639 | """ 640 | # Validate system name 641 | if system_name not in ['commaster', 'pure_llm', 'fewshot_llm', 'cot_llm']: 642 | logger.error(f"Unsupported system: {system_name}. Supported systems: commaster, pure_llm, fewshot_llm, cot_llm") 643 | sys.exit(1) 644 | 645 | logger.info(f"Starting evaluation for system: {system_name}") 646 | logger.info(f"Data file: {data_file}") 647 | logger.info(f"Max workers: {max_workers}") 648 |
649 | # Load data 650 | records = parse_jsonl_file(data_file) 651 | # Optional debugging slice, e.g. records = records[2000:3500] 652 | records = records[::-1] # Process records in reverse order 653 | total_records = len(records) 654 | 655 | completed_indices = set() 656 | if output_file and os.path.exists(output_file): 657 | logger.info(f"Resume mode: loading completed records from {output_file}") 658 | with open(output_file, 'r', encoding='utf-8') as f: 659 | for line in f: 660 | try: 661 | rec = json.loads(line) 662 | if 'record_index' in rec: 663 | completed_indices.add(rec['record_index']) 664 | except Exception: 665 | continue 666 | logger.info(f"Resume mode: {len(completed_indices)} records already completed.") 667 | 668 | record_data = [(i+1, record) for i, record in enumerate(records) 669 | if (i+1) not in completed_indices] 670 | if len(record_data) < total_records: 671 | logger.info(f"Resume mode: {total_records-len(record_data)} records will be skipped (already done). 
{len(record_data)} to process.") 672 | else: 673 | logger.info(f"No resume: all {total_records} records will be processed.") 674 | 675 | results = [] 676 | output_fh = None 677 | if output_file: 678 | output_fh = open(output_file, 'a' if completed_indices else 'w', encoding='utf-8')  # append when resuming so earlier results are kept 679 | 680 | def write_result_line(res): 681 | if output_fh: 682 | output_fh.write(json.dumps(res, ensure_ascii=False) + '\n') 683 | output_fh.flush() 684 | 685 | if max_workers == 1: 686 | # Sequential processing (original behavior) 687 | logger.info("Running in sequential mode") 688 | for i, (index, record) in enumerate(record_data): 689 | logger.info(f"Processing record {index}/{total_records}") 690 | result = process_single_record( 691 | (index, record), system_name, api_key, api_base, model, repo_collect_dir, ctx 692 | ) 693 | results.append(result) 694 | write_result_line(result) 695 | else: 696 | # Parallel processing 697 | logger.info(f"Running in parallel mode with {max_workers} workers") 698 | process_func = partial( 699 | process_single_record, 700 | system_name=system_name, 701 | api_key=api_key, 702 | api_base=api_base, 703 | model=model, 704 | repo_collect_dir=repo_collect_dir, 705 | ctx=ctx 706 | ) 707 | with ProcessPoolExecutor(max_workers=max_workers) as executor: 708 | future_to_index = { 709 | executor.submit(process_func, record_item): record_item[0] 710 | for record_item in record_data 711 | } 712 | completed_count = 0 713 | for future in as_completed(future_to_index): 714 | try: 715 | result = future.result() 716 | results.append(result) 717 | write_result_line(result) 718 | completed_count += 1 719 | if completed_count % 10 == 0 or completed_count == total_records: 720 | logger.info(f"Completed {completed_count}/{total_records} records") 721 | except Exception as e: 722 | index = future_to_index[future] 723 | logger.error(f"Error processing record {index}: {e}") 724 | error_result = { 725 | 'system': system_name, 726 | 'record_index': index, 727 | 'repo': 'unknown', 728 | 'commit_sha': 'unknown', 729 | 'consistent': None, 730 | 'confidence': None, 731 | 'gt_consistent': True, 732 | 'loc': 0, 733 | 'error': str(e), 734 | 'timestamp': time.strftime('%Y-%m-%d %H:%M:%S') 735 | } 736 | results.append(error_result) 737 | write_result_line(error_result) 738 | if output_fh: 739 | output_fh.close() 740 | # Sort results by record index to maintain order 741 | results.sort(key=lambda x: x.get('record_index', 0)) 742 | logger.info(f"Evaluation completed. Processed {len(results)} records.") 743 | return results 744 |
745 | def save_results(results: List[Dict[str, Any]], output_file: str) -> None: 746 | """ 747 | Save evaluation results to a JSON file. 748 | 749 | Args: 750 | results (List[Dict[str, Any]]): Evaluation results 751 | output_file (str): Output file path 752 | """ 753 | try: 754 | with open(output_file, 'w', encoding='utf-8') as f: 755 | json.dump(results, f, indent=2, ensure_ascii=False) 756 | logger.info(f"Results saved to: {output_file}") 757 | except Exception as e: 758 | logger.error(f"Failed to save results to {output_file}: {e}") 759 | 760 | def print_summary(results: List[Dict[str, Any]]) -> None: 761 | """ 762 | Print a summary of evaluation results. 
763 | 764 | Args: 765 | results (List[Dict[str, Any]]): Evaluation results 766 | """ 767 | if not results: 768 | logger.warning("No results to summarize.") 769 | return 770 | 771 | system_name = results[0].get('system', 'unknown') 772 | total = len(results) 773 | 774 | # Count results 775 | consistent_count = sum(1 for r in results if r.get('consistent') is True) 776 | inconsistent_count = sum(1 for r in results if r.get('consistent') is False) 777 | error_count = sum(1 for r in results if r.get('error') is not None) 778 | 779 | # Calculate average confidence 780 | confidences = [r.get('confidence', 0) for r in results if r.get('confidence') is not None] 781 | avg_confidence = sum(confidences) / len(confidences) if confidences else 0 782 | 783 | # Calculate average processing time 784 | times = [r.get('actual_processing_time_ms', 0) for r in results if r.get('actual_processing_time_ms') is not None] 785 | avg_time = sum(times) / len(times) if times else 0 786 | 787 | print("\n" + "=" * 60) 788 | print(f"EVALUATION SUMMARY - {system_name.upper()}") 789 | print("=" * 60) 790 | print(f"Total records processed: {total}") 791 | print(f"Consistent results: {consistent_count} ({consistent_count/total*100:.1f}%)") 792 | print(f"Inconsistent results: {inconsistent_count} ({inconsistent_count/total*100:.1f}%)") 793 | print(f"Errors: {error_count} ({error_count/total*100:.1f}%)") 794 | print(f"Average confidence: {avg_confidence:.3f}") 795 | print(f"Average processing time: {avg_time:.2f} ms") 796 | print("=" * 60) 797 | 798 | def main(): 799 | """Main function to handle command line arguments and orchestrate evaluation.""" 800 | parser = argparse.ArgumentParser( 801 | description="Evaluation framework for commit message consistency systems", 802 | formatter_class=argparse.RawDescriptionHelpFormatter, 803 | epilog=""" 804 | Examples: 805 | # Test commaster system 806 | python evaluate_main.py --system commaster --data /path/to/data.jsonl --repocollect /path/to/repos --api-key YOUR_API_KEY 807 | 808 | # Test pure_llm system with custom API base and model 809 | python evaluate_main.py --system pure_llm --data /path/to/data.jsonl --repocollect /path/to/repos --api-key YOUR_API_KEY --api-base xxx_url --model gpt-4 810 | 811 | # Test pure_llm system with extended context (20 lines) for git diff 812 | python evaluate_main.py --system pure_llm --data /path/to/data.jsonl --repocollect /path/to/repos --api-key YOUR_API_KEY --ctx 20 813 | 814 | # Test fewshot_llm system (uses few-shot examples in prompt, always ctx=0) 815 | python evaluate_main.py --system fewshot_llm --data /path/to/data.jsonl --repocollect /path/to/repos --api-key YOUR_API_KEY 816 | 817 | # Test cot_llm system (uses Chain of Thought reasoning, always ctx=0) 818 | python evaluate_main.py --system cot_llm --data /path/to/data.jsonl --repocollect /path/to/repos --api-key YOUR_API_KEY 819 | 820 | # With output file, verbose logging, and parallel processing 821 | python evaluate_main.py --system pure_llm --data /path/to/data.jsonl --repocollect /path/to/repos --api-key YOUR_API_KEY --output results.json --verbose --workers 4 --ctx 0 822 | 823 | # Use all available CPU cores with no context lines 824 | python evaluate_main.py --system commaster --data /path/to/data.jsonl --repocollect /path/to/repos --api-key YOUR_API_KEY --workers 8 825 | """ 826 | ) 827 | 828 | parser.add_argument( 829 | '--system', '-s', 830 | required=True, 831 | choices=['commaster', 'pure_llm', 'fewshot_llm', 'cot_llm'], 832 | help='System to evaluate (commaster, pure_llm, 
fewshot_llm, or cot_llm)' 833 | ) 834 | 835 | parser.add_argument( 836 | '--data', '-d', 837 | required=True, 838 | help='Path to the JSONL data file' 839 | ) 840 | 841 | parser.add_argument( 842 | '--repocollect', '-r', 843 | required=True, 844 | help='Path to the repository collection directory' 845 | ) 846 | 847 | parser.add_argument( 848 | '--api-key', 849 | required=True, 850 | help='OpenAI API key' 851 | ) 852 | 853 | parser.add_argument( 854 | '--api-base', 855 | default='EMPTY', 856 | help='OpenAI API base URL (default: EMPTY)' 857 | ) 858 | 859 | parser.add_argument( 860 | '--model', 861 | default='gpt-3.5-turbo', 862 | help='Model name to use (default: gpt-3.5-turbo)' 863 | ) 864 | 865 | parser.add_argument( 866 | '--output', '-o', 867 | help='Output file path for results (JSON format)' 868 | ) 869 | 870 | parser.add_argument( 871 | '--verbose', '-v', 872 | action='store_true', 873 | help='Enable verbose logging' 874 | ) 875 | 876 | parser.add_argument( 877 | '--workers', '-w', 878 | type=int, 879 | default=1, 880 | help=f'Number of parallel workers for processing (default: 1, max: {multiprocessing.cpu_count()})' 881 | ) 882 | 883 | parser.add_argument( 884 | '--ctx', 885 | type=int, 886 | choices=[0, 20], 887 | default=0, 888 | help='Number of context lines for git diff (0: no context, 20: extended context). Only affects pure_llm system. fewshot_llm and cot_llm always use ctx=0. (default: 0)' 889 | ) 890 | 891 | args = parser.parse_args() 892 | 893 | # Set logging level based on verbose flag 894 | if args.verbose: 895 | logging.getLogger().setLevel(logging.DEBUG) 896 | 897 | # Validate workers parameter 898 | max_cpu_count = multiprocessing.cpu_count() 899 | if args.workers < 1: 900 | logger.error("Number of workers must be at least 1") 901 | sys.exit(1) 902 | elif args.workers > max_cpu_count: 903 | logger.warning(f"Number of workers ({args.workers}) exceeds CPU count ({max_cpu_count}), using {max_cpu_count}") 904 | args.workers = max_cpu_count 905 | 906 | # Validate data file exists 907 | if not Path(args.data).exists(): 908 | logger.error(f"Data file not found: {args.data}") 909 | sys.exit(1) 910 | 911 | # Run evaluation 912 | try: 913 | results = run_evaluation( 914 | system_name=args.system, 915 | data_file=args.data, 916 | api_key=args.api_key, 917 | api_base=args.api_base, 918 | model=args.model, 919 | output_file=args.output, 920 | repo_collect_dir=args.repocollect, 921 | max_workers=args.workers, 922 | ctx=args.ctx, 923 | ) 924 | print_summary(results) 925 | 926 | except KeyboardInterrupt: 927 | logger.info("Evaluation interrupted by user.") 928 | sys.exit(1) 929 | except Exception as e: 930 | logger.error(f"Evaluation failed: {e}") 931 | sys.exit(1) 932 | 933 | if __name__ == "__main__": 934 | main() 935 | -------------------------------------------------------------------------------- /data_synthesis/data_generator.py: -------------------------------------------------------------------------------- 1 | """ 2 | Inconsistent data generator 3 | """ 4 | 5 | import json 6 | import logging 7 | import random 8 | import sys 9 | import multiprocessing 10 | import queue 11 | import threading 12 | from pathlib import Path 13 | from typing import Dict, Any, List, Optional 14 | from dataclasses import asdict 15 | from concurrent.futures import ProcessPoolExecutor, as_completed 16 | from functools import partial 17 | 18 | from inconsistency_rules import InconsistencyRuleManager, InconsistencyRule 19 | from rule_applicability_checker import EnhancedInconsistencyRuleManager 20 | from 
llm_interface import LLMManager 21 | 22 | 23 | def _worker_process_entry_simple(entry_data: tuple, llm_config: Dict[str, Any], 24 | inconsistency_ratio: float, use_applicability_check: bool, 25 | seed_offset: int) -> Dict[str, Any]: 26 | """ 27 | Simplified worker function for multiprocessing.Pool 28 | 29 | This is a standalone function that avoids complex serialization issues. 30 | 31 | Args: 32 | entry_data: Tuple of (line_number, original_entry) 33 | llm_config: Simplified LLM configuration dictionary 34 | inconsistency_ratio: Ratio for inconsistency generation 35 | use_applicability_check: Whether to use applicability checking 36 | seed_offset: Offset for random seed 37 | 38 | Returns: 39 | Processed entry with metadata 40 | """ 41 | line_num, original_entry = entry_data 42 | 43 | # Set random seed for this process (deterministic but different per entry) 44 | random.seed(hash(str(original_entry)) + seed_offset + line_num) 45 | 46 | try: 47 | # Create minimal components for this worker 48 | llm_manager = LLMManager(llm_config) 49 | 50 | if use_applicability_check: 51 | rule_manager = EnhancedInconsistencyRuleManager(llm_manager) 52 | else: 53 | rule_manager = InconsistencyRuleManager() 54 | 55 | # Decide whether to generate inconsistent data 56 | is_inconsistent = random.random() < inconsistency_ratio 57 | 58 | result = { 59 | 'line_number': line_num, 60 | 'is_inconsistent': is_inconsistent, 61 | 'error': None 62 | } 63 | 64 | if is_inconsistent: 65 | # Generate inconsistent data using simplified logic 66 | if use_applicability_check and hasattr(rule_manager, 'get_best_rule_for_commit'): 67 | try: 68 | rule, applicability_info = rule_manager.get_best_rule_for_commit(original_entry) 69 | rule_selection_info = { 70 | 'selection_method': rule.rule_type.value, 71 | 'applicability_check': True, 72 | 'reasoning': applicability_info.get('reasoning', 'N/A'), 73 | 'was_applicable': applicability_info.get('applicable', True) 74 | } 75 | except Exception: 76 | rule = rule_manager.get_random_rule() 77 | rule_selection_info = { 78 | 'selection_method': 'fallback_random', 79 | 'applicability_check': False 80 | } 81 | else: 82 | rule = rule_manager.get_random_rule() 83 | rule_selection_info = { 84 | 'selection_method': 'random', 85 | 'applicability_check': False 86 | } 87 | 88 | # Format prompt and generate inconsistent message 89 | prompt = rule_manager.format_prompt(rule, original_entry) 90 | inconsistent_message = llm_manager.generate_inconsistent_message(prompt) 91 | 92 | if inconsistent_message is None: 93 | inconsistent_message = f"[Generated by {rule.rule_type.value}] {original_entry.get('message', '')}" 94 | 95 | # Create inconsistent entry 96 | inconsistent_entry = original_entry.copy() 97 | inconsistent_entry['message'] = inconsistent_message 98 | inconsistent_entry['consistency'] = False 99 | inconsistent_entry['inconsistency_rule'] = rule.rule_type.value 100 | inconsistent_entry['rule_weight'] = rule.weight 101 | inconsistent_entry['original_message'] = original_entry.get('message', '') 102 | inconsistent_entry['rule_selection_info'] = rule_selection_info 103 | 104 | result['entry'] = inconsistent_entry 105 | result['type'] = 'inconsistent' 106 | else: 107 | # Keep original data (add consistency field) 108 | consistent_entry = original_entry.copy() 109 | consistent_entry['consistency'] = True 110 | result['entry'] = consistent_entry 111 | result['type'] = 'consistent' 112 | 113 | return result 114 | 115 | except Exception as e: 116 | return { 117 | 'line_number': line_num, 118 | 
'is_inconsistent': False, 119 | 'error': str(e), 120 | 'entry': original_entry, 121 | 'type': 'error' 122 | } 123 | 124 | 125 | def _worker_process_entry(entry_data: tuple, llm_config: Dict[str, Any], 126 | inconsistency_ratio: float, use_applicability_check: bool, 127 | seed_offset: int) -> Dict[str, Any]: 128 | """ 129 | Worker function to process a single entry in multiprocessing. 130 | 131 | This function is defined at module level to ensure proper serialization. 132 | 133 | Args: 134 | entry_data: Tuple of (line_number, original_entry) 135 | llm_config: Simplified LLM configuration dictionary 136 | inconsistency_ratio: Ratio for inconsistency generation 137 | use_applicability_check: Whether to use applicability checking 138 | seed_offset: Offset for random seed 139 | 140 | Returns: 141 | Processed entry with metadata 142 | """ 143 | line_num, original_entry = entry_data 144 | 145 | # Set random seed for this process (deterministic but different per entry) 146 | random.seed(hash(str(original_entry)) + seed_offset + line_num) 147 | 148 | try: 149 | # Create minimal components for this worker 150 | # print(llm_config) 151 | # exit(0) 152 | llm_manager = LLMManager(llm_config) 153 | 154 | if use_applicability_check: 155 | rule_manager = EnhancedInconsistencyRuleManager(llm_manager) 156 | else: 157 | rule_manager = InconsistencyRuleManager() 158 | 159 | # Decide whether to generate inconsistent data 160 | is_inconsistent = random.random() < inconsistency_ratio 161 | 162 | result = { 163 | 'line_number': line_num, 164 | 'is_inconsistent': is_inconsistent, 165 | 'error': None 166 | } 167 | 168 | if is_inconsistent: 169 | # Generate inconsistent data using simplified logic 170 | if use_applicability_check and hasattr(rule_manager, 'get_best_rule_for_commit'): 171 | try: 172 | rule, applicability_info = rule_manager.get_best_rule_for_commit(original_entry) 173 | rule_selection_info = { 174 | 'selection_method': rule.rule_type.value, 175 | 'applicability_check': True, 176 | 'reasoning': applicability_info.get('reasoning', 'N/A'), 177 | 'was_applicable': applicability_info.get('applicable', True) 178 | } 179 | except Exception: 180 | rule = rule_manager.get_random_rule() 181 | rule_selection_info = { 182 | 'selection_method': 'fallback_random', 183 | 'applicability_check': False 184 | } 185 | else: 186 | rule = rule_manager.get_random_rule() 187 | rule_selection_info = { 188 | 'selection_method': 'random', 189 | 'applicability_check': False 190 | } 191 | 192 | # Format prompt and generate inconsistent message 193 | prompt = rule_manager.format_prompt(rule, original_entry) 194 | inconsistent_message = llm_manager.generate_inconsistent_message(prompt) 195 | 196 | if inconsistent_message is None: 197 | inconsistent_message = f"[Generated by {rule.rule_type.value}] {original_entry.get('message', '')}" 198 | 199 | # Create inconsistent entry 200 | inconsistent_entry = original_entry.copy() 201 | inconsistent_entry['message'] = inconsistent_message 202 | inconsistent_entry['consistency'] = False 203 | inconsistent_entry['inconsistency_rule'] = rule.rule_type.value 204 | inconsistent_entry['rule_weight'] = rule.weight 205 | inconsistent_entry['original_message'] = original_entry.get('message', '') 206 | inconsistent_entry['rule_selection_info'] = rule_selection_info 207 | 208 | result['entry'] = inconsistent_entry 209 | result['type'] = 'inconsistent' 210 | else: 211 | # Keep original data (add consistency field) 212 | consistent_entry = original_entry.copy() 213 | consistent_entry['consistency'] = 
True 214 | result['entry'] = consistent_entry 215 | result['type'] = 'consistent' 216 | 217 | return result 218 | 219 | except Exception as e: 220 | return { 221 | 'line_number': line_num, 222 | 'is_inconsistent': False, 223 | 'error': str(e), 224 | 'entry': original_entry, 225 | 'type': 'error' 226 | } 227 | 228 | 229 | class InconsistentDataGenerator: 230 | """Inconsistent data generator with intelligent rule selection""" 231 | 232 | def __init__(self, llm_config: Dict[str, Any], seed: Optional[int] = None, 233 | use_applicability_check: bool = True): 234 | """ 235 | Initialize generator 236 | 237 | Args: 238 | llm_config: LLM configuration dictionary 239 | seed: Random seed for reproducible results 240 | use_applicability_check: Whether to use LLM-based applicability checking 241 | """ 242 | self.logger = logging.getLogger(__name__) 243 | self.llm_manager = LLMManager(llm_config) 244 | self.use_applicability_check = use_applicability_check 245 | 246 | if use_applicability_check: 247 | self.rule_manager = EnhancedInconsistencyRuleManager(self.llm_manager) 248 | self.logger.info("Using enhanced rule manager with applicability checking") 249 | else: 250 | self.rule_manager = InconsistencyRuleManager() 251 | self.logger.info("Using basic rule manager without applicability checking") 252 | 253 | if seed is not None: 254 | random.seed(seed) 255 | self.logger.info(f"Set random seed to {seed}") 256 | 257 | def generate_inconsistent_entry(self, original_entry: Dict[str, Any], 258 | rule: Optional[InconsistencyRule] = None) -> Dict[str, Any]: 259 | """ 260 | Generate inconsistent data entry based on original entry with intelligent rule selection 261 | 262 | Args: 263 | original_entry: Original data entry 264 | rule: Specified inconsistency rule, if None then intelligently select 265 | 266 | Returns: 267 | Inconsistent data entry 268 | """ 269 | rule_selection_info = {} 270 | 271 | # Use intelligent rule selection if applicability checking is enabled 272 | if rule is None and self.use_applicability_check and hasattr(self.rule_manager, 'get_best_rule_for_commit'): 273 | try: 274 | self.logger.info("Performing intelligent rule selection...") 275 | rule, applicability_info = self.rule_manager.get_best_rule_for_commit(original_entry) 276 | 277 | rule_selection_info = { 278 | 'selection_method': rule.rule_type.value, 279 | 'applicability_check': True, 280 | 'reasoning': applicability_info.get('reasoning', 'N/A'), 281 | 'was_applicable': applicability_info.get('applicable', True) 282 | } 283 | 284 | self.logger.info(f"Intelligently selected rule: {rule.name} (weight: {rule.weight})") 285 | self.logger.debug(f"Selection reasoning: {applicability_info.get('reasoning', 'N/A')}") 286 | 287 | except Exception as e: 288 | self.logger.warning(f"Intelligent rule selection failed: {e}, falling back to random selection") 289 | rule = self.rule_manager.get_random_rule() 290 | rule_selection_info = { 291 | 'selection_method': 'fallback_random', 292 | 'applicability_check': False, 293 | 'error': str(e) 294 | } 295 | 296 | elif rule is None: 297 | # Use random selection 298 | rule = self.rule_manager.get_random_rule() 299 | rule_selection_info = { 300 | 'selection_method': 'random', 301 | 'applicability_check': False 302 | } 303 | else: 304 | # Use provided rule 305 | rule_selection_info = { 306 | 'selection_method': 'specified', 307 | 'applicability_check': False 308 | } 309 | 310 | self.logger.info(f"Applying rule: {rule.name} ({rule.rule_type.value})") 311 | 312 | # Format prompt 313 | prompt = 
self.rule_manager.format_prompt(rule, original_entry) 314 | 315 | # Call LLM to generate inconsistent commit message 316 | inconsistent_message = self.llm_manager.generate_inconsistent_message(prompt) 317 | 318 | if inconsistent_message is None: 319 | self.logger.error("Failed to generate inconsistent message") 320 | inconsistent_message = f"[Generated by {rule.rule_type.value}] {original_entry.get('message', '')}" 321 | 322 | # Create new inconsistent entry 323 | inconsistent_entry = original_entry.copy() 324 | inconsistent_entry['message'] = inconsistent_message 325 | inconsistent_entry['consistency'] = False 326 | inconsistent_entry['inconsistency_rule'] = rule.rule_type.value 327 | inconsistent_entry['rule_weight'] = rule.weight 328 | inconsistent_entry['original_message'] = original_entry.get('message', '') 329 | inconsistent_entry['rule_selection_info'] = rule_selection_info 330 | 331 | return inconsistent_entry 332 | 333 | def process_jsonl_file(self, input_file: str, output_file: str, 334 | num_samples: Optional[int] = None, 335 | inconsistency_ratio: float = 1.0, 336 | max_workers: int = 1) -> None: 337 | """ 338 | Process JSONL file to generate inconsistent data 339 | 340 | Args: 341 | input_file: Input JSONL file path 342 | output_file: Output JSONL file path 343 | num_samples: Number of samples to process, None means process all 344 | inconsistency_ratio: Ratio of inconsistent data (0.0-1.0) 345 | max_workers: Number of worker processes for parallel processing 346 | """ 347 | if max_workers == 1: 348 | # Use sequential processing (original behavior) 349 | self._process_jsonl_file_sequential(input_file, output_file, num_samples, inconsistency_ratio) 350 | else: 351 | # Use multiprocessing 352 | self._process_jsonl_file_parallel(input_file, output_file, num_samples, inconsistency_ratio, max_workers) 353 | 354 | def _process_jsonl_file_sequential(self, input_file: str, output_file: str, 355 | num_samples: Optional[int] = None, 356 | inconsistency_ratio: float = 1.0) -> None: 357 | """ 358 | Original sequential processing method 359 | 360 | Args: 361 | input_file: Input JSONL file path 362 | output_file: Output JSONL file path 363 | num_samples: Number of samples to process, None means process all 364 | inconsistency_ratio: Ratio of inconsistent data (0.0-1.0) 365 | """ 366 | input_path = Path(input_file) 367 | output_path = Path(output_file) 368 | 369 | if not input_path.exists(): 370 | raise FileNotFoundError(f"Input file not found: {input_file}") 371 | 372 | # Ensure output directory exists 373 | output_path.parent.mkdir(parents=True, exist_ok=True) 374 | 375 | processed_count = 0 376 | inconsistent_count = 0 377 | 378 | with open(input_path, 'r', encoding='utf-8') as infile, \ 379 | open(output_path, 'w', encoding='utf-8') as outfile: 380 | 381 | for line_num, line in enumerate(infile, 1): 382 | if num_samples is not None and processed_count >= num_samples: 383 | break 384 | 385 | try: 386 | original_entry = json.loads(line.strip()) 387 | 388 | # Decide whether to generate inconsistent data 389 | if random.random() < inconsistency_ratio: 390 | # Generate inconsistent data 391 | inconsistent_entry = self.generate_inconsistent_entry(original_entry) 392 | outfile.write(json.dumps(inconsistent_entry, ensure_ascii=False) + '\n') 393 | inconsistent_count += 1 394 | self.logger.info(f"Generated inconsistent entry {inconsistent_count} from line {line_num}") 395 | else: 396 | # Keep original data (add consistency field) 397 | consistent_entry = original_entry.copy() 398 | 
consistent_entry['consistency'] = True 399 | outfile.write(json.dumps(consistent_entry, ensure_ascii=False) + '\n') 400 | self.logger.info(f"Kept consistent entry from line {line_num}") 401 | 402 | processed_count += 1 403 | 404 | if processed_count % 10 == 0: 405 | self.logger.info(f"Processed {processed_count} entries...") 406 | 407 | except json.JSONDecodeError as e: 408 | self.logger.error(f"Failed to parse JSON on line {line_num}: {e}") 409 | except Exception as e: 410 | self.logger.error(f"Error processing line {line_num}: {e}") 411 | 412 | self.logger.info(f"Processing complete. Total: {processed_count}, Inconsistent: {inconsistent_count}") 413 | 414 | def _process_jsonl_file_parallel(self, input_file: str, output_file: str, 415 | num_samples: Optional[int] = None, 416 | inconsistency_ratio: float = 1.0, 417 | max_workers: int = 4) -> None: 418 | """ 419 | Parallel processing method using multiprocessing 420 | 421 | Args: 422 | input_file: Input JSONL file path 423 | output_file: Output JSONL file path 424 | num_samples: Number of samples to process, None means process all 425 | inconsistency_ratio: Ratio of inconsistent data (0.0-1.0) 426 | max_workers: Number of worker processes 427 | """ 428 | input_path = Path(input_file) 429 | output_path = Path(output_file) 430 | 431 | if not input_path.exists(): 432 | raise FileNotFoundError(f"Input file not found: {input_file}") 433 | 434 | # Ensure output directory exists 435 | output_path.parent.mkdir(parents=True, exist_ok=True) 436 | 437 | # Load all entries to process 438 | entries_to_process = [] 439 | with open(input_path, 'r', encoding='utf-8') as infile: 440 | for line_num, line in enumerate(infile, 1): 441 | if num_samples is not None and line_num > num_samples: 442 | break 443 | try: 444 | original_entry = json.loads(line.strip()) 445 | entries_to_process.append((line_num, original_entry)) 446 | except json.JSONDecodeError as e: 447 | self.logger.error(f"Failed to parse JSON on line {line_num}: {e}") 448 | 449 | total_entries = len(entries_to_process) 450 | self.logger.info(f"🚀 Starting parallel processing:") 451 | self.logger.info(f" 📊 Total entries: {total_entries}") 452 | self.logger.info(f" 👥 Workers: {max_workers}") 453 | self.logger.info(f" 📈 Inconsistency ratio: {inconsistency_ratio}") 454 | self.logger.info(f" 🎯 Expected inconsistent entries: {int(total_entries * inconsistency_ratio)}") 455 | self.logger.info(f" 🔧 Applicability check: {'enabled' if self.use_applicability_check else 'disabled'}") 456 | 457 | # Get simplified LLM config for worker processes (ensure serializability) 458 | try: 459 | if hasattr(self.llm_manager, 'llm_config_test'): 460 | llm_config = dict(self.llm_manager.llm_config_test) # Ensure it's a dict, not custom object 461 | else: 462 | # Fallback: construct config from basic attributes 463 | llm_config = { 464 | 'provider': getattr(self.llm_manager, 'provider', 'openai'), 465 | 'api_key': getattr(self.llm_manager, 'api_key', 'empty'), 466 | 'base_url': getattr(self.llm_manager, 'base_url', None), 467 | 'model': getattr(self.llm_manager, 'model', 'gpt-3.5-turbo'), 468 | 'max_retries': getattr(self.llm_manager, 'max_retries', 3) 469 | } 470 | # Ensure all values are serializable 471 | serializable_config = {} 472 | for key, value in llm_config.items(): 473 | if isinstance(value, (str, int, float, bool, type(None))): 474 | serializable_config[key] = value 475 | else: 476 | serializable_config[key] = str(value) 477 | 478 | llm_config = serializable_config 479 | 480 | 481 | except Exception as e: 482 | 
self.logger.warning(f"Could not extract LLM config: {e}, using minimal defaults") 483 | llm_config = { 484 | 'provider': 'openai', 485 | 'api_key': 'empty', 486 | 'base_url': None, 487 | 'model': 'gpt-3.5-turbo', 488 | 'max_retries': 3 489 | } 490 | 491 | 492 | # Prepare worker arguments as simple tuples to avoid serialization issues 493 | worker_args_list = [] 494 | for entry_data in entries_to_process: 495 | worker_args = ( 496 | entry_data, # (line_num, original_entry) 497 | llm_config, 498 | inconsistency_ratio, 499 | self.use_applicability_check, 500 | hash(input_file) # seed_offset 501 | ) 502 | worker_args_list.append(worker_args) 503 | 504 | processed_count = 0 505 | inconsistent_count = 0 506 | error_count = 0 507 | 508 | # Process entries in parallel using simple map approach with progress monitoring 509 | try: 510 | import time 511 | start_time = time.time() 512 | self.logger.info(f"⏱️ Starting multiprocessing with {max_workers} workers at {time.strftime('%H:%M:%S')}...") 513 | 514 | # Use multiprocessing approach with progress monitoring and real-time file writing 515 | with multiprocessing.Pool(processes=max_workers) as pool: 516 | # Submit all tasks asynchronously for better progress tracking 517 | async_results = [] 518 | for worker_args in worker_args_list: 519 | async_result = pool.apply_async(_worker_process_entry_simple, worker_args) 520 | async_results.append(async_result) 521 | 522 | self.logger.info(f"📤 Submitted {len(async_results)} tasks, monitoring progress...") 523 | 524 | # Monitor progress and collect results, write to file immediately 525 | completed_count = 0 526 | last_progress_time = start_time 527 | 528 | # Open output file for writing results as they complete 529 | with open(output_path, 'w', encoding='utf-8') as outfile: 530 | for i, async_result in enumerate(async_results): 531 | try: 532 | # Get result (this will block until the specific task completes) 533 | result = async_result.get() 534 | completed_count += 1 535 | processed_count += 1 536 | 537 | # Update counters 538 | if result['type'] == 'inconsistent': 539 | inconsistent_count += 1 540 | elif result['type'] == 'error': 541 | error_count += 1 542 | self.logger.error(f"Error processing line {result['line_number']}: {result['error']}") 543 | 544 | # Write result to file immediately 545 | outfile.write(json.dumps(result['entry'], ensure_ascii=False) + '\n') 546 | outfile.flush() # Ensure data is written to disk 547 | 548 | # Progress reporting - show progress with timing info 549 | current_time = time.time() 550 | if completed_count % 1 == 0 or completed_count == total_entries: 551 | progress_percentage = (completed_count / total_entries) * 100 552 | elapsed_time = current_time - start_time 553 | rate = completed_count / elapsed_time if elapsed_time > 0 else 0 554 | eta_seconds = (total_entries - completed_count) / rate if rate > 0 else 0 555 | eta_str = f"{int(eta_seconds//60)}m{int(eta_seconds%60)}s" if eta_seconds < 3600 else f"{int(eta_seconds//3600)}h{int((eta_seconds%3600)//60)}m" 556 | 557 | self.logger.info(f"🔄 Progress: {completed_count}/{total_entries} ({progress_percentage:.1f}%) | " 558 | f"⚡ {rate:.1f}/s | ETA: {eta_str} | " 559 | f"✅ Inconsistent: {inconsistent_count} | ❌ Errors: {error_count}") 560 | 561 | except Exception as e: 562 | error_count += 1 563 | self.logger.error(f"Failed to get result for task {i}: {e}") 564 | # Create error entry for failed task and write it immediately 565 | if i < len(worker_args_list): 566 | entry_data = worker_args_list[i][0] 567 | line_num, 
original_entry = entry_data 568 | error_result = { 569 | 'line_number': line_num, 570 | 'is_inconsistent': False, 571 | 'error': str(e), 572 | 'entry': original_entry, 573 | 'type': 'error' 574 | } 575 | outfile.write(json.dumps(error_result['entry'], ensure_ascii=False) + '\n') 576 | outfile.flush() 577 | 578 | except Exception as e: 579 | self.logger.error(f"Multiprocessing failed: {e}") 580 | raise 581 | 582 | self.logger.info(f"✅ Parallel processing complete. Total: {processed_count}, " 583 | f"Inconsistent: {inconsistent_count}, Errors: {error_count}") 584 | self.logger.info(f"📄 Output written to: {output_path}") 585 | 586 | def analyze_commit_applicability(self, commit_entry: Dict[str, Any]) -> Dict[str, Any]: 587 | """ 588 | Analyze rule applicability for a specific commit entry 589 | 590 | Args: 591 | commit_entry: Commit data entry 592 | 593 | Returns: 594 | Detailed applicability analysis 595 | """ 596 | if not self.use_applicability_check or not hasattr(self.rule_manager, 'analyze_commit_applicability'): 597 | return { 598 | 'error': 'Applicability checking not available', 599 | 'use_applicability_check': self.use_applicability_check 600 | } 601 | 602 | try: 603 | # Get detailed applicability analysis 604 | applicability = self.rule_manager.analyze_commit_applicability(commit_entry) 605 | 606 | # Get best rule selection 607 | best_rule, selection_info = self.rule_manager.get_best_rule_for_commit(commit_entry) 608 | 609 | # Count applicable rules 610 | applicable_count = sum(1 for rule_info in applicability.values() if rule_info.get('applicable', False)) 611 | 612 | analysis_result = { 613 | 'commit_message': commit_entry.get('message', ''), 614 | 'files_changed': commit_entry.get('files', []), 615 | 'total_rules': len(applicability), 616 | 'applicable_rules_count': applicable_count, 617 | 'applicability_details': applicability, 618 | 'selected_rule': { 619 | 'name': best_rule.name, 620 | 'type': best_rule.rule_type.value, 621 | 'weight': best_rule.weight, 622 | 'reasoning': selection_info.get('reasoning', 'N/A') 623 | }, 624 | 'summary': self.rule_manager.get_applicability_summary(commit_entry) 625 | } 626 | 627 | return analysis_result 628 | 629 | except Exception as e: 630 | self.logger.error(f"Error in applicability analysis: {e}") 631 | return { 632 | 'error': f'Applicability analysis failed: {str(e)}', 633 | 'commit_message': commit_entry.get('message', ''), 634 | 'files_changed': commit_entry.get('files', []) 635 | } 636 | 637 | def process_jsonl_file_with_analysis(self, input_file: str, output_file: str, 638 | analysis_file: Optional[str] = None, 639 | num_samples: Optional[int] = None, 640 | inconsistency_ratio: float = 1.0, 641 | max_workers: int = 1) -> None: 642 | """ 643 | Process JSONL file with detailed rule applicability analysis 644 | 645 | Args: 646 | input_file: Input JSONL file path 647 | output_file: Output JSONL file path 648 | analysis_file: Optional file to save applicability analysis 649 | num_samples: Number of samples to process 650 | inconsistency_ratio: Ratio of inconsistent data (0.0-1.0) 651 | max_workers: Number of worker processes for parallel processing 652 | """ 653 | if max_workers == 1: 654 | # Use sequential processing (original behavior) 655 | self._process_jsonl_file_with_analysis_sequential( 656 | input_file, output_file, analysis_file, num_samples, inconsistency_ratio 657 | ) 658 | else: 659 | # Note: Analysis with multiprocessing is complex due to shared state 660 | # For now, fall back to sequential processing with a warning 661 | 
self.logger.warning("Analysis mode with multiprocessing not fully supported, using sequential processing") 662 | self._process_jsonl_file_with_analysis_sequential( 663 | input_file, output_file, analysis_file, num_samples, inconsistency_ratio 664 | ) 665 | 666 | def _process_jsonl_file_with_analysis_sequential(self, input_file: str, output_file: str, 667 | analysis_file: Optional[str] = None, 668 | num_samples: Optional[int] = None, 669 | inconsistency_ratio: float = 1.0) -> None: 670 | """Original sequential analysis processing method""" 671 | input_path = Path(input_file) 672 | output_path = Path(output_file) 673 | 674 | if not input_path.exists(): 675 | raise FileNotFoundError(f"Input file not found: {input_file}") 676 | 677 | # Ensure output directory exists 678 | output_path.parent.mkdir(parents=True, exist_ok=True) 679 | 680 | processed_count = 0 681 | inconsistent_count = 0 682 | analysis_data = [] 683 | 684 | with open(input_path, 'r', encoding='utf-8') as infile, \ 685 | open(output_path, 'w', encoding='utf-8') as outfile: 686 | 687 | for line_num, line in enumerate(infile, 1): 688 | if num_samples is not None and processed_count >= num_samples: 689 | break 690 | 691 | try: 692 | original_entry = json.loads(line.strip()) 693 | 694 | # Decide whether to generate inconsistent data 695 | if random.random() < inconsistency_ratio: 696 | # Analyze applicability if enabled 697 | if self.use_applicability_check and analysis_file: 698 | analysis = self.analyze_commit_applicability(original_entry) 699 | analysis['line_number'] = line_num 700 | analysis['processed_as'] = 'inconsistent' 701 | analysis_data.append(analysis) 702 | 703 | # Generate inconsistent data 704 | inconsistent_entry = self.generate_inconsistent_entry(original_entry) 705 | outfile.write(json.dumps(inconsistent_entry, ensure_ascii=False) + '\n') 706 | inconsistent_count += 1 707 | self.logger.info(f"Generated inconsistent entry {inconsistent_count} from line {line_num}") 708 | else: 709 | # Keep original data (add consistency field) 710 | consistent_entry = original_entry.copy() 711 | consistent_entry['consistency'] = True 712 | outfile.write(json.dumps(consistent_entry, ensure_ascii=False) + '\n') 713 | self.logger.info(f"Kept consistent entry from line {line_num}") 714 | 715 | if analysis_file: 716 | analysis_data.append({ 717 | 'line_number': line_num, 718 | 'processed_as': 'consistent', 719 | 'commit_message': original_entry.get('message', ''), 720 | 'files_changed': original_entry.get('files', []) 721 | }) 722 | 723 | processed_count += 1 724 | 725 | if processed_count % 10 == 0: 726 | self.logger.info(f"Processed {processed_count} entries...") 727 | 728 | except json.JSONDecodeError as e: 729 | self.logger.error(f"Failed to parse JSON on line {line_num}: {e}") 730 | except Exception as e: 731 | self.logger.error(f"Error processing line {line_num}: {e}") 732 | 733 | # Save analysis data if requested 734 | if analysis_file and analysis_data: 735 | analysis_path = Path(analysis_file) 736 | analysis_path.parent.mkdir(parents=True, exist_ok=True) 737 | 738 | with open(analysis_path, 'w', encoding='utf-8') as f: 739 | json.dump(analysis_data, f, ensure_ascii=False, indent=2) 740 | 741 | self.logger.info(f"Saved applicability analysis to {analysis_file}") 742 | 743 | self.logger.info(f"Processing complete. 
Total: {processed_count}, Inconsistent: {inconsistent_count}") 744 | 745 | def generate_batch_samples(self, original_entries: List[Dict[str, Any]], 746 | batch_size: int = 10) -> List[Dict[str, Any]]: 747 | """ 748 | Generate inconsistent samples in batches 749 | 750 | Args: 751 | original_entries: List of original data entries 752 | batch_size: Batch processing size 753 | 754 | Returns: 755 | List of inconsistent data entries 756 | """ 757 | inconsistent_entries = [] 758 | 759 | for i in range(0, len(original_entries), batch_size): 760 | batch = original_entries[i:i + batch_size] 761 | self.logger.info(f"Processing batch {i // batch_size + 1}") 762 | 763 | for entry in batch: 764 | try: 765 | inconsistent_entry = self.generate_inconsistent_entry(entry) 766 | inconsistent_entries.append(inconsistent_entry) 767 | except Exception as e: 768 | self.logger.error(f"Failed to process entry: {e}") 769 | 770 | return inconsistent_entries 771 | 772 | def analyze_rules_distribution(self, output_file: str) -> Dict[str, int]: 773 | """ 774 | Analyze distribution of rules in generated data 775 | 776 | Args: 777 | output_file: Output file path 778 | 779 | Returns: 780 | Rule distribution statistics 781 | """ 782 | rule_counts = {} 783 | 784 | try: 785 | with open(output_file, 'r', encoding='utf-8') as f: 786 | for line in f: 787 | try: 788 | entry = json.loads(line.strip()) 789 | if entry.get('consistency') == False: 790 | rule_type = entry.get('inconsistency_rule', 'unknown') 791 | rule_counts[rule_type] = rule_counts.get(rule_type, 0) + 1 792 | except json.JSONDecodeError: 793 | continue 794 | except FileNotFoundError: 795 | self.logger.error(f"Output file not found: {output_file}") 796 | 797 | return rule_counts 798 | 799 | def get_available_rules(self) -> List[Dict[str, Any]]: 800 | """Get information about all available inconsistency rules""" 801 | rules_info = [] 802 | for rule in self.rule_manager.get_all_rules(): 803 | rule_info = { 804 | 'type': rule.rule_type.value, 805 | 'name': rule.name, 806 | 'description': rule.description, 807 | 'weight': rule.weight 808 | } 809 | rules_info.append(rule_info) 810 | return rules_info 811 | 812 | 813 | def setup_logging(log_level: str = "INFO", log_file: Optional[str] = None): 814 | """Setup logging configuration""" 815 | log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 816 | 817 | handlers = [logging.StreamHandler(sys.stdout)] 818 | if log_file: 819 | handlers.append(logging.FileHandler(log_file)) 820 | 821 | logging.basicConfig( 822 | level=getattr(logging, log_level.upper()), 823 | format=log_format, 824 | handlers=handlers 825 | ) 826 | 827 | 828 | class ThreadSafeFileWriter: 829 | """Thread-safe file writer to avoid write conflicts""" 830 | 831 | def __init__(self, file_path: str): 832 | self.file_path = file_path 833 | self.lock = threading.Lock() 834 | self.file_handle = None 835 | 836 | def __enter__(self): 837 | self.file_handle = open(self.file_path, 'w', encoding='utf-8') 838 | return self 839 | 840 | def __exit__(self, exc_type, exc_val, exc_tb): 841 | if self.file_handle: 842 | self.file_handle.close() 843 | 844 | def write_entry(self, entry: Dict[str, Any]): 845 | """Write an entry to file in a thread-safe manner""" 846 | with self.lock: 847 | self.file_handle.write(json.dumps(entry, ensure_ascii=False) + '\n') 848 | self.file_handle.flush() 849 | --------------------------------------------------------------------------------