├── assets └── pipeline.png ├── data_synthesis ├── synthesized_data │ └── eval_50k.jsonl ├── config.example.json ├── __init__.py ├── config.py ├── validate_data.py ├── cli.py ├── inconsistency_rules.py ├── rule_applicability_checker.py ├── llm_interface.py ├── data_validator.py └── data_generator.py ├── LEGAL.md ├── README.md ├── evaluation ├── clone_repos.py ├── fewshot │ └── consistency_checker.py ├── pure_llm │ └── consistency_checker.py ├── CoT │ └── consistency_checker.py └── evaluate_main.py └── LICENSE /assets/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codefuse-ai/CodeFuse-CommitEval/main/assets/pipeline.png -------------------------------------------------------------------------------- /data_synthesis/synthesized_data/eval_50k.jsonl: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:2232f1293dbbb794e5705a2af11813240d4e3ac0b7cdc1297221e33fdf112078 3 | size 280641409 4 | -------------------------------------------------------------------------------- /LEGAL.md: -------------------------------------------------------------------------------- 1 | 法律免责声明 2 | 关于代码注释部分,中文注释为官方版本,其它语言注释仅做参考。中文注释可能与其它语言注释存在不一致,当中文注释与其它语言注释存在不一致时,请以中文注释为准。 3 | 4 | Legal Disclaimer 5 | Within this source code, the comments in Chinese shall be the original, governing version. Any comment in other languages are for reference only. In the event of any conflict between the Chinese language version comments and other language version comments, the Chinese language version shall prevail. 6 | -------------------------------------------------------------------------------- /data_synthesis/config.example.json: -------------------------------------------------------------------------------- 1 | { 2 | "llm": { 3 | "provider": "openai", 4 | "api_key": "your_openai_api_key_here", 5 | "base_url": "", 6 | "model": "gpt-3.5-turbo", 7 | "max_retries": 3 8 | }, 9 | "generation": { 10 | "inconsistency_ratio": 1.0, 11 | "random_seed": 42, 12 | "batch_size": 10, 13 | "max_samples": null 14 | }, 15 | "logging": { 16 | "level": "INFO", 17 | "file": null 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /data_synthesis/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data synthesis package - for generating test data with inconsistent commit messages and code diffs 3 | """ 4 | 5 | from .data_generator import InconsistentDataGenerator, setup_logging 6 | from .inconsistency_rules import InconsistencyRuleManager, InconsistencyType, InconsistencyRule 7 | from .llm_interface import LLMManager, LangChainOpenAIInterface, MockLLMInterface 8 | from .config import DataSynthesisConfig, default_config 9 | 10 | __version__ = "1.0.0" 11 | __author__ = "Research Team" 12 | 13 | __all__ = [ 14 | 'InconsistentDataGenerator', 15 | 'InconsistencyRuleManager', 16 | 'InconsistencyType', 17 | 'InconsistencyRule', 18 | 'LLMManager', 19 | 'LangChainOpenAIInterface', 20 | 'MockLLMInterface', 21 | 'DataSynthesisConfig', 22 | 'default_config', 23 | 'setup_logging' 24 | ] 25 | -------------------------------------------------------------------------------- /data_synthesis/config.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data synthesis configuration 3 | """ 4 | 5 | import os 6 | from typing import Dict, Any 7 | 8 | 9 | class 
DataSynthesisConfig: 10 | """Data synthesis configuration class""" 11 | 12 | def __init__(self): 13 | # LLM configuration 14 | self.llm_config = { 15 | 'provider': os.getenv('LLM_PROVIDER', 'openai'), # openai 16 | 'api_key': os.getenv('LLM_API_KEY', 'empty'), 17 | 'base_url': os.getenv('LLM_BASE_URL', ''), 18 | 'model': os.getenv('LLM_MODEL', 'gpt-3.5-turbo'), 19 | 'max_retries': int(os.getenv('LLM_MAX_RETRIES', '3')) 20 | } 21 | 22 | # Data generation configuration 23 | self.generation_config = { 24 | 'inconsistency_ratio': float(os.getenv('INCONSISTENCY_RATIO', '1.0')), 25 | 'random_seed': int(os.getenv('RANDOM_SEED', '777')), 26 | 'batch_size': int(os.getenv('BATCH_SIZE', '10')), 27 | 'max_samples': int(os.getenv('MAX_SAMPLES', '0')) or None 28 | } 29 | 30 | # Logging configuration 31 | self.logging_config = { 32 | 'level': os.getenv('LOG_LEVEL', 'INFO'), 33 | 'file': os.getenv('LOG_FILE', None) 34 | } 35 | 36 | def get_llm_config(self) -> Dict[str, Any]: 37 | """Get LLM configuration""" 38 | return self.llm_config.copy() 39 | 40 | def get_generation_config(self) -> Dict[str, Any]: 41 | """Get data generation configuration""" 42 | return self.generation_config.copy() 43 | 44 | def get_logging_config(self) -> Dict[str, Any]: 45 | """Get logging configuration""" 46 | return self.logging_config.copy() 47 | 48 | def update_config(self, config_dict: Dict[str, Any]) -> None: 49 | """Update configuration""" 50 | if 'llm' in config_dict: 51 | self.llm_config.update(config_dict['llm']) 52 | if 'generation' in config_dict: 53 | self.generation_config.update(config_dict['generation']) 54 | if 'logging' in config_dict: 55 | self.logging_config.update(config_dict['logging']) 56 | 57 | def to_dict(self) -> Dict[str, Any]: 58 | """Convert to dictionary format""" 59 | return { 60 | 'llm': self.llm_config, 61 | 'generation': self.generation_config, 62 | 'logging': self.logging_config 63 | } 64 | 65 | 66 | # Default configuration instance 67 | default_config = DataSynthesisConfig() 68 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg) 2 | ![Python](https://img.shields.io/badge/python-3.9%2B-blue) 3 | ![arXiv](https://img.shields.io/badge/arXiv-2511.19875-b31b1b.svg) 4 | 5 | 6 | # CodeFuse-CommitEval 7 | 8 | CodeFuse-CommitEval is the first benchmark tailored to commit Message-Code Inconsistency (MCI) detection with large language models (LLMs). Building on the ApacheCM dataset for diversity and quality, we synthesize seven types of inconsistent messages via rule-guided mutations of originally consistent commits and apply two-fold validation to verify both positive (inconsistent) and negative (consistent) samples. Using this rich and labeled dataset of message–diff pairs, we then evaluate six state-of-the-art open-source LLMs under a vanilla setting and with three augmentation strategies: few-shot prompting, chain-of-thought (CoT), and extended context. 
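To make the task concrete, the pair below shows what an operation-type mutation can produce: the diff is left untouched while the message is rewritten so it no longer describes the change. This is a purely hypothetical illustration, not a sample from the released dataset.

```python
# Hypothetical illustration of a message-code inconsistency (MCI).
# Neither the messages nor the diff are taken from eval_50k.jsonl.
original_message = "fix: guard against a missing config file in load_config()"
mutated_message = "refactor: simplify config caching in load_config()"  # no longer matches the diff

diff = """\
 def load_config(path):
+    if not os.path.exists(path):
+        raise FileNotFoundError(path)
     with open(path) as f:
         return json.load(f)
"""
```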
9 |
10 | ![](assets/pipeline.png)
11 |
12 | ## Features
13 |
14 | - Multilingual & large-scale dataset
15 | - Even distribution of samples
16 | - Rich inconsistent commit types
17 | - Modular commit mutation rules
18 | - Effective verification for synthesized samples
19 |
20 | ## Related Project
21 |
22 | - [ApacheCM Dataset](https://arxiv.org/html/2507.17690v1) - Contextual Code Retrieval for Commit Message Generation: A Preliminary Study
23 |
24 | ## Documentation
25 | ### Environment Setup
26 |
27 | Tested under Python 3.9.6. Install the dependencies:
28 |
29 | ```shell
30 | python3 -m pip install langchain langchain_openai langchain_community
31 | ```
32 |
33 | ### Benchmarking
34 | First, download all the repositories needed for contextual code retrieval:
35 |
36 | ```shell
37 | python3 evaluation/clone_repos.py
38 | ```
39 |
40 | Then, deploy the target models yourself or use public APIs. In our paper, we evaluated the following models:
41 |
42 | - DeepSeek-V3.1 (Remote API)
43 | - gpt-oss-20b (Local deployment)
44 | - Qwen3-30B-A3B (Local deployment)
45 | - Llama-3.1-8B (Local deployment)
46 | - Mistral-Small-3.2-24B (Local deployment)
47 | - Kimi-K2-Instruct (Remote API)
48 |
49 | Run benchmarking:
50 | ```shell
51 | python3 evaluation/evaluate_main.py \
52 |     -s {pure_llm,fewshot_llm,cot_llm} \
53 |     --ctx \
54 |     -d \
55 |     -r \
56 |     --api_key \
57 |     --api-base \
58 |     --model \
59 |     -o \
60 |     --worker
61 | ```
62 |
63 | ## Contribution
64 |
65 | We welcome and encourage contributions from the community! If you're interested in contributing to this project, please follow these guidelines:
66 |
67 | 1. **Identify a Need**: Before submitting a pull request (PR), ensure that your contribution addresses a real need or improvement for the project.
68 |
69 | 2. **Submit a PR**: Create a pull request with a clear description of:
70 |    - The problem or feature request you're addressing
71 |    - How your changes solve the problem or implement the feature
72 |    - Any relevant test cases or documentation updates
73 |
74 | 3. **Review Process**: Our team will review your PR based on:
75 |    - Whether the contribution addresses a genuine need for the project
76 |    - The quality and correctness of the implementation
77 |    - Adherence to the project's coding standards and architecture
78 |
79 | We appreciate your interest in making CodeFuse-CommitEval better.
80 |
81 | ## Citation
82 | ```
83 | @misc{zhang2025codefusecommitevalbenchmarkingllmspower,
84 |       title={CodeFuse-CommitEval: Towards Benchmarking LLM's Power on Commit Message and Code Change Inconsistency Detection},
85 |       author={Qingyu Zhang and Puzhuo Liu and Peng Di and Chenxiong Qian},
86 |       year={2025},
87 |       eprint={2511.19875},
88 |       archivePrefix={arXiv},
89 |       primaryClass={cs.SE},
90 |       url={https://arxiv.org/abs/2511.19875},
91 | }
92 | ```
93 |
94 | ## License
95 |
96 | CodeFuse-CommitEval is licensed under the [Apache License 2.0](./LICENSE).
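For reference, here is a minimal sketch of how the synthesized records in `data_synthesis/synthesized_data/eval_50k.jsonl` can be loaded for inspection. The field names (`message`, `diff`, `files`, `git_url`) are assumptions inferred from the synthesis and evaluation code; consult the released file for the authoritative schema.

```python
import json

# Minimal loading sketch. Field names are assumptions inferred from the
# data_synthesis / evaluation code, not a guaranteed schema.
with open("data_synthesis/synthesized_data/eval_50k.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        message = record.get("message", "")  # commit message (possibly mutated)
        diff = record.get("diff", "")        # unified code diff
        files = record.get("files", [])      # files touched by the commit
        git_url = record.get("git_url", "")  # source repository URL
        # ... pass (message, diff) to a consistency checker here
        break  # inspect just the first record
```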
97 | 98 | -------------------------------------------------------------------------------- /data_synthesis/validate_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | CLI tool for validating generated inconsistent data 4 | """ 5 | 6 | import argparse 7 | import json 8 | import sys 9 | from pathlib import Path 10 | 11 | from data_validator import DataValidator 12 | from llm_interface import LLMManager 13 | from config import DataSynthesisConfig 14 | 15 | 16 | def main(): 17 | """Main function for data validation CLI""" 18 | 19 | parser = argparse.ArgumentParser( 20 | description="Validate quality of generated inconsistent commit message data", 21 | formatter_class=argparse.RawDescriptionHelpFormatter, 22 | epilog=""" 23 | Examples: 24 | # Basic validation 25 | python validate_data.py input.jsonl output.jsonl 26 | 27 | # Validate with custom LLM settings 28 | python validate_data.py input.jsonl output.jsonl --api-key your_key 29 | 30 | # Process only first 50 entries 31 | python validate_data.py input.jsonl output.jsonl --max-entries 50 32 | 33 | # Save detailed validation report 34 | python validate_data.py input.jsonl output.jsonl --report validation_report.json 35 | """ 36 | ) 37 | 38 | # Required arguments 39 | parser.add_argument('input_file', help='Input JSONL file with generated data') 40 | parser.add_argument('output_file', help='Output JSONL file for valid entries') 41 | 42 | # LLM configuration 43 | llm_group = parser.add_argument_group('LLM Configuration') 44 | llm_group.add_argument('--provider', choices=['openai'], 45 | default='openai', help='LLM provider (default: openai)') 46 | llm_group.add_argument('--api-key', default='empty', 47 | help='API key (default: empty, will use mock response)') 48 | llm_group.add_argument('--base-url', help='API base URL') 49 | llm_group.add_argument('--model', help='Model name') 50 | llm_group.add_argument('--max-retries', type=int, default=3, 51 | help='Maximum retry attempts (default: 3)') 52 | 53 | # Validation configuration 54 | val_group = parser.add_argument_group('Validation Configuration') 55 | val_group.add_argument('--max-entries', type=int, 56 | help='Maximum number of entries to process') 57 | val_group.add_argument('--report', 58 | help='Save detailed validation report to JSON file') 59 | 60 | # Other options 61 | parser.add_argument('--log-level', choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'], 62 | default='INFO', help='Log level (default: INFO)') 63 | parser.add_argument('--config', help='Load configuration from JSON file') 64 | 65 | args = parser.parse_args() 66 | 67 | # Setup logging 68 | import logging 69 | logging.basicConfig( 70 | level=getattr(logging, args.log_level), 71 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' 72 | ) 73 | 74 | # Create configuration 75 | config = DataSynthesisConfig() 76 | 77 | # Override with command line arguments if provided 78 | llm_config_dict = config.llm_config 79 | if args.api_key != 'empty': 80 | llm_config_dict['api_key'] = args.api_key 81 | if args.base_url: 82 | llm_config_dict['base_url'] = args.base_url 83 | if args.model: 84 | llm_config_dict['model'] = args.model 85 | if args.max_retries: 86 | llm_config_dict['max_retries'] = args.max_retries 87 | llm_config_dict['provider'] = args.provider 88 | 89 | # Load from config file if provided 90 | if args.config: 91 | import json 92 | with open(args.config, 'r') as f: 93 | config_data = json.load(f) 94 | config.update_config(config_data) 95 | 96 | 
# Validate files 97 | input_path = Path(args.input_file) 98 | if not input_path.exists(): 99 | print(f"❌ Input file not found: {args.input_file}", file=sys.stderr) 100 | sys.exit(1) 101 | 102 | # Create validator and run validation 103 | try: 104 | print(f"🔍 Starting validation of {args.input_file}") 105 | print(f"📝 Valid entries will be saved to {args.output_file}") 106 | if args.max_entries: 107 | print(f"📊 Processing maximum {args.max_entries} entries") 108 | print() 109 | 110 | llm_manager = LLMManager(config.get_llm_config()) 111 | validator = DataValidator(llm_manager) 112 | 113 | stats = validator.validate_file( 114 | input_file=args.input_file, 115 | output_file=args.output_file, 116 | max_entries=args.max_entries 117 | ) 118 | 119 | # Print statistics 120 | print() 121 | validator.print_validation_statistics(stats) 122 | 123 | # Save detailed report if requested 124 | if args.report: 125 | with open(args.report, 'w', encoding='utf-8') as f: 126 | json.dump(stats, f, indent=2, ensure_ascii=False) 127 | print(f"\n📄 Detailed validation report saved to {args.report}") 128 | 129 | print(f"\n✅ Validation complete! Valid data saved to {args.output_file}") 130 | 131 | except Exception as e: 132 | print(f"❌ Error during validation: {e}", file=sys.stderr) 133 | sys.exit(1) 134 | 135 | 136 | if __name__ == "__main__": 137 | main() 138 | -------------------------------------------------------------------------------- /evaluation/clone_repos.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Git Repository Cloner for Consistency Dataset 4 | 5 | This script parses the filtered_inconsistency.jsonl file, extracts unique git URLs, 6 | and clones them to the specified repository collection directory. 7 | """ 8 | 9 | import json 10 | import os 11 | import subprocess 12 | import sys 13 | import threading 14 | from pathlib import Path 15 | from typing import Set, List 16 | from urllib.parse import urlparse 17 | from concurrent.futures import ThreadPoolExecutor, as_completed 18 | 19 | def parse_jsonl_file(file_path: str) -> List[dict]: 20 | """ 21 | Parse JSONL file and return list of records. 22 | 23 | Args: 24 | file_path (str): Path to the JSONL file 25 | 26 | Returns: 27 | List[dict]: List of parsed JSON records 28 | """ 29 | records = [] 30 | try: 31 | with open(file_path, 'r', encoding='utf-8') as f: 32 | for line_num, line in enumerate(f, 1): 33 | line = line.strip() 34 | if not line: 35 | continue 36 | try: 37 | record = json.loads(line) 38 | records.append(record) 39 | except json.JSONDecodeError as e: 40 | print(f"Warning: Failed to parse JSON on line {line_num}: {e}") 41 | continue 42 | except FileNotFoundError: 43 | print(f"Error: File not found: {file_path}") 44 | sys.exit(1) 45 | except Exception as e: 46 | print(f"Error reading file {file_path}: {e}") 47 | sys.exit(1) 48 | 49 | return records 50 | 51 | def extract_git_urls(records: List[dict]) -> Set[str]: 52 | """ 53 | Extract unique git URLs from records. 
54 |
55 |     Args:
56 |         records (List[dict]): List of parsed records
57 |
58 |     Returns:
59 |         Set[str]: Set of unique git URLs
60 |     """
61 |     git_urls = set()
62 |
63 |     for record in records:
64 |         git_url = record.get('git_url')
65 |         if git_url and isinstance(git_url, str):
66 |             # Clean up the URL (remove .git suffix if present for consistency)
67 |             if git_url.endswith('.git'):
68 |                 git_url = git_url[:-4]
69 |             git_urls.add(git_url + '.git')  # Add .git back for cloning
70 |
71 |     return git_urls
72 |
73 | def get_repo_name_from_url(git_url: str) -> str:
74 |     """
75 |     Extract repository name from git URL.
76 |
77 |     Args:
78 |         git_url (str): Git repository URL
79 |
80 |     Returns:
81 |         str: Repository name
82 |     """
83 |     parsed = urlparse(git_url)
84 |     path = parsed.path.strip('/')
85 |
86 |     if path.endswith('.git'):
87 |         path = path[:-4]
88 |
89 |     # Handle GitHub URLs
90 |     parts = path.split('/')
91 |     if len(parts) >= 2:
92 |         return f"{parts[-2]}_{parts[-1]}"
93 |     else:
94 |         return parts[-1] if parts else "unknown_repo"
95 |
96 | def clone_repository(git_url: str, target_dir: str) -> tuple[bool, str]:
97 |     """
98 |     Clone a git repository to the target directory.
99 |
100 |     Args:
101 |         git_url (str): Git repository URL to clone
102 |         target_dir (str): Target directory for cloning
103 |
104 |     Returns:
105 |         tuple[bool, str]: (success, repo_name)
106 |     """
107 |     repo_name = get_repo_name_from_url(git_url)
108 |     repo_path = os.path.join(target_dir, repo_name)
109 |
110 |     # Thread-safe printing
111 |     thread_id = threading.current_thread().name
112 |
113 |     # Check if repository already exists
114 |     if os.path.exists(repo_path):
115 |         print(f"[{thread_id}] Repository already exists: {repo_name}")
116 |         return True, repo_name
117 |
118 |     try:
119 |         print(f"[{thread_id}] Cloning {git_url} to {repo_name}...")
120 |
121 |         # Run a plain `git clone` (full history; no shallow clone is used)
122 |         cmd = [
123 |             'git', 'clone',
124 |             git_url,
125 |             repo_path
126 |         ]
127 |
128 |         result = subprocess.run(
129 |             cmd,
130 |             capture_output=True,
131 |             text=True,
132 |             timeout=3000  # 50-minute timeout per clone
133 |         )
134 |
135 |         if result.returncode == 0:
136 |             print(f"[{thread_id}] ✅ Successfully cloned: {repo_name}")
137 |             return True, repo_name
138 |         else:
139 |             print(f"[{thread_id}] ❌ Failed to clone {repo_name}")
140 |             print(f"[{thread_id}] Error: {result.stderr}")
141 |             return False, repo_name
142 |
143 |     except subprocess.TimeoutExpired:
144 |         print(f"[{thread_id}] ❌ Timeout while cloning {repo_name}")
145 |         return False, repo_name
146 |     except Exception as e:
147 |         print(f"[{thread_id}] ❌ Error cloning {repo_name}: {e}")
148 |         return False, repo_name
149 |
150 | def main():
151 |     """Main function to orchestrate the cloning process."""
152 |
153 |     # File paths
154 |     jsonl_file = sys.argv[1]
155 |     target_dir = sys.argv[2]
156 |
157 |     # Configuration
158 |     max_workers = 4  # Number of concurrent cloning processes
159 |
160 |     print("Git Repository Cloner for Consistency Dataset")
161 |     print("=" * 50)
162 |
163 |     # Create target directory if it doesn't exist
164 |     os.makedirs(target_dir, exist_ok=True)
165 |     print(f"Target directory: {target_dir}")
166 |     print(f"Max concurrent workers: {max_workers}")
167 |
168 |     # Parse JSONL file
169 |     print(f"Parsing JSONL file: {jsonl_file}")
170 |     records = parse_jsonl_file(jsonl_file)
171 |     print(f"Loaded {len(records)} records")
172 |
173 |     # Extract unique git URLs
174 |     print("Extracting unique git URLs...")
175 |     git_urls = extract_git_urls(records)
176 |     print(f"Found {len(git_urls)} unique git
URLs") 177 | 178 | # Display URLs to be cloned 179 | print("\nURLs to be cloned:") 180 | for i, url in enumerate(sorted(git_urls), 1): 181 | print(f" {i:3d}. {url}") 182 | 183 | # Ask for confirmation 184 | print(f"\nThis will clone {len(git_urls)} repositories to {target_dir}") 185 | print(f"Using {max_workers} concurrent workers") 186 | response = input("Do you want to continue? (y/N): ").strip().lower() 187 | 188 | if response not in ['y', 'yes']: 189 | print("Operation cancelled.") 190 | return 191 | 192 | # Clone repositories concurrently 193 | print("\nStarting concurrent repository cloning...") 194 | print("=" * 50) 195 | 196 | successful_clones = 0 197 | failed_clones = 0 198 | completed_count = 0 199 | 200 | # Use ThreadPoolExecutor for concurrent cloning 201 | with ThreadPoolExecutor(max_workers=max_workers) as executor: 202 | # Submit all clone tasks 203 | future_to_url = { 204 | executor.submit(clone_repository, url, target_dir): url 205 | for url in sorted(git_urls) 206 | } 207 | 208 | # Process completed tasks as they finish 209 | for future in as_completed(future_to_url): 210 | git_url = future_to_url[future] 211 | completed_count += 1 212 | 213 | try: 214 | success, repo_name = future.result() 215 | if success: 216 | successful_clones += 1 217 | print(f"[MAIN] Progress: {completed_count}/{len(git_urls)} - ✅ {repo_name}") 218 | else: 219 | failed_clones += 1 220 | print(f"[MAIN] Progress: {completed_count}/{len(git_urls)} - ❌ {repo_name}") 221 | 222 | except Exception as e: 223 | failed_clones += 1 224 | repo_name = get_repo_name_from_url(git_url) 225 | print(f"[MAIN] Progress: {completed_count}/{len(git_urls)} - ❌ {repo_name} (Exception: {e})") 226 | 227 | # Summary 228 | print("\n" + "=" * 50) 229 | print("CLONING SUMMARY") 230 | print("=" * 50) 231 | print(f"Total repositories: {len(git_urls)}") 232 | print(f"Successfully cloned: {successful_clones}") 233 | print(f"Failed to clone: {failed_clones}") 234 | print(f"Concurrent workers used: {max_workers}") 235 | 236 | if failed_clones > 0: 237 | print(f"\n⚠️ {failed_clones} repositories failed to clone.") 238 | print("You may want to retry these manually or check your network connection.") 239 | else: 240 | print("\n🎉 All repositories cloned successfully!") 241 | 242 | if __name__ == "__main__": 243 | main() 244 | -------------------------------------------------------------------------------- /evaluation/fewshot/consistency_checker.py: -------------------------------------------------------------------------------- 1 | from langchain_community.chat_models import ChatOpenAI 2 | from langchain_core.prompts import ChatPromptTemplate 3 | from typing import Dict, Any, Optional 4 | import logging 5 | import json 6 | import re 7 | 8 | import time 9 | import os 10 | 11 | 12 | class ConsistencyChecker: 13 | """ 14 | A consistency checker that uses few-shot learning with examples. 15 | """ 16 | 17 | def __init__(self, openai_api_key: str, openai_api_base: str, model: str = "gpt-3.5-turbo"): 18 | """ 19 | Initialize the few-shot consistency checker with LLM configuration. 
20 | 21 | Args: 22 | openai_api_key (str): OpenAI API key 23 | openai_api_base (str): OpenAI API base URL 24 | model (str): Model name to use 25 | """ 26 | from langchain_community.chat_models import ChatOpenAI 27 | from langchain_core.prompts import ChatPromptTemplate 28 | 29 | # Validate input parameters 30 | if not openai_api_key or not openai_api_key.strip(): 31 | raise ValueError("OpenAI API key cannot be empty") 32 | if not openai_api_base or not openai_api_base.strip(): 33 | raise ValueError("OpenAI API base URL cannot be empty") 34 | if not model or not model.strip(): 35 | raise ValueError("Model name cannot be empty") 36 | 37 | self.chat_model = ChatOpenAI( 38 | openai_api_key=openai_api_key, 39 | openai_api_base=openai_api_base, 40 | model=model, 41 | temperature=0 42 | ) 43 | 44 | # Define the system prompt with few-shot examples 45 | self.system_prompt = """You are an expert code reviewer tasked with evaluating the consistency between commit messages and their corresponding code changes. 46 | 47 | Your job is to analyze whether a commit message accurately describes the actual code changes made in the diff. 48 | 49 | Consider the following aspects: 50 | 1. Does the commit message describe the actual changes made? 51 | 2. Are the mentioned components/files/functions actually modified? 52 | 3. Is the scope of changes (major/minor) consistent with the message? 53 | 4. Are any important changes missing from the commit message? 54 | 5. Does the commit message contain any false or misleading information? 55 | 56 | Here are two examples to guide your analysis: 57 | 58 | **Example 1 (Consistent):** 59 | Commit Message: "Fix memory leak in buffer allocation" 60 | Code Diff: 61 | ``` 62 | void allocate_buffer() {{ 63 | - char* buf = malloc(1024); 64 | + char* buf = malloc(1024); 65 | + if (!buf) return; 66 | // process buffer 67 | + free(buf); 68 | }} 69 | ``` 70 | Analysis: {{ 71 | "consistent": true, 72 | "confidence": 0.95, 73 | "reasoning": "The commit message accurately describes the change. A memory leak was indeed fixed by adding proper error checking and freeing the allocated buffer.", 74 | "issues": [] 75 | }} 76 | 77 | **Example 2 (Inconsistent):** 78 | Commit Message: "Add new sorting algorithm implementation" 79 | Code Diff: 80 | ``` 81 | int compare_strings(const char* a, const char* b) {{ 82 | - return strcmp(a, b); 83 | + return strcasecmp(a, b); 84 | }} 85 | ``` 86 | Analysis: {{ 87 | "consistent": false, 88 | "confidence": 0.9, 89 | "reasoning": "The commit message claims to add a new sorting algorithm, but the actual change only modifies a string comparison function to be case-insensitive. 
No sorting algorithm was added.", 90 | "issues": ["Commit message mentions adding sorting algorithm but only string comparison was changed", "Scope mismatch: minor change vs claimed major addition"] 91 | }} 92 | 93 | Respond with a JSON object containing: 94 | - "consistent": true/false, 95 | - "confidence": 0.0-1.0, 96 | - "reasoning": "detailed explanation of your analysis", 97 | - "issues": ["list of specific inconsistencies found, if any"] 98 | 99 | Be thorough in your analysis and provide clear reasoning for your decision.""" 100 | 101 | self.user_prompt = """Please analyze the consistency between this commit message and the corresponding code diff: 102 | 103 | **Commit Message:** 104 | {commit_message} 105 | 106 | **Code Diff:** 107 | {code_diff} 108 | 109 | Evaluate whether the commit message accurately describes the code changes and respond with the requested JSON format.""" 110 | 111 | def check_consistency(self, commit_message: str, code_diff: str) -> Dict[str, Any]: 112 | """ 113 | Check consistency between a commit message and code diff using few-shot learning. 114 | 115 | Args: 116 | commit_message (str): The commit message to analyze 117 | code_diff (str): The code diff to analyze 118 | 119 | Returns: 120 | Dict[str, Any]: Analysis result containing consistency, confidence, reasoning, and issues 121 | """ 122 | try: 123 | from langchain_core.prompts import ChatPromptTemplate 124 | 125 | prompt = ChatPromptTemplate.from_messages([ 126 | ("system", self.system_prompt), 127 | ("user", self.user_prompt) 128 | ]) 129 | 130 | chain = prompt | self.chat_model 131 | 132 | response = chain.invoke({ 133 | "commit_message": commit_message, 134 | "code_diff": code_diff 135 | }) 136 | 137 | 138 | # Extract token usage from response 139 | prompt_tokens = 0 140 | completion_tokens = 0 141 | total_tokens = 0 142 | 143 | try: 144 | if hasattr(response, 'response_metadata') and response.response_metadata: 145 | token_usage = response.response_metadata.get('token_usage', {}) 146 | prompt_tokens = token_usage.get('prompt_tokens', 0) 147 | completion_tokens = token_usage.get('completion_tokens', 0) 148 | total_tokens = token_usage.get('total_tokens', 0) 149 | elif hasattr(response, 'usage_metadata'): 150 | prompt_tokens = getattr(response.usage_metadata, 'input_tokens', 0) 151 | completion_tokens = getattr(response.usage_metadata, 'output_tokens', 0) 152 | total_tokens = getattr(response.usage_metadata, 'total_tokens', 0) 153 | except Exception as e: 154 | pass 155 | 156 | # Parse the JSON response 157 | result = self._parse_response(response.content) 158 | 159 | # Add token usage to result 160 | result['prompt_tokens'] = prompt_tokens 161 | result['completion_tokens'] = completion_tokens 162 | result['total_tokens'] = total_tokens 163 | 164 | return result 165 | 166 | except Exception as e: 167 | # logger.error(f"Error in few-shot consistency check: {e}") 168 | import traceback 169 | traceback.print_exc() 170 | return { 171 | "consistent": False, 172 | "confidence": 0.0, 173 | "reasoning": f"Error occurred during analysis: {str(e)}", 174 | "issues": ["Analysis failed due to technical error"], 175 | "prompt_tokens": 0, 176 | "completion_tokens": 0, 177 | "total_tokens": 0 178 | } 179 | 180 | def _parse_response(self, content: str) -> Dict[str, Any]: 181 | """ 182 | Parse the LLM response to extract the JSON result. 
183 | 184 | Args: 185 | content (str): Raw LLM response content 186 | 187 | Returns: 188 | Dict[str, Any]: Parsed analysis result 189 | """ 190 | try: 191 | import json 192 | import re 193 | 194 | content = content.strip() 195 | 196 | # Try to find JSON object in the response 197 | json_match = re.search(r'\{.*\}', content, re.DOTALL) 198 | if json_match: 199 | json_str = json_match.group() 200 | result = json.loads(json_str) 201 | 202 | # Validate required fields 203 | required_fields = ['consistent', 'confidence', 'reasoning', 'issues'] 204 | for field in required_fields: 205 | if field not in result: 206 | raise ValueError(f"Missing required field: {field}") 207 | 208 | # Ensure types are correct 209 | result['consistent'] = bool(result['consistent']) 210 | result['confidence'] = float(result['confidence']) 211 | result['reasoning'] = str(result['reasoning']) 212 | result['issues'] = list(result['issues']) if isinstance(result['issues'], list) else [] 213 | 214 | return result 215 | else: 216 | raise ValueError("No JSON object found in response") 217 | 218 | except Exception as e: 219 | # logger.error(f"Error parsing LLM response: {e}") 220 | # logger.error(f"Raw response: {content}") 221 | 222 | # Return a default response for parsing errors 223 | return { 224 | "consistent": False, 225 | "confidence": 0.0, 226 | "reasoning": f"Failed to parse LLM response: {str(e)}", 227 | "issues": ["Response parsing error"] 228 | } 229 | 230 | 231 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2025 CodeFuse-CommitEval Contributors 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /data_synthesis/cli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Command line tool for inconsistent data generation 4 | """ 5 | 6 | import argparse 7 | import json 8 | import sys 9 | import multiprocessing 10 | from pathlib import Path 11 | from typing import Optional 12 | 13 | from config import DataSynthesisConfig 14 | from data_generator import InconsistentDataGenerator, setup_logging 15 | 16 | 17 | def main(): 18 | """Main function""" 19 | parser = argparse.ArgumentParser( 20 | description="Generate test data with inconsistent commit messages and code diffs", 21 | formatter_class=argparse.RawDescriptionHelpFormatter, 22 | epilog=""" 23 | Examples: 24 | # Basic usage 25 | python -m data_synthesis.cli input.jsonl output.jsonl 26 | 27 | # Specify LLM configuration 28 | python -m data_synthesis.cli input.jsonl output.jsonl --provider openai --api-key your_key 29 | 30 | # Process first 100 samples with 50% inconsistency ratio 31 | python -m data_synthesis.cli input.jsonl output.jsonl --max-samples 100 --ratio 0.5 32 | 33 | # Use parallel processing with 4 workers 34 | python -m data_synthesis.cli input.jsonl output.jsonl --workers 4 35 | 36 | # List all available rules 37 | python -m data_synthesis.cli --list-rules 38 | """ 39 | ) 40 | 41 | # Input/output parameters 42 | parser.add_argument('input_file', nargs='?', help='Input JSONL file path') 43 | parser.add_argument('output_file', nargs='?', help='Output JSONL file path') 44 | 45 | # LLM configuration 46 | llm_group = parser.add_argument_group('LLM Configuration') 47 | llm_group.add_argument('--provider', choices=['openai'], 48 | default='openai', help='LLM provider (default: openai)') 49 | llm_group.add_argument('--api-key', default='empty', 50 | help='API key (default: empty, will use mock response)') 51 | llm_group.add_argument('--base-url', help='API base URL') 52 | llm_group.add_argument('--model', help='Model name') 53 | llm_group.add_argument('--max-retries', type=int, default=3, 54 | help='Maximum retry attempts (default: 3)') 55 | 56 | # Generation configuration 57 | gen_group = parser.add_argument_group('Generation Configuration') 58 | gen_group.add_argument('--ratio', type=float, default=1.0, 59 | help='Inconsistent data ratio 0.0-1.0 (default: 1.0)') 60 | gen_group.add_argument('--max-samples', type=int, 61 | help='Maximum number of samples to process') 62 | gen_group.add_argument('--seed', type=int, default=42, 63 | help='Random seed (default: 42)') 64 | gen_group.add_argument('--batch-size', type=int, default=10, 65 | help='Batch size (default: 10)') 66 | gen_group.add_argument('--no-applicability-check', action='store_true', 67 | help='Disable intelligent rule applicability checking (use random selection)') 68 | gen_group.add_argument('--test-applicability', 69 | help='Test rule applicability for a specific commit (provide JSON file)') 70 | gen_group.add_argument('--save-analysis', 71 | help='Save detailed applicability analysis to file') 72 | gen_group.add_argument('--workers', '-w', type=int, default=1, 73 | help='Number of worker processes for parallel processing (default: 1)') 74 | 75 | # Other options 76 | parser.add_argument('--log-level', choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'], 77 | default='INFO', help='Log level (default: INFO)') 78 | parser.add_argument('--log-file', help='Log file path') 79 | parser.add_argument('--list-rules', 
action='store_true', 80 | help='List all available inconsistency rules') 81 | parser.add_argument('--analyze', help='Analyze rule distribution in output file') 82 | parser.add_argument('--config', help='Load configuration from JSON file') 83 | 84 | args = parser.parse_args() 85 | 86 | # Setup logging 87 | setup_logging(args.log_level, args.log_file) 88 | 89 | # Handle test applicability request 90 | if args.test_applicability: 91 | test_applicability(args.test_applicability, create_config(args)) 92 | return 93 | 94 | # Handle list rules request 95 | if args.list_rules: 96 | list_rules() 97 | return 98 | 99 | # Handle analyze request 100 | if args.analyze: 101 | analyze_output(args.analyze) 102 | return 103 | 104 | # Check required parameters 105 | if not args.input_file or not args.output_file: 106 | parser.error("Input and output file paths are required, or use --list-rules or --analyze options") 107 | 108 | # Validate workers parameter 109 | max_cpu_count = multiprocessing.cpu_count() 110 | if args.workers < 1: 111 | parser.error("Number of workers must be at least 1") 112 | elif args.workers > max_cpu_count: 113 | print(f"⚠️ Number of workers ({args.workers}) exceeds CPU count ({max_cpu_count}), using {max_cpu_count}") 114 | args.workers = max_cpu_count 115 | 116 | # Create configuration 117 | config = create_config(args) 118 | 119 | # Create generator and process file 120 | try: 121 | # print("==============") 122 | # print(config.get_llm_config()) 123 | generator = InconsistentDataGenerator( 124 | llm_config=config.get_llm_config(), 125 | seed=args.seed, 126 | use_applicability_check=not args.no_applicability_check 127 | ) 128 | 129 | print(f"🚀 Starting data generation with {args.workers} worker(s)...") 130 | if args.workers > 1: 131 | print(f"📊 Using multiprocessing for parallel data generation") 132 | 133 | # Use enhanced processing if analysis is requested 134 | if args.save_analysis: 135 | generator.process_jsonl_file_with_analysis( 136 | input_file=args.input_file, 137 | output_file=args.output_file, 138 | analysis_file=args.save_analysis, 139 | num_samples=args.max_samples, 140 | inconsistency_ratio=args.ratio, 141 | max_workers=args.workers 142 | ) 143 | else: 144 | generator.process_jsonl_file( 145 | input_file=args.input_file, 146 | output_file=args.output_file, 147 | num_samples=args.max_samples, 148 | inconsistency_ratio=args.ratio, 149 | max_workers=args.workers 150 | ) 151 | 152 | print(f"✅ Processing complete! 
Output file: {args.output_file}") 153 | 154 | # Show rule distribution statistics 155 | rule_distribution = generator.analyze_rules_distribution(args.output_file) 156 | if rule_distribution: 157 | print("\n📊 Rule Distribution Statistics:") 158 | for rule_type, count in sorted(rule_distribution.items()): 159 | print(f" {rule_type}: {count}") 160 | 161 | except Exception as e: 162 | print(f"❌ Processing failed: {e}", file=sys.stderr) 163 | sys.exit(1) 164 | 165 | 166 | def create_config(args) -> DataSynthesisConfig: 167 | """Create configuration based on command line arguments""" 168 | config = DataSynthesisConfig() 169 | 170 | # Load configuration from file 171 | if args.config: 172 | try: 173 | with open(args.config, 'r', encoding='utf-8') as f: 174 | config_dict = json.load(f) 175 | config.update_config(config_dict) 176 | except Exception as e: 177 | print(f"⚠️ Failed to load configuration file: {e}", file=sys.stderr) 178 | 179 | # Update LLM configuration 180 | llm_updates = {} 181 | if args.provider: 182 | llm_updates['provider'] = args.provider 183 | if args.api_key: 184 | llm_updates['api_key'] = args.api_key 185 | if args.base_url: 186 | llm_updates['base_url'] = args.base_url 187 | if args.model: 188 | llm_updates['model'] = args.model 189 | if args.max_retries: 190 | llm_updates['max_retries'] = args.max_retries 191 | 192 | if llm_updates: 193 | config.update_config({'llm': llm_updates}) 194 | 195 | return config 196 | 197 | 198 | def list_rules(): 199 | """List all available inconsistency rules""" 200 | config = DataSynthesisConfig() 201 | generator = InconsistentDataGenerator(config.get_llm_config()) 202 | 203 | rules = generator.get_available_rules() 204 | 205 | print("📋 Available Inconsistency Rules:") 206 | print("=" * 60) 207 | 208 | for i, rule in enumerate(rules, 1): 209 | print(f"\n{i}. 
{rule['name']} ({rule['type']})") 210 | print(f" Weight: {rule['weight']}") 211 | print(f" Description: {rule['description']}") 212 | 213 | print(f"\nTotal: {len(rules)} rules") 214 | 215 | 216 | def test_applicability(json_file: str, config: DataSynthesisConfig): 217 | """Test rule applicability for a specific commit""" 218 | import json 219 | from rule_applicability_checker import EnhancedInconsistencyRuleManager 220 | from llm_interface import LLMManager 221 | 222 | if not Path(json_file).exists(): 223 | print(f"❌ File does not exist: {json_file}", file=sys.stderr) 224 | sys.exit(1) 225 | 226 | # Load commit data 227 | try: 228 | with open(json_file, 'r', encoding='utf-8') as f: 229 | commit_data = json.load(f) 230 | except (json.JSONDecodeError, IOError) as e: 231 | print(f"❌ Error reading JSON file: {e}", file=sys.stderr) 232 | sys.exit(1) 233 | 234 | # Test applicability 235 | llm_manager = LLMManager(config.get_llm_config()) 236 | enhanced_manager = EnhancedInconsistencyRuleManager(llm_manager) 237 | 238 | print(f"🔍 Testing Rule Applicability for Commit:") 239 | print(f"Message: {commit_data.get('message', 'N/A')}") 240 | print(f"Files: {commit_data.get('files', [])}") 241 | print("=" * 60) 242 | 243 | try: 244 | # Get applicability analysis 245 | applicability = enhanced_manager.analyze_commit_applicability(commit_data) 246 | 247 | print("📊 Applicability Analysis:") 248 | for rule_key, rule_info in applicability.items(): 249 | status = "✅ Applicable" if rule_info["applicable"] else "❌ Not Applicable" 250 | print(f" {rule_key}: {status}") 251 | print(f" Reasoning: {rule_info['reasoning']}") 252 | 253 | # Get best rule 254 | best_rule, selection_info = enhanced_manager.get_best_rule_for_commit(commit_data) 255 | print(f"\n🏆 Best Rule Selected: {best_rule.name} (weight: {best_rule.weight})") 256 | print(f"📝 Selection Reasoning: {selection_info.get('reasoning', 'N/A')}") 257 | 258 | except Exception as e: 259 | print(f"❌ Error during applicability testing: {e}") 260 | sys.exit(1) 261 | 262 | 263 | def analyze_output(output_file: str): 264 | """Analyze rule distribution in output file""" 265 | if not Path(output_file).exists(): 266 | print(f"❌ File does not exist: {output_file}", file=sys.stderr) 267 | sys.exit(1) 268 | 269 | config = DataSynthesisConfig() 270 | generator = InconsistentDataGenerator(config.get_llm_config()) 271 | 272 | rule_distribution = generator.analyze_rules_distribution(output_file) 273 | 274 | if not rule_distribution: 275 | print("📊 No inconsistent data found") 276 | return 277 | 278 | print(f"📊 Rule Distribution Statistics for {output_file}:") 279 | print("=" * 60) 280 | 281 | total_inconsistent = sum(rule_distribution.values()) 282 | 283 | for rule_type, count in sorted(rule_distribution.items(), key=lambda x: x[1], reverse=True): 284 | percentage = (count / total_inconsistent) * 100 285 | print(f"{rule_type:30} {count:6d} ({percentage:5.1f}%)") 286 | 287 | print("-" * 60) 288 | print(f"{'Total':30} {total_inconsistent:6d} (100.0%)") 289 | 290 | 291 | if __name__ == '__main__': 292 | main() 293 | -------------------------------------------------------------------------------- /data_synthesis/inconsistency_rules.py: -------------------------------------------------------------------------------- 1 | """ 2 | Define rules for generating inconsistent commit messages 3 | """ 4 | 5 | from enum import Enum 6 | from dataclasses import dataclass 7 | from typing import List, Dict, Any 8 | import random 9 | 10 | 11 | class InconsistencyType(Enum): 12 | """Inconsistency 
type enumeration""" 13 | FUNCTION_NAME_MISMATCH = "function_name_mismatch" 14 | FILE_PATH_MISMATCH = "file_path_mismatch" 15 | OPERATION_TYPE_MISMATCH = "operation_type_mismatch" 16 | PURPOSE_MISMATCH = "purpose_mismatch" 17 | COMPONENT_MISMATCH = "component_mismatch" 18 | FEATURE_MISSING = "feature_missing" 19 | EXTRA_FEATURE = "extra_feature" 20 | 21 | 22 | @dataclass 23 | class InconsistencyRule: 24 | """Inconsistency rule definition""" 25 | rule_type: InconsistencyType 26 | name: str 27 | description: str 28 | prompt_template: str 29 | weight: float = 1.0 # Weight for rule selection 30 | 31 | 32 | class InconsistencyRuleManager: 33 | """Inconsistency rule manager""" 34 | 35 | def __init__(self): 36 | self.rules = self._initialize_rules() 37 | 38 | def _initialize_rules(self) -> List[InconsistencyRule]: 39 | """Initialize all inconsistency rules""" 40 | return [ 41 | InconsistencyRule( 42 | rule_type=InconsistencyType.FUNCTION_NAME_MISMATCH, 43 | name="Function Name Mismatch", 44 | description="Modify function names mentioned in commit message to make them inconsistent with actual code diff", 45 | prompt_template=""" 46 | Given the following commit message and code diff, generate a COMPLETE inconsistent commit message by changing the function names mentioned to different but plausible function names that don't actually appear in the diff. 47 | 48 | Original commit message: {message} 49 | Code diff: {diff} 50 | 51 | Requirements: 52 | 1. Keep the overall structure and tone of the original message 53 | 2. Replace function names with different but realistic function names 54 | 3. Ensure the new function names are NOT present in the actual diff 55 | 4. Output a COMPLETE, standalone commit message (not just the changed parts) 56 | 5. The resulting message should seem plausible but be factually incorrect about function names 57 | 58 | Return only a JSON object with a single "message" field containing the COMPLETE inconsistent commit message. 59 | """, 60 | weight=1.5 61 | ), 62 | 63 | InconsistencyRule( 64 | rule_type=InconsistencyType.FILE_PATH_MISMATCH, 65 | name="File Path Mismatch", 66 | description="Modify file paths mentioned in commit message to make them inconsistent with actual modified files", 67 | prompt_template=""" 68 | Given the following commit message and code diff, generate a COMPLETE inconsistent commit message by changing the file paths or module names mentioned to different but plausible paths that don't match the actual files changed. 69 | 70 | Original commit message: {message} 71 | Code diff: {diff} 72 | Actual files changed: {files} 73 | 74 | Requirements: 75 | 1. Keep the overall structure and purpose of the original message 76 | 2. Replace file paths or module names with different but realistic alternatives 77 | 3. Ensure the new paths are NOT in the actual files changed list 78 | 4. Output a COMPLETE, standalone commit message (not just the changed parts) 79 | 5. The resulting message should seem plausible but be factually incorrect about which files were modified 80 | 81 | Return only a JSON object with a single "message" field containing the COMPLETE inconsistent commit message. 
82 | """, 83 | weight=1.3 84 | ), 85 | 86 | InconsistencyRule( 87 | rule_type=InconsistencyType.OPERATION_TYPE_MISMATCH, 88 | name="Operation Type Mismatch", 89 | description="Modify operation type described in commit message (add/remove/fix/refactor) to make it inconsistent with actual code changes", 90 | prompt_template=""" 91 | Given the following commit message and code diff, generate a COMPLETE inconsistent commit message by changing the operation type (add/remove/fix/refactor/update/etc.) to a different operation that doesn't match what was actually done in the code. 92 | 93 | Original commit message: {message} 94 | Code diff: {diff} 95 | 96 | Requirements: 97 | 1. Identify the actual operation performed in the diff (adding, removing, fixing, refactoring, etc.) 98 | 2. Change the commit message to describe a different type of operation 99 | 3. Keep other details relatively consistent 100 | 4. Output a COMPLETE, standalone commit message (not just the changed operation) 101 | 5. The new operation should be plausible but factually incorrect 102 | 103 | Common operation changes: 104 | - "fix" → "add" or "remove" 105 | - "add" → "fix" or "refactor" 106 | - "remove" → "update" or "add" 107 | - "refactor" → "fix" or "optimize" 108 | 109 | Return only a JSON object with a single "message" field containing the COMPLETE inconsistent commit message. 110 | """, 111 | weight=1.4 112 | ), 113 | 114 | InconsistencyRule( 115 | rule_type=InconsistencyType.PURPOSE_MISMATCH, 116 | name="Purpose Mismatch", 117 | description="Modify the purpose described in commit message to make it inconsistent with actual code change purpose", 118 | prompt_template=""" 119 | Given the following commit message and code diff, generate a COMPLETE inconsistent commit message by changing the stated purpose or goal of the changes while keeping the technical action similar. 120 | 121 | Original commit message: {message} 122 | Code diff: {diff} 123 | 124 | Requirements: 125 | 1. Keep the technical action (what was changed) relatively accurate 126 | 2. Change the reason/purpose/goal for making the change 127 | 3. Output a COMPLETE, standalone commit message (not just the changed purpose) 128 | 4. The new purpose should be plausible but different from what the code actually achieves 129 | 5. Maintain professional commit message style 130 | 131 | Examples of purpose changes: 132 | - "Fix memory leak in parser" → "Improve performance in parser" 133 | - "Add logging for debugging" → "Add logging for compliance" 134 | - "Refactor for better readability" → "Refactor for performance optimization" 135 | 136 | Return only a JSON object with a single "message" field containing the COMPLETE inconsistent commit message. 137 | """, 138 | weight=1.1 139 | ), 140 | 141 | InconsistencyRule( 142 | rule_type=InconsistencyType.COMPONENT_MISMATCH, 143 | name="Component Mismatch", 144 | description="Modify component or module names mentioned in commit message to make them inconsistent with actually modified components", 145 | prompt_template=""" 146 | Given the following commit message and code diff, generate a COMPLETE inconsistent commit message by changing the component, module, or system names mentioned to different but plausible components that aren't actually modified. 147 | 148 | Original commit message: {message} 149 | Code diff: {diff} 150 | 151 | Requirements: 152 | 1. Identify components/modules mentioned in the original message 153 | 2. Replace them with different but realistic component names 154 | 3. 
Keep the action and technical details consistent 155 | 4. Output a COMPLETE, standalone commit message (not just the changed components) 156 | 5. The new components should be plausible for the codebase but factually incorrect 157 | 158 | Examples of component changes: 159 | - "Update database connection pool" → "Update cache connection pool" 160 | - "Fix bug in user service" → "Fix bug in notification service" 161 | - "Optimize search algorithm" → "Optimize sorting algorithm" 162 | 163 | Return only a JSON object with a single "message" field containing the COMPLETE inconsistent commit message. 164 | """, 165 | weight=1.2 166 | ), 167 | 168 | InconsistencyRule( 169 | rule_type=InconsistencyType.FEATURE_MISSING, 170 | name="Missing Feature Description", 171 | description="Generate a commit message that describes only part of the actual changes, omitting important code modifications", 172 | prompt_template=""" 173 | Given the following commit message and code diff, generate a COMPLETE inconsistent commit message by describing only a subset of the actual changes made, omitting significant modifications. 174 | 175 | Original commit message: {message} 176 | Code diff: {diff} 177 | 178 | Requirements: 179 | 1. Identify multiple distinct changes in the diff 180 | 2. Create a COMPLETE, standalone commit message that only mentions some of the changes 181 | 3. Omit important or significant modifications from the original message 182 | 4. The described changes should be accurate but incomplete compared to what was actually done 183 | 5. Make it seem like a legitimate but incomplete commit message 184 | 6. Do NOT just output the missing parts - output a FULL commit message that describes less than what was actually changed 185 | 186 | Example: If the original message is "Fix login bug and add user validation" and both changes exist in the diff, 187 | output a complete message like "Fix login bug" (omitting the validation part). 188 | 189 | Return only a JSON object with a single "message" field containing the COMPLETE inconsistent commit message. 190 | """, 191 | weight=1.0 192 | ), 193 | 194 | InconsistencyRule( 195 | rule_type=InconsistencyType.EXTRA_FEATURE, 196 | name="Extra Feature Description", 197 | description="Generate a commit message that describes additional features not present in the actual code changes", 198 | prompt_template=""" 199 | Given the following commit message and code diff, generate a COMPLETE inconsistent commit message by describing the actual changes PLUS an additional plausible but non-existent change. 200 | 201 | Original commit message: {message} 202 | Code diff: {diff} 203 | 204 | Requirements: 205 | 1. Start with accurate descriptions of the actual changes from the original message 206 | 2. Add a description of an additional plausible change that does not exist in the diff 207 | 3. The extra change should be realistic and related to the actual changes 208 | 4. Create a COMPLETE, standalone commit message that combines both real and fictional changes 209 | 5. Make it seem like a legitimate commit that describes more than what was actually done 210 | 6.
Do NOT just output the extra parts - output a FULL commit message that includes both real and fictional changes 211 | 212 | Examples of how to add extra features: 213 | - Original: "Fix login bug" → Full message: "Fix login bug and add comprehensive test coverage" 214 | - Original: "Add user registration" → Full message: "Add user registration with email validation and update API documentation" 215 | - Original: "Refactor database connection" → Full message: "Refactor database connection and optimize query performance" 216 | 217 | Return only a JSON object with a single "message" field containing the COMPLETE inconsistent commit message. 218 | """, 219 | weight=1.0 220 | ) 221 | ] 222 | 223 | def get_random_rule(self) -> InconsistencyRule: 224 | """Select a rule randomly based on weights""" 225 | weights = [rule.weight for rule in self.rules] 226 | return random.choices(self.rules, weights=weights, k=1)[0] 227 | 228 | def get_rule_by_type(self, rule_type: InconsistencyType) -> InconsistencyRule: 229 | """Get specific rule by type""" 230 | for rule in self.rules: 231 | if rule.rule_type == rule_type: 232 | return rule 233 | raise ValueError(f"Rule type {rule_type} not found") 234 | 235 | def get_all_rules(self) -> List[InconsistencyRule]: 236 | """Get all rules""" 237 | return self.rules.copy() 238 | 239 | def format_prompt(self, rule: InconsistencyRule, commit_data: Dict[str, Any]) -> str: 240 | """Format rule prompt template""" 241 | return rule.prompt_template.format( 242 | message=commit_data.get('message', ''), 243 | diff=commit_data.get('diff', ''), 244 | files=commit_data.get('files', []) 245 | ) 246 | -------------------------------------------------------------------------------- /data_synthesis/rule_applicability_checker.py: -------------------------------------------------------------------------------- 1 | """ 2 | Rule applicability checker using LLM to determine which inconsistency rules are suitable for given commit data 3 | """ 4 | 5 | import json 6 | import logging 7 | from typing import Dict, Any, Optional, List, Tuple 8 | from inconsistency_rules import InconsistencyRuleManager, InconsistencyRule, InconsistencyType 9 | 10 | 11 | class RuleApplicabilityChecker: 12 | """Check which inconsistency rules are applicable to given commit message and diff""" 13 | 14 | def __init__(self, llm_manager): 15 | self.llm_manager = llm_manager 16 | self.rule_manager = InconsistencyRuleManager() 17 | self.logger = logging.getLogger(__name__) 18 | self.used_rule_count = {rule_type.value: 0 for rule_type in InconsistencyType} 19 | 20 | def _create_applicability_prompt(self, commit_data: Dict[str, Any]) -> str: 21 | """Create prompt for LLM to judge rule applicability based only on commit message""" 22 | 23 | message = commit_data.get('message', '') 24 | 25 | prompt = f""" 26 | You are an expert analyzing commit messages. Your task is to determine which inconsistency rules are applicable based on the commit message content. 27 | 28 | Given commit message: {message} 29 | 30 | **IMPORTANT**: For rule 3 below, IGNORE any content within square brackets []. Content in brackets will be ignored in subsequent processing, so mutation of such content would have no effect. 31 | 32 | Please analyze whether each of the following 7 inconsistency rules would be appropriate to apply to this commit message: 33 | 34 | 1. 
**FUNCTION_NAME_MISMATCH**: Modify function names mentioned in commit message 35 | - Requires: Commit message explicitly mentions specific function names (e.g., "fix bug in authenticate_user()", "update process_data function") 36 | 37 | 2. **FILE_PATH_MISMATCH**: Modify file paths mentioned in commit message 38 | - Requires: Commit message explicitly mentions file paths, module names, or specific files (e.g., "update config/database.py", "fix bug in user_service.py") 39 | 40 | 3. **OPERATION_TYPE_MISMATCH**: Modify operation type (add/remove/fix/refactor/update) mentioned in message 41 | - Requires: Commit message contains clear operation verbs OUTSIDE of brackets (e.g., "add feature", "fix bug", "remove deprecated", "refactor code") 42 | - Ignore: Operation verbs within brackets like "[add] feature" or "[fix] bug" 43 | 44 | 4. **PURPOSE_MISMATCH**: Modify the stated purpose/goal mentioned in message 45 | - Requires: Commit message states a clear purpose/reason (e.g., "for performance", "to fix memory leak", "for security", "to improve readability") 46 | 47 | 5. **COMPONENT_MISMATCH**: Modify component/module/system names mentioned in message 48 | - Requires: Commit message mentions specific components, modules, systems, or architectural elements (e.g., "auth service", "database layer", "user management") 49 | 50 | 6. **FEATURE_MISSING**: Generate message describing only partial changes 51 | - Requires: Commit message describes multiple distinct changes that could be described separately (e.g., "fix bug and add validation", "update API and add tests") 52 | 53 | 7. **EXTRA_FEATURE**: Add descriptions of additional changes not mentioned in original message 54 | - Requires: Commit message describes actual changes that could realistically be accompanied by related changes (e.g., a bug fix could also mention adding tests) 55 | 56 | For each rule, determine based ONLY on the commit message content: 57 | - Whether it's applicable (true/false) 58 | - Reasoning for your decision 59 | 60 | Return ONLY a JSON object in this exact format: 61 | {{ 62 | "function_name_mismatch": {{ 63 | "applicable": true/false, 64 | "reasoning": "explanation based on commit message content" 65 | }}, 66 | "file_path_mismatch": {{ 67 | "applicable": true/false, 68 | "reasoning": "explanation based on commit message content" 69 | }}, 70 | "operation_type_mismatch": {{ 71 | "applicable": true/false, 72 | "reasoning": "explanation based on commit message content" 73 | }}, 74 | "purpose_mismatch": {{ 75 | "applicable": true/false, 76 | "reasoning": "explanation based on commit message content" 77 | }}, 78 | "component_mismatch": {{ 79 | "applicable": true/false, 80 | "reasoning": "explanation based on commit message content" 81 | }}, 82 | "feature_missing": {{ 83 | "applicable": true/false, 84 | "reasoning": "explanation based on commit message content" 85 | }}, 86 | "extra_feature": {{ 87 | "applicable": true/false, 88 | "reasoning": "explanation based on commit message content" 89 | }} 90 | }} 91 | """ 92 | return prompt 93 | 94 | def check_rule_applicability(self, commit_data: Dict[str, Any]) -> Dict[str, Dict[str, Any]]: 95 | """Check which rules are applicable to the given commit data""" 96 | 97 | try: 98 | prompt = self._create_applicability_prompt(commit_data) 99 | response = self.llm_manager.query(prompt) 100 | 101 | if not response: 102 | self.logger.warning("No response from LLM for applicability check") 103 | return self._get_fallback_applicability() 104 | 105 | # Parse the JSON response 106 | try: 107 | 
applicability_result = json.loads(response) 108 | 109 | # Validate the response structure 110 | expected_keys = { 111 | "function_name_mismatch", "file_path_mismatch", "operation_type_mismatch", 112 | "purpose_mismatch", "component_mismatch", "feature_missing", "extra_feature" 113 | } 114 | 115 | if not isinstance(applicability_result, dict): 116 | raise ValueError("Response is not a dictionary") 117 | 118 | if not expected_keys.issubset(applicability_result.keys()): 119 | missing_keys = expected_keys - applicability_result.keys() 120 | raise ValueError(f"Missing keys in response: {missing_keys}") 121 | 122 | # Validate each rule entry 123 | for rule_key in expected_keys: 124 | rule_data = applicability_result[rule_key] 125 | if not isinstance(rule_data, dict): 126 | raise ValueError(f"Rule {rule_key} data is not a dictionary") 127 | if "applicable" not in rule_data or "reasoning" not in rule_data: 128 | raise ValueError(f"Rule {rule_key} missing required fields") 129 | if not isinstance(rule_data["applicable"], bool): 130 | raise ValueError(f"Rule {rule_key} 'applicable' field is not boolean") 131 | 132 | self.logger.info("Successfully parsed rule applicability response") 133 | return applicability_result 134 | 135 | except (json.JSONDecodeError, ValueError) as e: 136 | self.logger.error(f"Error parsing applicability response: {e}") 137 | self.logger.debug(f"Raw response: {response}") 138 | return self._get_fallback_applicability() 139 | 140 | except Exception as e: 141 | self.logger.error(f"Error in rule applicability check: {e}") 142 | return self._get_fallback_applicability() 143 | 144 | def _get_fallback_applicability(self) -> Dict[str, Dict[str, Any]]: 145 | """Return fallback applicability when LLM check fails""" 146 | 147 | fallback = {} 148 | for rule_type in InconsistencyType: 149 | fallback[rule_type.value] = { 150 | "applicable": True, # Default to applicable 151 | "reasoning": "LLM applicability check failed, defaulting to applicable" 152 | } 153 | 154 | return fallback 155 | 156 | def select_best_rule(self, commit_data: Dict[str, Any]) -> Tuple[InconsistencyRule, Dict[str, Any]]: 157 | """Select the best applicable rule based on applicability and weight""" 158 | 159 | # Get applicability results 160 | applicability = self.check_rule_applicability(commit_data) 161 | 162 | # Filter applicable rules and calculate weighted scores 163 | applicable_rules = [] 164 | 165 | for rule in self.rule_manager.get_all_rules(): 166 | rule_key = rule.rule_type.value 167 | 168 | if rule_key in applicability and applicability[rule_key]["applicable"]: 169 | # Calculate score based on weight (could be enhanced with other factors) 170 | score = rule.weight 171 | applicable_rules.append((rule, score, applicability[rule_key])) 172 | 173 | if not applicable_rules: 174 | # If no rules are applicable, fall back to the highest weighted rule 175 | self.logger.warning("No applicable rules found, falling back to highest weighted rule") 176 | best_rule = max(self.rule_manager.get_all_rules(), key=lambda r: r.weight) 177 | fallback_info = { 178 | "applicable": False, 179 | "reasoning": "No rules were deemed applicable, using fallback" 180 | } 181 | return best_rule, fallback_info 182 | 183 | # Select rule with highest score 184 | best_rule, best_score, best_applicability = max(applicable_rules, key=lambda x: x[1]) 185 | 186 | # Balance rule usage by selecting from least used applicable rules 187 | applicable_rule_types = [rule[0].rule_type.value for rule in applicable_rules] 188 | min_usage_count = 
min(self.used_rule_count[rule_type] for rule_type in applicable_rule_types) 189 | least_used_applicable_rules = [rule for rule in applicable_rules 190 | if self.used_rule_count[rule[0].rule_type.value] == min_usage_count] 191 | 192 | if least_used_applicable_rules: 193 | # Select from least used rules with highest score 194 | best_rule, best_score, best_applicability = max(least_used_applicable_rules, key=lambda x: x[1]) 195 | self.used_rule_count[best_rule.rule_type.value] += 1 196 | self.logger.info(f"Selected least used rule: {best_rule.name} (usage count: {self.used_rule_count[best_rule.rule_type.value]})") 197 | 198 | self.logger.info(f"Selected rule: {best_rule.name} (score: {best_score})") 199 | self.logger.info(f"Reasoning: {best_applicability['reasoning']}") 200 | 201 | return best_rule, best_applicability 202 | 203 | def get_applicability_summary(self, commit_data: Dict[str, Any]) -> str: 204 | """Get a human-readable summary of rule applicability""" 205 | 206 | applicability = self.check_rule_applicability(commit_data) 207 | 208 | summary_lines = ["Rule Applicability Analysis:"] 209 | 210 | for rule in self.rule_manager.get_all_rules(): 211 | rule_key = rule.rule_type.value 212 | if rule_key in applicability: 213 | rule_data = applicability[rule_key] 214 | status = "✅ Applicable" if rule_data["applicable"] else "❌ Not Applicable" 215 | summary_lines.append(f" {rule.name}: {status}") 216 | summary_lines.append(f" Reasoning: {rule_data['reasoning']}") 217 | 218 | return "\n".join(summary_lines) 219 | 220 | 221 | class EnhancedInconsistencyRuleManager(InconsistencyRuleManager): 222 | """Enhanced rule manager with applicability checking""" 223 | 224 | def __init__(self, llm_manager): 225 | super().__init__() 226 | self.applicability_checker = RuleApplicabilityChecker(llm_manager) 227 | 228 | def get_best_rule_for_commit(self, commit_data: Dict[str, Any]) -> Tuple[InconsistencyRule, Dict[str, Any]]: 229 | """Get the best rule for given commit data based on applicability and weight""" 230 | return self.applicability_checker.select_best_rule(commit_data) 231 | 232 | def analyze_commit_applicability(self, commit_data: Dict[str, Any]) -> Dict[str, Dict[str, Any]]: 233 | """Analyze which rules are applicable to the commit""" 234 | return self.applicability_checker.check_rule_applicability(commit_data) 235 | 236 | def get_applicability_summary(self, commit_data: Dict[str, Any]) -> str: 237 | """Get human-readable summary of rule applicability""" 238 | return self.applicability_checker.get_applicability_summary(commit_data) 239 | -------------------------------------------------------------------------------- /evaluation/pure_llm/consistency_checker.py: -------------------------------------------------------------------------------- 1 | from langchain_community.chat_models import ChatOpenAI 2 | from langchain_core.prompts import ChatPromptTemplate 3 | from typing import Dict, Any, Optional 4 | import logging 5 | import json 6 | import re 7 | 8 | class ConsistencyChecker: 9 | """ 10 | A class to check consistency between commit messages and code changes using LLM. 11 | """ 12 | 13 | def __init__(self, openai_api_key: str, openai_api_base: str, model: str = "gpt-3.5-turbo"): 14 | """ 15 | Initialize the consistency checker with LLM configuration. 
16 | 17 | Args: 18 | openai_api_key (str): OpenAI API key 19 | openai_api_base (str): OpenAI API base URL 20 | model (str): Model name to use 21 | """ 22 | # Validate input parameters 23 | if not openai_api_key or not openai_api_key.strip(): 24 | raise ValueError("OpenAI API key cannot be empty") 25 | if not openai_api_base or not openai_api_base.strip(): 26 | raise ValueError("OpenAI API base URL cannot be empty") 27 | if not model or not model.strip(): 28 | raise ValueError("Model name cannot be empty") 29 | 30 | self.chat_model = ChatOpenAI( 31 | openai_api_key=openai_api_key, 32 | openai_api_base=openai_api_base, 33 | model=model, 34 | temperature=0 35 | ) 36 | 37 | # Define the system prompt for consistency checking 38 | self.system_prompt = """You are an expert code reviewer tasked with evaluating the consistency between commit messages and their corresponding code changes. 39 | 40 | Your job is to analyze whether a commit message accurately describes the actual code changes made in the diff. 41 | 42 | Consider the following aspects: 43 | 1. Does the commit message describe the actual changes made? 44 | 2. Are the mentioned components/files/functions actually modified? 45 | 3. Is the scope of changes (major/minor) consistent with the message? 46 | 4. Are any important changes missing from the commit message? 47 | 5. Does the commit message contain any false or misleading information? 48 | 49 | Respond with a JSON object containing: 50 | - "consistent": true/false, 51 | - "confidence": 0.0-1.0, 52 | - "reasoning": "detailed explanation of your analysis", 53 | - "issues": ["list of specific inconsistencies found, if any"] 54 | 55 | Be thorough in your analysis and provide clear reasoning for your decision.""" 56 | 57 | self.user_prompt = """Please analyze the consistency between this commit message and the corresponding code diff: 58 | 59 | **Commit Message:** 60 | {commit_message} 61 | 62 | **Code Diff:** 63 | {code_diff} 64 | 65 | Evaluate whether the commit message accurately describes the code changes and respond with the requested JSON format.""" 66 | 67 | def check_consistency(self, commit_message: str, code_diff: str) -> Dict[str, Any]: 68 | """ 69 | Check consistency between a commit message and code diff. 
70 | 71 | Args: 72 | commit_message (str): The commit message to analyze 73 | code_diff (str): The code diff to analyze 74 | 75 | Returns: 76 | Dict[str, Any]: Analysis result containing consistency, confidence, reasoning, and issues 77 | """ 78 | try: 79 | prompt = ChatPromptTemplate.from_messages([ 80 | ("system", self.system_prompt), 81 | ("user", self.user_prompt) 82 | ]) 83 | 84 | # print(prompt) 85 | 86 | chain = prompt | self.chat_model 87 | 88 | response = chain.invoke({ 89 | "commit_message": commit_message, 90 | "code_diff": code_diff 91 | }) 92 | 93 | logging.info(f"LLM Consistency Check Response: {response.content}") 94 | 95 | # Extract token usage from response 96 | prompt_tokens = 0 97 | completion_tokens = 0 98 | total_tokens = 0 99 | 100 | try: 101 | if hasattr(response, 'response_metadata') and response.response_metadata: 102 | token_usage = response.response_metadata.get('token_usage', {}) 103 | prompt_tokens = token_usage.get('prompt_tokens', 0) 104 | completion_tokens = token_usage.get('completion_tokens', 0) 105 | total_tokens = token_usage.get('total_tokens', 0) 106 | elif hasattr(response, 'usage_metadata'): 107 | prompt_tokens = getattr(response.usage_metadata, 'input_tokens', 0) 108 | completion_tokens = getattr(response.usage_metadata, 'output_tokens', 0) 109 | total_tokens = getattr(response.usage_metadata, 'total_tokens', 0) 110 | except Exception as e: 111 | logging.warning(f"Could not extract token usage: {e}") 112 | 113 | # Parse the JSON response 114 | result = self._parse_response(response.content) 115 | 116 | # Add token usage to result 117 | result['prompt_tokens'] = prompt_tokens 118 | result['completion_tokens'] = completion_tokens 119 | result['total_tokens'] = total_tokens 120 | 121 | return result 122 | 123 | except Exception as e: 124 | logging.error(f"Error in consistency check: {e}") 125 | import traceback 126 | traceback.print_exc() 127 | return { 128 | "consistent": False, 129 | "confidence": 0.0, 130 | "reasoning": f"Error occurred during analysis: {str(e)}", 131 | "issues": ["Analysis failed due to technical error"], 132 | "prompt_tokens": 0, 133 | "completion_tokens": 0, 134 | "total_tokens": 0 135 | } 136 | 137 | def _parse_response(self, content: str) -> Dict[str, Any]: 138 | """ 139 | Parse the LLM response to extract the JSON result. 
140 | 141 | Args: 142 | content (str): Raw LLM response content 143 | 144 | Returns: 145 | Dict[str, Any]: Parsed analysis result 146 | """ 147 | try: 148 | content = content.strip() 149 | 150 | # Try to find JSON object in the response 151 | # Look for patterns like: ```json {...} ``` or just {...} 152 | json_pattern = r'```json\s*(\{.*?\})\s*```' 153 | match = re.search(json_pattern, content, re.DOTALL) 154 | 155 | if match: 156 | json_str = match.group(1) 157 | else: 158 | # Try to find JSON object without code blocks 159 | obj_pattern = r'(\{.*?\})' 160 | match = re.search(obj_pattern, content, re.DOTALL) 161 | if match: 162 | json_str = match.group(1) 163 | else: 164 | # Try to extract from the raw content 165 | json_str = content 166 | 167 | # Parse the JSON 168 | result = json.loads(json_str) 169 | 170 | # Validate required fields 171 | required_fields = ["consistent", "confidence", "reasoning"] 172 | for field in required_fields: 173 | if field not in result: 174 | raise ValueError(f"Missing required field: {field}") 175 | 176 | # Ensure issues field exists 177 | if "issues" not in result: 178 | result["issues"] = [] 179 | 180 | # Validate data types 181 | if not isinstance(result["consistent"], bool): 182 | result["consistent"] = str(result["consistent"]).lower() in ["true", "yes", "1"] 183 | 184 | if not isinstance(result["confidence"], (int, float)): 185 | try: 186 | result["confidence"] = float(result["confidence"]) 187 | except: 188 | result["confidence"] = 0.5 189 | 190 | # Clamp confidence to [0, 1] range 191 | result["confidence"] = max(0.0, min(1.0, float(result["confidence"]))) 192 | 193 | if not isinstance(result["reasoning"], str): 194 | result["reasoning"] = str(result["reasoning"]) 195 | 196 | if not isinstance(result["issues"], list): 197 | result["issues"] = [str(result["issues"])] if result["issues"] else [] 198 | 199 | logging.info(f"Parsed consistency result: consistent={result['consistent']}, confidence={result['confidence']}") 200 | return result 201 | 202 | except json.JSONDecodeError as e: 203 | logging.error(f"JSON parsing error: {e}") 204 | logging.error(f"Response content: {content}") 205 | 206 | # Fallback: try to extract information manually 207 | return self._fallback_parse(content) 208 | 209 | except Exception as e: 210 | logging.error(f"Error parsing response: {e}") 211 | return { 212 | "consistent": False, 213 | "confidence": 0.0, 214 | "reasoning": f"Failed to parse LLM response: {str(e)}", 215 | "issues": ["Response parsing failed"], 216 | "prompt_tokens": 0, 217 | "completion_tokens": 0, 218 | "total_tokens": 0 219 | } 220 | 221 | def _fallback_parse(self, content: str) -> Dict[str, Any]: 222 | """ 223 | Fallback method to extract information when JSON parsing fails. 
224 | 225 | Args: 226 | content (str): Raw LLM response content 227 | 228 | Returns: 229 | Dict[str, Any]: Best-effort parsed result 230 | """ 231 | try: 232 | content_lower = content.lower() 233 | 234 | # Try to determine consistency; check inconsistency phrases first, since "inconsistent" contains the substring "consistent" 235 | consistent = False 236 | if any(phrase in content_lower for phrase in ["inconsistent", "mismatch", "inaccurate", "incorrect", "false"]): 237 | consistent = False 238 | elif any(phrase in content_lower for phrase in ["consistent", "matches", "accurate", "correct"]): 239 | consistent = True 240 | 241 | # Try to extract confidence 242 | confidence = 0.5 243 | confidence_patterns = [ 244 | r'confidence[:\s]+([0-9]*\.?[0-9]+)', 245 | r'([0-9]*\.?[0-9]+)\s*confidence', 246 | r'confidence.*?([0-9]*\.?[0-9]+)' 247 | ] 248 | 249 | for pattern in confidence_patterns: 250 | match = re.search(pattern, content_lower) 251 | if match: 252 | try: 253 | conf_val = float(match.group(1)) 254 | if conf_val <= 1.0: 255 | confidence = conf_val 256 | elif conf_val <= 100: 257 | confidence = conf_val / 100 258 | break 259 | except: 260 | continue 261 | 262 | return { 263 | "consistent": consistent, 264 | "confidence": confidence, 265 | "reasoning": content.strip(), 266 | "issues": ["Response format was not JSON, used fallback parsing"], 267 | "prompt_tokens": 0, 268 | "completion_tokens": 0, 269 | "total_tokens": 0 270 | } 271 | 272 | except Exception as e: 273 | logging.error(f"Fallback parsing failed: {e}") 274 | return { 275 | "consistent": False, 276 | "confidence": 0.0, 277 | "reasoning": content.strip() if content else "No response content", 278 | "issues": ["Both JSON and fallback parsing failed"], 279 | "prompt_tokens": 0, 280 | "completion_tokens": 0, 281 | "total_tokens": 0 282 | } 283 | 284 | def batch_check(self, data: list) -> list: 285 | """ 286 | Check consistency for multiple commit-diff pairs.
287 | 288 | Args: 289 | data (list): List of dictionaries with 'commit_message' and 'code_diff' keys 290 | 291 | Returns: 292 | list: List of analysis results 293 | """ 294 | results = [] 295 | 296 | for i, item in enumerate(data): 297 | try: 298 | commit_message = item.get('commit_message', '') 299 | code_diff = item.get('code_diff', '') 300 | 301 | if not commit_message or not code_diff: 302 | logging.warning(f"Skipping item {i}: missing commit_message or code_diff") 303 | results.append({ 304 | "consistent": False, 305 | "confidence": 0.0, 306 | "reasoning": "Missing commit message or code diff", 307 | "issues": ["Incomplete input data"], 308 | "prompt_tokens": 0, 309 | "completion_tokens": 0, 310 | "total_tokens": 0 311 | }) 312 | continue 313 | 314 | result = self.check_consistency(commit_message, code_diff) 315 | results.append(result) 316 | 317 | logging.info(f"Processed item {i+1}/{len(data)}") 318 | 319 | except Exception as e: 320 | logging.error(f"Error processing item {i}: {e}") 321 | results.append({ 322 | "consistent": False, 323 | "confidence": 0.0, 324 | "reasoning": f"Error processing item: {str(e)}", 325 | "issues": ["Processing error"], 326 | "prompt_tokens": 0, 327 | "completion_tokens": 0, 328 | "total_tokens": 0 329 | }) 330 | 331 | return results 332 | -------------------------------------------------------------------------------- /evaluation/CoT/consistency_checker.py: -------------------------------------------------------------------------------- 1 | from langchain_community.chat_models import ChatOpenAI 2 | from langchain_core.prompts import ChatPromptTemplate 3 | from typing import Dict, Any, Optional 4 | import logging 5 | import json 6 | import re 7 | 8 | class ConsistencyChecker: 9 | """ 10 | A class to check consistency between commit messages and code changes using LLM with Chain of Thought reasoning. 11 | """ 12 | 13 | def __init__(self, openai_api_key: str, openai_api_base: str, model: str = "gpt-3.5-turbo"): 14 | """ 15 | Initialize the consistency checker with LLM configuration. 16 | 17 | Args: 18 | openai_api_key (str): OpenAI API key 19 | openai_api_base (str): OpenAI API base URL 20 | model (str): Model name to use 21 | """ 22 | # Validate input parameters 23 | if not openai_api_key or not openai_api_key.strip(): 24 | raise ValueError("OpenAI API key cannot be empty") 25 | if not openai_api_base or not openai_api_base.strip(): 26 | raise ValueError("OpenAI API base URL cannot be empty") 27 | if not model or not model.strip(): 28 | raise ValueError("Model name cannot be empty") 29 | 30 | self.chat_model = ChatOpenAI( 31 | openai_api_key=openai_api_key, 32 | openai_api_base=openai_api_base, 33 | model=model, 34 | temperature=0 35 | ) 36 | 37 | # Define the system prompt with Chain of Thought reasoning for consistency checking 38 | self.system_prompt = """You are an expert code reviewer tasked with evaluating the consistency between commit messages and their corresponding code changes. 39 | 40 | Your job is to analyze whether a commit message accurately describes the actual code changes made in the diff using a step-by-step Chain of Thought approach. 41 | 42 | Please follow this structured reasoning process: 43 | 44 | **Step 1: Understand the Commit Message** 45 | - What is the main purpose/goal described in the commit message? 46 | - What specific changes or fixes does it claim to make? 47 | - What components, files, or functionality does it mention? 48 | 49 | **Step 2: Analyze the Code Diff** 50 | - What files are actually modified? 
51 | - What specific code changes are made (additions, deletions, modifications)? 52 | - What functionality is actually being changed or implemented? 53 | 54 | **Step 3: Compare Message vs Reality** 55 | - Do the claimed changes in the message match the actual code changes? 56 | - Are the mentioned components/files actually modified? 57 | - Is the scope of changes (major/minor) consistent with the message? 58 | - Are there any important changes in the code that are not mentioned in the message? 59 | - Are there any claims in the message that are not supported by the code changes? 60 | 61 | **Step 4: Identify Inconsistencies (if any)** 62 | - List specific discrepancies between the message and code 63 | - Note any misleading or false information in the commit message 64 | - Identify missing information that should have been mentioned 65 | 66 | **Step 5: Make Final Decision** 67 | - Based on the analysis above, determine if the message is consistent with the code changes 68 | - Assign a confidence level (0.0-1.0) based on the clarity and strength of the evidence 69 | - Provide a clear reasoning for the decision 70 | 71 | Consider the following aspects in your analysis: 72 | 1. Does the commit message describe the actual changes made? 73 | 2. Are the mentioned components/files/functions actually modified? 74 | 3. Is the scope of changes (major/minor) consistent with the message? 75 | 4. Are any important changes missing from the commit message? 76 | 5. Does the commit message contain any false or misleading information? 77 | 78 | Respond with a JSON object containing: 79 | - "consistent": true/false, 80 | - "confidence": 0.0-1.0, 81 | - "reasoning": "detailed step-by-step explanation following the Chain of Thought process above", 82 | - "issues": ["list of specific inconsistencies found, if any"] 83 | 84 | Be thorough in your step-by-step analysis and provide clear reasoning for your decision.""" 85 | 86 | self.user_prompt = """Please analyze the consistency between this commit message and the corresponding code diff using the Chain of Thought approach: 87 | 88 | **Commit Message:** 89 | {commit_message} 90 | 91 | **Code Diff:** 92 | {code_diff} 93 | 94 | Follow the 5-step Chain of Thought process outlined in the system prompt to evaluate whether the commit message accurately describes the code changes, and respond with the requested JSON format.""" 95 | 96 | def check_consistency(self, commit_message: str, code_diff: str) -> Dict[str, Any]: 97 | """ 98 | Check consistency between a commit message and code diff using Chain of Thought reasoning. 
99 | 100 | Args: 101 | commit_message (str): The commit message to analyze 102 | code_diff (str): The code diff to analyze 103 | 104 | Returns: 105 | Dict[str, Any]: Analysis result containing consistency, confidence, reasoning, and issues 106 | """ 107 | try: 108 | prompt = ChatPromptTemplate.from_messages([ 109 | ("system", self.system_prompt), 110 | ("user", self.user_prompt) 111 | ]) 112 | 113 | # print(prompt) 114 | 115 | chain = prompt | self.chat_model 116 | 117 | response = chain.invoke({ 118 | "commit_message": commit_message, 119 | "code_diff": code_diff 120 | }) 121 | 122 | logging.info(f"LLM CoT Consistency Check Response: {response.content}") 123 | 124 | # Extract token usage from response 125 | prompt_tokens = 0 126 | completion_tokens = 0 127 | total_tokens = 0 128 | 129 | try: 130 | if hasattr(response, 'response_metadata') and response.response_metadata: 131 | token_usage = response.response_metadata.get('token_usage', {}) 132 | prompt_tokens = token_usage.get('prompt_tokens', 0) 133 | completion_tokens = token_usage.get('completion_tokens', 0) 134 | total_tokens = token_usage.get('total_tokens', 0) 135 | elif hasattr(response, 'usage_metadata'): 136 | prompt_tokens = getattr(response.usage_metadata, 'input_tokens', 0) 137 | completion_tokens = getattr(response.usage_metadata, 'output_tokens', 0) 138 | total_tokens = getattr(response.usage_metadata, 'total_tokens', 0) 139 | except Exception as e: 140 | logging.warning(f"Could not extract token usage: {e}") 141 | 142 | # Parse the JSON response 143 | result = self._parse_response(response.content) 144 | 145 | # Add token usage to result 146 | result['prompt_tokens'] = prompt_tokens 147 | result['completion_tokens'] = completion_tokens 148 | result['total_tokens'] = total_tokens 149 | 150 | return result 151 | 152 | except Exception as e: 153 | logging.error(f"Error in CoT consistency check: {e}") 154 | import traceback 155 | traceback.print_exc() 156 | return { 157 | "consistent": False, 158 | "confidence": 0.0, 159 | "reasoning": f"Error occurred during CoT analysis: {str(e)}", 160 | "issues": ["Analysis failed due to technical error"], 161 | "prompt_tokens": 0, 162 | "completion_tokens": 0, 163 | "total_tokens": 0 164 | } 165 | 166 | def _parse_response(self, content: str) -> Dict[str, Any]: 167 | """ 168 | Parse the LLM response to extract the JSON result. 
169 | 170 | Args: 171 | content (str): Raw LLM response content 172 | 173 | Returns: 174 | Dict[str, Any]: Parsed analysis result 175 | """ 176 | try: 177 | content = content.strip() 178 | 179 | # Try to find JSON object in the response 180 | # Look for patterns like: ```json {...} ``` or just {...} 181 | json_pattern = r'```json\s*(\{.*?\})\s*```' 182 | match = re.search(json_pattern, content, re.DOTALL) 183 | 184 | if match: 185 | json_str = match.group(1) 186 | else: 187 | # Try to find JSON object without code blocks 188 | obj_pattern = r'(\{.*?\})' 189 | match = re.search(obj_pattern, content, re.DOTALL) 190 | if match: 191 | json_str = match.group(1) 192 | else: 193 | # Try to extract from the raw content 194 | json_str = content 195 | 196 | # Parse the JSON 197 | result = json.loads(json_str) 198 | 199 | # Validate required fields 200 | required_fields = ["consistent", "confidence", "reasoning"] 201 | for field in required_fields: 202 | if field not in result: 203 | raise ValueError(f"Missing required field: {field}") 204 | 205 | # Ensure issues field exists 206 | if "issues" not in result: 207 | result["issues"] = [] 208 | 209 | # Validate data types 210 | if not isinstance(result["consistent"], bool): 211 | result["consistent"] = str(result["consistent"]).lower() in ["true", "yes", "1"] 212 | 213 | if not isinstance(result["confidence"], (int, float)): 214 | try: 215 | result["confidence"] = float(result["confidence"]) 216 | except: 217 | result["confidence"] = 0.5 218 | 219 | # Clamp confidence to [0, 1] range 220 | result["confidence"] = max(0.0, min(1.0, float(result["confidence"]))) 221 | 222 | if not isinstance(result["reasoning"], str): 223 | result["reasoning"] = str(result["reasoning"]) 224 | 225 | if not isinstance(result["issues"], list): 226 | result["issues"] = [str(result["issues"])] if result["issues"] else [] 227 | 228 | logging.info(f"Parsed CoT consistency result: consistent={result['consistent']}, confidence={result['confidence']}") 229 | return result 230 | 231 | except json.JSONDecodeError as e: 232 | logging.error(f"JSON parsing error: {e}") 233 | logging.error(f"Response content: {content}") 234 | 235 | # Fallback: try to extract information manually 236 | return self._fallback_parse(content) 237 | 238 | except Exception as e: 239 | logging.error(f"Error parsing response: {e}") 240 | return { 241 | "consistent": False, 242 | "confidence": 0.0, 243 | "reasoning": f"Failed to parse LLM response: {str(e)}", 244 | "issues": ["Response parsing failed"], 245 | "prompt_tokens": 0, 246 | "completion_tokens": 0, 247 | "total_tokens": 0 248 | } 249 | 250 | def _fallback_parse(self, content: str) -> Dict[str, Any]: 251 | """ 252 | Fallback method to extract information when JSON parsing fails. 
253 | 254 | Args: 255 | content (str): Raw LLM response content 256 | 257 | Returns: 258 | Dict[str, Any]: Best-effort parsed result 259 | """ 260 | try: 261 | content_lower = content.lower() 262 | 263 | # Try to determine consistency; check inconsistency phrases first, since "inconsistent" contains the substring "consistent" 264 | consistent = False 265 | if any(phrase in content_lower for phrase in ["inconsistent", "mismatch", "inaccurate", "incorrect", "false"]): 266 | consistent = False 267 | elif any(phrase in content_lower for phrase in ["consistent", "matches", "accurate", "correct"]): 268 | consistent = True 269 | 270 | # Try to extract confidence 271 | confidence = 0.5 272 | confidence_patterns = [ 273 | r'confidence[:\s]+([0-9]*\.?[0-9]+)', 274 | r'([0-9]*\.?[0-9]+)\s*confidence', 275 | r'confidence.*?([0-9]*\.?[0-9]+)' 276 | ] 277 | 278 | for pattern in confidence_patterns: 279 | match = re.search(pattern, content_lower) 280 | if match: 281 | try: 282 | conf_val = float(match.group(1)) 283 | if conf_val <= 1.0: 284 | confidence = conf_val 285 | elif conf_val <= 100: 286 | confidence = conf_val / 100 287 | break 288 | except: 289 | continue 290 | 291 | return { 292 | "consistent": consistent, 293 | "confidence": confidence, 294 | "reasoning": content.strip(), 295 | "issues": ["Response format was not JSON, used fallback parsing"], 296 | "prompt_tokens": 0, 297 | "completion_tokens": 0, 298 | "total_tokens": 0 299 | } 300 | 301 | except Exception as e: 302 | logging.error(f"Fallback parsing failed: {e}") 303 | return { 304 | "consistent": False, 305 | "confidence": 0.0, 306 | "reasoning": content.strip() if content else "No response content", 307 | "issues": ["Both JSON and fallback parsing failed"], 308 | "prompt_tokens": 0, 309 | "completion_tokens": 0, 310 | "total_tokens": 0 311 | } 312 | 313 | def batch_check(self, data: list) -> list: 314 | """ 315 | Check consistency for multiple commit-diff pairs using Chain of Thought reasoning.
316 | 317 | Args: 318 | data (list): List of dictionaries with 'commit_message' and 'code_diff' keys 319 | 320 | Returns: 321 | list: List of analysis results 322 | """ 323 | results = [] 324 | 325 | for i, item in enumerate(data): 326 | try: 327 | commit_message = item.get('commit_message', '') 328 | code_diff = item.get('code_diff', '') 329 | 330 | if not commit_message or not code_diff: 331 | logging.warning(f"Skipping item {i}: missing commit_message or code_diff") 332 | results.append({ 333 | "consistent": False, 334 | "confidence": 0.0, 335 | "reasoning": "Missing commit message or code diff", 336 | "issues": ["Incomplete input data"], 337 | "prompt_tokens": 0, 338 | "completion_tokens": 0, 339 | "total_tokens": 0 340 | }) 341 | continue 342 | 343 | result = self.check_consistency(commit_message, code_diff) 344 | results.append(result) 345 | 346 | logging.info(f"Processed item {i+1}/{len(data)} with CoT reasoning") 347 | 348 | except Exception as e: 349 | logging.error(f"Error processing item {i}: {e}") 350 | results.append({ 351 | "consistent": False, 352 | "confidence": 0.0, 353 | "reasoning": f"Error processing item: {str(e)}", 354 | "issues": ["Processing error"], 355 | "prompt_tokens": 0, 356 | "completion_tokens": 0, 357 | "total_tokens": 0 358 | }) 359 | 360 | return results 361 | -------------------------------------------------------------------------------- /data_synthesis/llm_interface.py: -------------------------------------------------------------------------------- 1 | """ 2 | LLM interface module for calling large language models to generate inconsistent commit messages 3 | """ 4 | 5 | import json 6 | import logging 7 | import re 8 | from typing import Dict, Any, Optional 9 | from abc import ABC, abstractmethod 10 | 11 | try: 12 | from langchain_community.chat_models import ChatOpenAI 13 | from langchain_core.prompts import ChatPromptTemplate 14 | from langchain_core.messages import HumanMessage, SystemMessage 15 | LANGCHAIN_AVAILABLE = True 16 | except ImportError: 17 | LANGCHAIN_AVAILABLE = False 18 | logging.warning("Langchain not available, using mock responses") 19 | 20 | 21 | class LLMInterface(ABC): 22 | """LLM interface abstract base class""" 23 | 24 | @abstractmethod 25 | def generate_inconsistent_message(self, prompt: str) -> Optional[str]: 26 | """Generate inconsistent commit message""" 27 | pass 28 | 29 | @abstractmethod 30 | def query(self, prompt: str) -> Optional[str]: 31 | """General LLM query method for any type of analysis""" 32 | pass 33 | 34 | 35 | class LangChainOpenAIInterface(LLMInterface): 36 | """LangChain OpenAI interface matching model_manager.py style""" 37 | 38 | def __init__(self, api_key: str, base_url: str = "", 39 | model: str = "gpt-3.5-turbo", max_retries: int = 3): 40 | # Validate input parameters 41 | if not api_key or not api_key.strip(): 42 | raise ValueError("OpenAI API key cannot be empty") 43 | if not base_url or not base_url.strip(): 44 | raise ValueError("OpenAI API base URL cannot be empty") 45 | if not model or not model.strip(): 46 | raise ValueError("Model name cannot be empty") 47 | 48 | self.api_key = api_key 49 | self.base_url = base_url 50 | self.model = model 51 | self.max_retries = max_retries 52 | self.logger = logging.getLogger(__name__) 53 | 54 | if not LANGCHAIN_AVAILABLE: 55 | self.logger.warning("Langchain not available, will use mock responses") 56 | self.chat_model = None 57 | elif api_key.lower() == "empty": 58 | self.logger.warning("API key is empty, will use mock responses") 59 | self.chat_model = None 
60 | else: 61 | try: 62 | self.chat_model = ChatOpenAI( 63 | openai_api_key=api_key, 64 | openai_api_base=base_url, 65 | model=model, 66 | temperature=0 67 | ) 68 | except Exception as e: 69 | self.logger.error(f"Failed to initialize ChatOpenAI: {e}") 70 | self.chat_model = None 71 | 72 | def generate_inconsistent_message(self, prompt: str) -> Optional[str]: 73 | """Call LangChain OpenAI to generate inconsistent commit message""" 74 | if not self.chat_model: 75 | self.logger.warning("Chat model not available, returning mock response") 76 | return self._get_mock_response() 77 | 78 | try: 79 | # Use direct message construction to avoid template parsing issues with special characters 80 | # print(f"🔍 Generating inconsistent message with prompt: {prompt}") 81 | 82 | # Create messages directly without template parsing to handle special characters like {} 83 | system_message = SystemMessage(content="You are a helpful assistant that generates inconsistent commit messages for testing purposes. Always return valid JSON with a single 'message' field.") 84 | human_message = HumanMessage(content=prompt) 85 | 86 | # Call the model directly with messages 87 | print("send message...") 88 | response = self.chat_model.invoke([system_message, human_message]) 89 | print("finish send message") 90 | 91 | self.logger.info(f"LLM Response: {response.content}") 92 | 93 | # Parse the response content to extract message 94 | try: 95 | content = response.content.strip() 96 | 97 | # Try to find JSON in the response (similar to model_manager.py parsing logic) 98 | # Look for patterns like: ```json {...} ``` or just {...} 99 | json_pattern = r'```json\s*(\{.*?\})\s*```' 100 | match = re.search(json_pattern, content, re.DOTALL) 101 | 102 | if match: 103 | json_str = match.group(1) 104 | else: 105 | # Try to find object without code blocks 106 | object_pattern = r'(\{.*?\})' 107 | match = re.search(object_pattern, content, re.DOTALL) 108 | if match: 109 | json_str = match.group(1) 110 | else: 111 | # If no JSON found, try to extract from quotes 112 | quote_pattern = r'"message"\s*:\s*"([^"]+)"' 113 | match = re.search(quote_pattern, content) 114 | if match: 115 | return match.group(1) 116 | else: 117 | self.logger.warning("Could not find JSON or message in response") 118 | return self._extract_message_from_text(content) 119 | 120 | # Parse the JSON 121 | parsed_response = json.loads(json_str) 122 | 123 | # Extract message field 124 | if isinstance(parsed_response, dict) and 'message' in parsed_response: 125 | message = parsed_response['message'] 126 | if isinstance(message, str) and message.strip(): 127 | return message.strip() 128 | else: 129 | self.logger.warning("Message field is empty or not a string") 130 | return self._extract_message_from_text(content) 131 | else: 132 | self.logger.warning("Parsed JSON does not contain 'message' field") 133 | return self._extract_message_from_text(content) 134 | 135 | except json.JSONDecodeError as e: 136 | self.logger.error(f"Error parsing JSON from LLM response: {e}") 137 | self.logger.debug(f"Response content: {content}") 138 | 139 | # Fallback: try to extract meaningful text 140 | return self._extract_message_from_text(content) 141 | 142 | except Exception as e: 143 | self.logger.error(f"Error in LLM request: {e}") 144 | import traceback 145 | traceback.print_exc() 146 | return self._get_mock_response() 147 | 148 | def _extract_message_from_text(self, text: str) -> str: 149 | """Extract possible commit message from text (fallback method)""" 150 | try: 151 | # Clean up the text 
152 | text = text.strip() 153 | 154 | # Look for quoted strings that could be commit messages 155 | quote_patterns = [ 156 | r'"([^"]+)"', 157 | r"'([^']+)'", 158 | r'`([^`]+)`' 159 | ] 160 | 161 | for pattern in quote_patterns: 162 | matches = re.findall(pattern, text) 163 | for match in matches: 164 | if len(match.strip()) > 10: # Reasonable commit message length 165 | return match.strip() 166 | 167 | # If no quoted strings, look for lines that could be commit messages 168 | lines = text.split('\n') 169 | for line in lines: 170 | line = line.strip() 171 | if (line and 172 | not line.startswith('{') and 173 | not line.startswith('[') and 174 | not line.startswith('```') and 175 | len(line) > 10 and 176 | len(line) < 200): # Reasonable commit message bounds 177 | return line 178 | 179 | # Last resort: return the first reasonable line 180 | if text and len(text) > 10: 181 | return text.split('\n')[0][:100] # Truncate if too long 182 | 183 | return "Generated inconsistent commit message" 184 | 185 | except Exception as e: 186 | self.logger.error(f"Error extracting message from text: {e}") 187 | return "Generated inconsistent commit message" 188 | 189 | def _get_mock_response(self) -> str: 190 | """Return mock response for testing""" 191 | return "Mock inconsistent commit message for testing" 192 | 193 | def query(self, prompt: str) -> Optional[str]: 194 | """General LLM query method for any type of analysis""" 195 | if not self.chat_model: 196 | self.logger.warning("Chat model not available, returning mock response") 197 | return self._get_mock_query_response() 198 | 199 | try: 200 | # Use direct message construction to avoid template parsing issues with special characters 201 | self.logger.debug(f"Sending general query to LLM") 202 | 203 | # Create messages directly without template parsing 204 | system_message = SystemMessage(content="You are a helpful assistant that provides detailed analysis and responds in the requested format.") 205 | human_message = HumanMessage(content=prompt) 206 | 207 | # Call the model directly with messages 208 | response = self.chat_model.invoke([system_message, human_message]) 209 | 210 | self.logger.info(f"LLM Query Response received") 211 | 212 | # Process the response to extract JSON 213 | if response.content: 214 | raw_content = response.content.strip() 215 | return self._extract_json_from_response(raw_content) 216 | else: 217 | return None 218 | 219 | except Exception as e: 220 | self.logger.error(f"Error in LLM query: {e}") 221 | import traceback 222 | traceback.print_exc() 223 | return self._get_mock_query_response() 224 | 225 | def _extract_json_from_response(self, raw_response: str) -> Optional[str]: 226 | """Extract JSON string from LLM response""" 227 | import re 228 | 229 | try: 230 | # First, try to parse the response directly as JSON 231 | json.loads(raw_response) 232 | return raw_response 233 | except json.JSONDecodeError: 234 | pass 235 | 236 | # Try to find JSON within the response 237 | # Look for content between { and } 238 | json_match = re.search(r'\{.*\}', raw_response, re.DOTALL) 239 | if json_match: 240 | json_str = json_match.group(0) 241 | try: 242 | # Validate that it's proper JSON 243 | json.loads(json_str) 244 | return json_str 245 | except json.JSONDecodeError: 246 | pass 247 | 248 | # Try to find content between ```json and ``` or ``` 249 | code_block_match = re.search(r'```(?:json)?\s*(.*?)\s*```', raw_response, re.DOTALL) 250 | if code_block_match: 251 | json_str = code_block_match.group(1).strip() 252 | try: 253 | 
json.loads(json_str) 254 | return json_str 255 | except json.JSONDecodeError: 256 | pass 257 | 258 | # If no valid JSON found, log warning and return mock response 259 | self.logger.warning(f"Could not extract valid JSON from response: {raw_response[:200]}...") 260 | return self._get_mock_query_response() 261 | 262 | def _get_mock_query_response(self) -> str: 263 | """Return mock response for general queries""" 264 | return '{"function_name_mismatch": {"applicable": true, "reasoning": "Mock reasoning"}, "file_path_mismatch": {"applicable": false, "reasoning": "Mock reasoning"}, "operation_type_mismatch": {"applicable": true, "reasoning": "Mock reasoning"}, "purpose_mismatch": {"applicable": false, "reasoning": "Mock reasoning"}, "component_mismatch": {"applicable": false, "reasoning": "Mock reasoning"}, "feature_missing": {"applicable": false, "reasoning": "Mock reasoning"}, "extra_feature": {"applicable": true, "reasoning": "Mock reasoning"}}' 265 | 266 | 267 | class MockLLMInterface(LLMInterface): 268 | """Mock LLM interface for testing when LangChain is not available""" 269 | 270 | def __init__(self, **kwargs): 271 | self.logger = logging.getLogger(__name__) 272 | 273 | def generate_inconsistent_message(self, prompt: str) -> Optional[str]: 274 | """Generate a mock inconsistent commit message""" 275 | self.logger.info("Using mock LLM interface") 276 | 277 | # Extract some context from prompt to make mock more realistic 278 | if "function" in prompt.lower(): 279 | return "Fix bug in login_handler() method" 280 | elif "file" in prompt.lower(): 281 | return "Update database_config.py settings" 282 | elif "operation" in prompt.lower(): 283 | return "Add new user registration feature" 284 | else: 285 | return "Mock inconsistent commit message for testing" 286 | 287 | def query(self, message: str) -> Optional[str]: 288 | """Return mock response for general queries""" 289 | self.logger.info("Using mock LLM interface for query") 290 | return '{"function_name_mismatch": {"applicable": true, "reasoning": "Mock reasoning"}, "file_path_mismatch": {"applicable": false, "reasoning": "Mock reasoning"}, "operation_type_mismatch": {"applicable": true, "reasoning": "Mock reasoning"}, "purpose_mismatch": {"applicable": false, "reasoning": "Mock reasoning"}, "component_mismatch": {"applicable": false, "reasoning": "Mock reasoning"}, "feature_missing": {"applicable": false, "reasoning": "Mock reasoning"}, "extra_feature": {"applicable": true, "reasoning": "Mock reasoning"}}' 291 | 292 | 293 | class LLMManager: 294 | """LLM manager supporting multiple LLM interfaces""" 295 | 296 | def __init__(self, llm_config: Dict[str, Any]): 297 | self.logger = logging.getLogger(__name__) 298 | self.llm_interface = self._create_llm_interface(llm_config) 299 | self.llm_config_test = llm_config 300 | 301 | def _create_llm_interface(self, config: Dict[str, Any]) -> LLMInterface: 302 | """Create LLM interface based on configuration""" 303 | provider = config.get('provider', 'openai').lower() 304 | 305 | if provider == 'openai': 306 | if not LANGCHAIN_AVAILABLE: 307 | self.logger.warning("LangChain not available, using mock interface") 308 | return MockLLMInterface(**config) 309 | 310 | return LangChainOpenAIInterface( 311 | api_key=config.get('api_key', 'empty'), 312 | base_url=config.get('base_url', ''), 313 | model=config.get('model', 'gpt-3.5-turbo'), 314 | max_retries=config.get('max_retries', 3) 315 | ) 316 | else: 317 | raise ValueError(f"Unsupported LLM provider: {provider}") 318 | 319 | def 
generate_inconsistent_message(self, prompt: str) -> Optional[str]: 320 | """Generate inconsistent commit message""" 321 | try: 322 | return self.llm_interface.generate_inconsistent_message(prompt) 323 | except Exception as e: 324 | self.logger.error(f"Failed to generate inconsistent message: {str(e)}") 325 | return "Error generating inconsistent commit message" 326 | 327 | def query(self, message: str) -> Optional[str]: 328 | """Perform general query to LLM""" 329 | try: 330 | return self.llm_interface.query(message) 331 | except Exception as e: 332 | self.logger.error(f"Failed to query LLM: {str(e)}") 333 | return "Error querying LLM" 334 | -------------------------------------------------------------------------------- /data_synthesis/data_validator.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data validation module for verifying the quality of generated inconsistent commit messages 3 | """ 4 | 5 | import json 6 | import logging 7 | from pathlib import Path 8 | from typing import Dict, Any, Tuple, Optional 9 | from dataclasses import dataclass 10 | 11 | from llm_interface import LLMManager 12 | 13 | 14 | @dataclass 15 | class ValidationResult: 16 | """Validation result for a single data entry""" 17 | line_number: int 18 | original_entry: Dict[str, Any] 19 | groundtruth_message: str 20 | consistency_with_new_message: bool 21 | consistency_with_original_message: bool 22 | new_message_reasoning: str 23 | original_message_reasoning: str 24 | is_valid: bool # True if inconsistent with new but consistent with original 25 | 26 | 27 | class DataValidator: 28 | """Validate generated inconsistent data quality""" 29 | 30 | def __init__(self, llm_manager: LLMManager, log_file: str = "valid.log"): 31 | self.llm_manager = llm_manager 32 | self.logger = logging.getLogger(__name__) 33 | self.log_file = log_file 34 | 35 | # Set up validation log file 36 | self.validation_logger = logging.getLogger("validation") 37 | self.validation_logger.setLevel(logging.INFO) 38 | 39 | # Create file handler for validation results 40 | file_handler = logging.FileHandler(self.log_file, mode='w', encoding='utf-8') 41 | file_handler.setLevel(logging.INFO) 42 | 43 | # Create formatter 44 | formatter = logging.Formatter('%(asctime)s - %(message)s') 45 | file_handler.setFormatter(formatter) 46 | 47 | # Add handler to validation logger 48 | self.validation_logger.addHandler(file_handler) 49 | 50 | # Log header 51 | self.validation_logger.info("=" * 80) 52 | self.validation_logger.info("Data Validation Results Log") 53 | self.validation_logger.info("=" * 80) 54 | 55 | def _generate_groundtruth_message(self, diff: str) -> Optional[str]: 56 | """Generate groundtruth commit message from diff""" 57 | 58 | prompt = f""" 59 | You are an expert software developer. Given the following code diff, generate a clear and accurate commit message that precisely describes the changes made. 60 | 61 | The commit message should: 62 | 1. Be concise but descriptive 63 | 2. Use conventional commit format if applicable 64 | 3. Accurately reflect what was actually changed in the code 65 | 4. 
Focus on the main purpose/intent of the changes 66 | 67 | Code diff: 68 | {diff} 69 | 70 | Please respond with a JSON object in this exact format: 71 | {{ 72 | "message": "your generated commit message here" 73 | }} 74 | """ 75 | 76 | try: 77 | response = self.llm_manager.query(prompt) 78 | if response: 79 | # Parse JSON response 80 | result = json.loads(response) 81 | return result.get("message", "").strip() 82 | return None 83 | except Exception as e: 84 | self.logger.error(f"Error generating groundtruth message: {e}") 85 | return None 86 | 87 | def _check_message_consistency(self, groundtruth: str, test_message: str, message_type: str) -> Tuple[bool, str]: 88 | """Check if two messages are consistent""" 89 | 90 | prompt = f""" 91 | You are an expert at analyzing commit messages. Compare the following two commit messages and determine if they describe the same changes with the same intent. 92 | 93 | Groundtruth message (generated from actual code diff): {groundtruth} 94 | {message_type} message: {test_message} 95 | 96 | Please analyze: 97 | 1. Do both messages describe the same type of operation (e.g., fix, add, update, refactor)? 98 | 2. Do both messages refer to the same components/files/functions? 99 | 3. Do both messages convey the same purpose/intent? 100 | 4. Are the key details consistent between them? 101 | 102 | Consider the messages CONSISTENT if they describe the same changes with the same intent, even if wording differs slightly. 103 | Consider them INCONSISTENT if they describe different changes, different purposes, or contradictory information. 104 | 105 | Respond with a JSON object in this exact format: 106 | {{ 107 | "consistent": true/false, 108 | "reasoning": "detailed explanation of your analysis and decision" 109 | }} 110 | """ 111 | 112 | try: 113 | response = self.llm_manager.query(prompt) 114 | if response: 115 | result = json.loads(response) 116 | return result.get("consistent", False), result.get("reasoning", "No reasoning provided") 117 | return False, "Failed to get LLM response" 118 | except Exception as e: 119 | self.logger.error(f"Error checking message consistency: {e}") 120 | return False, f"Error in consistency check: {str(e)}" 121 | 122 | def validate_entry(self, entry: Dict[str, Any], line_number: int) -> ValidationResult: 123 | """Validate a single data entry""" 124 | 125 | # Extract required fields 126 | new_message = entry.get('message', '') 127 | original_message = entry.get('original_message', '') 128 | diff = entry.get('diff', '') 129 | 130 | self.logger.info(f"Validating entry {line_number}") 131 | 132 | # Generate groundtruth message from diff 133 | groundtruth = self._generate_groundtruth_message(diff) 134 | if not groundtruth: 135 | self.logger.warning(f"Failed to generate groundtruth for entry {line_number}") 136 | return ValidationResult( 137 | line_number=line_number, 138 | original_entry=entry, 139 | groundtruth_message="", 140 | consistency_with_new_message=False, 141 | consistency_with_original_message=False, 142 | new_message_reasoning="Failed to generate groundtruth", 143 | original_message_reasoning="Failed to generate groundtruth", 144 | is_valid=False 145 | ) 146 | 147 | # Check consistency with new message 148 | consistent_with_new, new_reasoning = self._check_message_consistency( 149 | groundtruth, new_message, "New" 150 | ) 151 | 152 | # Check consistency with original message 153 | consistent_with_original, original_reasoning = self._check_message_consistency( 154 | groundtruth, original_message, "Original" 155 | ) 156 | 157 | # 
Determine if entry is valid 158 | # Valid if: inconsistent with new message AND consistent with original message 159 | is_valid = (not consistent_with_new) and consistent_with_original 160 | 161 | # Create validation result 162 | result = ValidationResult( 163 | line_number=line_number, 164 | original_entry=entry, 165 | groundtruth_message=groundtruth, 166 | consistency_with_new_message=consistent_with_new, 167 | consistency_with_original_message=consistent_with_original, 168 | new_message_reasoning=new_reasoning, 169 | original_message_reasoning=original_reasoning, 170 | is_valid=is_valid 171 | ) 172 | 173 | # Log validation result to file 174 | self._log_validation_result(result) 175 | 176 | return result 177 | 178 | def _log_validation_result(self, result: ValidationResult) -> None: 179 | """Log validation result to file""" 180 | 181 | # Format validation result 182 | status = "✅ VALID" if result.is_valid else "❌ INVALID" 183 | 184 | self.validation_logger.info(f"\nEntry {result.line_number}: {status}") 185 | self.validation_logger.info("-" * 60) 186 | 187 | # Log messages 188 | self.validation_logger.info(f"Original Message: {result.original_entry.get('original_message', 'N/A')}") 189 | self.validation_logger.info(f"Generated Message: {result.original_entry.get('message', 'N/A')}") 190 | self.validation_logger.info(f"Groundtruth Message: {result.groundtruth_message}") 191 | 192 | # Log consistency analysis 193 | new_status = "✅ CONSISTENT" if result.consistency_with_new_message else "❌ INCONSISTENT" 194 | original_status = "✅ CONSISTENT" if result.consistency_with_original_message else "❌ INCONSISTENT" 195 |
196 | self.validation_logger.info(f"\nConsistency Analysis:") 197 | self.validation_logger.info(f"  Generated vs Groundtruth: {new_status}") 198 | self.validation_logger.info(f"  Reasoning: {result.new_message_reasoning}") 199 | self.validation_logger.info(f"  Original vs Groundtruth: {original_status}") 200 | self.validation_logger.info(f"  Reasoning: {result.original_message_reasoning}") 201 | 202 | # Log final decision 203 | self.validation_logger.info(f"\nFinal Decision: {status}") 204 | if result.is_valid: 205 | self.validation_logger.info("  ✓ Generated message is inconsistent with groundtruth (as expected)") 206 | self.validation_logger.info("  ✓ Original message is consistent with groundtruth (as expected)") 207 | else: 208 | if result.consistency_with_new_message: 209 | self.validation_logger.info("  ✗ Generated message is consistent with groundtruth (should be inconsistent)") 210 | if not result.consistency_with_original_message: 211 | self.validation_logger.info("  ✗ Original message is inconsistent with groundtruth (should be consistent)") 212 | 213 | self.validation_logger.info("=" * 80) 214 |
249 | def validate_file(self, input_file: str, output_file: str, max_entries: Optional[int] = None) -> Dict[str, Any]: 250 | """ 251 | Validate entire file and create filtered output 252 | 253 | Args: 254 | input_file: Input JSONL file with generated data 255 | output_file: Output file for valid entries 256 | max_entries: Maximum number of entries to process 257 | 258 | Returns: 259 | Statistics dictionary 260 | """ 261 | 262 | input_path = Path(input_file) 263 | output_path = Path(output_file) 264 | 265 | if not input_path.exists(): 266 | raise FileNotFoundError(f"Input file not found: {input_file}") 267 | 268 | # Ensure output directory exists 269 | output_path.parent.mkdir(parents=True, exist_ok=True) 270 | 271 | # Statistics tracking 272 | stats = { 273 | 'total_processed': 0, 274 | 'valid_entries': 0, 275 | 'invalid_entries': 0, 276 | 'groundtruth_generation_failures': 0, 277 | 'consistent_with_new': 0, 278 | 'inconsistent_with_new': 0, 279 | 'consistent_with_original': 0, 280 | 'inconsistent_with_original': 0, 281 | 'validation_details': [] 282 | } 283 | 284 | valid_entries = [] 285 |
286 | with open(input_path, 'r', encoding='utf-8') as infile: 287 | for line_num, line in enumerate(infile, 1): 288 | if max_entries and stats['total_processed'] >= max_entries: 289 | break 290 | 291 | try: 292 | entry = json.loads(line.strip()) 293 | 294 | # Skip entries that don't have required fields 295 | if not all(key in entry for key in ['message', 'original_message', 'diff']): 296 | self.logger.warning(f"Skipping entry {line_num}: missing required fields") 297 | continue 298 | 299 | # Validate entry 300 | result = self.validate_entry(entry, line_num) 301 | 302 | # Update statistics 303 | stats['total_processed'] += 1 304 | 305 | if not result.groundtruth_message: 306 | stats['groundtruth_generation_failures'] += 1 307 | continue 308 | 309 | if result.consistency_with_new_message: 310 | stats['consistent_with_new'] += 1 311 | else: 312 | stats['inconsistent_with_new'] += 1 313 | 314 | if result.consistency_with_original_message: 315 | stats['consistent_with_original'] += 1 316 | else: 317 | stats['inconsistent_with_original'] += 1 318 | 319 | if result.is_valid: 320 | stats['valid_entries'] += 1 321 | valid_entries.append(entry) 322 | self.logger.info(f"Entry {line_num} is valid") 323 | else: 324 | stats['invalid_entries'] += 1 325 | self.logger.info(f"Entry {line_num} is invalid") 326 | 327 | # Store 
detailed validation info 328 | stats['validation_details'].append({ 329 | 'line_number': line_num, 330 | 'groundtruth': result.groundtruth_message, 331 | 'consistent_with_new': result.consistency_with_new_message, 332 | 'consistent_with_original': result.consistency_with_original_message, 333 | 'new_reasoning': result.new_message_reasoning, 334 | 'original_reasoning': result.original_message_reasoning, 335 | 'is_valid': result.is_valid 336 | }) 337 | 338 | # Log progress 339 | if stats['total_processed'] % 10 == 0: 340 | self.logger.info(f"Processed {stats['total_processed']} entries...") 341 | 342 | except json.JSONDecodeError as e: 343 | self.logger.error(f"Failed to parse JSON on line {line_num}: {e}") 344 | except Exception as e: 345 | self.logger.error(f"Error processing line {line_num}: {e}") 346 | 347 | # Write valid entries to output file 348 | with open(output_path, 'w', encoding='utf-8') as outfile: 349 | for entry in valid_entries: 350 | outfile.write(json.dumps(entry, ensure_ascii=False) + '\n') 351 | 352 | self.logger.info(f"Validation complete. {stats['valid_entries']} valid entries written to {output_file}") 353 | 354 | return stats 355 | 356 | def print_validation_statistics(self, stats: Dict[str, Any]) -> None: 357 | """Print detailed validation statistics""" 358 | 359 | total = stats['total_processed'] 360 | if total == 0: 361 | print("📊 No entries processed") 362 | return 363 | 364 | print("📊 Data Validation Statistics") 365 | print("=" * 60) 366 | print(f"Total entries processed: {total}") 367 | print(f"Valid entries (inconsistent with new, consistent with original): {stats['valid_entries']} ({stats['valid_entries']/total*100:.1f}%)") 368 | print(f"Invalid entries: {stats['invalid_entries']} ({stats['invalid_entries']/total*100:.1f}%)") 369 | 370 | if stats['groundtruth_generation_failures'] > 0: 371 | print(f"Groundtruth generation failures: {stats['groundtruth_generation_failures']}") 372 | 373 | print("\n🔍 Consistency Analysis:") 374 | print(f"Consistent with new message: {stats['consistent_with_new']} ({stats['consistent_with_new']/total*100:.1f}%)") 375 | print(f"Inconsistent with new message: {stats['inconsistent_with_new']} ({stats['inconsistent_with_new']/total*100:.1f}%)") 376 | print(f"Consistent with original message: {stats['consistent_with_original']} ({stats['consistent_with_original']/total*100:.1f}%)") 377 | print(f"Inconsistent with original message: {stats['inconsistent_with_original']} ({stats['inconsistent_with_original']/total*100:.1f}%)") 378 | 379 | print("\n📋 Validation Criteria:") 380 | print("✅ Valid: Inconsistent with new message AND consistent with original message") 381 | print("❌ Invalid: Either consistent with new message OR inconsistent with original message") 382 | 383 | # Calculate quality metrics 384 | if stats['inconsistent_with_new'] > 0: 385 | inconsistency_rate = stats['inconsistent_with_new'] / total * 100 386 | print(f"\n🎯 Data Quality Metrics:") 387 | print(f"Inconsistency generation rate: {inconsistency_rate:.1f}%") 388 | 389 | if stats['valid_entries'] > 0: 390 | validity_rate = stats['valid_entries'] / total * 100 391 | print(f"Overall data validity rate: {validity_rate:.1f}%") 392 | -------------------------------------------------------------------------------- /evaluation/evaluate_main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Evaluation Framework for Commit Message Consistency Systems 4 | 5 | This script provides a unified evaluation 
framework for testing different 6 | commit message consistency checking systems. 7 | 8 | Supported systems: 9 | - commaster: System 1 for commit message consistency checking 10 | - pure_llm / fewshot_llm / cot_llm: LLM-based approaches (vanilla, few-shot, and chain-of-thought prompting) 11 | """ 12 | 13 | import argparse 14 | import json 15 | import os 16 | import subprocess 17 | import sys 18 | import tempfile 19 | import time 20 | from pathlib import Path 21 | from typing import Dict, Any, List, Optional 22 | import logging 23 | from concurrent.futures import ProcessPoolExecutor, as_completed 24 | from functools import partial 25 | import multiprocessing 26 | 27 | # Import the baseline consistency checkers 28 | import pure_llm.consistency_checker as pure_llm_checker 29 | import fewshot.consistency_checker as fewshot_checker 30 | import CoT.consistency_checker as cot_checker 31 | 32 | # Import commaster system for direct function calls 33 | import sys 34 | import os 35 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 36 | from commit_analyzer import evaluation as commaster_evaluation 37 | 38 | # Entry point of the commaster system (kept for reference) 39 | COMMASTER_MAIN = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "commit_analyzer.py") 40 | 41 | # Configure logging 42 | logging.basicConfig( 43 | level=logging.INFO, 44 | format='%(asctime)s - %(levelname)s - %(message)s' 45 | ) 46 | logger = logging.getLogger(__name__) 47 |
48 | def parse_jsonl_file(file_path: str) -> List[Dict[str, Any]]: 49 | """ 50 | Parse JSONL file and return list of records. 51 | 52 | Args: 53 | file_path (str): Path to the JSONL file 54 | 55 | Returns: 56 | List[Dict[str, Any]]: List of parsed JSON records 57 | """ 58 | records = [] 59 | try: 60 | with open(file_path, 'r', encoding='utf-8') as f: 61 | for line_num, line in enumerate(f, 1): 62 | line = line.strip() 63 | if not line: 64 | continue 65 | try: 66 | record = json.loads(line) 67 | records.append(record) 68 | except json.JSONDecodeError as e: 69 | logger.warning(f"Failed to parse JSON on line {line_num}: {e}") 70 | continue 71 | except FileNotFoundError: 72 | logger.error(f"File not found: {file_path}") 73 | sys.exit(1) 74 | except Exception as e: 75 | logger.error(f"Error reading file {file_path}: {e}") 76 | sys.exit(1) 77 | 78 | logger.info(f"Successfully loaded {len(records)} records from {file_path}") 79 | return records 80 |
81 | def extract_data_fields(record: Dict[str, Any], repo_collect_dir: Optional[str] = None) -> Dict[str, Any]: 82 | """ 83 | Extract required fields from a data record. 84 | 85 | Args: 86 | record (Dict[str, Any]): Single data record from JSONL file 87 | repo_collect_dir (str, optional): Base directory joined with the record's repo path 88 | Returns: 89 | Dict[str, Any]: Extracted fields (repo, commit_sha, message, files, diff, gt_consistent, loc) 90 | """ 91 | 92 | extracted = { 93 | 'repo': os.path.join(repo_collect_dir, record.get('repo', '')), 94 | 'commit_sha': record.get('commit_sha', ''), 95 | 'message': record.get('message', ''), 96 | 'files': record.get('files', []), 97 | 'diff': record.get('diff', ''), 98 | 'gt_consistent': 'original_message' not in record, 99 | 'loc': record.get('loc', 0) 100 | } 101 | 102 | return extracted 103 | 104 | def test_commaster_system(data: Dict[str, Any], api_key: str, api_base: str, model: str) -> Dict[str, Any]: 105 | """ 106 | Test the commaster system (System 1) by directly calling the evaluation function. 
107 | 108 | Args: 109 | data (Dict[str, Any]): Extracted data fields 110 | api_key (str): OpenAI API key 111 | api_base (str): OpenAI API base URL 112 | model (str): Model name to use 113 | 114 | Returns: 115 | Dict[str, Any]: Test results 116 | """ 117 | repo_path = os.path.join(os.path.dirname(data['repo']), "apache_" + os.path.basename(data['repo'])) 118 | logger.info(f"Testing commaster system with repo: {repo_path}, commit_sha: {data['commit_sha']}") 119 | 120 | try: 121 | # Create a temporary work directory for this evaluation 122 | import tempfile 123 | with tempfile.TemporaryDirectory() as work_dir: 124 | 125 | logger.info(f"Running commaster evaluation function with repo: {repo_path}, commit: {data['commit_sha']}") 126 | 127 | # Run the evaluation function and capture output 128 | start_time = time.time() 129 | try: 130 | exit_code, result = commaster_evaluation( 131 | message=data['message'], 132 | repo_path=repo_path, 133 | work_dir=work_dir, 134 | commit=data['commit_sha'], 135 | openai_api_key=api_key, 136 | openai_api_base=api_base, 137 | model=model 138 | ) 139 | end_time = time.time() 140 | 141 | except Exception as e: 142 | end_time = time.time() 143 | logger.error(f"Error during commaster evaluation: {e}") 144 | exit_code = 1 145 | json_output = json.dumps({ 146 | 'consistent': False, 147 | 'prompt_tokens': 0, 148 | 'completion_tokens': 0, 149 | 'total_tokens': 0 150 | }) 151 | 152 | try: 153 | import glob 154 | import shutil 155 | commaster_result_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "commaster_result") 156 | os.makedirs(commaster_result_dir, exist_ok=True) 157 | for f in glob.glob(os.path.join(work_dir, "*.analysis")) + glob.glob(os.path.join(work_dir, "*.log")): 158 | shutil.move(f, commaster_result_dir) 159 | except Exception as e: 160 | logger.warning(f"Failed to move .analysis/.log files: {e}") 161 | 162 | if exit_code == 0: 163 | try: 164 | # Parse JSON output 165 | commaster_result = result 166 | 167 | # Format result to match expected output 168 | formatted_result = { 169 | 'system': 'commaster', 170 | 'repo': data['repo'], 171 | 'commit_sha': data['commit_sha'], 172 | 'consistent': commaster_result["consistency_analysis"]["consistent"], 173 | 'confidence': 1.0, 174 | 'reasoning': 'IGNORE', 175 | 'gt_consistent': data.get('gt_consistent', True), 176 | 'loc': data.get('loc', 0), 177 | 'processing_time_ms': round((end_time - start_time) * 1000, 2), 178 | 'error': None, 179 | "prompt_tokens": commaster_result["token_usage"]["grand_total"]["prompt_tokens"], 180 | "completion_tokens": commaster_result["token_usage"]["grand_total"].get("completion_tokens", 0), 181 | "total_tokens": commaster_result["token_usage"]["grand_total"].get("total_tokens", 0) 182 | } 183 | 184 | # Add any additional fields from commaster result 185 | for key, value in commaster_result.items(): 186 | if key not in formatted_result: 187 | formatted_result[key] = value 188 | 189 | logger.debug(f"Commaster result: consistent={formatted_result['consistent']}, confidence={formatted_result['confidence']}") 190 | return formatted_result 191 | 192 | except json.JSONDecodeError as e: 193 | logger.error(f"Failed to parse commaster JSON output: {e}") 194 | logger.error(f"Raw output: {json_output}") 195 | return { 196 | 'system': 'commaster', 197 | 'repo': data['repo'], 198 | 'commit_sha': data['commit_sha'], 199 | 'consistent': False, 200 | 'confidence': 0.0, 201 | 'reasoning': f"Failed to parse JSON output: {str(e)}", 202 | 'gt_consistent': data.get('gt_consistent', True), 203 | 
'loc': data.get('loc', 0), 204 | 'processing_time_ms': round((end_time - start_time) * 1000, 2), 205 | 'error': f"JSON parse error: {str(e)}" 206 | } 207 | else: 208 | logger.error(f"Commaster evaluation failed with exit code {exit_code}") 209 | return { 210 | 'system': 'commaster', 211 | 'repo': data['repo'], 212 | 'commit_sha': data['commit_sha'], 213 | 'consistent': False, 214 | 'confidence': 0.0, 215 | 'reasoning': f"Evaluation failed with exit code {exit_code}", 216 | 'gt_consistent': data.get('gt_consistent', True), 217 | 'loc': data.get('loc', 0), 218 | 'processing_time_ms': round((end_time - start_time) * 1000, 2), 219 | 'error': f"Function error: exit code {exit_code}" 220 | } 221 | 222 | 223 | 224 | except Exception as e: 225 | logger.error(f"Error running commaster system: {e}") 226 | return { 227 | 'system': 'commaster', 228 | 'repo': data['repo'], 229 | 'commit_sha': data['commit_sha'], 230 | 'consistent': False, 231 | 'confidence': 0.0, 232 | 'reasoning': f"System error: {str(e)}", 233 | 'gt_consistent': data.get('gt_consistent', True), 234 | 'loc': data.get('loc', 0), 235 | 'processing_time_ms': 0, 236 | 'error': str(e) 237 | } 238 | 239 | def get_git_diff(repo_path: str, commit_sha: str, ctx = 0) -> str: 240 | """ 241 | Get git diff for a specific commit using git command. 242 | 243 | Args: 244 | repo_path (str): Path to the git repository 245 | commit_sha (str): Commit hash to get diff for 246 | ctx (int): Number of context lines around changes (0 or 20) 247 | 248 | Returns: 249 | str: Git diff content 250 | """ 251 | try: 252 | # Define file extensions to include 253 | # file_extensions = [ 254 | # "*.c", "*.h", "*.cpp", "*.hpp", "*.cxx", "*.hxx", "*.cc", "*.hh", 255 | # "*.c++", "*.h++", "*.cxx++", "*.hxx++", "*.C", "*.java", "*.py", "*.pyx", "*.pyi" 256 | # ] 257 | 258 | # Build git diff command 259 | cmd = [ 260 | 'git', '-C', repo_path, 'diff', f'--unified={ctx}', 261 | f'{commit_sha}^', commit_sha] 262 | 263 | # Execute git command 264 | result = subprocess.run( 265 | cmd, 266 | capture_output=True, 267 | text=True, 268 | check=True 269 | ) 270 | 271 | return result.stdout 272 | 273 | except subprocess.CalledProcessError as e: 274 | logger.error(f"Git diff command failed for repo {repo_path}, commit {commit_sha}: {e}") 275 | if e.stderr: 276 | logger.error(f"Git error output: {e.stderr}") 277 | return "" 278 | except Exception as e: 279 | logger.error(f"Error getting git diff: {e}") 280 | return "" 281 | 282 | def test_pure_llm_system(data: Dict[str, Any], api_key: str, api_base: str, model: str, repo_collect_dir: str, ctx: int = 0) -> Dict[str, Any]: 283 | """ 284 | Test the pure_llm system (System 2). 
285 | · 286 | Args: 287 | data (Dict[str, Any]): Extracted data fields 288 | api_key (str): OpenAI API key 289 | api_base (str): OpenAI API base URL 290 | model (str): Model name to use 291 | repo_collect_dir (str): Directory containing cloned repositories 292 | ctx (int): Number of context lines for git diff (0 or 20) 293 | 294 | Returns: 295 | Dict[str, Any]: Test results 296 | """ 297 | logger.debug(f"Testing pure_llm system with repo: {data['repo']}, hash: {data['commit_sha']}") 298 | 299 | try: 300 | # Initialize the consistency checker with provided API configuration 301 | checker = pure_llm_checker.ConsistencyChecker( 302 | openai_api_key=api_key, 303 | openai_api_base=api_base, 304 | model=model 305 | ) 306 | 307 | # Get git diff dynamically using git command 308 | # Build repo path based on repo_collections structure 309 | repo_base_name = os.path.basename(data['repo']) 310 | repo_path = os.path.join(repo_collect_dir, "apache_" + repo_base_name) 311 | 312 | logger.debug(f"Getting git diff for repo: {repo_path}, commit: {data['commit_sha']}") 313 | git_diff = get_git_diff(repo_path, data['commit_sha'], ctx) 314 | 315 | if not git_diff: 316 | logger.warning(f"No git diff found for repo {repo_path}, commit {data['commit_sha']}") 317 | logger.warning(f"Falling back to original diff data") 318 | # Fall back to original diff if git command fails 319 | git_diff = data.get('diff', '') 320 | if not git_diff: 321 | logger.error(f"No diff data available for commit {data['commit_sha']}") 322 | else: 323 | logger.debug(f"Successfully obtained git diff ({len(git_diff)} characters)") 324 | 325 | # Run consistency check with dynamically obtained diff 326 | start_time = time.time() 327 | llm_result = checker.check_consistency(data['message'], git_diff) 328 | end_time = time.time() 329 | 330 | # Format result to match expected output 331 | result = { 332 | 'system': 'pure_llm', 333 | 'repo': data['repo'], 334 | 'commit_sha': data['commit_sha'], 335 | 'consistent': llm_result.get('consistent', False), 336 | 'confidence': llm_result.get('confidence', 0.0), 337 | 'reasoning': llm_result.get('reasoning', ''), 338 | 'issues': llm_result.get('issues', []), 339 | 'gt_consistent': data.get('gt_consistent', True), 340 | 'loc': data.get('loc', 0), 341 | 'processing_time_ms': round((end_time - start_time) * 1000, 2), 342 | 'error': None, 343 | "prompt_tokens": llm_result.get("prompt_tokens", 0), 344 | "completion_tokens": llm_result.get("completion_tokens", 0), 345 | "total_tokens": llm_result.get("total_tokens", 0) 346 | } 347 | 348 | logger.debug(f"Pure LLM result: consistent={result['consistent']}, confidence={result['confidence']}") 349 | return result 350 | 351 | except Exception as e: 352 | logger.error(f"Error in pure_llm system: {e}") 353 | return { 354 | 'system': 'pure_llm', 355 | 'repo': data['repo'], 356 | 'commit_sha': data['commit_sha'], 357 | 'consistent': False, 358 | 'confidence': 0.0, 359 | 'reasoning': f"Error occurred: {str(e)}", 360 | 'issues': ["System error"], 361 | 'gt_consistent': data.get('gt_consistent', True), 362 | 'loc': data.get('loc', 0), 363 | 'processing_time_ms': 0, 364 | 'error': str(e), 365 | "prompt_tokens": 0, 366 | "completion_tokens": 0, 367 | "total_tokens": 0 368 | } 369 | 370 | def test_fewshot_llm_system(data: Dict[str, Any], api_key: str, api_base: str, model: str, repo_collect_dir: str) -> Dict[str, Any]: 371 | """ 372 | Test the fewshot_llm system (System 3) using few-shot learning with examples. 
373 | · 374 | Args: 375 | data (Dict[str, Any]): Extracted data fields 376 | api_key (str): OpenAI API key 377 | api_base (str): OpenAI API base URL 378 | model (str): Model name to use 379 | repo_collect_dir (str): Directory containing cloned repositories 380 | 381 | Returns: 382 | Dict[str, Any]: Test results 383 | """ 384 | logger.debug(f"Testing fewshot_llm system with repo: {data['repo']}, hash: {data['commit_sha']}") 385 | 386 | try: 387 | # Initialize the consistency checker with provided API configuration 388 | checker = fewshot_checker.ConsistencyChecker( 389 | openai_api_key=api_key, 390 | openai_api_base=api_base, 391 | model=model 392 | ) 393 | 394 | # Get git diff dynamically using git command with ctx=0 (no context) 395 | # Build repo path based on repo_collections structure 396 | repo_base_name = os.path.basename(data['repo']) 397 | repo_path = os.path.join(repo_collect_dir, "apache_" + repo_base_name) 398 | 399 | logger.debug(f"Getting git diff for repo: {repo_path}, commit: {data['commit_sha']}") 400 | git_diff = get_git_diff(repo_path, data['commit_sha'], ctx=0) # Always use ctx=0 for fewshot 401 | 402 | if not git_diff: 403 | logger.warning(f"No git diff found for repo {repo_path}, commit {data['commit_sha']}") 404 | logger.warning(f"Falling back to original diff data") 405 | # Fall back to original diff if git command fails 406 | git_diff = data.get('diff', '') 407 | if not git_diff: 408 | logger.error(f"No diff data available for commit {data['commit_sha']}") 409 | else: 410 | logger.debug(f"Successfully obtained git diff ({len(git_diff)} characters)") 411 | 412 | # Run consistency check with dynamically obtained diff 413 | start_time = time.time() 414 | llm_result = checker.check_consistency(data['message'], git_diff) 415 | end_time = time.time() 416 | 417 | # Format result to match expected output 418 | result = { 419 | 'system': 'fewshot_llm', 420 | 'repo': data['repo'], 421 | 'commit_sha': data['commit_sha'], 422 | 'consistent': llm_result.get('consistent', False), 423 | 'confidence': llm_result.get('confidence', 0.0), 424 | 'reasoning': llm_result.get('reasoning', ''), 425 | 'issues': llm_result.get('issues', []), 426 | 'gt_consistent': data.get('gt_consistent', True), 427 | 'loc': data.get('loc', 0), 428 | 'processing_time_ms': round((end_time - start_time) * 1000, 2), 429 | 'error': None, 430 | "prompt_tokens": llm_result.get("prompt_tokens", 0), 431 | "completion_tokens": llm_result.get("completion_tokens", 0), 432 | "total_tokens": llm_result.get("total_tokens", 0) 433 | } 434 | 435 | logger.debug(f"Fewshot LLM result: consistent={result['consistent']}, confidence={result['confidence']}") 436 | return result 437 | 438 | except Exception as e: 439 | logger.error(f"Error in fewshot_llm system: {e}") 440 | return { 441 | 'system': 'fewshot_llm', 442 | 'repo': data['repo'], 443 | 'commit_sha': data['commit_sha'], 444 | 'consistent': False, 445 | 'confidence': 0.0, 446 | 'reasoning': f"Error occurred: {str(e)}", 447 | 'issues': ["System error"], 448 | 'gt_consistent': data.get('gt_consistent', True), 449 | 'loc': data.get('loc', 0), 450 | 'processing_time_ms': 0, 451 | 'error': str(e), 452 | "prompt_tokens": 0, 453 | "completion_tokens": 0, 454 | "total_tokens": 0 455 | } 456 | 457 | 458 | def test_cot_llm_system(data: Dict[str, Any], api_key: str, api_base: str, model: str, repo_collect_dir: str) -> Dict[str, Any]: 459 | """ 460 | Test the cot_llm system (System 4) using Chain of Thought reasoning. 
461 | · 462 | Args: 463 | data (Dict[str, Any]): Extracted data fields 464 | api_key (str): OpenAI API key 465 | api_base (str): OpenAI API base URL 466 | model (str): Model name to use 467 | repo_collect_dir (str): Directory containing cloned repositories 468 | 469 | Returns: 470 | Dict[str, Any]: Test results 471 | """ 472 | logger.debug(f"Testing cot_llm system with repo: {data['repo']}, hash: {data['commit_sha']}") 473 | 474 | try: 475 | # Initialize the consistency checker with provided API configuration 476 | checker = cot_checker.ConsistencyChecker( 477 | openai_api_key=api_key, 478 | openai_api_base=api_base, 479 | model=model 480 | ) 481 | 482 | # Get git diff dynamically using git command with ctx=0 (no context) 483 | # Build repo path based on repo_collections structure 484 | repo_base_name = os.path.basename(data['repo']) 485 | repo_path = os.path.join(repo_collect_dir, "apache_" + repo_base_name) 486 | 487 | logger.debug(f"Getting git diff for repo: {repo_path}, commit: {data['commit_sha']}") 488 | git_diff = get_git_diff(repo_path, data['commit_sha'], ctx=0) # Always use ctx=0 for CoT 489 | 490 | if not git_diff: 491 | logger.warning(f"No git diff found for repo {repo_path}, commit {data['commit_sha']}") 492 | logger.warning(f"Falling back to original diff data") 493 | # Fall back to original diff if git command fails 494 | git_diff = data.get('diff', '') 495 | if not git_diff: 496 | logger.error(f"No diff data available for commit {data['commit_sha']}") 497 | else: 498 | logger.debug(f"Successfully obtained git diff ({len(git_diff)} characters)") 499 | 500 | # Run consistency check with Chain of Thought reasoning 501 | start_time = time.time() 502 | llm_result = checker.check_consistency(data['message'], git_diff) 503 | end_time = time.time() 504 | 505 | # Format result to match expected output 506 | result = { 507 | 'system': 'cot_llm', 508 | 'repo': data['repo'], 509 | 'commit_sha': data['commit_sha'], 510 | 'consistent': llm_result.get('consistent', False), 511 | 'confidence': llm_result.get('confidence', 0.0), 512 | 'reasoning': llm_result.get('reasoning', ''), 513 | 'issues': llm_result.get('issues', []), 514 | 'gt_consistent': data.get('gt_consistent', True), 515 | 'loc': data.get('loc', 0), 516 | 'processing_time_ms': round((end_time - start_time) * 1000, 2), 517 | 'error': None, 518 | "prompt_tokens": llm_result.get("prompt_tokens", 0), 519 | "completion_tokens": llm_result.get("completion_tokens", 0), 520 | "total_tokens": llm_result.get("total_tokens", 0) 521 | } 522 | 523 | logger.debug(f"CoT LLM result: consistent={result['consistent']}, confidence={result['confidence']}") 524 | return result 525 | 526 | except Exception as e: 527 | logger.error(f"Error in cot_llm system: {e}") 528 | return { 529 | 'system': 'cot_llm', 530 | 'repo': data['repo'], 531 | 'commit_sha': data['commit_sha'], 532 | 'consistent': False, 533 | 'confidence': 0.0, 534 | 'reasoning': f"Error occurred: {str(e)}", 535 | 'issues': ["System error"], 536 | 'gt_consistent': data.get('gt_consistent', True), 537 | 'loc': data.get('loc', 0), 538 | 'processing_time_ms': 0, 539 | 'error': str(e), 540 | "prompt_tokens": 0, 541 | "completion_tokens": 0, 542 | "total_tokens": 0 543 | } 544 | 545 | 546 | def process_single_record(record_data: tuple, system_name: str, api_key: str, api_base: str, model: str, repo_collect_dir: Optional[str] = None, ctx: int = 0) -> Dict[str, Any]: 547 | """ 548 | Process a single record for evaluation. This function is designed to be used with multiprocessing. 
549 | 550 | Args: 551 | record_data (tuple): Tuple containing (index, record) 552 | system_name (str): Name of the system to test ('commaster' or 'pure_llm') 553 | api_key (str): OpenAI API key 554 | api_base (str): OpenAI API base URL 555 | model (str): Model name to use 556 | repo_collect_dir (str, optional): Directory containing cloned repositories 557 | ctx (int): Number of context lines for git diff (0 or 20) 558 | 559 | Returns: 560 | Dict[str, Any]: Evaluation result for this record 561 | """ 562 | i, record = record_data 563 | 564 | try: 565 | # Extract required fields 566 | data = extract_data_fields(record, repo_collect_dir) 567 | 568 | # Validate extracted data 569 | if not data['repo'] or not data['commit_sha']: 570 | return { 571 | 'system': system_name, 572 | 'record_index': i, 573 | 'repo': data.get('repo', 'unknown'), 574 | 'commit_sha': data.get('commit_sha', 'unknown'), 575 | 'consistent': None, 576 | 'confidence': None, 577 | 'gt_consistent': data.get('gt_consistent', True), 578 | 'loc': data.get('loc', 0), 579 | 'error': 'Missing repo or commit_sha', 580 | 'timestamp': time.strftime('%Y-%m-%d %H:%M:%S') 581 | } 582 | 583 | # Select the appropriate test function 584 | if system_name == 'commaster': 585 | test_function = lambda data: test_commaster_system(data, api_key, api_base, model) 586 | elif system_name == 'pure_llm': 587 | test_function = lambda data: test_pure_llm_system(data, api_key, api_base, model, repo_collect_dir, ctx) 588 | elif system_name == 'fewshot_llm': 589 | test_function = lambda data: test_fewshot_llm_system(data, api_key, api_base, model, repo_collect_dir) 590 | else: # cot_llm 591 | test_function = lambda data: test_cot_llm_system(data, api_key, api_base, model, repo_collect_dir) 592 | 593 | # Run test on the selected system 594 | start_time = time.time() 595 | result = test_function(data) 596 | end_time = time.time() 597 | 598 | # Add metadata to result 599 | result.update({ 600 | 'record_index': i, 601 | 'actual_processing_time_ms': round((end_time - start_time) * 1000, 2), 602 | 'timestamp': time.strftime('%Y-%m-%d %H:%M:%S') 603 | }) 604 | 605 | return result 606 | 607 | except Exception as e: 608 | return { 609 | 'system': system_name, 610 | 'record_index': i, 611 | 'repo': data.get('repo', 'unknown') if 'data' in locals() else 'unknown', 612 | 'commit_sha': data.get('commit_sha', 'unknown') if 'data' in locals() else 'unknown', 613 | 'consistent': None, 614 | 'confidence': None, 615 | 'gt_consistent': data.get('gt_consistent', True) if 'data' in locals() else True, 616 | 'loc': data.get('loc', 0) if 'data' in locals() else 0, 617 | 'error': str(e), 618 | 'timestamp': time.strftime('%Y-%m-%d %H:%M:%S') 619 | } 620 | 621 | def run_evaluation(system_name: str, data_file: str, api_key: str, api_base: str, model: str, 622 | output_file: Optional[str] = None, repo_collect_dir: Optional[str] = None, max_workers: int = 1, ctx: int = 0) -> List[Dict[str, Any]]: 623 | """ 624 | Run evaluation on the specified system with optional parallel processing. 
625 | 626 | Args: 627 | system_name (str): Name of the system to test ('commaster', 'pure_llm', 'fewshot_llm', or 'cot_llm') 628 | data_file (str): Path to the JSONL data file 629 | api_key (str): OpenAI API key 630 | api_base (str): OpenAI API base URL 631 | model (str): Model name to use 632 | output_file (str, optional): Path to save results 633 | repo_collect_dir (str, optional): Directory containing cloned repositories 634 | max_workers (int): Maximum number of parallel workers (default: 1 for sequential processing) 635 | ctx (int): Number of context lines for git diff (0 or 20) 636 | 637 | Returns: 638 | List[Dict[str, Any]]: List of evaluation results 639 | """ 640 | # Validate system name 641 | if system_name not in ['commaster', 'pure_llm', 'fewshot_llm', 'cot_llm']: 642 | logger.error(f"Unsupported system: {system_name}. Supported systems: commaster, pure_llm, fewshot_llm, cot_llm") 643 | sys.exit(1) 644 | 645 | logger.info(f"Starting evaluation for system: {system_name}") 646 | logger.info(f"Data file: {data_file}") 647 | logger.info(f"Max workers: {max_workers}") 648 |
649 | # Load data 650 | records = parse_jsonl_file(data_file) 651 | # Optional debugging slice, e.g. records = records[2000:3500] 652 | records = records[::-1] # Process records in reverse order 653 | total_records = len(records) 654 | 655 | completed_indices = set() 656 | if output_file and os.path.exists(output_file): 657 | logger.info(f"Resume mode: loading completed records from {output_file}") 658 | with open(output_file, 'r', encoding='utf-8') as f: 659 | for line in f: 660 | try: 661 | rec = json.loads(line) 662 | if 'record_index' in rec: 663 | completed_indices.add(rec['record_index']) 664 | except Exception: 665 | continue 666 | logger.info(f"Resume mode: {len(completed_indices)} records already completed.") 667 | 668 | record_data = [(i+1, record) for i, record in enumerate(records) 669 | if (i+1) not in completed_indices] 670 | if len(record_data) < total_records: 671 | logger.info(f"Resume mode: {total_records-len(record_data)} records will be skipped (already done). 
{len(record_data)} to process.") 672 | else: 673 | logger.info(f"No resume: all {total_records} records will be processed.") 674 | 675 | results = [] 676 | output_fh = None 677 | if output_file: 678 | output_fh = open(output_file, 'a' if completed_indices else 'w', encoding='utf-8')  # append when resuming so earlier results are kept 679 | 680 | def write_result_line(res): 681 | if output_fh: 682 | output_fh.write(json.dumps(res, ensure_ascii=False) + '\n') 683 | output_fh.flush() 684 | 685 | if max_workers == 1: 686 | # Sequential processing (original behavior) 687 | logger.info("Running in sequential mode") 688 | for i, (index, record) in enumerate(record_data): 689 | logger.info(f"Processing record {index}/{total_records}") 690 | result = process_single_record( 691 | (index, record), system_name, api_key, api_base, model, repo_collect_dir, ctx 692 | ) 693 | results.append(result) 694 | write_result_line(result) 695 | else: 696 | # Parallel processing 697 | logger.info(f"Running in parallel mode with {max_workers} workers") 698 | process_func = partial( 699 | process_single_record, 700 | system_name=system_name, 701 | api_key=api_key, 702 | api_base=api_base, 703 | model=model, 704 | repo_collect_dir=repo_collect_dir, 705 | ctx=ctx 706 | ) 707 | with ProcessPoolExecutor(max_workers=max_workers) as executor: 708 | future_to_index = { 709 | executor.submit(process_func, record_item): record_item[0] 710 | for record_item in record_data 711 | } 712 | completed_count = 0 713 | for future in as_completed(future_to_index): 714 | try: 715 | result = future.result() 716 | results.append(result) 717 | write_result_line(result) 718 | completed_count += 1 719 | if completed_count % 10 == 0 or completed_count == total_records: 720 | logger.info(f"Completed {completed_count}/{total_records} records") 721 | except Exception as e: 722 | index = future_to_index[future] 723 | logger.error(f"Error processing record {index}: {e}") 724 | error_result = { 725 | 'system': system_name, 726 | 'record_index': index, 727 | 'repo': 'unknown', 728 | 'commit_sha': 'unknown', 729 | 'consistent': None, 730 | 'confidence': None, 731 | 'gt_consistent': True, 732 | 'loc': 0, 733 | 'error': str(e), 734 | 'timestamp': time.strftime('%Y-%m-%d %H:%M:%S') 735 | } 736 | results.append(error_result) 737 | write_result_line(error_result) 738 | if output_fh: 739 | output_fh.close() 740 | # Sort results by record index to maintain order 741 | results.sort(key=lambda x: x.get('record_index', 0)) 742 | logger.info(f"Evaluation completed. Processed {len(results)} records.") 743 | return results 744 |
745 | def save_results(results: List[Dict[str, Any]], output_file: str) -> None: 746 | """ 747 | Save evaluation results to a JSON file. 748 | 749 | Args: 750 | results (List[Dict[str, Any]]): Evaluation results 751 | output_file (str): Output file path 752 | """ 753 | try: 754 | with open(output_file, 'w', encoding='utf-8') as f: 755 | json.dump(results, f, indent=2, ensure_ascii=False) 756 | logger.info(f"Results saved to: {output_file}") 757 | except Exception as e: 758 | logger.error(f"Failed to save results to {output_file}: {e}") 759 | 760 | def print_summary(results: List[Dict[str, Any]]) -> None: 761 | """ 762 | Print a summary of evaluation results. 
763 | 764 | Args: 765 | results (List[Dict[str, Any]]): Evaluation results 766 | """ 767 | if not results: 768 | logger.warning("No results to summarize.") 769 | return 770 | 771 | system_name = results[0].get('system', 'unknown') 772 | total = len(results) 773 | 774 | # Count results 775 | consistent_count = sum(1 for r in results if r.get('consistent') is True) 776 | inconsistent_count = sum(1 for r in results if r.get('consistent') is False) 777 | error_count = sum(1 for r in results if r.get('error') is not None) 778 | 779 | # Calculate average confidence 780 | confidences = [r.get('confidence', 0) for r in results if r.get('confidence') is not None] 781 | avg_confidence = sum(confidences) / len(confidences) if confidences else 0 782 | 783 | # Calculate average processing time 784 | times = [r.get('actual_processing_time_ms', 0) for r in results if r.get('actual_processing_time_ms') is not None] 785 | avg_time = sum(times) / len(times) if times else 0 786 | 787 | print("\n" + "=" * 60) 788 | print(f"EVALUATION SUMMARY - {system_name.upper()}") 789 | print("=" * 60) 790 | print(f"Total records processed: {total}") 791 | print(f"Consistent results: {consistent_count} ({consistent_count/total*100:.1f}%)") 792 | print(f"Inconsistent results: {inconsistent_count} ({inconsistent_count/total*100:.1f}%)") 793 | print(f"Errors: {error_count} ({error_count/total*100:.1f}%)") 794 | print(f"Average confidence: {avg_confidence:.3f}") 795 | print(f"Average processing time: {avg_time:.2f} ms") 796 | print("=" * 60) 797 | 798 | def main(): 799 | """Main function to handle command line arguments and orchestrate evaluation.""" 800 | parser = argparse.ArgumentParser( 801 | description="Evaluation framework for commit message consistency systems", 802 | formatter_class=argparse.RawDescriptionHelpFormatter, 803 | epilog=""" 804 | Examples: 805 | # Test commaster system 806 | python evaluate_main.py --system commaster --data /path/to/data.jsonl --repocollect /path/to/repos --api-key YOUR_API_KEY 807 | 808 | # Test pure_llm system with custom API base and model 809 | python evaluate_main.py --system pure_llm --data /path/to/data.jsonl --repocollect /path/to/repos --api-key YOUR_API_KEY --api-base xxx_url --model gpt-4 810 | 811 | # Test pure_llm system with extended context (20 lines) for git diff 812 | python evaluate_main.py --system pure_llm --data /path/to/data.jsonl --repocollect /path/to/repos --api-key YOUR_API_KEY --ctx 20 813 | 814 | # Test fewshot_llm system (uses few-shot examples in prompt, always ctx=0) 815 | python evaluate_main.py --system fewshot_llm --data /path/to/data.jsonl --repocollect /path/to/repos --api-key YOUR_API_KEY 816 | 817 | # Test cot_llm system (uses Chain of Thought reasoning, always ctx=0) 818 | python evaluate_main.py --system cot_llm --data /path/to/data.jsonl --repocollect /path/to/repos --api-key YOUR_API_KEY 819 | 820 | # With output file, verbose logging, and parallel processing 821 | python evaluate_main.py --system pure_llm --data /path/to/data.jsonl --repocollect /path/to/repos --api-key YOUR_API_KEY --output results.json --verbose --workers 4 --ctx 0 822 | 823 | # Use all available CPU cores with no context lines 824 | python evaluate_main.py --system commaster --data /path/to/data.jsonl --repocollect /path/to/repos --api-key YOUR_API_KEY --workers 8 825 | """ 826 | ) 827 | 828 | parser.add_argument( 829 | '--system', '-s', 830 | required=True, 831 | choices=['commaster', 'pure_llm', 'fewshot_llm', 'cot_llm'], 832 | help='System to evaluate (commaster, pure_llm, 
fewshot_llm, or cot_llm)' 833 | ) 834 | 835 | parser.add_argument( 836 | '--data', '-d', 837 | required=True, 838 | help='Path to the JSONL data file' 839 | ) 840 | 841 | parser.add_argument( 842 | '--repocollect', '-r', 843 | required=True, 844 | help='Path to the repository collection directory' 845 | ) 846 | 847 | parser.add_argument( 848 | '--api-key', 849 | required=True, 850 | help='OpenAI API key' 851 | ) 852 | 853 | parser.add_argument( 854 | '--api-base', 855 | default='EMPTY', 856 | help='OpenAI API base URL (default: EMPTY)' 857 | ) 858 | 859 | parser.add_argument( 860 | '--model', 861 | default='gpt-3.5-turbo', 862 | help='Model name to use (default: gpt-3.5-turbo)' 863 | ) 864 | 865 | parser.add_argument( 866 | '--output', '-o', 867 | help='Output file path for results (JSON format)' 868 | ) 869 | 870 | parser.add_argument( 871 | '--verbose', '-v', 872 | action='store_true', 873 | help='Enable verbose logging' 874 | ) 875 | 876 | parser.add_argument( 877 | '--workers', '-w', 878 | type=int, 879 | default=1, 880 | help=f'Number of parallel workers for processing (default: 1, max: {multiprocessing.cpu_count()})' 881 | ) 882 | 883 | parser.add_argument( 884 | '--ctx', 885 | type=int, 886 | choices=[0, 20], 887 | default=0, 888 | help='Number of context lines for git diff (0: no context, 20: extended context). Only affects pure_llm system. fewshot_llm and cot_llm always use ctx=0. (default: 0)' 889 | ) 890 | 891 | args = parser.parse_args() 892 | 893 | # Set logging level based on verbose flag 894 | if args.verbose: 895 | logging.getLogger().setLevel(logging.DEBUG) 896 | 897 | # Validate workers parameter 898 | max_cpu_count = multiprocessing.cpu_count() 899 | if args.workers < 1: 900 | logger.error("Number of workers must be at least 1") 901 | sys.exit(1) 902 | elif args.workers > max_cpu_count: 903 | logger.warning(f"Number of workers ({args.workers}) exceeds CPU count ({max_cpu_count}), using {max_cpu_count}") 904 | args.workers = max_cpu_count 905 | 906 | # Validate data file exists 907 | if not Path(args.data).exists(): 908 | logger.error(f"Data file not found: {args.data}") 909 | sys.exit(1) 910 | 911 | # Run evaluation 912 | try: 913 | results = run_evaluation( 914 | system_name=args.system, 915 | data_file=args.data, 916 | api_key=args.api_key, 917 | api_base=args.api_base, 918 | model=args.model, 919 | output_file=args.output, 920 | repo_collect_dir=args.repocollect, 921 | max_workers=args.workers, 922 | ctx=args.ctx, 923 | ) 924 | print_summary(results) 925 | 926 | except KeyboardInterrupt: 927 | logger.info("Evaluation interrupted by user.") 928 | sys.exit(1) 929 | except Exception as e: 930 | logger.error(f"Evaluation failed: {e}") 931 | sys.exit(1) 932 | 933 | if __name__ == "__main__": 934 | main() 935 | -------------------------------------------------------------------------------- /data_synthesis/data_generator.py: -------------------------------------------------------------------------------- 1 | """ 2 | Inconsistent data generator 3 | """ 4 | 5 | import json 6 | import logging 7 | import random 8 | import sys 9 | import multiprocessing 10 | import queue 11 | import threading 12 | from pathlib import Path 13 | from typing import Dict, Any, List, Optional 14 | from dataclasses import asdict 15 | from concurrent.futures import ProcessPoolExecutor, as_completed 16 | from functools import partial 17 | 18 | from inconsistency_rules import InconsistencyRuleManager, InconsistencyRule 19 | from rule_applicability_checker import EnhancedInconsistencyRuleManager 20 | from 
llm_interface import LLMManager 21 | 22 | 23 | def _worker_process_entry_simple(entry_data: tuple, llm_config: Dict[str, Any], 24 | inconsistency_ratio: float, use_applicability_check: bool, 25 | seed_offset: int) -> Dict[str, Any]: 26 | """ 27 | Simplified worker function for multiprocessing.Pool 28 | 29 | This is a standalone function that avoids complex serialization issues. 30 | 31 | Args: 32 | entry_data: Tuple of (line_number, original_entry) 33 | llm_config: Simplified LLM configuration dictionary 34 | inconsistency_ratio: Ratio for inconsistency generation 35 | use_applicability_check: Whether to use applicability checking 36 | seed_offset: Offset for random seed 37 | 38 | Returns: 39 | Processed entry with metadata 40 | """ 41 | line_num, original_entry = entry_data 42 | 43 | # Set random seed for this process (deterministic but different per entry) 44 | random.seed(hash(str(original_entry)) + seed_offset + line_num) 45 | 46 | try: 47 | # Create minimal components for this worker 48 | llm_manager = LLMManager(llm_config) 49 | 50 | if use_applicability_check: 51 | rule_manager = EnhancedInconsistencyRuleManager(llm_manager) 52 | else: 53 | rule_manager = InconsistencyRuleManager() 54 | 55 | # Decide whether to generate inconsistent data 56 | is_inconsistent = random.random() < inconsistency_ratio 57 | 58 | result = { 59 | 'line_number': line_num, 60 | 'is_inconsistent': is_inconsistent, 61 | 'error': None 62 | } 63 | 64 | if is_inconsistent: 65 | # Generate inconsistent data using simplified logic 66 | if use_applicability_check and hasattr(rule_manager, 'get_best_rule_for_commit'): 67 | try: 68 | rule, applicability_info = rule_manager.get_best_rule_for_commit(original_entry) 69 | rule_selection_info = { 70 | 'selection_method': rule.rule_type.value, 71 | 'applicability_check': True, 72 | 'reasoning': applicability_info.get('reasoning', 'N/A'), 73 | 'was_applicable': applicability_info.get('applicable', True) 74 | } 75 | except Exception: 76 | rule = rule_manager.get_random_rule() 77 | rule_selection_info = { 78 | 'selection_method': 'fallback_random', 79 | 'applicability_check': False 80 | } 81 | else: 82 | rule = rule_manager.get_random_rule() 83 | rule_selection_info = { 84 | 'selection_method': 'random', 85 | 'applicability_check': False 86 | } 87 | 88 | # Format prompt and generate inconsistent message 89 | prompt = rule_manager.format_prompt(rule, original_entry) 90 | inconsistent_message = llm_manager.generate_inconsistent_message(prompt) 91 | 92 | if inconsistent_message is None: 93 | inconsistent_message = f"[Generated by {rule.rule_type.value}] {original_entry.get('message', '')}" 94 | 95 | # Create inconsistent entry 96 | inconsistent_entry = original_entry.copy() 97 | inconsistent_entry['message'] = inconsistent_message 98 | inconsistent_entry['consistency'] = False 99 | inconsistent_entry['inconsistency_rule'] = rule.rule_type.value 100 | inconsistent_entry['rule_weight'] = rule.weight 101 | inconsistent_entry['original_message'] = original_entry.get('message', '') 102 | inconsistent_entry['rule_selection_info'] = rule_selection_info 103 | 104 | result['entry'] = inconsistent_entry 105 | result['type'] = 'inconsistent' 106 | else: 107 | # Keep original data (add consistency field) 108 | consistent_entry = original_entry.copy() 109 | consistent_entry['consistency'] = True 110 | result['entry'] = consistent_entry 111 | result['type'] = 'consistent' 112 | 113 | return result 114 | 115 | except Exception as e: 116 | return { 117 | 'line_number': line_num, 118 | 
'is_inconsistent': False, 119 | 'error': str(e), 120 | 'entry': original_entry, 121 | 'type': 'error' 122 | } 123 | 124 | 125 | def _worker_process_entry(entry_data: tuple, llm_config: Dict[str, Any], 126 | inconsistency_ratio: float, use_applicability_check: bool, 127 | seed_offset: int) -> Dict[str, Any]: 128 | """ 129 | Worker function to process a single entry in multiprocessing. 130 | 131 | This function is defined at module level to ensure proper serialization. 132 | 133 | Args: 134 | entry_data: Tuple of (line_number, original_entry) 135 | llm_config: Simplified LLM configuration dictionary 136 | inconsistency_ratio: Ratio for inconsistency generation 137 | use_applicability_check: Whether to use applicability checking 138 | seed_offset: Offset for random seed 139 | 140 | Returns: 141 | Processed entry with metadata 142 | """ 143 | line_num, original_entry = entry_data 144 | 145 | # Set random seed for this process (deterministic but different per entry) 146 | random.seed(hash(str(original_entry)) + seed_offset + line_num) 147 | 148 | try: 149 | # Create minimal components for this worker 150 | # print(llm_config) 151 | # exit(0) 152 | llm_manager = LLMManager(llm_config) 153 | 154 | if use_applicability_check: 155 | rule_manager = EnhancedInconsistencyRuleManager(llm_manager) 156 | else: 157 | rule_manager = InconsistencyRuleManager() 158 | 159 | # Decide whether to generate inconsistent data 160 | is_inconsistent = random.random() < inconsistency_ratio 161 | 162 | result = { 163 | 'line_number': line_num, 164 | 'is_inconsistent': is_inconsistent, 165 | 'error': None 166 | } 167 | 168 | if is_inconsistent: 169 | # Generate inconsistent data using simplified logic 170 | if use_applicability_check and hasattr(rule_manager, 'get_best_rule_for_commit'): 171 | try: 172 | rule, applicability_info = rule_manager.get_best_rule_for_commit(original_entry) 173 | rule_selection_info = { 174 | 'selection_method': rule.rule_type.value, 175 | 'applicability_check': True, 176 | 'reasoning': applicability_info.get('reasoning', 'N/A'), 177 | 'was_applicable': applicability_info.get('applicable', True) 178 | } 179 | except Exception: 180 | rule = rule_manager.get_random_rule() 181 | rule_selection_info = { 182 | 'selection_method': 'fallback_random', 183 | 'applicability_check': False 184 | } 185 | else: 186 | rule = rule_manager.get_random_rule() 187 | rule_selection_info = { 188 | 'selection_method': 'random', 189 | 'applicability_check': False 190 | } 191 | 192 | # Format prompt and generate inconsistent message 193 | prompt = rule_manager.format_prompt(rule, original_entry) 194 | inconsistent_message = llm_manager.generate_inconsistent_message(prompt) 195 | 196 | if inconsistent_message is None: 197 | inconsistent_message = f"[Generated by {rule.rule_type.value}] {original_entry.get('message', '')}" 198 | 199 | # Create inconsistent entry 200 | inconsistent_entry = original_entry.copy() 201 | inconsistent_entry['message'] = inconsistent_message 202 | inconsistent_entry['consistency'] = False 203 | inconsistent_entry['inconsistency_rule'] = rule.rule_type.value 204 | inconsistent_entry['rule_weight'] = rule.weight 205 | inconsistent_entry['original_message'] = original_entry.get('message', '') 206 | inconsistent_entry['rule_selection_info'] = rule_selection_info 207 | 208 | result['entry'] = inconsistent_entry 209 | result['type'] = 'inconsistent' 210 | else: 211 | # Keep original data (add consistency field) 212 | consistent_entry = original_entry.copy() 213 | consistent_entry['consistency'] = 
True 214 | result['entry'] = consistent_entry 215 | result['type'] = 'consistent' 216 | 217 | return result 218 | 219 | except Exception as e: 220 | return { 221 | 'line_number': line_num, 222 | 'is_inconsistent': False, 223 | 'error': str(e), 224 | 'entry': original_entry, 225 | 'type': 'error' 226 | } 227 | 228 | 229 | class InconsistentDataGenerator: 230 | """Inconsistent data generator with intelligent rule selection""" 231 | 232 | def __init__(self, llm_config: Dict[str, Any], seed: Optional[int] = None, 233 | use_applicability_check: bool = True): 234 | """ 235 | Initialize generator 236 | 237 | Args: 238 | llm_config: LLM configuration dictionary 239 | seed: Random seed for reproducible results 240 | use_applicability_check: Whether to use LLM-based applicability checking 241 | """ 242 | self.logger = logging.getLogger(__name__) 243 | self.llm_manager = LLMManager(llm_config) 244 | self.use_applicability_check = use_applicability_check 245 | 246 | if use_applicability_check: 247 | self.rule_manager = EnhancedInconsistencyRuleManager(self.llm_manager) 248 | self.logger.info("Using enhanced rule manager with applicability checking") 249 | else: 250 | self.rule_manager = InconsistencyRuleManager() 251 | self.logger.info("Using basic rule manager without applicability checking") 252 | 253 | if seed is not None: 254 | random.seed(seed) 255 | self.logger.info(f"Set random seed to {seed}") 256 | 257 | def generate_inconsistent_entry(self, original_entry: Dict[str, Any], 258 | rule: Optional[InconsistencyRule] = None) -> Dict[str, Any]: 259 | """ 260 | Generate inconsistent data entry based on original entry with intelligent rule selection 261 | 262 | Args: 263 | original_entry: Original data entry 264 | rule: Specified inconsistency rule, if None then intelligently select 265 | 266 | Returns: 267 | Inconsistent data entry 268 | """ 269 | rule_selection_info = {} 270 | 271 | # Use intelligent rule selection if applicability checking is enabled 272 | if rule is None and self.use_applicability_check and hasattr(self.rule_manager, 'get_best_rule_for_commit'): 273 | try: 274 | self.logger.info("Performing intelligent rule selection...") 275 | rule, applicability_info = self.rule_manager.get_best_rule_for_commit(original_entry) 276 | 277 | rule_selection_info = { 278 | 'selection_method': rule.rule_type.value, 279 | 'applicability_check': True, 280 | 'reasoning': applicability_info.get('reasoning', 'N/A'), 281 | 'was_applicable': applicability_info.get('applicable', True) 282 | } 283 | 284 | self.logger.info(f"Intelligently selected rule: {rule.name} (weight: {rule.weight})") 285 | self.logger.debug(f"Selection reasoning: {applicability_info.get('reasoning', 'N/A')}") 286 | 287 | except Exception as e: 288 | self.logger.warning(f"Intelligent rule selection failed: {e}, falling back to random selection") 289 | rule = self.rule_manager.get_random_rule() 290 | rule_selection_info = { 291 | 'selection_method': 'fallback_random', 292 | 'applicability_check': False, 293 | 'error': str(e) 294 | } 295 | 296 | elif rule is None: 297 | # Use random selection 298 | rule = self.rule_manager.get_random_rule() 299 | rule_selection_info = { 300 | 'selection_method': 'random', 301 | 'applicability_check': False 302 | } 303 | else: 304 | # Use provided rule 305 | rule_selection_info = { 306 | 'selection_method': 'specified', 307 | 'applicability_check': False 308 | } 309 | 310 | self.logger.info(f"Applying rule: {rule.name} ({rule.rule_type.value})") 311 | 312 | # Format prompt 313 | prompt = 
self.rule_manager.format_prompt(rule, original_entry) 314 | 315 | # Call LLM to generate inconsistent commit message 316 | inconsistent_message = self.llm_manager.generate_inconsistent_message(prompt) 317 | 318 | if inconsistent_message is None: 319 | self.logger.error("Failed to generate inconsistent message") 320 | inconsistent_message = f"[Generated by {rule.rule_type.value}] {original_entry.get('message', '')}" 321 | 322 | # Create new inconsistent entry 323 | inconsistent_entry = original_entry.copy() 324 | inconsistent_entry['message'] = inconsistent_message 325 | inconsistent_entry['consistency'] = False 326 | inconsistent_entry['inconsistency_rule'] = rule.rule_type.value 327 | inconsistent_entry['rule_weight'] = rule.weight 328 | inconsistent_entry['original_message'] = original_entry.get('message', '') 329 | inconsistent_entry['rule_selection_info'] = rule_selection_info 330 | 331 | return inconsistent_entry 332 | 333 | def process_jsonl_file(self, input_file: str, output_file: str, 334 | num_samples: Optional[int] = None, 335 | inconsistency_ratio: float = 1.0, 336 | max_workers: int = 1) -> None: 337 | """ 338 | Process JSONL file to generate inconsistent data 339 | 340 | Args: 341 | input_file: Input JSONL file path 342 | output_file: Output JSONL file path 343 | num_samples: Number of samples to process, None means process all 344 | inconsistency_ratio: Ratio of inconsistent data (0.0-1.0) 345 | max_workers: Number of worker processes for parallel processing 346 | """ 347 | if max_workers == 1: 348 | # Use sequential processing (original behavior) 349 | self._process_jsonl_file_sequential(input_file, output_file, num_samples, inconsistency_ratio) 350 | else: 351 | # Use multiprocessing 352 | self._process_jsonl_file_parallel(input_file, output_file, num_samples, inconsistency_ratio, max_workers) 353 | 354 | def _process_jsonl_file_sequential(self, input_file: str, output_file: str, 355 | num_samples: Optional[int] = None, 356 | inconsistency_ratio: float = 1.0) -> None: 357 | """ 358 | Original sequential processing method 359 | 360 | Args: 361 | input_file: Input JSONL file path 362 | output_file: Output JSONL file path 363 | num_samples: Number of samples to process, None means process all 364 | inconsistency_ratio: Ratio of inconsistent data (0.0-1.0) 365 | """ 366 | input_path = Path(input_file) 367 | output_path = Path(output_file) 368 | 369 | if not input_path.exists(): 370 | raise FileNotFoundError(f"Input file not found: {input_file}") 371 | 372 | # Ensure output directory exists 373 | output_path.parent.mkdir(parents=True, exist_ok=True) 374 | 375 | processed_count = 0 376 | inconsistent_count = 0 377 | 378 | with open(input_path, 'r', encoding='utf-8') as infile, \ 379 | open(output_path, 'w', encoding='utf-8') as outfile: 380 | 381 | for line_num, line in enumerate(infile, 1): 382 | if num_samples is not None and processed_count >= num_samples: 383 | break 384 | 385 | try: 386 | original_entry = json.loads(line.strip()) 387 | 388 | # Decide whether to generate inconsistent data 389 | if random.random() < inconsistency_ratio: 390 | # Generate inconsistent data 391 | inconsistent_entry = self.generate_inconsistent_entry(original_entry) 392 | outfile.write(json.dumps(inconsistent_entry, ensure_ascii=False) + '\n') 393 | inconsistent_count += 1 394 | self.logger.info(f"Generated inconsistent entry {inconsistent_count} from line {line_num}") 395 | else: 396 | # Keep original data (add consistency field) 397 | consistent_entry = original_entry.copy() 398 | 
consistent_entry['consistency'] = True 399 | outfile.write(json.dumps(consistent_entry, ensure_ascii=False) + '\n') 400 | self.logger.info(f"Kept consistent entry from line {line_num}") 401 | 402 | processed_count += 1 403 | 404 | if processed_count % 10 == 0: 405 | self.logger.info(f"Processed {processed_count} entries...") 406 | 407 | except json.JSONDecodeError as e: 408 | self.logger.error(f"Failed to parse JSON on line {line_num}: {e}") 409 | except Exception as e: 410 | self.logger.error(f"Error processing line {line_num}: {e}") 411 | 412 | self.logger.info(f"Processing complete. Total: {processed_count}, Inconsistent: {inconsistent_count}") 413 | 414 | def _process_jsonl_file_parallel(self, input_file: str, output_file: str, 415 | num_samples: Optional[int] = None, 416 | inconsistency_ratio: float = 1.0, 417 | max_workers: int = 4) -> None: 418 | """ 419 | Parallel processing method using multiprocessing 420 | 421 | Args: 422 | input_file: Input JSONL file path 423 | output_file: Output JSONL file path 424 | num_samples: Number of samples to process, None means process all 425 | inconsistency_ratio: Ratio of inconsistent data (0.0-1.0) 426 | max_workers: Number of worker processes 427 | """ 428 | input_path = Path(input_file) 429 | output_path = Path(output_file) 430 | 431 | if not input_path.exists(): 432 | raise FileNotFoundError(f"Input file not found: {input_file}") 433 | 434 | # Ensure output directory exists 435 | output_path.parent.mkdir(parents=True, exist_ok=True) 436 | 437 | # Load all entries to process 438 | entries_to_process = [] 439 | with open(input_path, 'r', encoding='utf-8') as infile: 440 | for line_num, line in enumerate(infile, 1): 441 | if num_samples is not None and line_num > num_samples: 442 | break 443 | try: 444 | original_entry = json.loads(line.strip()) 445 | entries_to_process.append((line_num, original_entry)) 446 | except json.JSONDecodeError as e: 447 | self.logger.error(f"Failed to parse JSON on line {line_num}: {e}") 448 | 449 | total_entries = len(entries_to_process) 450 | self.logger.info(f"🚀 Starting parallel processing:") 451 | self.logger.info(f" 📊 Total entries: {total_entries}") 452 | self.logger.info(f" 👥 Workers: {max_workers}") 453 | self.logger.info(f" 📈 Inconsistency ratio: {inconsistency_ratio}") 454 | self.logger.info(f" 🎯 Expected inconsistent entries: {int(total_entries * inconsistency_ratio)}") 455 | self.logger.info(f" 🔧 Applicability check: {'enabled' if self.use_applicability_check else 'disabled'}") 456 | 457 | # Get simplified LLM config for worker processes (ensure serializability) 458 | try: 459 | if hasattr(self.llm_manager, 'llm_config_test'): 460 | llm_config = dict(self.llm_manager.llm_config_test) # Ensure it's a dict, not custom object 461 | else: 462 | # Fallback: construct config from basic attributes 463 | llm_config = { 464 | 'provider': getattr(self.llm_manager, 'provider', 'openai'), 465 | 'api_key': getattr(self.llm_manager, 'api_key', 'empty'), 466 | 'base_url': getattr(self.llm_manager, 'base_url', None), 467 | 'model': getattr(self.llm_manager, 'model', 'gpt-3.5-turbo'), 468 | 'max_retries': getattr(self.llm_manager, 'max_retries', 3) 469 | } 470 | # Ensure all values are serializable 471 | serializable_config = {} 472 | for key, value in llm_config.items(): 473 | if isinstance(value, (str, int, float, bool, type(None))): 474 | serializable_config[key] = value 475 | else: 476 | serializable_config[key] = str(value) 477 | 478 | llm_config = serializable_config 479 | 480 | 481 | except Exception as e: 482 | 
self.logger.warning(f"Could not extract LLM config: {e}, using minimal defaults") 483 | llm_config = { 484 | 'provider': 'openai', 485 | 'api_key': 'empty', 486 | 'base_url': None, 487 | 'model': 'gpt-3.5-turbo', 488 | 'max_retries': 3 489 | } 490 | 491 | 492 | # Prepare worker arguments as simple tuples to avoid serialization issues 493 | worker_args_list = [] 494 | for entry_data in entries_to_process: 495 | worker_args = ( 496 | entry_data, # (line_num, original_entry) 497 | llm_config, 498 | inconsistency_ratio, 499 | self.use_applicability_check, 500 | hash(input_file) # seed_offset 501 | ) 502 | worker_args_list.append(worker_args) 503 | 504 | processed_count = 0 505 | inconsistent_count = 0 506 | error_count = 0 507 | 508 | # Process entries in parallel using simple map approach with progress monitoring 509 | try: 510 | import time 511 | start_time = time.time() 512 | self.logger.info(f"⏱️ Starting multiprocessing with {max_workers} workers at {time.strftime('%H:%M:%S')}...") 513 | 514 | # Use multiprocessing approach with progress monitoring and real-time file writing 515 | with multiprocessing.Pool(processes=max_workers) as pool: 516 | # Submit all tasks asynchronously for better progress tracking 517 | async_results = [] 518 | for worker_args in worker_args_list: 519 | async_result = pool.apply_async(_worker_process_entry_simple, worker_args) 520 | async_results.append(async_result) 521 | 522 | self.logger.info(f"📤 Submitted {len(async_results)} tasks, monitoring progress...") 523 | 524 | # Monitor progress and collect results, write to file immediately 525 | completed_count = 0 526 | last_progress_time = start_time 527 | 528 | # Open output file for writing results as they complete 529 | with open(output_path, 'w', encoding='utf-8') as outfile: 530 | for i, async_result in enumerate(async_results): 531 | try: 532 | # Get result (this will block until the specific task completes) 533 | result = async_result.get() 534 | completed_count += 1 535 | processed_count += 1 536 | 537 | # Update counters 538 | if result['type'] == 'inconsistent': 539 | inconsistent_count += 1 540 | elif result['type'] == 'error': 541 | error_count += 1 542 | self.logger.error(f"Error processing line {result['line_number']}: {result['error']}") 543 | 544 | # Write result to file immediately 545 | outfile.write(json.dumps(result['entry'], ensure_ascii=False) + '\n') 546 | outfile.flush() # Ensure data is written to disk 547 | 548 | # Progress reporting - show progress with timing info 549 | current_time = time.time() 550 | if completed_count % 1 == 0 or completed_count == total_entries: 551 | progress_percentage = (completed_count / total_entries) * 100 552 | elapsed_time = current_time - start_time 553 | rate = completed_count / elapsed_time if elapsed_time > 0 else 0 554 | eta_seconds = (total_entries - completed_count) / rate if rate > 0 else 0 555 | eta_str = f"{int(eta_seconds//60)}m{int(eta_seconds%60)}s" if eta_seconds < 3600 else f"{int(eta_seconds//3600)}h{int((eta_seconds%3600)//60)}m" 556 | 557 | self.logger.info(f"🔄 Progress: {completed_count}/{total_entries} ({progress_percentage:.1f}%) | " 558 | f"⚡ {rate:.1f}/s | ETA: {eta_str} | " 559 | f"✅ Inconsistent: {inconsistent_count} | ❌ Errors: {error_count}") 560 | 561 | except Exception as e: 562 | error_count += 1 563 | self.logger.error(f"Failed to get result for task {i}: {e}") 564 | # Create error entry for failed task and write it immediately 565 | if i < len(worker_args_list): 566 | entry_data = worker_args_list[i][0] 567 | line_num, 
original_entry = entry_data 568 | error_result = { 569 | 'line_number': line_num, 570 | 'is_inconsistent': False, 571 | 'error': str(e), 572 | 'entry': original_entry, 573 | 'type': 'error' 574 | } 575 | outfile.write(json.dumps(error_result['entry'], ensure_ascii=False) + '\n') 576 | outfile.flush() 577 | 578 | except Exception as e: 579 | self.logger.error(f"Multiprocessing failed: {e}") 580 | raise 581 | 582 | self.logger.info(f"✅ Parallel processing complete. Total: {processed_count}, " 583 | f"Inconsistent: {inconsistent_count}, Errors: {error_count}") 584 | self.logger.info(f"📄 Output written to: {output_path}") 585 | 586 | def analyze_commit_applicability(self, commit_entry: Dict[str, Any]) -> Dict[str, Any]: 587 | """ 588 | Analyze rule applicability for a specific commit entry 589 | 590 | Args: 591 | commit_entry: Commit data entry 592 | 593 | Returns: 594 | Detailed applicability analysis 595 | """ 596 | if not self.use_applicability_check or not hasattr(self.rule_manager, 'analyze_commit_applicability'): 597 | return { 598 | 'error': 'Applicability checking not available', 599 | 'use_applicability_check': self.use_applicability_check 600 | } 601 | 602 | try: 603 | # Get detailed applicability analysis 604 | applicability = self.rule_manager.analyze_commit_applicability(commit_entry) 605 | 606 | # Get best rule selection 607 | best_rule, selection_info = self.rule_manager.get_best_rule_for_commit(commit_entry) 608 | 609 | # Count applicable rules 610 | applicable_count = sum(1 for rule_info in applicability.values() if rule_info.get('applicable', False)) 611 | 612 | analysis_result = { 613 | 'commit_message': commit_entry.get('message', ''), 614 | 'files_changed': commit_entry.get('files', []), 615 | 'total_rules': len(applicability), 616 | 'applicable_rules_count': applicable_count, 617 | 'applicability_details': applicability, 618 | 'selected_rule': { 619 | 'name': best_rule.name, 620 | 'type': best_rule.rule_type.value, 621 | 'weight': best_rule.weight, 622 | 'reasoning': selection_info.get('reasoning', 'N/A') 623 | }, 624 | 'summary': self.rule_manager.get_applicability_summary(commit_entry) 625 | } 626 | 627 | return analysis_result 628 | 629 | except Exception as e: 630 | self.logger.error(f"Error in applicability analysis: {e}") 631 | return { 632 | 'error': f'Applicability analysis failed: {str(e)}', 633 | 'commit_message': commit_entry.get('message', ''), 634 | 'files_changed': commit_entry.get('files', []) 635 | } 636 | 637 | def process_jsonl_file_with_analysis(self, input_file: str, output_file: str, 638 | analysis_file: Optional[str] = None, 639 | num_samples: Optional[int] = None, 640 | inconsistency_ratio: float = 1.0, 641 | max_workers: int = 1) -> None: 642 | """ 643 | Process JSONL file with detailed rule applicability analysis 644 | 645 | Args: 646 | input_file: Input JSONL file path 647 | output_file: Output JSONL file path 648 | analysis_file: Optional file to save applicability analysis 649 | num_samples: Number of samples to process 650 | inconsistency_ratio: Ratio of inconsistent data (0.0-1.0) 651 | max_workers: Number of worker processes for parallel processing 652 | """ 653 | if max_workers == 1: 654 | # Use sequential processing (original behavior) 655 | self._process_jsonl_file_with_analysis_sequential( 656 | input_file, output_file, analysis_file, num_samples, inconsistency_ratio 657 | ) 658 | else: 659 | # Note: Analysis with multiprocessing is complex due to shared state 660 | # For now, fall back to sequential processing with a warning 661 | 
self.logger.warning("Analysis mode with multiprocessing not fully supported, using sequential processing") 662 | self._process_jsonl_file_with_analysis_sequential( 663 | input_file, output_file, analysis_file, num_samples, inconsistency_ratio 664 | ) 665 | 666 | def _process_jsonl_file_with_analysis_sequential(self, input_file: str, output_file: str, 667 | analysis_file: Optional[str] = None, 668 | num_samples: Optional[int] = None, 669 | inconsistency_ratio: float = 1.0) -> None: 670 | """Original sequential analysis processing method""" 671 | input_path = Path(input_file) 672 | output_path = Path(output_file) 673 | 674 | if not input_path.exists(): 675 | raise FileNotFoundError(f"Input file not found: {input_file}") 676 | 677 | # Ensure output directory exists 678 | output_path.parent.mkdir(parents=True, exist_ok=True) 679 | 680 | processed_count = 0 681 | inconsistent_count = 0 682 | analysis_data = [] 683 | 684 | with open(input_path, 'r', encoding='utf-8') as infile, \ 685 | open(output_path, 'w', encoding='utf-8') as outfile: 686 | 687 | for line_num, line in enumerate(infile, 1): 688 | if num_samples is not None and processed_count >= num_samples: 689 | break 690 | 691 | try: 692 | original_entry = json.loads(line.strip()) 693 | 694 | # Decide whether to generate inconsistent data 695 | if random.random() < inconsistency_ratio: 696 | # Analyze applicability if enabled 697 | if self.use_applicability_check and analysis_file: 698 | analysis = self.analyze_commit_applicability(original_entry) 699 | analysis['line_number'] = line_num 700 | analysis['processed_as'] = 'inconsistent' 701 | analysis_data.append(analysis) 702 | 703 | # Generate inconsistent data 704 | inconsistent_entry = self.generate_inconsistent_entry(original_entry) 705 | outfile.write(json.dumps(inconsistent_entry, ensure_ascii=False) + '\n') 706 | inconsistent_count += 1 707 | self.logger.info(f"Generated inconsistent entry {inconsistent_count} from line {line_num}") 708 | else: 709 | # Keep original data (add consistency field) 710 | consistent_entry = original_entry.copy() 711 | consistent_entry['consistency'] = True 712 | outfile.write(json.dumps(consistent_entry, ensure_ascii=False) + '\n') 713 | self.logger.info(f"Kept consistent entry from line {line_num}") 714 | 715 | if analysis_file: 716 | analysis_data.append({ 717 | 'line_number': line_num, 718 | 'processed_as': 'consistent', 719 | 'commit_message': original_entry.get('message', ''), 720 | 'files_changed': original_entry.get('files', []) 721 | }) 722 | 723 | processed_count += 1 724 | 725 | if processed_count % 10 == 0: 726 | self.logger.info(f"Processed {processed_count} entries...") 727 | 728 | except json.JSONDecodeError as e: 729 | self.logger.error(f"Failed to parse JSON on line {line_num}: {e}") 730 | except Exception as e: 731 | self.logger.error(f"Error processing line {line_num}: {e}") 732 | 733 | # Save analysis data if requested 734 | if analysis_file and analysis_data: 735 | analysis_path = Path(analysis_file) 736 | analysis_path.parent.mkdir(parents=True, exist_ok=True) 737 | 738 | with open(analysis_path, 'w', encoding='utf-8') as f: 739 | json.dump(analysis_data, f, ensure_ascii=False, indent=2) 740 | 741 | self.logger.info(f"Saved applicability analysis to {analysis_file}") 742 | 743 | self.logger.info(f"Processing complete. 
Total: {processed_count}, Inconsistent: {inconsistent_count}") 744 | 745 | def generate_batch_samples(self, original_entries: List[Dict[str, Any]], 746 | batch_size: int = 10) -> List[Dict[str, Any]]: 747 | """ 748 | Generate inconsistent samples in batches 749 | 750 | Args: 751 | original_entries: List of original data entries 752 | batch_size: Batch processing size 753 | 754 | Returns: 755 | List of inconsistent data entries 756 | """ 757 | inconsistent_entries = [] 758 | 759 | for i in range(0, len(original_entries), batch_size): 760 | batch = original_entries[i:i + batch_size] 761 | self.logger.info(f"Processing batch {i // batch_size + 1}") 762 | 763 | for entry in batch: 764 | try: 765 | inconsistent_entry = self.generate_inconsistent_entry(entry) 766 | inconsistent_entries.append(inconsistent_entry) 767 | except Exception as e: 768 | self.logger.error(f"Failed to process entry: {e}") 769 | 770 | return inconsistent_entries 771 | 772 | def analyze_rules_distribution(self, output_file: str) -> Dict[str, int]: 773 | """ 774 | Analyze distribution of rules in generated data 775 | 776 | Args: 777 | output_file: Output file path 778 | 779 | Returns: 780 | Rule distribution statistics 781 | """ 782 | rule_counts = {} 783 | 784 | try: 785 | with open(output_file, 'r', encoding='utf-8') as f: 786 | for line in f: 787 | try: 788 | entry = json.loads(line.strip()) 789 | if entry.get('consistency') == False: 790 | rule_type = entry.get('inconsistency_rule', 'unknown') 791 | rule_counts[rule_type] = rule_counts.get(rule_type, 0) + 1 792 | except json.JSONDecodeError: 793 | continue 794 | except FileNotFoundError: 795 | self.logger.error(f"Output file not found: {output_file}") 796 | 797 | return rule_counts 798 | 799 | def get_available_rules(self) -> List[Dict[str, Any]]: 800 | """Get information about all available inconsistency rules""" 801 | rules_info = [] 802 | for rule in self.rule_manager.get_all_rules(): 803 | rule_info = { 804 | 'type': rule.rule_type.value, 805 | 'name': rule.name, 806 | 'description': rule.description, 807 | 'weight': rule.weight 808 | } 809 | rules_info.append(rule_info) 810 | return rules_info 811 | 812 | 813 | def setup_logging(log_level: str = "INFO", log_file: Optional[str] = None): 814 | """Setup logging configuration""" 815 | log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 816 | 817 | handlers = [logging.StreamHandler(sys.stdout)] 818 | if log_file: 819 | handlers.append(logging.FileHandler(log_file)) 820 | 821 | logging.basicConfig( 822 | level=getattr(logging, log_level.upper()), 823 | format=log_format, 824 | handlers=handlers 825 | ) 826 | 827 | 828 | class ThreadSafeFileWriter: 829 | """Thread-safe file writer to avoid write conflicts""" 830 | 831 | def __init__(self, file_path: str): 832 | self.file_path = file_path 833 | self.lock = threading.Lock() 834 | self.file_handle = None 835 | 836 | def __enter__(self): 837 | self.file_handle = open(self.file_path, 'w', encoding='utf-8') 838 | return self 839 | 840 | def __exit__(self, exc_type, exc_val, exc_tb): 841 | if self.file_handle: 842 | self.file_handle.close() 843 | 844 | def write_entry(self, entry: Dict[str, Any]): 845 | """Write an entry to file in a thread-safe manner""" 846 | with self.lock: 847 | self.file_handle.write(json.dumps(entry, ensure_ascii=False) + '\n') 848 | self.file_handle.flush() 849 | --------------------------------------------------------------------------------